def freqs_wrapper(inQueue, resultQueue, headerLine, genoFormat, sampleData, target, minData, asCounts, threshold, keepNanLines = False):
    """Worker loop: turn queued slices of a geno file into per-population frequency lines.

    Repeatedly takes ``(sliceNumber, fileSlice)`` from ``inQueue``, computes
    allele frequencies (or counts) for every population in
    ``sampleData.popNames`` and puts ``(sliceNumber, resultStrings)`` on
    ``resultQueue``. A slice number of ``-1`` is the stop signal: it is
    forwarded as ``(-1, None)`` and the loop ends.

    Parameters:
        inQueue: queue yielding ``(sliceNumber, fileSlice)``; ``fileSlice`` is
            a sequence of geno-file lines (it is iterated and each line is
            split on whitespace to obtain scaffold and position).
        resultQueue: queue receiving ``(sliceNumber, list_of_tab_joined_rows)``.
        headerLine: passed straight to ``genomics.parseGenoFile``.
        genoFormat: passed straight to ``genomics.genoToAlignment``.
        sampleData: object providing ``indNames`` and ``popNames``.
        target: falsy -> report all base frequencies per population as a
            comma-joined string; ``"derived"`` -> report the derived allele,
            using the last population as outgroup; anything else -> report
            the minor allele.
        minData: minimum ``siteNonNan()`` value a population needs at a site
            for the site to be reported (target modes only).
        asCounts: passed to ``siteFreqs``; also selects integer output and the
            "all zero" (rather than "all NaN") row filter.
        threshold: if truthy and not ``asCounts``, frequencies are binarised
            to 1 (>= threshold) or 0 (< threshold).
        keepNanLines: if true, every site is written, including rows without
            any data.

    Returns:
        None. Results are delivered through ``resultQueue``.
    """
    while True:
        sliceNumber, fileSlice = inQueue.get()  # retrieve slice

        if sliceNumber == -1:
            # this is the way of telling everything we're done
            resultQueue.put((-1, None,))
            break

        window = genomics.parseGenoFile(fileSlice, headerLine, names=sampleData.indNames)

        # make alignment objects: one for all samples, one per population
        aln = genomics.genoToAlignment(window.seqDict(), sampleData, genoFormat=genoFormat)
        popAlns = {popName: aln.subset(groups=[popName]) for popName in sampleData.popNames}

        if not target:
            # if there is no target, fetch all base counts
            # NOTE(review): minData is not applied in this branch (the original
            # computed a per-site mask here but never used it) - confirm intended.
            popFreqs = []
            for pop in sampleData.popNames:
                baseFreqs = popAlns[pop].siteFreqs(asCounts=asCounts)
                popFreqs.append([",".join(row) for row in baseFreqs.astype(str)])

            allFreqs = np.column_stack(popFreqs)

        else:
            # otherwise define the target base at each site
            if target == "derived":
                # use last pop as outgroup
                outgroup = sampleData.popNames[-1]
                outAln = popAlns[outgroup]
                inAln = aln.subset(groups=sampleData.popNames[:-1])
                targetBases = [
                    genomics.derivedAllele(inAln.numArray[:, i][inAln.nanMask[:, i]],
                                           outAln.numArray[:, i][outAln.nanMask[:, i]],
                                           numeric=True)
                    for i in range(aln.l)
                ]
            else:
                # otherwise get minor allele
                # (range, not xrange: xrange does not exist on Python 3)
                targetBases = [
                    genomics.minorAllele(aln.numArray[:, i][aln.nanMask[:, i]])
                    for i in range(aln.l)
                ]

            baseColumns = np.array(targetBases).reshape([aln.l, 1])

            # sites where a target base could be determined
            goodSites = ~np.isnan(baseColumns).any(axis=1)

            # get freqs per pop
            popFreqs = []
            for pop in sampleData.popNames:
                # first find sites with sufficient data
                goodData = popAlns[pop].siteNonNan() >= minData
                sites = np.where(goodSites & goodData)[0]
                baseFreqs = popAlns[pop].siteFreqs(sites, asCounts=asCounts)
                popColumns = baseColumns[sites, :].astype(int)
                popRows = np.repeat(np.arange(len(sites))[:, np.newaxis], popColumns.shape[1], axis=1)
                # sites without usable data stay 0 (counts) or NaN (frequencies)
                targetFreqs = np.zeros([aln.l, popColumns.shape[1]], dtype=int if asCounts else float)
                if not asCounts:
                    targetFreqs.fill(np.nan)
                if len(sites) >= 1:
                    targetFreqs[sites, :] = baseFreqs[popRows, popColumns]
                popFreqs.append(np.around(targetFreqs, 4))

            allFreqs = np.hstack(popFreqs)

        # NOTE(review): without a target allFreqs holds strings, so the
        # threshold and NaN/zero filters below presumably only make sense in
        # the target modes - confirm against callers.
        if threshold and not asCounts:
            allFreqs[allFreqs >= threshold] = 1
            allFreqs[allFreqs < threshold] = 0

        # fetch scaffold and position
        scafPos = np.array([line.split(None, 2)[:2] for line in fileSlice], dtype="str")

        if keepNanLines:
            outSites = np.arange(aln.l)
        elif asCounts:
            # drop rows where every population has a zero count
            outSites = np.where(~np.all(allFreqs == 0, axis=1))[0]
        else:
            # drop rows where every population is NaN
            outSites = np.where(~np.all(np.isnan(allFreqs), axis=1))[0]

        outArray = np.column_stack((scafPos[outSites, :], allFreqs[outSites, :].astype(str),))

        resultStrings = ["\t".join(row) for row in outArray]

        resultQueue.put((sliceNumber, resultStrings,))
#extract each scaffold from the geno file, and the genes for each scaffold and write them out for scaffold in geneData.keys(): mRNAs = geneData[scaffold].keys() sys.stderr.write("Extracting " + str(len(mRNAs)) + " gene sequences from " + scaffold + "\n") for mRNA in mRNAs: sys.stderr.write(mRNA + "\n") region = scaffold + ":" + str( geneData[scaffold][mRNA]["start"]) + "-" + str( geneData[scaffold][mRNA]["end"]) sys.stderr.write("Getting region " + region + " from geno file...\n") genoStream = subprocess.Popen(['tabix', '-h', args.genoFile, region], stdout=subprocess.PIPE) window = genomics.parseGenoFile(genoStream.stdout, names=args.samples, includePositions=True, splitPhased=args.split) seqDict = window.seqDict() seqNames = seqDict.keys() sys.stderr.write("Extracting CDS...\n") CDSseqs = [ genomics.CDS(seqDict[name], window.positions, geneData[scaffold][mRNA]['cdsStarts'], geneData[scaffold][mRNA]['cdsEnds'], geneData[scaffold][mRNA]['strand']) for name in seqNames ] outputNames = [ name + "_" + mRNA + " " + scaffold + " " + str(geneData[scaffold][mRNA]['start']) + "-" +
# NOTE(review): this start() is the tail of a worker start-up whose beginning
# lies above this excerpt - its original nesting cannot be seen from here.
worker.start()

'''start background Thread that will run a loop to check run statistics and print We use thread, because I think this is necessary for a process that watches global variables like linesTested'''
# The string above is a bare expression statement used as a block comment.
# The stats reporter runs checkStats in a daemon thread.
worker = Thread(target=checkStats)
worker.daemon = True
worker.start()

##########################################################

if args.windType == "cat":
    # "cat" mode: parse the whole geno file as one window and queue it,
    # tagged with the running window index.
    window = genomics.parseGenoFile(genoFile, names=sampleData.indNames)
    windowQueue.put((windowsQueued,window))
    windowsQueued += 1

else:
    #get windows and analyse
    # Pick the window generator that matches the requested window type.
    if args.windType == "coordinate": windowGenerator = genomics.slidingCoordWindows(genoFile, windSize, stepSize, sampleData.indNames, include = scafsToInclude, exclude = scafsToExclude)
    elif args.windType == "sites": windowGenerator = genomics.slidingSitesWindows(genoFile, windSize, overlap, maxDist, minSites, sampleData.indNames, include = scafsToInclude, exclude = scafsToExclude)
    # any other window type falls back to the predefined coordinates in windCoords
    else: windowGenerator = genomics.predefinedCoordWindows(genoFile, windCoords, sampleData.indNames)
)) worker.daemon = True worker.start() '''start background Thread that will run a loop to check run statistics and print We use thread, because I think this is necessary for a process that watches global variables like linesTested''' worker = Thread(target=checkStats) worker.daemon = True worker.start() ########################################################## headerLine = "\t".join(args.headers) if args.headers else None if args.windType == "cat": window = genomics.parseGenoFile(genoFile, headerLine=headerLine, names=sampleData.indNames) windowQueue.put((windowsQueued, window)) windowsQueued += 1 else: #get windows and analyse if args.windType == "coordinate": windowGenerator = genomics.slidingCoordWindows( genoFile, windSize, stepSize, headerLine=headerLine, names=sampleData.indNames, include=scafsToInclude, exclude=scafsToExclude)
# Open a single output stream unless separate output files were requested.
if not args.separateFiles:
    if args.seqFile:
        # A name already ending in ".gz" is written gzipped as-is; with
        # args.gzip set, ".gz" is appended to the name; otherwise plain text.
        # NOTE(review): gzip.open(..., "w") is binary mode on Python 3, while
        # text is written below - confirm the intended interpreter / mode.
        if args.seqFile[-3:] == ".gz": seqFile = gzip.open(args.seqFile, "w")
        elif args.gzip: seqFile = gzip.open(args.seqFile + ".gz", "w")
        else: seqFile = open(args.seqFile, "w")
    # no output name given: write to standard output
    else: seqFile = sys.stdout

#############################

# Optional comma-separated sample list; None when args.samples is empty.
samples = args.samples.split(",") if args.samples else None

# if concatenating all contigs, just parse the file and write it out
if args.mode == "cat":
    #read file into window like object
    window = genomics.parseGenoFile(genoFile, names=samples, splitPhased=args.splitPhased)
    #write one alignment containing every sequence, in window.names order
    seqDict = window.seqDict()
    seqFile.write(genomics.makeAlnString(window.names, [seqDict[name] for name in window.names], outFormat=args.format))
    genoFile.close()
    seqFile.close()
    # nothing else to do in "cat" mode
    exit()

if args.mode == "windows" or args.mode == "contigs":
    if args.mode == "windows":
        # window settings come from the command line
        windType = args.windType
        windSize = args.windSize