def freqs_wrapper(windowQueue, resultQueue, genoFormat, sampleData, minData, target, asCounts, keepNanLines = False):
    """Worker loop: convert queued windows into per-population target-allele frequency lines.

    Runs forever, so it is presumably meant to live in a daemon worker
    (NOTE(review): there is no stop sentinel in this variant - confirm).

    Parameters
    ----------
    windowQueue : queue yielding ``(windowNumber, window)`` tuples. ``window``
        must provide ``seqDict()``, ``scaffold`` and ``positions``.
    resultQueue : queue that receives ``(windowNumber, resultStrings)``, where
        ``resultStrings`` is a list of tab-joined output lines
        (scaffold, position, one value per population).
    genoFormat : forwarded to ``genomics.genoToAlignment``.
    sampleData : object providing ``popNames``; also forwarded to
        ``genomics.genoToAlignment``.
    minData : a population's value at a site is only reported when
        ``siteNonNan()`` for that population is at least this number.
    target : ``"derived"`` uses ``genomics.derivedAllele`` with the LAST
        population as outgroup; any other value uses ``genomics.minorAllele``.
    asCounts : forwarded to ``siteFreqs``; also selects integer output.
    keepNanLines : when false, sites with no data in any population are dropped.

    Notes
    -----
    In count mode missing values are written as 0 (an integer array cannot hold
    NaN; the original attempted to fill one with NaN), and "no data" lines are
    therefore those whose counts are all 0.
    """
    while True:
        windowNumber, window = windowQueue.get()  # retrieve window

        # make alignment objects: all samples together and one per population
        aln = genomics.genoToAlignment(window.seqDict(), sampleData, genoFormat = genoFormat)
        popAlns = {pop: aln.subset(groups=[pop]) for pop in sampleData.popNames}

        # target base at each site, stored as an (aln.l, 1) column
        if target == "derived":
            # use last pop as outgroup
            outAln = popAlns[sampleData.popNames[-1]]
            inAln = aln.subset(groups = sampleData.popNames[:-1])
            baseColumns = np.array([genomics.derivedAllele(inAln.numArray[:,i][inAln.nanMask[:,i]],
                                                           outAln.numArray[:,i][outAln.nanMask[:,i]],
                                                           numeric=True)
                                    for i in range(aln.l)]).reshape([aln.l,1])
        else:
            # otherwise get minor allele
            baseColumns = np.array([genomics.minorAllele(aln.numArray[:,i][aln.nanMask[:,i]])
                                    for i in range(aln.l)]).reshape([aln.l,1])

        # sites for which a target base could be determined
        goodSites = ~np.isnan(baseColumns).any(axis=1)

        # get freqs per pop
        popFreqs = []
        for pop in sampleData.popNames:
            # first find sites with sufficient data
            goodData = popAlns[pop].siteNonNan() >= minData
            sites = np.where(goodSites & goodData)[0]
            baseFreqs = popAlns[pop].siteFreqs(sites, asCounts=asCounts)

            # column of baseFreqs to pick for each retained site
            popColumns = baseColumns[sites,:].astype(int)
            popRows = np.arange(len(sites))[:,np.newaxis]  # broadcasts against popColumns

            # NaN cannot be stored in an int array, so counts use 0 for missing
            targetFreqs = np.zeros([aln.l, popColumns.shape[1]], dtype=int if asCounts else float)
            if not asCounts:
                targetFreqs.fill(np.nan)
            if len(sites) >= 1:
                targetFreqs[sites,:] = baseFreqs[popRows,popColumns]
            popFreqs.append(np.around(targetFreqs, 4))

        allFreqs = np.hstack(popFreqs)

        # decide which sites to write out
        if keepNanLines:
            outSites = np.arange(aln.l)
        elif asCounts:
            outSites = np.where(~(allFreqs == 0).all(axis=1))[0]
        else:
            outSites = np.where(~np.isnan(allFreqs).all(axis=1))[0]

        outArray = np.column_stack(([window.scaffold]*len(outSites),
                                    np.array(window.positions)[outSites].astype(str),
                                    allFreqs[outSites,:].astype(str),))

        resultStrings = ["\t".join(row) for row in outArray]

        resultQueue.put((windowNumber, resultStrings,))
def freqs_wrapper(inQueue, resultQueue, headerLine, genoFormat, sampleData, target, minData, asCounts, threshold, keepNanLines = False):
    """Worker loop: convert queued slices of a geno file into per-population frequency lines.

    Parameters
    ----------
    inQueue : queue yielding ``(sliceNumber, fileSlice)``. ``fileSlice`` is an
        iterable of text lines whose first two whitespace-separated fields are
        written out as the leading columns. ``sliceNumber == -1`` is the stop
        signal: ``(-1, None)`` is put on ``resultQueue`` and the function returns.
    resultQueue : queue that receives ``(sliceNumber, resultStrings)``, where
        ``resultStrings`` is a list of tab-joined output lines.
    headerLine : forwarded to ``genomics.parseGenoFile``.
    genoFormat : forwarded to ``genomics.genoToAlignment``.
    sampleData : object providing ``indNames`` and ``popNames``.
    target : falsy -> report all base values per population, comma-joined;
        ``"derived"`` -> ``genomics.derivedAllele`` with the LAST population as
        outgroup; anything else -> ``genomics.minorAllele``.
    minData : with a target, a population's value at a site is only reported
        when ``siteNonNan()`` for that population is at least this number.
    asCounts : forwarded to ``siteFreqs``; with a target it also selects integer
        output, in which missing values are written as 0.
    threshold : with a target and ``asCounts`` false, a truthy threshold turns
        frequencies into 1 (``>= threshold``) or 0 (``< threshold``).
    keepNanLines : when false (and a target is set), sites with no data in any
        population are dropped.
    """
    while True:
        sliceNumber, fileSlice = inQueue.get()  # retrieve slice

        if sliceNumber == -1:
            resultQueue.put((-1,None,))  # this is the way of telling everything we're done
            break

        window = genomics.parseGenoFile(fileSlice, headerLine, names=sampleData.indNames)

        # make alignment objects: all samples together and one per population
        aln = genomics.genoToAlignment(window.seqDict(), sampleData, genoFormat = genoFormat)
        popAlns = {popName: aln.subset(groups=[popName]) for popName in sampleData.popNames}

        if not target:
            # if there is no target, fetch all base counts
            # NOTE(review): minData is not applied in this mode (the original
            # computed a mask here but never used it) - confirm that is intended.
            popFreqs = []
            for pop in sampleData.popNames:
                baseFreqs = popAlns[pop].siteFreqs(asCounts=asCounts)
                popFreqs.append([",".join(row) for row in baseFreqs.astype(str)])

            allFreqs = np.column_stack(popFreqs)

        else:
            # otherwise define the target base at each site as an (aln.l, 1) column
            if target == "derived":
                # use last pop as outgroup
                outAln = popAlns[sampleData.popNames[-1]]
                inAln = aln.subset(groups = sampleData.popNames[:-1])
                baseColumns = np.array([genomics.derivedAllele(inAln.numArray[:,i][inAln.nanMask[:,i]],
                                                               outAln.numArray[:,i][outAln.nanMask[:,i]],
                                                               numeric=True)
                                        for i in range(aln.l)]).reshape([aln.l,1])
            else:
                # otherwise get minor allele
                baseColumns = np.array([genomics.minorAllele(aln.numArray[:,i][aln.nanMask[:,i]])
                                        for i in range(aln.l)]).reshape([aln.l,1])

            # sites for which a target base could be determined
            goodSites = ~np.isnan(baseColumns).any(axis=1)

            # get freqs per pop
            popFreqs = []
            for pop in sampleData.popNames:
                # first find sites with sufficient data
                goodData = popAlns[pop].siteNonNan() >= minData
                sites = np.where(goodSites & goodData)[0]
                baseFreqs = popAlns[pop].siteFreqs(sites, asCounts=asCounts)

                # column of baseFreqs to pick for each retained site
                popColumns = baseColumns[sites,:].astype(int)
                popRows = np.arange(len(sites))[:,np.newaxis]  # broadcasts against popColumns

                # NaN cannot be stored in an int array, so counts use 0 for missing
                targetFreqs = np.zeros([aln.l, popColumns.shape[1]], dtype=int if asCounts else float)
                if not asCounts:
                    targetFreqs.fill(np.nan)
                if len(sites) >= 1:
                    targetFreqs[sites,:] = baseFreqs[popRows,popColumns]
                popFreqs.append(np.around(targetFreqs, 4))

            allFreqs = np.hstack(popFreqs)

            # only meaningful for numeric frequencies, hence inside the target branch;
            # NaN entries fail both comparisons and stay NaN
            if threshold and not asCounts:
                allFreqs[allFreqs >= threshold] = 1
                allFreqs[allFreqs < threshold] = 0

        # fetch scaffold and position
        scafPos = np.array([line.split(None, 2)[:2] for line in fileSlice], dtype="str")

        # decide which sites to write out; without a target allFreqs holds
        # strings, on which np.isnan raises, so every line is kept in that mode
        if keepNanLines or not target:
            outSites = np.arange(aln.l)
        elif asCounts:
            outSites = np.where(~(allFreqs == 0).all(axis=1))[0]
        else:
            outSites = np.where(~np.isnan(allFreqs).all(axis=1))[0]

        outArray = np.column_stack((scafPos[outSites,:],
                                    allFreqs[outSites,:].astype(str),))

        resultStrings = ["\t".join(row) for row in outArray]

        resultQueue.put((sliceNumber, resultStrings,))