def ABBABABA_wrapper(windowQueue, resultQueue, windType, genoFormat, sampleData, P1, P2, P3, O,
                     minData, minSites, addWindowID=False, stats = ["ABBA","BABA","D","fd","fdM"]):
    """Worker loop: compute ABBA-BABA statistics for every window taken from a queue.

    Each item taken from ``windowQueue`` is a ``(windowNumber, window)`` pair. For
    every window one tuple ``(windowNumber, resultString, isGood)`` is put on
    ``resultQueue``, where ``resultString`` is a comma-separated line holding
    (optionally) the window ID, then scaffold, start, end, mid, sites, sitesUsed
    and one value per entry of ``stats``.

    A window is "good" only when both its length and the ``"sitesUsed"`` value
    reported by ``genomics.ABBABABA`` are at least ``minSites``; otherwise the
    statistics are reported as NaN.

    Parameters:
        windowQueue: object with a blocking ``get()`` returning (number, window).
        resultQueue: object with a ``put()`` method receiving the result tuples.
        windType: "coordinate"/"predefined" report ``window.limits`` as start/end;
            any other value reports the first and last positions in the window.
        genoFormat, sampleData: forwarded to ``genomics.genoToAlignment``.
        P1, P2, P3, O, minData: forwarded to ``genomics.ABBABABA``.
        minSites: minimum number of sites required (see above).
        addWindowID: when true, ``window.ID`` is prepended to the output line.
        stats: names of the statistics to report (never modified here).

    This function never returns; the caller is responsible for stopping the worker.
    """
    while True:
        windowNumber, window = windowQueue.get()  # retrieve window
        if windType == "coordinate" or windType == "predefined":
            start, end = window.limits[0], window.limits[1]
        else:
            start, end = window.firstPos(), window.lastPos()
        scaf, mid, sites = window.scaffold, window.midPos(), window.seqLen()

        # Defaults describe a rejected window; they are overwritten only on success.
        # np.nan is used because the np.NaN alias no longer exists in NumPy >= 2.0.
        isGood = False
        sitesUsed = np.nan
        values = [np.nan] * len(stats)

        if sites >= minSites:
            # make alignment object
            Aln = genomics.genoToAlignment(window.seqDict(), sampleData, genoFormat=genoFormat)
            statsDict = genomics.ABBABABA(Aln, P1, P2, P3, O, minData)
            sitesUsed = statsDict["sitesUsed"]
            if sitesUsed >= minSites:
                isGood = True
                values = [round(statsDict[stat], 4) for stat in stats]

        results = [window.ID] if addWindowID else []
        results += [scaf, start, end, mid, sites, sitesUsed] + values
        resultString = ",".join([str(x) for x in results])
        resultQueue.put((windowNumber, resultString, isGood))
def stats_wrapper(windowQueue, resultQueue, windType, genoFormat, sampleData, minSites, minPerInd,
                  includeSameWithSame, outFormat, roundTo, outputWindowData, addWindowID=False):
    """Worker loop: build a pairwise distance matrix for every window taken from a queue.

    Each item taken from ``windowQueue`` is a ``(windowNumber, window)`` pair. For
    every window one tuple ``(windowNumber, result, isGood)`` is put on
    ``resultQueue``. ``result`` is a dict whose ``"main"`` entry is the formatted
    distance matrix and, when ``outputWindowData`` is true, whose ``"windows"``
    entry is a tab-separated line describing the window.

    A window is rejected (``isGood`` false, matrix filled with NaN) when it has
    fewer than ``minSites`` sites, or when ``minPerInd`` is set and the smallest
    value of ``aln.seqNonNan()`` is below it.

    Parameters:
        windowQueue: object with a blocking ``get()`` returning (number, window).
        resultQueue: object with a ``put()`` method receiving the result tuples.
        windType: "coordinate"/"predefined" report ``window.limits`` as start/end;
            any other value reports the first and last positions in the window.
        genoFormat, sampleData: forwarded to ``genomics.genoToAlignment``;
            ``sampleData.indNames`` defines the row/column order of the matrix.
        minSites, minPerInd: data thresholds (see above).
        includeSameWithSame: forwarded to ``aln.indPairDists``.
        outFormat: "nexus", "phylip" or "raw".
        roundTo: forwarded to the matrix formatting helpers.
        outputWindowData: whether to add the ``"windows"`` entry to the result.
        addWindowID: when true, ``window.ID`` is prepended to the window line.

    Raises:
        ValueError: if ``outFormat`` is not one of the supported formats.

    Apart from that early error this function never returns; the caller is
    responsible for stopping the worker.
    """
    # Reject unsupported formats up front; previously this only surfaced as an
    # UnboundLocalError after the first window had already been processed.
    if outFormat not in ("nexus", "phylip", "raw"):
        raise ValueError("Unsupported outFormat: {!r} (expected 'nexus', 'phylip' or 'raw')".format(outFormat))

    indNames = sampleData.indNames
    nInd = len(indNames)

    while True:
        windowNumber, window = windowQueue.get()  # retrieve window
        if windType == "coordinate" or windType == "predefined":
            start, end = window.limits[0], window.limits[1]
        else:
            start, end = window.firstPos(), window.lastPos()
        scaf, mid, sites = window.scaffold, window.midPos(), window.seqLen()

        distMat = None  # stays None when the window is rejected
        if sites >= minSites:
            # make alignment object
            aln = genomics.genoToAlignment(window.seqDict(), sampleData, genoFormat=genoFormat)
            if not (minPerInd and min(aln.seqNonNan()) < minPerInd):
                pairDistDict = aln.indPairDists(includeSameWithSame=includeSameWithSame)
                distMat = np.zeros([nInd, nInd])
                # fill both triangles (and the diagonal) from the nested dict
                for i, j in itertools.combinations_with_replacement(range(nInd), 2):
                    distMat[i, j] = distMat[j, i] = pairDistDict[indNames[i]][indNames[j]]

        isGood = distMat is not None
        if not isGood:
            # np.nan: the np.NaN alias no longer exists in NumPy >= 2.0
            distMat = np.full([nInd, nInd], np.nan)

        if outFormat == "nexus":
            distMatString = genomics.makeDistMatNexusString(distMat, names=indNames, roundTo=roundTo)
        elif outFormat == "phylip":
            distMatString = genomics.makeDistMatPhylipString(distMat, names=indNames, roundTo=roundTo)
        else:  # "raw"
            distMatString = genomics.makeDistMatString(distMat, roundTo=roundTo) + "\n"

        result = {"main": distMatString}

        if outputWindowData:
            windowData = [window.ID] if addWindowID else []
            windowData += [scaf, start, end, mid, sites]
            result["windows"] = "\t".join([str(x) for x in windowData]) + "\n"

        resultQueue.put((windowNumber, result, isGood))
def freqs_wrapper(windowQueue, resultQueue, genoFormat, sampleData):
    """Worker loop: emit per-population base counts for every site of each window.

    Each item taken from ``windowQueue`` is a ``(windowNumber, window)`` pair. For
    every window a tuple ``(windowNumber, resultStrings)`` is put on
    ``resultQueue``; ``resultStrings`` holds one newline-terminated, tab-separated
    line per site: scaffold, position, then one comma-joined field of counts per
    population in ``sampleData.popNames`` order.

    This function never returns; the caller is responsible for stopping the worker.
    """
    while True:
        windowNumber, window = windowQueue.get()  # retrieve window

        # one alignment for the whole window, plus one sub-alignment per population
        aln = genomics.genoToAlignment(window.seqDict(), sampleData, genoFormat=genoFormat)
        popNames = sampleData.popNames
        popAlns = {pop: aln.subset(groups=[pop]) for pop in popNames}

        # per population: one "a,b,c,..." string per site
        popCounts = []
        for pop in popNames:
            countStrings = popAlns[pop].siteFreqs(asCounts=True).astype(str)
            popCounts.append([",".join(siteCounts) for siteCounts in countStrings])
        allCounts = np.vstack(popCounts)

        # stack as rows (scaffold, position, one row per population), then flip to one row per site
        stacked = np.vstack(([window.scaffold] * aln.l, np.array(window.positions), allCounts))
        outArray = stacked.T

        resultStrings = ["\t".join(siteRow) + "\n" for siteRow in outArray]
        resultQueue.put((windowNumber, resultStrings,))
def stats_wrapper(windowQueue, resultQueue, windType, genoFormat, sampleData, minSites, analysis,
                  stats, addWindowID=False, roundTo=4):
    """Worker loop: compute the requested population statistics for every window.

    Each item taken from ``windowQueue`` is a ``(windowNumber, window)`` pair. For
    every window one tuple ``(windowNumber, resultString, isGood)`` is put on
    ``resultQueue``, where ``resultString`` is a comma-separated line holding
    (optionally) the window ID, then scaffold, start, end, mid, sites and one
    value per entry of ``stats``. Windows with fewer than ``minSites`` sites are
    flagged as not good and report NaN for every statistic.

    Parameters:
        windowQueue: object with a blocking ``get()`` returning (number, window).
        resultQueue: object with a ``put()`` method receiving the result tuples.
        windType: "coordinate"/"predefined" report ``window.limits`` as start/end;
            any other value reports the first and last positions in the window.
        genoFormat, sampleData: forwarded to ``genomics.genoToAlignment``.
        minSites: minimum window length for the statistics to be computed.
        analysis: container of analysis names; the names checked here are
            "popFreq", "popDist", "popPairDist", "indPairDist" and "indHet".
        stats: names (keys of the combined statistics dict) to report, in order.
        addWindowID: when true, ``window.ID`` is prepended to the output line.
        roundTo: number of decimals passed to ``round`` for each statistic.

    This function never returns; the caller is responsible for stopping the worker.
    """
    while True:
        windowNumber, window = windowQueue.get()  # retrieve window
        if windType == "coordinate" or windType == "predefined":
            start, end = window.limits[0], window.limits[1]
        else:
            start, end = window.firstPos(), window.lastPos()
        scaf, mid, sites = window.scaffold, window.midPos(), window.seqLen()

        isGood = sites >= minSites
        if isGood:
            # make alignment object
            Aln = genomics.genoToAlignment(window.seqDict(), sampleData, genoFormat=genoFormat)
            statsDict = {}
            if "popFreq" in analysis:
                statsDict.update(Aln.groupFreqStats())
            if "popDist" in analysis or "popPairDist" in analysis:
                statsDict.update(Aln.groupDistStats(doPairs="popPairDist" in analysis))
            if "indPairDist" in analysis:
                pairDistDict = Aln.indPairDists()
                # one "d_<i>_<j>" entry per unordered pair, including i == j
                for i, j in itertools.combinations_with_replacement(sorted(pairDistDict.keys()), 2):
                    statsDict["_".join(["d", i, j])] = pairDistDict[i][j]
            if "indHet" in analysis:
                for key, het in Aln.sampleHet().items():
                    statsDict["het_" + key] = het
            values = [round(statsDict[stat], roundTo) for stat in stats]
        else:
            # np.nan: the np.NaN alias no longer exists in NumPy >= 2.0
            values = [np.nan] * len(stats)

        results = [window.ID] if addWindowID else []
        results += [scaf, start, end, mid, sites] + values
        resultString = ",".join([str(x) for x in results])
        resultQueue.put((windowNumber, resultString, isGood))
def freqs_wrapper(windowQueue, resultQueue, genoFormat, sampleData, minData, target, asCounts, keepNanLines = False):
    """Worker loop: emit the per-population frequency (or count) of a target allele.

    Each item taken from ``windowQueue`` is a ``(windowNumber, window)`` pair. For
    every window a tuple ``(windowNumber, resultStrings)`` is put on
    ``resultQueue``; ``resultStrings`` holds one tab-separated line per reported
    site: scaffold, position, then one value per population in
    ``sampleData.popNames`` order.

    Parameters:
        windowQueue: object with a blocking ``get()`` returning (number, window).
        resultQueue: object with a ``put()`` method receiving the result tuples.
        genoFormat, sampleData: forwarded to ``genomics.genoToAlignment``.
        minData: a population's value at a site is only filled in when
            ``siteNonNan()`` for that population is at least this value.
        target: "derived" picks the target base with ``genomics.derivedAllele``,
            using the last population as outgroup; any other value uses
            ``genomics.minorAllele`` on the whole alignment.
        asCounts: forwarded to ``siteFreqs``; also selects integer output.
        keepNanLines: when false, sites with no value in any population are
            dropped (all-NaN rows for frequencies, all-zero rows for counts).

    This function never returns; the caller is responsible for stopping the worker.
    """
    while True:
        windowNumber, window = windowQueue.get()  # retrieve window

        # make alignment objects
        aln = genomics.genoToAlignment(window.seqDict(), sampleData, genoFormat=genoFormat)
        popAlns = {pop: aln.subset(groups=[pop]) for pop in sampleData.popNames}

        # target base at each site (range/lambda forms below are valid on Python 2 and 3)
        if target == "derived":
            # use last pop as outgroup
            outAln = popAlns[sampleData.popNames[-1]]
            inAln = aln.subset(groups=sampleData.popNames[:-1])
            targetBases = [genomics.derivedAllele(inAln.numArray[:, i][inAln.nanMask[:, i]],
                                                  outAln.numArray[:, i][outAln.nanMask[:, i]],
                                                  numeric=True)
                           for i in range(aln.l)]
        else:
            # otherwise get minor allele
            targetBases = [genomics.minorAllele(aln.numArray[:, i][aln.nanMask[:, i]])
                           for i in range(aln.l)]
        baseColumns = np.array(targetBases).reshape([aln.l, 1])

        # sites for which a target base could be determined
        goodSites = ~np.isnan(baseColumns).any(axis=1)

        # get freqs per pop
        popFreqs = []
        for pop in sampleData.popNames:
            # first find sites with sufficient data
            goodData = popAlns[pop].siteNonNan() >= minData
            sites = np.where(goodSites & goodData)[0]
            baseFreqs = popAlns[pop].siteFreqs(sites, asCounts=asCounts)
            popColumns = baseColumns[sites, :].astype(int)
            popRows = np.repeat(np.arange(len(sites))[:, np.newaxis], popColumns.shape[1], axis=1)
            # An integer array cannot hold NaN (fill(np.nan) raises or stores garbage),
            # so counts default to 0 and only float frequencies default to NaN.
            targetFreqs = np.zeros([aln.l, popColumns.shape[1]], dtype=int if asCounts else float)
            if not asCounts:
                targetFreqs.fill(np.nan)
            if len(sites) >= 1:
                targetFreqs[sites, :] = baseFreqs[popRows, popColumns]
            popFreqs.append(np.around(targetFreqs, 4))

        allFreqs = np.hstack(popFreqs)

        if keepNanLines:
            outSites = np.arange(aln.l)
        elif asCounts:
            # counts have no NaN marker: an all-zero row is treated as "no data"
            outSites = np.where(~np.all(allFreqs == 0, axis=1))[0]
        else:
            outSites = np.where(~np.all(np.isnan(allFreqs), axis=1))[0]

        outArray = np.column_stack(([window.scaffold] * len(outSites),
                                    np.array(window.positions)[outSites].astype(str),
                                    allFreqs[outSites, :].astype(str),))

        resultStrings = ["\t".join(row) for row in outArray]

        resultQueue.put((windowNumber, resultStrings,))
def raxml_wrapper(windowQueue, resultQueue, windType, model, outgroup, raxml, minSites, minPerInd,
                  minSNPs=None, test=False):
    """Worker loop: infer a tree with ``raxTree`` for every window taken from a queue.

    Each item taken from ``windowQueue`` is a ``(windowNumber, window)`` pair. For
    every window a tuple ``(windowNumber, dataLine, tree)`` is put on
    ``resultQueue``. ``dataLine`` is the tab-separated scaffold, start, end, mid
    and number of sites; ``tree`` is whatever ``raxTree`` returns, or ``"NA\\n"``
    when the window does not pass the filters.

    A tree is only inferred when the window has at least ``minSites`` sites, the
    smallest value of ``aln.seqNonNan()`` is at least ``minPerInd`` and, if
    ``minSNPs`` is given, ``aln.varSites`` restricted to the samples that are not
    in ``outgroup`` returns at least ``minSNPs`` entries.

    Parameters:
        windowQueue: object with a blocking ``get()`` returning (number, window).
        resultQueue: object with a ``put()`` method receiving the result tuples.
        windType: "coordinate"/"predefined" report ``window.limits`` as start/end;
            any other value reports the first and last positions in the window.
        model, raxml, outgroup: forwarded to ``raxTree``; ``outgroup`` is also
            used as a container of sample names for the SNP filter.
        minSites, minPerInd, minSNPs: data thresholds (see above).
        test: forwarded to ``raxTree``; also enables the progress message.

    Relies on the module-level names ``verbose``, ``log`` and ``raxTree``.
    This function never returns; the caller is responsible for stopping the worker.
    """
    while True:
        windowNumber, window = windowQueue.get()
        Nsites = window.seqLen()
        if test or verbose:
            # sys.stderr.write instead of the Python-2-only "print >>" statement form
            sys.stderr.write("Window {} received for analysis, length: {}\n".format(windowNumber, Nsites))

        if windType == "coordinate" or windType == "predefined":
            start, end = window.limits[0], window.limits[1]
        else:
            start, end = window.firstPos(), window.lastPos()
        scaf, mid = window.scaffold, window.midPos()

        data = [window.scaffold, str(start), str(end), str(mid), str(Nsites)]
        prefix = scaf + "_" + str(start) + "_" + str(end) + "_"

        tree = "NA\n"  # reported unless every filter below passes
        if Nsites >= minSites:
            aln = genomics.genoToAlignment(window.seqDict(), genoFormat="phased")
            sitesPerInd = aln.seqNonNan()
            if min(sitesPerInd) >= minPerInd:
                enoughSNPs = True
                if minSNPs is not None:
                    ingroupIdx = np.array([i for i in range(aln.N)
                                           if aln.sampleNames[i] not in outgroup])
                    enoughSNPs = len(aln.varSites(indices=ingroupIdx)) >= minSNPs
                if enoughSNPs:
                    tree = raxTree(aln.array, aln.names, model, raxml, outgroup, prefix,
                                   test=test, log=log)

        resultQueue.put((windowNumber, "\t".join(data), tree))
def phyml_wrapper(windowQueue, resultQueue, windType, model, opt, outgroup, phyml, minSites, minPerInd,
                  minSNPs=None, maxLDphase=False, bootstraps=0, crossVal=False, test = False):
    """Worker loop: infer a tree with ``phymlTree`` for every window taken from a queue.

    Each item taken from ``windowQueue`` is a ``(windowNumber, window)`` pair. For
    every window a tuple ``(windowNumber, output)`` is put on ``resultQueue``.
    ``output`` is a tuple whose first element is the tab-separated scaffold,
    start, end, mid, number of sites and lnL (plus the cross-validation lnL when
    ``crossVal`` is set), followed by the main tree and one tree per bootstrap
    replicate. Windows that fail the filters report ``"NA"`` for lnL and trees.

    A tree is only inferred when the window has at least ``minSites`` sites, the
    smallest value of ``aln.seqNonNan()`` is at least ``minPerInd`` and, if
    ``minSNPs`` is given, ``aln.varSites`` restricted to the samples that are not
    in ``outgroup`` returns at least ``minSNPs`` entries.

    Parameters:
        windowQueue: object with a blocking ``get()`` returning (number, window).
        resultQueue: object with a ``put()`` method receiving the result tuples.
        windType: "coordinate"/"predefined" report ``window.limits`` as start/end;
            any other value reports the first and last positions in the window.
        model, opt, phyml: forwarded to ``phymlTree`` / ``phymlCrossVal``.
        outgroup: container of sample names excluded from the SNP filter.
        minSites, minPerInd, minSNPs: data thresholds (see above).
        maxLDphase: when true the alignment is passed through ``genomics.maxLDphase``.
        bootstraps: number of bootstrap trees built from columns resampled with
            replacement.
        crossVal: when true the two halves of the alignment are passed to
            ``phymlCrossVal`` and its result is added to the data line.
        test: forwarded to the tree helpers; also enables the progress message.

    Relies on the module-level names ``verbose``, ``log``, ``tmpDir``,
    ``phymlTree`` and ``phymlCrossVal``.
    This function never returns; the caller is responsible for stopping the worker.
    """
    while True:
        windowNumber, window = windowQueue.get()
        Nsites = window.seqLen()
        if test or verbose:
            # sys.stderr.write instead of the Python-2-only "print >>" statement form
            sys.stderr.write("Window {} received for analysis, length: {}\n".format(windowNumber, Nsites))

        if windType == "coordinate" or windType == "predefined":
            start, end = window.limits[0], window.limits[1]
        else:
            start, end = window.firstPos(), window.lastPos()
        scaf, mid = window.scaffold, window.midPos()

        prefix = scaf + "_" + str(start) + "_" + str(end) + "_"

        # Defaults describe a skipped window; overwritten only when a tree is inferred.
        trees = ["NA"] + ["NA"] * bootstraps
        lnL = cvlnL = "NA"

        if Nsites >= minSites:
            aln = genomics.genoToAlignment(window.seqDict(), genoFormat="phased")
            # NOTE(review): the previous code looped over aln.names doing `seqName += "*"`
            # for outgroup members. That only rebinds the loop variable and never changed
            # aln.names, so it has been removed. If outgroup taxa are meant to be tagged
            # with "*", aln.names itself must be rewritten - confirm the intent first.
            sitesPerInd = aln.seqNonNan()
            if min(sitesPerInd) >= minPerInd:
                enoughSNPs = True
                if minSNPs is not None:
                    ingroupIdx = np.array([i for i in range(aln.N)
                                           if aln.sampleNames[i] not in outgroup])
                    enoughSNPs = len(aln.varSites(indices=ingroupIdx)) >= minSNPs
                if enoughSNPs:
                    if maxLDphase:
                        aln = genomics.maxLDphase(aln)
                    # if enough sites get tree
                    tree, lnL = phymlTree(aln.array, aln.names, model, opt, phyml, prefix,
                                          tmpDir=tmpDir, test=test, log=log)
                    bsTrees = []
                    for b in range(bootstraps):
                        # get bootstrap trees if necessary: resample columns with replacement
                        positions = np.random.choice(range(Nsites), Nsites, replace=True)
                        newArr = aln.array[:, positions]
                        bsTree, bslnL = phymlTree(newArr, aln.names, model, opt, phyml,
                                                  prefix + str(b) + "_",
                                                  tmpDir=tmpDir, test=test, log=log)
                        bsTrees.append(bsTree)
                    trees = [tree] + bsTrees
                    if crossVal:
                        # Floor division gives the same split point on Python 2 and 3.
                        # Uses aln.array / aln.names like the calls above (the old code
                        # referenced aln.arr and an undefined indNames here).
                        half = Nsites // 2
                        arr0 = aln.array[:, :half]
                        arr1 = aln.array[:, half:Nsites]
                        cvlnL = phymlCrossVal(arr0, arr1, aln.names, model, opt, phyml, prefix,
                                              tmpDir=tmpDir, test=test, log=log)

        data = [window.scaffold, str(start), str(end), str(mid), str(Nsites), str(lnL)]
        if crossVal:
            # str() so that a numeric likelihood does not break the join below
            data.append(str(cvlnL))

        output = ["\t".join(data)] + trees
        resultQueue.put((windowNumber, tuple(output),))
def stats_wrapper(windowQueue, resultQueue, windType, genoFormat, sampleData, minSites, stats,
                  doPops, skipPairs, indHet, addWindowID=False):
    """Worker loop: compute diversity statistics for every window taken from a queue.

    Each item taken from ``windowQueue`` is a ``(windowNumber, window)`` pair. For
    every window one tuple ``(windowNumber, resultString, isGood)`` is put on
    ``resultQueue``, where ``resultString`` is a comma-separated line holding
    (optionally) the window ID, then scaffold, start, end, mid, sites and one
    value per entry of ``stats`` (rounded to 4 decimals). Windows with fewer than
    ``minSites`` sites are flagged as not good and report NaN for every statistic.

    Parameters:
        windowQueue: object with a blocking ``get()`` returning (number, window).
        resultQueue: object with a ``put()`` method receiving the result tuples.
        windType: "coordinate"/"predefined" report ``window.limits`` as start/end;
            any other value reports the first and last positions in the window.
        genoFormat, sampleData: forwarded to ``genomics.genoToAlignment``.
        minSites: minimum window length for the statistics to be computed.
        stats: names (keys of the combined statistics dict) to report, in order.
        doPops: when true, the output of ``genomics.popDiv`` is included.
        skipPairs: when true, ``popDiv`` is called with ``doPairs=False``.
        indHet: when true, ``Aln.sampleHet()`` values are included under
            ``"het_" + <key>``.
        addWindowID: when true, ``window.ID`` is prepended to the output line.

    This function never returns; the caller is responsible for stopping the worker.
    """
    while True:
        windowNumber, window = windowQueue.get()  # retrieve window
        if windType == "coordinate" or windType == "predefined":
            start, end = window.limits[0], window.limits[1]
        else:
            start, end = window.firstPos(), window.lastPos()
        scaf, mid, sites = window.scaffold, window.midPos(), window.seqLen()

        isGood = sites >= minSites
        if isGood:
            # make alignment object
            Aln = genomics.genoToAlignment(window.seqDict(), sampleData, genoFormat=genoFormat)
            statsDict = {}
            if doPops:
                statsDict.update(genomics.popDiv(Aln, doPairs=not skipPairs))
            if indHet:
                # Build the prefixed keys in a new dict: renaming keys of hetDict while
                # iterating over hetDict.keys() is unsafe on Python 3 (keys() is a live view).
                hetDict = Aln.sampleHet()
                statsDict.update({"het_" + key: het for key, het in hetDict.items()})
            values = [round(statsDict[stat], 4) for stat in stats]
        else:
            # np.nan: the np.NaN alias no longer exists in NumPy >= 2.0
            values = [np.nan] * len(stats)

        results = [window.ID] if addWindowID else []
        results += [scaf, start, end, mid, sites] + values
        resultString = ",".join([str(x) for x in results])
        resultQueue.put((windowNumber, resultString, isGood))
def freqs_wrapper(inQueue, resultQueue, headerLine, genoFormat, sampleData, target, minData, asCounts,
                  threshold, keepNanLines = False):
    """Worker loop: emit per-population allele frequencies (or counts) for file slices.

    Each item taken from ``inQueue`` is a ``(sliceNumber, fileSlice)`` pair, where
    ``fileSlice`` is a sequence of text lines whose first two whitespace-separated
    fields are reported as scaffold and position. A ``sliceNumber`` of -1 is the
    stop signal: ``(-1, None)`` is forwarded to ``resultQueue`` and the function
    returns.

    For every other slice a tuple ``(sliceNumber, resultStrings)`` is put on
    ``resultQueue``; ``resultStrings`` holds one tab-separated line per reported
    site: scaffold, position, then one field per population in
    ``sampleData.popNames`` order.

    Parameters:
        inQueue: object with a blocking ``get()`` returning (number, lines).
        resultQueue: object with a ``put()`` method receiving the result tuples.
        headerLine: forwarded to ``genomics.parseGenoFile``.
        genoFormat, sampleData: forwarded to ``genomics.genoToAlignment``.
        target: falsy reports all base frequencies/counts per population as a
            comma-joined field; "derived" reports the allele chosen by
            ``genomics.derivedAllele`` (last population used as outgroup); any
            other value reports the allele chosen by ``genomics.minorAllele``.
        minData: with a target, a population's value at a site is only filled in
            when ``siteNonNan()`` for that population is at least this value.
        asCounts: forwarded to ``siteFreqs``; also selects integer output.
        threshold: with a target and ``asCounts`` false, frequencies >= threshold
            become 1 and frequencies < threshold become 0.
        keepNanLines: with a target, when false, sites with no value in any
            population are dropped (all-NaN rows, or all-zero rows for counts).
    """
    while True:
        sliceNumber, fileSlice = inQueue.get()  # retrieve slice

        if sliceNumber == -1:
            resultQueue.put((-1, None,))  # this is the way of telling everything we're done
            break

        window = genomics.parseGenoFile(fileSlice, headerLine, names=sampleData.indNames)

        # make alignment objects
        aln = genomics.genoToAlignment(window.seqDict(), sampleData, genoFormat=genoFormat)
        popAlns = {popName: aln.subset(groups=[popName]) for popName in sampleData.popNames}

        if not target:
            # if there is no target, fetch all base counts
            # NOTE(review): minData is not applied in this mode (the old code computed a
            # minData mask here but never used it) - confirm whether it should be.
            popFreqs = []
            for pop in sampleData.popNames:
                baseFreqs = popAlns[pop].siteFreqs(asCounts=asCounts)
                popFreqs.append([",".join(row) for row in baseFreqs.astype(str)])
            allFreqs = np.column_stack(popFreqs)
            # allFreqs is a string array here, so the numeric thresholding and the
            # NaN / zero line filters below cannot apply; every site is reported.
            outSites = np.arange(aln.l)
        else:
            # otherwise define the target base at each site
            if target == "derived":
                # use last pop as outgroup
                outAln = popAlns[sampleData.popNames[-1]]
                inAln = aln.subset(groups=sampleData.popNames[:-1])
                targetBases = [genomics.derivedAllele(inAln.numArray[:, i][inAln.nanMask[:, i]],
                                                      outAln.numArray[:, i][outAln.nanMask[:, i]],
                                                      numeric=True)
                               for i in range(aln.l)]
            else:
                # otherwise get minor allele (range, not xrange, so this runs on Python 3)
                targetBases = [genomics.minorAllele(aln.numArray[:, i][aln.nanMask[:, i]])
                               for i in range(aln.l)]
            baseColumns = np.array(targetBases).reshape([aln.l, 1])

            # sites for which a target base could be determined
            goodSites = ~np.isnan(baseColumns).any(axis=1)

            # get freqs per pop
            popFreqs = []
            for pop in sampleData.popNames:
                # first find sites with sufficient data
                goodData = popAlns[pop].siteNonNan() >= minData
                sites = np.where(goodSites & goodData)[0]
                baseFreqs = popAlns[pop].siteFreqs(sites, asCounts=asCounts)
                popColumns = baseColumns[sites, :].astype(int)
                popRows = np.repeat(np.arange(len(sites))[:, np.newaxis], popColumns.shape[1], axis=1)
                # integer counts cannot hold NaN, so they default to 0
                targetFreqs = np.zeros([aln.l, popColumns.shape[1]], dtype=int if asCounts else float)
                if not asCounts:
                    targetFreqs.fill(np.nan)
                if len(sites) >= 1:
                    targetFreqs[sites, :] = baseFreqs[popRows, popColumns]
                popFreqs.append(np.around(targetFreqs, 4))

            allFreqs = np.hstack(popFreqs)

            if threshold and not asCounts:
                allFreqs[allFreqs >= threshold] = 1
                allFreqs[allFreqs < threshold] = 0

            if keepNanLines:
                outSites = np.arange(aln.l)
            elif asCounts:
                # counts have no NaN marker: an all-zero row is treated as "no data"
                outSites = np.where(~np.all(allFreqs == 0, axis=1))[0]
            else:
                outSites = np.where(~np.all(np.isnan(allFreqs), axis=1))[0]

        # fetch scaffold and position; reshape keeps a 2-column shape for an empty slice
        scafPos = np.array([line.split(None, 2)[:2] for line in fileSlice], dtype="str").reshape(-1, 2)

        outArray = np.column_stack((scafPos[outSites, :],
                                    allFreqs[outSites, :].astype(str),))

        resultStrings = ["\t".join(row) for row in outArray]

        resultQueue.put((sliceNumber, resultStrings,))