Ejemplo n.º 1
0
def freqs_wrapper(windowQueue, resultQueue, genoFormat, sampleData, minData, target, asCounts, keepNanLines = False):
    """Worker loop: convert queued windows into per-population target-allele frequency lines.

    Each iteration takes a ``(windowNumber, window)`` pair from *windowQueue*,
    builds an alignment from the window, picks one target base per site,
    looks up that base's frequency (or count) in every population and puts
    ``(windowNumber, resultStrings)`` on *resultQueue*. Each result string is
    tab-separated: scaffold, position, then one value per population.

    The loop has no exit condition of its own; the caller is responsible for
    terminating the worker.

    Args:
        windowQueue: queue-like object whose ``get()`` returns
            ``(windowNumber, window)``.
        resultQueue: queue-like object receiving ``(windowNumber, lines)``.
        genoFormat: passed through to ``genomics.genoToAlignment``.
        sampleData: object providing ``popNames``; also passed to
            ``genomics.genoToAlignment``.
        minData: a population only reports a site when its ``siteNonNan()``
            value for that site is at least this.
        target: ``"derived"`` uses ``genomics.derivedAllele`` with the last
            population as outgroup; any other value uses
            ``genomics.minorAllele``.
        asCounts: passed to ``siteFreqs``; also selects an integer output
            array whose "no value" marker is 0 instead of NaN.
        keepNanLines: when false, sites for which every population holds the
            "no value" marker are dropped from the output.
    """
    while True:

        windowNumber,window = windowQueue.get() # retrieve window

        #make alignment objects (one for everything, one per population)
        aln = genomics.genoToAlignment(window.seqDict(), sampleData, genoFormat = genoFormat)
        popAlns = dict([(popName, aln.subset(groups=[popName])) for popName in sampleData.popNames])

        #target base at each site, as an (aln.l, 1) column
        if target == "derived":
            #use last pop as outgroup
            outgroup = sampleData.popNames[-1]
            outAln = popAlns[outgroup]
            inAln = aln.subset(groups = sampleData.popNames[:-1])
            baseColumns = np.array([genomics.derivedAllele(inAln.numArray[:,i][inAln.nanMask[:,i]],
                                                           outAln.numArray[:,i][outAln.nanMask[:,i]],
                                                           numeric=True)
                                    for i in range(aln.l)]).reshape([aln.l,1])
        else:
            #otherwise get minor allele.
            baseColumns = np.array([genomics.minorAllele(aln.numArray[:,i][aln.nanMask[:,i]])
                                    for i in range(aln.l)]).reshape([aln.l,1])

        #sites where a target base could be determined
        goodSites = ~np.isnan(baseColumns).any(axis=1)

        #get freqs per pop
        popFreqs = []
        for pop in sampleData.popNames:
            #first find sites with sufficient data
            goodData = popAlns[pop].siteNonNan() >= minData
            sites = np.where(goodSites & goodData)[0]
            baseFreqs = popAlns[pop].siteFreqs(sites, asCounts=asCounts)
            popColumns = baseColumns[sites,:].astype(int)
            popRows = np.repeat(np.arange(len(sites))[:,np.newaxis],popColumns.shape[1], axis = 1)
            #an integer array cannot hold NaN, so counts use 0 as the
            #"no value" marker and only float frequencies are NaN-filled
            targetFreqs = np.zeros([aln.l, popColumns.shape[1]], dtype=int if asCounts else float)
            if not asCounts: targetFreqs.fill(np.nan)
            #pick, for each usable site, the column of baseFreqs given by its target base
            if len(sites) >= 1: targetFreqs[sites,:] = baseFreqs[popRows,popColumns]
            popFreqs.append(np.around(targetFreqs, 4))

        allFreqs = np.hstack(popFreqs)

        if keepNanLines:
            outSites = np.arange(aln.l)
        elif asCounts:
            #counts: a site is empty when every population still holds 0
            outSites = np.where(~np.all(allFreqs == 0, axis=1))[0]
        else:
            #frequencies: a site is empty when every population is NaN
            outSites = np.where(~np.all(np.isnan(allFreqs), axis=1))[0]

        outArray = np.column_stack(([window.scaffold]*len(outSites),
                                    np.array(window.positions)[outSites].astype(str),
                                    allFreqs[outSites,:].astype(str),))

        resultStrings = ["\t".join(row) for row in outArray]

        resultQueue.put((windowNumber, resultStrings,))
Ejemplo n.º 2
0
def freqs_wrapper(inQueue, resultQueue, headerLine, genoFormat, sampleData, target, minData, asCounts, threshold, keepNanLines = False):
    """Worker loop: convert queued file slices into per-population frequency lines.

    Each iteration takes a ``(sliceNumber, fileSlice)`` pair from *inQueue*.
    A ``sliceNumber`` of -1 is the shutdown signal: ``(-1, None)`` is put on
    *resultQueue* and the function returns. Otherwise the slice is parsed
    with ``genomics.parseGenoFile``, frequencies are computed, and
    ``(sliceNumber, resultStrings)`` is put on *resultQueue*. Each result
    string is tab-separated: the first two whitespace-separated fields of the
    input line, then one value per population.

    Args:
        inQueue: queue-like object whose ``get()`` returns
            ``(sliceNumber, fileSlice)``; ``fileSlice`` is iterated as lines.
        resultQueue: queue-like object receiving ``(sliceNumber, lines)``.
        headerLine: passed through to ``genomics.parseGenoFile``.
        genoFormat: passed through to ``genomics.genoToAlignment``.
        sampleData: object providing ``indNames`` and ``popNames``.
        target: falsy reports all base frequencies per population as one
            comma-joined field; ``"derived"`` uses ``genomics.derivedAllele``
            with the last population as outgroup; any other truthy value uses
            ``genomics.minorAllele``.
        minData: with a target, a population only reports a site when its
            ``siteNonNan()`` value for that site is at least this.
        asCounts: passed to ``siteFreqs``; also selects an integer output
            array whose "no value" marker is 0 instead of NaN.
        threshold: when truthy, a target is set and *asCounts* is false,
            frequencies >= threshold become 1 and those below become 0.
        keepNanLines: when false, sites for which every population holds the
            "no value" marker are dropped from the output.
    """
    while True:

        sliceNumber,fileSlice = inQueue.get() # retrieve slice

        if sliceNumber == -1:
            resultQueue.put((-1,None,)) # this is the way of telling everything we're done
            break

        window = genomics.parseGenoFile(fileSlice, headerLine, names=sampleData.indNames)

        #make alignment objects (one for everything, one per population)
        aln = genomics.genoToAlignment(window.seqDict(), sampleData, genoFormat = genoFormat)
        popAlns = dict([(popName, aln.subset(groups=[popName])) for popName in sampleData.popNames])

        if not target:
            #if there is no target, fetch all base counts
            #NOTE(review): minData is not applied in this branch; the original
            #computed a minData site mask here but never used it - confirm intent
            numericFreqs = [popAlns[pop].siteFreqs(asCounts=asCounts) for pop in sampleData.popNames]
            #one comma-joined string per site and population
            allFreqs = np.column_stack([[",".join(row) for row in baseFreqs.astype(str)]
                                        for baseFreqs in numericFreqs])

        else:
            #otherwise define the target base at each site, as an (aln.l, 1) column
            if target == "derived":
                #use last pop as outgroup
                outgroup = sampleData.popNames[-1]
                outAln = popAlns[outgroup]
                inAln = aln.subset(groups = sampleData.popNames[:-1])
                baseColumns = np.array([genomics.derivedAllele(inAln.numArray[:,i][inAln.nanMask[:,i]],
                                                               outAln.numArray[:,i][outAln.nanMask[:,i]],
                                                               numeric=True)
                                        for i in range(aln.l)]).reshape([aln.l,1])
            else:
                #otherwise get minor allele.
                baseColumns = np.array([genomics.minorAllele(aln.numArray[:,i][aln.nanMask[:,i]])
                                        for i in range(aln.l)]).reshape([aln.l,1])

            #sites where a target base could be determined
            goodSites = ~np.isnan(baseColumns).any(axis=1)

            #get freqs per pop
            popFreqs = []
            for pop in sampleData.popNames:
                #first find sites with sufficient data
                goodData = popAlns[pop].siteNonNan() >= minData
                sites = np.where(goodSites & goodData)[0]
                baseFreqs = popAlns[pop].siteFreqs(sites, asCounts=asCounts)
                popColumns = baseColumns[sites,:].astype(int)
                popRows = np.repeat(np.arange(len(sites))[:,np.newaxis],popColumns.shape[1], axis = 1)
                #an integer array cannot hold NaN, so counts keep 0 as the
                #"no value" marker and only float frequencies are NaN-filled
                targetFreqs = np.zeros([aln.l, popColumns.shape[1]], dtype=int if asCounts else float)
                if not asCounts: targetFreqs.fill(np.nan)
                #pick, for each usable site, the column of baseFreqs given by its target base
                if len(sites) >= 1: targetFreqs[sites,:] = baseFreqs[popRows,popColumns]
                popFreqs.append(np.around(targetFreqs, 4))

            allFreqs = np.hstack(popFreqs)

            if threshold and not asCounts:
                #NaN entries fail both comparisons and therefore stay NaN
                allFreqs[allFreqs >= threshold] = 1
                allFreqs[allFreqs < threshold] = 0

            numericFreqs = [allFreqs]

        #fetch scaffold and position
        scafPos = np.array([line.split(None, 2)[:2] for line in fileSlice], dtype="str")

        if keepNanLines:
            outSites = np.arange(aln.l)
        else:
            #decide emptiness on the numeric arrays: in the no-target branch
            #allFreqs holds strings, which cannot be tested with isnan or ==0
            if asCounts:
                popMissing = [np.all(freqs == 0, axis=1) for freqs in numericFreqs]
            else:
                popMissing = [np.all(np.isnan(freqs), axis=1) for freqs in numericFreqs]
            #drop a site only when it is empty in every population
            outSites = np.where(~np.all(popMissing, axis=0))[0]

        outArray = np.column_stack((scafPos[outSites,:],
                                    allFreqs[outSites,:].astype(str),))

        resultStrings = ["\t".join(row) for row in outArray]

        resultQueue.put((sliceNumber, resultStrings,))