Python parseGenoFile Examples, genomics.parseGenoFile Python Examples

Example #1

0

Show file

File: freq.py Project: xgdong/genomics_general

def freqs_wrapper(inQueue, resultQueue, headerLine, genoFormat, sampleData, target, minData, asCounts, threshold, keepNanLines = False):
    """Worker loop that turns slices of a geno file into per-population frequency lines.

    Repeatedly takes ``(sliceNumber, fileSlice)`` tuples from ``inQueue``,
    computes allele/base frequencies for every population in
    ``sampleData.popNames`` and puts ``(sliceNumber, resultStrings)`` on
    ``resultQueue``, where ``resultStrings`` is a list of tab-joined rows
    (the first two whitespace-separated fields of each input line followed by
    the frequency columns).

    A ``sliceNumber`` of ``-1`` is the shutdown sentinel: ``(-1, None)`` is
    put on ``resultQueue`` and the loop ends.

    Args:
        inQueue: queue yielding ``(sliceNumber, fileSlice)`` tuples;
            ``fileSlice`` is iterated as lines of the geno file.
        resultQueue: queue receiving ``(sliceNumber, resultStrings)`` tuples.
        headerLine: passed through to ``genomics.parseGenoFile``.
        genoFormat: passed through to ``genomics.genoToAlignment``.
        sampleData: object providing ``indNames`` and ``popNames``.
        target: falsy to report all base frequencies per population;
            ``"derived"`` to report the derived allele (the last population
            is used as outgroup); any other truthy value to report the
            minor allele.
        minData: minimum value of ``siteNonNan()`` a population needs at a
            site for its target frequency to be reported.
        asCounts: passed to ``siteFreqs``; also selects integer (counts) or
            float (frequencies, NaN where missing) output.
        threshold: if truthy and ``asCounts`` is false, target frequencies
            are binarised to 1 (``>= threshold``) or 0 (``< threshold``).
        keepNanLines: if false, rows that are entirely NaN (or entirely 0
            when ``asCounts``) are dropped from the output.
    """
    while True:

        sliceNumber,fileSlice = inQueue.get() # retrieve slice

        if sliceNumber == -1:
            resultQueue.put((-1,None,)) # this is the way of telling everything we're done
            break

        window = genomics.parseGenoFile(fileSlice, headerLine, names=sampleData.indNames)

        #make alignment objects: one for all samples, plus one per population
        aln = genomics.genoToAlignment(window.seqDict(), sampleData, genoFormat = genoFormat)
        popAlns = {popName: aln.subset(groups=[popName]) for popName in sampleData.popNames}

        if not target:
            #if there is no target, fetch all base counts
            # NOTE(review): minData is not applied in this branch (the original
            # computed a per-site mask here but never used it) - confirm intent.
            popFreqs = []
            for pop in sampleData.popNames:
                baseFreqs = popAlns[pop].siteFreqs(asCounts=asCounts)
                #one comma-joined string of base frequencies per site
                popFreqs.append([",".join(row) for row in baseFreqs.astype(str)])

            allFreqs = np.column_stack(popFreqs)

        else:
            #otherwise define the target base at each site
            if target == "derived":
                #use last pop as outgroup
                outgroup = sampleData.popNames[-1]
                inAln = aln.subset(groups = sampleData.popNames[:-1])
                baseColumns = np.array([genomics.derivedAllele(inAln.numArray[:,i][inAln.nanMask[:,i]],
                                                            popAlns[outgroup].numArray[:,i][popAlns[outgroup].nanMask[:,i]],
                                                            numeric=True)
                                        for i in range(aln.l)]).reshape([aln.l,1])

            else:
                #otherwise get minor allele.
                # range (not xrange) so this also runs on Python 3, matching the branch above
                baseColumns = np.array([genomics.minorAllele(aln.numArray[:,i][aln.nanMask[:,i]]) for i in range(aln.l)]).reshape([aln.l,1])

            #sites where a target base could be determined
            goodSites = np.apply_along_axis(lambda x: ~np.any(np.isnan(x)),1,baseColumns)

            #get freqs per pop
            popFreqs = []
            for pop in sampleData.popNames:
                #first find sites with sufficient data
                goodData = popAlns[pop].siteNonNan() >= minData
                sites = np.where(goodSites & goodData)[0]
                baseFreqs = popAlns[pop].siteFreqs(sites, asCounts=asCounts)
                #row/column index pairs that pick the target base out of baseFreqs for each usable site
                popColumns = baseColumns[sites,:].astype(int)
                popRows = np.repeat(np.arange(len(sites))[:,np.newaxis],popColumns.shape[1], axis = 1)
                #sites without usable data stay 0 (counts) or NaN (frequencies)
                targetFreqs =  np.zeros([aln.l, popColumns.shape[1]], dtype=int if asCounts else float)
                if not asCounts: targetFreqs.fill(np.nan)
                if len(sites) >= 1: targetFreqs[sites,:] = baseFreqs[popRows,popColumns]
                popFreqs.append(np.around(targetFreqs, 4))

            allFreqs = np.hstack(popFreqs)

            if threshold and not asCounts:
                #binarise; NaN entries match neither comparison and stay NaN
                allFreqs[allFreqs >= threshold] = 1
                allFreqs[allFreqs < threshold] = 0

        #fetch scaffold and position
        scafPos = np.array([line.split(None, 2)[:2] for line in fileSlice], dtype="str")

        # NOTE(review): in the no-target branch allFreqs holds comma-joined strings,
        # so the NaN/zero filter below looks meaningful only for the numeric target
        # output - confirm that callers pass keepNanLines=True when target is unset.
        if not keepNanLines:
            if not asCounts:
                outSites = np.where(~np.apply_along_axis(np.all, 1, np.isnan(allFreqs)))[0]
            else: outSites = np.where(~np.apply_along_axis(np.all, 1, allFreqs==0))[0]
        else: outSites = np.arange(aln.l) #integer index array selecting every site

        outArray = np.column_stack((scafPos[outSites,:],
                                    allFreqs[outSites,:].astype(str),))

        resultStrings = ["\t".join(row) for row in outArray]

        resultQueue.put((sliceNumber, resultStrings,))

Example #2

0

Show file

#extract each scaffold from the geno file, and the genes for each scaffold and write them out

# For every scaffold in geneData, pull each mRNA's region out of the geno file
# and extract its coding sequence for every sample.
for scaffold in geneData.keys():
    mRNAs = geneData[scaffold].keys()
    # progress messages go to stderr
    sys.stderr.write("Extracting " + str(len(mRNAs)) +
                     " gene sequences from " + scaffold + "\n")
    for mRNA in mRNAs:
        sys.stderr.write(mRNA + "\n")
        # region string of the form "<scaffold>:<start>-<end>"
        region = scaffold + ":" + str(
            geneData[scaffold][mRNA]["start"]) + "-" + str(
                geneData[scaffold][mRNA]["end"])
        sys.stderr.write("Getting region " + region + " from geno file...\n")
        # run tabix on the geno file for this region and read its output through a pipe
        # NOTE(review): the child process is not waited on or closed in the lines
        # visible here - confirm it is cleaned up further down.
        genoStream = subprocess.Popen(['tabix', '-h', args.genoFile, region],
                                      stdout=subprocess.PIPE)
        # parse the piped region into a window object, keeping site positions
        window = genomics.parseGenoFile(genoStream.stdout,
                                        names=args.samples,
                                        includePositions=True,
                                        splitPhased=args.split)
        seqDict = window.seqDict()
        seqNames = seqDict.keys()
        sys.stderr.write("Extracting CDS...\n")
        # one CDS per sequence name, in the order of seqNames
        CDSseqs = [
            genomics.CDS(seqDict[name], window.positions,
                         geneData[scaffold][mRNA]['cdsStarts'],
                         geneData[scaffold][mRNA]['cdsEnds'],
                         geneData[scaffold][mRNA]['strand'])
            for name in seqNames
        ]

        outputNames = [
            name + "_" + mRNA + " " + scaffold + " " +
            str(geneData[scaffold][mRNA]['start']) + "-" +

Example #3

0

Show file

File: distMat.py Project: zhanglzu/genomics_general

worker.start()

'''start background Thread that will run a loop to check run statistics and print
We use thread, because I think this is necessary for a process that watches global variables like linesTested'''
worker = Thread(target=checkStats)
worker.daemon = True
worker.start()

##########################################################

# Decide how the geno file is cut into windows, based on args.windType.
if args.windType == "cat":
    # whole file as a single window, queued straight away
    window = genomics.parseGenoFile(genoFile, names=sampleData.indNames)
    windowQueue.put((windowsQueued, window))
    windowsQueued += 1

else:
    # otherwise build a generator that yields the windows to analyse
    if args.windType == "coordinate":
        windowGenerator = genomics.slidingCoordWindows(
            genoFile, windSize, stepSize, sampleData.indNames,
            include=scafsToInclude, exclude=scafsToExclude)
    elif args.windType == "sites":
        windowGenerator = genomics.slidingSitesWindows(
            genoFile, windSize, overlap, maxDist, minSites, sampleData.indNames,
            include=scafsToInclude, exclude=scafsToExclude)
    else:
        # any other window type: use the predefined coordinates
        windowGenerator = genomics.predefinedCoordWindows(
            genoFile, windCoords, sampleData.indNames)

Example #4

0

Show file

File: distMat.py Project: nitinra/genomics_general

                ))
worker.daemon = True
worker.start()
'''start background Thread that will run a loop to check run statistics and print
We use thread, because I think this is necessary for a process that watches global variables like linesTested'''
worker = Thread(target=checkStats)
worker.daemon = True
worker.start()

##########################################################

# Build a tab-separated header line from args.headers, or use None if none were given.
if args.headers:
    headerLine = "\t".join(args.headers)
else:
    headerLine = None

if args.windType == "cat":
    # whole file as a single window, queued straight away
    window = genomics.parseGenoFile(
        genoFile, headerLine=headerLine, names=sampleData.indNames)
    windowQueue.put((windowsQueued, window))
    windowsQueued += 1

else:
    # otherwise build a generator that yields the windows to analyse
    if args.windType == "coordinate":
        windowGenerator = genomics.slidingCoordWindows(
            genoFile, windSize, stepSize,
            headerLine=headerLine, names=sampleData.indNames,
            include=scafsToInclude, exclude=scafsToExclude)

Example #5

0

Show file

File: genoToSeq.py Project: cbrock2/genomics_general

# Pick the single output stream unless each sequence goes to its own file.
if not args.separateFiles:
    if not args.seqFile:
        # no output file named: write to standard output
        seqFile = sys.stdout
    elif args.seqFile.endswith(".gz"):
        seqFile = gzip.open(args.seqFile, "w")
    elif args.gzip:
        # gzip requested but the name lacks the suffix: append it
        seqFile = gzip.open(args.seqFile + ".gz", "w")
    else:
        seqFile = open(args.seqFile, "w")

#############################

# comma-separated sample names, or None when no samples were given
if args.samples:
    samples = args.samples.split(",")
else:
    samples = None

# "cat" mode: parse the whole file as one window, write it out and stop
if args.mode == "cat":
    window = genomics.parseGenoFile(
        genoFile, names=samples, splitPhased=args.splitPhased)
    seqDict = window.seqDict()
    # sequences in the same order as window.names
    orderedSeqs = [seqDict[name] for name in window.names]
    alnString = genomics.makeAlnString(
        window.names, orderedSeqs, outFormat=args.format)
    seqFile.write(alnString)
    genoFile.close()
    seqFile.close()
    exit()

if args.mode == "windows" or args.mode == "contigs":
    if args.mode == "windows":
        windType = args.windType
        windSize = args.windSize