コード例 #1
0
def makeVCFline(scaffold,
                position,
                GTdict,
                names,
                refDict=None,
                genoFormat=None):
    """Build one tab-separated VCF-style data line for a single site.

    Args:
        scaffold: Scaffold name, written as the first column.
        position: Site position; written via ``str()`` and, when ``refDict``
            is used, converted with ``int()`` and treated as 1-based.
        GTdict: Genotypes passed to ``genomics.GenomeSite`` as ``genoDict``.
        names: Sample names passed to ``genomics.GenomeSite``.
        refDict: Optional mapping ``scaffold -> indexable sequence`` from
            which the reference base is read. When falsy, the first allele
            returned by ``alleles(byFreq=True)`` is used as the reference.
        genoFormat: Genotype format forwarded to ``genomics.GenomeSite``.

    Returns:
        A string with the columns scaffold, position, ".", reference base,
        comma-joined alternate alleles (or "."), ".", ".", ".", "GT",
        followed by the coded genotypes, joined by tabs.
    """
    siteObj = genomics.GenomeSite(genoDict=GTdict,
                                  sampleNames=names,
                                  genoFormat=genoFormat)

    alleleList = siteObj.alleles(byFreq=True)
    if alleleList == []:
        # No alleles observed at this site: fall back to a placeholder.
        alleleList = ["N"]

    if refDict:
        # Reference sequence is 0-indexed, positions are 1-based.
        refBase = refDict[scaffold][int(position) - 1]
        # Put the reference base first, without listing it twice.
        if refBase in alleleList:
            alleleList.remove(refBase)
        alleleList = [refBase] + alleleList
    else:
        refBase = alleleList[0]

    altAlleles = alleleList[1:]
    altField = ",".join(altAlleles) if altAlleles != [] else "."

    # Genotypes are coded relative to the final allele order (reference first).
    codedGenos = siteObj.asList(mode="coded", alleles=alleleList)

    fixedColumns = [
        scaffold, str(position), ".", refBase, altField, ".", ".", ".", "GT"
    ]
    return "\t".join(fixedColumns + codedGenos)
コード例 #2
0
def getPopIndBaseCounts(siteData, genoFormat, allSamples, popDict, ploidyDict):
    """Return per-individual base counts for each population at one site.

    Args:
        siteData: Mapping with a ``"GTs"`` entry that maps sample name to
            genotype.
        genoFormat: Genotype format forwarded to ``genomics.GenomeSite``.
        allSamples: Sample names, in the order genotypes are passed on.
        popDict: Mapping of population name to the list of its sample names.
        ploidyDict: Ploidy information forwarded to ``genomics.GenomeSite``.

    Returns:
        A dict mapping each population name in the module-level ``popNames``
        to a NumPy array built from ``asBaseCounts()`` of every individual
        listed for that population in ``popDict``.
    """
    # Use the genoFormat argument rather than the global args.genoFormat, so
    # the function honours its own parameter and does not depend on a
    # module-level ``args`` being defined.
    site = genomics.GenomeSite(
        genotypes=[siteData["GTs"][name] for name in allSamples],
        sampleNames=allSamples,
        popDict=popDict,
        ploidyDict=ploidyDict,
        genoFormat=genoFormat)

    # popNames is read from module scope (it is not a parameter).
    return {
        popName: np.array([
            site.genotypes[indName].asBaseCounts()
            for indName in popDict[popName]
        ])
        for popName in popNames
    }
コード例 #3
0
def analysisWrapper(inQueue, outQueue, inputGenoFormat, outputGenoFormat,
                    headers, include, exclude, samples, minCalls, minPopCalls,
                    minAlleles, maxAlleles, minVarCount, maxHet, minFreq,
                    maxFreq, HWE_P, HWE_side, popDict, ploidyDict, fixed,
                    nearlyFixedDiff, forcePloidy, thinDist, noTest):
    """Worker loop that filters pods of genotype lines and forwards results.

    Runs forever: takes ``(podNumber, inPod)`` from ``inQueue``, where
    ``inPod`` is an iterable of ``(lineNumber, line)`` pairs, and puts
    ``(podNumber, outPod)`` on ``outQueue``, where ``outPod`` lists
    ``(lineNumber, outLine)`` for every line that passed.

    Args:
        inQueue: Queue supplying ``(podNumber, inPod)`` tuples.
        outQueue: Queue receiving ``(podNumber, outPod)`` tuples.
        inputGenoFormat: Genotype format of the incoming lines.
        outputGenoFormat: ``mode`` passed to ``site.asList`` for output.
        headers: Column names of the input; used to locate sample columns.
        include: Optional container of scaffolds to keep (others skipped).
        exclude: Optional container of scaffolds to skip.
        samples: Sample names to analyse and output.
        minCalls, minPopCalls, minAlleles, maxAlleles, minVarCount, maxHet,
        minFreq, maxFreq, HWE_P, HWE_side, fixed, nearlyFixedDiff:
            Thresholds forwarded unchanged to ``genomics.siteTest``.
        popDict, ploidyDict, forcePloidy: Forwarded to
            ``genomics.GenomeSite``.
        thinDist: If truthy, minimum distance between retained sites on the
            same scaffold.
        noTest: If truthy, ``genomics.siteTest`` is skipped.

    The module-level ``verbose`` flag controls progress messages on stderr.
    """
    sampleIndices = [headers.index(s) for s in samples]
    while True:
        podNumber, inPod = inQueue.get()
        if verbose:
            # sys.stderr.write replaces the Python 2-only "print >>" form; the
            # emitted text is the same.
            sys.stderr.write(
                "Pod {} received for analysis.\n".format(podNumber))
        outPod = []
        # Thinning state is reset for every pod.
        lastScaf = None
        for lineNumber, line in inPod:
            objects = line.split()
            if (include and objects[0]
                    not in include) or (exclude and objects[0] in exclude):
                continue
            site = genomics.GenomeSite(
                genotypes=[objects[i] for i in sampleIndices],
                sampleNames=samples,
                popDict=popDict,
                ploidyDict=ploidyDict,
                genoFormat=inputGenoFormat,
                forcePloidy=forcePloidy)
            goodSite = True
            if thinDist:
                pos = int(objects[1])
                if lastScaf != objects[0]:
                    # First site seen on a new scaffold: it becomes the
                    # distance anchor but is itself rejected.
                    # NOTE(review): this always drops the first site of each
                    # scaffold (and of each pod) -- confirm this is intended.
                    lastPos = pos
                    lastScaf = objects[0]
                    goodSite = False
                elif pos - lastPos < thinDist:
                    goodSite = False
            if goodSite and not noTest:
                goodSite = genomics.siteTest(site,
                                             samples=samples,
                                             minCalls=minCalls,
                                             minPopCalls=minPopCalls,
                                             minAlleles=minAlleles,
                                             maxAlleles=maxAlleles,
                                             minVarCount=minVarCount,
                                             maxHet=maxHet,
                                             minFreq=minFreq,
                                             maxFreq=maxFreq,
                                             HWE_P=HWE_P,
                                             HWE_side=HWE_side,
                                             fixed=fixed,
                                             nearlyFixedDiff=nearlyFixedDiff)
            if goodSite:
                genotypes = [
                    str(g) for g in site.asList(samples, mode=outputGenoFormat)
                ]
                outLine = "\t".join(objects[:2] + genotypes) + "\n"
                outPod.append((lineNumber, outLine))
                # Only retained sites advance the thinning anchor.
                if thinDist:
                    lastPos = pos
        outQueue.put((podNumber, outPod))
        if verbose:
            sys.stderr.write(
                "Pod {} analysed, sent to sorter.\n".format(podNumber))
コード例 #4
0
        continue

    #if there are intervals, check whether the site matches any

    if intervalsFile:
        siteIntervals = whichInterval(siteData["scaffold"],
                                      siteData["position"], scafIntervals,
                                      intervalPosDict)
    else:
        siteIntervals = [0]

    if not siteIntervals: continue

    site = genomics.GenomeSite(
        genotypes=[siteData["GTs"][name] for name in allSamples],
        sampleNames=allSamples,
        popDict=popDict,
        ploidyDict=ploidyDict,
        genoFormat=args.genoFormat)

    popIndBaseCounts = dict([(
        popName,
        np.array([
            site.genotypes[indName].asBaseCounts()
            for indName in popDict[popName]
        ]),
    ) for popName in popNames])

    # get population base counts, and do the subsampling if necessary
    # This is currently conservative. If any one of the populations lacks sufficient good genotypes it will break
    # in theory we could modify this part to use info for the pops it can - might be necessary when sites are limited
コード例 #5
0
ファイル: sfs.py プロジェクト: zhanglzu/genomics_general
                                                   and objects[0] in exclude):
        continue

    #if there are intervals, check whether the site matches any

    if intervalsFile:
        siteIntervals = whichInterval(objects[0], int(objects[1]),
                                      scafIntervals, intervalPosDict)
    else:
        siteIntervals = [0]

    if not siteIntervals: continue

    site = genomics.GenomeSite(genotypes=[objects[i] for i in sampleIndices],
                               sampleNames=allSamples,
                               popDict=popDict,
                               ploidyDict=ploidyDict,
                               genoFormat=args.genoFormat)

    popIndBaseCounts = dict([(
        popName,
        np.array([
            site.genotypes[indName].asBaseCounts()
            for indName in popDict[popName]
        ]),
    ) for popName in popNames])

    # get population base counts, and do the subsampling if necessary
    # This is currently conservative. If any one of the populations lacks sufficient good genotypes it will break
    # in theory we could modify this part to use info for the pops it can - might be necessary when sites are limited
コード例 #6
0
# Running state for the site loop that follows: a line counter plus the
# scaffold, chromosome and position carried over between sites.
linesDone = pos = 0
scaf = chrom = None

if args.cumulativePos:
    # Per-chromosome offset: the last known position of each chrom from the
    # previous scaffold. Every chromosome starts at zero, and so does the
    # null chromosome.
    chromOffset = dict.fromkeys(chromDict.keys(), 0)
    chromOffset[str(args.nullChrom)] = 0

for line in genoFile:
    site = genomics.parseGenoLine(line)

    genomeSite = genomics.GenomeSite(genotypes=site.GTs,
                                     sampleNames=allNames,
                                     genoFormat=args.genoFormat)

    if len(genomeSite.alleles()) == 2:
        counts = genomeSite.asList(mode="count", samples=samples, missing=9)

        genoOut.write("".join([str(c) for c in counts]) + "\n")

        if site.scaffold != scaf:
            #different scaffold from the last site
            #if using cumulative positions, change the offset for the last chrom
            if chrom is not None and args.cumulativePos:
                chromOffset[chrom] = pos
            #now get new scaf, chrom and pos
            scaf = site.scaffold
            try:
コード例 #7
0

# Running state for the site loop that follows: a line counter plus the
# scaffold, chromosome and position carried over between sites.
linesDone = pos = 0
scaf = chrom = None

if args.cumulativePos:
    # Per-chromosome offset: the last known position of each chrom from the
    # previous scaffold. Every chromosome starts at zero, and so does the
    # null chromosome.
    chromOffset = dict.fromkeys(chromDict.keys(), 0)
    chromOffset[str(args.nullChrom)] = 0


for siteData in reader.siteBySite():
    
    genomeSite = genomics.GenomeSite(genoDict = siteData["GTs"], genoFormat=args.genoFormat)
    
    alleles = genomeSite.alleles()
    
    if len(alleles) == 2:
        counts = genomeSite.asList(mode="count", samples=samples, missing = 9)
        
        genoOut.write("".join([str(c) for c in counts]) + "\n")
        
        if siteData["scaffold"] != scaf:
            #different scaffold from the last site
            #if using cumulative positions, change the offset for the last chrom
            if chrom is not None and args.cumulativePos: chromOffset[chrom] = pos
            #now get new scaf, chrom and pos
            scaf = siteData["scaffold"]
            try: chrom = chromDict[scaf]
コード例 #8
0
def _presenceTable(present, inGroup1, inGroup2):
    """Return the 2x2 presence/absence count table for two groups.

    Rows are group 1 then group 2; columns are the number of individuals for
    which ``present`` is True, then the number for which it is False. All
    three arguments are boolean arrays of the same length.
    """
    absent = ~present
    return np.array([[(present & inGroup1).sum(), (absent & inGroup1).sum()],
                     [(present & inGroup2).sum(), (absent & inGroup2).sum()]])


def analysisWrapper(inQueue, outQueue, inputGenoFormat, headers, include,
                    exclude, group1inds, group2inds, permutations,
                    permutationMaxP):
    """Worker loop testing allele presence/absence between two groups.

    Runs forever: takes ``(podNumber, inPod)`` from ``inQueue``, where
    ``inPod`` is an iterable of ``(lineNumber, line)`` pairs, and puts
    ``(podNumber, outPod)`` on ``outQueue``. Each output line holds the first
    two input columns followed by the results rounded to 5 decimals.

    For sites with exactly two alleles, a Fisher exact test is run on the
    presence/absence table of each allele and the smaller p-value is
    reported. If ``permutations >= 1`` a second column holds an empirical
    p-value from group-label permutations (NaN when the Fisher p-value
    exceeds ``permutationMaxP``). Sites without exactly two alleles yield
    NaN in every result column.

    Args:
        inQueue: Queue supplying ``(podNumber, inPod)`` tuples.
        outQueue: Queue receiving ``(podNumber, outPod)`` tuples.
        inputGenoFormat: Genotype format of the incoming lines.
        headers: Column names of the input; used to locate sample columns.
        include: Optional container of scaffolds to keep (others skipped).
        exclude: Optional container of scaffolds to skip.
        group1inds: Sample names in group 1.
        group2inds: Sample names in group 2.
        permutations: Number of label permutations; ``< 1`` disables them.
        permutationMaxP: If not None, permutations are only run for sites
            whose Fisher p-value is at most this value.

    The module-level ``verbose`` flag controls progress messages on stderr.
    """
    samples = group1inds + group2inds
    sampleIndices = [headers.index(s) for s in samples]

    # Boolean group membership, in the same order as ``samples``.
    group1 = np.array([True] * len(group1inds) + [False] * len(group2inds))
    group2 = ~group1

    while True:
        podNumber, inPod = inQueue.get()
        if verbose:
            sys.stderr.write(
                "Pod {} received for analysis...\n".format(podNumber))

        outPod = []
        for lineNumber, line in inPod:
            objects = line.split()
            if (include and objects[0]
                    not in include) or (exclude and objects[0] in exclude):
                continue
            site = genomics.GenomeSite(
                genotypes=[objects[i] for i in sampleIndices],
                sampleNames=samples,
                genoFormat=inputGenoFormat)

            alleles = site.alleles()

            if len(alleles) == 2:
                # Per-individual counts of each allele; -1 marks missing.
                # NOTE(review): "minor"/"major" assumes alleles() orders by
                # frequency -- confirm against genomics.GenomeSite.
                minorCount = np.array(
                    site.asList(mode="count",
                                countAllele=alleles[1],
                                missing=-1))
                majorCount = np.array(
                    site.asList(mode="count",
                                countAllele=alleles[0],
                                missing=-1))

                # Keep only individuals with a non-missing genotype.
                idx = np.where(minorCount >= 0)[0]

                goodGroup1 = group1[idx]
                goodGroup2 = group2[idx]

                minorPresent = minorCount[idx] >= 1
                majorPresent = majorCount[idx] >= 1

                minorTable = _presenceTable(minorPresent, goodGroup1,
                                            goodGroup2)
                majorTable = _presenceTable(majorPresent, goodGroup1,
                                            goodGroup2)

                p_values = (
                    fisher_exact(minorTable)[1],
                    fisher_exact(majorTable)[1],
                )

                result = [min(p_values)]

                if permutations >= 1:
                    if permutationMaxP is None or result[0] <= permutationMaxP:

                        # Permute the same presence vector that produced the
                        # observed table, so the null distribution matches
                        # the observed statistic.
                        if p_values[0] <= p_values[1]:
                            table, present = minorTable, minorPresent
                        else:
                            table, present = majorTable, majorPresent

                        nInds = table.sum()
                        phi = chisquare(table, axis=None)[0] / nInds

                        nAsExtreme = 0
                        for _ in range(permutations):
                            newGroup1 = np.random.permutation(goodGroup1)
                            newTable = _presenceTable(present, newGroup1,
                                                      ~newGroup1)
                            phiPermuted = chisquare(newTable,
                                                    axis=None)[0] / nInds
                            if phiPermuted >= phi:
                                nAsExtreme += 1

                        # Add-one correction keeps the estimate above zero.
                        p_emp = (nAsExtreme + 1.) / (permutations + 1.)

                    else:
                        p_emp = np.nan

                    result.append(p_emp)

            # np.nan is used instead of np.NaN, which NumPy 2.0 removed.
            elif permutations >= 1:
                result = [np.nan] * 2

            else:
                result = [np.nan]

            outLine = "\t".join(objects[:2] +
                                [str(round(x, 5)) for x in result]) + "\n"

            outPod.append((lineNumber, outLine))

        outQueue.put((podNumber, outPod))
        if verbose:
            sys.stderr.write(
                "Pod {} analysed, sent to sorter.\n".format(podNumber))