Esempi in Python per VcfFile.addMetaInfoHeader

Linguaggio di programmazione: Python

Classe/tipologia: VcfFile

Metodo/funzione: addMetaInfoHeader

Esempi su hotexamples.com: 2

VcfFile.addMetaInfoHeader in Python: 2 esempi trovati. Questi sono i migliori esempi reali in Python per VcfFile.addMetaInfoHeader dal pacchetto VcfPythonUtils, estratti da progetti open source. Puoi valutarli per aiutarci a migliorare la qualità dei nostri esempi.

Metodi utilizzati di frequente

Mostra Nascondi

parseMetaAndHeaderLines(17)

yieldVcfRecordwithGenotypes(14)

returnHeader(11)

getSampleList(10)

getMetaInfoDescription(5)

yieldVcfDataLine(4)

parseMetaLines(4)

yieldVcfRecord(3)

parseHeaderLine(3)

addMetaInfoHeader(2)

setSampleList(2)

printHeaderLine(1)

printMetaAndHeaderLines(1)

printMetaLines(1)

getMetaFormatDescription(1)

getMetaFilterDescription(1)

addMetaFormatHeader(1)

Esempio n. 1

Mostra file

File: vcf_callrate.py Progetto: indapa/VcfPythonUtils

def main():
    """Print a VCF's meta/header lines followed by per-sample call rates.

    Command line: ``[options] file.vcf``.

    The meta-information lines are parsed and printed with an extra ``CR``
    INFO header, then the header line is printed.  Every record that passes
    the optional ``--filter`` is tallied, and finally one line per sample is
    printed: sample name, the sample's tally from ``sampleCalls``, the number
    of records considered, and the ratio of the two.

    Exits through ``parser.error`` (status 2) when no VCF path is supplied.
    """
    usage = "usage: %prog [options] file.vcf"
    parser = OptionParser(usage)
    parser.add_option("--filter", type="string", dest="filter", help="analyze only those  records matching filter (default is None)", default=None)
    parser.add_option("--info", type="string", dest="infotag", help="INFO tag id that annotates what type of variant the VCF record is", default="TYPE")
    parser.add_option("--type", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default=None)
    (options, args) = parser.parse_args()

    # Fail with a usage message instead of an IndexError on args[0].
    if not args:
        parser.error("please provide a VCF file")

    vcfilename = args[0]

    # The context manager guarantees the handle is closed even if parsing fails.
    with open(vcfilename, 'r') as vcfh:
        vcfobj = VcfFile(vcfilename)

        # Parse the meta-information lines (the ones that begin with ##).
        vcfobj.parseMetaLines(vcfh)
        # NOTE(review): "D" is not one of the VCF INFO types (Integer, Float,
        # Flag, Character, String); a call rate is presumably a Float, and the
        # number is passed as an int here -- confirm against
        # VcfFile.addMetaInfoHeader before changing.
        vcfobj.addMetaInfoHeader("CR", "D", 1, "site call rate")
        vcfobj.printMetaLines()

        # Rewind so the header line can be located from the top of the file.
        vcfh.seek(0)
        vcfobj.parseHeaderLine(vcfh)
        vcfobj.printHeaderLine()

        samplelist = vcfobj.getSampleList()
        # key: sample name, value: number of called genotypes
        sampleCalls = dict.fromkeys(samplelist, 0)

        totalrecords = 0
        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            # When a filter was requested, skip records that do not match it.
            if options.filter is not None and vrec.getFilter() != options.filter:
                continue
            totalrecords += 1
            sitecallrate = vrec.siteCallrate()
            # NOTE(review): the record is annotated with CR but never written
            # out; only the per-sample summary below is printed.
            vrec.appendInfoString("CR=" + str(sitecallrate))
            # presumably updates sampleCalls in place -- verify in VcfRecord
            vrec.sampleCallrate(samplelist, sampleCalls)

    for s in samplelist:
        # With no records considered the ratio is undefined; report 0.0
        # rather than dying with ZeroDivisionError.
        if totalrecords:
            callrate = float(sampleCalls[s]) / float(totalrecords)
        else:
            callrate = 0.0
        # Single %-formatted argument: same output as the Python 2 print
        # statement, and also valid when run under Python 3.
        print("%s %s %s %s" % (s, sampleCalls[s], totalrecords, callrate))

Esempio n. 2

Mostra file

File: vcf_pysam_allele_pileup.py Progetto: indapa/VcfPythonUtils

def _tally_pileup_column(pileupcolumn, samples, readgroupdict, options):
    """Count the read bases observed in one pileup column.

    Reads are skipped when they carry a deletion at this column, when they
    are marked duplicate (unless ``options.duplicate`` is set), when their
    mapping quality is below ``options.mapq``, or when the base quality at
    the column is below ``options.bq``.

    Returns a ``(seqdict, sampledict)`` pair: ``seqdict`` maps a base to the
    number of passing reads showing it; ``sampledict`` maps each name in
    ``samples`` to the list of bases seen in that sample's reads.
    """
    seqdict = dict.fromkeys(("A", "C", "G", "T", "N"), 0)
    sampledict = dict((s, []) for s in samples)

    for pileupread in pileupcolumn.pileups:
        alignment = pileupread.alignment

        # A read spanning the site with a deletion has no base to count here.
        if pileupread.is_del:
            continue
        if alignment.is_duplicate and not options.duplicate:
            continue
        if alignment.mapq < options.mapq:
            continue
        # qpos indexes the read base aligned to this column; the quality
        # character is converted with an offset of 33 (presumably Phred+33).
        if (ord(alignment.qual[pileupread.qpos]) - 33) < options.bq:
            continue

        base = alignment.seq[pileupread.qpos]
        # .get() tolerates symbols outside A/C/G/T/N instead of raising KeyError.
        seqdict[base] = seqdict.get(base, 0) + 1

        # Map the read to its sample through the read group's SM field.
        readgroup = dict(alignment.tags)["RG"]
        sample = readgroupdict[readgroup]
        sampledict[sample].append(base)

    return seqdict, sampledict


def _annotate_record(vrec, samples, seqdict, sampledict):
    """Add the RA/AA INFO totals and per-sample FORMAT values to ``vrec``."""
    ref = vrec.getRef()
    alt = vrec.getAlt()

    # REF/ALT that are not a single counted base (indels, multi-allelic ALT)
    # report 0 instead of raising KeyError.
    vrec.addInfo("RA=" + str(seqdict.get(ref, 0)))
    if alt != ".":
        vrec.addInfo("AA=" + str(seqdict.get(alt, 0)))

    for (sample, vcfgenobj) in vrec.zipGenotypes(samples):
        bases = sampledict[sample]
        if not bases:
            # No passing reads for this sample: add the keys without values.
            vcfgenobj.addFormat("RA")
            vcfgenobj.addFormat("AA")
            continue
        counts = Counter(bases)  # a missing base counts as 0
        vcfgenobj.addFormatVal("RA", str(counts[ref]))
        vcfgenobj.addFormatVal("AA", str(counts[alt]))


def main():
    """Annotate a VCF with reference/alternate allele counts from a BAM pileup.

    Given a gzipped VCF file and a BAM file containing the sample(s) in the
    VCF, add INFO and FORMAT tags (RA, AA) giving the count of reference and
    alternate alleles observed in total and per sample, and print the new
    header and the annotated records to standard output.

    Exits with status 1 when ``--bam`` is missing or the BAM index (``.bai``)
    does not exist, and through ``parser.error`` (status 2) when no VCF path
    is supplied.
    """
    usage = "usage: %prog [option] file.vcf.gz"
    parser = OptionParser(usage)
    parser.add_option("--bam", type="string", dest="bam", default=None, help="bam file to perform pileup on")
    parser.add_option(
        "--mapq",
        type="float",
        dest="mapq",
        default=0.0,
        help="Exclude alignments from analysis if they have a mapping less than mapq (default is 0)",
    )
    parser.add_option(
        "--bq",
        type="float",
        dest="bq",
        default=0.0,
        help="Exclude bases from analysis if their supporting base quality is less that --bq (default is 0)",
    )
    # store_true with default False: duplicates are excluded unless the flag
    # is given.  The previous store_false with no default left the option as
    # None, which included duplicates by default and excluded them when the
    # flag was passed -- the opposite of the help text.
    parser.add_option(
        "--includeDuplicates",
        action="store_true",
        dest="duplicate",
        default=False,
        help="include duplicate marked reads in analysis (turned off by default) ",
    )
    (options, args) = parser.parse_args()
    if options.bam is None:
        sys.stderr.write("please provide a value to --bam option\n")
        sys.exit(1)
    # Fail with a usage message instead of an IndexError on args[0].
    if not args:
        parser.error("please provide a VCF file")

    vcfilename = args[0]
    bamfilename = options.bam

    ra_formatline = FormatLine("RA", number="1", type="Integer", description="number of reference alleles observed")
    aa_formatline = FormatLine("AA", number="1", type="Integer", description="number of alternate alleles observed")

    if not os.path.exists(bamfilename + ".bai"):
        sys.stderr.write("please check for existence of bam index file (*.bai)\n")
        sys.exit(1)

    vcfobj = VcfFile(vcfilename)
    vcfh = gzip.open(vcfilename, "r")
    pybamfile = None
    try:
        vcfobj.parseMetaAndHeaderLines(vcfh)
        vcfobj.addMetaFormatHeader(ra_formatline)
        vcfobj.addMetaFormatHeader(aa_formatline)
        vcfobj.addMetaInfoHeader("RA", "Integer", "1", "total number of reference alleles observed")
        vcfobj.addMetaInfoHeader("AA", "Integer", "1", "total number of alternate alleles observed")
        header = vcfobj.returnHeader()
        print(header)

        pybamfile = pysam.Samfile(bamfilename, "rb")
        # read group ID -> sample name, taken from the BAM header's RG entries
        readgroupdict = dict((rg["ID"], rg["SM"]) for rg in pybamfile.header["RG"])

        samples = vcfobj.getSampleList()

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            # POS is used as the exclusive end of the query interval, so the
            # variant base sits at the 0-based coordinate POS - 1.
            chrom = vrec.getChrom()
            end = int(vrec.getPos())
            start = end - 1

            for pileupcolumn in pybamfile.pileup(chrom, start, end):
                # pileup() yields every column covered by the overlapping
                # reads; keep only the variant's own 0-based column.
                if pileupcolumn.pos != start:
                    continue

                seqdict, sampledict = _tally_pileup_column(pileupcolumn, samples, readgroupdict, options)
                _annotate_record(vrec, samples, seqdict, sampledict)

                # NOTE(review): a record is printed only when a pileup column
                # exists at its position, so sites without coverage are
                # absent from the output.
                print(vrec.toStringwithGenotypes())
    finally:
        # Release both handles even if parsing or the pileup fails.
        if pybamfile is not None:
            pybamfile.close()
        vcfh.close()