def main():
    """Print depth (DP) summary statistics for the records of a VCF file.

    The first positional argument is the VCF path. Its name (extension
    removed) must contain a region of the form ``nuller.<n>:<n>..<n>``; the
    three numbers lead the tab-separated output line and are followed by the
    maximum, minimum, median and mean DP and the number of records counted.

    Records with DP >= ``--max`` are skipped. Records whose INFO string holds
    no DP value are reported on stderr and skipped.

    Exits with status 1 when the file name carries no region, when the
    ##INFO headers do not declare DP, or when no record contributes a depth.
    """
    usage = "usage: %prog [options] file.vcf \n print summary information about site depth in records of a VCF file\n"
    parser = OptionParser(usage)
    parser.add_option("--max", type="int", dest="max", help="skip records that are greater than or equal to max (default sys.maxint)", default=sys.maxint)

    (options, args) = parser.parse_args()
    if not args:
        # Previously an IndexError traceback; optparse prints usage and exits.
        parser.error("missing VCF file argument")

    vcfilename = args[0]
    base_name, _extension = os.path.splitext(vcfilename)

    # Expected file name shape: nuller.12:80717441..80717681.vcf
    region_match = re.search(r'nuller.(\d+):(\d+)..(\d+)', base_name)
    if region_match is None:
        sys.stderr.write("unable to parse region from file name " + vcfilename + "\n")
        sys.exit(1)
    regionstr = "\t".join(region_match.groups())

    # Word boundary keeps tags that merely end in DP (e.g. ADP=) from matching.
    dp_pattern = re.compile(r'\bDP=(\d+)')
    depth_list = []

    with open(vcfilename, 'r') as vcfh:
        # instantiate a VcfFile object
        vcfobj = VcfFile(vcfilename)
        # parse its metainfo lines (ones that begin with ##)
        vcfobj.parseMetaLines(vcfh)
        infoids = [tag for (tag, description) in vcfobj.getMetaInfoDescription()]

        if 'DP' not in infoids:
            sys.stderr.write("DP tag not in ##INFO headers!\n")
            sys.exit(1)

        vcfh.seek(0)
        vcfobj.parseHeaderLine(vcfh)

        for vrec in vcfobj.yieldVcfRecord(vcfh):
            # Test the match object itself: re.search returns None on a miss,
            # so calling .groups() first would raise before any None check.
            dp_match = dp_pattern.search(vrec.getInfo())
            if dp_match is None:
                sys.stderr.write("unable to parse DP value from INFO field\n")
                continue
            depth = int(dp_match.group(1))
            if depth >= options.max:
                continue
            depth_list.append(depth)

    if not depth_list:
        # max()/min() of an empty sequence raise; fail with a clear message.
        sys.stderr.write("no records with a DP value below --max; nothing to summarize\n")
        sys.exit(1)

    depths = array(depth_list)
    summary = [
        regionstr,
        str(max(depths)),
        str(min(depths)),
        str(median(depths)),
        str(mean(depths)),
        str(len(depth_list)),
    ]
    print("\t".join(summary))
# Example #2
# 0
def main():
    """Report per-sample call counts and call rates for a VCF file.

    The first positional argument is the VCF path. The meta lines (with an
    added "CR" INFO header) and the header line are printed first. Every
    record is then visited, or only those whose FILTER equals ``--filter``
    when that option is given. Finally one line per sample is printed:
    sample name, its count from ``sampleCalls``, the number of records
    visited, and the ratio of the two.

    When no record is visited the ratio is undefined; a warning goes to
    stderr and the ratio is printed as ``nan`` instead of raising
    ZeroDivisionError.
    """
    import sys  # local import: only needed for the stderr warning below

    usage = "usage: %prog [options] file.vcf"
    parser = OptionParser(usage)
    parser.add_option("--filter", type="string", dest="filter", help="analyze only those  records matching filter (default is None)", default=None)
    parser.add_option("--info", type="string", dest="infotag", help="INFO tag id that annotates what type of variant the VCF record is", default="TYPE")
    parser.add_option("--type", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default=None)
    (options, args) = parser.parse_args()
    if not args:
        # Previously an IndexError traceback; optparse prints usage and exits.
        parser.error("missing VCF file argument")

    vcfilename = args[0]
    with open(vcfilename, 'r') as vcfh:
        # instantiate a VcfFile object
        vcfobj = VcfFile(vcfilename)
        # parse its metainfo lines (ones that begin with ##)
        vcfobj.parseMetaLines(vcfh)
        vcfobj.addMetaInfoHeader("CR", "D", 1, "site call rate")
        vcfobj.printMetaLines()

        vcfh.seek(0)
        vcfobj.parseHeaderLine(vcfh)
        vcfobj.printHeaderLine()

        samplelist = vcfobj.getSampleList()
        # key: sample name, value: number of called genotypes
        sampleCalls = dict.fromkeys(samplelist, 0)

        totalrecords = 0
        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            if options.filter is not None and vrec.getFilter() != options.filter:
                continue
            totalrecords += 1
            vrec.appendInfoString("CR=" + str(vrec.siteCallrate()))
            # sampleCalls is handed to the record here; presumably it is
            # updated in place -- verify against VcfRecord.sampleCallrate.
            vrec.sampleCallrate(samplelist, sampleCalls)

    if totalrecords == 0:
        sys.stderr.write("no records were analyzed; call rates are undefined\n")

    for s in samplelist:
        if totalrecords:
            callrate = float(sampleCalls[s]) / float(totalrecords)
        else:
            callrate = float('nan')
        # Single formatted string: same space-separated output under the
        # Python 2 print statement and the print function.
        print("%s %s %s %s" % (s, sampleCalls[s], totalrecords, callrate))
# Example #3
# 0
def main():
    """Extract VCF data lines whose allele frequency lies in [--geq, --leq].

    The single positional argument is the VCF path. For every data line the
    INFO column (8th tab-separated field) is searched for the variant-type
    tag (``--variantag``, optionally restricted to ``--variantype``, which is
    inserted into the regular expression as given) and for the frequency tag
    (``--maftag``). Lines missing either tag, or whose frequency value is not
    a number, are skipped. Kept lines are printed to stdout unless
    ``--quiet`` is set, and a tab-separated summary is always appended to
    ``freq.log``, which is opened only after header validation has passed.

    Unless ``--noheader`` is given, both tags must be declared in the ##INFO
    headers (the literal tag name 'QUAL' is exempt from that check);
    otherwise the program exits with status 1. A wrong argument count prints
    the usage to stderr and also exits with status 1.
    """
    # The code takes exactly one positional argument, so the usage says so.
    usage = "usage: %prog [options] file.vcf"
    parser = OptionParser(usage)

    parser.add_option("--maftag", type="string", dest="maftag", help="INFO tag id that annotates the allele freq of the record", default="AF")
    parser.add_option("--variantag", type="string", dest="vtag", help="INFO tag that annotates the type of variant type", default="VT")
    parser.add_option("--variantype", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default=None)
    parser.add_option("--filter", type="string", dest="filter", help="extract records matching filter (default is None)", default=None)
    parser.add_option("--noheader", action="store_true", dest="noheader", help="VCF file  has no header file", default=False)
    parser.add_option("--quiet", action="store_true", dest="quiet", help="don't print vcf output to stdout", default=False)
    parser.add_option("--leq", type="float", dest="leq", default=1.0, help="keep variants with AF <= (default 1)")
    parser.add_option("--geq", type="float", dest="geq", default=0.0, help="keep variants with AF >= (default 0)")
    (options, args) = parser.parse_args()

    if len(args) != 1:
        # print_usage expands %prog; writing the raw usage string did not.
        parser.print_usage(sys.stderr)
        sys.exit(1)
    vcfilename = args[0]

    # Both patterns are anchored at an INFO key boundary (start of string or
    # ';') so that e.g. --maftag AF cannot match inside EUR_AF= or MAF=.
    if options.variantype is None:
        type_alternatives = r'\w+'
    else:
        type_alternatives = options.variantype
    # Accept ';' or end of string after the value: the tag may be last in INFO.
    variant_re = re.compile(
        r'(?:^|;)' + re.escape(options.vtag) + '=(' + type_alternatives + r')(?:;|$)')
    # Capture the whole first value and let float() validate it, so that
    # 1, 1.0 and 1e-05 are handled as well as 0.xxx.
    maf_re = re.compile(r'(?:^|;)' + re.escape(options.maftag) + r'=([^;,\s]+)')

    # NOTE(review): --filter is parsed but never applied; the filtering line
    # was commented out in the original. Confirm whether it should return.

    with open(vcfilename, 'r') as vcfh:
        # instantiate a VcfFile object
        vcfobj = VcfFile(vcfilename)
        # parse its metainfo lines (ones that begin with ##)
        if not options.noheader:
            vcfobj.parseMetaLines(vcfh)
        infoids = [tag for (tag, description) in vcfobj.getMetaInfoDescription()]

        if not options.noheader:
            for tag in (options.maftag, options.vtag):
                if tag not in infoids and tag != 'QUAL':
                    sys.stderr.write(tag + " tag not in ##INFO headers!\n")
                    sys.exit(1)
            # NOTE(review): no vcfh.seek(0) before this call (it was commented
            # out in the original) -- confirm that is intended.
            vcfobj.parseHeaderLine(vcfh)

        with open('freq.log', 'w') as freqfh:
            for dataline in vcfobj.yieldVcfDataLine(vcfh):
                fields = dataline.strip().split('\t')
                if len(fields) < 8:
                    # Too few columns to contain INFO; skip instead of
                    # crashing on the unpack.
                    sys.stderr.write("skipping record with fewer than 8 columns\n")
                    continue
                chrom, pos, rec_id, ref, alt = fields[:5]
                info = fields[7]

                variant_match = variant_re.search(info)
                if variant_match is None:
                    continue
                variant_type = variant_match.group(1)

                maf_match = maf_re.search(info)
                if maf_match is None:
                    continue
                maf_value = maf_match.group(1)
                try:
                    maf = float(maf_value)
                except ValueError:
                    # e.g. the missing-value marker "."
                    continue

                if options.geq <= maf <= options.leq:
                    if not options.quiet:
                        print(dataline)
                    logstring = "\t".join([chrom, pos, rec_id, ref, alt, variant_type, options.maftag, maf_value])
                    freqfh.write(logstring + '\n')