def main():
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    parser.add_option("--info", type="string", dest="infotag", help="INFO tag id that annotates what type of variant the VCF record is", default="TYPE")
    parser.add_option("--filter", type="string", dest="filter", help="only analyze records with matching filter (default is None)", default=None)

    (options, args)=parser.parse_args()
    if options.infotag == "":
        sys.stderr.write("provide a value for --info parameter!\n")
        exit(1)


    variant_dict={} #key variant type value VcfRecord object

    vcfilename=args[0]
    vcfh=open(vcfilename,'r')

    #instantiate a VcfFile object
    vcfobj=VcfFile(vcfilename)
    #parse its metainfo lines (ones that begin with ##)
    vcfobj.parseMetaAndHeaderLines(vcfh)
    
    descriptors = vcfobj.getMetaInfoDescription()
    infoids=[]
    for (tag, description) in descriptors:
        tag
        infoids.append(tag)

    if options.infotag  not in infoids and options.infotag != 'QUAL':
        sys.stderr.write(options.infotag + " tag not in ##INFO headers!\n")
        exit(1)

    

    pattern=options.infotag+'=(\S+)'
    
    for vrec in vcfobj.yieldVcfRecord(vcfh):
        if vrec.getFilter() != options.filter and options.filter != None: continue
        
        searchresult=re.search(pattern, vrec.getInfo() )
        if re.search(pattern, vrec.getInfo() ) == None:
            continue
        else:
            value=re.search(pattern, vrec.getInfo() ).groups()[0]
            #rint value
            if value not in variant_dict.keys():
                variant_dict[value]=[]
                variant_dict[value].append( vrec )
            else:
                variant_dict[value].append( vrec )


    
    sum=0
    sys.stderr.write("types and count of different variant classes in " + vcfilename + "\n")
    for k in variant_dict.keys():
        print k, len( variant_dict[k] )
        sum+=len( variant_dict[k] )
    print "TOTAL:", sum
Beispiel #2
0
def binned_bitsets_from_vcffile( vcfilename, chrom_col=0, start_col=1,  upstream_pad=0, downstream_pad=0, lens={} ):
    """
    Read a vcffile into a dictionary of bitsets. The defaults arguments

    - 'vcfilename' should be a filename for vcf file
    - 'chrom_col', 'start_col', and 'end_col' must exist in each line.

    - if 'lens' is provided bitset sizes will be looked up from it, otherwise
      chromosomes will be assumed to be the maximum size

    - the bitset interval made into a   zero-based, half-open interval!!!!!!!

    """
    last_chrom = None
    last_bitset = None
    bitsets = dict()
    MAX=2147483647

    vcfobj=VcfFile(vcfilename)
    fh=open(vcfilename,'r')

    for vrec in vcfobj.yieldVcfRecord(fh):

        filtercode = vrec.getFilter()
        chrom = vrec.getChrom()
        pos=int( vrec.getPos() )


        #if filtercode != filtercodeoption and filtercodeoption != None:
        #    continue


        if filtercode != 'PASS':
            if filtercode == '.':
                pass
            else:
                continue


        chrom="chr"+chrom
        if chrom != last_chrom:
            if chrom not in bitsets:
                if chrom in lens:
                    size = lens[chrom]
                else:
                    size = MAX
                bitsets[chrom] = BinnedBitSet( size )
            last_chrom = chrom
            last_bitset = bitsets[chrom]
        start, end = (pos-1, pos)

        if upstream_pad: start = max( 0, start - upstream_pad )
        if downstream_pad: end = min( size, end + downstream_pad )
        if start > end: warn( "Interval start after end!" )
        last_bitset.set_range( start, end-start )
    fh.close()
    return bitsets
def main():
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    usage = "usage: %prog [options] file.vcf \n print summary information about site depth in records of a VCF file\n"
    parser = OptionParser(usage)
    parser.add_option("--max", type="int", dest="max", help="skip records that are greater than or equal to max (default sys.maxint)", default=sys.maxint)
    #parser.add_option("--v", action="store_true", dest="snp",  help="restrict analysis to SNPs (must have INFO ID SNP in header")

    (options, args)=parser.parse_args()

    vcfilename=args[0]
    fileName, fileExtension = os.path.splitext(vcfilename)
    #nuller.12:80717441..80717681.vcf
    regionpattern='nuller.(\d+):(\d+)..(\d+)'
    results=re.search(regionpattern,fileName ).groups()
    regionstr="\t".join(list(results))
    vcfh=open(vcfilename,'r')

    #instantiate a VcfFile object
    vcfobj=VcfFile(vcfilename)
    #parse its metainfo lines (ones that begin with ##)
    vcfobj.parseMetaLines(vcfh)
    descriptors = vcfobj.getMetaInfoDescription()
    infoids=[]
    for (tag, description) in descriptors:
        infoids.append(tag)

    if 'DP' not in infoids:
        sys.stderr.write("DP tag not in ##INFO headers!")
        exit(1)

    vcfh.seek(0)
    vcfobj.parseHeaderLine(vcfh)

    pattern='DP=(\d+)'
    depth_list=[]
    for vrec in vcfobj.yieldVcfRecord(vcfh):

        dp=re.search(pattern, vrec.getInfo() ).groups()[0]
        if dp == None:
            sys.stderr.write("unable to parse DP value from INFO field\n")
            continue
        else:
            if int(dp) >= options.max: continue
            depth_list.append(int(dp))

    maxDP=max( array (depth_list))
    minDP= min (array (depth_list))
    medianDP=median (array (depth_list))
    meanDP=mean( array(depth_list))
    length=len(depth_list)

    outstr="\t".join([regionstr, str(maxDP), str(minDP), str(medianDP), str(meanDP), str(length)])
    print outstr