def main():
    """Count the records of a VCF file grouped by the value of one INFO tag.

    Command line: ``%prog [options] file.vcf``

    Options:
        --info    id of the INFO tag whose value classifies each record
                  (default ``TYPE``).
        --filter  only analyze records whose FILTER value equals this string
                  (default: no filtering).

    Output:
        One ``<value> <count>`` line per distinct tag value on stdout,
        followed by ``TOTAL: <count>``.  A banner naming the input file is
        written to stderr.  Records whose INFO field does not carry the tag
        are skipped.

    Exit status:
        1 if ``--info`` is empty or the tag is not declared in the file's
        meta-information (the literal name ``QUAL`` is accepted without a
        declaration); 2 (via ``parser.error``) if no file argument is given.
    """
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    parser.add_option(
        "--info", type="string", dest="infotag",
        help="INFO tag id that annotates what type of variant the VCF record is",
        default="TYPE")
    parser.add_option(
        "--filter", type="string", dest="filter",
        help="only analyze records with matching filter (default is None)",
        default=None)
    (options, args) = parser.parse_args()

    if options.infotag == "":
        sys.stderr.write("provide a value for --info parameter!\n")
        sys.exit(1)
    if not args:
        # Previously this surfaced as a bare IndexError on args[0].
        parser.error("missing VCF file argument")

    vcfilename = args[0]
    vcfobj = VcfFile(vcfilename)

    # The INFO column is a ';'-separated list of KEY=VALUE entries.  Anchor
    # the key at the start of an entry and stop the value at the next ';' so
    # that (a) a tag such as TYPE does not match inside XTYPE, and (b) the
    # captured value does not swallow the entries that follow it.
    tag_re = re.compile(r"(?:^|;)" + re.escape(options.infotag) + r"=([^;\s]+)")

    # key: tag value, value: number of records carrying that value.  Only the
    # counts are reported, so the records themselves are not retained.
    counts = {}

    with open(vcfilename, 'r') as vcfh:
        # Parse the meta-information (##) and header lines first.
        vcfobj.parseMetaAndHeaderLines(vcfh)
        infoids = [tag for (tag, _description) in vcfobj.getMetaInfoDescription()]

        if options.infotag not in infoids and options.infotag != 'QUAL':
            sys.stderr.write(options.infotag + " tag not in ##INFO headers!\n")
            sys.exit(1)

        for vrec in vcfobj.yieldVcfRecord(vcfh):
            if options.filter is not None and vrec.getFilter() != options.filter:
                continue
            found = tag_re.search(vrec.getInfo())
            if found is None:
                continue
            value = found.group(1)
            counts[value] = counts.get(value, 0) + 1

    sys.stderr.write("types and count of different variant classes in " + vcfilename + "\n")
    total = 0
    for value in counts:
        sys.stdout.write("%s %d\n" % (value, counts[value]))
        total += counts[value]
    sys.stdout.write("TOTAL: %d\n" % total)
def binned_bitsets_from_vcffile(vcfilename, chrom_col=0, start_col=1, upstream_pad=0, downstream_pad=0, lens=None):
    """Read a VCF file into a dictionary of bitsets, one per chromosome.

    Only records whose FILTER value is ``PASS`` or ``.`` are used.  Each such
    record at 1-based position ``pos`` contributes the zero-based, half-open
    interval ``[pos - 1, pos)``, optionally widened by the padding arguments,
    and is recorded with ``set_range(start, end - start)`` on the bitset of
    its chromosome.

    Arguments:
        vcfilename      path of the VCF file to read.
        chrom_col       not used by this function; kept for interface
                        compatibility.
        start_col       not used by this function; kept for interface
                        compatibility.
        upstream_pad    if non-zero, number of positions by which the interval
                        start is moved down (never below 0).
        downstream_pad  if non-zero, number of positions by which the interval
                        end is moved up (never above the chromosome's size).
        lens            optional mapping of chromosome name to bitset size;
                        chromosomes missing from it get the maximum size
                        (2147483647).  Keys are looked up with the ``chr``
                        prefix applied.

    Returns:
        dict mapping ``"chr" + <CHROM column>`` to a ``BinnedBitSet``.
    """
    if lens is None:
        lens = {}
    MAX = 2147483647
    bitsets = dict()
    # Size used to build each chromosome's bitset.  Kept per chromosome so the
    # downstream clamp always uses the size of the chromosome being processed,
    # even when a chromosome reappears later in an unsorted file.
    sizes = {}
    vcfobj = VcfFile(vcfilename)

    with open(vcfilename, 'r') as fh:
        for vrec in vcfobj.yieldVcfRecord(fh):
            filtercode = vrec.getFilter()
            # Keep passing records and records with no filter applied ('.').
            if filtercode != 'PASS' and filtercode != '.':
                continue

            # NOTE(review): the prefix is added unconditionally, so a CHROM
            # value that already starts with "chr" becomes "chrchr..." --
            # confirm the expected input naming with callers.
            chrom = "chr" + vrec.getChrom()
            pos = int(vrec.getPos())

            if chrom not in bitsets:
                if chrom in lens:
                    sizes[chrom] = lens[chrom]
                else:
                    sizes[chrom] = MAX
                bitsets[chrom] = BinnedBitSet(sizes[chrom])
            size = sizes[chrom]

            # Convert the 1-based position to a zero-based, half-open interval.
            start, end = (pos - 1, pos)
            if upstream_pad:
                start = max(0, start - upstream_pad)
            if downstream_pad:
                end = min(size, end + downstream_pad)
            if start > end:
                warn("Interval start after end!")
            bitsets[chrom].set_range(start, end - start)

    return bitsets
def main():
    """Print a one-line summary of site depth (INFO ``DP``) for a VCF file.

    Command line: ``%prog [options] file.vcf``

    The file name (without extension) must contain a region of the form
    ``nuller.<chrom>:<start>..<end>``; the three captured numbers are echoed
    as the first three output columns.

    Options:
        --max  skip records whose DP is greater than or equal to this value
               (default: the platform's largest plain integer).

    Output (stdout, tab-separated, one line):
        chrom, start, end, max DP, min DP, median DP, mean DP, number of
        records used.

    Records whose INFO field has no ``DP=<digits>`` entry are reported on
    stderr and skipped.

    Exit status:
        1 if the region cannot be parsed from the file name, if ``DP`` is not
        declared in the file's meta-information, or if no record contributes
        a depth value; 2 (via ``parser.error``) if no file argument is given.
    """
    usage = "usage: %prog [options] file.vcf \n print summary information about site depth in records of a VCF file\n"
    parser = OptionParser(usage)
    parser.add_option(
        "--max", type="int", dest="max",
        help="skip records that are greater than or equal to max (default sys.maxint)",
        # sys.maxint where it exists; sys.maxsize on interpreters without it.
        default=getattr(sys, "maxint", sys.maxsize))
    (options, args) = parser.parse_args()
    if not args:
        # Previously this surfaced as a bare IndexError on args[0].
        parser.error("missing VCF file argument")

    vcfilename = args[0]
    fileName, fileExtension = os.path.splitext(vcfilename)

    # Expected file name shape: nuller.12:80717441..80717681.vcf
    region = re.search(r'nuller.(\d+):(\d+)..(\d+)', fileName)
    if region is None:
        sys.stderr.write("unable to parse region from file name " + vcfilename + "\n")
        sys.exit(1)
    regionstr = "\t".join(region.groups())

    # Anchor DP at the start of a ';'-separated INFO entry so that tags that
    # merely end in "DP" (e.g. BDP=...) are not mistaken for it.
    dp_re = re.compile(r'(?:^|;)DP=(\d+)')

    vcfobj = VcfFile(vcfilename)
    depth_list = []
    with open(vcfilename, 'r') as vcfh:
        # Parse the meta-information lines (the ones that begin with ##).
        vcfobj.parseMetaLines(vcfh)
        infoids = [tag for (tag, _description) in vcfobj.getMetaInfoDescription()]
        if 'DP' not in infoids:
            sys.stderr.write("DP tag not in ##INFO headers!")
            sys.exit(1)

        # Rewind before parsing the header line, then iterate the records.
        vcfh.seek(0)
        vcfobj.parseHeaderLine(vcfh)

        for vrec in vcfobj.yieldVcfRecord(vcfh):
            found = dp_re.search(vrec.getInfo())
            if found is None:
                sys.stderr.write("unable to parse DP value from INFO field\n")
                continue
            depth = int(found.group(1))
            if depth >= options.max:
                continue
            depth_list.append(depth)

    if not depth_list:
        # max()/min() of an empty sequence would raise; report it instead.
        sys.stderr.write("no DP values to summarize in " + vcfilename + "\n")
        sys.exit(1)

    depths = array(depth_list)
    maxDP = max(depths)
    minDP = min(depths)
    medianDP = median(depths)
    meanDP = mean(depths)
    length = len(depth_list)

    outstr = "\t".join([regionstr, str(maxDP), str(minDP), str(medianDP), str(meanDP), str(length)])
    sys.stdout.write(outstr + "\n")