def main():
    """Print the description of ##INFO meta-lines in a VCF file.

    Usage: ``prog [--infotag ID | --all] file.vcf``

    With ``--all`` every (id, description) pair returned by
    ``VcfFile.getMetaInfoDescription()`` is printed as ``id description``.
    Otherwise only the pair whose id equals ``--infotag`` is printed, with a
    space-padded tab between id and description.  If nothing was printed a
    note is written to stderr.
    """
    usage = "usage: %prog [options] file.vcf"
    parser = OptionParser(usage)
    parser.add_option("--infotag", type="string", dest="infotag", help="prints the description for the INFO id infotag")
    parser.add_option("--all", action="store_true", dest="all", help="prints the description for *every* INFO tag in VCF")
    (options, args) = parser.parse_args()

    # Report bad invocations through optparse instead of letting them surface
    # later as an IndexError (no file) or a TypeError (None + str).
    if not args:
        parser.error("no VCF file given")
    if not options.all and options.infotag is None:
        parser.error("provide --infotag or --all")

    vcfilename = args[0]
    found_tag = False
    with open(vcfilename, 'r') as vcfh:
        vcfobj = VcfFile(vcfilename)
        # parse the meta-information lines (the ones that begin with ##)
        vcfobj.parseMetaLines(vcfh)
        for (tag_id, description) in vcfobj.getMetaInfoDescription():
            if options.all:
                print("%s %s" % (tag_id, description))
                found_tag = True
            elif tag_id == options.infotag:
                print("%s \t %s" % (tag_id, description))
                found_tag = True

    if not found_tag:
        if options.infotag is not None:
            sys.stderr.write(options.infotag + " not in ##INFO headers\n")
        else:
            # --all was given without --infotag and the file declares no INFO tags
            sys.stderr.write("no ##INFO headers found\n")
def main():
    """Count VCF records per value of an INFO tag and print the tallies.

    Usage: ``prog [--info TAG] [--filter FILTER] file.vcf``

    For every record (optionally restricted to those whose FILTER column
    equals ``--filter``) the value of ``TAG=value`` in the INFO column is
    extracted and counted.  One ``value count`` line per distinct value is
    printed to stdout, followed by ``TOTAL: n``.

    Exits with status 1 when ``--info`` is empty or when the tag is neither
    declared in the ##INFO headers nor equal to ``QUAL``.
    """
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    parser.add_option("--info", type="string", dest="infotag", help="INFO tag id that annotates what type of variant the VCF record is", default="TYPE")
    parser.add_option("--filter", type="string", dest="filter", help="only analyze records with matching filter (default is None)", default=None)
    (options, args) = parser.parse_args()
    if options.infotag == "":
        sys.stderr.write("provide a value for --info parameter!\n")
        sys.exit(1)
    if not args:
        parser.error("no VCF file given")
    vcfilename = args[0]

    # INFO is a ';'-separated list of KEY=VALUE entries.  The value therefore
    # stops at the next ';' (the previous '\S+' swallowed every following
    # entry), and the key must start the field or follow a ';' so that e.g.
    # TYPE does not match inside SUBTYPE.
    pattern = re.compile(r'(?:^|;)' + re.escape(options.infotag) + r'=([^;\s]+)')

    counts = {}  # variant type -> number of records
    with open(vcfilename, 'r') as vcfh:
        vcfobj = VcfFile(vcfilename)
        # parse the meta-information and header lines
        vcfobj.parseMetaAndHeaderLines(vcfh)
        infoids = [tag for (tag, _description) in vcfobj.getMetaInfoDescription()]
        if options.infotag not in infoids and options.infotag != 'QUAL':
            sys.stderr.write(options.infotag + " tag not in ##INFO headers!\n")
            sys.exit(1)

        for vrec in vcfobj.yieldVcfRecord(vcfh):
            if options.filter is not None and vrec.getFilter() != options.filter:
                continue
            match = pattern.search(vrec.getInfo())
            if match is None:
                continue
            value = match.group(1)
            # Only the tally is reported, so keep a count, not the records.
            counts[value] = counts.get(value, 0) + 1

    sys.stderr.write("types and count of different variant classes in " + vcfilename + "\n")
    total = 0
    for value in counts:
        print("%s %s" % (value, counts[value]))
        total += counts[value]
    print("TOTAL: %s" % total)
def binned_bitsets_from_vcffile( vcfilename, chrom_col=0, start_col=1, upstream_pad=0, downstream_pad=0, lens=None ):
    """
    Read a VCF file into a dictionary of bitsets keyed by chromosome.

    - 'vcfilename' is the path of the VCF file to read.
    - 'chrom_col' and 'start_col' are accepted for interface compatibility
      but are not used: chromosome and position come from the VcfRecord.
    - 'upstream_pad' / 'downstream_pad' widen each site's interval; the start
      is clamped at 0 and the end at the chromosome's bitset size.
    - if 'lens' (a mapping of chromosome name to length) contains the
      chromosome, the bitset size is taken from it, otherwise the maximum
      size (2147483647) is used.  Keys are looked up with the "chr" prefix
      that this function prepends to the VCF chromosome name.
    - only records whose FILTER is 'PASS' or '.' are used.
    - each site becomes a zero-based, half-open interval [pos-1, pos).

    Returns a dict mapping "chr"+chrom to a BinnedBitSet.
    """
    if lens is None:
        lens = {}
    MAX = 2147483647
    bitsets = dict()
    sizes = {}  # chromosome -> size its bitset was created with
    last_chrom = None
    last_bitset = None
    size = MAX

    vcfobj = VcfFile(vcfilename)
    fh = open(vcfilename, 'r')
    try:
        for vrec in vcfobj.yieldVcfRecord(fh):
            filtercode = vrec.getFilter()
            chrom = vrec.getChrom()
            pos = int(vrec.getPos())

            # keep only unfiltered records
            if filtercode != 'PASS' and filtercode != '.':
                continue

            chrom = "chr" + chrom
            if chrom != last_chrom:
                if chrom not in bitsets:
                    sizes[chrom] = lens.get(chrom, MAX)
                    bitsets[chrom] = BinnedBitSet(sizes[chrom])
                last_chrom = chrom
                last_bitset = bitsets[chrom]
                # Refresh the clamp on every chromosome switch.  Previously
                # 'size' changed only for a first-seen chromosome, so going
                # back to an earlier one clamped with the wrong length.
                size = sizes[chrom]

            start, end = (pos - 1, pos)
            if upstream_pad:
                start = max(0, start - upstream_pad)
            if downstream_pad:
                end = min(size, end + downstream_pad)
            if start > end:
                warn("Interval start after end!")
            # NOTE(review): the range is still set after the warning, as before
            last_bitset.set_range(start, end - start)
    finally:
        fh.close()
    return bitsets
def main():
    """Split discordant sites of a VCF into files by number of discordant pairs.

    Usage: ``prog nrd.log.vcf``

    The samples of each record are taken two at a time (compare, eval) via
    ``grouper(2, ...)``.  A pair is discordant when ``typeofGenotype`` differs
    between its two genotypes.  For each record a tab-separated line is built
    from the position, the INFO ``set`` value and, per discordant pair, the
    eval sample, its genotype type, the compare sample and its genotype type.

    The line is written to ``<basename>.allgenos.nrd.txt`` when three pairs
    are discordant, ``<basename>.twogenos.nrd.txt`` for two and
    ``<basename>.onegenos.nrd.txt`` for one; other counts are not written.
    """
    usage = "usage: %prog [options] nrd.log.vcf\n"
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()
    if not args:
        parser.error("no VCF file given")

    vcfilename = args[0]
    basename = os.path.splitext(vcfilename)[0]
    vcfobj = VcfFile(vcfilename)

    # The context managers flush and close all four handles, also on error.
    with open(vcfilename, "r") as vcfh, \
            open(basename + ".allgenos.nrd.txt", "w") as nrdallfh, \
            open(basename + ".twogenos.nrd.txt", "w") as nrdtwofh, \
            open(basename + ".onegenos.nrd.txt", "w") as nrdonefh:
        # number of discordant pairs -> destination file
        destinations = {3: nrdallfh, 2: nrdtwofh, 1: nrdonefh}

        vcfobj.parseMetaAndHeaderLines(vcfh)
        samples = vcfobj.getSampleList()

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            outputline = [[vrec.getPos()]]
            # which callset does the site belong to?
            outputline.append([vrec.returnInfoDict()["set"]])

            nrd_count = 0
            # each element of the zip is a (sample, genotype object) tuple
            for (compare, evaluated) in grouper(2, vrec.zipGenotypes(samples)):
                (comp_allele1, comp_allele2) = compare[1].getAlleles()
                (eval_allele1, eval_allele2) = evaluated[1].getAlleles()
                eval_alleletype = typeofGenotype(eval_allele1, eval_allele2)
                comp_alleletype = typeofGenotype(comp_allele1, comp_allele2)
                if eval_alleletype == comp_alleletype:
                    continue
                outputline.append([evaluated[0], str(eval_alleletype), compare[0], str(comp_alleletype)])
                nrd_count += 1

            destination = destinations.get(nrd_count)
            if destination is not None:
                destination.write("\t".join(melt_lol(outputline)) + "\n")
def main():
    """Interleave non-variant (ref/ref) sites with the records of a VCF.

    Given a BED file of non-variant intervals (--bed) and a 2bit file of the
    reference genome (--tbf), the program prints the VCF header and then, for
    each BED interval, one data line per base in the interval carrying the
    reference allele and three '0/0' genotypes, followed by the next record
    of the segregating VCF given on the command line.

    It assumes that the input VCF is position sorted.

    To generate the non-segregating BED interval file, run the following
    program from bx-python:
        bed_subtract_basewise.py reference_genome.bed segregating.sites.bed
    """
    usage = "usage: %prog [options] file.vcf"
    parser = OptionParser(usage)
    parser.add_option("--bed", type="string", dest="bed", help="bed file with non-variant intervals")
    parser.add_option("--tbf", type="string", dest="tbf", help="2bit file of reference genome", default='/Users/amit/data/MySimulations/Simulation1/Reference/simref.1.2bit')
    (options, args) = parser.parse_args()
    if not args:
        parser.error("no VCF file given")
    if options.bed is None:
        parser.error("provide a bed file with --bed")

    try:
        sys.stderr.write("opening twobitfile...\n")
        twobit = bx.seq.twobit.TwoBitFile(open(options.tbf, 'rb'))
    except Exception:
        # Stop here: carrying on would only fail later with a NameError on
        # 'twobit', far from the real cause.
        sys.stderr.write("unable to open twobit file!\n")
        sys.exit(1)

    segregatingVcf = args[0]
    formatstring = "GT"
    with open(options.bed, 'r') as bedfh, open(segregatingVcf, 'r') as vcfh:
        vcfobj = VcfFile(segregatingVcf)
        vcfobj.parseMetaAndHeaderLines(vcfh)
        print(vcfobj.returnHeader())

        for (chrom, start, end) in yield_bedcoordinate(bedfh):
            for begin in range(int(start), int(end)):
                site_end = begin + 1
                refseq = twobit[chrom][begin:site_end]
                vrec = VcfRecord(chrom, str(site_end), '.', refseq, '.', '.', '.', 'NS=3')
                for _ in range(3):
                    vrec.addGenotype(VcfGenotype(formatstring, '0/0'))
                print(vrec.toStringwithGenotypes())

            # One segregating record follows each non-variant interval.  When
            # the VCF is exhausted there is nothing to print; a bare .next()
            # used to abort the program with StopIteration at this point.
            vcf_gen = vcfobj.yieldVcfRecordwithGenotypes(vcfh)
            segregating = next(vcf_gen, None)
            if segregating is not None:
                print(segregating.toStringwithGenotypes())
def main():
    """Extract the records of a VCF whose INFO ``set`` entry equals -set.

    Typically the VCF is derived from GATK CombineVariants, but any VCF whose
    INFO column carries ``set=<name>`` can be examined.

    The header and the matching records are written to
    ``<basename>.<set>.vcf`` in the current directory, where <basename> is
    the input file name passed twice through ``return_file_basename``.  Plain
    and ``.gz`` inputs are both accepted.
    """
    parser = argparse.ArgumentParser(description=r' extract records with matching set=(\S+) tag')
    parser.add_argument('vcfile', metavar='vcfile', type=str, help='file.vcf.gz')
    parser.add_argument('-set', dest='set', type=str, default=None, help="name of set to extract")
    args = parser.parse_args()
    if args.set is None:
        sys.stderr.write("please provide value to -set option!\n")
        sys.exit(1)

    vcfile = os.path.split(args.vcfile)[1]
    basename = return_file_basename(return_file_basename(vcfile))
    sys.stderr.write(basename + "\n")
    outvcf = ".".join([basename, args.set, 'vcf'])
    sys.stderr.write(outvcf + "\n")

    # INFO is ';'-separated: 'set' may be the first entry (no leading ';')
    # and its value ends at the next ';'.  The old ';set=(\S+)' missed the
    # former and captured trailing entries in the latter.
    set_pattern = re.compile(r'(?:^|;)set=([^;\s]+)')

    if args.vcfile.endswith(".gz"):
        vcfh = gzip.open(args.vcfile, 'r')
    else:
        vcfh = open(args.vcfile, 'r')
    try:
        outfh = open(outvcf, 'w')
        try:
            vcfobj = VcfFile(args.vcfile)
            vcfobj.parseMetaAndHeaderLines(vcfh)
            outfh.write(vcfobj.returnHeader() + "\n")
            for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
                match = set_pattern.search(vrec.getInfo())
                if match is not None and match.group(1) == args.set:
                    outfh.write(vrec.toStringwithGenotypes() + "\n")
        finally:
            outfh.close()
    finally:
        vcfh.close()
def main():
    """Convert the data lines of a VCF file to BED lines on stdout.

    Each kept record becomes ``chrom  pos-1  pos  id`` (tab separated).  With
    --dump a fifth column is appended, built from REF, ALT, QUAL, FILTER,
    INFO and the comma-joined remaining columns.

    Records are dropped when --chr is given and differs from the (unprefixed)
    chromosome, or when --filter is given and differs from the FILTER column.
    --addchr prepends 'chr' to the chromosome; --siteinfo declares that the
    file has no FORMAT column.
    """
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    parser.add_option("--filter", type="string", dest="filter", help="extract records matching filter (default is None)", default=None)
    parser.add_option("--addchr", action="store_true", dest="addchr", help="pre-pend 'chr' to chrom column ", default=False)
    parser.add_option("--siteinfo", action="store_true", dest="siteinfo", help="use if vcf only has site information and lacks FORMAT column")
    parser.add_option("--dump", action="store_true", dest="dump", help="dump everything after teh ID column in the 4th bed column")
    parser.add_option("--chr", type="string", dest="chr", default=None, help="restrct to chromosome number specified by --chr")
    (options, args) = parser.parse_args()

    vcf_path = args[0]
    vcf_handle = open(vcf_path, 'r')
    vcf = VcfFile(vcf_path)
    # consume the meta-information (##) and header lines
    vcf.parseMetaAndHeaderLines(vcf_handle)

    site_only = bool(options.siteinfo)
    # index of the first column after the fixed ones (INFO, or FORMAT if present)
    first_extra_column = 8 if site_only else 9

    for dataline in vcf.yieldVcfDataLine(vcf_handle):
        columns = dataline.strip().split('\t')
        if site_only:
            (chrom, pos, variant_id, ref, alt, qual, filtercode, info) = columns[0:8]
        else:
            (chrom, pos, variant_id, ref, alt, qual, filtercode, info, _format) = columns[0:9]

        # --chr is compared before any 'chr' prefix is added
        if options.chr is not None and chrom != options.chr:
            continue
        if options.addchr:
            chrom = 'chr' + chrom
        if options.filter is not None and filtercode != options.filter:
            continue

        bed_fields = [chrom, str(int(pos) - 1), str(int(pos)), variant_id]
        if options.dump:
            extras = ",".join(columns[first_extra_column:])
            # NOTE(review): the dumped fields are concatenated with no
            # separator between them -- confirm that this is intended.
            bed_fields.append("".join([ref, alt, qual, filtercode, info, extras]))
        print("\t".join(bed_fields))
def main():
    """Print one FORMAT value per sample for every record of a VCF.

    The first output line is the tab-joined sample list; each following line
    holds, for one record, the tab-joined value of the --formatTag field
    (default GT) of every sample's genotype.

    --includeRef and --includeFilter are parsed but not consulted here.
    """
    usage = "usage: %prog [options] file.vcf \n output format values from genotype data field in a VCF for suitabale plotting/dataviz"
    parser = OptionParser(usage)
    parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False)
    parser.add_option("--includeFilter", action="store_true", dest="includeFilter", help="include site filtered or not!", default=False)
    parser.add_option("--formatTag", dest="format", default="GT", help="format tag to compare (default GT)")
    (options, args) = parser.parse_args()

    vcf_path = args[0]
    vcf = VcfFile(vcf_path)
    vcf_handle = open(vcf_path, 'r')
    vcf.parseMetaAndHeaderLines(vcf_handle)
    # the header is built but not emitted; the call is kept as in the original
    _header = vcf.returnHeader() + "\n"

    sample_names = vcf.getSampleList()
    print("\t".join(sample_names))

    for record in vcf.yieldVcfRecordwithGenotypes(vcf_handle):
        values = [
            "\t".join([genotype.getFormatVal(options.format)])
            for (_sample, genotype) in record.zipGenotypes(sample_names)
        ]
        print("\t".join(values))
def main():
    """Print summary statistics of the INFO DP values of a VCF file.

    Usage: ``prog [--max N] nuller.<chrom>:<start>..<end>.vcf``

    The region is parsed from the file name.  The output is one tab-separated
    line: the three region fields, then max, min, median and mean of DP and
    the number of records used.  Records whose DP is >= --max, or whose INFO
    column has no DP entry, are skipped.

    Exits with status 1 when the file name does not carry a region, when DP
    is not declared in the ##INFO headers, or when no DP value was collected.
    """
    usage = "usage: %prog [options] file.vcf \n print summary information about site depth in records of a VCF file\n"
    parser = OptionParser(usage)
    # sys.maxint exists on Python 2 only; fall back to sys.maxsize elsewhere
    parser.add_option("--max", type="int", dest="max", help="skip records that are greater than or equal to max (default sys.maxint)", default=getattr(sys, "maxint", sys.maxsize))
    (options, args) = parser.parse_args()
    if not args:
        parser.error("no VCF file given")

    vcfilename = args[0]
    fileName, fileExtension = os.path.splitext(vcfilename)
    # expected name: nuller.12:80717441..80717681.vcf
    regionpattern = r'nuller.(\d+):(\d+)..(\d+)'
    region_match = re.search(regionpattern, fileName)
    if region_match is None:
        sys.stderr.write("unable to parse region from file name " + vcfilename + "\n")
        sys.exit(1)
    regionstr = "\t".join(region_match.groups())

    # DP must start the INFO field or follow ';' so MDP=, ADP= ... do not match
    dp_pattern = re.compile(r'(?:^|;)DP=(\d+)')
    depth_list = []
    with open(vcfilename, 'r') as vcfh:
        vcfobj = VcfFile(vcfilename)
        # parse the meta-information lines (the ones that begin with ##)
        vcfobj.parseMetaLines(vcfh)
        infoids = [tag for (tag, _description) in vcfobj.getMetaInfoDescription()]
        if 'DP' not in infoids:
            sys.stderr.write("DP tag not in ##INFO headers!")
            sys.exit(1)

        vcfh.seek(0)
        vcfobj.parseHeaderLine(vcfh)
        for vrec in vcfobj.yieldVcfRecord(vcfh):
            # Test the match object itself: calling .groups() on a failed
            # search raised AttributeError, so the warning below never ran.
            dp_match = dp_pattern.search(vrec.getInfo())
            if dp_match is None:
                sys.stderr.write("unable to parse DP value from INFO field\n")
                continue
            depth = int(dp_match.group(1))
            if depth >= options.max:
                continue
            depth_list.append(depth)

    if not depth_list:
        # max()/min() of an empty array would raise ValueError
        sys.stderr.write("no DP values to summarise in " + vcfilename + "\n")
        sys.exit(1)

    depths = array(depth_list)
    outstr = "\t".join([regionstr, str(max(depths)), str(min(depths)), str(median(depths)), str(mean(depths)), str(len(depth_list))])
    print(outstr)
def main():
    """Write a VCF restricted to the samples listed in a pedigree file.

    Usage: ``prog -ped file.ped file.vcf.gz``

    The individual ids returned by ``Pedfile.returnIndivids()`` decide which
    sample columns are kept.  The header (with the reduced sample list) and
    every record (with only the kept genotypes) are printed to stdout.
    Inputs ending in ``.gz`` are read with gzip, anything else as plain text.
    """
    parser = argparse.ArgumentParser(description='Given a gzipped vcf file and pedigree file, generate a new vcf with only those samples present in the pedigree (ped file) ')
    parser.add_argument('-ped', dest='pedfile', type=str, help="*.ped file")
    parser.add_argument('vcfile', type=str, help='*.vcf.gz file')
    args = parser.parse_args()
    if args.pedfile is None:
        parser.error("please provide a ped file with -ped")

    # parse the pedfile and collect the individual ids to keep from the VCF
    pedobj = Pedfile(args.pedfile)
    pedobj.parsePedfile()
    # presumably ids are strings matching VCF sample names; a set makes the
    # per-genotype membership tests constant time
    keep = set(pedobj.returnIndivids())

    if args.vcfile.endswith(".gz"):
        vcfh = gzip.open(args.vcfile, 'r')
    else:
        vcfh = open(args.vcfile, 'r')
    try:
        vcfobj = VcfFile(args.vcfile)
        vcfobj.parseMetaAndHeaderLines(vcfh)
        samples = vcfobj.getSampleList()
        newsamples = [s for s in samples if s in keep]
        # The sample list used to be printed to stdout here for debugging,
        # which put a Python list repr ahead of the VCF header in the output.
        vcfobj.setSampleList(newsamples)
        print(vcfobj.returnHeader())

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            # genotypes are zipped against the ORIGINAL sample order
            keepGenotypes = [genObj for (s, genObj) in vrec.zipGenotypes(samples) if s in keep]
            vrec.addGenotypeList(keepGenotypes)
            print(vrec.toStringwithGenotypes())
    finally:
        vcfh.close()
def main():
    """Drop the named samples from a gzipped VCF and print the result.

    Positional arguments are the sample names to remove; -vcf names the
    gzipped VCF.  The header is printed with the reduced sample list, then
    every record with only the genotypes of the remaining samples.
    """
    parser = argparse.ArgumentParser(description="remove samples from vcf file")
    parser.add_argument("removesamples", metavar="sample", type=str, nargs="+", help="sample names to remove")
    parser.add_argument("-vcf", dest="vcfile", type=str, help="vcf file to remove samples from")
    args = parser.parse_args()

    handle = gzip.open(args.vcfile, "r")
    vcf = VcfFile(args.vcfile)
    vcf.parseMetaAndHeaderLines(handle)

    all_samples = vcf.getSampleList()
    remaining = [name for name in all_samples if name not in args.removesamples]
    vcf.setSampleList(remaining)
    print(vcf.returnHeader())

    for record in vcf.yieldVcfRecordwithGenotypes(handle):
        # genotypes are paired with the original (unfiltered) sample order
        kept = [
            genotype
            for (name, genotype) in record.zipGenotypes(all_samples)
            if name not in args.removesamples
        ]
        record.addGenotypeList(kept)
        print(record.toStringwithGenotypes())
def main():
    """Print genotype, Ts/Tv and substitution statistics for a VCF file.

    Usage: ``prog [--filter FILTER] file.vcf[.gz]``

    Output on stdout:
      * per sample, the four genotype counts (homoz_ref, het, homoz_nonref,
        nocall) and their total;
      * the transition and transversion counts and their ratio;
      * for each unordered pair of A, C, G, T the number of single-nucleotide
        REF/ALT substitutions and its share of all counted substitutions;
      * the number of VCF records examined.

    Only alleles where both REF and ALT are one character long are counted as
    substitutions.  Records with ALT '.' count towards the record total but
    are otherwise skipped.
    """
    usage = "usage: %prog [options] file.vcf"
    parser = OptionParser(usage)
    parser.add_option("--filter", type="string", dest="filter", help="analyze only those records with matching filter")
    (options, args) = parser.parse_args()
    if not args:
        parser.error("no VCF file given")

    vcfilename = args[0]
    if vcfilename.endswith(".gz"):
        vcfh = gzip.open(vcfilename, 'r')
    else:
        vcfh = open(vcfilename, 'r')

    TsTv_counter = collections.Counter()
    RefAlt_counter = collections.Counter()
    counter = 0
    try:
        vcfobj = VcfFile(vcfilename)
        # parse the meta-information and header lines
        vcfobj.parseMetaAndHeaderLines(vcfh)
        samples = vcfobj.getSampleList()
        # sample -> [homoz_ref, het, homoz_nonref, nocall]
        genotype_dict = dict((s, [0, 0, 0, 0]) for s in samples)

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            if options.filter is not None and vrec.getFilter() != options.filter:
                sys.stderr.write("skipped filter..\n")
                continue
            counter += 1
            if vrec.getAlt() == ".":
                continue

            ref = vrec.getRef()
            alt_alleles = vrec.getAlt().split(',')
            if len(alt_alleles) > 1:
                sys.stderr.write("multi alleleic record\n")
            for alt in alt_alleles:
                # The number of possible indel alleles is unbounded, so only
                # single-nucleotide substitutions are tracked.
                if len(alt) == 1 and len(ref) == 1:
                    if isTransition(ref, alt) == True:
                        TsTv_counter['transition'] += 1
                    else:
                        TsTv_counter['transversion'] += 1
                    RefAlt_counter[" ".join([ref, alt])] += 1

            for (g, sample) in get_genotype_counts(vrec.zipGenotypes(samples)):
                if g is None:
                    sys.stderr.write("skipped genotype\n")
                    continue
                genotype_dict[sample][g] += 1
    finally:
        vcfh.close()

    print("")
    print(" ".join(['sample', 'homoz_ref', 'het', 'homoz_nonref', 'nocall', 'total']))
    for sample in genotype_dict:
        sample_counts = genotype_dict[sample]
        print(" ".join([sample, " ".join(map(str, sample_counts)), str(sum(sample_counts))]))

    print("")
    for (kind, count) in TsTv_counter.items():
        print("%s %s" % (kind, count))

    # Guard the ratio the same way the percentages below are guarded; an
    # input without transversions used to abort here with ZeroDivisionError.
    if TsTv_counter['transversion'] == 0:
        sys.stderr.write("no transversions observed, TsTv ratio undefined\n")
    else:
        TsTvratio = float(TsTv_counter['transition']) / float(TsTv_counter['transversion'])
        print("TsTv:  %s" % round(TsTvratio, 2))

    substitution_total = sum(RefAlt_counter.values())  # loop invariant
    totalpercent = 0
    for a1, a2 in combinations('ACGT', 2):
        pair_total = RefAlt_counter[' '.join([a1, a2])] + RefAlt_counter[' '.join([a2, a1])]
        if substitution_total == 0:
            sys.stderr.write(" integer division or modulo by zero\n")
            continue
        percent = round(float(pair_total) / float(substitution_total), 4)
        print("%s %s %s" % (' '.join([a1, a2]), pair_total, percent))
        totalpercent += percent

    print("%s %s" % (substitution_total, totalpercent))
    print("Total vcf records: " + str(counter) + "\n")
def main():
    """Bin the records of a VCF file by QUAL into cumulative threshold files.

    The thresholds are ``numpy.linspace(start, end, num)``; for example
    -start 10 -end 100 -num 10 gives 10, 20, ..., 100.  For each threshold x
    a file ``<basename>.qual_<int(x)>.vcf`` is created with the VCF header,
    and every record with QUAL >= x is written to it, so one record can land
    in several files.

    Only records whose FILTER column equals -filter (default '.') are
    binned.  Records whose QUAL is not numeric are skipped with a note on
    stderr.
    """
    parser = argparse.ArgumentParser(description=' bin vcf records according to QUAL')
    parser.add_argument('vcfile', metavar='vcfile', type=str, help='file.vcf.gz')
    parser.add_argument('-filter', dest='filter', type=str, default=".", help='filter value')
    parser.add_argument('-start', dest='start', type=int, help="starting point for QUAL range")
    parser.add_argument('-end', dest='end', type=int, help="ending pint for QUAL range")
    parser.add_argument('-num', dest='num', type=int, help='number of bins')
    args = parser.parse_args()

    vcfile = os.path.split(args.vcfile)[1]
    basename = return_file_basename(return_file_basename(vcfile))
    print(basename)

    if args.start is None or args.end is None or args.num is None:
        sys.stderr.write("please give start stop and number of bins for QUAL")
        sys.exit(1)

    bins = np.linspace(args.start, args.end, args.num)
    # NOTE(review): the labels are the thresholds truncated to int, so
    # fractional thresholds can map to the same file name.
    binstring = bins.astype(int).tolist()
    print(binstring)
    binned_vcfilenames = [".".join([basename, "qual_" + str(s), "vcf"]) for s in binstring]
    print(binned_vcfilenames)

    binned_fh = []  # one output handle per threshold, in threshold order
    vcfh = None
    try:
        for name in binned_vcfilenames:
            binned_fh.append(open(name, 'w'))

        if args.vcfile.endswith(".gz"):
            vcfh = gzip.open(args.vcfile, 'r')
        else:
            vcfh = open(args.vcfile, 'r')
        vcfobj = VcfFile(args.vcfile)
        vcfobj.parseMetaAndHeaderLines(vcfh)
        header = vcfobj.returnHeader()
        for fh in binned_fh:
            fh.write(header + "\n")

        sys.stderr.write("binning vcf records based on quality ....\n")
        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            # skip records whose FILTER differs from the requested value
            if vrec.getFilter() != args.filter:
                continue
            try:
                qual = float(vrec.getQual())
            except ValueError:
                # QUAL may be '.' (missing); such a record cannot be binned
                sys.stderr.write("skipping record with non-numeric QUAL\n")
                continue
            vcfstring = vrec.toStringwithGenotypes()
            # thresholds ascend, so stop at the first one the record misses
            for (threshold, fh) in zip(bins, binned_fh):
                if qual < threshold:
                    break
                fh.write(vcfstring + "\n")
    finally:
        for fh in binned_fh:
            fh.close()
        if vcfh is not None:
            vcfh.close()
def main():
    """Print the reference base (and optional flanks) of selected VCF records.

    Usage: ``prog --tbf genome.2bit [--pad N] [--info TAG] [--type T] file.vcf``

    A record is selected when its INFO column matches ``TAG=(T)``.  For each
    selected record a tab-separated line is printed with "chr"+chrom, the
    zero-based start, the end and the upper-cased reference sequence of that
    interval.  With --pad N the line also carries the upstream window
    (coordinates and sequence) and the downstream window of N bases each.

    Sequences are looked up in the 2bit file under "chr"+chrom.  A record
    whose sequence cannot be fetched is reported on stderr and skipped.
    """
    usage = "usage: %prog [options] file.vcf"
    parser = OptionParser(usage)
    parser.add_option("--tbf", type="string", dest="tbf", help="2bit file")
    parser.add_option(
        "--pad",
        type="int",
        dest="pad",
        default=0,
        help="extract sequence upstream and downstream of position by pad value",
    )
    parser.add_option(
        "--info",
        type="string",
        dest="infotag",
        help="INFO tag id that annotates what type of variant the VCF record is",
        default="TYPE",
    )
    parser.add_option("--type", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default="snp")
    (options, args) = parser.parse_args()
    if not args:
        parser.error("no VCF file given")

    # open 2bitfile
    try:
        sys.stderr.write("opening twobitfile...\n")
        twobit = bx.seq.twobit.TwoBitFile(open(options.tbf, "rb"))
    except Exception:
        sys.stderr.write("unable to open twobit file!\n")
        sys.exit(1)

    vcfile = args[0]
    info_pattern = re.compile(options.infotag + "=(" + options.variantype + ")")

    with open(vcfile, "r") as vcfh:
        # VcfFile is given the file NAME, as at every other call site
        vcfobj = VcfFile(vcfile)
        vcfobj.parseMetaAndHeaderLines(vcfh)

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            pos = int(vrec.getPos())
            start = pos - 1
            end = pos
            if info_pattern.search(vrec.getInfo()) is None:
                continue

            chrom = "chr" + vrec.getChrom()
            try:
                sequence = twobit[chrom][start:end].upper()
            except Exception:
                # Skip the record: printing it would repeat the sequence
                # fetched for the PREVIOUS record.
                sys.stderr.write("unable to fetch sequence from 2bit file!\n")
                continue

            if options.pad != 0:
                downstream_start = pos
                upstream_end = pos - 1
                # clamp so a site near the chromosome start cannot produce a
                # negative slice index
                upstream_start = max(0, upstream_end - options.pad)
                downstream_seq = twobit[chrom][downstream_start : downstream_start + options.pad]
                upstream_seq = twobit[chrom][upstream_start:upstream_end]
                outstr = "\t".join(
                    [
                        chrom,
                        str(start),
                        str(end),
                        sequence,
                        str(upstream_start),
                        str(upstream_end),
                        upstream_seq,
                        str(downstream_start),
                        str(downstream_start + options.pad),
                        downstream_seq,
                    ]
                )
            else:
                outstr = "\t".join([chrom, str(start), str(end), sequence])
            print(outstr)
def main():
    """Annotate a VCF with reference/alternate allele counts from a BAM pileup.

    Given a gzipped VCF and an indexed BAM holding the sample(s) of the VCF,
    add INFO tags RA/AA (total number of reference/alternate alleles observed
    at the site) and FORMAT tags RA/AA (the same per sample) and print the
    new VCF to stdout.

    Reads are assigned to samples through their RG tag and the ID -> SM map
    of the BAM header.  Alignments with mapping quality below --mapq, bases
    with quality below --bq (Phred+33) and, unless --includeDuplicates is
    given, duplicate-marked reads are ignored.
    """
    usage = "usage: %prog [option] file.vcf.gz"
    parser = OptionParser(usage)
    parser.add_option("--bam", type="string", dest="bam", default=None, help="bam file to perform pileup on")
    parser.add_option(
        "--mapq",
        type="float",
        dest="mapq",
        default=0.0,
        help="Exclude alignments from analysis if they have a mapping less than mapq (default is 0)",
    )
    parser.add_option(
        "--bq",
        type="float",
        dest="bq",
        default=0.0,
        help="Exclude bases from analysis if their supporting base quality is less that --bq (default is 0)",
    )
    # store_true/default False matches the help text.  The earlier
    # store_false without a default left the option None unless the flag was
    # given, so duplicates were skipped only WITH --includeDuplicates.
    parser.add_option(
        "--includeDuplicates",
        action="store_true",
        dest="duplicate",
        default=False,
        help="include duplicate marked reads in analysis (turned off by default) ",
    )
    (options, args) = parser.parse_args()
    if options.bam is None:
        sys.stderr.write("please provide a value to --bam option\n")
        sys.exit(1)
    if not args:
        parser.error("no VCF file given")

    vcfilename = args[0]
    bamfilename = options.bam

    ra_formatline = FormatLine("RA", number="1", type="Integer", description="number of reference alleles observed")
    aa_formatline = FormatLine("AA", number="1", type="Integer", description="number of alternate alleles observed")

    if not os.path.exists(bamfilename + ".bai"):
        sys.stderr.write("please check for existence of bam index file (*.bai)\n")
        sys.exit(1)

    vcfobj = VcfFile(vcfilename)
    vcfh = gzip.open(vcfilename, "r")
    pybamfile = None
    try:
        vcfobj.parseMetaAndHeaderLines(vcfh)
        vcfobj.addMetaFormatHeader(ra_formatline)
        vcfobj.addMetaFormatHeader(aa_formatline)
        vcfobj.addMetaInfoHeader("RA", "Integer", "1", "total number of reference alleles observed")
        vcfobj.addMetaInfoHeader("AA", "Integer", "1", "total number of alternate alleles observed")
        print(vcfobj.returnHeader())

        pybamfile = pysam.Samfile(bamfilename, "rb")
        # read group ID -> sample name
        readgroupdict = dict((rg["ID"], rg["SM"]) for rg in pybamfile.header["RG"])
        samples = vcfobj.getSampleList()

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            chrom = vrec.getChrom()
            site = int(vrec.getPos()) - 1  # VCF POS is 1-based, pysam is 0-based
            ref = vrec.getRef()
            alt = vrec.getAlt()

            for pileupcolumn in pybamfile.pileup(chrom, site, site + 1):
                # pileup() yields every column covered by the fetched reads;
                # keep only the column of the variant site.  The earlier code
                # matched the NEXT column and read base qpos - 1, which picks
                # a wrong base for reads starting there and misses reads that
                # end on the site.
                if pileupcolumn.pos != site:
                    continue

                seqdict = dict.fromkeys("ACGTN", 0)  # base -> count over all samples
                sampledict = dict((s, []) for s in samples)  # sample -> observed bases

                for pileupread in pileupcolumn.pileups:
                    alignment = pileupread.alignment
                    if pileupread.is_del:
                        # the read has no base at a deleted position
                        continue
                    if alignment.is_duplicate and not options.duplicate:
                        continue
                    if alignment.mapq < options.mapq:
                        continue
                    qpos = pileupread.qpos
                    if (ord(alignment.qual[qpos]) - 33) < options.bq:
                        continue
                    base = alignment.seq[qpos]
                    seqdict[base] = seqdict.get(base, 0) + 1
                    sample = readgroupdict.get(dict(alignment.tags).get("RG"))
                    # reads without a known read group, or of samples absent
                    # from the VCF, count towards the site totals only
                    if sample in sampledict:
                        sampledict[sample].append(base)

                # .get(): REF/ALT may be an indel or a multi-allelic list,
                # which is never a key of the single-base tally
                vrec.addInfo("RA=" + str(seqdict.get(ref, 0)))
                if alt != ".":
                    vrec.addInfo("AA=" + str(seqdict.get(alt, 0)))

                for (sample, vcfgenobj) in vrec.zipGenotypes(samples):
                    bases = sampledict[sample]
                    if len(bases) == 0:
                        vcfgenobj.addFormat("RA")
                        vcfgenobj.addFormat("AA")
                        continue
                    vcfgenobj.addFormatVal("RA", str(bases.count(ref)))
                    vcfgenobj.addFormatVal("AA", str(bases.count(alt)))

            print(vrec.toStringwithGenotypes())
    finally:
        if pybamfile is not None:
            pybamfile.close()
        vcfh.close()
def main():
    """Print the meta/header lines of a VCF and the call rate of each sample.

    Usage: ``prog [--filter FILTER] file.vcf``

    A ``CR`` (site call rate) INFO header is added before the meta lines are
    printed.  Every record (optionally restricted to those whose FILTER
    column equals --filter) updates the per-sample tally through
    ``VcfRecord.sampleCallrate``.  Finally one line per sample is printed:
    sample, number of called genotypes, number of records, call rate.  The
    rate is ``NA`` when no record was counted.

    --info and --type are parsed but not consulted here.
    """
    usage = "usage: %prog [options] file.vcf"
    parser = OptionParser(usage)
    parser.add_option("--filter", type="string", dest="filter", help="analyze only those records matching filter (default is None)", default=None)
    parser.add_option("--info", type="string", dest="infotag", help="INFO tag id that annotates what type of variant the VCF record is", default="TYPE")
    parser.add_option("--type", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default=None)
    (options, args) = parser.parse_args()
    if not args:
        parser.error("no VCF file given")

    vcfilename = args[0]
    with open(vcfilename, 'r') as vcfh:
        vcfobj = VcfFile(vcfilename)
        # parse the meta-information lines (the ones that begin with ##)
        vcfobj.parseMetaLines(vcfh)
        # "D" is not a VCF INFO Type; a rate is a Float.  Number is passed as
        # a string, like the other addMetaInfoHeader call in this code base.
        vcfobj.addMetaInfoHeader("CR", "Float", "1", "site call rate")
        vcfobj.printMetaLines()
        vcfh.seek(0)
        vcfobj.parseHeaderLine(vcfh)
        vcfobj.printHeaderLine()

        samplelist = vcfobj.getSampleList()
        # sample name -> number of called genotypes
        sampleCalls = dict((s, 0) for s in samplelist)

        totalrecords = 0
        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            if options.filter is not None and vrec.getFilter() != options.filter:
                continue
            totalrecords += 1
            vrec.appendInfoString("CR=" + str(vrec.siteCallrate()))
            vrec.sampleCallrate(samplelist, sampleCalls)

    for s in samplelist:
        if totalrecords == 0:
            # nothing passed the filter: the rate is undefined, not a crash
            callrate = "NA"
        else:
            callrate = float(sampleCalls[s]) / float(totalrecords)
        print("%s %s %s %s" % (s, sampleCalls[s], totalrecords, callrate))
def main():
    """Filter VCF data lines by the allele frequency stored in the INFO column.

    Usage: ``prog [options] file.vcf``

    A data line is kept when its INFO column carries the variant-type tag
    (--variantag, optionally restricted to --variantype) and an allele
    frequency tag (--maftag) whose value lies in [--geq, --leq].  Kept lines
    are printed to stdout unless --quiet is given, and a tab-separated
    summary (chrom, pos, id, ref, alt, variant type, tag, value) is written
    to ``freq.log`` in the current directory.

    For a comma-separated frequency list only the first value is used.
    --filter is parsed but not applied.
    """
    usage = "usage: %prog [options] maf file.vcf"
    parser = OptionParser(usage)
    parser.add_option("--maftag", type="string", dest="maftag", help="INFO tag id that annotates the allele freq of the record", default="AF")
    parser.add_option("--variantag", type="string", dest="vtag", help="INFO tag that annotates the type of variant type", default="VT")
    parser.add_option("--variantype", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default=None)
    parser.add_option("--filter", type="string", dest="filter", help="extract records matching filter (default is None)", default=None)
    parser.add_option("--noheader", action="store_true", dest="noheader", help="VCF file has no header file", default=False)
    parser.add_option("--quiet", action="store_true", dest="quiet", help="don't print vcf output to stdout", default=False)
    parser.add_option("--leq", type="float", dest="leq", default=1.0, help="keep variants with AF <= (default 1)")
    parser.add_option("--geq", type="float", dest="geq", default=0.0, help="keep variants with AF >= (default 0)")
    (options, args) = parser.parse_args()
    if len(args) != 1:
        sys.stderr.write(usage + "\n")
        sys.exit(1)
    vcfilename = args[0]

    # INFO is a ';'-separated list of KEY=VALUE entries.  Anchor each key at
    # the field start or a ';' (so AF does not match inside EAF/MAF) and let
    # the value end at ';' or end of field (the type tag may be last).
    vtag = re.escape(options.vtag)
    if options.variantype is None:
        variant_re = re.compile(r'(?:^|;)' + vtag + r'=(\w+)(?:;|$)')
    else:
        variant_re = re.compile(r'(?:^|;)' + vtag + '=(' + options.variantype + r')(?:;|$)')
    # Capture the whole first value and let float() judge it: the previous
    # '0.\d+' could never match 1, 1.0, 0 or scientific notation.
    maf_re = re.compile(r'(?:^|;)' + re.escape(options.maftag) + r'=([^;,\s]+)')

    with open('freq.log', 'w') as freqfh, open(vcfilename, 'r') as vcfh:
        vcfobj = VcfFile(vcfilename)
        if not options.noheader:
            # parse the meta-information lines (the ones that begin with ##)
            vcfobj.parseMetaLines(vcfh)
        infoids = [tag for (tag, _description) in vcfobj.getMetaInfoDescription()]

        if not options.noheader:
            for required in (options.maftag, options.vtag):
                if required not in infoids and required != 'QUAL':
                    sys.stderr.write(required + " tag not in ##INFO headers!\n")
                    sys.exit(1)
            vcfobj.parseHeaderLine(vcfh)

        for dataline in vcfobj.yieldVcfDataLine(vcfh):
            fields = dataline.strip().split('\t')
            (chrom, pos, variant_id, ref, alt, qual, filtercode, info) = fields[0:8]

            variant_match = variant_re.search(info)
            if variant_match is None:
                continue
            maf_match = maf_re.search(info)
            if maf_match is None:
                continue
            maf_value = maf_match.group(1)
            try:
                maf = float(maf_value)
            except ValueError:
                # e.g. a missing value '.'
                continue

            if options.geq <= maf <= options.leq:
                if not options.quiet:
                    print(dataline)
                logstring = "\t".join([chrom, pos, variant_id, ref, alt, variant_match.group(1), options.maftag, maf_value])
                freqfh.write(logstring + '\n')
def main():
    """Print VCF records whose genotypes fit a simple inheritance model.

    Command line: ``[options] file.vcf.gz`` with a mandatory ``--ped`` file.

    Samples are split into affected (ped phenotype 2) and unaffected
    (phenotype 1). After the VCF header, a record is printed when *every*
    affected sample listed in the ped file carries the variant and *every*
    unaffected sample does not, where "carries" means
      * ``genotype.isSegregating()`` under ``--model dominant``
      * ``genotype.isNonRefHomz()``  under ``--model recessive``.
    Records whose FILTER column differs from ``--filter`` (default PASS) are
    skipped.

    Exits with status 1 when ``--ped`` or the VCF argument is missing, when
    the model is unsupported, or when a sample is both affected and
    unaffected.
    """
    usage = "usage: %prog [options] file.vcf.gz"
    parser = OptionParser(usage)
    parser.add_option("--model", type="string", dest="model", default="dominant", help=" inheritance model [dominant|recessive], default is dominant ")
    parser.add_option("--ped", type="string", dest="pedfile", default=None, help="ped file of samples with phenotype (disease) status")
    parser.add_option("--filter", type="string", dest="filter", help="analyze only those records matching filter (default is PASS)", default='PASS')

    (options, args) = parser.parse_args()
    if options.pedfile is None:
        sys.stderr.write("please provide a value to --ped parameter!\n")
        sys.exit(1)
    if len(args) < 1:
        parser.print_usage(sys.stderr)
        sys.exit(1)
    # Fail fast: previously an unsupported model was reported once per
    # genotype of every record and the run still went through the whole file.
    if options.model not in ('dominant', 'recessive'):
        sys.stderr.write(options.model + " not supported for genotype discrete filtering ...\n")
        sys.exit(1)
    dominant = options.model == 'dominant'

    # Each non-blank ped line becomes a Ped object; columns are split on any
    # whitespace so both tab- and space-delimited files are accepted.
    pedobjects = []
    with open(options.pedfile, 'r') as pedfh:
        for line in pedfh:
            fields = line.split()
            if not fields:
                continue
            (fid, iid, pid, mid, sex, phenotype) = fields[0:6]
            pedobjects.append(Ped(fid, iid, pid, mid, sex, int(phenotype)))

    # the phenotype status is set to 2 if the sample is affected:
    # http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped
    affecteds = [pedobj.getid() for pedobj in pedobjects if pedobj.getpheno() == 2]
    unaffecteds = [pedobj.getid() for pedobj in pedobjects if pedobj.getpheno() == 1]
    affected_ids = set(affecteds)
    unaffected_ids = set(unaffecteds)
    if affected_ids & unaffected_ids:
        sys.stderr.write("check list of affected and unaffecteds for overlapping samples!\n")
        sys.exit(1)

    vcfilename = args[0]
    vcfobj = VcfFile(vcfilename)
    vcfh = gzip.open(vcfilename, 'r')
    try:
        vcfobj.parseMetaAndHeaderLines(vcfh)
        header = vcfobj.returnHeader()
        samplelist = vcfobj.getSampleList()
        print(header)

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            if options.filter is not None and vrec.getFilter() != options.filter:
                continue

            affected_carriers = 0
            unaffected_noncarriers = 0
            # zipGenotypes yields (sample, VcfGenotype object) pairs
            for (sample, genotype) in vrec.zipGenotypes(samplelist):
                is_affected = sample in affected_ids
                is_unaffected = sample in unaffected_ids
                if not (is_affected or is_unaffected):
                    continue
                if dominant:
                    carries = genotype.isSegregating()
                else:
                    carries = genotype.isNonRefHomz()
                # Explicit ==True / ==False comparisons are kept on purpose:
                # a result that is neither (e.g. None) counts for neither
                # group, exactly as before.
                if is_affected and carries == True:
                    affected_carriers += 1
                if is_unaffected and carries == False:
                    unaffected_noncarriers += 1

            # Candidate site: all affecteds carry it, no unaffected does.
            # The counts are compared with the ped-file totals, so a ped
            # sample absent from the VCF prevents any record from passing.
            if (affected_carriers == len(affecteds)
                    and unaffected_noncarriers == len(unaffecteds)):
                print(vrec.toStringwithGenotypes())
    finally:
        vcfh.close()
def main():
    """Print records of a VCF that overlap (or avoid) regions of a second file.

    Command line: ``[options] vcf_file_one vcf|bed_file_two``.

    The second file is turned into per-chromosome bitsets, chosen by its
    extension (``.bed`` or ``.vcf``). Each data line of the first VCF is
    treated as the 1-bp interval ``[pos-1, pos)`` and printed when at least
    ``--minCols`` of its bases are set in the bitsets, or, with ``--v``, when
    they are not. ``--filter`` and ``--type`` restrict which records are
    considered at all.

    Exits with status 1 when fewer than two files are given or the second
    file has an unsupported extension.
    """
    usage = "usage: %prog [options] vcf_file_one vcf|bed_file_two\n\nFind regions in the first vcf file that overlap regions of the second vcf or bed file\n"
    parser = OptionParser(usage)
    parser.add_option("--minCols", type="int", dest="mincols", default=1, help="mininum basepair overlap (default is one)")
    parser.add_option("--v", action="store_true", dest="reverse", help="Print regions in first vcf that DO NOT overlap second vcf|bed file")
    parser.add_option("--filter", type="string", dest="filter", default=None, help="intersect records only set with filter (default is None")
    parser.add_option("--info", type="string", dest="infotag", help="INFO tag id that annotates what type of variant the VCF record is", default="TYPE")
    parser.add_option("--type", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default=None)
    parser.add_option("--noheader", action="store_true", dest="noheader", help="VCF file one has no header line", default=False)
    parser.add_option("--nochrprefix", action="store_false", dest="chrprefix", help="use if the bed doesn't have chr prefix in chrom column", default=True)

    (options, args) = parser.parse_args()
    if len(args) < 2:
        parser.print_usage(sys.stderr)
        sys.exit(1)

    sys.stderr.write("intersecting two files ...\n")
    vcf_file_one = args[0]
    in2_fname = args[1]
    in2_fname_ext = os.path.splitext(in2_fname)[1][1:]

    if in2_fname_ext == "bed":
        with open(in2_fname) as bedfh:
            bitsets = binned_bitsets_from_file(bedfh)
    elif in2_fname_ext == "vcf":
        bitsets = binned_bitsets_from_vcffile(in2_fname, options.filter)
    else:
        # Previously this fell through and crashed later with a NameError.
        sys.stderr.write("second file must end in .bed or .vcf: " + in2_fname + "\n")
        sys.exit(1)

    # Build the variant-type pattern once; anchor the tag at an INFO field
    # boundary so that e.g. "TYPE=" does not match inside "SVTYPE=".
    type_pattern = None
    if options.variantype is not None:
        type_pattern = re.compile(
            r'(?:^|;)' + re.escape(options.infotag) + '=(' + options.variantype + ')')

    report_overlapping = not options.reverse

    vcfobj = VcfFile(vcf_file_one)
    with open(vcf_file_one, 'r') as vcfh:
        if not options.noheader:
            vcfobj.parseMetaAndHeaderLines(vcfh)
            # Only print a header when one was actually parsed.
            print(vcfobj.returnHeader())

        for dataline in vcfobj.yieldVcfDataLine(vcfh):
            fields = dataline.strip().split('\t')
            (chrom, pos, varid, ref, alt, qual, filtercode, info) = fields[0:8]
            (start, end) = (int(pos) - 1, int(pos))

            if options.filter is not None and filtercode != options.filter:
                continue
            if type_pattern is not None and type_pattern.search(info) is None:
                continue

            # Add the "chr" prefix only when it is missing, so VCFs that
            # already use chr-style names are not turned into "chrchr1".
            if options.chrprefix and not chrom.startswith("chr"):
                chrom = "chr" + chrom

            overlaps = (chrom in bitsets
                        and bitsets[chrom].count_range(start, end - start) >= options.mincols)
            if bool(overlaps) == report_overlapping:
                print(dataline)
def main():
    """Print the records of a gzipped VCF whose genotypes match ``-gt`` filters.

    Command line: ``file.vcf.gz [--no-header] [-gt "sample GT" ...]``.

    Every ``-gt`` contributes (sample, genotype) pairs, e.g. ``-gt "NA1 0/1"``.
    Several genotypes given for the same sample are alternatives (logical OR);
    filters on different samples must all hold (logical AND). With no ``-gt``
    at all every record is printed. The header is printed first unless
    ``--no-header`` is given.

    Exits via ``parser.error`` on an unpaired ``-gt`` token and with status 1
    when a filtered sample is not present in the VCF.
    """
    parser = argparse.ArgumentParser(description='filter records based on genotypes')
    parser.add_argument('vcf', metavar='vcf', type=str, help='vcf.gz file')
    # store_false flag idiom: http://stackoverflow.com/a/15008806/1735942
    parser.add_argument('--no-header', dest='header', action='store_false')
    parser.add_argument('-gt', metavar='gt', type=str, nargs='*', action='append', help='sample 0/0')
    args = parser.parse_args()

    # nargs='*' combined with action='append' yields a list of lists
    # (http://stackoverflow.com/q/12460989/1735942) or None when -gt is never
    # given. Tokens are re-split on whitespace so that both the quoted form
    # (-gt "s1 0/0") and the unquoted form (-gt s1 0/0) are accepted.
    gt_dict = defaultdict(list)
    for group in (args.gt or []):
        tokens = ' '.join(group).split()
        if len(tokens) % 2 != 0:
            parser.error("-gt expects 'sample genotype' pairs, got: " + ' '.join(group))
        for i in range(0, len(tokens), 2):
            gt_dict[tokens[i]].append(tokens[i + 1])

    vcfh = gzip.open(args.vcf, 'r')
    try:
        vcfobj = VcfFile(args.vcf)
        vcfobj.parseMetaAndHeaderLines(vcfh)
        header = vcfobj.returnHeader()
        samplelist = vcfobj.getSampleList()

        # Validate before emitting anything, and report on stderr so the
        # message cannot end up inside redirected VCF output.
        for s in gt_dict:
            if s not in samplelist:
                sys.stderr.write(s + " not in samples!\n")
                sys.exit(1)

        if args.header:
            print(header)

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            # zipGenotypes yields (sample, VcfGenotype object) pairs
            keep = True
            for (s, g) in vrec.zipGenotypes(samplelist):
                if s in gt_dict and g.getFormatVal('GT') not in gt_dict[s]:
                    keep = False
                    break
            # all gt filters need to hold in order for the record to print
            if keep:
                print(vrec.toStringwithGenotypes())
    finally:
        vcfh.close()
def _write_genotype_table(outfh, table, labels):
    """Write a labelled tab-separated count table followed by its total."""
    outfh.write("rows are eval genotypes columns comparison genotypes\n")
    outfh.write("\t".join([''] + labels) + "\n")
    for (i, label) in enumerate(labels):
        outfh.write(label + "\t" + "\t".join(map(str, table[i, :].tolist())) + "\n")
    outfh.write("matrix sum: \n")
    outfh.write(str(np.sum(table)) + "\n")


def _percentage_or_na(numerator, denominator):
    """Return numerator/denominator as a percentage (2 decimals), or 'NA' if undefined."""
    if denominator == 0:
        return 'NA'
    return round(float(numerator) / float(denominator) * 100, 2)


def main():
    """Compute genotype concordance, NRD and NRS from a merged, gzipped VCF.

    Command line: ``[options] file.vcf.gz``. Samples are expected in
    consecutive (comparison, evaluation) pairs, as produced by
    ``CombineVariants --genotypemergeoption UNIQUIFY`` according to the usage
    text.

    Outputs, all named after the VCF basename:
      * ``.variantEval.txt``  -- 4x4 concordance table (rows = eval genotype,
        columns = comparison genotype; AA, AB, BB, ./.), the 2x2
        called/no-call table, and NRD / NRS unless ``--matrixonly``
      * ``.nrs.log`` / ``.nrd.log`` / ``.concordance.log`` -- records behind
        each statistic
      * ``.filtered.log`` / ``.multiallelic.log`` -- filtered or
        multi-allelic records
      * ``.fields.log``       -- the ``set=`` INFO value of every analysed
        record ('.' when absent)
    Per-pair output is delegated to the ``VcfSampleEval`` objects.
    """
    usage = "usage: %prog [options] file.vcf.gz \n calcuate NRS and NRD on a vcf generated from CombineVariants --genotypemergeoption UNIQUIFY\n"
    parser = OptionParser(usage)
    parser.add_option("--matrixonly", action="store_true", dest="matrixonly", help="only print concordance matrixe", default=False)
    parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False)
    parser.add_option("--includeFilter", action="store_true", dest="includeFilter", help="include site filtered or not!", default=False)
    (options, args) = parser.parse_args()
    if len(args) < 1:
        parser.error("expected a vcf.gz file")
    vcfilename = args[0]
    basename = os.path.splitext(os.path.splitext(vcfilename)[0])[0]

    # row is eval, column is comparison; indices are the genotype classes
    # returned by typeofGenotype (0..2 in the order AA, AB, BB; 3 = ./.)
    concordancetable = np.zeros((4, 4), dtype=int)
    calledtable = np.zeros((2, 2), dtype=int)

    handles = []  # every file opened here, closed in the finally below

    def open_log(*suffix):
        fh = open(".".join((basename,) + suffix), 'w')
        handles.append(fh)
        return fh

    try:
        outputfh = open_log('variantEval', 'txt')
        nrsfh = open_log('nrs', 'log')
        nrdfh = open_log('nrd', 'log')
        filteredfh = open_log('filtered', 'log')
        multifh = open_log('multiallelic', 'log')
        concordancefh = open_log('concordance', 'log')
        fieldsfh = open_log('fields', 'log')
        fieldsfh.write('set' + "\n")

        vcfobj = VcfFile(vcfilename)
        vcfh = gzip.open(vcfilename, 'r')
        handles.append(vcfh)
        vcfobj.parseMetaAndHeaderLines(vcfh)
        header = vcfobj.returnHeader() + "\n"
        for fh in (nrsfh, nrdfh, filteredfh, concordancefh, multifh):
            fh.write(header)

        samples = vcfobj.getSampleList()
        if len(samples) % 2 != 0:
            raise SystemExit("expected an even number of samples (comparison/evaluation pairs), found "
                             + str(len(samples)))
        vcf_sample_eval_objects = [VcfSampleEval(compare_name, eval_name, basename)
                                   for (compare_name, eval_name) in grouper(2, samples)]
        for evalObj in vcf_sample_eval_objects:
            evalObj.writeHeaders(header)

        totalrecords = 0
        # Capture only the set value itself: anchored at a field boundary and
        # stopping at the next ';'.
        set_pattern = re.compile(r'(?:^|;)set=([^;\s]+)')

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            info = vrec.getInfo()

            # Multi-allelic records are logged but still analysed. The old
            # test "',' in alt > 1" was a chained comparison that only
            # worked through Python 2's str/int ordering.
            if ',' in vrec.getAlt():
                multifh.write(vrec.toStringwithGenotypes() + "\n")

            # skip homoz reference calls unless you want to include them
            if 'ReferenceInAll' in info and not options.includeRef:
                continue

            outstring = vrec.toStringwithGenotypes() + "\n"

            # if the variant is filtered, log it and skip it
            if ('filterIn' in info and not options.includeFilter) or 'FilteredInAll' in info:
                filteredfh.write(outstring)
                continue

            # what set are you in? '.' when the record has no set= entry
            set_match = set_pattern.search(info)
            fieldsfh.write((set_match.group(1) if set_match else '.') + "\n")
            totalrecords += 1

            # zipGenotypes returns [(samplename, vcfgenotype), ...]; take the
            # entries two at a time: the first is the comparison genotype,
            # the second the evaluation genotype, matching the layout of
            # vcf_sample_eval_objects.
            genotype_pairs = grouper(2, vrec.zipGenotypes(samples))
            for ((comp_entry, eval_entry), evalObj) in zip(genotype_pairs, vcf_sample_eval_objects):
                (comp_allele1, comp_allele2) = comp_entry[1].getAlleles()
                (eval_allele1, eval_allele2) = eval_entry[1].getAlleles()
                eval_alleletype = typeofGenotype(eval_allele1, eval_allele2)
                comp_alleletype = typeofGenotype(comp_allele1, comp_allele2)

                # increment the cell count
                concordancetable[eval_alleletype, comp_alleletype] += 1
                evalObj.incrementcellcount(eval_alleletype, comp_alleletype)

                # NRS penalty: comparison is AB/BB but eval is AA or ./.
                if eval_alleletype in (0, 3) and comp_alleletype in (1, 2):
                    nrsfh.write(outstring)
                    evalObj.writeNrs(outstring)

                # Both called: identical class -> concordant, otherwise the
                # record contributes to the NRD penalty. No-calls on either
                # side are logged in neither file.
                if eval_alleletype in (0, 1, 2):
                    if comp_alleletype == eval_alleletype:
                        concordancefh.write(outstring)
                        evalObj.writeConcordance(outstring)
                    elif comp_alleletype in (0, 1, 2):
                        nrdfh.write(outstring)
                        evalObj.writeNrd(outstring)

        for evalObj in vcf_sample_eval_objects:
            evalObj.writeEvalOutput()

        outputfh.write("total records analyzed: " + str(totalrecords) + "\n")
        _write_genotype_table(outputfh, concordancetable, ['AA', 'AB', 'BB', './.'])

        # now we figure out how many sites were called or not called
        calledtable[0, 0] = concordancetable[0:3, 0:3].sum()
        calledtable[0, 1] = concordancetable[0:3, 3].sum()
        calledtable[1, 0] = concordancetable[3, 0:3].sum()
        calledtable[1, 1] = concordancetable[3, 3]
        outputfh.write("\n")
        _write_genotype_table(outputfh, calledtable, ['called', './.'])
        outputfh.write("\n")

        if not options.matrixonly:
            # NRD: discordant called pairs over all called pairs except the
            # AA/AA cell.
            discordance = (concordancetable[0, 1] + concordancetable[0, 2]
                           + concordancetable[1, 0] + concordancetable[1, 2]
                           + concordancetable[2, 0] + concordancetable[2, 1])
            total = discordance + concordancetable[1, 1] + concordancetable[2, 2]
            nrd = _percentage_or_na(discordance, total)

            # NRS: variant calls made by eval at sites that are variant in
            # the comparison, over all comparison-variant sites (columns
            # AB and BB, every row).
            variant_count_evaluation = concordancetable[1:3, 1:3].sum()
            variant_count_comparison = concordancetable[:, 1:3].sum()
            nrs = _percentage_or_na(variant_count_evaluation, variant_count_comparison)

            outputfh.write("NRD: " + str(nrd) + " \n")
            outputfh.write("NRS " + str(nrs) + " \n")
    finally:
        for fh in handles:
            fh.close()
def main():
    """Print the records of a VCF that belong to one variant class.

    Command line: ``[options] file.vcf``.

    Unless ``--noheader`` is given, the meta and header lines are parsed and
    printed and ``--info`` must name a declared ##INFO tag (or be ``QUAL``).
    A data line is then printed when
      * ``--filter`` is unset or equals the record's FILTER column, and
      * ``--type`` is unset or the INFO column contains ``<info>=<type>``.

    Exits with status 1 on empty ``--info`` / ``--type`` values, a missing
    file argument, or an undeclared INFO tag.
    """
    usage = "usage: %prog [options] file.vcf\n print records belonging to a certain type of variant class (e.g. SNP) in a VCF file\n\n"
    parser = OptionParser(usage)
    parser.add_option(
        "--info",
        type="string",
        dest="infotag",
        help="INFO tag id that annotates what type of variant the VCF record is",
        default="TYPE",
    )
    parser.add_option("--type", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default=None)
    parser.add_option(
        "--filter", type="string", dest="filter", help="extract records matching filter (default is None)", default=None
    )
    parser.add_option("--noheader", action="store_true", dest="noheader", help="VCF file has no header file")

    (options, args) = parser.parse_args()
    if options.infotag == "":
        sys.stderr.write("provide a value for --info parameter!\n")
        sys.exit(1)
    if options.variantype == "":
        sys.stderr.write("provide a value of --type parameter!\n")
        sys.exit(1)
    if len(args) < 1:
        parser.print_usage(sys.stderr)
        sys.exit(1)

    # Compile the type pattern once. The tag is anchored at an INFO field
    # boundary so that "TYPE=" does not also match inside "SVTYPE=".
    pattern = None
    if options.variantype is not None:
        pattern = re.compile(r"(?:^|;)" + re.escape(options.infotag) + "=(" + options.variantype + ")")

    vcfilename = args[0]
    vcfobj = VcfFile(vcfilename)
    with open(vcfilename, "r") as vcfh:
        # --noheader used to be accepted but ignored; a headerless file has
        # no meta lines to parse, print, or validate the tag against.
        if not options.noheader:
            vcfobj.parseMetaAndHeaderLines(vcfh)
            vcfobj.printMetaAndHeaderLines()
            infoids = [tag for (tag, description) in vcfobj.getMetaInfoDescription()]
            if options.infotag not in infoids and options.infotag != "QUAL":
                sys.stderr.write(options.infotag + " tag not in ##INFO headers!\n")
                sys.exit(1)

        for dataline in vcfobj.yieldVcfDataLine(vcfh):
            fields = dataline.strip().split("\t")
            (chrom, pos, varid, ref, alt, qual, filtercode, info) = fields[0:8]
            if options.filter is not None and filtercode != options.filter:
                continue
            if pattern is not None and pattern.search(info) is None:
                continue
            print(dataline)