def main():
    """Print one FORMAT value per sample for every record of a VCF file.

    The first output line is the tab-joined list of sample names taken from
    the VCF header.  Each following line corresponds to one record and holds
    the value of the ``--formatTag`` field (default ``GT``) for every sample,
    tab separated and in header order.

    Exits through ``parser.error`` when no VCF file is given.
    """
    usage = "usage: %prog [options] file.vcf \n output format values from genotype data field in a VCF for suitabale plotting/dataviz"
    parser = OptionParser(usage)
    # --includeRef and --includeFilter are accepted on the command line but
    # are not consulted anywhere below.
    parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False)
    parser.add_option("--includeFilter", action="store_true", dest="includeFilter", help="include site filtered or not!", default=False)
    parser.add_option("--formatTag", dest="format", default="GT", help="format tag to compare (default GT)")

    (options, args) = parser.parse_args()
    if not args:
        parser.error("please provide a VCF file")

    vcfilename = args[0]
    vcfobj = VcfFile(vcfilename)

    # The record generator reads lazily from the handle, so the whole loop
    # has to stay inside the ``with`` block.
    with open(vcfilename, 'r') as vcfh:
        vcfobj.parseMetaAndHeaderLines(vcfh)
        samples = vcfobj.getSampleList()

        # Single-argument print(...) emits the same line under the Python 2
        # print statement and the Python 3 print function.
        print("\t".join(samples))

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            values = [
                geno_obj.getFormatVal(options.format)
                for (sample, geno_obj) in vrec.zipGenotypes(samples)
            ]
            print("\t".join(values))
def main():
    """Split discordant records of an NRD log VCF by number of discordant pairs.

    Samples are consumed two at a time (comparison, evaluation).  For every
    record the number of pairs whose genotype types differ is counted and the
    record summary is appended to one of three files next to the input:

    * ``<basename>.allgenos.nrd.txt`` -- three discordant pairs
    * ``<basename>.twogenos.nrd.txt`` -- two discordant pairs
    * ``<basename>.onegenos.nrd.txt`` -- one discordant pair

    Records with any other count are not written.  Each output line holds the
    position, the value of the ``set`` INFO key and, per discordant pair, the
    evaluation sample, its genotype type, the comparison sample and its
    genotype type, all tab separated.

    Exits through ``parser.error`` when no VCF file is given.
    """
    usage = "usage: %prog [options] nrd.log.vcf\n"
    parser = OptionParser(usage)

    (options, args) = parser.parse_args()
    if not args:
        parser.error("please provide a VCF file")

    vcfilename = args[0]
    basename = os.path.splitext(vcfilename)[0]
    vcfobj = VcfFile(vcfilename)

    # All four handles are closed on exit, including on error.
    with open(vcfilename, "r") as vcfh, \
            open(basename + ".allgenos.nrd.txt", "w") as nrdallfh, \
            open(basename + ".twogenos.nrd.txt", "w") as nrdtwofh, \
            open(basename + ".onegenos.nrd.txt", "w") as nrdonefh:

        # Discordant-pair count -> destination file.  The original author
        # tested against a trio, hence the counts 1, 2 and 3.
        destination = {3: nrdallfh, 2: nrdtwofh, 1: nrdonefh}

        vcfobj.parseMetaAndHeaderLines(vcfh)
        samples = vcfobj.getSampleList()

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            outputline = [[vrec.getPos()]]
            # Which callset does the site belong to?
            outputline.append([vrec.returnInfoDict()["set"]])

            nrd_count = 0
            # ``evaluated`` instead of ``eval`` so the builtin is not shadowed.
            for (compare, evaluated) in grouper(2, vrec.zipGenotypes(samples)):
                (comp_allele1, comp_allele2) = compare[1].getAlleles()
                (eval_allele1, eval_allele2) = evaluated[1].getAlleles()

                eval_alleletype = typeofGenotype(eval_allele1, eval_allele2)
                comp_alleletype = typeofGenotype(comp_allele1, comp_allele2)

                if eval_alleletype == comp_alleletype:
                    continue

                outputline.append([evaluated[0], str(eval_alleletype), compare[0], str(comp_alleletype)])
                nrd_count += 1

            output = "\t".join(melt_lol(outputline))

            target = destination.get(nrd_count)
            if target is not None:
                target.write(output + "\n")
def main():
    """Report the per-sample genotype call rate of a VCF file.

    The meta lines (with an added ``CR`` INFO definition) and the header line
    are printed first.  After all records have been read, one line per sample
    is printed with: sample name, number of called genotypes, number of
    records analysed and the resulting call rate.

    Only records whose FILTER equals ``--filter`` are analysed when that
    option is given.  When no record is analysed the call rate is printed as
    ``nan`` instead of raising ``ZeroDivisionError``.

    Exits through ``parser.error`` when no VCF file is given.
    """
    usage = "usage: %prog [options] file.vcf"
    parser = OptionParser(usage)
    parser.add_option("--filter", type="string", dest="filter", help="analyze only those records matching filter (default is None)", default=None)
    # --info and --type are accepted but not consulted anywhere below.
    parser.add_option("--info", type="string", dest="infotag", help="INFO tag id that annotates what type of variant the VCF record is", default="TYPE")
    parser.add_option("--type", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default=None)

    (options, args) = parser.parse_args()
    if not args:
        parser.error("please provide a VCF file")

    vcfilename = args[0]
    vcfobj = VcfFile(vcfilename)

    totalrecords = 0
    with open(vcfilename, 'r') as vcfh:
        # Parse the meta-information lines (the ones that begin with ##).
        vcfobj.parseMetaLines(vcfh)
        vcfobj.addMetaInfoHeader("CR", "D", 1, "site call rate")
        vcfobj.printMetaLines()

        # Rewind so the header line can be parsed from the top of the file.
        vcfh.seek(0)
        vcfobj.parseHeaderLine(vcfh)
        vcfobj.printHeaderLine()

        samplelist = vcfobj.getSampleList()
        # key: sample name, value: number of called genotypes
        sampleCalls = dict.fromkeys(samplelist, 0)

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            if options.filter is not None and vrec.getFilter() != options.filter:
                continue
            totalrecords += 1

            # The record is annotated with its site call rate, but the record
            # itself is not printed by this function.
            sitecallrate = vrec.siteCallrate()
            vrec.appendInfoString("CR=" + str(sitecallrate))

            # NOTE(review): sampleCallrate's return value is unused here, so
            # it presumably updates sampleCalls in place -- confirm.
            vrec.sampleCallrate(samplelist, sampleCalls)

    for s in samplelist:
        if totalrecords:
            callrate = float(sampleCalls[s]) / float(totalrecords)
        else:
            # No record analysed: the rate is undefined.
            callrate = float("nan")
        print(" ".join(str(v) for v in (s, sampleCalls[s], totalrecords, callrate)))
def main():
    """Write a VCF restricted to the samples listed in a pedigree file.

    Reads the individual ids from the ``-ped`` file, keeps the VCF samples
    that appear among them (in VCF header order) and prints the rewritten
    header followed by every record with only the kept genotype columns.

    The list of kept samples is reported on stderr so that stdout contains
    nothing but the resulting VCF.

    Exits through ``parser.error`` when ``-ped`` is missing.
    """
    import sys  # local import: only needed for the diagnostic line below

    parser = argparse.ArgumentParser(description='Given a gzipped vcf file and pedigree file, generate a new vcf with only those samples present in the pedigree (ped file) ')
    parser.add_argument('-ped', dest='pedfile', type=str, help="*.ped file")
    parser.add_argument('vcfile', type=str, help='*.vcf.gz file')
    args = parser.parse_args()

    if args.pedfile is None:
        parser.error("please provide a pedigree file with -ped")

    # Parse the ped file and collect the individual ids to keep.
    pedobj = Pedfile(args.pedfile)
    pedobj.parsePedfile()
    keep = set(pedobj.returnIndivids())  # set: O(1) membership per genotype

    vcfobj = VcfFile(args.vcfile)
    vcfh = gzip.open(args.vcfile, 'r')
    try:
        vcfobj.parseMetaAndHeaderLines(vcfh)

        # ``samples`` keeps the original column order; it is still needed to
        # pair genotypes with names after the header has been rewritten.
        samples = vcfobj.getSampleList()
        newsamples = [s for s in samples if s in keep]
        sys.stderr.write(str(newsamples) + "\n")

        vcfobj.setSampleList(newsamples)
        print(vcfobj.returnHeader())

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            keepGenotypes = [
                genObj
                for (s, genObj) in vrec.zipGenotypes(samples)
                if s in keep
            ]
            vrec.addGenotypeList(keepGenotypes)
            print(vrec.toStringwithGenotypes())
    finally:
        vcfh.close()
def main():
    """Remove the named samples from a gzipped VCF file.

    Positional arguments are the sample names to drop; ``-vcf`` names the
    gzipped VCF.  The rewritten header and every record, each without the
    removed genotype columns, are printed to stdout.  Names that do not occur
    in the VCF are ignored.

    Exits through ``parser.error`` when ``-vcf`` is missing.
    """
    parser = argparse.ArgumentParser(description="remove samples from vcf file")
    parser.add_argument("removesamples", metavar="sample", type=str, nargs="+", help="sample names to remove")
    parser.add_argument("-vcf", dest="vcfile", type=str, help="vcf file to remove samples from")
    args = parser.parse_args()

    if args.vcfile is None:
        parser.error("please provide a vcf file with -vcf")

    dropped = set(args.removesamples)  # set: O(1) membership per genotype

    vcfobj = VcfFile(args.vcfile)
    vcfh = gzip.open(args.vcfile, "r")
    try:
        vcfobj.parseMetaAndHeaderLines(vcfh)

        # ``samples`` keeps the original column order; it is still needed to
        # pair genotypes with names after the header has been rewritten.
        samples = vcfobj.getSampleList()
        newsamples = [s for s in samples if s not in dropped]

        vcfobj.setSampleList(newsamples)
        print(vcfobj.returnHeader())

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            keepGenotypes = [
                genObj
                for (s, genObj) in vrec.zipGenotypes(samples)
                if s not in dropped
            ]
            vrec.addGenotypeList(keepGenotypes)
            print(vrec.toStringwithGenotypes())
    finally:
        vcfh.close()
def main():
    """Compute genotype concordance, NRD and NRS for paired samples in a VCF.

    Samples are taken two at a time from the VCF header: the first of each
    pair is the comparison genotype, the second the evaluation genotype.
    Every analysed record increments a 4x4 concordance table (rows are
    evaluation genotypes, columns comparison genotypes; indices 0..3 are
    labelled AA, AB, BB and ./. in the report).

    Files written next to the input, named from its basename with two
    extensions stripped:

    * ``.variantEval.txt``  -- record count, concordance table, called /
      not-called table and, unless ``--matrixonly``, NRD and NRS
    * ``.nrs.log``          -- records where evaluation is AA or ./. while
      comparison is AB or BB
    * ``.nrd.log``          -- records where both are called but differ
    * ``.concordance.log``  -- records where both are called and agree
    * ``.filtered.log``     -- records skipped as filtered
    * ``.multiallelic.log`` -- records whose ALT contains a comma (these are
      still analysed)
    * ``.fields.log``       -- the captured ``set=`` value of each analysed
      record

    NRD and NRS are written as ``NA`` when their denominator is zero.
    Exits through ``parser.error`` when no VCF file is given.
    """
    usage = "usage: %prog [options] file.vcf.gz \n calcuate NRS and NRD on a vcf generated from CombineVariants --genotypemergeoption UNIQUIFY\n"
    parser = OptionParser(usage)
    parser.add_option("--matrixonly", action="store_true", dest="matrixonly", help="only print concordance matrixe", default=False)
    parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False)
    parser.add_option("--includeFilter", action="store_true", dest="includeFilter", help="include site filtered or not!", default=False)

    (options, args) = parser.parse_args()
    if not args:
        parser.error("please provide a vcf.gz file")

    vcfilename = args[0]
    # Strip two extensions, e.g. "x.vcf.gz" -> "x".
    basename = os.path.splitext(os.path.splitext(vcfilename)[0])[0]

    # Row is eval, column is comparison.
    concordancetable = np.matrix([[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    calledtable = np.matrix([[0, 0], [0, 0]])

    called_types = (0, 1, 2)   # AA, AB, BB
    variant_types = (1, 2)     # AB, BB

    # Raw string: "\S" is not a valid escape in a plain string literal.
    set_pattern = re.compile(r';set=(\S+)')

    def percent(numerator, denominator):
        # Percentage rounded to two decimals; "NA" when it is undefined.
        if denominator == 0:
            return "NA"
        return round((float(numerator) / float(denominator)) * 100, 2)

    def write_matrix(fh, matrix, labels):
        # One tab-separated line per matrix row, prefixed with its label.
        for (i, label) in enumerate(labels):
            cells = matrix[i, :].tolist()[0]
            fh.write(label + "\t" + "\t".join(map(str, cells)) + "\n")

    handles = []

    def open_tracked(fh):
        # Remember every handle so the finally-block can close all of them.
        handles.append(fh)
        return fh

    def open_out(tag, suffix):
        return open_tracked(open(".".join([basename, tag, suffix]), 'w'))

    try:
        outputfh = open_out('variantEval', 'txt')
        # Log of sites that contribute to the NRS penalty: hom-ref and
        # no-calls at variant sites in the comparison set.
        nrsfh = open_out('nrs', 'log')
        nrdfh = open_out('nrd', 'log')
        filteredfh = open_out('filtered', 'log')
        multifh = open_out('multiallelic', 'log')
        concordancefh = open_out('concordance', 'log')
        fieldsfh = open_out('fields', 'log')
        fieldsfh.write('set' + "\n")

        vcfobj = VcfFile(vcfilename)
        vcfh = open_tracked(gzip.open(vcfilename, 'r'))
        vcfobj.parseMetaAndHeaderLines(vcfh)
        header = vcfobj.returnHeader() + "\n"

        for fh in (nrsfh, nrdfh, filteredfh, concordancefh, multifh):
            fh.write(header)

        samples = vcfobj.getSampleList()

        # One evaluation object per (comparison, evaluation) sample pair.
        vcf_sample_eval_objects = [
            VcfSampleEval(compare_name, eval_name, basename)
            for (compare_name, eval_name) in grouper(2, samples)
        ]
        for evalObj in vcf_sample_eval_objects:
            evalObj.writeHeaders(header)

        totalrecords = 0

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            outstring = vrec.toStringwithGenotypes() + "\n"
            info = vrec.getInfo()

            # The original test was "',' in alt > 1", a chained comparison
            # that only evaluated as intended because Python 2 orders str
            # above int.  Multi-allelic records are logged, not skipped.
            if ',' in vrec.getAlt():
                multifh.write(outstring)

            # Skip homozygous reference calls unless asked to include them.
            if 'ReferenceInAll' in info and not options.includeRef:
                continue

            # If the variant is filtered, log it and skip it.
            if 'filterIn' in info and not options.includeFilter:
                filteredfh.write(outstring)
                continue
            if 'FilteredInAll' in info:
                filteredfh.write(outstring)
                continue

            # List of (samplename, vcfgenotype) tuples, in header order.
            vrec_ziptuple = vrec.zipGenotypes(samples)

            # Which set is the record in?
            # NOTE(review): raises AttributeError when INFO holds no
            # ";set=" entry (for example when set= is the first INFO key),
            # and \S+ captures everything up to the next whitespace --
            # confirm that set= is always a trailing INFO key.
            field = set_pattern.search(info).groups()[0]
            fieldsfh.write(field + "\n")
            totalrecords += 1

            # Genotypes are taken two at a time: the first is the comparison
            # genotype, the second the evaluation genotype.  The pairs line
            # up with vcf_sample_eval_objects.
            for ((compare, evaluated), evalObj) in zip(grouper(2, vrec_ziptuple), vcf_sample_eval_objects):
                (comp_allele1, comp_allele2) = compare[1].getAlleles()
                (eval_allele1, eval_allele2) = evaluated[1].getAlleles()

                eval_alleletype = typeofGenotype(eval_allele1, eval_allele2)
                comp_alleletype = typeofGenotype(comp_allele1, comp_allele2)

                # Increment the cell count.
                concordancetable[eval_alleletype, comp_alleletype] += 1
                evalObj.incrementcellcount(eval_alleletype, comp_alleletype)

                # NRS penalty: evaluation is hom-ref or no-call at a site
                # that is variant in the comparison set.
                if eval_alleletype in (0, 3) and comp_alleletype in variant_types:
                    nrsfh.write(outstring)
                    evalObj.writeNrs(outstring)

                # NRD penalty / concordance: both genotypes are called.
                if eval_alleletype in called_types and comp_alleletype in called_types:
                    if eval_alleletype == comp_alleletype:
                        concordancefh.write(outstring)
                        evalObj.writeConcordance(outstring)
                    else:
                        nrdfh.write(outstring)
                        evalObj.writeNrd(outstring)

        for evalObj in vcf_sample_eval_objects:
            evalObj.writeEvalOutput()

        outputfh.write("total records analyzed: " + str(totalrecords) + "\n")
        outputfh.write("rows are eval genotypes columns comparison genotypes\n")
        outputfh.write("\t".join(['', 'AA', 'AB', 'BB', './.']) + "\n")
        write_matrix(outputfh, concordancetable, ['AA', 'AB', 'BB', './.'])
        outputfh.write("matrix sum: \n")
        outputfh.write(str(np.sum(concordancetable)) + "\n")

        # Now figure out how many sites were called or not called.
        calledtable[0, 0] = concordancetable[0:3, 0:3].sum()
        calledtable[0, 1] = concordancetable[0:3, 3].sum()
        calledtable[1, 0] = concordancetable[3, 0:3].sum()
        calledtable[1, 1] = concordancetable[3, 3]

        outputfh.write("\n")
        outputfh.write("rows are eval genotypes columns comparison genotypes\n")
        outputfh.write("\t".join(['', 'called', './.']) + "\n")
        write_matrix(outputfh, calledtable, ['called', './.'])
        outputfh.write("matrix sum: \n")
        outputfh.write(str(np.sum(calledtable)) + "\n")
        outputfh.write("\n")

        if not options.matrixonly:
            discordant_cells = [(0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]
            discordance = 0
            for (row, col) in discordant_cells:
                discordance += concordancetable[row, col]
            # Denominator: every called cell except hom-ref / hom-ref.
            total = discordance + concordancetable[1, 1] + concordancetable[2, 2]
            nrd = percent(discordance, total)

            # Variant in both sets.
            variant_count_evaluation = concordancetable[1:3, 1:3].sum()
            # Variant in the comparison set, whatever the evaluation call.
            variant_count_comparison = concordancetable[0:4, 1:3].sum()
            nrs = percent(variant_count_evaluation, variant_count_comparison)

            outputfh.write("NRD: " + str(nrd) + " \n")
            outputfh.write("NRS " + str(nrs) + " \n")
    finally:
        for fh in handles:
            fh.close()
def main():
    """Print genotype, transition/transversion and substitution summaries of a VCF.

    Output on stdout, in order:

    * per sample: counts of homoz_ref, het, homoz_nonref and nocall
      genotypes plus their total
    * the transition and transversion counts and their ratio
    * for each unordered pair of A, C, G, T: the number of single-nucleotide
      substitutions between them (either direction) and its share of all
      counted substitutions
    * the total number of counted substitutions, the summed shares and the
      number of VCF records read

    Plain and ``.gz`` input are both accepted.  Only records whose FILTER
    equals ``--filter`` are analysed when that option is given.  The ratio is
    printed as ``NA`` when no transversion was seen.

    Exits through ``parser.error`` when no VCF file is given.
    """
    usage = "usage: %prog [options] file.vcf"
    parser = OptionParser(usage)
    parser.add_option("--filter", type="string", dest="filter", help="analyze only those records with matching filter")

    (options, args) = parser.parse_args()
    if not args:
        parser.error("please provide a VCF file")

    vcfilename = args[0]
    if vcfilename.endswith(".gz"):
        vcfh = gzip.open(vcfilename, 'r')
    else:
        vcfh = open(vcfilename, 'r')

    vcfobj = VcfFile(vcfilename)

    TsTv_counter = collections.Counter()
    RefAlt_counter = collections.Counter()
    counter = 0

    try:
        vcfobj.parseMetaAndHeaderLines(vcfh)
        samples = vcfobj.getSampleList()
        # Per sample: one counter per genotype type, in the column order
        # printed below (homoz_ref, het, homoz_nonref, nocall).
        genotype_dict = {s: [0, 0, 0, 0] for s in samples}

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            if options.filter is not None and vrec.getFilter() != options.filter:
                sys.stderr.write("skipped filter..\n")
                continue

            counter += 1
            if vrec.getAlt() == ".":
                continue

            ref = vrec.getRef()
            numAlleles = vrec.getAlt().split(',')
            if len(numAlleles) > 1:
                sys.stderr.write("multi alleleic record\n")

            for alt in numAlleles:
                # Since the number of alleles on indels is unbounded, only
                # single-nucleotide substitutions are tracked.
                if len(alt) == 1 and len(ref) == 1:
                    if isTransition(ref, alt) == True:
                        TsTv_counter['transition'] += 1
                    else:
                        TsTv_counter['transversion'] += 1
                    RefAlt_counter[" ".join([ref, alt])] += 1

            genotype_typecounts = get_genotype_counts(vrec.zipGenotypes(samples))
            for (g, sample) in genotype_typecounts:
                if g is None:
                    sys.stderr.write("skipped genotype\n")
                    continue
                genotype_dict[sample][g] += 1
    finally:
        vcfh.close()

    # Single-argument print(...) emits the same line under the Python 2
    # print statement and the Python 3 print function.
    print("")
    print(" ".join(['sample', 'homoz_ref', 'het', 'homoz_nonref', 'nocall', 'total']))
    for sample in genotype_dict:
        counts = genotype_dict[sample]
        outstring = " ".join(map(str, counts))
        print(" ".join([sample, outstring, str(sum(counts))]))

    print("")
    for (kind, count) in TsTv_counter.items():
        print(" ".join([str(kind), str(count)]))

    if TsTv_counter['transversion']:
        TsTvratio = float(TsTv_counter['transition']) / float(TsTv_counter['transversion'])
        print(" ".join(["TsTv: ", str(round(TsTvratio, 2))]))
    else:
        # No transversions: the ratio is undefined.
        sys.stderr.write("no transversions observed, TsTv ratio undefined\n")
        print(" ".join(["TsTv: ", "NA"]))

    substitution_total = sum(RefAlt_counter.values())  # loop invariant
    totalpercent = 0
    for a1, a2 in combinations('ACGT', 2):
        # Both directions of the substitution are pooled.
        total = RefAlt_counter[' '.join([a1, a2])] + RefAlt_counter[' '.join([a2, a1])]
        if substitution_total == 0:
            sys.stderr.write(" integer division or modulo by zero\n")
            continue
        percent = round(float(total) / float(substitution_total), 4)
        print(' '.join([a1, a2, str(total), str(percent)]))
        totalpercent += percent

    print(" ".join([str(substitution_total), str(totalpercent)]))
    print("Total vcf records: " + str(counter) + "\n")
def main():
    """Print the records of a gzipped VCF that match per-sample genotype filters.

    Each ``-gt`` value has the form ``"SAMPLE GENOTYPE"`` (one space), for
    example ``-gt "NA12878 0/1"``.  Several genotypes given for the same
    sample are alternatives; filters on different samples must all hold for
    a record to be printed.  With no ``-gt`` at all every record is printed.
    The header is printed first unless ``--no-header`` is given.

    Exits with status 1 when a filtered sample is not in the VCF, and through
    ``parser.error`` when a ``-gt`` value is malformed.
    """
    parser = argparse.ArgumentParser(description='filter records based on genotypes')
    parser.add_argument('vcf', metavar='vcf', type=str, help='vcf.gz file')
    # http://stackoverflow.com/a/15008806/1735942
    parser.add_argument('--no-header', dest='header', action='store_false')
    parser.add_argument('-gt', metavar='gt', type=str, nargs='*', action='append', help='sample 0/0')
    args = parser.parse_args()

    # nargs='*' with action='append' yields a list of lists, or None when
    # -gt never appears (http://stackoverflow.com/q/12460989/1735942).
    gt_dict = defaultdict(list)  # sample -> accepted GT values
    for elements in (args.gt or []):
        for spec in elements:
            parts = spec.split(' ')
            if len(parts) != 2:
                parser.error("-gt expects 'SAMPLE GENOTYPE', got: %r" % spec)
            gt_dict[parts[0]].append(parts[1])

    vcfobj = VcfFile(args.vcf)
    vcfh = gzip.open(args.vcf, 'r')
    try:
        vcfobj.parseMetaAndHeaderLines(vcfh)
        header = vcfobj.returnHeader()
        if args.header:
            print(header)

        samplelist = vcfobj.getSampleList()
        for s in gt_dict:
            if s not in samplelist:
                # A diagnostic, so it goes to stderr rather than into the
                # VCF stream on stdout.
                sys.stderr.write(" ".join([s, " not in samples!\n"]) + "\n")
                sys.exit(1)

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            record_passes = True
            # List of (sample, VcfGenotype object) tuples.
            for (s, g) in vrec.zipGenotypes(samplelist):
                if s not in gt_dict:
                    continue
                # Any one of the genotypes listed for the sample suffices.
                if g.getFormatVal('GT') not in gt_dict[s]:
                    record_passes = False
                    break

            # All sample filters must hold for the record to be printed.
            if record_passes:
                print(vrec.toStringwithGenotypes())
    finally:
        vcfh.close()
def main():
    """Annotate a gzipped VCF with reference/alternate allele counts from a BAM.

    Given a VCF file and a BAM file containing the sample(s) in the VCF, this
    adds ``RA`` and ``AA`` INFO tags (total reference / alternate alleles
    observed in the pileup at the site) and ``RA`` / ``AA`` FORMAT tags (the
    same counts per sample) and prints the new VCF to stdout.  Reads are
    assigned to samples through their ``RG`` tag and the ``SM`` field of the
    BAM header read groups.

    Reads are skipped when they are marked duplicate (unless
    ``--includeDuplicates`` is given), when their mapping quality is below
    ``--mapq``, when the base quality at the site is below ``--bq``, or when
    they carry a deletion at the site.  Alleles that are not a single base
    observed in the pileup count as 0.

    Exits with status 1 when ``--bam`` or the ``.bai`` index is missing, and
    through ``parser.error`` when no VCF file is given.
    """
    usage = "usage: %prog [option] file.vcf.gz"
    parser = OptionParser(usage)
    parser.add_option("--bam", type="string", dest="bam", default=None, help="bam file to perform pileup on")
    parser.add_option(
        "--mapq",
        type="float",
        dest="mapq",
        default=0.0,
        help="Exclude alignments from analysis if they have a mapping less than mapq (default is 0)",
    )
    parser.add_option(
        "--bq",
        type="float",
        dest="bq",
        default=0.0,
        help="Exclude bases from analysis if their supporting base quality is less that --bq (default is 0)",
    )
    # The flag used to be store_false with no default, so duplicates were
    # counted by default and dropped when the flag was passed -- the opposite
    # of what the help text promises.
    parser.add_option(
        "--includeDuplicates",
        action="store_true",
        dest="duplicate",
        default=False,
        help="include duplicate marked reads in analysis (turned off by default) ",
    )
    (options, args) = parser.parse_args()

    if options.bam is None:
        sys.stderr.write("please provide a value to --bam option\n")
        sys.exit(1)
    if not args:
        parser.error("please provide a vcf.gz file")

    vcfilename = args[0]
    bamfilename = options.bam

    ra_formatline = FormatLine("RA", number="1", type="Integer", description="number of reference alleles observed")
    aa_formatline = FormatLine("AA", number="1", type="Integer", description="number of alternate alleles observed")

    if not os.path.exists(bamfilename + ".bai"):
        sys.stderr.write("please check for existence of bam index file (*.bai)\n")
        sys.exit(1)

    vcfobj = VcfFile(vcfilename)
    vcfh = gzip.open(vcfilename, "r")
    pybamfile = None
    try:
        vcfobj.parseMetaAndHeaderLines(vcfh)
        vcfobj.addMetaFormatHeader(ra_formatline)
        vcfobj.addMetaFormatHeader(aa_formatline)
        vcfobj.addMetaInfoHeader("RA", "Integer", "1", "total number of reference alleles observed")
        vcfobj.addMetaInfoHeader("AA", "Integer", "1", "total number of alternate alleles observed")

        print(vcfobj.returnHeader())

        pybamfile = pysam.Samfile(bamfilename, "rb")
        # read group id -> sample name
        readgroupdict = dict((rg["ID"], rg["SM"]) for rg in pybamfile.header["RG"])

        samples = vcfobj.getSampleList()

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            chrom = vrec.getChrom()
            end = int(vrec.getPos())   # VCF POS is 1-based
            start = end - 1            # 0-based coordinate of the same base
            ref = vrec.getRef()
            alt = vrec.getAlt()

            for pileupcolumn in pybamfile.pileup(chrom, start, end):
                # pysam pileup columns are 0-based, so the variant site is
                # the column at ``start``.  The previous code picked the
                # column at ``end`` and read ``qpos - 1``, which dropped
                # reads ending at the site and indexed -1 (the last base of
                # the read) for reads starting right after it.
                if pileupcolumn.pos != start:
                    continue

                seqdict = dict.fromkeys("ACGTN", 0)
                sampledict = dict((s, []) for s in samples)

                for pileupread in pileupcolumn.pileups:
                    alignment = pileupread.alignment
                    # A deletion has no read base at this site.
                    if pileupread.is_del:
                        continue
                    if alignment.is_duplicate and not options.duplicate:
                        continue
                    if alignment.mapq < options.mapq:
                        continue
                    qpos = pileupread.qpos
                    # Base qualities are read as Phred+33 characters.
                    if (ord(alignment.qual[qpos]) - 33) < options.bq:
                        continue

                    base = alignment.seq[qpos]
                    seqdict[base] = seqdict.get(base, 0) + 1

                    readgroup = dict(alignment.tags)["RG"]
                    sample = readgroupdict[readgroup]
                    sampledict[sample].append(base)

                # .get(..., 0) mirrors the per-sample counts below, which
                # already report 0 for alleles that were not observed
                # (multi-base or multi-allelic REF/ALT included).
                vrec.addInfo("RA=" + str(seqdict.get(ref, 0)))
                if alt != ".":
                    vrec.addInfo("AA=" + str(seqdict.get(alt, 0)))

                for (sample, vcfgenobj) in vrec.zipGenotypes(samples):
                    observed = sampledict[sample]
                    if not observed:
                        # No usable read for this sample: add the tags
                        # without a value.
                        vcfgenobj.addFormat("RA")
                        vcfgenobj.addFormat("AA")
                        continue
                    vcfgenobj.addFormatVal("RA", str(observed.count(ref)))
                    vcfgenobj.addFormatVal("AA", str(observed.count(alt)))

                # NOTE(review): the original indentation was lost; the record
                # is printed here, i.e. only when a pileup column exists at
                # the site -- confirm uncovered records should be omitted.
                print(vrec.toStringwithGenotypes())
    finally:
        if pybamfile is not None:
            pybamfile.close()
        vcfh.close()
def main():
    """Print VCF records consistent with a simple inheritance model.

    The ``--ped`` file supplies phenotype status per sample (2 = affected,
    1 = unaffected).  A record is printed when

    * ``dominant`` (default): every affected sample is segregating for a
      non-ref allele and no unaffected sample is
    * ``recessive``: every affected sample is homozygous non-ref and no
      unaffected sample is

    The counts are compared against the number of affected / unaffected
    samples in the ped file, so a ped sample that is absent from the VCF
    prevents any record from matching.  Only records whose FILTER equals
    ``--filter`` (default ``PASS``) are considered.  The VCF header is
    printed first.

    Exits with status 1 when ``--ped`` is missing, when ``--model`` is not
    supported or when a sample is listed as both affected and unaffected,
    and through ``parser.error`` when no VCF file is given.
    """
    usage = "usage: %prog [options] file.vcf.gz"
    parser = OptionParser(usage)
    parser.add_option("--model", type="string", dest="model", default="dominant", help=" inheritance model [dominant|recessive], default is dominant ")
    parser.add_option("--ped", type="string", dest="pedfile", default=None, help="ped file of samples with phenotype (disease) status")
    parser.add_option("--filter", type="string", dest="filter", help="analyze only those records matching filter (default is PASS)", default='PASS')

    (options, args) = parser.parse_args()
    if options.pedfile is None:
        sys.stderr.write("please provide a value to --ped parameter!\n")
        sys.exit(1)

    # Genotype test per model.  Checked once up front: the message used to be
    # written for every sample of every record without ever aborting.
    carrier_tests = {
        'dominant': lambda genotype: genotype.isSegregating(),
        'recessive': lambda genotype: genotype.isNonRefHomz(),
    }
    if options.model not in carrier_tests:
        sys.stderr.write(options.model + " not supported for genotype discrete filtering ...\n")
        sys.exit(1)
    is_carrier = carrier_tests[options.model]

    if not args:
        parser.error("please provide a vcf.gz file")

    pedobjects = []  # one Ped object per line of the ped file
    with open(options.pedfile, 'r') as pedfh:
        for line in pedfh:
            if not line.strip():
                continue  # tolerate blank lines
            fields = line.strip().split('\t')
            (fid, iid, pid, mid, sex, phenotype) = fields[0:6]
            pedobjects.append(Ped(fid, iid, pid, mid, sex, int(phenotype)))

    # The phenotype status is set to 2 if the sample is affected:
    # http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped
    affecteds = [pedobj.getid() for pedobj in pedobjects if pedobj.getpheno() == 2]
    unaffecteds = [pedobj.getid() for pedobj in pedobjects if pedobj.getpheno() == 1]

    affected_ids = set(affecteds)
    unaffected_ids = set(unaffecteds)
    if affected_ids & unaffected_ids:
        sys.stderr.write("check list of affected and unaffecteds for overlapping samples!\n")
        sys.exit(1)

    vcfilename = args[0]
    vcfobj = VcfFile(vcfilename)
    vcfh = gzip.open(vcfilename, 'r')
    try:
        vcfobj.parseMetaAndHeaderLines(vcfh)
        header = vcfobj.returnHeader()
        samplelist = vcfobj.getSampleList()

        print(header)

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            if options.filter is not None and vrec.getFilter() != options.filter:
                continue

            affected_carriers = 0        # affected samples passing the test
            unaffected_noncarriers = 0   # unaffected samples failing it

            # List of (sample, VcfGenotype object) tuples.
            for (sample, genotype) in vrec.zipGenotypes(samplelist):
                in_affected = sample in affected_ids
                in_unaffected = sample in unaffected_ids
                if not (in_affected or in_unaffected):
                    continue
                result = is_carrier(genotype)
                # Explicit comparisons: a result that is neither True nor
                # False counts for neither group, as before.
                if in_affected and result == True:
                    affected_carriers += 1
                if in_unaffected and result == False:
                    unaffected_noncarriers += 1

            # Candidate site: all affecteds carry it and no unaffected does.
            if affected_carriers == len(affecteds) and unaffected_noncarriers == len(unaffecteds):
                print(vrec.toStringwithGenotypes())
    finally:
        vcfh.close()