# Standard-library and third-party imports needed by the main() functions in this
# section; the project-specific VCF helpers (VcfFile, VcfSampleEval, typeofGenotype,
# grouper, melt_lol) are assumed to be imported from elsewhere in this repository.
import os
import re
import gzip
import numpy as np
from optparse import OptionParser
from itertools import izip


def main():
    usage = "usage: %prog [options] nrd.log.vcf\n"
    parser = OptionParser(usage)
    # parser.add_option("--matrixonly", action="store_true", dest="matrixonly", help="only print concordance matrix", default=False)
    # parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False)
    (options, args) = parser.parse_args()

    vcfilename = args[0]
    basename = os.path.splitext(vcfilename)[0]

    vcfobj = VcfFile(vcfilename)
    vcfh = open(vcfilename, "r")

    nrdallfh = open(basename + ".allgenos.nrd.txt", "w")
    nrdtwofh = open(basename + ".twogenos.nrd.txt", "w")
    nrdonefh = open(basename + ".onegenos.nrd.txt", "w")

    vcfobj.parseMetaAndHeaderLines(vcfh)
    samples = vcfobj.getSampleList()
    # print samples
    # print "#setname\t" + "\t".join(samples)

    for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
        outputline = [[vrec.getPos()]]
        setname = vrec.returnInfoDict()["set"]  # which callset does the site belong to?
        outputline.append([setname])

        # we aggregate genotypes per sample here
        vrec_ziptuple = vrec.zipGenotypes(samples)
        # print vrec_ziptuple

        """ Since I'm testing against a trio, the NRD count can be 1, 2, or 3.
            We keep track of the NRD count and print those records to the
            appropriate file: nrdallfh, nrdtwofh, nrdonefh """
        nrd_count = 0
        for (compare, eval) in grouper(2, vrec_ziptuple):
            (comp_allele1, comp_allele2) = compare[1].getAlleles()
            (eval_allele1, eval_allele2) = eval[1].getAlleles()
            eval_alleletype = typeofGenotype(eval_allele1, eval_allele2)
            comp_alleletype = typeofGenotype(comp_allele1, comp_allele2)
            if eval_alleletype == comp_alleletype:
                continue
            outputline.append([eval[0], str(eval_alleletype), compare[0], str(comp_alleletype)])
            nrd_count += 1

        output = "\t".join(melt_lol(outputline))

        """ depending on the NRD count, print the record to the appropriate file(s) """
        if nrd_count == 3:
            nrdallfh.write(output + "\n")
        if nrd_count == 2:
            nrdtwofh.write(output + "\n")
        if nrd_count == 1:
            nrdonefh.write(output + "\n")
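# The main() functions in this section call a few generic helpers whose definitions
# live elsewhere in this project. Below is a minimal sketch of two of them, assuming
# grouper is the standard Python 2 itertools recipe and melt_lol simply flattens a
# list of lists before tab-joining; the real implementations may differ.
from itertools import izip_longest


def grouper(n, iterable, fillvalue=None):
    """Collect data into fixed-length chunks: grouper(2, 'ABCDEF') -> ('A','B'), ('C','D'), ('E','F')."""
    args = [iter(iterable)] * n
    return izip_longest(fillvalue=fillvalue, *args)


def melt_lol(lol):
    """Flatten a list of lists into one flat list, e.g. [['1000'], ['setA'], ['s1', '2']] -> ['1000', 'setA', 's1', '2']."""
    return [item for sublist in lol for item in sublist]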
def main():
    usage = ("usage: %prog [options] file.vcf.gz\n"
             "calculate NRS and NRD on a vcf generated from CombineVariants --genotypemergeoption UNIQUIFY\n")
    parser = OptionParser(usage)
    parser.add_option("--matrixonly", action="store_true", dest="matrixonly", help="only print concordance matrix", default=False)
    parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False)
    parser.add_option("--includeFilter", action="store_true", dest="includeFilter", help="include filtered sites", default=False)
    (options, args) = parser.parse_args()

    vcfilename = args[0]
    basename = os.path.splitext(os.path.splitext(vcfilename)[0])[0]

    """ rows are eval genotypes, columns are comparison genotypes;
        a numpy matrix represents the genotype concordance matrix """
    concordancetable = np.matrix([[0, 0, 0, 0],
                                  [0, 0, 0, 0],
                                  [0, 0, 0, 0],
                                  [0, 0, 0, 0]])
    calledtable = np.matrix([[0, 0],
                             [0, 0]])

    # the output file is the basename of the analyzed VCF with a variantEval.txt suffix
    outputfile = ".".join([basename, 'variantEval', 'txt'])
    outputfh = open(outputfile, 'w')

    # log file of sites that contribute to the NRS penalty:
    # hom-ref and no-calls at sites that are variant in the comparison set
    nrslog = ".".join([basename, 'nrs', 'log'])
    nrdlog = ".".join([basename, 'nrd', 'log'])
    filterlog = ".".join([basename, 'filtered', 'log'])
    multialleliclog = ".".join([basename, 'multiallelic', 'log'])
    concordancelog = ".".join([basename, 'concordance', 'log'])
    fieldslog = ".".join([basename, 'fields', 'log'])

    nrsfh = open(nrslog, 'w')
    nrdfh = open(nrdlog, 'w')
    filteredfh = open(filterlog, 'w')
    multifh = open(multialleliclog, 'w')
    concordancefh = open(concordancelog, 'w')
    fieldsfh = open(fieldslog, 'w')
    fieldsfh.write('set' + "\n")

    vcfobj = VcfFile(vcfilename)
    vcfh = gzip.open(vcfilename, 'r')

    vcfobj.parseMetaAndHeaderLines(vcfh)
    header = vcfobj.returnHeader() + "\n"
    nrsfh.write(header)
    nrdfh.write(header)
    filteredfh.write(header)
    concordancefh.write(header)
    multifh.write(header)
    # outputfh.write(header)

    samples = vcfobj.getSampleList()
    # for (comparename, evalname) in grouper(2, samples):
    #     print comparename, evalname

    vcf_sample_eval_objects = [VcfSampleEval(compare, eval, basename)
                               for (compare, eval) in grouper(2, samples)]

    for evalObj in vcf_sample_eval_objects:
        evalObj.writeHeaders(header)

    totalrecords = 0

    pattern = ';set=(\S+)'
    for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
        # log multi-allelic sites (ALT field lists more than one allele)
        if ',' in vrec.getAlt():
            outstring = vrec.toStringwithGenotypes() + "\n"
            multifh.write(outstring)
            # continue

        """ skip homozygous reference calls unless you want to include them! """
        if 'ReferenceInAll' in vrec.getInfo() and options.includeRef == False:
            continue

        """ if the variant is filtered, skip it! """
        if 'filterIn' in vrec.getInfo() and options.includeFilter == False:
            outstring = vrec.toStringwithGenotypes() + "\n"
            filteredfh.write(outstring)
            continue
        if 'FilteredInAll' in vrec.getInfo():
            outstring = vrec.toStringwithGenotypes() + "\n"
            filteredfh.write(outstring)
            continue

        # returns a list [ (samplename, vcfgenotype), ... ]
        vrec_ziptuple = vrec.zipGenotypes(samples)

        """ we make a hack and build a list like so:
            [ (sample.variant, compare_genotype, sample.variant2, eval_genotype), ... ]
            basically it halves the length of vrec_ziptuple and gives it the same
            structure as the list of VcfSampleEval objects """
        compare_eval = [compare + evalu for (compare, evalu) in grouper(2, vrec_ziptuple)]

        # which callset is the record in?
        field = re.search(pattern, vrec.getInfo()).groups()[0]
        fieldsfh.write(field + "\n")

        totalrecords += 1

        """ we take records two at a time, assuming the first is the comparison
            genotype and the second is the evaluation genotype """
        for (genotype_tuple, evalObj) in izip(compare_eval, vcf_sample_eval_objects):
            # print genotype_tuple
            compare = genotype_tuple[0:2]
            eval = genotype_tuple[2::]
            # print compare
            # print eval

            (comp_allele1, comp_allele2) = compare[1].getAlleles()
            (eval_allele1, eval_allele2) = eval[1].getAlleles()

            eval_alleletype = typeofGenotype(eval_allele1, eval_allele2)
            comp_alleletype = typeofGenotype(comp_allele1, comp_allele2)

            """ increment the cell count """
            concordancetable[eval_alleletype, comp_alleletype] += 1
            evalObj.incrementcellcount(eval_alleletype, comp_alleletype)

            """ write the genotype record to the appropriate log file """
            # print records that contribute to the NRS penalty
            if eval_alleletype == 3:
                if comp_alleletype == 1 or comp_alleletype == 2:
                    outstring = vrec.toStringwithGenotypes() + "\n"
                    nrsfh.write(outstring)
                    evalObj.writeNrs(outstring)
            if eval_alleletype == 0:
                if comp_alleletype == 1 or comp_alleletype == 2:
                    outstring = vrec.toStringwithGenotypes() + "\n"
                    nrsfh.write(outstring)
                    evalObj.writeNrs(outstring)

            # print records that contribute to the NRD penalty
            if eval_alleletype == 0:
                if comp_alleletype == 1 or comp_alleletype == 2:
                    outstring = vrec.toStringwithGenotypes() + "\n"
                    nrdfh.write(outstring)
                    evalObj.writeNrd(outstring)
                if comp_alleletype == 0:
                    outstring = vrec.toStringwithGenotypes() + "\n"
                    concordancefh.write(outstring)
                    evalObj.writeConcordance(outstring)
            if eval_alleletype == 1:
                if comp_alleletype == 0 or comp_alleletype == 2:
                    outstring = vrec.toStringwithGenotypes() + "\n"
                    nrdfh.write(outstring)
                    evalObj.writeNrd(outstring)
                if comp_alleletype == 1:
                    outstring = vrec.toStringwithGenotypes() + "\n"
                    concordancefh.write(outstring)
                    evalObj.writeConcordance(outstring)
            if eval_alleletype == 2:
                if comp_alleletype == 0 or comp_alleletype == 1:
                    outstring = vrec.toStringwithGenotypes() + "\n"
                    nrdfh.write(outstring)
                    evalObj.writeNrd(outstring)
                if comp_alleletype == 2:
                    outstring = vrec.toStringwithGenotypes() + "\n"
                    concordancefh.write(outstring)
                    evalObj.writeConcordance(outstring)

    for evalObj in vcf_sample_eval_objects:
        evalObj.writeEvalOutput()

    outputfh.write("total records analyzed: " + str(totalrecords) + "\n")

    outputfh.write("rows are eval genotypes, columns are comparison genotypes\n")
    outputfh.write("\t".join(['', 'AA', 'AB', 'BB', './.']) + "\n")

    rownames = [0, 'AA', 1, 'AB', 2, 'BB', 3, './.']
    for (i, gt) in grouper(2, rownames):
        row = concordancetable[i, :].tolist()
        for r in row:
            outstr = "\t".join(map(str, r))
            outputfh.write(gt + "\t" + outstr + "\n")
    outputfh.write("matrix sum: \n")
    sum = np.sum(concordancetable)
    outputfh.write(str(sum) + "\n")

    # now we figure out how many sites were called or not called
    calledtable[0, 0] = concordancetable[0:3, 0:3].sum()
    calledtable[0, 1] = concordancetable[0:3, 3].sum()
    calledtable[1, 0] = concordancetable[3, 0:3].sum()
    calledtable[1, 1] = concordancetable[3, 3]

    outputfh.write("\n")

    rownames = [0, 'called', 1, './.']
    outputfh.write("rows are eval genotypes, columns are comparison genotypes\n")
    outputfh.write("\t".join(['', 'called', './.']) + "\n")
    for (i, gt) in grouper(2, rownames):
        row = calledtable[i, :].tolist()
        for r in row:
            outstr = "\t".join(map(str, r))
            outputfh.write(gt + "\t" + outstr + "\n")
    outputfh.write("matrix sum: \n")
    sum = np.sum(calledtable)
    outputfh.write(str(sum) + "\n")
    outputfh.write("\n")

    if options.matrixonly == False:
        discordance = (concordancetable[0, 1] + concordancetable[0, 2] +
                       concordancetable[1, 0] + concordancetable[1, 2] +
                       concordancetable[2, 0] + concordancetable[2, 1])
        total = (concordancetable[0, 1] + concordancetable[0, 2] +
                 concordancetable[1, 0] + concordancetable[1, 1] + concordancetable[1, 2] +
                 concordancetable[2, 0] + concordancetable[2, 1] + concordancetable[2, 2])
        nrd = round((float(discordance) / float(total)) * 100, 2)

        variant_count_evaluation = (concordancetable[1, 1] + concordancetable[1, 2] +
                                    concordancetable[2, 1] + concordancetable[2, 2])
        variant_count_comparison = (concordancetable[0, 1] + concordancetable[0, 2] +
                                    concordancetable[1, 1] + concordancetable[1, 2] +
                                    concordancetable[2, 1] + concordancetable[2, 2] +
                                    concordancetable[3, 1] + concordancetable[3, 2])
        nrs = round(float(variant_count_evaluation) / float(variant_count_comparison) * 100, 2)

        outputfh.write("NRD: " + str(nrd) + " \n")
        outputfh.write("NRS: " + str(nrs) + " \n")
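# For reference, the NRD/NRS arithmetic at the end of main() restated as a small
# standalone sketch. It assumes the same 4x4 concordance matrix layout used above
# (rows = eval genotypes, columns = comparison genotypes; 0 = hom-ref AA,
# 1 = het AB, 2 = hom-alt BB, 3 = no-call ./.). The function names below are
# illustrative only and are not part of the script above.
def nrd_from_matrix(m):
    """Non-reference discordance: discordant called genotype pairs over all called
    pairs, excluding the concordant hom-ref cell and anything involving a no-call."""
    discordant = m[0, 1] + m[0, 2] + m[1, 0] + m[1, 2] + m[2, 0] + m[2, 1]
    total = discordant + m[1, 1] + m[2, 2]
    return round(100.0 * float(discordant) / float(total), 2)


def nrs_from_matrix(m):
    """Non-reference sensitivity: non-ref genotypes recovered by the eval set over
    all non-ref genotypes in the comparison set (including those the eval set
    missed as hom-ref or no-call)."""
    found = m[1, 1] + m[1, 2] + m[2, 1] + m[2, 2]
    comparison_nonref = m[0, 1] + m[0, 2] + found + m[3, 1] + m[3, 2]
    return round(100.0 * float(found) / float(comparison_nonref), 2)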