Esempio n. 1
0
    def writeEvalOutput(self):
        """Write the genotype concordance report and the flattened matrix.

        The report is written to ``self.outputfh`` and contains, in order:

        * the genotype concordance table (rows are evaluation genotypes,
          columns are comparison genotypes, both in the order AA, AB, BB,
          ./.) followed by the sum of its cells;
        * the 2x2 called / no-call table derived from it, followed by its
          sum (``self.calledtable`` is overwritten with these counts);
        * the NRD and NRS percentages, rounded to two decimals.

        Finally the concordance table is flattened with ``melt_lol`` and
        written as one comma-separated line to ``self.genotypematrixfh``.

        NRD and NRS are reported as ``nan`` when their denominator is zero
        (e.g. no variant genotype was counted) instead of raising
        ZeroDivisionError before the matrix line has been written.
        """
        out = self.outputfh
        # Plain 2-D array view: row access yields a flat list whether the
        # table is stored as np.matrix or np.ndarray.
        counts = np.asarray(self.concordancetable)

        def write_table(column_labels, row_labels, table):
            # One labelled, tab-separated table followed by the sum of its cells.
            out.write("rows are eval genotypes columns comparison genotypes\n")
            out.write("\t".join([''] + column_labels) + "\n")
            for label, row in zip(row_labels, table.tolist()):
                out.write(label + "\t" + "\t".join(map(str, row)) + "\n")
            out.write("matrix sum: \n")
            out.write(str(np.sum(table)) + "\n")

        def percentage(numerator, denominator):
            # nan rather than a crash when nothing was counted.
            if denominator == 0:
                return float("nan")
            return round(float(numerator) / float(denominator) * 100, 2)

        genotype_labels = ['AA', 'AB', 'BB', './.']
        write_table(genotype_labels, genotype_labels, counts)

        # Collapse the three called genotypes (indices 0-2) into one class to
        # count how many genotypes were called / not called on each side.
        self.calledtable[0, 0] = counts[0:3, 0:3].sum()
        self.calledtable[0, 1] = counts[0:3, 3].sum()
        self.calledtable[1, 0] = counts[3, 0:3].sum()
        self.calledtable[1, 1] = counts[3, 3]
        out.write("\n")
        call_labels = ['called', './.']
        write_table(call_labels, call_labels, np.asarray(self.calledtable))
        out.write("\n")

        called = counts[0:3, 0:3]
        # NRD: mismatching cells among called genotypes, over all called
        # cells except the AA/AA cell.
        discordance = called.sum() - np.trace(called)
        total = called.sum() - counts[0, 0]
        nrd = percentage(discordance, total)

        # NRS: genotypes that are AB/BB on both sides, over every genotype
        # that is AB/BB in the comparison (any evaluation row, no-calls
        # included).
        variant_count_evaluation = counts[1:3, 1:3].sum()
        variant_count_comparison = counts[0:4, 1:3].sum()
        nrs = percentage(variant_count_evaluation, variant_count_comparison)

        out.write("NRD: " + str(nrd) + " \n")
        out.write("NRS " + str(nrs) + " \n")

        outstring = ",".join(map(str, melt_lol(self.concordancetable.tolist())))
        self.genotypematrixfh.write(outstring + "\n")
def main():
    """Split discordant sites of a VCF by how many sample pairs disagree.

    Reads the VCF named by the first positional argument.  For every record
    the genotypes are taken two at a time (comparison genotype first,
    evaluation genotype second, as yielded by ``grouper(2, ...)``) and each
    pair whose genotype types differ is counted.  A tab-separated line with
    the position, the value of the ``set`` INFO key and, per discordant
    pair, the sample names and genotype types is then written to

    * ``<basename>.allgenos.nrd.txt`` when three pairs disagree,
    * ``<basename>.twogenos.nrd.txt`` when two pairs disagree,
    * ``<basename>.onegenos.nrd.txt`` when one pair disagrees.

    Records with any other discordance count are not written anywhere.
    All files are closed on exit, including when an exception is raised.
    """
    usage = "usage: %prog [options]  nrd.log.vcf\n"
    parser = OptionParser(usage)

    (options, args) = parser.parse_args()
    if not args:
        # Print the usage text instead of failing with a bare IndexError.
        parser.error("missing input VCF file")
    vcfilename = args[0]
    basename = os.path.splitext(vcfilename)[0]

    vcfobj = VcfFile(vcfilename)
    with open(vcfilename, "r") as vcfh, \
            open(basename + ".allgenos.nrd.txt", "w") as nrdallfh, \
            open(basename + ".twogenos.nrd.txt", "w") as nrdtwofh, \
            open(basename + ".onegenos.nrd.txt", "w") as nrdonefh:
        # The original author tested against a trio, so the discordance
        # count of interest is 1, 2 or 3; each count has its own output file.
        output_by_count = {3: nrdallfh, 2: nrdtwofh, 1: nrdonefh}

        vcfobj.parseMetaAndHeaderLines(vcfh)
        samples = vcfobj.getSampleList()

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            outputline = [[vrec.getPos()]]

            # Which callset does the site belong to?
            setname = vrec.returnInfoDict()["set"]
            outputline.append([setname])

            nrd_count = 0
            for (compare, evaluated) in grouper(2, vrec.zipGenotypes(samples)):
                (comp_allele1, comp_allele2) = compare[1].getAlleles()
                (eval_allele1, eval_allele2) = evaluated[1].getAlleles()
                eval_alleletype = typeofGenotype(eval_allele1, eval_allele2)
                comp_alleletype = typeofGenotype(comp_allele1, comp_allele2)
                if eval_alleletype == comp_alleletype:
                    continue
                outputline.append([evaluated[0], str(eval_alleletype),
                                   compare[0], str(comp_alleletype)])
                nrd_count += 1

            target = output_by_count.get(nrd_count)
            if target is not None:
                target.write("\t".join(melt_lol(outputline)) + "\n")
def main():
    """Compare an evaluation VCF against a gold-standard VCF, per sample.

    Both VCFs are read in lockstep and must contain the same records in the
    same order; the program exits with status 1 on the first record whose
    CHROM or POS differs.  For every sample present in both files the pair
    (evaluation genotype type, gold genotype type) is counted in an overall
    4x4 table and in that sample's ``VcfSampleEval`` object.  A missing
    genotype type (``None``) is counted under index 3.

    Outputs:

    * ``NRS.log``: genotypes where the evaluation type is 0 or 3 and the
      gold type is 1 or 2;
    * ``NRD.log``: differing genotypes where the evaluation type is not 3;
    * stdout: one ``Sample / NRS / NRD`` line per sample;
    * per-sample matrices via ``VcfSampleEval.write_genotype_matrix``;
    * ``genotype.matrix.csv``: the flattened overall table (the original
      built this line but never wrote it);
    * ``concordance.txt``: created empty, as the original did.
    """
    parser = argparse.ArgumentParser(description='Calculate non-reference sensitivity (NRS) and non-reference discrepancy (NRD) of VCF files with the same records')
    parser.add_argument("-goldvcf", dest='gold', help="VCF with gold standard genotypes you want to compare to")
    parser.add_argument("-evalvcf", dest='eval', help="VCF you want to evaluate against the gold standard")

    args = parser.parse_args()
    if args.gold is None or args.eval is None:
        # Fail with the usage text instead of a TypeError from open(None).
        parser.error("both -goldvcf and -evalvcf are required")

    # itertools.izip exists only on Python 2; the builtin zip is already
    # lazy on Python 3.
    lazy_zip = getattr(itertools, "izip", zip)

    no_call = 3  # table index used when a genotype type is missing
    concordancetable = np.zeros((4, 4), dtype=int)

    vcf_sample_eval_objects = []
    common_samples = []
    first_record = True

    # The inputs are handed to vcf.Reader as gzip-compressed streams, so
    # they are opened in binary mode.
    with open('NRS.log', 'w') as nrsfh, open('NRD.log', 'w') as nrdfh, \
            open(args.eval, 'rb') as evalfh, open(args.gold, 'rb') as goldfh:
        vcf_readerOne = vcf.Reader(evalfh, compressed=True)
        vcf_readerTwo = vcf.Reader(goldfh, compressed=True)
        vcf_gen1 = py_recordgen(vcf_readerOne)
        vcf_gen2 = py_recordgen(vcf_readerTwo)

        sys.stderr.write("computing per-sample concordance ....\n")
        for vrec1, vrec2 in lazy_zip(vcf_gen1, vcf_gen2):
            if vrec1.CHROM != vrec2.CHROM:
                sys.stderr.write("chromosome number does not match!\n")
                sys.stderr.write(str(vrec1.CHROM) + " " + str(vrec2.CHROM) + "\n")
                sys.exit(1)

            if vrec1.POS != vrec2.POS:
                sys.stderr.write("chromosome POS  does not match!\n")
                # str() is required: concatenating the raw positions with
                # " " raised TypeError for integer positions.
                sys.stderr.write(str(vrec1.POS) + " " + str(vrec2.POS) + "\n")
                sys.exit(1)

            if vrec1.ID is None:
                vrec1.ID = '.'
            if vrec2.ID is None:
                vrec2.ID = '.'

            # Samples present in both records, in the evaluation file's order.
            gold_names = set(call.sample for call in vrec2.samples)
            common_samples = [call.sample for call in vrec1.samples
                              if call.sample in gold_names]

            if first_record:
                # One accumulator per sample, created from the first record.
                vcf_sample_eval_objects = [VcfSampleEval('gold', 'eval', x)
                                           for x in common_samples]
                first_record = False

            for eval_obj, sample in lazy_zip(vcf_sample_eval_objects, common_samples):
                eval_gt = vrec1.genotype(sample).gt_type
                gold_gt = vrec2.genotype(sample).gt_type
                if eval_gt is None:
                    eval_gt = no_call
                if gold_gt is None:
                    gold_gt = no_call

                eval_obj.incrementcellcount(eval_gt, gold_gt)
                concordancetable[eval_gt, gold_gt] += 1

                if eval_gt == gold_gt:
                    continue

                # Column layout kept exactly as before (sample name appears twice).
                logline = "\t".join([str(vrec1.CHROM), str(vrec1.POS), sample,
                                     vrec1.ID, sample, str(eval_gt), str(gold_gt)]) + "\n"
                if eval_gt in (0, no_call) and gold_gt in (1, 2):
                    nrsfh.write(logline)
                if eval_gt != no_call:
                    nrdfh.write(logline)

    # concordance.txt is opened only so that it is still created (empty).
    with open("concordance.txt", 'w'), open("genotype.matrix.csv", 'w') as matrixfh:
        print("Sample\tNRS\tNRD")
        for (eval_obj, sample) in lazy_zip(vcf_sample_eval_objects, common_samples):
            (NRS, NRD) = eval_obj.returnNRS_NRD()
            print("\t".join([sample, str(NRS), str(NRD)]))
            eval_obj.write_genotype_matrix()

        outstring = ",".join(map(str, melt_lol(concordancetable.tolist())))
        matrixfh.write(outstring + "\n")
def _percentage(numerator, denominator):
    """Return numerator/denominator as a percentage rounded to 2 decimals.

    Returns ``nan`` when ``denominator`` is zero so that an empty table does
    not abort the report with ZeroDivisionError.
    """
    if denominator == 0:
        return float("nan")
    return round(float(numerator) / float(denominator) * 100, 2)


def _write_count_table(fh, labels, table):
    """Write a square count table with ``labels`` on both axes, then its sum."""
    fh.write("rows are eval genotypes columns comparison genotypes\n")
    fh.write("\t".join([''] + list(labels)) + "\n")
    for label, row in zip(labels, np.asarray(table).tolist()):
        fh.write(label + "\t" + "\t".join(map(str, row)) + "\n")
    fh.write("matrix sum: \n")
    fh.write(str(np.sum(table)) + "\n")


def main():
    """Compute genotype concordance, NRD and NRS from a merged, gzipped VCF.

    The input (first positional argument) is expected to come from
    CombineVariants with ``--genotypemergeoption UNIQUIFY``; sample columns
    are consumed two at a time, the first of each pair being the comparison
    genotype and the second the evaluation genotype.  Genotype types index a
    4x4 table whose rows (evaluation) and columns (comparison) are AA, AB,
    BB and ./. in that order.

    Files written, all named ``<basename>.<suffix>``:

    * ``variantEval.txt``: record count, concordance table, called/no-call
      table and, unless ``--matrixonly`` is given, NRD and NRS (``nan`` when
      the denominator is zero);
    * ``nrs.log``: records where evaluation is AA or ./. and comparison is
      AB or BB;
    * ``nrd.log``: records where both are called but differ;
    * ``concordance.log``: records where both are called and agree;
    * ``filtered.log``: skipped filtered records;
    * ``multiallelic.log``: records with more than one ALT allele (these are
      logged but still analysed);
    * ``fields.log``: the ``set`` value of every analysed record;
    * ``genotype.matrix.csv``: the flattened concordance table.

    Raises:
        ValueError: if an analysed record has no ``set`` INFO annotation.
    """
    usage = "usage: %prog [options] file.vcf.gz \n calcuate NRS and NRD on a vcf generated from CombineVariants --genotypemergeoption UNIQUIFY\n"
    parser = OptionParser(usage)
    parser.add_option("--matrixonly", action="store_true", dest="matrixonly", help="only print concordance matrixe", default=False)
    parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False)
    parser.add_option("--includeFilter", action="store_true", dest="includeFilter", help="include site filtered or not!", default=False)

    (options, args) = parser.parse_args()
    if not args:
        # Print the usage text instead of failing with a bare IndexError.
        parser.error("missing input file.vcf.gz")

    vcfilename = args[0]
    # Strip both extensions of "name.vcf.gz".
    basename = os.path.splitext(os.path.splitext(vcfilename)[0])[0]

    # Row is evaluation genotype, column is comparison genotype.
    concordancetable = np.zeros((4, 4), dtype=int)
    calledtable = np.zeros((2, 2), dtype=int)

    # Also matches "set=" as the very first INFO key, which the previous
    # ';set=' pattern missed.
    set_pattern = re.compile(r'(?:^|;)set=(\S+)')

    handles = []

    def tracked(fh):
        # Remember every opened handle so the finally block can close it.
        handles.append(fh)
        return fh

    def output(*suffix):
        return tracked(open(".".join([basename] + list(suffix)), 'w'))

    try:
        outputfh = output('variantEval', 'txt')
        nrsfh = output('nrs', 'log')
        nrdfh = output('nrd', 'log')
        filteredfh = output('filtered', 'log')
        multifh = output('multiallelic', 'log')
        concordancefh = output('concordance', 'log')
        genotypematrixfh = output('genotype.matrix', 'csv')
        fieldsfh = output('fields', 'log')
        fieldsfh.write('set' + "\n")

        vcfobj = VcfFile(vcfilename)
        vcfh = tracked(gzip.open(vcfilename, 'r'))

        vcfobj.parseMetaAndHeaderLines(vcfh)
        header = vcfobj.returnHeader() + "\n"
        for fh in (nrsfh, nrdfh, filteredfh, concordancefh, multifh):
            fh.write(header)

        samples = vcfobj.getSampleList()

        # Not used below; still constructed in case VcfSampleEval has side
        # effects that later steps rely on -- TODO confirm and remove.
        vcf_sample_eval_objects = [VcfSampleEval(comparename, evalname, basename)
                                   for (comparename, evalname) in grouper(2, samples)]

        totalrecords = 0
        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            # The old test "',' in alt > 1" was a chained comparison that
            # only worked through Python 2's str/int ordering.
            if ',' in vrec.getAlt():
                multifh.write(vrec.toStringwithGenotypes() + "\n")

            info = vrec.getInfo()

            # Skip hom-ref sites unless explicitly requested.
            if 'ReferenceInAll' in info and not options.includeRef:
                continue

            # Skip (and log) filtered sites.
            if ('filterIn' in info and not options.includeFilter) or 'FilteredInAll' in info:
                filteredfh.write(vrec.toStringwithGenotypes() + "\n")
                continue

            # List of (samplename, genotype) tuples.
            vrec_ziptuple = vrec.zipGenotypes(samples)

            # Which callset does the record belong to?
            match = set_pattern.search(info)
            if match is None:
                raise ValueError("no 'set' annotation in INFO field: " + info)
            fieldsfh.write(match.group(1) + "\n")
            totalrecords += 1

            # Genotypes come in pairs: comparison first, evaluation second.
            for (compare, evaluated) in grouper(2, vrec_ziptuple):
                (comp_allele1, comp_allele2) = compare[1].getAlleles()
                (eval_allele1, eval_allele2) = evaluated[1].getAlleles()

                eval_alleletype = typeofGenotype(eval_allele1, eval_allele2)
                comp_alleletype = typeofGenotype(comp_allele1, comp_allele2)

                concordancetable[eval_alleletype, comp_alleletype] += 1

                # NRS penalty: evaluation is AA (0) or ./. (3) where the
                # comparison carries a variant (AB = 1, BB = 2).
                if eval_alleletype in (0, 3) and comp_alleletype in (1, 2):
                    nrsfh.write(vrec.toStringwithGenotypes() + "\n")

                # Both called: agreement goes to the concordance log,
                # disagreement to the NRD log.
                if eval_alleletype in (0, 1, 2) and comp_alleletype in (0, 1, 2):
                    if eval_alleletype == comp_alleletype:
                        concordancefh.write(vrec.toStringwithGenotypes() + "\n")
                    else:
                        nrdfh.write(vrec.toStringwithGenotypes() + "\n")

        outputfh.write("total records analyzed: " + str(totalrecords) + "\n")

        _write_count_table(outputfh, ['AA', 'AB', 'BB', './.'], concordancetable)

        # Collapse the three called genotypes into one class to count how
        # many genotypes were called / not called on each side.
        calledtable[0, 0] = concordancetable[0:3, 0:3].sum()
        calledtable[0, 1] = concordancetable[0:3, 3].sum()
        calledtable[1, 0] = concordancetable[3, 0:3].sum()
        calledtable[1, 1] = concordancetable[3, 3]
        outputfh.write("\n")
        _write_count_table(outputfh, ['called', './.'], calledtable)
        outputfh.write("\n")

        outstring = ",".join(map(str, melt_lol(concordancetable.tolist())))
        genotypematrixfh.write(outstring + "\n")

        if not options.matrixonly:
            called = concordancetable[0:3, 0:3]
            # NRD: mismatching cells among called genotypes, over all called
            # cells except the AA/AA cell.
            discordance = called.sum() - np.trace(called)
            total = called.sum() - concordancetable[0, 0]
            nrd = _percentage(discordance, total)

            # NRS: genotypes that are AB/BB on both sides, over every
            # genotype that is AB/BB in the comparison.
            variant_count_evaluation = concordancetable[1:3, 1:3].sum()
            variant_count_comparison = concordancetable[0:4, 1:3].sum()
            nrs = _percentage(variant_count_evaluation, variant_count_comparison)

            outputfh.write("NRD: " + str(nrd) + " \n")
            outputfh.write("NRS " + str(nrs) + " \n")
    finally:
        for fh in handles:
            fh.close()
Esempio n. 5
0
 def write_genotype_matrix(self):
     """Write the concordance table as a single comma-separated line.

     The table is converted to nested lists, passed through ``melt_lol``
     (presumably a list-of-lists flattener -- confirm against its
     definition), and the resulting items are written to
     ``self.genotypematrixfh`` followed by a newline.
     """
     nested_rows = self.concordancetable.tolist()
     cells = [str(value) for value in melt_lol(nested_rows)]
     self.genotypematrixfh.write(",".join(cells) + "\n")