def testRectify2(self):
    # The predicted deletion at 3 (GC -> G) and the predicted SNP at 6
    # (G -> A) together turn GCCG into GCA, which is exactly the single true
    # variant at 3. After rectify() the pair should be counted as one
    # INDEL_OTH true positive, with nothing left over in any other category.
    predicted_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . GC G 20 PASS . GT 1/1\n chr2 6 . G A 20 PASS . GT 1/1\n """
    truth_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . GCCG GCA 20 PASS . GT 1/1\n """
    truth = vcf_to_ChromVariants(truth_vcf, 'chr2')
    predicted = vcf_to_ChromVariants(predicted_vcf, 'chr2')
    # NOTE(review): the three position lists look like (true positives,
    # false positives, false negatives) -- confirm against ChromVariantStats.
    stats = ChromVariantStats(
        truth, predicted, [], [3, 6], [3], _genotype_concordance_dict())
    stats.rectify(get_reference(), 100)

    # The rescued variant is reported once, as a matched "other" indel.
    other_indel = VARIANT_TYPE.INDEL_OTH
    self.assertEqual(stats.num_pred[other_indel], 1)
    self.assertEqual(stats.num_tp[other_indel], 1)
    self.assertEqual(stats.num_fn[other_indel], 0)
    self.assertEqual(stats.num_fp[other_indel], 0)

    # The original SNP and deletion predictions no longer count anywhere.
    for variant_type in (VARIANT_TYPE.SNP, VARIANT_TYPE.INDEL_DEL):
        for counts in (stats.num_pred, stats.num_fp,
                       stats.num_fn, stats.num_tp):
            self.assertEqual(counts[variant_type], 0)
def testTruePosRectify(self):
    # Truth and prediction share one deletion at position 5; the remaining
    # deletion and SNP sit at shifted positions (3/8 in truth, 7/4 in the
    # prediction). The test expects rectify() to turn those mismatches into
    # true positives while keeping the already-matched deletion.
    truth_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 3 . TC T 20 PASS . GT 1/1\n chr4 5 . TC T 20 PASS . GT 1/1\n chr4 8 . C T 20 PASS . GT 1/1\n """
    predicted_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 4 . C T 20 PASS . GT 1/1\n chr4 5 . TC T 20 PASS . GT 1/1\n chr4 7 . TC T 20 PASS . GT 1/1\n """
    truth = vcf_to_ChromVariants(truth_vcf, 'chr4')
    predicted = vcf_to_ChromVariants(predicted_vcf, 'chr4')
    concordance = _genotype_concordance_dict()
    stats = ChromVariantStats(
        truth, predicted, [5], [4, 7], [3, 8], concordance)

    snp = VARIANT_TYPE.SNP
    deletion = VARIANT_TYPE.INDEL_DEL

    # Before rectify: only the deletion at 5 is matched.
    self.assertEqual(stats.num_tp[snp], 0)
    self.assertEqual(stats.num_fn[snp], 1)
    self.assertEqual(stats.num_fp[snp], 1)
    self.assertEqual(stats.num_fn[deletion], 1)
    self.assertEqual(stats.num_fp[deletion], 1)
    self.assertEqual(stats.num_tp[deletion], 1)

    stats.rectify(get_reference(), 100)

    # After rectify: every variant is a true positive, no errors remain.
    self.assertEqual(stats.num_fn[snp], 0)
    self.assertEqual(stats.num_fp[snp], 0)
    self.assertEqual(stats.num_tp[snp], 1)
    self.assertEqual(stats.num_fn[deletion], 0)
    self.assertEqual(stats.num_tp[deletion], 2)
    self.assertEqual(stats.num_fp[deletion], 0)
def testInit(self):
    # Check the per-type counts ChromVariantStats exposes right after
    # construction: two true SNPs (3, 5), two predicted SNPs (3, 7), of which
    # one matches, one is a false positive and one is a false negative.
    truth_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . G A 20 PASS . GT 1/1\n chr2 5 . C T 20 PASS . GT 1/1\n """
    predicted_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . G A 20 PASS . GT 1/1\n chr2 7 . G C 20 PASS . GT 1/1\n """
    truth = vcf_to_ChromVariants(truth_vcf, 'chr2')
    predicted = vcf_to_ChromVariants(predicted_vcf, 'chr2')

    # Record the single matching call (position 3) as hom-var vs hom-var in
    # the SNP section of the concordance table.
    concordance = _genotype_concordance_dict()
    hom_var = GENOTYPE_TYPE.HOM_VAR
    snp_concordance = concordance[VARIANT_TYPE.SNP]
    snp_concordance[hom_var][hom_var] += 1

    stats = ChromVariantStats(truth, predicted, [3], [7], [5], concordance)

    snp = VARIANT_TYPE.SNP
    self.assertEqual(stats.num_true[snp], 2)
    self.assertEqual(stats.num_pred[snp], 2)
    self.assertEqual(len(stats.false_positives.all_locations), 1)
    self.assertEqual(len(stats.false_negatives.all_locations), 1)
    self.assertEqual(stats.num_tp[snp], 1)
    self.assertEqual(stats.num_fn[snp], 1)
    self.assertEqual(stats.num_fp[snp], 1)
def testRectify(self):
    # rectify CVS with a rescue-able indel: the true variant TGC -> TAT at
    # position 2 is spelled by the predictor as two SNPs (G -> A at 3 and
    # C -> T at 4), which rectify() is expected to reconcile.
    true_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 2 . TGC TAT 20 PASS . GT 1/1\n """
    pred_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . G A 20 PASS . GT 1/1\n chr2 4 . C T 20 PASS . GT 1/1\n """
    true_vars = vcf_to_ChromVariants(true_str, 'chr2')
    pred_vars = vcf_to_ChromVariants(pred_str, 'chr2')
    # leave the concordance table empty, we aren't testing this yet
    gtdict = _genotype_concordance_dict()
    cvs = ChromVariantStats(true_vars, pred_vars, [], [3, 4], [2], gtdict)

    # before rectify, no true positives
    # (.values() instead of .itervalues(): the latter exists only on
    # Python 2 dicts, while .values() works on both Python 2 and 3)
    self.assertTrue(all(count == 0 for count in cvs.num_tp.values()))
    # one false negative indel
    self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_OTH], 1)
    # two false positive SNPs
    self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP], 2)

    cvs.rectify(get_reference(), 100)

    # after rectify, one true positive indel
    self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_OTH], 1)
    # no false positives or false negatives
    self.assertTrue(all(count == 0 for count in cvs.num_fp.values()))
    self.assertTrue(all(count == 0 for count in cvs.num_fn.values()))
def testTruePosRectify(self):
    # One deletion (position 5) is already matched between truth and
    # prediction; the other deletion and the SNP are reported at different
    # positions on each side. rectify() is expected to reconcile those so
    # that nothing is left as a false positive or false negative.
    expected_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 3 . TC T 20 PASS . GT 1/1\n chr4 5 . TC T 20 PASS . GT 1/1\n chr4 8 . C T 20 PASS . GT 1/1\n """
    called_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 4 . C T 20 PASS . GT 1/1\n chr4 5 . TC T 20 PASS . GT 1/1\n chr4 7 . TC T 20 PASS . GT 1/1\n """
    expected_vars = vcf_to_ChromVariants(expected_vcf, 'chr4')
    called_vars = vcf_to_ChromVariants(called_vcf, 'chr4')
    genotype_table = _genotype_concordance_dict()
    chrom_stats = ChromVariantStats(
        expected_vars, called_vars, [5], [4, 7], [3, 8], genotype_table)

    snp_type = VARIANT_TYPE.SNP
    del_type = VARIANT_TYPE.INDEL_DEL

    # Counts as constructed, before any rescue is attempted.
    self.assertEqual(chrom_stats.num_tp[snp_type], 0)
    self.assertEqual(chrom_stats.num_fn[snp_type], 1)
    self.assertEqual(chrom_stats.num_fp[snp_type], 1)
    self.assertEqual(chrom_stats.num_fn[del_type], 1)
    self.assertEqual(chrom_stats.num_fp[del_type], 1)
    self.assertEqual(chrom_stats.num_tp[del_type], 1)

    chrom_stats.rectify(get_reference(), 100)

    # Counts after rescue: one matched SNP, two matched deletions.
    self.assertEqual(chrom_stats.num_fn[snp_type], 0)
    self.assertEqual(chrom_stats.num_fp[snp_type], 0)
    self.assertEqual(chrom_stats.num_tp[snp_type], 1)
    self.assertEqual(chrom_stats.num_fn[del_type], 0)
    self.assertEqual(chrom_stats.num_tp[del_type], 2)
    self.assertEqual(chrom_stats.num_fp[del_type], 0)
def testRectify2(self):
    # Prediction: deletion GC -> G at 3 and SNP G -> A at 6.
    # Truth: one complex variant GCCG -> GCA at 3.
    # Both describe the same resulting sequence, so rectify() should leave
    # a single matched INDEL_OTH and zero SNP / INDEL_DEL counts.
    called_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . GC G 20 PASS . GT 1/1\n chr2 6 . G A 20 PASS . GT 1/1\n """
    expected_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . GCCG GCA 20 PASS . GT 1/1\n """
    expected_vars = vcf_to_ChromVariants(expected_vcf, 'chr2')
    called_vars = vcf_to_ChromVariants(called_vcf, 'chr2')
    genotype_table = _genotype_concordance_dict()
    chrom_stats = ChromVariantStats(
        expected_vars, called_vars, [], [3, 6], [3], genotype_table)
    chrom_stats.rectify(get_reference(), 100)

    # Exactly one prediction remains, and it is a matched "other" indel.
    indel_oth = VARIANT_TYPE.INDEL_OTH
    self.assertEqual(chrom_stats.num_pred[indel_oth], 1)
    self.assertEqual(chrom_stats.num_tp[indel_oth], 1)
    self.assertEqual(chrom_stats.num_fn[indel_oth], 0)
    self.assertEqual(chrom_stats.num_fp[indel_oth], 0)

    # Nothing is attributed to the SNP or deletion categories any more.
    zero_counters = ('num_pred', 'num_fp', 'num_fn', 'num_tp')
    for variant_type in (VARIANT_TYPE.SNP, VARIANT_TYPE.INDEL_DEL):
        for counter_name in zero_counters:
            counter = getattr(chrom_stats, counter_name)
            self.assertEqual(counter[variant_type], 0)
def testRectify(self):
    # rectify CVS with a rescue-able indel: truth has TGC -> TAT at
    # position 2, the prediction has the equivalent pair of SNPs at 3 and 4.
    true_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 2 . TGC TAT 20 PASS . GT 1/1\n """
    pred_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . G A 20 PASS . GT 1/1\n chr2 4 . C T 20 PASS . GT 1/1\n """
    true_vars = vcf_to_ChromVariants(true_str, 'chr2')
    pred_vars = vcf_to_ChromVariants(pred_str, 'chr2')
    # leave empty, we aren't testing this yet
    gtdict = _genotype_concordance_dict()
    cvs = ChromVariantStats(true_vars, pred_vars, [], [3, 4], [2], gtdict)

    # before rectify, no true positives
    # NOTE: uses .values() because dict.itervalues() is Python 2 only;
    # .values() behaves the same here on both Python 2 and Python 3.
    self.assertTrue(all(n == 0 for n in cvs.num_tp.values()))
    # one false negative indel
    self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_OTH], 1)
    # two false positive SNPs
    self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP], 2)

    cvs.rectify(get_reference(), 100)

    # after rectify, one true positive indel
    self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_OTH], 1)
    # no false positives or false negatives
    self.assertTrue(all(n == 0 for n in cvs.num_fp.values()))
    self.assertTrue(all(n == 0 for n in cvs.num_fn.values()))
def testInit(self):
    # test counts of false positive, false negative, true positive as they
    # stand immediately after constructing ChromVariantStats (no rectify).
    expected_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . G A 20 PASS . GT 1/1\n chr2 5 . C T 20 PASS . GT 1/1\n """
    called_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . G A 20 PASS . GT 1/1\n chr2 7 . G C 20 PASS . GT 1/1\n """
    expected_vars = vcf_to_ChromVariants(expected_vcf, 'chr2')
    called_vars = vcf_to_ChromVariants(called_vcf, 'chr2')

    # One SNP (position 3) agrees on both sides with genotype 1/1, so bump
    # the hom-var / hom-var cell of the SNP concordance table by one.
    genotype_table = _genotype_concordance_dict()
    hom_var = GENOTYPE_TYPE.HOM_VAR
    hom_var_row = genotype_table[VARIANT_TYPE.SNP][hom_var]
    hom_var_row[hom_var] += 1

    chrom_stats = ChromVariantStats(
        expected_vars, called_vars, [3], [7], [5], genotype_table)

    snp_type = VARIANT_TYPE.SNP
    # Totals on each side.
    self.assertEqual(chrom_stats.num_true[snp_type], 2)
    self.assertEqual(chrom_stats.num_pred[snp_type], 2)
    # One unmatched location on each side.
    self.assertEqual(len(chrom_stats.false_positives.all_locations), 1)
    self.assertEqual(len(chrom_stats.false_negatives.all_locations), 1)
    # Per-type outcome counts.
    self.assertEqual(chrom_stats.num_tp[snp_type], 1)
    self.assertEqual(chrom_stats.num_fn[snp_type], 1)
    self.assertEqual(chrom_stats.num_fp[snp_type], 1)
def testAggregate(self):
    # Build two ChromVariantStats objects (one per chromosome) through
    # chrom_evaluate_variants, then check that _aggregate() reports the sum
    # of their per-chromosome SNP counts.

    # chr2: true SNPs at 3 and 5, predicted SNPs at 3 and 7.
    chr2_true_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . G A 20 PASS . GT 1/1\n chr2 5 . C T 20 PASS . GT 1/1\n """
    chr2_pred_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . G A 20 PASS . GT 1/1\n chr2 7 . G C 20 PASS . GT 1/1\n """
    chr2_true_vars = vcf_to_ChromVariants(chr2_true_str, 'chr2')
    chr2_pred_vars = vcf_to_ChromVariants(chr2_pred_str, 'chr2')
    cvs2 = chrom_evaluate_variants(
        chr2_true_vars, chr2_pred_vars, 100, 100, get_reference(), 50)

    # chr3: true SNPs at 3 and 5, predicted SNPs at 3, 4 and 7.
    chr3_true_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr3 3 . G A 20 PASS . GT 1/1\n chr3 5 . C T 20 PASS . GT 1/1\n """
    chr3_pred_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr3 3 . G A 20 PASS . GT 1/1\n chr3 4 . T A 20 PASS . GT 1/1\n chr3 7 . G C 20 PASS . GT 1/1\n """
    chr3_true_vars = vcf_to_ChromVariants(chr3_true_str, 'chr3')
    chr3_pred_vars = vcf_to_ChromVariants(chr3_pred_str, 'chr3')
    cvs3 = chrom_evaluate_variants(
        chr3_true_vars, chr3_pred_vars, 100, 100, get_reference(), 50)

    # _aggregate returns a pair; only the aggregator callable is checked
    # here, so the second element is deliberately ignored.
    aggregator, _errors = _aggregate([cvs2, cvs3])

    # test some sums
    self.assertEqual(cvs2.num_true[VARIANT_TYPE.SNP], 2)
    self.assertEqual(cvs3.num_true[VARIANT_TYPE.SNP], 2)
    self.assertEqual(aggregator(VARIANT_TYPE.SNP)['num_true'], 4)
    self.assertEqual(cvs2.num_tp[VARIANT_TYPE.SNP], 1)
    self.assertEqual(cvs3.num_tp[VARIANT_TYPE.SNP], 1)
    self.assertEqual(aggregator(VARIANT_TYPE.SNP)['good_predictions'], 2)
def testAggregate(self):
    # Evaluate two chromosomes independently with chrom_evaluate_variants
    # and verify that _aggregate() adds up their SNP totals.

    # First chromosome (chr2): truth at 3 and 5, predictions at 3 and 7.
    true_str_chr2 = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . G A 20 PASS . GT 1/1\n chr2 5 . C T 20 PASS . GT 1/1\n """
    pred_str_chr2 = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . G A 20 PASS . GT 1/1\n chr2 7 . G C 20 PASS . GT 1/1\n """
    true_vars_chr2 = vcf_to_ChromVariants(true_str_chr2, 'chr2')
    pred_vars_chr2 = vcf_to_ChromVariants(pred_str_chr2, 'chr2')
    cvs2 = chrom_evaluate_variants(
        true_vars_chr2, pred_vars_chr2, 100, 100, get_reference(), 50)

    # Second chromosome (chr3): truth at 3 and 5, predictions at 3, 4, 7.
    true_str_chr3 = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr3 3 . G A 20 PASS . GT 1/1\n chr3 5 . C T 20 PASS . GT 1/1\n """
    pred_str_chr3 = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr3 3 . G A 20 PASS . GT 1/1\n chr3 4 . T A 20 PASS . GT 1/1\n chr3 7 . G C 20 PASS . GT 1/1\n """
    true_vars_chr3 = vcf_to_ChromVariants(true_str_chr3, 'chr3')
    pred_vars_chr3 = vcf_to_ChromVariants(pred_str_chr3, 'chr3')
    cvs3 = chrom_evaluate_variants(
        true_vars_chr3, pred_vars_chr3, 100, 100, get_reference(), 50)

    # Only the first element of the returned pair is exercised below.
    aggregator, _errors = _aggregate([cvs2, cvs3])

    # test some sums
    snp = VARIANT_TYPE.SNP
    self.assertEqual(cvs2.num_true[snp], 2)
    self.assertEqual(cvs3.num_true[snp], 2)
    self.assertEqual(aggregator(snp)['num_true'], 4)
    self.assertEqual(cvs2.num_tp[snp], 1)
    self.assertEqual(cvs3.num_tp[snp], 1)
    self.assertEqual(aggregator(snp)['good_predictions'], 2)