def testLeftNormalize(self):
    # Exercise left_normalize on one deletion and one insertion and compare
    # the returned (position, ref allele, alt alleles) with fixed values.

    # deletion on chr1
    del_pos, del_ref, del_alts = left_normalize(
        get_reference(), 'chr1', 2, 'CGCCG', ['CG'])
    self.assertEqual(del_pos, 0)
    self.assertEqual(del_ref, 'AACGC')
    self.assertEqual(del_alts[0], 'AA')

    # insertion on chr4
    ins_pos, ins_ref, ins_alts = left_normalize(
        get_reference(), 'chr4', 12, 'G', ['GGG'])
    self.assertEqual(ins_pos, 7)
    self.assertEqual(ins_ref, 'C')
    self.assertEqual(ins_alts[0], 'CGG')
def testChromEvaluateGenotypeConcordance(self):
    # Evaluate predicted SNP calls against truth on chr1 and inspect the
    # genotype concordance table and the _nrd_counts() tuple, first with
    # unphased genotypes and then with phased ones.
    truth_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 2 . A T 20 PASS . GT 0/1\n
chr1 5 . C T 20 PASS . GT 0/1\n
chr1 9 . A G 20 PASS . GT 1/1\n
"""
    called_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 2 . A T 20 PASS . GT 1/1\n
chr1 6 . C G 20 PASS . GT 0/1\n
chr1 9 . A G 20 PASS . GT 1/1\n
"""
    truth = vcf_to_ChromVariants(truth_vcf, 'chr1')
    called = vcf_to_ChromVariants(called_vcf, 'chr1')
    stats = chrom_evaluate_variants(truth, called, 100, 100,
                                    get_reference(), 50)
    snp_table = stats.genotype_concordance[VARIANT_TYPE.SNP]
    self.assertEqual(snp_table[GENOTYPE_TYPE.HET][GENOTYPE_TYPE.HOM_VAR], 1)
    snp_table = stats.genotype_concordance[VARIANT_TYPE.SNP]
    self.assertEqual(
        snp_table[GENOTYPE_TYPE.HOM_VAR][GENOTYPE_TYPE.HOM_VAR], 1)
    # calls that are not true positives are not counted in genotype
    # concordance
    self.assertEqual(stats._nrd_counts(VARIANT_TYPE.SNP), (1, 2))

    # phased genotypes should be counted correctly as well
    truth_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 2 . A T 20 PASS . GT 0|1\n
chr1 9 . A G 20 PASS . GT 1|1\n
"""
    called_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 2 . A T 20 PASS . GT 1|0\n
chr1 9 . A G 20 PASS . GT 1|1\n
"""
    truth = vcf_to_ChromVariants(truth_vcf, 'chr1')
    called = vcf_to_ChromVariants(called_vcf, 'chr1')
    stats = chrom_evaluate_variants(truth, called, 100, 100,
                                    get_reference(), 50)
    snp_table = stats.genotype_concordance[VARIANT_TYPE.SNP]
    self.assertEqual(snp_table[GENOTYPE_TYPE.HET][GENOTYPE_TYPE.HET], 1)
    snp_table = stats.genotype_concordance[VARIANT_TYPE.SNP]
    self.assertEqual(
        snp_table[GENOTYPE_TYPE.HOM_VAR][GENOTYPE_TYPE.HOM_VAR], 1)
    self.assertEqual(stats._nrd_counts(VARIANT_TYPE.SNP), (0, 2))
def testLeftNormalize(self):
    # Exercise left_normalize on a chr1 deletion and a chr2 insertion and
    # compare the returned (position, ref allele, alt alleles) with fixed
    # values.

    # deletion
    result = left_normalize(get_reference(), 'chr1', 2, 'CGCCG', ['CG'])
    shifted_pos, shifted_ref, shifted_alts = result
    self.assertEqual(shifted_pos, 0)
    self.assertEqual(shifted_ref, 'AACGC')
    self.assertEqual(shifted_alts[0], 'AA')

    # insertion
    result = left_normalize(get_reference(), 'chr2', 4, 'CGGA', ['CTTGGA'])
    shifted_pos, shifted_ref, shifted_alts = result
    self.assertEqual(shifted_pos, 1)
    self.assertEqual(shifted_ref, 'TGCC')
    self.assertEqual(shifted_alts[0], 'TGCCTT')
def testChromEvaluateVariantsSV(self):
    # One true structural insertion on chr1 is compared with four different
    # predictions; each time the SV_INS true-positive count is checked.
    # NB: SVs aren't rescued, only checked against the breakpoint tolerance.
    truth_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 6 . C CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA 20 PASS . GT 0/1
"""
    truth = vcf_to_ChromVariants(truth_vcf, 'chr1')

    # prediction at the same position with the same allele
    called_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 6 . C CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA 20 PASS . GT 0/1
"""
    called = vcf_to_ChromVariants(called_vcf, 'chr1')
    stats = chrom_evaluate_variants(truth, called, 100, 100,
                                    get_reference(), 50)
    self.assertEqual(stats.num_tp[VARIANT_TYPE.SV_INS], 1)

    # prediction at the same position with a different allele
    called_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 6 . C CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAATGC 20 PASS . GT 0/1
"""
    called = vcf_to_ChromVariants(called_vcf, 'chr1')
    stats = chrom_evaluate_variants(truth, called, 100, 100,
                                    get_reference(), 50)
    self.assertEqual(stats.num_tp[VARIANT_TYPE.SV_INS], 1)

    # prediction at a nearby position (within tolerance), same allele
    called_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 4 . C CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA 20 PASS . GT 0/1
"""
    called = vcf_to_ChromVariants(called_vcf, 'chr1')
    stats = chrom_evaluate_variants(truth, called, 100, 100,
                                    get_reference(), 50)
    self.assertEqual(stats.num_tp[VARIANT_TYPE.SV_INS], 1)

    # prediction outside of the tolerance
    called_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 110 . C CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA 20 PASS . GT 0/1
"""
    called = vcf_to_ChromVariants(called_vcf, 'chr1')
    stats = chrom_evaluate_variants(truth, called, 100, 100,
                                    get_reference(), 50)
    self.assertEqual(stats.num_tp[VARIANT_TYPE.SV_INS], 0)
def testRectify2(self):
    # Build ChromVariantStats by hand for a deletion + SNP prediction versus
    # a single complex truth indel on chr2, call rectify(), and check every
    # counter for INDEL_OTH, SNP and INDEL_DEL.
    called_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr2 3 . GC G 20 PASS . GT 1/1\n
chr2 6 . G A 20 PASS . GT 1/1\n
"""
    truth_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr2 3 . GCCG GCA 20 PASS . GT 1/1\n
"""
    truth = vcf_to_ChromVariants(truth_vcf, 'chr2')
    called = vcf_to_ChromVariants(called_vcf, 'chr2')
    concordance = _genotype_concordance_dict()
    stats = ChromVariantStats(truth, called, [], [3, 6], [3], concordance)
    stats.rectify(get_reference(), 100)

    # (counter attribute, variant type, expected count), in assertion order
    expectations = (
        ('num_pred', VARIANT_TYPE.INDEL_OTH, 1),
        ('num_tp', VARIANT_TYPE.INDEL_OTH, 1),
        ('num_fn', VARIANT_TYPE.INDEL_OTH, 0),
        ('num_fp', VARIANT_TYPE.INDEL_OTH, 0),
        ('num_pred', VARIANT_TYPE.SNP, 0),
        ('num_fp', VARIANT_TYPE.SNP, 0),
        ('num_fn', VARIANT_TYPE.SNP, 0),
        ('num_tp', VARIANT_TYPE.SNP, 0),
        ('num_pred', VARIANT_TYPE.INDEL_DEL, 0),
        ('num_fp', VARIANT_TYPE.INDEL_DEL, 0),
        ('num_fn', VARIANT_TYPE.INDEL_DEL, 0),
        ('num_tp', VARIANT_TYPE.INDEL_DEL, 0),
    )
    for counter, var_type, count in expectations:
        self.assertEqual(getattr(stats, counter)[var_type], count)
def testRectify(self):
    # Rectify a ChromVariantStats whose false-negative indel can be rescued
    # by two false-positive SNPs, checking counters before and after.
    truth_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr2 2 . TGC TAT 20 PASS . GT 1/1\n
"""
    called_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr2 3 . G A 20 PASS . GT 1/1\n
chr2 4 . C T 20 PASS . GT 1/1\n
"""
    truth = vcf_to_ChromVariants(truth_vcf, 'chr2')
    called = vcf_to_ChromVariants(called_vcf, 'chr2')
    # genotype concordance is left empty; it is not under test here
    concordance = _genotype_concordance_dict()
    stats = ChromVariantStats(truth, called, [], [3, 4], [2], concordance)

    # before rectify: no true positives at all
    self.assertTrue(all(n == 0 for n in stats.num_tp.itervalues()))
    # ... one false-negative indel
    self.assertEqual(stats.num_fn[VARIANT_TYPE.INDEL_OTH], 1)
    # ... and two false-positive SNPs
    self.assertEqual(stats.num_fp[VARIANT_TYPE.SNP], 2)

    stats.rectify(get_reference(), 100)

    # after rectify: one true-positive indel
    self.assertEqual(stats.num_tp[VARIANT_TYPE.INDEL_OTH], 1)
    # ... and neither false positives nor false negatives remain
    self.assertTrue(all(n == 0 for n in stats.num_fp.itervalues()))
    self.assertTrue(all(n == 0 for n in stats.num_fn.itervalues()))
def testRescueChromEvalVariants(self):
    # Run chrom_evaluate_variants on a deletion + SNP prediction versus a
    # single complex truth indel on chr2 and check every counter for
    # INDEL_OTH, SNP and INDEL_DEL.
    called_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr2 3 . GC G 20 PASS . GT 1/1\n
chr2 6 . G A 20 PASS . GT 1/1\n
"""
    truth_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr2 3 . GCCG GCA 20 PASS . GT 1/1\n
"""
    truth = vcf_to_ChromVariants(truth_vcf, 'chr2')
    called = vcf_to_ChromVariants(called_vcf, 'chr2')
    stats = chrom_evaluate_variants(truth, called, 100, 100,
                                    get_reference(), 50)

    # (counter attribute, variant type, expected count), in assertion order
    expectations = (
        ('num_pred', VARIANT_TYPE.INDEL_OTH, 1),
        ('num_tp', VARIANT_TYPE.INDEL_OTH, 1),
        ('num_fn', VARIANT_TYPE.INDEL_OTH, 0),
        ('num_fp', VARIANT_TYPE.INDEL_OTH, 0),
        ('num_pred', VARIANT_TYPE.SNP, 0),
        ('num_fp', VARIANT_TYPE.SNP, 0),
        ('num_fn', VARIANT_TYPE.SNP, 0),
        ('num_tp', VARIANT_TYPE.SNP, 0),
        ('num_pred', VARIANT_TYPE.INDEL_DEL, 0),
        ('num_fp', VARIANT_TYPE.INDEL_DEL, 0),
        ('num_fn', VARIANT_TYPE.INDEL_DEL, 0),
        ('num_tp', VARIANT_TYPE.INDEL_DEL, 0),
    )
    for counter, var_type, count in expectations:
        self.assertEqual(getattr(stats, counter)[var_type], count)
def testTooManyPaths(self):
    # Build a SequenceRescuer over several overlapping chr1 deletions in
    # both the false-negative and false-positive sets and expect that
    # nothing is rescued.
    missed_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 10049 . CTTAAGCT C 20 PASS . GT 1/1\n
chr1 10053 . TGCGT T 20 PASS . GT 0/1\n
chr1 10055 . GCTAA G 20 PASS . GT 0/1\n
chr1 10057 . TA T 20 PASS . GT 1/1\n
chr1 10058 . GC G 20 PASS . GT 1/1\n
"""
    spurious_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 10025 . CTTAAGCT C 20 PASS . GT 1/1\n
chr1 10028 . TGCGT T 20 PASS . GT 0/1\n
chr1 10029 . GCTAA G 20 PASS . GT 0/1\n
chr1 10032 . TA T 20 PASS . GT 1/1\n
"""
    missed = vcf_to_ChromVariants(missed_vcf, 'chr1')
    spurious = vcf_to_ChromVariants(spurious_vcf, 'chr1')
    # NOTE(review): the empty true-positive set is created for 'chr2' while
    # everything else here is on 'chr1' -- confirm this is intentional.
    no_true_pos = get_empty_ChromVariants('chr2')
    rescuer = SequenceRescuer('chr1', 10000, missed, spurious, no_true_pos,
                              get_reference(), 50)
    self.assertFalse(rescuer.rescued)
def testChromEvaluateVariantsKnownFP(self):
    # Evaluate predictions when a set of known false-positive locations is
    # supplied, and check both the ordinary SNP false-positive count and
    # the count of calls made at known false-positive sites.

    # truth: a single known variant
    truth_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 2 . A T 20 PASS . GT 0/1\n
"""
    # predictions: at the true variant, where nothing is known, and at a
    # known false-positive site
    called_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 2 . A T 20 PASS . GT 0/1\n
chr1 4 . G C 20 PASS . GT 1/1\n
chr1 7 . G A 20 PASS . GT 0/1\n
"""
    # locations known to carry NO variant
    no_variant_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 1 . A T 20 PASS . GT ./.\n
chr1 7 . G . 20 PASS . GT 0/0\n
"""
    truth = vcf_to_ChromVariants(truth_vcf, 'chr1')
    called = vcf_to_ChromVariants(called_vcf, 'chr1')
    no_variant_reader = vcf.Reader(StringIO.StringIO(no_variant_vcf))
    no_variant_sites = Variants(no_variant_reader, MAX_INDEL_LEN,
                                knownFP=True)
    stats = chrom_evaluate_variants(truth, called, 100, 100,
                                    get_reference(), 50,
                                    no_variant_sites.on_chrom('chr1'))
    # usual definition: predicted but not in truth
    self.assertEqual(stats.num_fp[VARIANT_TYPE.SNP], 2)
    # call made at a location known to NOT have a SNP
    self.assertEqual(stats.calls_at_known_fp[VARIANT_TYPE.SNP], 1)
def testTruePosRescueMission(self):
    # Call rescue_mission with false negatives, false positives and one
    # existing true positive on chr4; check the per-type counts it returns
    # and that both input sets end up with no locations.
    missed_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr4 3 . TC T 20 PASS . GT 1/1\n
chr4 8 . C T 20 PASS . GT 1/1\n
"""
    spurious_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr4 4 . C T 20 PASS . GT 1/1\n
chr4 7 . TC T 20 PASS . GT 1/1\n
"""
    matched_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr4 5 . TC T 20 PASS . GT 1/1\n
"""
    missed = vcf_to_ChromVariants(missed_vcf, 'chr4')
    spurious = vcf_to_ChromVariants(spurious_vcf, 'chr4')
    matched = vcf_to_ChromVariants(matched_vcf, 'chr4')
    gained_tp, dropped_fn = rescue_mission(missed, spurious, matched, 3,
                                           get_reference(), 100)
    self.assertEqual(gained_tp[VARIANT_TYPE.SNP], 1)
    self.assertEqual(gained_tp[VARIANT_TYPE.INDEL_DEL], 1)
    self.assertEqual(dropped_fn[VARIANT_TYPE.SNP], 1)
    self.assertEqual(dropped_fn[VARIANT_TYPE.INDEL_DEL], 1)
    self.assertFalse(missed.all_locations)
    self.assertFalse(spurious.all_locations)
def testGetSeq(self):
    # Call _get_seq over a chr3 window containing a deletion and a
    # heterozygous insertion, once with the last argument False and once
    # with it True, and compare both returned sequences each time.
    called_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr3 2 . TCGA T 20 PASS . GT 1/1\n
chr3 9 . A AAAA 20 PASS . GT 0/1\n
"""
    chrom_vars = vcf_to_ChromVariants(called_vcf, 'chr3')
    window = (1, 13, 'chr3')

    seqs = _get_seq(window, chrom_vars.getAllVariants(), get_reference(),
                    False)
    self.assertEqual(seqs[0], 'ATTCGAAAATCG')
    self.assertEqual(seqs[1], '')

    seqs = _get_seq(window, chrom_vars.getAllVariants(), get_reference(),
                    True)
    self.assertEqual(seqs[0], 'ATTCGATCG')
    self.assertEqual(seqs[1], 'ATCGATCGAAAATCG')
def testChromEvaluateVariantsKnownFP(self):
    # Supply known false-positive locations to chrom_evaluate_variants and
    # check the SNP false-positive count as well as the number of calls
    # that landed on a known false-positive site.

    # a single known true variant
    gold_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 2 . A T 20 PASS . GT 0/1\n
"""
    # calls at the true variant, at an unknown site, and at a known FP site
    test_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 2 . A T 20 PASS . GT 0/1\n
chr1 4 . G C 20 PASS . GT 1/1\n
chr1 7 . G A 20 PASS . GT 0/1\n
"""
    # sites known to have NO variant
    fp_sites_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 1 . A T 20 PASS . GT ./.\n
chr1 7 . G . 20 PASS . GT 0/0\n
"""
    gold = vcf_to_ChromVariants(gold_vcf, 'chr1')
    test = vcf_to_ChromVariants(test_vcf, 'chr1')
    fp_sites_buffer = StringIO.StringIO(fp_sites_vcf)
    fp_sites = Variants(vcf.Reader(fp_sites_buffer), MAX_INDEL_LEN,
                        knownFP=True)
    result = chrom_evaluate_variants(gold, test, 100, 100, get_reference(),
                                     50, fp_sites.on_chrom('chr1'))
    # usual definition: present in predictions but absent from truth
    self.assertEqual(result.num_fp[VARIANT_TYPE.SNP], 2)
    # call placed on a location known to NOT have a SNP
    self.assertEqual(result.calls_at_known_fp[VARIANT_TYPE.SNP], 1)
def testRectify2(self):
    # Hand-built ChromVariantStats (chr2: predicted deletion + SNP versus
    # one complex truth indel) is rectified, then every counter for
    # INDEL_OTH, SNP and INDEL_DEL is checked.
    test_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr2 3 . GC G 20 PASS . GT 1/1\n
chr2 6 . G A 20 PASS . GT 1/1\n
"""
    gold_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr2 3 . GCCG GCA 20 PASS . GT 1/1\n
"""
    gold = vcf_to_ChromVariants(gold_vcf, 'chr2')
    test = vcf_to_ChromVariants(test_vcf, 'chr2')
    gt_table = _genotype_concordance_dict()
    result = ChromVariantStats(gold, test, [], [3, 6], [3], gt_table)
    result.rectify(get_reference(), 100)

    # checks are listed in the order they are asserted
    checks = [
        ('num_pred', VARIANT_TYPE.INDEL_OTH, 1),
        ('num_tp', VARIANT_TYPE.INDEL_OTH, 1),
        ('num_fn', VARIANT_TYPE.INDEL_OTH, 0),
        ('num_fp', VARIANT_TYPE.INDEL_OTH, 0),
        ('num_pred', VARIANT_TYPE.SNP, 0),
        ('num_fp', VARIANT_TYPE.SNP, 0),
        ('num_fn', VARIANT_TYPE.SNP, 0),
        ('num_tp', VARIANT_TYPE.SNP, 0),
        ('num_pred', VARIANT_TYPE.INDEL_DEL, 0),
        ('num_fp', VARIANT_TYPE.INDEL_DEL, 0),
        ('num_fn', VARIANT_TYPE.INDEL_DEL, 0),
        ('num_tp', VARIANT_TYPE.INDEL_DEL, 0),
    ]
    for attr_name, kind, wanted in checks:
        self.assertEqual(getattr(result, attr_name)[kind], wanted)
def testTruePosRectify(self):
    # Hand-built ChromVariantStats on chr4 with one true positive, two
    # false positives and two false negatives; counters are checked before
    # and after rectify().
    truth_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr4 3 . TC T 20 PASS . GT 1/1\n
chr4 5 . TC T 20 PASS . GT 1/1\n
chr4 8 . C T 20 PASS . GT 1/1\n
"""
    called_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr4 4 . C T 20 PASS . GT 1/1\n
chr4 5 . TC T 20 PASS . GT 1/1\n
chr4 7 . TC T 20 PASS . GT 1/1\n
"""
    truth = vcf_to_ChromVariants(truth_vcf, 'chr4')
    called = vcf_to_ChromVariants(called_vcf, 'chr4')
    concordance = _genotype_concordance_dict()
    stats = ChromVariantStats(truth, called, [5], [4, 7], [3, 8],
                              concordance)

    # counters as constructed
    before = (
        ('num_tp', VARIANT_TYPE.SNP, 0),
        ('num_fn', VARIANT_TYPE.SNP, 1),
        ('num_fp', VARIANT_TYPE.SNP, 1),
        ('num_fn', VARIANT_TYPE.INDEL_DEL, 1),
        ('num_fp', VARIANT_TYPE.INDEL_DEL, 1),
        ('num_tp', VARIANT_TYPE.INDEL_DEL, 1),
    )
    for counter, var_type, count in before:
        self.assertEqual(getattr(stats, counter)[var_type], count)

    stats.rectify(get_reference(), 100)

    # counters once rectify() has run
    after = (
        ('num_fn', VARIANT_TYPE.SNP, 0),
        ('num_fp', VARIANT_TYPE.SNP, 0),
        ('num_tp', VARIANT_TYPE.SNP, 1),
        ('num_fn', VARIANT_TYPE.INDEL_DEL, 0),
        ('num_tp', VARIANT_TYPE.INDEL_DEL, 2),
        ('num_fp', VARIANT_TYPE.INDEL_DEL, 0),
    )
    for counter, var_type, count in after:
        self.assertEqual(getattr(stats, counter)[var_type], count)
def testTruePosRectify(self):
    # chr4 stats are built by hand (true positive at 5, false positives at
    # 4 and 7, false negatives at 3 and 8); SNP and INDEL_DEL counters are
    # verified before and after rectify().
    gold_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr4 3 . TC T 20 PASS . GT 1/1\n
chr4 5 . TC T 20 PASS . GT 1/1\n
chr4 8 . C T 20 PASS . GT 1/1\n
"""
    test_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr4 4 . C T 20 PASS . GT 1/1\n
chr4 5 . TC T 20 PASS . GT 1/1\n
chr4 7 . TC T 20 PASS . GT 1/1\n
"""
    gold = vcf_to_ChromVariants(gold_vcf, 'chr4')
    test = vcf_to_ChromVariants(test_vcf, 'chr4')
    gt_table = _genotype_concordance_dict()
    result = ChromVariantStats(gold, test, [5], [4, 7], [3, 8], gt_table)

    snp = VARIANT_TYPE.SNP
    deletion = VARIANT_TYPE.INDEL_DEL

    # state straight after construction
    self.assertEqual(result.num_tp[snp], 0)
    self.assertEqual(result.num_fn[snp], 1)
    self.assertEqual(result.num_fp[snp], 1)
    self.assertEqual(result.num_fn[deletion], 1)
    self.assertEqual(result.num_fp[deletion], 1)
    self.assertEqual(result.num_tp[deletion], 1)

    result.rectify(get_reference(), 100)

    # state after rectify()
    self.assertEqual(result.num_fn[snp], 0)
    self.assertEqual(result.num_fp[snp], 0)
    self.assertEqual(result.num_tp[snp], 1)
    self.assertEqual(result.num_fn[deletion], 0)
    self.assertEqual(result.num_tp[deletion], 2)
    self.assertEqual(result.num_fp[deletion], 0)
def testRescueChromEvalVariants(self):
    # chrom_evaluate_variants is run on chr2 with a predicted deletion +
    # SNP against one complex truth indel; every counter for INDEL_OTH,
    # SNP and INDEL_DEL is then checked.
    test_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr2 3 . GC G 20 PASS . GT 1/1\n
chr2 6 . G A 20 PASS . GT 1/1\n
"""
    gold_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr2 3 . GCCG GCA 20 PASS . GT 1/1\n
"""
    gold = vcf_to_ChromVariants(gold_vcf, 'chr2')
    test = vcf_to_ChromVariants(test_vcf, 'chr2')
    result = chrom_evaluate_variants(gold, test, 100, 100, get_reference(),
                                     50)

    # checks are listed in the order they are asserted
    checks = [
        ('num_pred', VARIANT_TYPE.INDEL_OTH, 1),
        ('num_tp', VARIANT_TYPE.INDEL_OTH, 1),
        ('num_fn', VARIANT_TYPE.INDEL_OTH, 0),
        ('num_fp', VARIANT_TYPE.INDEL_OTH, 0),
        ('num_pred', VARIANT_TYPE.SNP, 0),
        ('num_fp', VARIANT_TYPE.SNP, 0),
        ('num_fn', VARIANT_TYPE.SNP, 0),
        ('num_tp', VARIANT_TYPE.SNP, 0),
        ('num_pred', VARIANT_TYPE.INDEL_DEL, 0),
        ('num_fp', VARIANT_TYPE.INDEL_DEL, 0),
        ('num_fn', VARIANT_TYPE.INDEL_DEL, 0),
        ('num_tp', VARIANT_TYPE.INDEL_DEL, 0),
    ]
    for attr_name, kind, wanted in checks:
        self.assertEqual(getattr(result, attr_name)[kind], wanted)
def testRectify(self):
    # A truth indel on chr2 that two predicted SNPs can account for:
    # counters are checked before rectify() and again afterwards.
    gold_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr2 2 . TGC TAT 20 PASS . GT 1/1\n
"""
    test_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr2 3 . G A 20 PASS . GT 1/1\n
chr2 4 . C T 20 PASS . GT 1/1\n
"""
    gold = vcf_to_ChromVariants(gold_vcf, 'chr2')
    test = vcf_to_ChromVariants(test_vcf, 'chr2')
    # left empty -- genotype concordance isn't being tested here yet
    gt_table = _genotype_concordance_dict()
    result = ChromVariantStats(gold, test, [], [3, 4], [2], gt_table)

    # prior to rectify there are no true positives ...
    self.assertTrue(all(count == 0
                        for count in result.num_tp.itervalues()))
    # ... a single false-negative indel ...
    self.assertEqual(result.num_fn[VARIANT_TYPE.INDEL_OTH], 1)
    # ... and two false-positive SNPs
    self.assertEqual(result.num_fp[VARIANT_TYPE.SNP], 2)

    result.rectify(get_reference(), 100)

    # rectify turns them into one true-positive indel ...
    self.assertEqual(result.num_tp[VARIANT_TYPE.INDEL_OTH], 1)
    # ... leaving no false positives or false negatives
    self.assertTrue(all(count == 0
                        for count in result.num_fp.itervalues()))
    self.assertTrue(all(count == 0
                        for count in result.num_fn.itervalues()))
def testChromEvaluateVariantsSV(self):
    # A single true structural insertion on chr1 is evaluated against four
    # predictions in turn, checking the SV_INS true-positive count for each.
    # NB: SVs aren't rescued, only checked against the breakpoint tolerance.
    gold_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 6 . C CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA 20 PASS . GT 0/1
"""
    # exact position, exact allele
    test_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 6 . C CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA 20 PASS . GT 0/1
"""
    gold = vcf_to_ChromVariants(gold_vcf, 'chr1')
    test = vcf_to_ChromVariants(test_vcf, 'chr1')
    result = chrom_evaluate_variants(gold, test, 100, 100, get_reference(),
                                     50)
    self.assertEqual(result.num_tp[VARIANT_TYPE.SV_INS], 1)

    # exact position, different allele
    test_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 6 . C CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAATGC 20 PASS . GT 0/1
"""
    test = vcf_to_ChromVariants(test_vcf, 'chr1')
    result = chrom_evaluate_variants(gold, test, 100, 100, get_reference(),
                                     50)
    self.assertEqual(result.num_tp[VARIANT_TYPE.SV_INS], 1)

    # position within tolerance, exact allele
    test_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 4 . C CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA 20 PASS . GT 0/1
"""
    test = vcf_to_ChromVariants(test_vcf, 'chr1')
    result = chrom_evaluate_variants(gold, test, 100, 100, get_reference(),
                                     50)
    self.assertEqual(result.num_tp[VARIANT_TYPE.SV_INS], 1)

    # position outside of tolerance
    test_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 110 . C CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA 20 PASS . GT 0/1
"""
    test = vcf_to_ChromVariants(test_vcf, 'chr1')
    result = chrom_evaluate_variants(gold, test, 100, 100, get_reference(),
                                     50)
    self.assertEqual(result.num_tp[VARIANT_TYPE.SV_INS], 0)
def normalizeStringToWriter(self, vcf_str):
    # Parse vcf_str as VCF, run the records through normalize() against the
    # test reference, write each normalized record through a VCFWriter into
    # an in-memory buffer, and return a vcf.Reader over the written text.
    vcf_io = StringIO.StringIO(vcf_str)
    test_vcf = vcf.Reader(vcf_io)
    output_io = StringIO.StringIO()
    output_writer = VCFWriter('ref.fasta', 'name', output_io)
    # Explicit loop instead of map(): map() used only for its side effects
    # builds a throwaway list on Python 2 and, being lazy on Python 3, would
    # never call write() at all there.
    for record in normalize(get_reference(), test_vcf):
        write(record, output_writer)
    outputStr = output_io.getvalue()
    # Every newline in the written output is doubled before it is re-parsed.
    outputStr = outputStr.replace('\n', '\n\n')
    return vcf.Reader(StringIO.StringIO(outputStr))
def testNBaseNormalization(self):
    # Normalize a VCF holding one ordinary insertion and one record whose
    # alleles consist of N bases; exactly one record is expected back.
    raw_vcf = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr4 2 . A ATCTT 20 PASS . GT 0/1\n
chr4 3 . NN N 20 PASS . GT 0/1\n
"""
    normalized = normalize(get_reference(), self.getVcf(raw_vcf))
    self.assertEqual(self.countRecords(normalized), 1)
def testNBaseNormalization(self):
    # Two chr4 records go into normalize(): an insertion and an N-base
    # record. The number of records coming out must be one.
    source_vcf = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr4 2 . A ATCTT 20 PASS . GT 0/1\n
chr4 3 . NN N 20 PASS . GT 0/1\n
"""
    reader = self.getVcf(source_vcf)
    record_count = self.countRecords(normalize(get_reference(), reader))
    self.assertEqual(record_count, 1)
def testGetSeq(self):
    # _get_seq is called on the chr3 window (1, 13) with a deletion and a
    # heterozygous insertion, with the final flag False and then True; both
    # elements of the returned pair are compared each time.
    test_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr3 2 . TCGA T 20 PASS . GT 1/1\n
chr3 9 . A AAAA 20 PASS . GT 0/1\n
"""
    parsed = vcf_to_ChromVariants(test_vcf, 'chr3')
    span = (1, 13, 'chr3')

    pair = _get_seq(span, parsed.getAllVariants(), get_reference(), False)
    self.assertEqual(pair[0], 'ATTCGAAAATCG')
    self.assertEqual(pair[1], '')

    pair = _get_seq(span, parsed.getAllVariants(), get_reference(), True)
    self.assertEqual(pair[0], 'ATTCGATCG')
    self.assertEqual(pair[1], 'ATCGATCGAAAATCG')
def testFullRescue(self):
    # Three SequenceRescuer scenarios, each expected to report a rescue:
    # two on chr2 with an empty true-positive set, one on chr4 with an
    # existing true positive.

    # scenario 1: truth indel versus two predicted SNPs
    missed_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr2 2 . TGC TAT 20 PASS . GT 1/1\n
"""
    spurious_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr2 3 . G A 20 PASS . GT 1/1\n
chr2 4 . C T 20 PASS . GT 1/1\n
"""
    missed = vcf_to_ChromVariants(missed_vcf, 'chr2')
    spurious = vcf_to_ChromVariants(spurious_vcf, 'chr2')
    rescuer = SequenceRescuer('chr2', 2, missed, spurious,
                              get_empty_ChromVariants('chr2'),
                              get_reference(), 50)
    self.assertTrue(rescuer.rescued)
    self.assertEqual(rescuer.windowsRescued, (0, 0))

    # scenario 2: truth complex indel versus predicted deletion + SNP
    spurious_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr2 3 . GC G 20 PASS . GT 1/1\n
chr2 6 . G A 20 PASS . GT 1/1\n
"""
    missed_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr2 3 . GCCG GCA 20 PASS . GT 1/1\n
"""
    spurious = vcf_to_ChromVariants(spurious_vcf, 'chr2')
    missed = vcf_to_ChromVariants(missed_vcf, 'chr2')
    rescuer = SequenceRescuer('chr2', 3, missed, spurious,
                              get_empty_ChromVariants('chr2'),
                              get_reference(), 50)
    self.assertTrue(rescuer.rescued)

    # scenario 3: chr4 with a true positive supplied alongside
    missed_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr4 3 . TC T 20 PASS . GT 1/1\n
chr4 8 . C T 20 PASS . GT 1/1\n
"""
    spurious_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr4 4 . C T 20 PASS . GT 1/1\n
chr4 7 . TC T 20 PASS . GT 1/1\n
"""
    matched_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr4 5 . TC T 20 PASS . GT 1/1\n
"""
    missed = vcf_to_ChromVariants(missed_vcf, 'chr4')
    spurious = vcf_to_ChromVariants(spurious_vcf, 'chr4')
    matched = vcf_to_ChromVariants(matched_vcf, 'chr4')
    rescuer = SequenceRescuer('chr4', 3, missed, spurious, matched,
                              get_reference(), 50)
    self.assertTrue(rescuer.rescued)
def testAggregate(self):
    # Build one ChromVariantStats for chr2 and one for chr3 with
    # chrom_evaluate_variants, aggregate them with _aggregate, and check
    # that the aggregated SNP totals equal the per-chromosome sums.
    true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr2 3 . G A 20 PASS . GT 1/1\n
chr2 5 . C T 20 PASS . GT 1/1\n
"""
    pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr2 3 . G A 20 PASS . GT 1/1\n
chr2 7 . G C 20 PASS . GT 1/1\n
"""
    true_vars = vcf_to_ChromVariants(true_str, 'chr2')
    pred_vars = vcf_to_ChromVariants(pred_str, 'chr2')
    # (The unused local genotype-concordance dict and a commented-out
    # ChromVariantStats construction were removed from this test.)
    cvs2 = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                   get_reference(), 50)

    true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr3 3 . G A 20 PASS . GT 1/1\n
chr3 5 . C T 20 PASS . GT 1/1\n
"""
    pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr3 3 . G A 20 PASS . GT 1/1\n
chr3 4 . T A 20 PASS . GT 1/1\n
chr3 7 . G C 20 PASS . GT 1/1\n
"""
    true_vars = vcf_to_ChromVariants(true_str, 'chr3')
    pred_vars = vcf_to_ChromVariants(pred_str, 'chr3')
    cvs3 = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                   get_reference(), 50)

    # _aggregate returns a pair; only the aggregator is inspected here
    aggregator, errors = _aggregate([cvs2, cvs3])

    # test some sums
    self.assertEqual(cvs2.num_true[VARIANT_TYPE.SNP], 2)
    self.assertEqual(cvs3.num_true[VARIANT_TYPE.SNP], 2)
    self.assertEqual(aggregator(VARIANT_TYPE.SNP)['num_true'], 4)
    self.assertEqual(cvs2.num_tp[VARIANT_TYPE.SNP], 1)
    self.assertEqual(cvs3.num_tp[VARIANT_TYPE.SNP], 1)
    self.assertEqual(aggregator(VARIANT_TYPE.SNP)['good_predictions'], 2)
def testCollidingVariants(self):
    # Two records at the same chr1 position are fed to normalize(); only
    # one record is expected to come out.
    raw_vcf = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 5 . A TGC 20 PASS . GT 1/1\n
chr1 5 . A GGG 20 PASS . GT 1/1\n
"""
    normalized = normalize(get_reference(), self.getVcf(raw_vcf))
    surviving = self.countRecords(normalized)
    self.assertEqual(surviving, 1)
def testCollidingVariants(self):
    # normalize() receives two chr1 records sharing position 5; the count
    # of records it yields must be one.
    source_vcf = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr1 5 . A TGC 20 PASS . GT 1/1\n
chr1 5 . A GGG 20 PASS . GT 1/1\n
"""
    reader = self.getVcf(source_vcf)
    record_count = self.countRecords(normalize(get_reference(), reader))
    self.assertEqual(record_count, 1)
def testNormalizedToCollision(self):
    # Normalize records that could collide with a neighbour once shifted,
    # and check the POS / REF / ALT of each record that comes out: first
    # for insertions next to other variants, then for two deletions.
    raw_vcf = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr2 4 . C T 20 PASS . GT 0/1\n
chr2 5 . C CGC 20 PASS . GT 0/1\n
chr4 2 . A AGG 20 PASS . GT 0/1\n
chr4 6 . C CTC 20 PASS . GT 0/1\n
"""
    normalized = normalize(get_reference(), self.getVcf(raw_vcf))
    # pull all four records before asserting anything
    snp, chr2_ins, chr4_ins_a, chr4_ins_b = [
        next(normalized) for _ in range(4)]

    # the chr2 SNP is unchanged
    self.assertEqual(snp.POS, 4)
    # the chr2 insertion gets normed forward 1 base and slid back to its
    # original position
    self.assertEqual(chr2_ins.POS, 5)
    self.assertEqual(chr2_ins.REF, "C")
    self.assertEqual(chr2_ins.ALT, ["CGC"])
    self.assertEqual(chr4_ins_a.POS, 2)
    self.assertEqual(chr4_ins_a.REF, "A")
    self.assertEqual(chr4_ins_a.ALT, ["AGG"])
    self.assertEqual(chr4_ins_b.POS, 3)
    self.assertEqual(chr4_ins_b.REF, "T")
    self.assertEqual(chr4_ins_b.ALT, ["TCT"])

    raw_vcf = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n
chr4 2 . ATC A 20 PASS . GT 0/1\n
chr4 6 . CTC C 20 PASS . GT 0/1\n
"""
    normalized = normalize(get_reference(), self.getVcf(raw_vcf))
    first_del, second_del = [next(normalized) for _ in range(2)]
    self.assertEqual(first_del.POS, 2)
    self.assertEqual(first_del.REF, "ATC")
    self.assertEqual(first_del.ALT, ["A"])
    self.assertEqual(second_del.POS, 5)
    self.assertEqual(second_del.REF, "TCT")
    self.assertEqual(second_del.ALT, ["T"])
def testGenotypes(self):
    # keep genotype info for a compound heterozygous call
    compound_het_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . A C,T 20 PASS . GT 1/2\n """
    reader = self.getVcf(compound_het_vcf)
    first_record = next(normalize(get_reference(), reader))
    first_sample = first_record.samples[0]
    self.assertEqual(first_sample.gt_nums, "1/2")
def testGenotypes(self):
    # keep genotype info for a compound heterozygous call
    vcf_text = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . A C,T 20 PASS . GT 1/2\n """
    parsed = self.getVcf(vcf_text)
    normalized = normalize(get_reference(), parsed)
    # Only the first normalized record is inspected.
    head = next(normalized)
    self.assertEqual(head.samples[0].gt_nums, "1/2")
def testChromEvaluateGenotypeConcordance(self):
    # Unphased genotypes: truth and prediction share sites chr1:2 and chr1:9.
    unphased_true = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . A T 20 PASS . GT 0/1\n chr1 5 . C T 20 PASS . GT 0/1\n chr1 9 . A G 20 PASS . GT 1/1\n """
    unphased_pred = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . A T 20 PASS . GT 1/1\n chr1 6 . C G 20 PASS . GT 0/1\n chr1 9 . A G 20 PASS . GT 1/1\n """
    truth = vcf_to_ChromVariants(unphased_true, 'chr1')
    predicted = vcf_to_ChromVariants(unphased_pred, 'chr1')
    cvs = chrom_evaluate_variants(truth, predicted, 100, 100, get_reference(), 50)

    het_called_hom_var = cvs.genotype_concordance[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HET][GENOTYPE_TYPE.HOM_VAR]
    self.assertEqual(het_called_hom_var, 1)
    hom_var_called_hom_var = cvs.genotype_concordance[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HOM_VAR][GENOTYPE_TYPE.HOM_VAR]
    self.assertEqual(hom_var_called_hom_var, 1)
    # anything other than TP don't get counted in genotype concordance
    self.assertEqual(cvs._nrd_counts(VARIANT_TYPE.SNP), (1, 2))

    # phased variants should be counted correctly
    phased_true = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . A T 20 PASS . GT 0|1\n chr1 9 . A G 20 PASS . GT 1|1\n """
    phased_pred = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . A T 20 PASS . GT 1|0\n chr1 9 . A G 20 PASS . 
GT 1|1\n """
    truth = vcf_to_ChromVariants(phased_true, 'chr1')
    predicted = vcf_to_ChromVariants(phased_pred, 'chr1')
    cvs = chrom_evaluate_variants(truth, predicted, 100, 100, get_reference(), 50)

    het_called_het = cvs.genotype_concordance[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HET][GENOTYPE_TYPE.HET]
    self.assertEqual(het_called_het, 1)
    hom_var_called_hom_var = cvs.genotype_concordance[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HOM_VAR][GENOTYPE_TYPE.HOM_VAR]
    self.assertEqual(hom_var_called_hom_var, 1)
    self.assertEqual(cvs._nrd_counts(VARIANT_TYPE.SNP), (0, 2))
def testCleanOnly(self):
    # normalize() is called with two extra positional arguments (50, True);
    # the lower-case record is expected back upper-cased at its original position.
    lowercase_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 6 . g cg 20 PASS . GT 0/1\n """
    cleaned = normalize(get_reference(), self.getVcf(lowercase_vcf), 50, True)
    first = next(cleaned)
    self.assertEqual(first.POS, 6)
    self.assertEqual(first.REF, 'G')
    self.assertEqual(first.ALT, ['CG'])
def testCleanOnly(self):
    # With the extra (50, True) arguments the record keeps POS 6 and only
    # its allele case changes.
    vcf_text = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 6 . g cg 20 PASS . GT 0/1\n """
    record_stream = normalize(
        get_reference(),
        self.getVcf(vcf_text),
        50,
        True,
    )
    cleaned_record = next(record_stream)
    self.assertEqual(cleaned_record.POS, 6)
    self.assertEqual(cleaned_record.REF, 'G')
    self.assertEqual(cleaned_record.ALT, ['CG'])
def testNormalizedToCollision(self):
    # First input: a SNP plus three insertions on chr2/chr4.
    ins_vcf = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 4 . C T 20 PASS . GT 0/1\n chr2 5 . C CGC 20 PASS . GT 0/1\n chr4 2 . A AGG 20 PASS . GT 0/1\n chr4 6 . C CTC 20 PASS . GT 0/1\n """
    stream = normalize(get_reference(), self.getVcf(ins_vcf))
    rec_a, rec_b, rec_c, rec_d = [next(stream) for _ in range(4)]

    # chr2 SNP doesn't change
    self.assertEqual(rec_a.POS, 4)

    # chr2 insertion gets normed forward 1 base and slid back to original pos
    self.assertEqual(rec_b.POS, 5)
    self.assertEqual(rec_b.REF, "C")
    self.assertEqual(rec_b.ALT, ["CGC"])

    self.assertEqual(rec_c.POS, 2)
    self.assertEqual(rec_c.REF, "A")
    self.assertEqual(rec_c.ALT, ["AGG"])

    self.assertEqual(rec_d.POS, 3)
    self.assertEqual(rec_d.REF, "T")
    self.assertEqual(rec_d.ALT, ["TCT"])

    # Second input: two deletions on chr4.
    del_vcf = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 2 . ATC A 20 PASS . GT 0/1\n chr4 6 . CTC C 20 PASS . GT 0/1\n """
    stream = normalize(get_reference(), self.getVcf(del_vcf))
    rec_a, rec_b = [next(stream) for _ in range(2)]

    self.assertEqual(rec_a.POS, 2)
    self.assertEqual(rec_a.REF, "ATC")
    self.assertEqual(rec_a.ALT, ["A"])

    self.assertEqual(rec_b.POS, 5)
    self.assertEqual(rec_b.REF, "TCT")
    self.assertEqual(rec_b.ALT, ["T"])
def testMultipleAltAlleles(self):
    # multiple alleles aren't normalized if the two alt alleles would be normalized differently
    # Single alt allele: the record is expected to move to POS 3.
    single_alt_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 6 . G CG 20 PASS . GT 0/1\n """
    single = next(normalize(get_reference(), self.getVcf(single_alt_vcf)))
    self.assertEqual(single.POS, 3)
    self.assertEqual(single.REF, 'G')
    self.assertEqual(single.ALT[0], 'GC')

    # Two alt alleles: the record is expected to stay at POS 6.
    two_alt_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 6 . G CG,C 20 PASS . GT 0/1\n """
    multi = next(normalize(get_reference(), self.getVcf(two_alt_vcf)))
    self.assertEqual(multi.POS, 6)
    self.assertEqual(multi.REF, 'G')
    self.assertEqual(multi.ALT[0], 'CG')
def testMultipleAltAlleles(self):
    # multiple alleles aren't normalized if the two alt alleles would be normalized differently
    one_alt = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 6 . G CG 20 PASS . GT 0/1\n """
    stream = normalize(get_reference(), self.getVcf(one_alt))
    normalized_record = next(stream)
    # With one alt allele the record is shifted.
    self.assertEqual(normalized_record.POS, 3)
    self.assertEqual(normalized_record.REF, 'G')
    self.assertEqual(normalized_record.ALT[0], 'GC')

    two_alts = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 6 . G CG,C 20 PASS . GT 0/1\n """
    stream = normalize(get_reference(), self.getVcf(two_alts))
    untouched_record = next(stream)
    # With two alt alleles the record comes back as given.
    self.assertEqual(untouched_record.POS, 6)
    self.assertEqual(untouched_record.REF, 'G')
    self.assertEqual(untouched_record.ALT[0], 'CG')
def testNormalizeTwoToCollision(self):
    # Two identical insertions (C -> CTC) at chr4:4 and chr4:6; they are
    # expected at POS 2 and POS 3 after normalization.
    twin_insertions = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 4 . C CTC 20 PASS . GT 0/1\n chr4 6 . C CTC 20 PASS . GT 0/1\n """
    normalized = normalize(get_reference(), self.getVcf(twin_insertions))
    first = next(normalized)
    second = next(normalized)
    expectations = (
        (first, 2, "A", ["ATC"]),
        (second, 3, "T", ["TCT"]),
    )
    for record, pos, ref, alt in expectations:
        self.assertEqual(record.POS, pos)
        self.assertEqual(record.REF, ref)
        self.assertEqual(record.ALT, alt)
def testNormalizeTwoToCollision(self):
    # The same insertion is reported at chr4:4 and chr4:6.
    vcf_text = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 4 . C CTC 20 PASS . GT 0/1\n chr4 6 . C CTC 20 PASS . GT 0/1\n """
    stream = normalize(get_reference(), self.getVcf(vcf_text))
    rec_a, rec_b = [next(stream) for _ in range(2)]

    # First record lands on POS 2.
    self.assertEqual(rec_a.POS, 2)
    self.assertEqual(rec_a.REF, "A")
    self.assertEqual(rec_a.ALT, ["ATC"])

    # Second record lands one base later.
    self.assertEqual(rec_b.POS, 3)
    self.assertEqual(rec_b.REF, "T")
    self.assertEqual(rec_b.ALT, ["TCT"])
def testIndelDeletionMismatchedAllele(self):
    # Truth and prediction both delete at chr3:5 but with different REF
    # alleles (ATC vs ATCG); expected: no TP, one FN, one FP.
    truth_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr3 5 . ATC A 20 PASS . GT 0/1\n """
    predicted_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr3 5 . ATCG A 20 PASS . GT 0/1 """
    truth = vcf_to_ChromVariants(truth_vcf, 'chr3')
    predicted = vcf_to_ChromVariants(predicted_vcf, 'chr3')
    cvs = chrom_evaluate_variants(truth, predicted, 100, 100, get_reference(), 50)

    true_positives = cvs.num_tp[VARIANT_TYPE.INDEL_DEL]
    self.assertEqual(true_positives, 0)
    false_negatives = cvs.num_fn[VARIANT_TYPE.INDEL_DEL]
    self.assertEqual(false_negatives, 1)
    false_positives = cvs.num_fp[VARIANT_TYPE.INDEL_DEL]
    self.assertEqual(false_positives, 1)
def testVariantWithMismatchedRef(self):
    # One false-negative record at chr2:2 against two false-positive SNPs
    # at chr2:3 and chr2:4; the rescuer is expected not to rescue.
    false_neg_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 2 . TGC TAT 20 PASS . GT 1/1\n """
    false_pos_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . G C 20 PASS . GT 1/1\n chr2 4 . C T 20 PASS . GT 1/1\n """
    false_negs = vcf_to_ChromVariants(false_neg_vcf, 'chr2')
    false_poss = vcf_to_ChromVariants(false_pos_vcf, 'chr2')
    outcome = SequenceRescuer(
        'chr2',
        2,
        false_negs,
        false_poss,
        get_empty_ChromVariants('chr2'),
        get_reference(),
        50,
    )
    self.assertFalse(outcome.rescued)
def testNormalizedVariants(self):
    # The false positives go through normalize_vcf_to_ChromVariants, the
    # false negative through vcf_to_ChromVariants; a rescue is expected.
    false_pos_vcf = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 4 . C CTC 20 PASS . GT 0/1\n chr4 6 . C CTC 20 PASS . GT 0/1\n """
    false_neg_vcf = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 2 . A ATCTC 20 PASS . GT 0/1\n """
    false_poss = normalize_vcf_to_ChromVariants(false_pos_vcf, 'chr4')
    false_negs = vcf_to_ChromVariants(false_neg_vcf, 'chr4')
    outcome = SequenceRescuer(
        'chr4',
        2,
        false_negs,
        false_poss,
        get_empty_ChromVariants('chr4'),
        get_reference(),
        50,
    )
    self.assertTrue(outcome.rescued)
def testIndelDeletionMismatchedAllele(self):
    # Deletions at the same site with different REF alleles must not match.
    expected_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr3 5 . ATC A 20 PASS . GT 0/1\n """
    called_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr3 5 . ATCG A 20 PASS . GT 0/1 """
    expected_vars = vcf_to_ChromVariants(expected_vcf, 'chr3')
    called_vars = vcf_to_ChromVariants(called_vcf, 'chr3')
    cvs = chrom_evaluate_variants(
        expected_vars,
        called_vars,
        100,
        100,
        get_reference(),
        50,
    )
    # No true positive; the truth call is a FN and the prediction a FP.
    self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL], 0)
    self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL], 1)
    self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL], 1)
def testEmptyWindow(self):
    # The false negative (chr1:8000) is far from the rescue location
    # (chr1:10049); no rescue is expected.
    false_neg_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 8000 . G C 20 PASS . GT 1/1\n """
    false_pos_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 10049 . CTTAAGCT C 20 PASS . GT 1/1\n """
    false_negs = vcf_to_ChromVariants(false_neg_vcf, 'chr1')
    false_poss = vcf_to_ChromVariants(false_pos_vcf, 'chr1')
    outcome = SequenceRescuer(
        'chr1',
        10049,
        false_negs,
        false_poss,
        get_empty_ChromVariants('chr2'),
        get_reference(),
        50,
    )
    self.assertFalse(outcome.rescued)
def test_approx_sv(self):
    # Predictions on chr19 (a deletion, a SNP and a long insertion) are
    # scored against self.true_vars via evaluate_low_memory.
    predicted_text = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr19 88013 . CTT C 20 PASS . GT 0/1\n chr19 89272 . C T 20 PASS . GT 0/1\n chr19 269771 . A AAAAGAAAGGCATGACCTATCCACCCATGCCACCTGGATGGACCTCACAGGCACACTGCTTCATGAGAGAG 20 PASS . GT 1/1 """
    predicted_reader = vcf.Reader(StringIO.StringIO(predicted_text))
    stat_reporter = evaluate_low_memory(
        self.true_vars,
        predicted_reader,
        sv_eps,
        sv_eps,
        get_reference(),
        50,
        50,
        {'chr19': 0, None: 100},
    )
    # Per-variant-type checks, applied in the listed order.
    checks = (
        (self.truePositive, VARIANT_TYPE.SNP),
        (self.trueNegative, VARIANT_TYPE.INDEL_INS),
        (self.truePositive, VARIANT_TYPE.INDEL_DEL),
        (self.truePositive, VARIANT_TYPE.SV_INS),
        (self.trueNegative, VARIANT_TYPE.SV_DEL),
    )
    for check, variant_type in checks:
        check(stat_reporter, variant_type)
def testNormalizeThreeCollision(self):
    # the OP info flag is fake to force vars to right-slide
    triple_vcf = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 2 . A ATCTT 20 PASS OP=1 GT 0/1\n chr4 2 . A T 20 PASS . GT 0/1\n chr4 2 . ATCTC T 20 PASS OP=2 GT 0/1\n """
    normalized = normalize(get_reference(), self.getVcf(triple_vcf))
    pulled = [next(normalized), next(normalized), next(normalized)]
    # order of vars from same pos not guaranteed
    lowest, middle, highest = sorted(pulled, key=lambda rec: rec.POS)

    self.assertEqual(lowest.POS, 2)

    self.assertEqual(middle.POS, 3)
    self.assertEqual(middle.REF, "T")
    self.assertEqual(middle.ALT, ["TCTTT"])

    self.assertEqual(highest.POS, 4)
    self.assertEqual(highest.REF, "CTCTC")
    self.assertEqual(highest.ALT, ["C"])
def test_known_false_positives(self):
    # Evaluates predictions against truth while supplying a third VCF of
    # known false positives, then checks the SNP statistics.
    true_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 1 . T A 20 PASS . GT 0/1\n chr1 8 . A C 20 PASS . GT 1/1\n """
    pred_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 3 . G C 20 PASS . GT 1/1\n chr1 5 . C G 20 PASS . GT 0/1\n chr1 8 . A C 20 PASS . GT 1/1\n """
    known_fp_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 3 . G . 20 PASS . GT 0/0\n chr1 5 . C G 20 PASS . GT 0/0\n chr1 9 . T . 20 PASS . GT 0/0\n """
    known_fp_reader = vcf.Reader(StringIO.StringIO(known_fp_vcf))
    known_fp_vars = Variants(known_fp_reader, MAX_INDEL_LEN, knownFP=True)
    # The second returned value is not used by this test.
    stat_reporter, _unused_output = evaluate_variants(
        vcf_to_Variants(true_vcf),
        vcf_to_Variants(pred_vcf),
        sv_eps,
        sv_eps,
        get_reference(),
        50,
        known_fp_vars,
    )
    snp_stats = stat_reporter(VARIANT_TYPE.SNP)
    expected_stats = (
        ('num_true', 2),
        ('num_pred', 3),
        ('good_predictions', 1),
        ('false_positives', 2),  # predicted vars not in ground truth
        ('false_negatives', 1),
        ('known_fp_calls', 2),
        ('known_fp', 2),
    )
    for stat_name, expected_value in expected_stats:
        self.assertEqual(snp_stats[stat_name], expected_value)
def testWindowTooBig(self):
    # Two 3300-base REF alleles (11-mer repeated 300 times) flank the
    # false positive at chr1:10049; no rescue is expected.
    long_ref_a = 'ATTGTTCATGA' * 300
    long_ref_b = 'GCCTAGGGTCA' * 300
    false_neg_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 7001 . """ + long_ref_a + """ A 20 PASS . GT 1/1\n chr1 10100 . """ + long_ref_b + """ G 20 PASS . GT 0/1\n """
    false_pos_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 10049 . CTTAAGCT C 20 PASS . GT 1/1\n """
    false_negs = vcf_to_ChromVariants(false_neg_vcf, 'chr1')
    false_poss = vcf_to_ChromVariants(false_pos_vcf, 'chr1')
    outcome = SequenceRescuer(
        'chr1',
        10049,
        false_negs,
        false_poss,
        get_empty_ChromVariants('chr2'),
        get_reference(),
        50,
    )
    self.assertFalse(outcome.rescued)
def testOverlappingVariants(self):
    # if vcf contains overlapping variants, don't rescue that sequence
    false_neg_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 1 . T G 20 PASS . GT 1/1\n """
    true_pos_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . GCC G 20 PASS . GT 1/1\n chr2 4 . C G 20 PASS . GT 0/1\n """
    false_pos_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 7 . GA A 20 PASS . GT 1/1\n """
    # The true-positive set holds a deletion at chr2:3 and a SNP at chr2:4.
    true_poss = vcf_to_ChromVariants(true_pos_vcf, 'chr2')
    false_negs = vcf_to_ChromVariants(false_neg_vcf, 'chr2')
    false_poss = vcf_to_ChromVariants(false_pos_vcf, 'chr2')
    outcome = SequenceRescuer(
        'chr2',
        1,
        false_negs,
        false_poss,
        true_poss,
        get_reference(),
        50,
    )
    self.assertFalse(outcome.rescued)
def testOverlappingVariants(self):
    # if vcf contains overlapping variants, don't rescue that sequence
    missed_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 1 . T G 20 PASS . GT 1/1\n """
    matched_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . GCC G 20 PASS . GT 1/1\n chr2 4 . C G 20 PASS . GT 0/1\n """
    spurious_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 7 . GA A 20 PASS . GT 1/1\n """
    matched_vars = vcf_to_ChromVariants(matched_vcf, 'chr2')
    missed_vars = vcf_to_ChromVariants(missed_vcf, 'chr2')
    spurious_vars = vcf_to_ChromVariants(spurious_vcf, 'chr2')
    # Argument order: false negatives, false positives, then true positives.
    rescue_attempt = SequenceRescuer(
        'chr2', 1,
        missed_vars, spurious_vars, matched_vars,
        get_reference(), 50,
    )
    self.assertFalse(rescue_attempt.rescued)
def testNormalize(self):
    # Exercises normalize() on a series of small VCF inputs: record
    # filtering, upper-casing of alleles, and left-shifting of indels.

    # regular records are unchanged
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . C A 20 PASS . GT 0/1\n """
    norm_vcf = normalize(get_reference(), self.getVcf(vcf_str))
    self.assertEqual(self.countRecords(norm_vcf), 1)

    # test that hom ref records are removed
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . C C 20 PASS . GT 0/0\n chr1 3 . G A 20 PASS . GT 1/1\n """
    norm_vcf = normalize(get_reference(), self.getVcf(vcf_str))
    self.assertEqual(self.countRecords(norm_vcf), 1)

    # test that SNP/indels without genotyping are removed
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . C A 20 PASS . GT .\n chr1 3 . G C 20 PASS . GT 0/0\n chr1 4 . G T 20 PASS . GT 0|0\n chr1 5 . G A 20 PASS . GT 1/1\n """
    norm_vcf = normalize(get_reference(), self.getVcf(vcf_str))
    self.assertEqual(self.countRecords(norm_vcf), 1)

    # test that SV without genotyping is retained
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . C AAAAGAAAGGCATGACCTATCCACCCATGCCACCTGGATGGACCTCACAGGCACACTGCTTCATGAGAGAG 20 PASS . GT .\n """
    norm_vcf = normalize(get_reference(), self.getVcf(vcf_str))
    self.assertEqual(self.countRecords(norm_vcf), 1)

    # test that lower case ref/alt gets upper-cased
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . c a 20 PASS . 
GT 0/1\n """
    # Parse the same text twice: once untouched, once through normalize(),
    # so the raw and normalized records can be compared.
    lowercase_vcf = vcf.Reader(StringIO.StringIO(vcf_str))
    output_vcf = normalize(get_reference(), self.getVcf(vcf_str))
    original_r = next(lowercase_vcf)
    norm_r = next(output_vcf)
    self.assertEqual(original_r.REF, 'c')
    self.assertEqual(original_r.ALT[0], 'a')
    self.assertEqual(norm_r.REF, 'C')
    self.assertEqual(norm_r.ALT[0], 'A')

    # test normalizing an insertion
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 9 . a ga 20 PASS . GT 0/1\n """
    record = next(normalize(get_reference(), self.getVcf(vcf_str)))
    self.assertEqual(record.POS, 6)
    self.assertEqual(record.REF, 'C')
    self.assertEqual(record.ALT, ['CG'])

    # test normalizing a deletion
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 5 . cc c 20 PASS . GT 0/1\n """
    record = next(normalize(get_reference(), self.getVcf(vcf_str)))
    self.assertEqual(record.POS, 4)
    self.assertEqual(record.REF, 'GC')
    self.assertEqual(record.ALT, ['G'])
def testRescueMission(self):
    # false negative variant at location is SV; don't rescue
    sv_fn_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 8000 . G GATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCT 20 PASS . GT 1/1\n """
    sv_fp_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 8000 . G GC 20 PASS . GT 1/1\n """
    true_vars = vcf_to_ChromVariants(sv_fn_vcf, 'chr1')
    pred_vars = vcf_to_ChromVariants(sv_fp_vcf, 'chr1')
    new_tp, removed_fn = rescue_mission(
        true_vars,
        pred_vars,
        get_empty_ChromVariants('chr1'),
        8000,
        get_reference(),
        100,
    )
    self.assertFalse(any(count > 0 for count in new_tp.itervalues()))
    self.assertFalse(any(count > 0 for count in removed_fn.itervalues()))

    # variant couldn't be rescued; no change to counts or ChromVariants
    unrescued_fn_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . A C 20 PASS . GT 1/1\n chr1 7 . C T 20 PASS . GT 0/1\n """
    unrescued_fp_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 4 . A C 20 PASS . 
GT 1/1\n """
    fn_vars = vcf_to_ChromVariants(unrescued_fn_vcf, 'chr1')
    fp_vars = vcf_to_ChromVariants(unrescued_fp_vcf, 'chr1')
    new_tp, removed_fn = rescue_mission(
        fn_vars,
        fp_vars,
        get_empty_ChromVariants('chr1'),
        2,
        get_reference(),
        100,
    )
    self.assertFalse(any(count > 0 for count in new_tp.itervalues()))
    self.assertFalse(any(count > 0 for count in removed_fn.itervalues()))
    self.assertEqual(len(fn_vars.all_locations), 2)
    self.assertEqual(len(fp_vars.all_locations), 1)

    # variant is rescued; counts change; variants are removed from fn/fp
    rescued_fn_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 2 . TGC TAT 20 PASS . GT 1/1\n """
    rescued_fp_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . G A 20 PASS . GT 1/1\n chr2 4 . C T 20 PASS . GT 1/1\n """
    fn_vars = vcf_to_ChromVariants(rescued_fn_vcf, 'chr2')
    fp_vars = vcf_to_ChromVariants(rescued_fp_vcf, 'chr2')
    new_tp, removed_fn = rescue_mission(
        fn_vars,
        fp_vars,
        get_empty_ChromVariants('chr2'),
        2,
        get_reference(),
        100,
    )
    self.assertEqual(new_tp[VARIANT_TYPE.INDEL_OTH], 1)
    self.assertEqual(removed_fn[VARIANT_TYPE.SNP], 2)
    self.assertEqual(len(fn_vars.all_locations), 0)
    self.assertEqual(len(fp_vars.all_locations), 0)
def testRescueMission(self):
    # false negative variant at location is SV; don't rescue
    sv_fn_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 8000 . G GATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCT 20 PASS . GT 1/1\n """
    sv_fp_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 8000 . G GC 20 PASS . GT 1/1\n """
    true_vars = vcf_to_ChromVariants(sv_fn_vcf, 'chr1')
    pred_vars = vcf_to_ChromVariants(sv_fp_vcf, 'chr1')
    # rescue_mission returns three values here; the third is the list of
    # rescued variants.
    new_tp, removed_fn, rescued = rescue_mission(
        true_vars,
        pred_vars,
        get_empty_ChromVariants('chr1'),
        8000,
        get_reference(),
        100,
    )
    self.assertFalse(any(count > 0 for count in new_tp.itervalues()))
    self.assertFalse(any(count > 0 for count in removed_fn.itervalues()))

    # variant couldn't be rescued; no change to counts or ChromVariants
    unrescued_fn_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . A C 20 PASS . GT 1/1\n chr1 7 . C T 20 PASS . GT 0/1\n """
    unrescued_fp_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 4 . A C 20 PASS . 
GT 1/1\n """
    fn_vars = vcf_to_ChromVariants(unrescued_fn_vcf, 'chr1')
    fp_vars = vcf_to_ChromVariants(unrescued_fp_vcf, 'chr1')
    new_tp, removed_fn, rescued = rescue_mission(
        fn_vars,
        fp_vars,
        get_empty_ChromVariants('chr1'),
        2,
        get_reference(),
        100,
    )
    self.assertFalse(any(count > 0 for count in new_tp.itervalues()))
    self.assertFalse(any(count > 0 for count in removed_fn.itervalues()))
    self.assertEqual(len(fn_vars.all_locations), 2)
    self.assertEqual(len(fp_vars.all_locations), 1)
    self.assertEqual(rescued, [])

    # variant is rescued; counts change; variants are removed from fn/fp
    rescued_fn_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 2 . TGC TAT 20 PASS . GT 1/1\n """
    rescued_fp_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . G A 20 PASS . GT 1/1\n chr2 4 . C T 20 PASS . GT 1/1\n """
    fn_vars = vcf_to_ChromVariants(rescued_fn_vcf, 'chr2')
    fp_vars = vcf_to_ChromVariants(rescued_fp_vcf, 'chr2')
    new_tp, removed_fn, rescued = rescue_mission(
        fn_vars,
        fp_vars,
        get_empty_ChromVariants('chr2'),
        2,
        get_reference(),
        100,
    )
    self.assertEqual(new_tp[VARIANT_TYPE.INDEL_OTH], 1)
    self.assertEqual(removed_fn[VARIANT_TYPE.SNP], 2)
    self.assertEqual(len(fn_vars.all_locations), 0)
    self.assertEqual(len(fp_vars.all_locations), 0)
    self.assertEqual([var.pos for var in rescued], [3, 4])
def testTooManyPaths(self):
    # Five false-negative and four false-positive deletions around
    # chr1:10000; the rescuer is expected to give up.
    false_neg_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 10049 . CTTAAGCT C 20 PASS . GT 1/1\n chr1 10053 . TGCGT T 20 PASS . GT 0/1\n chr1 10055 . GCTAA G 20 PASS . GT 0/1\n chr1 10057 . TA T 20 PASS . GT 1/1\n chr1 10058 . GC G 20 PASS . GT 1/1\n """
    false_pos_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 10025 . CTTAAGCT C 20 PASS . GT 1/1\n chr1 10028 . TGCGT T 20 PASS . GT 0/1\n chr1 10029 . GCTAA G 20 PASS . GT 0/1\n chr1 10032 . TA T 20 PASS . GT 1/1\n """
    false_negs = vcf_to_ChromVariants(false_neg_vcf, 'chr1')
    false_poss = vcf_to_ChromVariants(false_pos_vcf, 'chr1')
    outcome = SequenceRescuer(
        'chr1',
        10000,
        false_negs,
        false_poss,
        get_empty_ChromVariants('chr2'),
        get_reference(),
        50,
    )
    self.assertFalse(outcome.rescued)
def testEmptyWindow(self):
    # Rescue is attempted at chr1:10049 while the only false negative
    # sits at chr1:8000.
    missed_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 8000 . G C 20 PASS . GT 1/1\n """
    spurious_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 10049 . CTTAAGCT C 20 PASS . GT 1/1\n """
    missed_vars = vcf_to_ChromVariants(missed_vcf, 'chr1')
    spurious_vars = vcf_to_ChromVariants(spurious_vcf, 'chr1')
    rescue_attempt = SequenceRescuer(
        'chr1', 10049,
        missed_vars, spurious_vars,
        get_empty_ChromVariants('chr2'),
        get_reference(), 50,
    )
    self.assertFalse(rescue_attempt.rescued)
def testWindowTooBig(self):
    # The false negatives carry very long REF alleles built by repeating
    # an 11-base unit 300 times.
    repeated_unit_a = 'ATTGTTCATGA' * 300
    repeated_unit_b = 'GCCTAGGGTCA' * 300
    missed_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 7001 . """ + repeated_unit_a + """ A 20 PASS . GT 1/1\n chr1 10100 . """ + repeated_unit_b + """ G 20 PASS . GT 0/1\n """
    spurious_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 10049 . CTTAAGCT C 20 PASS . GT 1/1\n """
    missed_vars = vcf_to_ChromVariants(missed_vcf, 'chr1')
    spurious_vars = vcf_to_ChromVariants(spurious_vcf, 'chr1')
    rescue_attempt = SequenceRescuer(
        'chr1', 10049,
        missed_vars, spurious_vars,
        get_empty_ChromVariants('chr2'),
        get_reference(), 50,
    )
    self.assertFalse(rescue_attempt.rescued)
def testVariantWithMismatchedRef(self):
    # A three-base false negative at chr2:2 versus two single-base false
    # positives at chr2:3 and chr2:4; no rescue is expected.
    missed_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 2 . TGC TAT 20 PASS . GT 1/1\n """
    spurious_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . G C 20 PASS . GT 1/1\n chr2 4 . C T 20 PASS . GT 1/1\n """
    missed_vars = vcf_to_ChromVariants(missed_vcf, 'chr2')
    spurious_vars = vcf_to_ChromVariants(spurious_vcf, 'chr2')
    rescue_attempt = SequenceRescuer(
        'chr2', 2,
        missed_vars, spurious_vars,
        get_empty_ChromVariants('chr2'),
        get_reference(), 50,
    )
    self.assertFalse(rescue_attempt.rescued)