Example #1
0
    def testTruePosRescueMission(self):
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   3   .       TC     T       20      PASS    .       GT      1/1\n
chr4   8   .       C      T       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   4   .       C     T       20      PASS    .       GT      1/1\n
chr4   7   .       TC    T       20      PASS    .       GT      1/1\n
"""
        tp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   5   .       TC    T       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str,'chr4')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr4')
        tp_vars = vcf_to_ChromVariants(tp_str,'chr4')
        num_new_tp,num_removed_fn,rescuedvars = rescue_mission(fn_vars,fp_vars,tp_vars,3,get_reference(),100)
        self.assertEqual(num_new_tp[VARIANT_TYPE.SNP],1)
        self.assertEqual(num_new_tp[VARIANT_TYPE.INDEL_DEL],1)
        self.assertEqual(num_removed_fn[VARIANT_TYPE.SNP],1)
        self.assertEqual(num_removed_fn[VARIANT_TYPE.INDEL_DEL],1)
        self.assertFalse(fn_vars.all_locations)
        self.assertFalse(fp_vars.all_locations)
        self.assertEqual(map(lambda r: r.pos,rescuedvars),[4,7])
Example #2
0
    def testTooManyPaths(self):
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   10049   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
chr1   10053   .       TGCGT        T       20      PASS    .       GT      0/1\n
chr1   10055   .       GCTAA        G       20      PASS    .       GT      0/1\n
chr1   10057   .       TA           T       20      PASS    .       GT      1/1\n
chr1   10058   .       GC           G       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   10025   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
chr1   10028   .       TGCGT        T       20      PASS    .       GT      0/1\n
chr1   10029   .       GCTAA        G       20      PASS    .       GT      0/1\n
chr1   10032   .       TA           T       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str, 'chr1')
        fp_vars = vcf_to_ChromVariants(fp_str, 'chr1')
        rescuer = SequenceRescuer('chr1', 10000, fn_vars, fp_vars,
                                  get_empty_ChromVariants('chr2'),
                                  get_reference(), 50)
        self.assertFalse(rescuer.rescued)
Example #3
0
    def testChromEvaluateVariantsKnownFP(self):
        # one known true variant
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      0/1\n
        """
        # call var where known fp is, where true var is, where nothing is known
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS    .       GT      0/1\n
chr1    4       .       G       C       20      PASS    .       GT      1/1\n
chr1    7       .       G       A       20      PASS    .       GT      0/1\n
        """
        # known locations with NO variant
        known_fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    1       .       A       T       20      PASS    .       GT       ./.\n
chr1    7       .       G       .       20      PASS    .       GT       0/0\n
        """
        true_vars = vcf_to_ChromVariants(true_str, 'chr1')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr1')
        known_fp_io = StringIO.StringIO(known_fp_str)
        known_fp = Variants(vcf.Reader(known_fp_io),
                            MAX_INDEL_LEN,
                            knownFP=True)
        cvs = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                      get_reference(), 50,
                                      known_fp.on_chrom('chr1'))
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP],
                         2)  # usual definition, in pred vars but not in true
        self.assertEqual(cvs.calls_at_known_fp[VARIANT_TYPE.SNP],
                         1)  # call at location known to NOT have SNP
Example #4
0
    def testInit(self):
        # test counts of false positive, false negative, true positive
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     A       20      PASS    .       GT      1/1\n
chr2   5   .       C     T       20      PASS    .       GT      1/1\n
"""
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     A       20      PASS    .       GT      1/1\n
chr2   7   .       G     C       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str,'chr2')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr2')
        gtdict = _genotype_concordance_dict()
        gtdict[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HOM_VAR][GENOTYPE_TYPE.HOM_VAR] += 1
        cvs = ChromVariantStats(true_vars,pred_vars,[3],[7],[5],gtdict)
        self.assertEqual(cvs.num_true[VARIANT_TYPE.SNP],2)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.SNP],2)
        self.assertEqual(len(cvs.false_positives.all_locations),1)
        self.assertEqual(len(cvs.false_negatives.all_locations),1)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SNP],1)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.SNP],1)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP],1)
Example #5
0
    def testTruePosRectify(self):
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   3   .       TC     T       20      PASS    .       GT      1/1\n
chr4   5   .       TC     T       20      PASS    .       GT      1/1\n
chr4   8   .       C      T       20      PASS    .       GT      1/1\n
"""
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   4   .       C     T       20      PASS    .       GT      1/1\n
chr4   5   .       TC    T       20      PASS    .       GT      1/1\n
chr4   7   .       TC    T       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str,'chr4')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr4')
        gtdict = _genotype_concordance_dict()
        cvs = ChromVariantStats(true_vars,pred_vars,[5],[4,7],[3,8],gtdict)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.SNP],1)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP],1)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL],1)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL],1)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL],1)
        cvs.rectify(get_reference(),100)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SNP],1)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL],0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL],2)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL],0)
Example #6
0
    def testRescueChromEvalVariants(self):
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GC     G       20      PASS    .       GT      1/1\n
chr2   6   .       G      A       20      PASS    .       GT      1/1\n
"""
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GCCG     GCA       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str, 'chr2')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr2')
        cvs = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                      get_reference(), 50)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.INDEL_OTH], 1)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_OTH], 1)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_OTH], 0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_OTH], 0)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.INDEL_DEL], 0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL], 0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL], 0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL], 0)
Example #7
0
    def testRectify2(self):
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GC     G       20      PASS    .       GT      1/1\n
chr2   6   .       G      A       20      PASS    .       GT      1/1\n
"""
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GCCG     GCA       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str, 'chr2')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr2')
        gtdict = _genotype_concordance_dict()
        cvs = ChromVariantStats(true_vars, pred_vars, [], [3, 6], [3], gtdict)
        cvs.rectify(get_reference(), 100)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.INDEL_OTH], 1)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_OTH], 1)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_OTH], 0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_OTH], 0)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.INDEL_DEL], 0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL], 0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL], 0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL], 0)
Example #8
0
    def testRectify(self):
        # rectify CVS with a rescue-able indel
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   2   .       TGC     TAT       20      PASS    .       GT      1/1\n
"""
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     A       20      PASS    .       GT      1/1\n
chr2   4   .       C     T       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str,'chr2')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr2')
        gtdict = _genotype_concordance_dict() # leave empty, we aren't testing this yet
        cvs = ChromVariantStats(true_vars,pred_vars,[],[3,4],[2],gtdict)
        # before rectify, no true positives
        self.assertTrue(all(map(lambda x: x == 0,cvs.num_tp.itervalues())))
        # one false negative indel
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_OTH],1)
        # two false positives SNPs
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP],2)
        cvs.rectify(get_reference(),100)
        # after rectify, one true positive indel
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_OTH],1)
        # no false positives or false negatives
        self.assertTrue(all(map(lambda x: x == 0, cvs.num_fp.itervalues())))
        self.assertTrue(all(map(lambda x: x ==0, cvs.num_fn.itervalues())))
Example #9
0
    def testRescueChromEvalVariants(self):
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GC     G       20      PASS    .       GT      1/1\n
chr2   6   .       G      A       20      PASS    .       GT      1/1\n
"""
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GCCG     GCA       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str,'chr2')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr2')
        cvs = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.INDEL_OTH],1)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_OTH],1)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_OTH],0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_OTH],0)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.INDEL_DEL],0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL],0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL],0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL],0)
Example #10
0
    def testTruePosRescueMission(self):
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   3   .       TC     T       20      PASS    .       GT      1/1\n
chr4   8   .       C      T       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   4   .       C     T       20      PASS    .       GT      1/1\n
chr4   7   .       TC    T       20      PASS    .       GT      1/1\n
"""
        tp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   5   .       TC    T       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str, 'chr4')
        fp_vars = vcf_to_ChromVariants(fp_str, 'chr4')
        tp_vars = vcf_to_ChromVariants(tp_str, 'chr4')
        num_new_tp, num_removed_fn = rescue_mission(fn_vars, fp_vars, tp_vars,
                                                    3, get_reference(), 100)
        self.assertEqual(num_new_tp[VARIANT_TYPE.SNP], 1)
        self.assertEqual(num_new_tp[VARIANT_TYPE.INDEL_DEL], 1)
        self.assertEqual(num_removed_fn[VARIANT_TYPE.SNP], 1)
        self.assertEqual(num_removed_fn[VARIANT_TYPE.INDEL_DEL], 1)
        self.assertFalse(fn_vars.all_locations)
        self.assertFalse(fp_vars.all_locations)
Example #11
0
    def testRectify(self):
        # rectify CVS with a rescue-able indel
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   2   .       TGC     TAT       20      PASS    .       GT      1/1\n
"""
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     A       20      PASS    .       GT      1/1\n
chr2   4   .       C     T       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str, 'chr2')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr2')
        gtdict = _genotype_concordance_dict(
        )  # leave empty, we aren't testing this yet
        cvs = ChromVariantStats(true_vars, pred_vars, [], [3, 4], [2], gtdict)
        # before rectify, no true positives
        self.assertTrue(all(map(lambda x: x == 0, cvs.num_tp.itervalues())))
        # one false negative indel
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_OTH], 1)
        # two false positives SNPs
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP], 2)
        cvs.rectify(get_reference(), 100)
        # after rectify, one true positive indel
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_OTH], 1)
        # no false positives or false negatives
        self.assertTrue(all(map(lambda x: x == 0, cvs.num_fp.itervalues())))
        self.assertTrue(all(map(lambda x: x == 0, cvs.num_fn.itervalues())))
Example #12
0
    def testChromEvaluateVariantsKnownFP(self):
        # one known true variant
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      0/1\n
        """
        # call var where known fp is, where true var is, where nothing is known
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS    .       GT      0/1\n
chr1    4       .       G       C       20      PASS    .       GT      1/1\n
chr1    7       .       G       A       20      PASS    .       GT      0/1\n
        """
        # known locations with NO variant
        known_fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    1       .       A       T       20      PASS    .       GT       ./.\n
chr1    7       .       G       .       20      PASS    .       GT       0/0\n
        """
        true_vars = vcf_to_ChromVariants(true_str,'chr1')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr1')
        known_fp_io = StringIO.StringIO(known_fp_str)
        known_fp = Variants(vcf.Reader(known_fp_io),MAX_INDEL_LEN,knownFP=True)
        cvs = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50,known_fp.on_chrom('chr1'))
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP],2) # usual definition, in pred vars but not in true
        self.assertEqual(cvs.calls_at_known_fp[VARIANT_TYPE.SNP],1) # call at location known to NOT have SNP
Example #13
0
    def testInit(self):
        # test counts of false positive, false negative, true positive
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     A       20      PASS    .       GT      1/1\n
chr2   5   .       C     T       20      PASS    .       GT      1/1\n
"""
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     A       20      PASS    .       GT      1/1\n
chr2   7   .       G     C       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str, 'chr2')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr2')
        gtdict = _genotype_concordance_dict()
        gtdict[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HOM_VAR][
            GENOTYPE_TYPE.HOM_VAR] += 1
        cvs = ChromVariantStats(true_vars, pred_vars, [3], [7], [5], gtdict)
        self.assertEqual(cvs.num_true[VARIANT_TYPE.SNP], 2)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.SNP], 2)
        self.assertEqual(len(cvs.false_positives.all_locations), 1)
        self.assertEqual(len(cvs.false_negatives.all_locations), 1)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SNP], 1)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.SNP], 1)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP], 1)
Example #14
0
    def testTruePosRectify(self):
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   3   .       TC     T       20      PASS    .       GT      1/1\n
chr4   5   .       TC     T       20      PASS    .       GT      1/1\n
chr4   8   .       C      T       20      PASS    .       GT      1/1\n
"""
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   4   .       C     T       20      PASS    .       GT      1/1\n
chr4   5   .       TC    T       20      PASS    .       GT      1/1\n
chr4   7   .       TC    T       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str, 'chr4')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr4')
        gtdict = _genotype_concordance_dict()
        cvs = ChromVariantStats(true_vars, pred_vars, [5], [4, 7], [3, 8],
                                gtdict)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.SNP], 1)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP], 1)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL], 1)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL], 1)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL], 1)
        cvs.rectify(get_reference(), 100)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SNP], 1)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL], 0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL], 2)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL], 0)
Example #15
0
    def testRectify2(self):
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GC     G       20      PASS    .       GT      1/1\n
chr2   6   .       G      A       20      PASS    .       GT      1/1\n
"""
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GCCG     GCA       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str,'chr2')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr2')
        gtdict = _genotype_concordance_dict()
        cvs = ChromVariantStats(true_vars, pred_vars, [], [3,6], [3],gtdict)
        cvs.rectify(get_reference(),100)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.INDEL_OTH],1)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_OTH],1)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_OTH],0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_OTH],0)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.INDEL_DEL],0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL],0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL],0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL],0)
Example #16
0
    def testChromEvaluateGenotypeConcordance(self):
        true_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      0/1\n
chr1    5       .       C       T       20      PASS     .      GT      0/1\n
chr1    9       .       A       G       20      PASS     .      GT      1/1\n
        """
        pred_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      1/1\n
chr1    6       .       C       G       20      PASS     .      GT      0/1\n
chr1    9       .       A       G       20      PASS     .      GT      1/1\n
        """
        true_vars = vcf_to_ChromVariants(true_str, 'chr1')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr1')
        cvs = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                      get_reference(), 50)
        self.assertEqual(
            cvs.genotype_concordance[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HET][
                GENOTYPE_TYPE.HOM_VAR], 1)
        self.assertEqual(
            cvs.genotype_concordance[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HOM_VAR][
                GENOTYPE_TYPE.HOM_VAR], 1)
        # anything other than TP don't get counted in genotype concordance
        self.assertEqual(cvs._nrd_counts(VARIANT_TYPE.SNP), (1, 2))
        # phased variants should be counted correctly
        true_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      0|1\n
chr1    9       .       A       G       20      PASS     .      GT      1|1\n
        """
        pred_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      1|0\n
chr1    9       .       A       G       20      PASS     .      GT      1|1\n
        """
        true_vars = vcf_to_ChromVariants(true_str, 'chr1')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr1')
        cvs = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                      get_reference(), 50)
        self.assertEqual(
            cvs.genotype_concordance[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HET][
                GENOTYPE_TYPE.HET], 1)
        self.assertEqual(
            cvs.genotype_concordance[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HOM_VAR][
                GENOTYPE_TYPE.HOM_VAR], 1)
        self.assertEqual(cvs._nrd_counts(VARIANT_TYPE.SNP), (0, 2))
Example #17
0
    def testChromEvaluateVariantsSV(self):
        #NB: SVs aren't rescued, just checked for within breakpoint tolerance
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   6   .       C     CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA       20      PASS    .       GT      0/1
"""
        #SV with exact position, exact allele match
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   6   .       C     CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA       20      PASS    .       GT      0/1
"""
        true_vars = vcf_to_ChromVariants(true_str, 'chr1')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr1')
        cvs = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                      get_reference(), 50)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SV_INS], 1)
        #SV with exact position, difference allele match
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   6   .       C     CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAATGC       20      PASS    .       GT      0/1
"""
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr1')
        cvs = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                      get_reference(), 50)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SV_INS], 1)
        #SV with position within tolerance, exact allele match
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   4   .       C     CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA       20      PASS    .       GT      0/1
"""

        pred_vars = vcf_to_ChromVariants(pred_str, 'chr1')
        cvs = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                      get_reference(), 50)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SV_INS], 1)
        #SV outside of tolerance
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   110   .       C     CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA       20      PASS    .       GT      0/1
"""
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr1')
        cvs = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                      get_reference(), 50)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SV_INS], 0)
Example #18
0
    def testVariantWithMismatchedRef(self):
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   2   .       TGC     TAT       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     C       20      PASS    .       GT      1/1\n
chr2   4   .       C     T       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str,'chr2')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr2')
        rescuer = SequenceRescuer('chr2',2,fn_vars,fp_vars,get_empty_ChromVariants('chr2'),get_reference(),50)
        self.assertFalse(rescuer.rescued)
Example #19
0
    def testIndelDeletionMismatchedAllele(self):
        true_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr3   5        .       ATC     A       20      PASS    .       GT      0/1\n
        """
        pred_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr3   5        .       ATCG    A       20      PASS    .       GT      0/1
        """
        true_vars = vcf_to_ChromVariants(true_str,'chr3')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr3')
        cvs = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL],0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL],1)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL],1)
Example #20
0
    def testEmptyWindow(self):
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   8000   .       G     C       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   10049   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str,'chr1')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr1')
        rescuer = SequenceRescuer('chr1',10049,fn_vars,fp_vars,get_empty_ChromVariants('chr2'),get_reference(),50)
        self.assertFalse(rescuer.rescued)
Example #21
0
    def testChromEvaluateVariantsSV(self):
        #NB: SVs aren't rescued, just checked for within breakpoint tolerance
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   6   .       C     CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA       20      PASS    .       GT      0/1
"""
        #SV with exact position, exact allele match
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   6   .       C     CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA       20      PASS    .       GT      0/1
"""
        true_vars = vcf_to_ChromVariants(true_str,'chr1')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr1')
        cvs = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SV_INS],1)
        #SV with exact position, difference allele match
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   6   .       C     CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAATGC       20      PASS    .       GT      0/1
"""
        pred_vars = vcf_to_ChromVariants(pred_str,'chr1')
        cvs = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SV_INS],1)
        #SV with position within tolerance, exact allele match
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   4   .       C     CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA       20      PASS    .       GT      0/1
"""

        pred_vars = vcf_to_ChromVariants(pred_str,'chr1')
        cvs = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SV_INS],1)
        #SV outside of tolerance
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   110   .       C     CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA       20      PASS    .       GT      0/1
"""
        pred_vars = vcf_to_ChromVariants(pred_str,'chr1')
        cvs = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SV_INS],0)
Example #22
0
    def testIndelDeletionMismatchedAllele(self):
        true_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr3   5        .       ATC     A       20      PASS    .       GT      0/1\n
        """
        pred_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr3   5        .       ATCG    A       20      PASS    .       GT      0/1
        """
        true_vars = vcf_to_ChromVariants(true_str, 'chr3')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr3')
        cvs = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                      get_reference(), 50)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL], 0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL], 1)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL], 1)
Example #23
0
    def testVariantWithMismatchedRef(self):
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   2   .       TGC     TAT       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     C       20      PASS    .       GT      1/1\n
chr2   4   .       C     T       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str, 'chr2')
        fp_vars = vcf_to_ChromVariants(fp_str, 'chr2')
        rescuer = SequenceRescuer('chr2', 2, fn_vars, fp_vars,
                                  get_empty_ChromVariants('chr2'),
                                  get_reference(), 50)
        self.assertFalse(rescuer.rescued)
Example #24
0
    def testAggregate(self):
        # build two ChromVariantStats objects
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     A       20      PASS    .       GT      1/1\n
chr2   5   .       C     T       20      PASS    .       GT      1/1\n
"""
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     A       20      PASS    .       GT      1/1\n
chr2   7   .       G     C       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str,'chr2')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr2')
        gtdict = _genotype_concordance_dict() # leave empty for now
        cvs2 = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50)
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr3   3   .       G     A       20      PASS    .       GT      1/1\n
chr3   5   .       C     T       20      PASS    .       GT      1/1\n
"""
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr3   3   .       G     A       20      PASS    .       GT      1/1\n
chr3   4   .       T     A       20      PASS    .       GT      1/1\n
chr3   7   .       G     C       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str,'chr3')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr3')
        cvs3 = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50)
        #cvs5 = ChromVariantStats(true_vars,pred_vars,[31],[49,79],[52],_genotype_concordance_dict())
        aggregator,errors = _aggregate([cvs2,cvs3])
        # test some sums
        self.assertEqual(cvs2.num_true[VARIANT_TYPE.SNP],2)
        self.assertEqual(cvs3.num_true[VARIANT_TYPE.SNP],2)
        self.assertEqual(aggregator(VARIANT_TYPE.SNP)['num_true'],4)
        self.assertEqual(cvs2.num_tp[VARIANT_TYPE.SNP],1)
        self.assertEqual(cvs3.num_tp[VARIANT_TYPE.SNP],1)
        self.assertEqual(aggregator(VARIANT_TYPE.SNP)['good_predictions'],2)
Example #25
0
    def testEmptyWindow(self):
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   8000   .       G     C       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   10049   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str, 'chr1')
        fp_vars = vcf_to_ChromVariants(fp_str, 'chr1')
        rescuer = SequenceRescuer('chr1', 10049, fn_vars, fp_vars,
                                  get_empty_ChromVariants('chr2'),
                                  get_reference(), 50)
        self.assertFalse(rescuer.rescued)
Example #26
0
    def testWindowTooBig(self):
        longsv1 = 'ATTGTTCATGA'*300
        longsv2 = 'GCCTAGGGTCA'*300
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   7001   .       """ + longsv1 + """     A       20      PASS    .       GT      1/1\n
chr1   10100   .       """ + longsv2 + """     G       20      PASS    .       GT      0/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   10049   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str,'chr1')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr1')
        rescuer = SequenceRescuer('chr1',10049,fn_vars,fp_vars,get_empty_ChromVariants('chr2'),get_reference(),50)
        self.assertFalse(rescuer.rescued)
Example #27
0
    def testFullRescue(self):
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   2   .       TGC     TAT       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     A       20      PASS    .       GT      1/1\n
chr2   4   .       C     T       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str,'chr2')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr2')
        rescuer = SequenceRescuer('chr2',2,fn_vars,fp_vars,get_empty_ChromVariants('chr2'),get_reference(),50)
        self.assertTrue(rescuer.rescued)
        self.assertEqual(rescuer.windowsRescued,(0,0))

        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GC     G       20      PASS    .       GT      1/1\n
chr2   6   .       G      A       20      PASS    .       GT      1/1\n
"""
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GCCG     GCA       20      PASS    .       GT      1/1\n
"""
        fp_vars = vcf_to_ChromVariants(fp_str,'chr2')
        fn_vars = vcf_to_ChromVariants(fn_str,'chr2')
        rescuer = SequenceRescuer('chr2',3,fn_vars,fp_vars,get_empty_ChromVariants('chr2'),get_reference(),50)
        self.assertTrue(rescuer.rescued)

        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   3   .       TC     T       20      PASS    .       GT      1/1\n
chr4   8   .       C      T       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   4   .       C     T       20      PASS    .       GT      1/1\n
chr4   7   .       TC    T       20      PASS    .       GT      1/1\n
"""
        tp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   5   .       TC    T       20      PASS    .       GT      1/1\n
        """
        fn_vars = vcf_to_ChromVariants(fn_str,'chr4')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr4')
        tp_vars = vcf_to_ChromVariants(tp_str,'chr4')
        rescuer = SequenceRescuer('chr4',3,fn_vars,fp_vars,tp_vars,get_reference(),50)
        self.assertTrue(rescuer.rescued)
Example #28
0
    def testChromEvaluateGenotypeConcordance(self):
        true_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      0/1\n
chr1    5       .       C       T       20      PASS     .      GT      0/1\n
chr1    9       .       A       G       20      PASS     .      GT      1/1\n
        """
        pred_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      1/1\n
chr1    6       .       C       G       20      PASS     .      GT      0/1\n
chr1    9       .       A       G       20      PASS     .      GT      1/1\n
        """
        true_vars = vcf_to_ChromVariants(true_str,'chr1')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr1')
        cvs = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50)
        self.assertEqual(cvs.genotype_concordance[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HET][GENOTYPE_TYPE.HOM_VAR],1)
        self.assertEqual(cvs.genotype_concordance[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HOM_VAR][GENOTYPE_TYPE.HOM_VAR],1)
        # anything other than TP don't get counted in genotype concordance
        self.assertEqual(cvs._nrd_counts(VARIANT_TYPE.SNP),(1,2))
        # phased variants should be counted correctly
        true_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      0|1\n
chr1    9       .       A       G       20      PASS     .      GT      1|1\n
        """
        pred_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      1|0\n
chr1    9       .       A       G       20      PASS     .      GT      1|1\n
        """
        true_vars = vcf_to_ChromVariants(true_str,'chr1')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr1')
        cvs = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50)
        self.assertEqual(cvs.genotype_concordance[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HET][GENOTYPE_TYPE.HET],1)
        self.assertEqual(cvs.genotype_concordance[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HOM_VAR][GENOTYPE_TYPE.HOM_VAR],1)
        self.assertEqual(cvs._nrd_counts(VARIANT_TYPE.SNP),(0,2))
Example #29
0
    def testEnlargeBounds(self):
        #no variant overlaps low, so no change
        #variant exactly abuts high, so get back high + 1
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr19   88012   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
"""
        variants = vcf_to_ChromVariants(pred_str, 'chr19')
        (low, high) = _enlarge_bounds(variants, 88000, 88020)
        self.assertEqual(low, 88000)
        self.assertEqual(high, 88021)
Example #30
0
    def testWindowTooBig(self):
        longsv1 = 'ATTGTTCATGA' * 300
        longsv2 = 'GCCTAGGGTCA' * 300
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   7001   .       """ + longsv1 + """     A       20      PASS    .       GT      1/1\n
chr1   10100   .       """ + longsv2 + """     G       20      PASS    .       GT      0/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   10049   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str, 'chr1')
        fp_vars = vcf_to_ChromVariants(fp_str, 'chr1')
        rescuer = SequenceRescuer('chr1', 10049, fn_vars, fp_vars,
                                  get_empty_ChromVariants('chr2'),
                                  get_reference(), 50)
        self.assertFalse(rescuer.rescued)
Example #31
0
    def testEnlargeBounds(self):
        #no variant overlaps low, so no change
        #variant exactly abuts high, so get back high + 1
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr19   88012   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
"""
        variants = vcf_to_ChromVariants(pred_str,'chr19')
        (low,high) = _enlarge_bounds(variants,88000,88020)
        self.assertEqual(low,88000)
        self.assertEqual(high,88021)
Example #32
0
    def testRescueMission(self):
        # false negative variant at location is SV; don't rescue
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   8000   .       G     GATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCT       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   8000   .       G     GC       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(fn_str,'chr1')
        pred_vars = vcf_to_ChromVariants(fp_str,'chr1')
        num_new_tp,num_removed_fn,rescuedvars = rescue_mission(true_vars,pred_vars,get_empty_ChromVariants('chr1'),8000,get_reference(),100)
        self.assertFalse(any(map(lambda x: x > 0, num_new_tp.itervalues())))
        self.assertFalse(any(map(lambda x: x > 0, num_removed_fn.itervalues())))
        # variant couldn't be rescued; no change to counts or ChromVariants
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       A     C       20      PASS    .       GT      1/1\n
chr1   7   .       C        T       20      PASS    .       GT      0/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   4   .       A     C       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str,'chr1')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr1')
        num_new_tp,num_removed_fn,rescuedvars = rescue_mission(fn_vars,fp_vars,get_empty_ChromVariants('chr1'),2,get_reference(),100)
        self.assertFalse(any(map(lambda x: x > 0, num_new_tp.itervalues())))
        self.assertFalse(any(map(lambda x: x > 0, num_removed_fn.itervalues())))
        self.assertEqual(len(fn_vars.all_locations),2)
        self.assertEqual(len(fp_vars.all_locations),1)
        self.assertEqual(rescuedvars,[])
        # variant is rescued; counts change; variants are removed from fn/fp
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   2   .       TGC     TAT       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     A       20      PASS    .       GT      1/1\n
chr2   4   .       C     T       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str,'chr2')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr2')
        num_new_tp,num_removed_fn,rescuedvars = rescue_mission(fn_vars,fp_vars,get_empty_ChromVariants('chr2'),2,get_reference(),100)
        self.assertEqual(num_new_tp[VARIANT_TYPE.INDEL_OTH],1)
        self.assertEqual(num_removed_fn[VARIANT_TYPE.SNP],2)
        self.assertEqual(len(fn_vars.all_locations),0)
        self.assertEqual(len(fp_vars.all_locations),0)
        self.assertEqual(map(lambda r: r.pos,rescuedvars),[3,4])
Example #33
0
    def testRescueMission(self):
        # false negative variant at location is SV; don't rescue
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   8000   .       G     GATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCT       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   8000   .       G     GC       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(fn_str,'chr1')
        pred_vars = vcf_to_ChromVariants(fp_str,'chr1')
        num_new_tp,num_removed_fn,rescuedvars = rescue_mission(true_vars,pred_vars,get_empty_ChromVariants('chr1'),8000,get_reference(),100)
        self.assertFalse(any(map(lambda x: x > 0, num_new_tp.itervalues())))
        self.assertFalse(any(map(lambda x: x > 0, num_removed_fn.itervalues())))
        # variant couldn't be rescued; no change to counts or ChromVariants
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       A     C       20      PASS    .       GT      1/1\n
chr1   7   .       C        T       20      PASS    .       GT      0/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   4   .       A     C       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str,'chr1')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr1')
        num_new_tp,num_removed_fn,rescuedvars = rescue_mission(fn_vars,fp_vars,get_empty_ChromVariants('chr1'),2,get_reference(),100)
        self.assertFalse(any(map(lambda x: x > 0, num_new_tp.itervalues())))
        self.assertFalse(any(map(lambda x: x > 0, num_removed_fn.itervalues())))
        self.assertEqual(len(fn_vars.all_locations),2)
        self.assertEqual(len(fp_vars.all_locations),1)
        self.assertEqual(rescuedvars,[])
        # variant is rescued; counts change; variants are removed from fn/fp
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   2   .       TGC     TAT       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     A       20      PASS    .       GT      1/1\n
chr2   4   .       C     T       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str,'chr2')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr2')
        num_new_tp,num_removed_fn,rescuedvars = rescue_mission(fn_vars,fp_vars,get_empty_ChromVariants('chr2'),2,get_reference(),100)
        self.assertEqual(num_new_tp[VARIANT_TYPE.INDEL_OTH],1)
        self.assertEqual(num_removed_fn[VARIANT_TYPE.SNP],2)
        self.assertEqual(len(fn_vars.all_locations),0)
        self.assertEqual(len(fp_vars.all_locations),0)
        self.assertEqual(map(lambda r: r.pos,rescuedvars),[3,4])
Example #34
0
    def testOverlappingVariants(self):
        # if vcf contains overlapping variants, don't rescue that sequence
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   1        .       T       G       20      PASS    .       GT      1/1\n
"""
        tp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .    GCC     G       20      PASS    .       GT      1/1\n
chr2   4   .    C       G       20      PASS    .       GT      0/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   7   .       GA     A       20      PASS    .       GT      1/1\n
"""
        tp_vars = vcf_to_ChromVariants(tp_str,'chr2')
        fn_vars = vcf_to_ChromVariants(fn_str,'chr2')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr2')
        rescuer = SequenceRescuer('chr2',1,fn_vars,fp_vars,tp_vars,get_reference(),50)
        self.assertFalse(rescuer.rescued)
Example #35
0
    def testOverlappingVariants(self):
        # if vcf contains overlapping variants, don't rescue that sequence
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   1        .       T       G       20      PASS    .       GT      1/1\n
"""
        tp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .    GCC     G       20      PASS    .       GT      1/1\n
chr2   4   .    C       G       20      PASS    .       GT      0/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   7   .       GA     A       20      PASS    .       GT      1/1\n
"""
        tp_vars = vcf_to_ChromVariants(tp_str, 'chr2')
        fn_vars = vcf_to_ChromVariants(fn_str, 'chr2')
        fp_vars = vcf_to_ChromVariants(fp_str, 'chr2')
        rescuer = SequenceRescuer('chr2', 1, fn_vars, fp_vars, tp_vars,
                                  get_reference(), 50)
        self.assertFalse(rescuer.rescued)
Example #36
0
    def testTooManyPaths(self):
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   10049   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
chr1   10053   .       TGCGT        T       20      PASS    .       GT      0/1\n
chr1   10055   .       GCTAA        G       20      PASS    .       GT      0/1\n
chr1   10057   .       TA           T       20      PASS    .       GT      1/1\n
chr1   10058   .       GC           G       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   10025   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
chr1   10028   .       TGCGT        T       20      PASS    .       GT      0/1\n
chr1   10029   .       GCTAA        G       20      PASS    .       GT      0/1\n
chr1   10032   .       TA           T       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str,'chr1')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr1')
        rescuer = SequenceRescuer('chr1',10000,fn_vars,fp_vars,get_empty_ChromVariants('chr2'),get_reference(),50)
        self.assertFalse(rescuer.rescued)
Example #37
0
    def testNormalizedVariants(self):
        fp_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4    4       .       C       CTC     20      PASS    .       GT      0/1\n
chr4    6       .       C       CTC     20      PASS    .       GT      0/1\n
"""
        fn_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4    2       .       A       ATCTC     20      PASS    .       GT      0/1\n
"""
        fp_vars = normalize_vcf_to_ChromVariants(fp_str,'chr4')
        fn_vars = vcf_to_ChromVariants(fn_str,'chr4')
        rescuer = SequenceRescuer('chr4',2,fn_vars,fp_vars,get_empty_ChromVariants('chr4'),get_reference(),50)
        self.assertTrue(rescuer.rescued)
Example #38
0
    def testGetSeq(self):
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr3   2   .       TCGA     T       20      PASS    .       GT      1/1\n
chr3   9   .       A        AAAA    20      PASS    .       GT      0/1\n
"""
        variants = vcf_to_ChromVariants(pred_str,'chr3')
        window_tup = (1,13,'chr3')
        sequence = _get_seq(window_tup,variants.getAllVariants(),get_reference(),False)
        self.assertEqual(sequence[0],'ATTCGAAAATCG')
        self.assertEqual(sequence[1],'')
        sequence = _get_seq(window_tup,variants.getAllVariants(),get_reference(),True)
        self.assertEqual(sequence[0],'ATTCGATCG')
        self.assertEqual(sequence[1],'ATCGATCGAAAATCG')
Example #39
0
    def testNormalizedVariants(self):
        fp_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4    4       .       C       CTC     20      PASS    .       GT      0/1\n
chr4    6       .       C       CTC     20      PASS    .       GT      0/1\n
"""
        fn_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4    2       .       A       ATCTC     20      PASS    .       GT      0/1\n
"""
        fp_vars = normalize_vcf_to_ChromVariants(fp_str, 'chr4')
        fn_vars = vcf_to_ChromVariants(fn_str, 'chr4')
        rescuer = SequenceRescuer('chr4', 2, fn_vars, fp_vars,
                                  get_empty_ChromVariants('chr4'),
                                  get_reference(), 50)
        self.assertTrue(rescuer.rescued)
Example #40
0
    def testGetSeq(self):
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr3   2   .       TCGA     T       20      PASS    .       GT      1/1\n
chr3   9   .       A        AAAA    20      PASS    .       GT      0/1\n
"""
        variants = vcf_to_ChromVariants(pred_str, 'chr3')
        window_tup = (1, 13, 'chr3')
        sequence = _get_seq(window_tup, variants.getAllVariants(),
                            get_reference(), False)
        self.assertEqual(sequence[0], 'ATTCGAAAATCG')
        self.assertEqual(sequence[1], '')
        sequence = _get_seq(window_tup, variants.getAllVariants(),
                            get_reference(), True)
        self.assertEqual(sequence[0], 'ATTCGATCG')
        self.assertEqual(sequence[1], 'ATCGATCGAAAATCG')
Example #41
0
    def testGetChoppedVariant(self):
        #base case
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr19   88012   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
"""
        variants = vcf_to_ChromVariants(pred_str, 'chr19')
        chopped = _get_chopped_variant(variants, 88015, False)
        self.assertEqual(chopped.pos, 88012)
        chopped = _get_chopped_variant(variants, 88015, True)
        self.assertEqual(chopped.pos, 88012)

        #no variants within range
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr19   87962   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
"""
        variants = vcf_to_ChromVariants(pred_str, 'chr19')
        chopped = _get_chopped_variant(variants, 88015, False)
        self.assertFalse(chopped)
        chopped = _get_chopped_variant(variants, 88015, True)
        self.assertFalse(chopped)

        #ignore non overlapping snp
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr19   88008   .       T        G       20        PASS     .       GT      1/1\n
chr19   88012   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
"""
        variants = vcf_to_ChromVariants(pred_str, 'chr19')
        chopped = _get_chopped_variant(variants, 88015, False)
        self.assertEqual(chopped.pos, 88012)
        chopped = _get_chopped_variant(variants, 88015, True)
        self.assertEqual(chopped.pos, 88012)
        #ignore overlapping snp since indel is farther left (and also right)
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr19   88012   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
chr19   88014   .       T            G       20      PASS    .       GT      1/1\n
"""
        variants = vcf_to_ChromVariants(pred_str, 'chr19')
        chopped = _get_chopped_variant(variants, 88015, False)
        self.assertEqual(chopped.pos, 88012)
        chopped = _get_chopped_variant(variants, 88015, True)
        self.assertEqual(chopped.pos, 88012)

        #find longest of overlapping indels
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr19   88008   .       ATTGCTTAACG       A       20       PASS   .       GT      0/1\n
chr19   88012   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
"""
        variants = vcf_to_ChromVariants(pred_str, 'chr19')
        chopped = _get_chopped_variant(variants, 88015, False)
        self.assertEqual(chopped.pos, 88008)
        chopped = _get_chopped_variant(variants, 88015, True)
        self.assertEqual(chopped.pos, 88012)
Example #42
0
    def testFullRescue(self):
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   2   .       TGC     TAT       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     A       20      PASS    .       GT      1/1\n
chr2   4   .       C     T       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str, 'chr2')
        fp_vars = vcf_to_ChromVariants(fp_str, 'chr2')
        rescuer = SequenceRescuer('chr2', 2, fn_vars, fp_vars,
                                  get_empty_ChromVariants('chr2'),
                                  get_reference(), 50)
        self.assertTrue(rescuer.rescued)
        self.assertEqual(rescuer.windowsRescued, (0, 0))

        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GC     G       20      PASS    .       GT      1/1\n
chr2   6   .       G      A       20      PASS    .       GT      1/1\n
"""
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GCCG     GCA       20      PASS    .       GT      1/1\n
"""
        fp_vars = vcf_to_ChromVariants(fp_str, 'chr2')
        fn_vars = vcf_to_ChromVariants(fn_str, 'chr2')
        rescuer = SequenceRescuer('chr2', 3, fn_vars, fp_vars,
                                  get_empty_ChromVariants('chr2'),
                                  get_reference(), 50)
        self.assertTrue(rescuer.rescued)

        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   3   .       TC     T       20      PASS    .       GT      1/1\n
chr4   8   .       C      T       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   4   .       C     T       20      PASS    .       GT      1/1\n
chr4   7   .       TC    T       20      PASS    .       GT      1/1\n
"""
        tp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   5   .       TC    T       20      PASS    .       GT      1/1\n
        """
        fn_vars = vcf_to_ChromVariants(fn_str, 'chr4')
        fp_vars = vcf_to_ChromVariants(fp_str, 'chr4')
        tp_vars = vcf_to_ChromVariants(tp_str, 'chr4')
        rescuer = SequenceRescuer('chr4', 3, fn_vars, fp_vars, tp_vars,
                                  get_reference(), 50)
        self.assertTrue(rescuer.rescued)
Example #43
0
    def testGetChoppedVariant(self):
        """Check which variant ``_get_chopped_variant`` reports at 88015.

        Every scenario parses a small chr19 VCF and queries it twice, once
        with the final argument ``False`` and once with ``True``, asserting
        either the ``pos`` of the returned variant or that the result is
        falsy.
        """
        # Base case: a single deletion starting at 88012 is returned for both
        # flag values.
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr19   88012   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
"""
        variants = vcf_to_ChromVariants(pred_str,'chr19')
        chopped = _get_chopped_variant(variants,88015,False)
        self.assertEqual(chopped.pos,88012)
        chopped = _get_chopped_variant(variants,88015,True)
        self.assertEqual(chopped.pos,88012)

        # No variants within range: the only record starts at 87962, and the
        # result must be falsy for both flag values.
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr19   87962   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
"""
        variants = vcf_to_ChromVariants(pred_str,'chr19')
        chopped = _get_chopped_variant(variants,88015,False)
        self.assertFalse(chopped)
        chopped = _get_chopped_variant(variants,88015,True)
        self.assertFalse(chopped)

        # Ignore a non-overlapping SNP: the SNP at 88008 is not reported, the
        # deletion at 88012 is.
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr19   88008   .       T        G       20        PASS     .       GT      1/1\n
chr19   88012   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
"""
        variants = vcf_to_ChromVariants(pred_str,'chr19')
        chopped = _get_chopped_variant(variants,88015,False)
        self.assertEqual(chopped.pos,88012)
        chopped = _get_chopped_variant(variants,88015,True)
        self.assertEqual(chopped.pos,88012)
        # Ignore an overlapping SNP, since the indel is farther left (and
        # also farther right): the deletion at 88012 is reported rather than
        # the SNP at 88014.
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr19   88012   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
chr19   88014   .       T            G       20      PASS    .       GT      1/1\n
"""
        variants = vcf_to_ChromVariants(pred_str,'chr19')
        chopped = _get_chopped_variant(variants,88015,False)
        self.assertEqual(chopped.pos,88012)
        chopped = _get_chopped_variant(variants,88015,True)
        self.assertEqual(chopped.pos,88012)

        # Find the longest of overlapping indels.
        # NOTE(review): the two flag values pick different records here
        # (88008 for False, 88012 for True); presumably the flag selects
        # which side of the position is measured -- confirm against
        # _get_chopped_variant.
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr19   88008   .       ATTGCTTAACG       A       20       PASS   .       GT      0/1\n
chr19   88012   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
"""
        variants = vcf_to_ChromVariants(pred_str,'chr19')
        chopped = _get_chopped_variant(variants,88015,False)
        self.assertEqual(chopped.pos,88008)
        chopped = _get_chopped_variant(variants,88015,True)
        self.assertEqual(chopped.pos,88012)