Beispiel #1
0
    def testLeftNormalize(self):
        """Exercise left_normalize on a deletion (chr1) and an insertion (chr4).

        Each case asserts the returned position, reference allele and the
        first alternate allele.
        """
        # deletion: REF 'CGCCG' / ALT 'CG' supplied at chr1 position 2
        deletion = left_normalize(get_reference(), 'chr1', 2, 'CGCCG', ['CG'])
        del_pos, del_ref, del_alts = deletion
        self.assertEqual(del_pos, 0)
        self.assertEqual(del_ref, 'AACGC')
        self.assertEqual(del_alts[0], 'AA')

        # insertion: REF 'G' / ALT 'GGG' supplied at chr4 position 12
        insertion = left_normalize(get_reference(), 'chr4', 12, 'G', ['GGG'])
        ins_pos, ins_ref, ins_alts = insertion
        self.assertEqual(ins_pos, 7)
        self.assertEqual(ins_ref, 'C')
        self.assertEqual(ins_alts[0], 'CGG')
Beispiel #2
0
    def testChromEvaluateGenotypeConcordance(self):
        true_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      0/1\n
chr1    5       .       C       T       20      PASS     .      GT      0/1\n
chr1    9       .       A       G       20      PASS     .      GT      1/1\n
        """
        pred_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      1/1\n
chr1    6       .       C       G       20      PASS     .      GT      0/1\n
chr1    9       .       A       G       20      PASS     .      GT      1/1\n
        """
        true_vars = vcf_to_ChromVariants(true_str, 'chr1')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr1')
        cvs = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                      get_reference(), 50)
        self.assertEqual(
            cvs.genotype_concordance[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HET][
                GENOTYPE_TYPE.HOM_VAR], 1)
        self.assertEqual(
            cvs.genotype_concordance[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HOM_VAR][
                GENOTYPE_TYPE.HOM_VAR], 1)
        # anything other than TP don't get counted in genotype concordance
        self.assertEqual(cvs._nrd_counts(VARIANT_TYPE.SNP), (1, 2))
        # phased variants should be counted correctly
        true_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      0|1\n
chr1    9       .       A       G       20      PASS     .      GT      1|1\n
        """
        pred_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      1|0\n
chr1    9       .       A       G       20      PASS     .      GT      1|1\n
        """
        true_vars = vcf_to_ChromVariants(true_str, 'chr1')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr1')
        cvs = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                      get_reference(), 50)
        self.assertEqual(
            cvs.genotype_concordance[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HET][
                GENOTYPE_TYPE.HET], 1)
        self.assertEqual(
            cvs.genotype_concordance[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HOM_VAR][
                GENOTYPE_TYPE.HOM_VAR], 1)
        self.assertEqual(cvs._nrd_counts(VARIANT_TYPE.SNP), (0, 2))
Beispiel #3
0
    def testLeftNormalize(self):
        """Exercise left_normalize on a deletion (chr1) and an insertion (chr2).

        Each case asserts the returned position, reference allele and the
        first alternate allele.
        """
        # deletion: REF 'CGCCG' / ALT 'CG' supplied at chr1 position 2
        del_result = left_normalize(
            get_reference(), 'chr1', 2, 'CGCCG', ['CG'])
        shifted_pos, shifted_ref, shifted_alts = del_result
        self.assertEqual(shifted_pos, 0)
        self.assertEqual(shifted_ref, 'AACGC')
        self.assertEqual(shifted_alts[0], 'AA')

        # insertion: REF 'CGGA' / ALT 'CTTGGA' supplied at chr2 position 4
        ins_result = left_normalize(
            get_reference(), 'chr2', 4, 'CGGA', ['CTTGGA'])
        shifted_pos, shifted_ref, shifted_alts = ins_result
        self.assertEqual(shifted_pos, 1)
        self.assertEqual(shifted_ref, 'TGCC')
        self.assertEqual(shifted_alts[0], 'TGCCTT')
Beispiel #4
0
    def testChromEvaluateVariantsSV(self):
        #NB: SVs aren't rescued, just checked for within breakpoint tolerance
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   6   .       C     CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA       20      PASS    .       GT      0/1
"""
        #SV with exact position, exact allele match
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   6   .       C     CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA       20      PASS    .       GT      0/1
"""
        true_vars = vcf_to_ChromVariants(true_str, 'chr1')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr1')
        cvs = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                      get_reference(), 50)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SV_INS], 1)
        #SV with exact position, difference allele match
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   6   .       C     CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAATGC       20      PASS    .       GT      0/1
"""
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr1')
        cvs = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                      get_reference(), 50)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SV_INS], 1)
        #SV with position within tolerance, exact allele match
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   4   .       C     CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA       20      PASS    .       GT      0/1
"""

        pred_vars = vcf_to_ChromVariants(pred_str, 'chr1')
        cvs = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                      get_reference(), 50)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SV_INS], 1)
        #SV outside of tolerance
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   110   .       C     CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA       20      PASS    .       GT      0/1
"""
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr1')
        cvs = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                      get_reference(), 50)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SV_INS], 0)
Beispiel #5
0
    def testRectify2(self):
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GC     G       20      PASS    .       GT      1/1\n
chr2   6   .       G      A       20      PASS    .       GT      1/1\n
"""
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GCCG     GCA       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str,'chr2')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr2')
        gtdict = _genotype_concordance_dict()
        cvs = ChromVariantStats(true_vars, pred_vars, [], [3,6], [3],gtdict)
        cvs.rectify(get_reference(),100)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.INDEL_OTH],1)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_OTH],1)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_OTH],0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_OTH],0)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.INDEL_DEL],0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL],0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL],0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL],0)
Beispiel #6
0
    def testRectify(self):
        # rectify CVS with a rescue-able indel
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   2   .       TGC     TAT       20      PASS    .       GT      1/1\n
"""
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     A       20      PASS    .       GT      1/1\n
chr2   4   .       C     T       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str,'chr2')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr2')
        gtdict = _genotype_concordance_dict() # leave empty, we aren't testing this yet
        cvs = ChromVariantStats(true_vars,pred_vars,[],[3,4],[2],gtdict)
        # before rectify, no true positives
        self.assertTrue(all(map(lambda x: x == 0,cvs.num_tp.itervalues())))
        # one false negative indel
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_OTH],1)
        # two false positives SNPs
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP],2)
        cvs.rectify(get_reference(),100)
        # after rectify, one true positive indel
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_OTH],1)
        # no false positives or false negatives
        self.assertTrue(all(map(lambda x: x == 0, cvs.num_fp.itervalues())))
        self.assertTrue(all(map(lambda x: x ==0, cvs.num_fn.itervalues())))
Beispiel #7
0
    def testRescueChromEvalVariants(self):
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GC     G       20      PASS    .       GT      1/1\n
chr2   6   .       G      A       20      PASS    .       GT      1/1\n
"""
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GCCG     GCA       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str,'chr2')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr2')
        cvs = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.INDEL_OTH],1)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_OTH],1)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_OTH],0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_OTH],0)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.INDEL_DEL],0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL],0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL],0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL],0)
Beispiel #8
0
    def testTooManyPaths(self):
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   10049   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
chr1   10053   .       TGCGT        T       20      PASS    .       GT      0/1\n
chr1   10055   .       GCTAA        G       20      PASS    .       GT      0/1\n
chr1   10057   .       TA           T       20      PASS    .       GT      1/1\n
chr1   10058   .       GC           G       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   10025   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
chr1   10028   .       TGCGT        T       20      PASS    .       GT      0/1\n
chr1   10029   .       GCTAA        G       20      PASS    .       GT      0/1\n
chr1   10032   .       TA           T       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str, 'chr1')
        fp_vars = vcf_to_ChromVariants(fp_str, 'chr1')
        rescuer = SequenceRescuer('chr1', 10000, fn_vars, fp_vars,
                                  get_empty_ChromVariants('chr2'),
                                  get_reference(), 50)
        self.assertFalse(rescuer.rescued)
Beispiel #9
0
    def testChromEvaluateVariantsKnownFP(self):
        # one known true variant
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      0/1\n
        """
        # call var where known fp is, where true var is, where nothing is known
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS    .       GT      0/1\n
chr1    4       .       G       C       20      PASS    .       GT      1/1\n
chr1    7       .       G       A       20      PASS    .       GT      0/1\n
        """
        # known locations with NO variant
        known_fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    1       .       A       T       20      PASS    .       GT       ./.\n
chr1    7       .       G       .       20      PASS    .       GT       0/0\n
        """
        true_vars = vcf_to_ChromVariants(true_str,'chr1')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr1')
        known_fp_io = StringIO.StringIO(known_fp_str)
        known_fp = Variants(vcf.Reader(known_fp_io),MAX_INDEL_LEN,knownFP=True)
        cvs = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50,known_fp.on_chrom('chr1'))
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP],2) # usual definition, in pred vars but not in true
        self.assertEqual(cvs.calls_at_known_fp[VARIANT_TYPE.SNP],1) # call at location known to NOT have SNP
Beispiel #10
0
    def testTruePosRescueMission(self):
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   3   .       TC     T       20      PASS    .       GT      1/1\n
chr4   8   .       C      T       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   4   .       C     T       20      PASS    .       GT      1/1\n
chr4   7   .       TC    T       20      PASS    .       GT      1/1\n
"""
        tp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   5   .       TC    T       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str, 'chr4')
        fp_vars = vcf_to_ChromVariants(fp_str, 'chr4')
        tp_vars = vcf_to_ChromVariants(tp_str, 'chr4')
        num_new_tp, num_removed_fn = rescue_mission(fn_vars, fp_vars, tp_vars,
                                                    3, get_reference(), 100)
        self.assertEqual(num_new_tp[VARIANT_TYPE.SNP], 1)
        self.assertEqual(num_new_tp[VARIANT_TYPE.INDEL_DEL], 1)
        self.assertEqual(num_removed_fn[VARIANT_TYPE.SNP], 1)
        self.assertEqual(num_removed_fn[VARIANT_TYPE.INDEL_DEL], 1)
        self.assertFalse(fn_vars.all_locations)
        self.assertFalse(fp_vars.all_locations)
Beispiel #11
0
    def testGetSeq(self):
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr3   2   .       TCGA     T       20      PASS    .       GT      1/1\n
chr3   9   .       A        AAAA    20      PASS    .       GT      0/1\n
"""
        variants = vcf_to_ChromVariants(pred_str,'chr3')
        window_tup = (1,13,'chr3')
        sequence = _get_seq(window_tup,variants.getAllVariants(),get_reference(),False)
        self.assertEqual(sequence[0],'ATTCGAAAATCG')
        self.assertEqual(sequence[1],'')
        sequence = _get_seq(window_tup,variants.getAllVariants(),get_reference(),True)
        self.assertEqual(sequence[0],'ATTCGATCG')
        self.assertEqual(sequence[1],'ATCGATCGAAAATCG')
Beispiel #12
0
    def testChromEvaluateVariantsKnownFP(self):
        # one known true variant
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      0/1\n
        """
        # call var where known fp is, where true var is, where nothing is known
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS    .       GT      0/1\n
chr1    4       .       G       C       20      PASS    .       GT      1/1\n
chr1    7       .       G       A       20      PASS    .       GT      0/1\n
        """
        # known locations with NO variant
        known_fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    1       .       A       T       20      PASS    .       GT       ./.\n
chr1    7       .       G       .       20      PASS    .       GT       0/0\n
        """
        true_vars = vcf_to_ChromVariants(true_str, 'chr1')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr1')
        known_fp_io = StringIO.StringIO(known_fp_str)
        known_fp = Variants(vcf.Reader(known_fp_io),
                            MAX_INDEL_LEN,
                            knownFP=True)
        cvs = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                      get_reference(), 50,
                                      known_fp.on_chrom('chr1'))
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP],
                         2)  # usual definition, in pred vars but not in true
        self.assertEqual(cvs.calls_at_known_fp[VARIANT_TYPE.SNP],
                         1)  # call at location known to NOT have SNP
Beispiel #13
0
    def testRectify2(self):
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GC     G       20      PASS    .       GT      1/1\n
chr2   6   .       G      A       20      PASS    .       GT      1/1\n
"""
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GCCG     GCA       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str, 'chr2')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr2')
        gtdict = _genotype_concordance_dict()
        cvs = ChromVariantStats(true_vars, pred_vars, [], [3, 6], [3], gtdict)
        cvs.rectify(get_reference(), 100)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.INDEL_OTH], 1)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_OTH], 1)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_OTH], 0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_OTH], 0)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.INDEL_DEL], 0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL], 0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL], 0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL], 0)
Beispiel #14
0
    def testTruePosRectify(self):
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   3   .       TC     T       20      PASS    .       GT      1/1\n
chr4   5   .       TC     T       20      PASS    .       GT      1/1\n
chr4   8   .       C      T       20      PASS    .       GT      1/1\n
"""
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   4   .       C     T       20      PASS    .       GT      1/1\n
chr4   5   .       TC    T       20      PASS    .       GT      1/1\n
chr4   7   .       TC    T       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str, 'chr4')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr4')
        gtdict = _genotype_concordance_dict()
        cvs = ChromVariantStats(true_vars, pred_vars, [5], [4, 7], [3, 8],
                                gtdict)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.SNP], 1)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP], 1)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL], 1)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL], 1)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL], 1)
        cvs.rectify(get_reference(), 100)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SNP], 1)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL], 0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL], 2)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL], 0)
Beispiel #15
0
    def testTruePosRectify(self):
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   3   .       TC     T       20      PASS    .       GT      1/1\n
chr4   5   .       TC     T       20      PASS    .       GT      1/1\n
chr4   8   .       C      T       20      PASS    .       GT      1/1\n
"""
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   4   .       C     T       20      PASS    .       GT      1/1\n
chr4   5   .       TC    T       20      PASS    .       GT      1/1\n
chr4   7   .       TC    T       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str,'chr4')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr4')
        gtdict = _genotype_concordance_dict()
        cvs = ChromVariantStats(true_vars,pred_vars,[5],[4,7],[3,8],gtdict)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.SNP],1)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP],1)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL],1)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL],1)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL],1)
        cvs.rectify(get_reference(),100)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP],0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SNP],1)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL],0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL],2)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL],0)
Beispiel #16
0
    def testRescueChromEvalVariants(self):
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GC     G       20      PASS    .       GT      1/1\n
chr2   6   .       G      A       20      PASS    .       GT      1/1\n
"""
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GCCG     GCA       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str, 'chr2')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr2')
        cvs = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                      get_reference(), 50)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.INDEL_OTH], 1)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_OTH], 1)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_OTH], 0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_OTH], 0)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SNP], 0)
        self.assertEqual(cvs.num_pred[VARIANT_TYPE.INDEL_DEL], 0)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL], 0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL], 0)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL], 0)
Beispiel #17
0
    def testRectify(self):
        # rectify CVS with a rescue-able indel
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   2   .       TGC     TAT       20      PASS    .       GT      1/1\n
"""
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     A       20      PASS    .       GT      1/1\n
chr2   4   .       C     T       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str, 'chr2')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr2')
        gtdict = _genotype_concordance_dict(
        )  # leave empty, we aren't testing this yet
        cvs = ChromVariantStats(true_vars, pred_vars, [], [3, 4], [2], gtdict)
        # before rectify, no true positives
        self.assertTrue(all(map(lambda x: x == 0, cvs.num_tp.itervalues())))
        # one false negative indel
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_OTH], 1)
        # two false positives SNPs
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP], 2)
        cvs.rectify(get_reference(), 100)
        # after rectify, one true positive indel
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_OTH], 1)
        # no false positives or false negatives
        self.assertTrue(all(map(lambda x: x == 0, cvs.num_fp.itervalues())))
        self.assertTrue(all(map(lambda x: x == 0, cvs.num_fn.itervalues())))
Beispiel #18
0
    def testChromEvaluateVariantsSV(self):
        #NB: SVs aren't rescued, just checked for within breakpoint tolerance
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   6   .       C     CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA       20      PASS    .       GT      0/1
"""
        #SV with exact position, exact allele match
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   6   .       C     CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA       20      PASS    .       GT      0/1
"""
        true_vars = vcf_to_ChromVariants(true_str,'chr1')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr1')
        cvs = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SV_INS],1)
        #SV with exact position, difference allele match
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   6   .       C     CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAATGC       20      PASS    .       GT      0/1
"""
        pred_vars = vcf_to_ChromVariants(pred_str,'chr1')
        cvs = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SV_INS],1)
        #SV with position within tolerance, exact allele match
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   4   .       C     CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA       20      PASS    .       GT      0/1
"""

        pred_vars = vcf_to_ChromVariants(pred_str,'chr1')
        cvs = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SV_INS],1)
        #SV outside of tolerance
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   110   .       C     CGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGACGTGAGATGAAAAA       20      PASS    .       GT      0/1
"""
        pred_vars = vcf_to_ChromVariants(pred_str,'chr1')
        cvs = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.SV_INS],0)
Beispiel #19
0
 def normalizeStringToWriter(self,vcf_str):
     """Normalize the records in ``vcf_str`` and re-read what was written.

     The VCF text is parsed with ``vcf.Reader``, each normalized record is
     written through a ``VCFWriter`` into an in-memory buffer, and the
     buffer contents are returned wrapped in a new ``vcf.Reader``.
     """
     vcf_io = StringIO.StringIO(vcf_str)
     test_vcf = vcf.Reader(vcf_io)
     output_io = StringIO.StringIO()
     output_writer = VCFWriter('ref.fasta','name',output_io)
     # Explicit loop instead of map(lambda ...): the writes are side effects
     # and must happen eagerly even where ``map`` returns a lazy iterator.
     for record in normalize(get_reference(),test_vcf):
         write(record,output_writer)
     outputStr = output_io.getvalue()
     # Every newline is doubled before re-parsing.
     # NOTE(review): presumably to mirror the blank-line layout of the test
     # fixtures in this file -- confirm.
     outputStr = outputStr.replace('\n','\n\n')
     return vcf.Reader(StringIO.StringIO(outputStr))
Beispiel #20
0
    def testNBaseNormalization(self):
        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4    2       .       A       ATCTT   20      PASS    .          GT      0/1\n
chr4    3       .       NN      N       20      PASS    .          GT      0/1\n
"""
        norm_iter = normalize(get_reference(),self.getVcf(vcf_str))
        self.assertEqual(self.countRecords(norm_iter),1)
Beispiel #21
0
    def testNBaseNormalization(self):
        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4    2       .       A       ATCTT   20      PASS    .          GT      0/1\n
chr4    3       .       NN      N       20      PASS    .          GT      0/1\n
"""
        norm_iter = normalize(get_reference(), self.getVcf(vcf_str))
        self.assertEqual(self.countRecords(norm_iter), 1)
Beispiel #22
0
    def testGetSeq(self):
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr3   2   .       TCGA     T       20      PASS    .       GT      1/1\n
chr3   9   .       A        AAAA    20      PASS    .       GT      0/1\n
"""
        variants = vcf_to_ChromVariants(pred_str, 'chr3')
        window_tup = (1, 13, 'chr3')
        sequence = _get_seq(window_tup, variants.getAllVariants(),
                            get_reference(), False)
        self.assertEqual(sequence[0], 'ATTCGAAAATCG')
        self.assertEqual(sequence[1], '')
        sequence = _get_seq(window_tup, variants.getAllVariants(),
                            get_reference(), True)
        self.assertEqual(sequence[0], 'ATTCGATCG')
        self.assertEqual(sequence[1], 'ATCGATCGAAAATCG')
Beispiel #23
0
    def testFullRescue(self):
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   2   .       TGC     TAT       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     A       20      PASS    .       GT      1/1\n
chr2   4   .       C     T       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str,'chr2')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr2')
        rescuer = SequenceRescuer('chr2',2,fn_vars,fp_vars,get_empty_ChromVariants('chr2'),get_reference(),50)
        self.assertTrue(rescuer.rescued)
        self.assertEqual(rescuer.windowsRescued,(0,0))

        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GC     G       20      PASS    .       GT      1/1\n
chr2   6   .       G      A       20      PASS    .       GT      1/1\n
"""
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       GCCG     GCA       20      PASS    .       GT      1/1\n
"""
        fp_vars = vcf_to_ChromVariants(fp_str,'chr2')
        fn_vars = vcf_to_ChromVariants(fn_str,'chr2')
        rescuer = SequenceRescuer('chr2',3,fn_vars,fp_vars,get_empty_ChromVariants('chr2'),get_reference(),50)
        self.assertTrue(rescuer.rescued)

        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   3   .       TC     T       20      PASS    .       GT      1/1\n
chr4   8   .       C      T       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   4   .       C     T       20      PASS    .       GT      1/1\n
chr4   7   .       TC    T       20      PASS    .       GT      1/1\n
"""
        tp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4   5   .       TC    T       20      PASS    .       GT      1/1\n
        """
        fn_vars = vcf_to_ChromVariants(fn_str,'chr4')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr4')
        tp_vars = vcf_to_ChromVariants(tp_str,'chr4')
        rescuer = SequenceRescuer('chr4',3,fn_vars,fp_vars,tp_vars,get_reference(),50)
        self.assertTrue(rescuer.rescued)
Beispiel #24
0
    def testAggregate(self):
        # build two ChromVariantStats objects
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     A       20      PASS    .       GT      1/1\n
chr2   5   .       C     T       20      PASS    .       GT      1/1\n
"""
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     A       20      PASS    .       GT      1/1\n
chr2   7   .       G     C       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str,'chr2')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr2')
        gtdict = _genotype_concordance_dict() # leave empty for now
        cvs2 = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50)
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr3   3   .       G     A       20      PASS    .       GT      1/1\n
chr3   5   .       C     T       20      PASS    .       GT      1/1\n
"""
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr3   3   .       G     A       20      PASS    .       GT      1/1\n
chr3   4   .       T     A       20      PASS    .       GT      1/1\n
chr3   7   .       G     C       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(true_str,'chr3')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr3')
        cvs3 = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50)
        #cvs5 = ChromVariantStats(true_vars,pred_vars,[31],[49,79],[52],_genotype_concordance_dict())
        aggregator,errors = _aggregate([cvs2,cvs3])
        # test some sums
        self.assertEqual(cvs2.num_true[VARIANT_TYPE.SNP],2)
        self.assertEqual(cvs3.num_true[VARIANT_TYPE.SNP],2)
        self.assertEqual(aggregator(VARIANT_TYPE.SNP)['num_true'],4)
        self.assertEqual(cvs2.num_tp[VARIANT_TYPE.SNP],1)
        self.assertEqual(cvs3.num_tp[VARIANT_TYPE.SNP],1)
        self.assertEqual(aggregator(VARIANT_TYPE.SNP)['good_predictions'],2)
Beispiel #25
0
    def testCollidingVariants(self):
        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   5   .      A     TGC       20      PASS    .       GT      1/1\n
chr1   5   .      A      GGG       20      PASS     .      GT      1/1\n
"""
        norm_iter = normalize(get_reference(),self.getVcf(vcf_str))
        count = self.countRecords(norm_iter)
        self.assertEqual(count,1)
Beispiel #26
0
    def testCollidingVariants(self):
        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   5   .      A     TGC       20      PASS    .       GT      1/1\n
chr1   5   .      A      GGG       20      PASS     .      GT      1/1\n
"""
        norm_iter = normalize(get_reference(), self.getVcf(vcf_str))
        count = self.countRecords(norm_iter)
        self.assertEqual(count, 1)
Beispiel #27
0
    def testNormalizedToCollision(self):
        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2    4       .       C       T       20      PASS    .       GT      0/1\n
chr2    5       .       C       CGC     20      PASS    .       GT      0/1\n
chr4    2       .       A       AGG     20      PASS    .       GT      0/1\n
chr4    6       .       C       CTC     20      PASS    .       GT      0/1\n
"""
        norm_iter = normalize(get_reference(), self.getVcf(vcf_str))
        r1 = norm_iter.next()
        r2 = norm_iter.next()
        r3 = norm_iter.next()
        r4 = norm_iter.next()
        self.assertEqual(r1.POS, 4)  # chr2 SNP doesn't change
        self.assertEqual(
            r2.POS, 5
        )  # chr2 insertion gets normed forward 1 base and slid back to original pos
        self.assertEqual(r2.REF, "C")
        self.assertEqual(r2.ALT, ["CGC"])
        self.assertEqual(r3.POS, 2)
        self.assertEqual(r3.REF, "A")
        self.assertEqual(r3.ALT, ["AGG"])
        self.assertEqual(r4.POS, 3)
        self.assertEqual(r4.REF, "T")
        self.assertEqual(r4.ALT, ["TCT"])

        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4    2       .       ATC     A     20      PASS    .       GT      0/1\n
chr4    6       .       CTC     C     20      PASS    .       GT      0/1\n
"""
        norm_iter = normalize(get_reference(), self.getVcf(vcf_str))
        r1 = norm_iter.next()
        r2 = norm_iter.next()
        self.assertEqual(r1.POS, 2)
        self.assertEqual(r1.REF, "ATC")
        self.assertEqual(r1.ALT, ["A"])
        self.assertEqual(r2.POS, 5)
        self.assertEqual(r2.REF, "TCT")
        self.assertEqual(r2.ALT, ["T"])
Beispiel #28
0
    def testGenotypes(self):
        # keep genotype info for a compound heterozygous call
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       A     C,T       20      PASS    .       GT      1/2\n
"""
        vcf = self.getVcf(vcf_str)
        record = normalize(get_reference(), vcf).next()
        self.assertEqual(record.samples[0].gt_nums, "1/2")
Beispiel #29
0
    def testGenotypes(self):
        # keep genotype info for a compound heterozygous call
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       A     C,T       20      PASS    .       GT      1/2\n
"""
        vcf = self.getVcf(vcf_str)
        record = normalize(get_reference(),vcf).next()
        self.assertEqual(record.samples[0].gt_nums, "1/2")
Beispiel #30
0
    def testChromEvaluateGenotypeConcordance(self):
        true_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      0/1\n
chr1    5       .       C       T       20      PASS     .      GT      0/1\n
chr1    9       .       A       G       20      PASS     .      GT      1/1\n
        """
        pred_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      1/1\n
chr1    6       .       C       G       20      PASS     .      GT      0/1\n
chr1    9       .       A       G       20      PASS     .      GT      1/1\n
        """
        true_vars = vcf_to_ChromVariants(true_str,'chr1')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr1')
        cvs = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50)
        self.assertEqual(cvs.genotype_concordance[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HET][GENOTYPE_TYPE.HOM_VAR],1)
        self.assertEqual(cvs.genotype_concordance[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HOM_VAR][GENOTYPE_TYPE.HOM_VAR],1)
        # anything other than TP don't get counted in genotype concordance
        self.assertEqual(cvs._nrd_counts(VARIANT_TYPE.SNP),(1,2))
        # phased variants should be counted correctly
        true_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      0|1\n
chr1    9       .       A       G       20      PASS     .      GT      1|1\n
        """
        pred_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      1|0\n
chr1    9       .       A       G       20      PASS     .      GT      1|1\n
        """
        true_vars = vcf_to_ChromVariants(true_str,'chr1')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr1')
        cvs = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50)
        self.assertEqual(cvs.genotype_concordance[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HET][GENOTYPE_TYPE.HET],1)
        self.assertEqual(cvs.genotype_concordance[VARIANT_TYPE.SNP][GENOTYPE_TYPE.HOM_VAR][GENOTYPE_TYPE.HOM_VAR],1)
        self.assertEqual(cvs._nrd_counts(VARIANT_TYPE.SNP),(0,2))
Beispiel #31
0
    def testCleanOnly(self):
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2    6       .       g       cg       20      PASS    .       GT      0/1\n
"""
        norm = normalize(get_reference(), self.getVcf(vcf_str), 50, True)
        record = norm.next()
        self.assertEqual(record.POS, 6)
        self.assertEqual(record.REF, 'G')
        self.assertEqual(record.ALT, ['CG'])
Beispiel #32
0
    def testCleanOnly(self):
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2    6       .       g       cg       20      PASS    .       GT      0/1\n
"""
        norm = normalize(get_reference(),self.getVcf(vcf_str),50,True)
        record = norm.next()
        self.assertEqual(record.POS,6)
        self.assertEqual(record.REF,'G')
        self.assertEqual(record.ALT,['CG'])
Beispiel #33
0
    def testNormalizedToCollision(self):
        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2    4       .       C       T       20      PASS    .       GT      0/1\n
chr2    5       .       C       CGC     20      PASS    .       GT      0/1\n
chr4    2       .       A       AGG     20      PASS    .       GT      0/1\n
chr4    6       .       C       CTC     20      PASS    .       GT      0/1\n
"""
        norm_iter = normalize(get_reference(),self.getVcf(vcf_str))
        r1 = norm_iter.next()
        r2 = norm_iter.next()
        r3 = norm_iter.next()
        r4 = norm_iter.next()
        self.assertEqual(r1.POS,4) # chr2 SNP doesn't change
        self.assertEqual(r2.POS,5) # chr2 insertion gets normed forward 1 base and slid back to original pos
        self.assertEqual(r2.REF,"C")
        self.assertEqual(r2.ALT,["CGC"])
        self.assertEqual(r3.POS,2)
        self.assertEqual(r3.REF,"A")
        self.assertEqual(r3.ALT,["AGG"])
        self.assertEqual(r4.POS,3)
        self.assertEqual(r4.REF,"T")
        self.assertEqual(r4.ALT,["TCT"])

        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4    2       .       ATC     A     20      PASS    .       GT      0/1\n
chr4    6       .       CTC     C     20      PASS    .       GT      0/1\n
"""
        norm_iter = normalize(get_reference(),self.getVcf(vcf_str))
        r1 = norm_iter.next()
        r2 = norm_iter.next()
        self.assertEqual(r1.POS,2)
        self.assertEqual(r1.REF,"ATC")
        self.assertEqual(r1.ALT,["A"])
        self.assertEqual(r2.POS,5)
        self.assertEqual(r2.REF,"TCT")
        self.assertEqual(r2.ALT,["T"])
Beispiel #34
0
    def testMultipleAltAlleles(self):
        # multiple alleles aren't normalized if the two alt alleles would be normalized differently
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2    6       .       G       CG       20      PASS    .       GT      0/1\n
"""
        record = normalize(get_reference(), self.getVcf(vcf_str)).next()
        self.assertEqual(record.POS, 3)
        self.assertEqual(record.REF, 'G')
        self.assertEqual(record.ALT[0], 'GC')
        vcf_str2 = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2    6       .       G       CG,C       20      PASS    .       GT      0/1\n
"""
        record = normalize(get_reference(), self.getVcf(vcf_str2)).next()
        self.assertEqual(record.POS, 6)
        self.assertEqual(record.REF, 'G')
        self.assertEqual(record.ALT[0], 'CG')
Beispiel #35
0
    def testMultipleAltAlleles(self):
        # multiple alleles aren't normalized if the two alt alleles would be normalized differently
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2    6       .       G       CG       20      PASS    .       GT      0/1\n
"""
        record = normalize(get_reference(),self.getVcf(vcf_str)).next()
        self.assertEqual(record.POS,3)
        self.assertEqual(record.REF,'G')
        self.assertEqual(record.ALT[0], 'GC')
        vcf_str2 = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2    6       .       G       CG,C       20      PASS    .       GT      0/1\n
"""
        record = normalize(get_reference(),self.getVcf(vcf_str2)).next()
        self.assertEqual(record.POS,6)
        self.assertEqual(record.REF,'G')
        self.assertEqual(record.ALT[0],'CG')
Beispiel #36
0
    def testNormalizeTwoToCollision(self):
        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4    4       .       C       CTC     20      PASS    .       GT      0/1\n
chr4    6       .       C       CTC     20      PASS    .       GT      0/1\n
"""
        norm_iter = normalize(get_reference(), self.getVcf(vcf_str))
        r1 = norm_iter.next()
        r2 = norm_iter.next()
        self.assertEqual(r1.POS, 2)
        self.assertEqual(r1.REF, "A")
        self.assertEqual(r1.ALT, ["ATC"])
        self.assertEqual(r2.POS, 3)
        self.assertEqual(r2.REF, "T")
        self.assertEqual(r2.ALT, ["TCT"])
Beispiel #37
0
    def testNormalizeTwoToCollision(self):
        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4    4       .       C       CTC     20      PASS    .       GT      0/1\n
chr4    6       .       C       CTC     20      PASS    .       GT      0/1\n
"""
        norm_iter = normalize(get_reference(),self.getVcf(vcf_str))
        r1 = norm_iter.next()
        r2 = norm_iter.next()
        self.assertEqual(r1.POS,2)
        self.assertEqual(r1.REF,"A")
        self.assertEqual(r1.ALT,["ATC"])
        self.assertEqual(r2.POS,3)
        self.assertEqual(r2.REF,"T")
        self.assertEqual(r2.ALT,["TCT"])
Beispiel #38
0
    def testIndelDeletionMismatchedAllele(self):
        true_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr3   5        .       ATC     A       20      PASS    .       GT      0/1\n
        """
        pred_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr3   5        .       ATCG    A       20      PASS    .       GT      0/1
        """
        true_vars = vcf_to_ChromVariants(true_str,'chr3')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr3')
        cvs = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL],0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL],1)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL],1)
Beispiel #39
0
    def testVariantWithMismatchedRef(self):
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   2   .       TGC     TAT       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     C       20      PASS    .       GT      1/1\n
chr2   4   .       C     T       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str, 'chr2')
        fp_vars = vcf_to_ChromVariants(fp_str, 'chr2')
        rescuer = SequenceRescuer('chr2', 2, fn_vars, fp_vars,
                                  get_empty_ChromVariants('chr2'),
                                  get_reference(), 50)
        self.assertFalse(rescuer.rescued)
Beispiel #40
0
    def testNormalizedVariants(self):
        fp_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4    4       .       C       CTC     20      PASS    .       GT      0/1\n
chr4    6       .       C       CTC     20      PASS    .       GT      0/1\n
"""
        fn_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4    2       .       A       ATCTC     20      PASS    .       GT      0/1\n
"""
        fp_vars = normalize_vcf_to_ChromVariants(fp_str, 'chr4')
        fn_vars = vcf_to_ChromVariants(fn_str, 'chr4')
        rescuer = SequenceRescuer('chr4', 2, fn_vars, fp_vars,
                                  get_empty_ChromVariants('chr4'),
                                  get_reference(), 50)
        self.assertTrue(rescuer.rescued)
Beispiel #41
0
    def testIndelDeletionMismatchedAllele(self):
        true_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr3   5        .       ATC     A       20      PASS    .       GT      0/1\n
        """
        pred_str = """##fileformat=VCFv4.0\n
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr3   5        .       ATCG    A       20      PASS    .       GT      0/1
        """
        true_vars = vcf_to_ChromVariants(true_str, 'chr3')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr3')
        cvs = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                      get_reference(), 50)
        self.assertEqual(cvs.num_tp[VARIANT_TYPE.INDEL_DEL], 0)
        self.assertEqual(cvs.num_fn[VARIANT_TYPE.INDEL_DEL], 1)
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.INDEL_DEL], 1)
Beispiel #42
0
    def testEmptyWindow(self):
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   8000   .       G     C       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   10049   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str, 'chr1')
        fp_vars = vcf_to_ChromVariants(fp_str, 'chr1')
        rescuer = SequenceRescuer('chr1', 10049, fn_vars, fp_vars,
                                  get_empty_ChromVariants('chr2'),
                                  get_reference(), 50)
        self.assertFalse(rescuer.rescued)
Beispiel #43
0
    def test_approx_sv(self):
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr19   88013   .       CTT     C       20      PASS    .       GT      0/1\n
chr19   89272   .       C       T       20      PASS    .       GT      0/1\n
chr19   269771  .       A       AAAAGAAAGGCATGACCTATCCACCCATGCCACCTGGATGGACCTCACAGGCACACTGCTTCATGAGAGAG 20      PASS    .       GT      1/1
"""

        pred_io = StringIO.StringIO(pred_str)
        pred_vcf = vcf.Reader(pred_io)

        stat_reporter = evaluate_low_memory(self.true_vars, pred_vcf, sv_eps, sv_eps,get_reference(), 50, 50, {'chr19':0,None:100})

        self.truePositive(stat_reporter,VARIANT_TYPE.SNP)
        self.trueNegative(stat_reporter,VARIANT_TYPE.INDEL_INS)
        self.truePositive(stat_reporter,VARIANT_TYPE.INDEL_DEL)
        self.truePositive(stat_reporter,VARIANT_TYPE.SV_INS)
        self.trueNegative(stat_reporter,VARIANT_TYPE.SV_DEL)
Beispiel #44
0
    def testNormalizeThreeCollision(self):
        # the OP info flag is fake to force vars to right-slide
        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4    2       .       A       ATCTT     20      PASS    OP=1       GT      0/1\n
chr4    2       .       A       T     20      PASS    .       GT      0/1\n
chr4    2       .       ATCTC    T        20      PASS     OP=2        GT     0/1\n
"""
        norm_iter = normalize(get_reference(),self.getVcf(vcf_str))
        r1 = norm_iter.next()
        r2 = norm_iter.next()
        r3 = norm_iter.next()
        r1,r2,r3 = sorted([r1,r2,r3],key=lambda x: x.POS) # order of vars from same pos not guaranteed
        self.assertEqual(r1.POS,2)
        self.assertEqual(r2.POS,3)
        self.assertEqual(r2.REF,"T")
        self.assertEqual(r2.ALT,["TCTTT"])
        self.assertEqual(r3.POS,4)
        self.assertEqual(r3.REF,"CTCTC")
        self.assertEqual(r3.ALT,["C"])
Beispiel #45
0
    def test_known_false_positives(self):
        true_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    1       .       T       A       20      PASS    .       GT       0/1\n
chr1    8       .       A       C       20      PASS    .       GT       1/1\n
"""
        pred_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    3       .       G       C       20      PASS     .      GT      1/1\n
chr1    5       .       C       G       20      PASS     .      GT      0/1\n
chr1    8       .       A       C       20      PASS     .      GT      1/1\n
"""
        known_fp_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    3       .       G       .       20      PASS    .       GT      0/0\n
chr1    5       .       C       G       20      PASS    .       GT      0/0\n
chr1    9       .       T       .       20      PASS    .       GT      0/0\n
"""

        known_fp_io = StringIO.StringIO(known_fp_vcf)
        known_fp_vars = Variants(vcf.Reader(known_fp_io),MAX_INDEL_LEN,knownFP=True)

        stat_reporter, vcf_output = evaluate_variants(vcf_to_Variants(true_vcf),vcf_to_Variants(pred_vcf),sv_eps,sv_eps, \
            get_reference(),50,known_fp_vars)

        snp_stats = stat_reporter(VARIANT_TYPE.SNP)

        self.assertEqual(snp_stats['num_true'],2)
        self.assertEqual(snp_stats['num_pred'],3)
        self.assertEqual(snp_stats['good_predictions'],1)
        self.assertEqual(snp_stats['false_positives'],2) # predicted vars not in ground truth
        self.assertEqual(snp_stats['false_negatives'],1)
        self.assertEqual(snp_stats['known_fp_calls'],2)
        self.assertEqual(snp_stats['known_fp'],2)
Beispiel #46
0
    def testWindowTooBig(self):
        longsv1 = 'ATTGTTCATGA' * 300
        longsv2 = 'GCCTAGGGTCA' * 300
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   7001   .       """ + longsv1 + """     A       20      PASS    .       GT      1/1\n
chr1   10100   .       """ + longsv2 + """     G       20      PASS    .       GT      0/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   10049   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str, 'chr1')
        fp_vars = vcf_to_ChromVariants(fp_str, 'chr1')
        rescuer = SequenceRescuer('chr1', 10049, fn_vars, fp_vars,
                                  get_empty_ChromVariants('chr2'),
                                  get_reference(), 50)
        self.assertFalse(rescuer.rescued)
Beispiel #47
0
    def testOverlappingVariants(self):
        # if vcf contains overlapping variants, don't rescue that sequence
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   1        .       T       G       20      PASS    .       GT      1/1\n
"""
        tp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .    GCC     G       20      PASS    .       GT      1/1\n
chr2   4   .    C       G       20      PASS    .       GT      0/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   7   .       GA     A       20      PASS    .       GT      1/1\n
"""
        tp_vars = vcf_to_ChromVariants(tp_str,'chr2')
        fn_vars = vcf_to_ChromVariants(fn_str,'chr2')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr2')
        rescuer = SequenceRescuer('chr2',1,fn_vars,fp_vars,tp_vars,get_reference(),50)
        self.assertFalse(rescuer.rescued)
Beispiel #48
0
    def testOverlappingVariants(self):
        # if vcf contains overlapping variants, don't rescue that sequence
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   1        .       T       G       20      PASS    .       GT      1/1\n
"""
        tp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .    GCC     G       20      PASS    .       GT      1/1\n
chr2   4   .    C       G       20      PASS    .       GT      0/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   7   .       GA     A       20      PASS    .       GT      1/1\n
"""
        tp_vars = vcf_to_ChromVariants(tp_str, 'chr2')
        fn_vars = vcf_to_ChromVariants(fn_str, 'chr2')
        fp_vars = vcf_to_ChromVariants(fp_str, 'chr2')
        rescuer = SequenceRescuer('chr2', 1, fn_vars, fp_vars, tp_vars,
                                  get_reference(), 50)
        self.assertFalse(rescuer.rescued)
Beispiel #49
0
    def testNormalize(self):
        #regular records are unchanged
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       C     A       20      PASS    .       GT      0/1\n
"""
        norm_vcf = normalize(get_reference(),self.getVcf(vcf_str))
        self.assertEqual(self.countRecords(norm_vcf),1)

        #test that hom ref records are removed
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       C     C       20      PASS    .       GT      0/0\n
chr1   3   .       G     A       20      PASS    .       GT      1/1\n
"""
        norm_vcf = normalize(get_reference(),self.getVcf(vcf_str))
        self.assertEqual(self.countRecords(norm_vcf),1)

        #test that SNP/indels without genotyping are removed
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       C     A       20      PASS    .       GT      .\n
chr1   3   .       G     C       20      PASS    .       GT      0/0\n
chr1   4   .       G     T       20      PASS    .       GT      0|0\n
chr1   5   .       G     A       20      PASS    .       GT      1/1\n
"""
        norm_vcf = normalize(get_reference(),self.getVcf(vcf_str))
        self.assertEqual(self.countRecords(norm_vcf),1)

        #test that SV without genotyping is retained
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       C     AAAAGAAAGGCATGACCTATCCACCCATGCCACCTGGATGGACCTCACAGGCACACTGCTTCATGAGAGAG       20      PASS    .       GT      .\n
"""
        norm_vcf = normalize(get_reference(),self.getVcf(vcf_str))
        self.assertEqual(self.countRecords(norm_vcf),1)

        #test that lower case ref/alt gets upper-cased
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       c     a       20      PASS    .       GT      0/1\n
"""
        vcf_io = StringIO.StringIO(vcf_str)
        lowercase_vcf = vcf.Reader(StringIO.StringIO(vcf_str))
        output_vcf = normalize(get_reference(),self.getVcf(vcf_str))
        original_r = lowercase_vcf.next()
        norm_r = output_vcf.next()
        self.assertEqual(original_r.REF,'c')
        self.assertEqual(original_r.ALT[0], 'a')
        self.assertEqual(norm_r.REF,'C')
        self.assertEqual(norm_r.ALT[0],'A')

        # test normalizing an insertion
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   9   .       a     ga       20      PASS    .       GT      0/1\n
"""
        record = normalize(get_reference(),self.getVcf(vcf_str)).next()
        self.assertEqual(record.POS,6)
        self.assertEqual(record.REF,'C')
        self.assertEqual(record.ALT,['CG'])

        # test normalizing a deletion
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   5   .       cc     c       20      PASS    .       GT      0/1\n
"""
        record = normalize(get_reference(),self.getVcf(vcf_str)).next()
        self.assertEqual(record.POS,4)
        self.assertEqual(record.REF,'GC')
        self.assertEqual(record.ALT,['G'])
Beispiel #50
0
    def testRescueMission(self):
        # false negative variant at location is SV; don't rescue
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   8000   .       G     GATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCT       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   8000   .       G     GC       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(fn_str, 'chr1')
        pred_vars = vcf_to_ChromVariants(fp_str, 'chr1')
        num_new_tp, num_removed_fn = rescue_mission(
            true_vars, pred_vars, get_empty_ChromVariants('chr1'), 8000,
            get_reference(), 100)
        self.assertFalse(any(map(lambda x: x > 0, num_new_tp.itervalues())))
        self.assertFalse(any(map(lambda x: x > 0,
                                 num_removed_fn.itervalues())))
        # variant couldn't be rescued; no change to counts or ChromVariants
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       A     C       20      PASS    .       GT      1/1\n
chr1   7   .       C        T       20      PASS    .       GT      0/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   4   .       A     C       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str, 'chr1')
        fp_vars = vcf_to_ChromVariants(fp_str, 'chr1')
        num_new_tp, num_removed_fn = rescue_mission(
            fn_vars, fp_vars, get_empty_ChromVariants('chr1'), 2,
            get_reference(), 100)
        self.assertFalse(any(map(lambda x: x > 0, num_new_tp.itervalues())))
        self.assertFalse(any(map(lambda x: x > 0,
                                 num_removed_fn.itervalues())))
        self.assertEqual(len(fn_vars.all_locations), 2)
        self.assertEqual(len(fp_vars.all_locations), 1)
        # variant is rescued; counts change; variants are removed from fn/fp
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   2   .       TGC     TAT       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     A       20      PASS    .       GT      1/1\n
chr2   4   .       C     T       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str, 'chr2')
        fp_vars = vcf_to_ChromVariants(fp_str, 'chr2')
        num_new_tp, num_removed_fn = rescue_mission(
            fn_vars, fp_vars, get_empty_ChromVariants('chr2'), 2,
            get_reference(), 100)
        self.assertEqual(num_new_tp[VARIANT_TYPE.INDEL_OTH], 1)
        self.assertEqual(num_removed_fn[VARIANT_TYPE.SNP], 2)
        self.assertEqual(len(fn_vars.all_locations), 0)
        self.assertEqual(len(fp_vars.all_locations), 0)
Beispiel #51
0
    def testRescueMission(self):
        # false negative variant at location is SV; don't rescue
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   8000   .       G     GATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCT       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   8000   .       G     GC       20      PASS    .       GT      1/1\n
"""
        true_vars = vcf_to_ChromVariants(fn_str,'chr1')
        pred_vars = vcf_to_ChromVariants(fp_str,'chr1')
        num_new_tp,num_removed_fn,rescuedvars = rescue_mission(true_vars,pred_vars,get_empty_ChromVariants('chr1'),8000,get_reference(),100)
        self.assertFalse(any(map(lambda x: x > 0, num_new_tp.itervalues())))
        self.assertFalse(any(map(lambda x: x > 0, num_removed_fn.itervalues())))
        # variant couldn't be rescued; no change to counts or ChromVariants
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       A     C       20      PASS    .       GT      1/1\n
chr1   7   .       C        T       20      PASS    .       GT      0/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   4   .       A     C       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str,'chr1')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr1')
        num_new_tp,num_removed_fn,rescuedvars = rescue_mission(fn_vars,fp_vars,get_empty_ChromVariants('chr1'),2,get_reference(),100)
        self.assertFalse(any(map(lambda x: x > 0, num_new_tp.itervalues())))
        self.assertFalse(any(map(lambda x: x > 0, num_removed_fn.itervalues())))
        self.assertEqual(len(fn_vars.all_locations),2)
        self.assertEqual(len(fp_vars.all_locations),1)
        self.assertEqual(rescuedvars,[])
        # variant is rescued; counts change; variants are removed from fn/fp
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   2   .       TGC     TAT       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     A       20      PASS    .       GT      1/1\n
chr2   4   .       C     T       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str,'chr2')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr2')
        num_new_tp,num_removed_fn,rescuedvars = rescue_mission(fn_vars,fp_vars,get_empty_ChromVariants('chr2'),2,get_reference(),100)
        self.assertEqual(num_new_tp[VARIANT_TYPE.INDEL_OTH],1)
        self.assertEqual(num_removed_fn[VARIANT_TYPE.SNP],2)
        self.assertEqual(len(fn_vars.all_locations),0)
        self.assertEqual(len(fp_vars.all_locations),0)
        self.assertEqual(map(lambda r: r.pos,rescuedvars),[3,4])
Beispiel #52
0
    def testTooManyPaths(self):
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   10049   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
chr1   10053   .       TGCGT        T       20      PASS    .       GT      0/1\n
chr1   10055   .       GCTAA        G       20      PASS    .       GT      0/1\n
chr1   10057   .       TA           T       20      PASS    .       GT      1/1\n
chr1   10058   .       GC           G       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   10025   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
chr1   10028   .       TGCGT        T       20      PASS    .       GT      0/1\n
chr1   10029   .       GCTAA        G       20      PASS    .       GT      0/1\n
chr1   10032   .       TA           T       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str,'chr1')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr1')
        rescuer = SequenceRescuer('chr1',10000,fn_vars,fp_vars,get_empty_ChromVariants('chr2'),get_reference(),50)
        self.assertFalse(rescuer.rescued)
Beispiel #53
0
    def testEmptyWindow(self):
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   8000   .       G     C       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   10049   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str,'chr1')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr1')
        rescuer = SequenceRescuer('chr1',10049,fn_vars,fp_vars,get_empty_ChromVariants('chr2'),get_reference(),50)
        self.assertFalse(rescuer.rescued)
Beispiel #54
0
    def testWindowTooBig(self):
        longsv1 = 'ATTGTTCATGA'*300
        longsv2 = 'GCCTAGGGTCA'*300
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   7001   .       """ + longsv1 + """     A       20      PASS    .       GT      1/1\n
chr1   10100   .       """ + longsv2 + """     G       20      PASS    .       GT      0/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   10049   .       CTTAAGCT     C       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str,'chr1')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr1')
        rescuer = SequenceRescuer('chr1',10049,fn_vars,fp_vars,get_empty_ChromVariants('chr2'),get_reference(),50)
        self.assertFalse(rescuer.rescued)
Beispiel #55
0
    def testVariantWithMismatchedRef(self):
        fn_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   2   .       TGC     TAT       20      PASS    .       GT      1/1\n
"""
        fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2   3   .       G     C       20      PASS    .       GT      1/1\n
chr2   4   .       C     T       20      PASS    .       GT      1/1\n
"""
        fn_vars = vcf_to_ChromVariants(fn_str,'chr2')
        fp_vars = vcf_to_ChromVariants(fp_str,'chr2')
        rescuer = SequenceRescuer('chr2',2,fn_vars,fp_vars,get_empty_ChromVariants('chr2'),get_reference(),50)
        self.assertFalse(rescuer.rescued)