Esempio n. 1
0
    def test_sv_out_of_range(self):
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr19   88013   .       CTT     C       20      PASS    .       GT      0/1\n
chr19   89272   .       C       T       20      PASS    .       GT      0/1\n
chr19   269852  .       A       AAAAGAAAGGCATGACCTATCCACCCATGCCACCTGGATGGACCTCACAGGCACACTGCTTCATGAGAGAG 20      PASS    .       GT      1/1
"""

        pred_io = StringIO.StringIO(pred_str)
        pred_vcf = vcf.Reader(pred_io)
        pred_vars = Variants(pred_vcf, MAX_INDEL_LEN)

        stat_reporter, errors = evaluate_variants(self.true_vars, pred_vars,
                                                  sv_eps, sv_eps, None, None,
                                                  None)

        self.truePositive(stat_reporter, VARIANT_TYPE.SNP)
        self.trueNegative(stat_reporter, VARIANT_TYPE.INDEL_INS)
        self.truePositive(stat_reporter, VARIANT_TYPE.INDEL_DEL)

        sv_ins_stats = stat_reporter(VARIANT_TYPE.SV_INS)

        self.assertEqual(sv_ins_stats['num_true'], 1)
        self.assertEqual(sv_ins_stats['num_pred'], 1)
        self.assertEqual(sv_ins_stats['good_predictions'], 0)
        self.assertEqual(sv_ins_stats['intersect_bad'], 0)
        self.assertEqual(sv_ins_stats['false_negatives'], 1)
        self.assertEqual(sv_ins_stats['nrd_total'], 0)
        self.assertEqual(sv_ins_stats['nrd_wrong'], 0)

        self.trueNegative(stat_reporter, VARIANT_TYPE.SV_DEL)
Esempio n. 2
0
    def test_bad_sv_ins(self):
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr19   88013   .       CTT     C       20      PASS    .       GT      0/1\n
chr19   89272   .       C       T       20      PASS    .       GT      0/1\n
chr19   269751  .       A       AAAAGAAAGGCATGACCTATCCTTTTATGCCACCTGGATGGACCTCACAGGCACACTGCTTCATGAGAGAG 20      PASS    .       GT      1/1
"""

        pred_io = StringIO.StringIO(pred_str)
        pred_vcf = vcf.Reader(pred_io)
        pred_vars = Variants(pred_vcf, MAX_INDEL_LEN)

        sv_eps = 100

        stat_reporter, errors = evaluate_variants(self.true_vars, pred_vars,
                                                  sv_eps, sv_eps, None, None,
                                                  None)

        self.truePositive(stat_reporter, VARIANT_TYPE.SNP)
        self.trueNegative(stat_reporter, VARIANT_TYPE.INDEL_INS)
        self.truePositive(stat_reporter, VARIANT_TYPE.INDEL_DEL)
        self.badCallAtTrueSite(stat_reporter, VARIANT_TYPE.SV_INS)
        self.trueNegative(stat_reporter, VARIANT_TYPE.SV_DEL)
Esempio n. 3
0
    def setUp(self):
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr19   88013   .       CTT     C       20      PASS    .       GT      0/1\n
chr19   89272   .       C       T       20      PASS    .       GT      0/1\n
chr19   269751  .       A       AAAAGAAAGGCATGACCTATCCACCCATGCCACCTGGATGGACCTCACAGGCACACTGCTTCATGAGAGAG 20      PASS    .       GT      1/1
"""
        true_io = StringIO.StringIO(true_str)
        true_vcf = vcf.Reader(true_io)
        self.true_vars = Variants(true_vcf, MAX_INDEL_LEN)
Esempio n. 4
0
    def test_known_false_positives(self):
        true_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    1       .       T       A       20      PASS    .       GT       0/1\n
chr1    8       .       A       C       20      PASS    .       GT       1/1\n
"""
        pred_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    3       .       G       C       20      PASS     .      GT      1/1\n
chr1    5       .       C       G       20      PASS     .      GT      0/1\n
chr1    8       .       A       C       20      PASS     .      GT      1/1\n
"""
        known_fp_vcf = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    3       .       G       .       20      PASS    .       GT      0/0\n
chr1    5       .       C       G       20      PASS    .       GT      0/0\n
chr1    9       .       T       .       20      PASS    .       GT      0/0\n
"""

        known_fp_io = StringIO.StringIO(known_fp_vcf)
        known_fp_vars = Variants(vcf.Reader(known_fp_io),MAX_INDEL_LEN,knownFP=True)

        stat_reporter, vcf_output = evaluate_variants(vcf_to_Variants(true_vcf),vcf_to_Variants(pred_vcf),sv_eps,sv_eps, \
            get_reference(),50,known_fp_vars)

        snp_stats = stat_reporter(VARIANT_TYPE.SNP)

        self.assertEqual(snp_stats['num_true'],2)
        self.assertEqual(snp_stats['num_pred'],3)
        self.assertEqual(snp_stats['good_predictions'],1)
        self.assertEqual(snp_stats['false_positives'],2) # predicted vars not in ground truth
        self.assertEqual(snp_stats['false_negatives'],1)
        self.assertEqual(snp_stats['known_fp_calls'],2)
        self.assertEqual(snp_stats['known_fp'],2)