Ejemplo n.º 1
0
    def testChromEvaluateVariantsKnownFP(self):
        # one known true variant
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      0/1\n
        """
        # call var where known fp is, where true var is, where nothing is known
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS    .       GT      0/1\n
chr1    4       .       G       C       20      PASS    .       GT      1/1\n
chr1    7       .       G       A       20      PASS    .       GT      0/1\n
        """
        # known locations with NO variant
        known_fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    1       .       A       T       20      PASS    .       GT       ./.\n
chr1    7       .       G       .       20      PASS    .       GT       0/0\n
        """
        true_vars = vcf_to_ChromVariants(true_str,'chr1')
        pred_vars = vcf_to_ChromVariants(pred_str,'chr1')
        known_fp_io = StringIO.StringIO(known_fp_str)
        known_fp = Variants(vcf.Reader(known_fp_io),MAX_INDEL_LEN,knownFP=True)
        cvs = chrom_evaluate_variants(true_vars,pred_vars,100,100,get_reference(),50,known_fp.on_chrom('chr1'))
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP],2) # usual definition, in pred vars but not in true
        self.assertEqual(cvs.calls_at_known_fp[VARIANT_TYPE.SNP],1) # call at location known to NOT have SNP
Ejemplo n.º 2
0
    def testGetRestOfPaths(self):
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr19   11   .       ACT     A       20      PASS    .       GT      1/1\n
chr19   15   .       ACGATT      AA       20       PASS    .      GT      1/1\n
chr19   16   .       ACG      A        20      PASS    .       GT      1/1\n
chr19   22   .       ATT       A         20      PASS    .       GT      0/1\n
"""
        pred_io = StringIO.StringIO(pred_str)
        pred_vcf = vcf.Reader(pred_io)
        pred_vars = Variants(pred_vcf,MAX_INDEL_LEN)
        viw = extract_range_and_filter(pred_vars.on_chrom('chr19'),10,25,11)

        paths = _getRestOfPath([], _getOverlaps([],viw))
        #all paths take variants at pos 11 and 22; one takes pos 15, one pos 16
        self.assertEqual(len(paths),2)
        self.assertEqual(len(paths[0]),3)
        self.assertEqual(len(paths[1]),3)
        self.assertTrue(all(map(lambda e: any(map(lambda x: x.pos == 11, e)), paths)))
        self.assertTrue(all(map(lambda e: any(map(lambda x: x.pos == 22, e)), paths)))
        self.assertTrue(any(map(lambda x: x.pos == 15, paths[0])))
        self.assertFalse(any(map(lambda x: x.pos == 16, paths[0])))
        self.assertFalse(any(map(lambda x: x.pos == 15, paths[1])))
        self.assertTrue(any(map(lambda x: x.pos == 16, paths[1])))
Ejemplo n.º 3
0
    def testChromEvaluateVariantsKnownFP(self):
        # one known true variant
        true_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS     .      GT      0/1\n
        """
        # call var where known fp is, where true var is, where nothing is known
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    2       .       A       T       20      PASS    .       GT      0/1\n
chr1    4       .       G       C       20      PASS    .       GT      1/1\n
chr1    7       .       G       A       20      PASS    .       GT      0/1\n
        """
        # known locations with NO variant
        known_fp_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1    1       .       A       T       20      PASS    .       GT       ./.\n
chr1    7       .       G       .       20      PASS    .       GT       0/0\n
        """
        true_vars = vcf_to_ChromVariants(true_str, 'chr1')
        pred_vars = vcf_to_ChromVariants(pred_str, 'chr1')
        known_fp_io = StringIO.StringIO(known_fp_str)
        known_fp = Variants(vcf.Reader(known_fp_io),
                            MAX_INDEL_LEN,
                            knownFP=True)
        cvs = chrom_evaluate_variants(true_vars, pred_vars, 100, 100,
                                      get_reference(), 50,
                                      known_fp.on_chrom('chr1'))
        self.assertEqual(cvs.num_fp[VARIANT_TYPE.SNP],
                         2)  # usual definition, in pred vars but not in true
        self.assertEqual(cvs.calls_at_known_fp[VARIANT_TYPE.SNP],
                         1)  # call at location known to NOT have SNP
Ejemplo n.º 4
0
    def testGetOverlaps(self):
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr19   10   .       ACT     A       20      PASS    .       GT      1/1\n
chr19   13   .       AC      A       20       PASS    .      GT      1/1\n
chr19   14   .       TAGG      TA        20      PASS    .       GT      1/1\n
chr19   15   .       AGG       A         20      PASS    .       GT      0/1\n
chr19   19  .       T       TAAAC 20      PASS    .       GT      0/1
"""
        pred_io = StringIO.StringIO(pred_str)
        pred_vcf = vcf.Reader(pred_io)
        pred_vars = Variants(pred_vcf,MAX_INDEL_LEN)
        variants_in_window = extract_range_and_filter(pred_vars.on_chrom('chr19'),10,20,10)

        #the three overlapping variants should be in same group
        overlaps = _getOverlaps([], variants_in_window)
        self.assertEqual(len(overlaps),3)
        self.assertEqual(map(lambda o: len(o),overlaps),[1,3,1])
Ejemplo n.º 5
0
    def testExtractRangeAndFilter(self):
        pred_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr19   2    .       T     G       20      PASS    .       GT      0/1\n
chr19   10   .       A     C       20      PASS    .       GT      1/1\n
chr19   13   .       A       ACT       20      PASS    .       GT      1/1\n
chr19   15   .       A       T         20      PASS    .       GT      0/1\n
chr19   18  .       AAAAAGAAAGGCATGACCTATCCACCCATGCCACCTGGATGGACCTCACAGGCACACTGCTTCATGAGAGAG       A 20      PASS    .       GT      0/1
"""
        pred_io = StringIO.StringIO(pred_str)
        pred_vcf = vcf.Reader(pred_io)
        pred_vars = Variants(pred_vcf, MAX_INDEL_LEN)

        variants_in_window = extract_range_and_filter(pred_vars.on_chrom('chr19'),10,20,13)
        self.assertEqual(len(variants_in_window),3)
        #SV is removed
        self.assertFalse(any(map(lambda v: v.var_type.startswith("SV"), variants_in_window)))
        #variant overlapping with variant at location of interest is removed
        self.assertFalse(any(map(lambda v: v.pos == 2,variants_in_window)))
Ejemplo n.º 6
0
def normalize_vcf_to_ChromVariants(vcf_str, chrom):
    """Parse VCF text, normalize it, and return the variants on one chromosome.

    The text is wrapped in an in-memory file object, read with vcf.Reader,
    passed through normalize() together with get_reference(), and loaded
    into a Variants container built with MAX_INDEL_LEN.

    Args:
        vcf_str: complete VCF content as a string.
        chrom: chromosome name handed to Variants.on_chrom.

    Returns:
        Whatever Variants.on_chrom(chrom) returns for the normalized records.
    """
    reader = vcf.Reader(StringIO.StringIO(vcf_str))
    normalized_records = normalize(get_reference(), reader)
    return Variants(normalized_records, MAX_INDEL_LEN).on_chrom(chrom)
Ejemplo n.º 7
0
def vcf_to_ChromVariants(vcf_str, chrom):
    """Parse VCF text and return the variants on a single chromosome.

    Args:
        vcf_str: complete VCF content as a string.
        chrom: chromosome name handed to Variants.on_chrom.

    Returns:
        The result of Variants.on_chrom(chrom) for a Variants container
        built from the parsed records with MAX_INDEL_LEN.
    """
    reader = vcf.Reader(StringIO.StringIO(vcf_str))
    return Variants(reader, MAX_INDEL_LEN).on_chrom(chrom)
Ejemplo n.º 8
0
def vcf_to_ChromVariants(vcf_str, chrom):
    """Build a Variants container from VCF text and select one chromosome.

    Args:
        vcf_str: complete VCF content as a string.
        chrom: chromosome name handed to Variants.on_chrom.

    Returns:
        The result of calling on_chrom(chrom) on the Variants container.
    """
    # vcf.Reader consumes a file-like object, so wrap the string in memory.
    in_memory_vcf = StringIO.StringIO(vcf_str)
    all_variants = Variants(vcf.Reader(in_memory_vcf), MAX_INDEL_LEN)
    return all_variants.on_chrom(chrom)
Ejemplo n.º 9
0
def normalize_vcf_to_ChromVariants(vcf_str, chrom):
    """Normalize the records in VCF text and select one chromosome.

    Args:
        vcf_str: complete VCF content as a string.
        chrom: chromosome name handed to Variants.on_chrom.

    Returns:
        The result of on_chrom(chrom) on a Variants container built from
        the output of normalize(get_reference(), <reader>) and MAX_INDEL_LEN.
    """
    # vcf.Reader consumes a file-like object, so wrap the string in memory.
    raw_reader = vcf.Reader(StringIO.StringIO(vcf_str))
    normalized = Variants(normalize(get_reference(), raw_reader),
                          MAX_INDEL_LEN)
    return normalized.on_chrom(chrom)
Ejemplo n.º 10
0
def vcf_to_Variants(vcf_str):
    """Parse VCF text into a Variants container.

    Args:
        vcf_str: complete VCF content as a string.

    Returns:
        A Variants instance built from the parsed records and MAX_INDEL_LEN
        (all chromosomes; no on_chrom selection is applied here).
    """
    reader = vcf.Reader(StringIO.StringIO(vcf_str))
    return Variants(reader, MAX_INDEL_LEN)