def testChromEvaluateVariantsKnownFP(self):
    # Evaluates predicted SNP calls against a truth set plus a list of known
    # false-positive sites, then checks both the ordinary false-positive count
    # and the count of calls made at known-false-positive sites.

    # Truth set: one known true variant (pos 2).
    truth_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . A T 20 PASS . GT 0/1\n """
    # Calls: one where the true variant is (pos 2), one where nothing is
    # known (pos 4), and one where a known false positive is (pos 7).
    calls_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . A T 20 PASS . GT 0/1\n chr1 4 . G C 20 PASS . GT 1/1\n chr1 7 . G A 20 PASS . GT 0/1\n """
    # Known locations with NO variant.
    known_fp_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 1 . A T 20 PASS . GT ./.\n chr1 7 . G . 20 PASS . GT 0/0\n """

    truth = vcf_to_ChromVariants(truth_vcf, 'chr1')
    calls = vcf_to_ChromVariants(calls_vcf, 'chr1')
    known_fp_reader = vcf.Reader(StringIO.StringIO(known_fp_vcf))
    known_fp_sites = Variants(known_fp_reader, MAX_INDEL_LEN, knownFP=True)

    stats = chrom_evaluate_variants(
        truth, calls, 100, 100, get_reference(), 50,
        known_fp_sites.on_chrom('chr1'))

    # Usual definition: present in the predicted variants but not in the true ones.
    self.assertEqual(stats.num_fp[VARIANT_TYPE.SNP], 2)
    # Call made at a location known to NOT have a SNP.
    self.assertEqual(stats.calls_at_known_fp[VARIANT_TYPE.SNP], 1)
def testGetRestOfPaths(self):
    # Builds overlap groups from four chr19 indel calls and checks the paths
    # that _getRestOfPath produces from them.
    calls_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr19 11 . ACT A 20 PASS . GT 1/1\n chr19 15 . ACGATT AA 20 PASS . GT 1/1\n chr19 16 . ACG A 20 PASS . GT 1/1\n chr19 22 . ATT A 20 PASS . GT 0/1\n """
    calls = Variants(vcf.Reader(StringIO.StringIO(calls_vcf)), MAX_INDEL_LEN)
    window = extract_range_and_filter(calls.on_chrom('chr19'), 10, 25, 11)
    paths = _getRestOfPath([], _getOverlaps([], window))

    # Every path takes the variants at pos 11 and 22; one path takes pos 15,
    # the other takes pos 16.
    self.assertEqual(len(paths), 2)
    first, second = paths[0], paths[1]
    self.assertEqual(len(first), 3)
    self.assertEqual(len(second), 3)
    self.assertTrue(all(any(v.pos == 11 for v in path) for path in paths))
    self.assertTrue(all(any(v.pos == 22 for v in path) for path in paths))
    self.assertTrue(any(v.pos == 15 for v in first))
    self.assertFalse(any(v.pos == 16 for v in first))
    self.assertFalse(any(v.pos == 15 for v in second))
    self.assertTrue(any(v.pos == 16 for v in second))
def testChromEvaluateVariantsKnownFP(self):
    # Runs chrom_evaluate_variants with a known-false-positive set supplied and
    # verifies the SNP false-positive tally and the tally of calls that land
    # on known-false-positive sites.

    # One known true variant.
    expected_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . A T 20 PASS . GT 0/1\n """
    # Call a variant where the known FP is, where the true variant is, and
    # where nothing is known.
    called_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . A T 20 PASS . GT 0/1\n chr1 4 . G C 20 PASS . GT 1/1\n chr1 7 . G A 20 PASS . GT 0/1\n """
    # Known locations with NO variant.
    no_variant_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 1 . A T 20 PASS . GT ./.\n chr1 7 . G . 20 PASS . GT 0/0\n """

    expected = vcf_to_ChromVariants(expected_vcf, 'chr1')
    called = vcf_to_ChromVariants(called_vcf, 'chr1')
    no_variant_sites = Variants(
        vcf.Reader(StringIO.StringIO(no_variant_vcf)),
        MAX_INDEL_LEN,
        knownFP=True)

    result = chrom_evaluate_variants(
        expected, called, 100, 100, get_reference(), 50,
        no_variant_sites.on_chrom('chr1'))

    # Usual definition: in the predicted variants but not in the true ones.
    self.assertEqual(result.num_fp[VARIANT_TYPE.SNP], 2)
    # Call at a location known to NOT have a SNP.
    self.assertEqual(result.calls_at_known_fp[VARIANT_TYPE.SNP], 1)
def testGetOverlaps(self):
    # Feeds five chr19 indel calls through extract_range_and_filter and
    # _getOverlaps, then checks how many groups come back and how large each
    # group is.
    pred_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr19 10 . ACT A 20 PASS . GT 1/1\n chr19 13 . AC A 20 PASS . GT 1/1\n chr19 14 . TAGG TA 20 PASS . GT 1/1\n chr19 15 . AGG A 20 PASS . GT 0/1\n chr19 19 . T TAAAC 20 PASS . GT 0/1 """
    pred_io = StringIO.StringIO(pred_str)
    pred_vcf = vcf.Reader(pred_io)
    pred_vars = Variants(pred_vcf, MAX_INDEL_LEN)
    variants_in_window = extract_range_and_filter(
        pred_vars.on_chrom('chr19'), 10, 20, 10)

    # The three overlapping variants should be in the same group.
    overlaps = _getOverlaps([], variants_in_window)
    self.assertEqual(len(overlaps), 3)
    # Compare against a real list of group sizes. A bare map(...) is a list
    # only on Python 2; on Python 3 it is an iterator and never equals a list.
    self.assertEqual([len(group) for group in overlaps], [1, 3, 1])
def testExtractRangeAndFilter(self):
    # Calls extract_range_and_filter on chr19 calls with arguments 10, 20, 13
    # and checks which records are returned.
    calls_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr19 2 . T G 20 PASS . GT 0/1\n chr19 10 . A C 20 PASS . GT 1/1\n chr19 13 . A ACT 20 PASS . GT 1/1\n chr19 15 . A T 20 PASS . GT 0/1\n chr19 18 . AAAAAGAAAGGCATGACCTATCCACCCATGCCACCTGGATGGACCTCACAGGCACACTGCTTCATGAGAGAG A 20 PASS . GT 0/1 """
    reader = vcf.Reader(StringIO.StringIO(calls_vcf))
    calls = Variants(reader, MAX_INDEL_LEN)
    window = extract_range_and_filter(calls.on_chrom('chr19'), 10, 20, 13)

    self.assertEqual(len(window), 3)
    # SV is removed: nothing returned may have a var_type starting with "SV".
    self.assertFalse(any(v.var_type.startswith("SV") for v in window))
    # The record at pos 2 must be absent. The original note attributes this to
    # overlap with the variant at the location of interest.
    # NOTE(review): confirm that rationale against extract_range_and_filter.
    self.assertFalse(any(v.pos == 2 for v in window))
def normalize_vcf_to_ChromVariants(vcf_str, chrom):
    """Parse VCF text, normalize its records, and return those on ``chrom``.

    ``vcf_str`` is read through ``vcf.Reader``; the reader is passed to
    ``normalize`` together with ``get_reference()``; the result is wrapped in
    ``Variants`` (using ``MAX_INDEL_LEN``) and narrowed with
    ``on_chrom(chrom)``, whose return value is returned unchanged.
    """
    # Build the reader first so get_reference() is still called after it.
    reader = vcf.Reader(StringIO.StringIO(vcf_str))
    normalized = normalize(get_reference(), reader)
    return Variants(normalized, MAX_INDEL_LEN).on_chrom(chrom)
def vcf_to_ChromVariants(vcf_str, chrom):
    """Parse VCF text and return the variants found on ``chrom``.

    ``vcf_str`` is read through ``vcf.Reader``, wrapped in ``Variants`` (using
    ``MAX_INDEL_LEN``), and narrowed with ``on_chrom(chrom)``, whose return
    value is returned unchanged.
    """
    reader = vcf.Reader(StringIO.StringIO(vcf_str))
    return Variants(reader, MAX_INDEL_LEN).on_chrom(chrom)
def vcf_to_ChromVariants(vcf_str, chrom):
    """Return the per-chromosome view of the variants described by ``vcf_str``.

    The VCF text is wrapped in a ``StringIO`` buffer, read with
    ``vcf.Reader``, collected into ``Variants`` with ``MAX_INDEL_LEN``, and
    the result of ``on_chrom(chrom)`` is handed back to the caller.
    """
    all_variants = Variants(
        vcf.Reader(StringIO.StringIO(vcf_str)),
        MAX_INDEL_LEN)
    return all_variants.on_chrom(chrom)
def normalize_vcf_to_ChromVariants(vcf_str, chrom):
    """Return the normalized variants from ``vcf_str`` that lie on ``chrom``.

    The VCF text is read with ``vcf.Reader``, run through ``normalize`` using
    the reference from ``get_reference()``, collected into ``Variants`` with
    ``MAX_INDEL_LEN``, and the result of ``on_chrom(chrom)`` is returned.
    """
    # The reader is created before get_reference() is called, as before.
    raw_records = vcf.Reader(StringIO.StringIO(vcf_str))
    normalized_records = normalize(get_reference(), raw_records)
    all_variants = Variants(normalized_records, MAX_INDEL_LEN)
    return all_variants.on_chrom(chrom)
def vcf_to_Variants(vcf_str):
    """Parse VCF text and return it as a ``Variants`` object.

    ``vcf_str`` is wrapped in a ``StringIO`` buffer, read with ``vcf.Reader``,
    and passed to ``Variants`` together with ``MAX_INDEL_LEN``.
    """
    return Variants(vcf.Reader(StringIO.StringIO(vcf_str)), MAX_INDEL_LEN)