def testGenotypes(self):
    # Normalize a compound heterozygous call through the writer-based API and
    # check that the first record read back still carries the 1/2 genotype.
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . A C 20 PASS . GT 1/2\n """
    source_io = StringIO.StringIO(vcf_str)
    source_reader = vcf.Reader(source_io)
    sink = StringIO.StringIO()
    sink_writer = VCFWriter(self.test_fasta, 'name', sink)
    normalize(self.test_fasta, source_reader, sink_writer)
    # Re-parse whatever normalize() wrote into the sink.
    first_record = self.outputToVcf(sink).next()
    self.assertEqual(first_record.samples[0].gt_nums, "1/2")
def testNBaseNormalization(self):
    # Input: an insertion at chr4:2 plus a chr4:3 record whose REF/ALT are
    # made of N bases. Exactly one record is expected out of normalize().
    vcf_str = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 2 . A ATCTT 20 PASS . GT 0/1\n chr4 3 . NN N 20 PASS . GT 0/1\n """
    normalized = normalize(get_reference(), self.getVcf(vcf_str))
    surviving = self.countRecords(normalized)
    self.assertEqual(surviving, 1)
def testNBaseNormalization(self):
    # Two chr4 records go in (one with an N-only REF/ALT); the normalized
    # stream must contain a single record.
    vcf_str = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 2 . A ATCTT 20 PASS . GT 0/1\n chr4 3 . NN N 20 PASS . GT 0/1\n """
    parsed_input = self.getVcf(vcf_str)
    self.assertEqual(
        self.countRecords(normalize(get_reference(), parsed_input)),
        1,
    )
def normalizeStringToWriter(self, vcf_str):
    """Normalize the VCF text in *vcf_str* and return a reader over the written output.

    The text is parsed with ``vcf.Reader``, run through ``normalize`` against
    ``get_reference()``, and every resulting record is passed to ``write``
    together with a ``VCFWriter`` that targets an in-memory buffer.  The
    buffer contents are then re-parsed and returned as a new ``vcf.Reader``.
    """
    vcf_io = StringIO.StringIO(vcf_str)
    test_vcf = vcf.Reader(vcf_io)
    output_io = StringIO.StringIO()
    output_writer = VCFWriter('ref.fasta', 'name', output_io)
    # Explicit loop instead of map(): the calls are made only for their side
    # effect, and a lazy map() would never execute them.
    for record in normalize(get_reference(), test_vcf):
        write(record, output_writer)
    outputStr = output_io.getvalue()
    # Every newline is doubled before re-parsing.
    # NOTE(review): presumably this mirrors the doubled line breaks used by the
    # VCF literals in these tests -- confirm.
    outputStr = outputStr.replace('\n', '\n\n')
    return vcf.Reader(StringIO.StringIO(outputStr))
def testCollidingVariants(self):
    # Two different alleles are reported at the same position (chr1:5);
    # only one record is expected in the normalized output.
    vcf_str = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 5 . A TGC 20 PASS . GT 1/1\n chr1 5 . A GGG 20 PASS . GT 1/1\n """
    normalized = normalize(get_reference(), self.getVcf(vcf_str))
    emitted = self.countRecords(normalized)
    self.assertEqual(emitted, 1)
def testCollidingVariants(self):
    # Both input records sit at chr1:5 with different ALT alleles; the
    # normalized stream must hold exactly one record.
    vcf_str = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 5 . A TGC 20 PASS . GT 1/1\n chr1 5 . A GGG 20 PASS . GT 1/1\n """
    colliding_input = self.getVcf(vcf_str)
    record_total = self.countRecords(normalize(get_reference(), colliding_input))
    self.assertEqual(record_total, 1)
def testNormalizedToCollision(self):
    # Part 1: a SNP and three insertions; four records are pulled from the
    # normalized stream and their coordinates/alleles checked.
    vcf_str = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 4 . C T 20 PASS . GT 0/1\n chr2 5 . C CGC 20 PASS . GT 0/1\n chr4 2 . A AGG 20 PASS . GT 0/1\n chr4 6 . C CTC 20 PASS . GT 0/1\n """
    insertion_iter = normalize(get_reference(), self.getVcf(vcf_str))
    chr2_snp = insertion_iter.next()
    chr2_ins = insertion_iter.next()
    chr4_first = insertion_iter.next()
    chr4_second = insertion_iter.next()

    # chr2 SNP doesn't change
    self.assertEqual(chr2_snp.POS, 4)

    # chr2 insertion gets normed forward 1 base and slid back to original pos
    self.assertEqual(chr2_ins.POS, 5)
    self.assertEqual(chr2_ins.REF, "C")
    self.assertEqual(chr2_ins.ALT, ["CGC"])

    self.assertEqual(chr4_first.POS, 2)
    self.assertEqual(chr4_first.REF, "A")
    self.assertEqual(chr4_first.ALT, ["AGG"])

    self.assertEqual(chr4_second.POS, 3)
    self.assertEqual(chr4_second.REF, "T")
    self.assertEqual(chr4_second.ALT, ["TCT"])

    # Part 2: the same scenario with two chr4 deletions.
    vcf_str = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 2 . ATC A 20 PASS . GT 0/1\n chr4 6 . CTC C 20 PASS . GT 0/1\n """
    deletion_iter = normalize(get_reference(), self.getVcf(vcf_str))
    first_del = deletion_iter.next()
    second_del = deletion_iter.next()

    self.assertEqual(first_del.POS, 2)
    self.assertEqual(first_del.REF, "ATC")
    self.assertEqual(first_del.ALT, ["A"])

    self.assertEqual(second_del.POS, 5)
    self.assertEqual(second_del.REF, "TCT")
    self.assertEqual(second_del.ALT, ["T"])
def testGenotypes(self):
    # keep genotype info for a compound heterozygous call
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . A C,T 20 PASS . GT 1/2\n """
    # Local is named `reader` so it does not hide the `vcf` module name.
    reader = self.getVcf(vcf_str)
    normalized = normalize(get_reference(), reader)
    first_record = normalized.next()
    self.assertEqual(first_record.samples[0].gt_nums, "1/2")
def testGenotypes(self):
    # A multi-allelic record (ALT "C,T") genotyped 1/2 must come out of
    # normalize() with its genotype numbers untouched.
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . A C,T 20 PASS . GT 1/2\n """
    parsed_input = self.getVcf(vcf_str)
    het_record = normalize(get_reference(), parsed_input).next()
    only_sample = het_record.samples[0]
    self.assertEqual(only_sample.gt_nums, "1/2")
def testCleanOnly(self):
    # A lower-case insertion at chr2:6 is normalized with two extra
    # positional arguments (50, True); the record must keep its position and
    # come back with upper-cased alleles.
    # NOTE(review): 50 / True look like a max indel length and a "clean only"
    # switch -- confirm against normalize()'s signature.
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 6 . g cg 20 PASS . GT 0/1\n """
    cleaned = normalize(get_reference(), self.getVcf(vcf_str), 50, True)
    cleaned_record = cleaned.next()
    self.assertEqual(cleaned_record.POS, 6)
    self.assertEqual(cleaned_record.REF, 'G')
    self.assertEqual(cleaned_record.ALT, ['CG'])
def testCleanOnly(self):
    # normalize() is called with the extra arguments (50, True) on a
    # lower-case insertion; POS must stay at 6 and REF/ALT must be upper-case.
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 6 . g cg 20 PASS . GT 0/1\n """
    parsed_input = self.getVcf(vcf_str)
    record = normalize(get_reference(), parsed_input, 50, True).next()
    # Checked in the same order as before: POS, then REF, then ALT.
    for field, wanted in (('POS', 6), ('REF', 'G'), ('ALT', ['CG'])):
        self.assertEqual(getattr(record, field), wanted)
def testNormalizedToCollision(self):
    # --- insertions -----------------------------------------------------
    vcf_str = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 4 . C T 20 PASS . GT 0/1\n chr2 5 . C CGC 20 PASS . GT 0/1\n chr4 2 . A AGG 20 PASS . GT 0/1\n chr4 6 . C CTC 20 PASS . GT 0/1\n """
    norm_iter = normalize(get_reference(), self.getVcf(vcf_str))
    snp = norm_iter.next()
    remaining = [norm_iter.next(), norm_iter.next(), norm_iter.next()]

    self.assertEqual(snp.POS, 4)  # chr2 SNP doesn't change

    # Expected (POS, REF, ALT) for the three insertions, in output order.
    expected_insertions = [
        # chr2 insertion gets normed forward 1 base and slid back to original pos
        (5, "C", ["CGC"]),
        (2, "A", ["AGG"]),
        (3, "T", ["TCT"]),
    ]
    for record, (pos, ref, alt) in zip(remaining, expected_insertions):
        self.assertEqual(record.POS, pos)
        self.assertEqual(record.REF, ref)
        self.assertEqual(record.ALT, alt)

    # --- deletions ------------------------------------------------------
    vcf_str = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 2 . ATC A 20 PASS . GT 0/1\n chr4 6 . CTC C 20 PASS . GT 0/1\n """
    norm_iter = normalize(get_reference(), self.getVcf(vcf_str))
    deletions = [norm_iter.next(), norm_iter.next()]
    expected_deletions = [
        (2, "ATC", ["A"]),
        (5, "TCT", ["T"]),
    ]
    for record, (pos, ref, alt) in zip(deletions, expected_deletions):
        self.assertEqual(record.POS, pos)
        self.assertEqual(record.REF, ref)
        self.assertEqual(record.ALT, alt)
def testMultipleAltAlleles(self):
    # multiple alleles aren't normalized if the two alt alleles would be normalized differently

    # Single ALT allele: the insertion is expected at POS 3 as G -> GC.
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 6 . G CG 20 PASS . GT 0/1\n """
    single_alt = normalize(get_reference(), self.getVcf(vcf_str)).next()
    self.assertEqual(single_alt.POS, 3)
    self.assertEqual(single_alt.REF, 'G')
    self.assertEqual(single_alt.ALT[0], 'GC')

    # Two ALT alleles (CG,C): the record is expected unchanged at POS 6.
    vcf_str2 = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 6 . G CG,C 20 PASS . GT 0/1\n """
    multi_alt = normalize(get_reference(), self.getVcf(vcf_str2)).next()
    self.assertEqual(multi_alt.POS, 6)
    self.assertEqual(multi_alt.REF, 'G')
    self.assertEqual(multi_alt.ALT[0], 'CG')
def testMultipleAltAlleles(self):
    # multiple alleles aren't normalized if the two alt alleles would be normalized differently
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 6 . G CG 20 PASS . GT 0/1\n """
    # With one ALT allele the record comes back at POS 3 with ALT 'GC'.
    shifted_iter = normalize(get_reference(), self.getVcf(vcf_str))
    shifted = shifted_iter.next()
    self.assertEqual(shifted.POS, 3)
    self.assertEqual(shifted.REF, 'G')
    first_alt = shifted.ALT[0]
    self.assertEqual(first_alt, 'GC')

    vcf_str2 = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 6 . G CG,C 20 PASS . GT 0/1\n """
    # With ALT "CG,C" the record keeps POS 6 and its first ALT stays 'CG'.
    untouched_iter = normalize(get_reference(), self.getVcf(vcf_str2))
    untouched = untouched_iter.next()
    self.assertEqual(untouched.POS, 6)
    self.assertEqual(untouched.REF, 'G')
    first_alt = untouched.ALT[0]
    self.assertEqual(first_alt, 'CG')
def testNormalizeTwoToCollision(self):
    # Two identical CTC insertions at chr4:4 and chr4:6; the first output
    # record is expected at POS 2 (A -> ATC) and the second at POS 3
    # (T -> TCT).
    vcf_str = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 4 . C CTC 20 PASS . GT 0/1\n chr4 6 . C CTC 20 PASS . GT 0/1\n """
    normalized = normalize(get_reference(), self.getVcf(vcf_str))
    leading = normalized.next()
    trailing = normalized.next()

    self.assertEqual(leading.POS, 2)
    self.assertEqual(leading.REF, "A")
    self.assertEqual(leading.ALT, ["ATC"])

    self.assertEqual(trailing.POS, 3)
    self.assertEqual(trailing.REF, "T")
    self.assertEqual(trailing.ALT, ["TCT"])
def testNormalizeTwoToCollision(self):
    # Both chr4 insertions are read back from normalize() and compared
    # against their expected (POS, REF, ALT) triples, in output order.
    vcf_str = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 4 . C CTC 20 PASS . GT 0/1\n chr4 6 . C CTC 20 PASS . GT 0/1\n """
    norm_iter = normalize(get_reference(), self.getVcf(vcf_str))
    produced = [norm_iter.next(), norm_iter.next()]
    wanted = [
        (2, "A", ["ATC"]),
        (3, "T", ["TCT"]),
    ]
    for record, (pos, ref, alt) in zip(produced, wanted):
        self.assertEqual(record.POS, pos)
        self.assertEqual(record.REF, ref)
        self.assertEqual(record.ALT, alt)
def testNormalizeThreeCollision(self):
    # the OP info flag is fake to force vars to right-slide
    vcf_str = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 2 . A ATCTT 20 PASS OP=1 GT 0/1\n chr4 2 . A T 20 PASS . GT 0/1\n chr4 2 . ATCTC T 20 PASS OP=2 GT 0/1\n """
    normalized = normalize(get_reference(), self.getVcf(vcf_str))
    pulled = [normalized.next(), normalized.next(), normalized.next()]
    # order of vars from same pos not guaranteed
    lowest, middle, highest = sorted(pulled, key=lambda rec: rec.POS)

    self.assertEqual(lowest.POS, 2)

    self.assertEqual(middle.POS, 3)
    self.assertEqual(middle.REF, "T")
    self.assertEqual(middle.ALT, ["TCTTT"])

    self.assertEqual(highest.POS, 4)
    self.assertEqual(highest.REF, "CTCTC")
    self.assertEqual(highest.ALT, ["C"])
def normalize_vcf_to_ChromVariants(vcf_str, chrom):
    """Return the normalized variants of *vcf_str* that lie on *chrom*.

    The text is parsed with ``vcf.Reader``, normalized against
    ``get_reference()``, wrapped in ``Variants`` with ``MAX_INDEL_LEN``, and
    the result of ``on_chrom(chrom)`` is returned.
    """
    reader = vcf.Reader(StringIO.StringIO(vcf_str))
    normalized = normalize(get_reference(), reader)
    all_variants = Variants(normalized, MAX_INDEL_LEN)
    return all_variants.on_chrom(chrom)
def testNormalize(self):
    # Exercises normalize() on a series of small VCF inputs: record
    # filtering by genotype, upper-casing of alleles, and left-normalization
    # of one insertion and one deletion.

    # regular records are unchanged
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . C A 20 PASS . GT 0/1\n """
    norm_vcf = normalize(get_reference(), self.getVcf(vcf_str))
    self.assertEqual(self.countRecords(norm_vcf), 1)

    # test that hom ref records are removed
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . C C 20 PASS . GT 0/0\n chr1 3 . G A 20 PASS . GT 1/1\n """
    norm_vcf = normalize(get_reference(), self.getVcf(vcf_str))
    self.assertEqual(self.countRecords(norm_vcf), 1)

    # test that SNP/indels without genotyping are removed
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . C A 20 PASS . GT .\n chr1 3 . G C 20 PASS . GT 0/0\n chr1 4 . G T 20 PASS . GT 0|0\n chr1 5 . G A 20 PASS . GT 1/1\n """
    norm_vcf = normalize(get_reference(), self.getVcf(vcf_str))
    self.assertEqual(self.countRecords(norm_vcf), 1)

    # test that SV without genotyping is retained
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . C AAAAGAAAGGCATGACCTATCCACCCATGCCACCTGGATGGACCTCACAGGCACACTGCTTCATGAGAGAG 20 PASS . GT .\n """
    norm_vcf = normalize(get_reference(), self.getVcf(vcf_str))
    self.assertEqual(self.countRecords(norm_vcf), 1)

    # test that lower case ref/alt gets upper-cased
    # NOTE(review): the original literal has a line break inside the data
    # record, right before the GT column; it is kept here (as an escape) so
    # the string value is unchanged -- confirm it is intended.
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . c a 20 PASS . \nGT 0/1\n """
    # A second, independent reader over the same text gives the
    # un-normalized record to compare against.  (The unused `vcf_io` local
    # that used to be created here has been dropped.)
    lowercase_vcf = vcf.Reader(StringIO.StringIO(vcf_str))
    output_vcf = normalize(get_reference(), self.getVcf(vcf_str))
    original_r = lowercase_vcf.next()
    norm_r = output_vcf.next()
    self.assertEqual(original_r.REF, 'c')
    self.assertEqual(original_r.ALT[0], 'a')
    self.assertEqual(norm_r.REF, 'C')
    self.assertEqual(norm_r.ALT[0], 'A')

    # test normalizing an insertion
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 9 . a ga 20 PASS . GT 0/1\n """
    record = normalize(get_reference(), self.getVcf(vcf_str)).next()
    self.assertEqual(record.POS, 6)
    self.assertEqual(record.REF, 'C')
    self.assertEqual(record.ALT, ['CG'])

    # test normalizing a deletion
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 5 . cc c 20 PASS . GT 0/1\n """
    record = normalize(get_reference(), self.getVcf(vcf_str)).next()
    self.assertEqual(record.POS, 4)
    self.assertEqual(record.REF, 'GC')
    self.assertEqual(record.ALT, ['G'])
def normalize_vcf_to_ChromVariants(vcf_str, chrom):
    """Parse and normalize *vcf_str*, then select the variants on *chrom*.

    Builds a ``vcf.Reader`` over the text, feeds it to ``normalize`` with
    ``get_reference()``, collects the result in ``Variants`` (limited by
    ``MAX_INDEL_LEN``) and returns ``on_chrom(chrom)``.
    """
    parsed = vcf.Reader(StringIO.StringIO(vcf_str))
    return Variants(
        normalize(get_reference(), parsed),
        MAX_INDEL_LEN,
    ).on_chrom(chrom)
def testNormalize(self):
    # Runs normalize() over several small VCF inputs and checks which
    # records survive, that alleles are upper-cased, and where an insertion
    # and a deletion end up after normalization.

    # regular records are unchanged
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . C A 20 PASS . GT 0/1\n """
    norm_vcf = normalize(get_reference(), self.getVcf(vcf_str))
    self.assertEqual(self.countRecords(norm_vcf), 1)

    # test that hom ref records are removed
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . C C 20 PASS . GT 0/0\n chr1 3 . G A 20 PASS . GT 1/1\n """
    norm_vcf = normalize(get_reference(), self.getVcf(vcf_str))
    self.assertEqual(self.countRecords(norm_vcf), 1)

    # test that SNP/indels without genotyping are removed
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . C A 20 PASS . GT .\n chr1 3 . G C 20 PASS . GT 0/0\n chr1 4 . G T 20 PASS . GT 0|0\n chr1 5 . G A 20 PASS . GT 1/1\n """
    norm_vcf = normalize(get_reference(), self.getVcf(vcf_str))
    self.assertEqual(self.countRecords(norm_vcf), 1)

    # test that SV without genotyping is retained
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . C AAAAGAAAGGCATGACCTATCCACCCATGCCACCTGGATGGACCTCACAGGCACACTGCTTCATGAGAGAG 20 PASS . GT .\n """
    norm_vcf = normalize(get_reference(), self.getVcf(vcf_str))
    self.assertEqual(self.countRecords(norm_vcf), 1)

    # test that lower case ref/alt gets upper-cased
    # NOTE(review): in the original literal the data record is broken by a
    # line break just before the GT column; the break is preserved here as
    # an escape so the string value is identical -- confirm it is intended.
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . c a 20 PASS . \nGT 0/1\n """
    # The never-read `vcf_io` buffer has been removed; the raw record is
    # read through its own reader so it can be compared with the
    # normalized one.
    lowercase_vcf = vcf.Reader(StringIO.StringIO(vcf_str))
    output_vcf = normalize(get_reference(), self.getVcf(vcf_str))
    original_r = lowercase_vcf.next()
    norm_r = output_vcf.next()
    self.assertEqual(original_r.REF, 'c')
    self.assertEqual(original_r.ALT[0], 'a')
    self.assertEqual(norm_r.REF, 'C')
    self.assertEqual(norm_r.ALT[0], 'A')

    # test normalizing an insertion
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 9 . a ga 20 PASS . GT 0/1\n """
    record = normalize(get_reference(), self.getVcf(vcf_str)).next()
    self.assertEqual(record.POS, 6)
    self.assertEqual(record.REF, 'C')
    self.assertEqual(record.ALT, ['CG'])

    # test normalizing a deletion
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 5 . cc c 20 PASS . GT 0/1\n """
    record = normalize(get_reference(), self.getVcf(vcf_str)).next()
    self.assertEqual(record.POS, 4)
    self.assertEqual(record.REF, 'GC')
    self.assertEqual(record.ALT, ['G'])
def testNormalize(self):
    # Writer-based variant: each input is normalized into an in-memory
    # VCFWriter and the written text is parsed back before checking it.

    def written_records(vcf_text):
        # Same steps, in the same order, that every case below needs:
        # reader over the text, fresh output buffer, writer, normalize,
        # then re-parse what was written.
        source_io = StringIO.StringIO(vcf_text)
        source_reader = vcf.Reader(source_io)
        sink = StringIO.StringIO()
        sink_writer = VCFWriter(self.test_fasta, 'name', sink)
        normalize(self.test_fasta, source_reader, sink_writer)
        return self.outputToVcf(sink)

    # regular records are unchanged
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . C A 20 PASS . GT 0/1\n """
    self.assertEqual(self.countRecords(written_records(vcf_str)), 1)

    # test that hom ref records are removed
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . C C 20 PASS . GT 0/0\n """
    self.assertEqual(self.countRecords(written_records(vcf_str)), 0)

    # test that SNP/indels without genotyping are removed
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . C A 20 PASS . GT .\n chr1 3 . G C 20 PASS . GT 0/0\n chr1 4 . G T 20 PASS . GT 0|0\n """
    self.assertEqual(self.countRecords(written_records(vcf_str)), 0)

    # test that SV without genotyping is retained
    # NOTE(review): the original literal has a line break inside the data
    # record, right before the GT column; it is reproduced here as an escape
    # so the string value is unchanged -- confirm it is intended.
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . C AAAAGAAAGGCATGACCTATCCACCCATGCCACCTGGATGGACCTCACAGGCACACTGCTTCATGAGAGAG 20 PASS . \nGT .\n """
    self.assertEqual(self.countRecords(written_records(vcf_str)), 1)

    # test that lower case ref/alt gets upper-cased
    vcf_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . c a 20 PASS . GT 0/1\n """
    output_vcf = written_records(vcf_str)
    # A fresh reader over the raw text supplies the un-normalized record.
    untouched_vcf = vcf.Reader(StringIO.StringIO(vcf_str))
    original_r = untouched_vcf.next()
    norm_r = output_vcf.next()
    self.assertEqual(original_r.REF, 'c')
    self.assertEqual(original_r.ALT[0], 'a')
    self.assertEqual(norm_r.REF, 'C')
    self.assertEqual(norm_r.ALT[0], 'A')