Ejemplo n.º 1
0
    def testGenotypes(self):
        # normalize a compound heterozygous call
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       A     C       20      PASS    .       GT      1/2\n
"""
        vcf_io = StringIO.StringIO(vcf_str)
        test_vcf = vcf.Reader(vcf_io)
        output_io = StringIO.StringIO()
        output_writer = VCFWriter(self.test_fasta, 'name', output_io)
        normalize(self.test_fasta, test_vcf, output_writer)
        output_vcf = self.outputToVcf(output_io)
        record = output_vcf.next()
        self.assertEqual(record.samples[0].gt_nums, "1/2")
Ejemplo n.º 2
0
    def testNBaseNormalization(self):
        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4    2       .       A       ATCTT   20      PASS    .          GT      0/1\n
chr4    3       .       NN      N       20      PASS    .          GT      0/1\n
"""
        norm_iter = normalize(get_reference(), self.getVcf(vcf_str))
        self.assertEqual(self.countRecords(norm_iter), 1)
Ejemplo n.º 3
0
    def testNBaseNormalization(self):
        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4    2       .       A       ATCTT   20      PASS    .          GT      0/1\n
chr4    3       .       NN      N       20      PASS    .          GT      0/1\n
"""
        norm_iter = normalize(get_reference(),self.getVcf(vcf_str))
        self.assertEqual(self.countRecords(norm_iter),1)
Ejemplo n.º 4
0
 def normalizeStringToWriter(self,vcf_str):
     """Normalize VCF text, write it out, and return a reader over the output.

     vcf_str is parsed with vcf.Reader, every record yielded by normalize()
     is passed to write() together with a VCFWriter backed by an in-memory
     buffer, and the buffer contents are re-parsed into a new vcf.Reader.
     """
     test_vcf = vcf.Reader(StringIO.StringIO(vcf_str))
     output_io = StringIO.StringIO()
     output_writer = VCFWriter('ref.fasta','name',output_io)
     # Writing is a side effect, so use an explicit loop: map() would build a
     # throwaway list here and would write nothing at all where map() is lazy.
     for record in normalize(get_reference(),test_vcf):
         write(record,output_writer)
     # Every newline is doubled before re-parsing.
     # NOTE(review): presumably this mirrors the blank-line-separated layout
     # of the test fixtures -- confirm.
     outputStr = output_io.getvalue().replace('\n','\n\n')
     return vcf.Reader(StringIO.StringIO(outputStr))
Ejemplo n.º 5
0
    def testCollidingVariants(self):
        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   5   .      A     TGC       20      PASS    .       GT      1/1\n
chr1   5   .      A      GGG       20      PASS     .      GT      1/1\n
"""
        norm_iter = normalize(get_reference(), self.getVcf(vcf_str))
        count = self.countRecords(norm_iter)
        self.assertEqual(count, 1)
Ejemplo n.º 6
0
    def testCollidingVariants(self):
        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   5   .      A     TGC       20      PASS    .       GT      1/1\n
chr1   5   .      A      GGG       20      PASS     .      GT      1/1\n
"""
        norm_iter = normalize(get_reference(),self.getVcf(vcf_str))
        count = self.countRecords(norm_iter)
        self.assertEqual(count,1)
Ejemplo n.º 7
0
    def testNormalizedToCollision(self):
        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2    4       .       C       T       20      PASS    .       GT      0/1\n
chr2    5       .       C       CGC     20      PASS    .       GT      0/1\n
chr4    2       .       A       AGG     20      PASS    .       GT      0/1\n
chr4    6       .       C       CTC     20      PASS    .       GT      0/1\n
"""
        norm_iter = normalize(get_reference(), self.getVcf(vcf_str))
        r1 = norm_iter.next()
        r2 = norm_iter.next()
        r3 = norm_iter.next()
        r4 = norm_iter.next()
        self.assertEqual(r1.POS, 4)  # chr2 SNP doesn't change
        self.assertEqual(
            r2.POS, 5
        )  # chr2 insertion gets normed forward 1 base and slid back to original pos
        self.assertEqual(r2.REF, "C")
        self.assertEqual(r2.ALT, ["CGC"])
        self.assertEqual(r3.POS, 2)
        self.assertEqual(r3.REF, "A")
        self.assertEqual(r3.ALT, ["AGG"])
        self.assertEqual(r4.POS, 3)
        self.assertEqual(r4.REF, "T")
        self.assertEqual(r4.ALT, ["TCT"])

        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4    2       .       ATC     A     20      PASS    .       GT      0/1\n
chr4    6       .       CTC     C     20      PASS    .       GT      0/1\n
"""
        norm_iter = normalize(get_reference(), self.getVcf(vcf_str))
        r1 = norm_iter.next()
        r2 = norm_iter.next()
        self.assertEqual(r1.POS, 2)
        self.assertEqual(r1.REF, "ATC")
        self.assertEqual(r1.ALT, ["A"])
        self.assertEqual(r2.POS, 5)
        self.assertEqual(r2.REF, "TCT")
        self.assertEqual(r2.ALT, ["T"])
Ejemplo n.º 8
0
    def testGenotypes(self):
        # keep genotype info for a compound heterozygous call
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       A     C,T       20      PASS    .       GT      1/2\n
"""
        vcf = self.getVcf(vcf_str)
        record = normalize(get_reference(), vcf).next()
        self.assertEqual(record.samples[0].gt_nums, "1/2")
Ejemplo n.º 9
0
    def testGenotypes(self):
        # keep genotype info for a compound heterozygous call
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       A     C,T       20      PASS    .       GT      1/2\n
"""
        vcf = self.getVcf(vcf_str)
        record = normalize(get_reference(),vcf).next()
        self.assertEqual(record.samples[0].gt_nums, "1/2")
Ejemplo n.º 10
0
    def testCleanOnly(self):
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2    6       .       g       cg       20      PASS    .       GT      0/1\n
"""
        norm = normalize(get_reference(),self.getVcf(vcf_str),50,True)
        record = norm.next()
        self.assertEqual(record.POS,6)
        self.assertEqual(record.REF,'G')
        self.assertEqual(record.ALT,['CG'])
Ejemplo n.º 11
0
    def testCleanOnly(self):
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2    6       .       g       cg       20      PASS    .       GT      0/1\n
"""
        norm = normalize(get_reference(), self.getVcf(vcf_str), 50, True)
        record = norm.next()
        self.assertEqual(record.POS, 6)
        self.assertEqual(record.REF, 'G')
        self.assertEqual(record.ALT, ['CG'])
Ejemplo n.º 12
0
    def testNormalizedToCollision(self):
        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2    4       .       C       T       20      PASS    .       GT      0/1\n
chr2    5       .       C       CGC     20      PASS    .       GT      0/1\n
chr4    2       .       A       AGG     20      PASS    .       GT      0/1\n
chr4    6       .       C       CTC     20      PASS    .       GT      0/1\n
"""
        norm_iter = normalize(get_reference(),self.getVcf(vcf_str))
        r1 = norm_iter.next()
        r2 = norm_iter.next()
        r3 = norm_iter.next()
        r4 = norm_iter.next()
        self.assertEqual(r1.POS,4) # chr2 SNP doesn't change
        self.assertEqual(r2.POS,5) # chr2 insertion gets normed forward 1 base and slid back to original pos
        self.assertEqual(r2.REF,"C")
        self.assertEqual(r2.ALT,["CGC"])
        self.assertEqual(r3.POS,2)
        self.assertEqual(r3.REF,"A")
        self.assertEqual(r3.ALT,["AGG"])
        self.assertEqual(r4.POS,3)
        self.assertEqual(r4.REF,"T")
        self.assertEqual(r4.ALT,["TCT"])

        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4    2       .       ATC     A     20      PASS    .       GT      0/1\n
chr4    6       .       CTC     C     20      PASS    .       GT      0/1\n
"""
        norm_iter = normalize(get_reference(),self.getVcf(vcf_str))
        r1 = norm_iter.next()
        r2 = norm_iter.next()
        self.assertEqual(r1.POS,2)
        self.assertEqual(r1.REF,"ATC")
        self.assertEqual(r1.ALT,["A"])
        self.assertEqual(r2.POS,5)
        self.assertEqual(r2.REF,"TCT")
        self.assertEqual(r2.ALT,["T"])
Ejemplo n.º 13
0
    def testMultipleAltAlleles(self):
        # multiple alleles aren't normalized if the two alt alleles would be normalized differently
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2    6       .       G       CG       20      PASS    .       GT      0/1\n
"""
        record = normalize(get_reference(),self.getVcf(vcf_str)).next()
        self.assertEqual(record.POS,3)
        self.assertEqual(record.REF,'G')
        self.assertEqual(record.ALT[0], 'GC')
        vcf_str2 = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2    6       .       G       CG,C       20      PASS    .       GT      0/1\n
"""
        record = normalize(get_reference(),self.getVcf(vcf_str2)).next()
        self.assertEqual(record.POS,6)
        self.assertEqual(record.REF,'G')
        self.assertEqual(record.ALT[0],'CG')
Ejemplo n.º 14
0
    def testMultipleAltAlleles(self):
        # multiple alleles aren't normalized if the two alt alleles would be normalized differently
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2    6       .       G       CG       20      PASS    .       GT      0/1\n
"""
        record = normalize(get_reference(), self.getVcf(vcf_str)).next()
        self.assertEqual(record.POS, 3)
        self.assertEqual(record.REF, 'G')
        self.assertEqual(record.ALT[0], 'GC')
        vcf_str2 = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr2    6       .       G       CG,C       20      PASS    .       GT      0/1\n
"""
        record = normalize(get_reference(), self.getVcf(vcf_str2)).next()
        self.assertEqual(record.POS, 6)
        self.assertEqual(record.REF, 'G')
        self.assertEqual(record.ALT[0], 'CG')
Ejemplo n.º 15
0
    def testNormalizeTwoToCollision(self):
        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4    4       .       C       CTC     20      PASS    .       GT      0/1\n
chr4    6       .       C       CTC     20      PASS    .       GT      0/1\n
"""
        norm_iter = normalize(get_reference(), self.getVcf(vcf_str))
        r1 = norm_iter.next()
        r2 = norm_iter.next()
        self.assertEqual(r1.POS, 2)
        self.assertEqual(r1.REF, "A")
        self.assertEqual(r1.ALT, ["ATC"])
        self.assertEqual(r2.POS, 3)
        self.assertEqual(r2.REF, "T")
        self.assertEqual(r2.ALT, ["TCT"])
Ejemplo n.º 16
0
    def testNormalizeTwoToCollision(self):
        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4    4       .       C       CTC     20      PASS    .       GT      0/1\n
chr4    6       .       C       CTC     20      PASS    .       GT      0/1\n
"""
        norm_iter = normalize(get_reference(),self.getVcf(vcf_str))
        r1 = norm_iter.next()
        r2 = norm_iter.next()
        self.assertEqual(r1.POS,2)
        self.assertEqual(r1.REF,"A")
        self.assertEqual(r1.ALT,["ATC"])
        self.assertEqual(r2.POS,3)
        self.assertEqual(r2.REF,"T")
        self.assertEqual(r2.ALT,["TCT"])
Ejemplo n.º 17
0
    def testNormalizeThreeCollision(self):
        # the OP info flag is fake to force vars to right-slide
        vcf_str = """##fileformat=VCFv4.0
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr4    2       .       A       ATCTT     20      PASS    OP=1       GT      0/1\n
chr4    2       .       A       T     20      PASS    .       GT      0/1\n
chr4    2       .       ATCTC    T        20      PASS     OP=2        GT     0/1\n
"""
        norm_iter = normalize(get_reference(),self.getVcf(vcf_str))
        r1 = norm_iter.next()
        r2 = norm_iter.next()
        r3 = norm_iter.next()
        r1,r2,r3 = sorted([r1,r2,r3],key=lambda x: x.POS) # order of vars from same pos not guaranteed
        self.assertEqual(r1.POS,2)
        self.assertEqual(r2.POS,3)
        self.assertEqual(r2.REF,"T")
        self.assertEqual(r2.ALT,["TCTTT"])
        self.assertEqual(r3.POS,4)
        self.assertEqual(r3.REF,"CTCTC")
        self.assertEqual(r3.ALT,["C"])
Ejemplo n.º 18
0
def normalize_vcf_to_ChromVariants(vcf_str,chrom):
    """Parse VCF text, normalize it, and return the variants on one chromosome.

    The text is read with vcf.Reader, passed through normalize() against
    get_reference(), wrapped in Variants with MAX_INDEL_LEN, and the result
    of on_chrom(chrom) is returned.
    """
    reader = vcf.Reader(StringIO.StringIO(vcf_str))
    variants = Variants(normalize(get_reference(), reader), MAX_INDEL_LEN)
    return variants.on_chrom(chrom)
Ejemplo n.º 19
0
    def testNormalize(self):
        #regular records are unchanged
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       C     A       20      PASS    .       GT      0/1\n
"""
        norm_vcf = normalize(get_reference(), self.getVcf(vcf_str))
        self.assertEqual(self.countRecords(norm_vcf), 1)

        #test that hom ref records are removed
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       C     C       20      PASS    .       GT      0/0\n
chr1   3   .       G     A       20      PASS    .       GT      1/1\n
"""
        norm_vcf = normalize(get_reference(), self.getVcf(vcf_str))
        self.assertEqual(self.countRecords(norm_vcf), 1)

        #test that SNP/indels without genotyping are removed
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       C     A       20      PASS    .       GT      .\n
chr1   3   .       G     C       20      PASS    .       GT      0/0\n
chr1   4   .       G     T       20      PASS    .       GT      0|0\n
chr1   5   .       G     A       20      PASS    .       GT      1/1\n
"""
        norm_vcf = normalize(get_reference(), self.getVcf(vcf_str))
        self.assertEqual(self.countRecords(norm_vcf), 1)

        #test that SV without genotyping is retained
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       C     AAAAGAAAGGCATGACCTATCCACCCATGCCACCTGGATGGACCTCACAGGCACACTGCTTCATGAGAGAG       20      PASS    .       GT      .\n
"""
        norm_vcf = normalize(get_reference(), self.getVcf(vcf_str))
        self.assertEqual(self.countRecords(norm_vcf), 1)

        #test that lower case ref/alt gets upper-cased
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       c     a       20      PASS    .       GT      0/1\n
"""
        vcf_io = StringIO.StringIO(vcf_str)
        lowercase_vcf = vcf.Reader(StringIO.StringIO(vcf_str))
        output_vcf = normalize(get_reference(), self.getVcf(vcf_str))
        original_r = lowercase_vcf.next()
        norm_r = output_vcf.next()
        self.assertEqual(original_r.REF, 'c')
        self.assertEqual(original_r.ALT[0], 'a')
        self.assertEqual(norm_r.REF, 'C')
        self.assertEqual(norm_r.ALT[0], 'A')

        # test normalizing an insertion
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   9   .       a     ga       20      PASS    .       GT      0/1\n
"""
        record = normalize(get_reference(), self.getVcf(vcf_str)).next()
        self.assertEqual(record.POS, 6)
        self.assertEqual(record.REF, 'C')
        self.assertEqual(record.ALT, ['CG'])

        # test normalizing a deletion
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   5   .       cc     c       20      PASS    .       GT      0/1\n
"""
        record = normalize(get_reference(), self.getVcf(vcf_str)).next()
        self.assertEqual(record.POS, 4)
        self.assertEqual(record.REF, 'GC')
        self.assertEqual(record.ALT, ['G'])
Ejemplo n.º 20
0
def normalize_vcf_to_ChromVariants(vcf_str, chrom):
    """Return the normalized variants of *vcf_str* that lie on *chrom*.

    The VCF text is parsed with vcf.Reader, normalized against
    get_reference(), collected into Variants using MAX_INDEL_LEN, and
    narrowed with on_chrom(chrom).
    """
    reader = vcf.Reader(StringIO.StringIO(vcf_str))
    variants = Variants(normalize(get_reference(), reader), MAX_INDEL_LEN)
    return variants.on_chrom(chrom)
Ejemplo n.º 21
0
    def testNormalize(self):
        #regular records are unchanged
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       C     A       20      PASS    .       GT      0/1\n
"""
        norm_vcf = normalize(get_reference(),self.getVcf(vcf_str))
        self.assertEqual(self.countRecords(norm_vcf),1)

        #test that hom ref records are removed
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       C     C       20      PASS    .       GT      0/0\n
chr1   3   .       G     A       20      PASS    .       GT      1/1\n
"""
        norm_vcf = normalize(get_reference(),self.getVcf(vcf_str))
        self.assertEqual(self.countRecords(norm_vcf),1)

        #test that SNP/indels without genotyping are removed
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       C     A       20      PASS    .       GT      .\n
chr1   3   .       G     C       20      PASS    .       GT      0/0\n
chr1   4   .       G     T       20      PASS    .       GT      0|0\n
chr1   5   .       G     A       20      PASS    .       GT      1/1\n
"""
        norm_vcf = normalize(get_reference(),self.getVcf(vcf_str))
        self.assertEqual(self.countRecords(norm_vcf),1)

        #test that SV without genotyping is retained
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       C     AAAAGAAAGGCATGACCTATCCACCCATGCCACCTGGATGGACCTCACAGGCACACTGCTTCATGAGAGAG       20      PASS    .       GT      .\n
"""
        norm_vcf = normalize(get_reference(),self.getVcf(vcf_str))
        self.assertEqual(self.countRecords(norm_vcf),1)

        #test that lower case ref/alt gets upper-cased
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       c     a       20      PASS    .       GT      0/1\n
"""
        vcf_io = StringIO.StringIO(vcf_str)
        lowercase_vcf = vcf.Reader(StringIO.StringIO(vcf_str))
        output_vcf = normalize(get_reference(),self.getVcf(vcf_str))
        original_r = lowercase_vcf.next()
        norm_r = output_vcf.next()
        self.assertEqual(original_r.REF,'c')
        self.assertEqual(original_r.ALT[0], 'a')
        self.assertEqual(norm_r.REF,'C')
        self.assertEqual(norm_r.ALT[0],'A')

        # test normalizing an insertion
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   9   .       a     ga       20      PASS    .       GT      0/1\n
"""
        record = normalize(get_reference(),self.getVcf(vcf_str)).next()
        self.assertEqual(record.POS,6)
        self.assertEqual(record.REF,'C')
        self.assertEqual(record.ALT,['CG'])

        # test normalizing a deletion
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   5   .       cc     c       20      PASS    .       GT      0/1\n
"""
        record = normalize(get_reference(),self.getVcf(vcf_str)).next()
        self.assertEqual(record.POS,4)
        self.assertEqual(record.REF,'GC')
        self.assertEqual(record.ALT,['G'])
Ejemplo n.º 22
0
    def testNormalize(self):
        # Each section below parses a small VCF, runs normalize() with a
        # VCFWriter backed by an in-memory buffer, re-reads the buffer via
        # self.outputToVcf() and checks the records that were written.

        # a regular SNP record is written out (one record expected)
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       C     A       20      PASS    .       GT      0/1\n
"""
        vcf_io = StringIO.StringIO(vcf_str)
        norm_vcf = vcf.Reader(vcf_io)
        output_io = StringIO.StringIO()
        output_writer = VCFWriter(self.test_fasta, 'name', output_io)
        normalize(self.test_fasta, norm_vcf, output_writer)
        self.assertEqual(self.countRecords(self.outputToVcf(output_io)), 1)

        # test that hom ref records are removed (no records expected)
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       C     C       20      PASS    .       GT      0/0\n
"""
        vcf_io = StringIO.StringIO(vcf_str)
        homref_vcf = vcf.Reader(vcf_io)
        output_io = StringIO.StringIO()
        output_writer = VCFWriter(self.test_fasta, 'name', output_io)
        normalize(self.test_fasta, homref_vcf, output_writer)
        self.assertEqual(self.countRecords(self.outputToVcf(output_io)), 0)

        # test that SNP/indels without genotyping are removed
        # (inputs are an uncalled genotype and two hom-ref calls; none expected back)
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       C     A       20      PASS    .       GT      .\n
chr1   3   .       G     C       20      PASS    .       GT      0/0\n
chr1   4   .       G     T       20      PASS    .       GT      0|0\n
"""
        vcf_io = StringIO.StringIO(vcf_str)
        homref_vcf = vcf.Reader(vcf_io)
        output_io = StringIO.StringIO()
        output_writer = VCFWriter(self.test_fasta, 'name', output_io)
        normalize(self.test_fasta, homref_vcf, output_writer)
        self.assertEqual(self.countRecords(self.outputToVcf(output_io)), 0)

        # test that SV without genotyping is retained (one record expected)
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       C     AAAAGAAAGGCATGACCTATCCACCCATGCCACCTGGATGGACCTCACAGGCACACTGCTTCATGAGAGAG       20      PASS    .       GT      .\n
"""
        vcf_io = StringIO.StringIO(vcf_str)
        homref_vcf = vcf.Reader(vcf_io)
        output_io = StringIO.StringIO()
        output_writer = VCFWriter(self.test_fasta, 'name', output_io)
        normalize(self.test_fasta, homref_vcf, output_writer)
        self.assertEqual(self.countRecords(self.outputToVcf(output_io)), 1)

        # test that lower case ref/alt gets upper-cased
        vcf_str = """##fileformat=VCFv4.0\n
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n
##source=TVsim\n
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001\n
chr1   2   .       c     a       20      PASS    .       GT      0/1\n
"""
        vcf_io = StringIO.StringIO(vcf_str)
        lowercase_vcf = vcf.Reader(vcf_io)
        output_io = StringIO.StringIO()
        output_writer = VCFWriter(self.test_fasta, 'name', output_io)
        normalize(self.test_fasta, lowercase_vcf, output_writer)
        output_vcf = self.outputToVcf(output_io)
        # The first reader was handed to normalize(); build a fresh one over
        # the same text to read the original, un-normalized record.
        lowercase_vcf = vcf.Reader(StringIO.StringIO(vcf_str))
        original_r = lowercase_vcf.next()
        norm_r = output_vcf.next()
        # input alleles are lower case, normalized output is upper case
        self.assertEqual(original_r.REF, 'c')
        self.assertEqual(original_r.ALT[0], 'a')
        self.assertEqual(norm_r.REF, 'C')
        self.assertEqual(norm_r.ALT[0], 'A')