Beispiel #1
0
    def test_minimal_VCF_definition_io(self):
        """Round-trip variants through VCFWriter rows using a minimal header.

        Only the ``##fileformat`` line, the ``#CHROM`` line and the data
        lines of ``example1.vcf`` are fed to the reader, so no INFO/FORMAT
        definitions are available when the samples are parsed.
        """
        kept_prefixes = ('##fileformat', '#CHROM')
        stripped = io.StringIO()
        with open(pkg_file('genomvar.test', 'data/example1.vcf'), 'rt') as src:
            stripped.writelines(
                ln for ln in src
                if ln.startswith(kept_prefixes) or not ln.startswith('#'))
        stripped.seek(0)
        reader = VCFReader(stripped)

        written = io.StringIO()
        writer = VCFWriter(format_spec=[RESERVED_FORMAT.GT],
                           samples=reader.samples)
        variants1 = []
        for vrt in reader.iter_vrt(parse_samples=True):
            sample = vrt.attrib['samples']['SAMP1']
            # The test expects GT to come back as a plain string here.
            self.assertTrue(isinstance(sample['GT'], str))
            # Replace the string genotype with the value handed to the writer.
            sample['GT'] = (0, 1) if sample.get('GT') == '0/1' else None
            written.write(str(writer.get_row(vrt)))
            variants1.append(vrt)
        variants1 = sorted(variants1, key=lambda vrt: vrt.start)

        written.seek(0)
        variants2 = sorted(VCFReader(written).iter_vrt(),
                           key=lambda vrt: vrt.start)

        for before, after in zip(variants1, variants2):
            self.assertTrue(before.edit_equal(after))
Beispiel #2
0
    def test_from_variants_to_vcf_with_sampdata(self):
        """Write a VariantSet with sample data to VCF and read it back.

        Checks that the ``AD`` FORMAT header line is emitted and that the
        edits and the ``AD`` sample values of ``SAMP1`` survive the round
        trip.
        """
        vcf_path = pkg_file('genomvar.test', 'data/example3.vcf')
        variants1 = sorted(VCFReader(vcf_path).iter_vrt(parse_samples=True),
                           key=lambda v: v.key)
        vs = VariantSet.from_variants(variants1)

        # The context manager guarantees the temporary file is closed (and
        # thereby removed) even when an assertion below fails.
        with tempfile.NamedTemporaryFile(suffix='.vcf') as tf:
            with open(tf.name, 'wt') as fh:
                vs.to_vcf(
                    fh,
                    format_spec=[RESERVED_FORMAT.GT,
                                 ('AD', 'R', 'Integer', '')],
                    samples=['SAMP1'])

            with open(tf.name, 'rt') as fh:
                written_lines = fh.read().splitlines()
            self.assertIn(
                '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="">',
                written_lines)

            variants2 = sorted(
                VCFReader(tf.name).iter_vrt(parse_samples=True),
                key=lambda v: v.key)

        self.assertEqual(len(variants1), len(variants2))
        for v1, v2 in zip(variants1, variants2):
            self.assertTrue(v1.edit_equal(v2))
            self.assertEqual(v1.attrib['samples']['SAMP1']['AD'],
                             v2.attrib['samples']['SAMP1']['AD'])
Beispiel #3
0
    def test_iter_vrt_gzipped(self):
        """Iterate and query ``example2.vcf.gz`` opened with ``index=True``."""
        path = pkg_file('genomvar.test', 'data/example2.vcf.gz')
        reader = VCFReader(path, index=True)

        self.assertEqual(list(reader.chroms), ['chr23', 'chr24'])
        known_chroms = ['chr24', 'chr23']
        for vrt in reader.iter_vrt():
            self.assertIn(vrt.chrom, known_chroms)

        # Query a whole chromosome, then a narrow interval.
        on_chr24 = list(reader.find_vrt(chrom='chr24'))
        self.assertEqual(len(on_chr24), 4)
        in_window = list(reader.find_vrt('chr23', 7464, 7465))
        self.assertEqual(len(in_window), 3)
Beispiel #4
0
    def test_example3(self):
        """Check chromosomes and one variant ID parsed from ``example3.vcf``."""
        path = pkg_file('genomvar.test', 'data/example3.vcf')
        reader = VCFReader(path)

        chroms = list(reader.get_chroms(allow_no_index=True))
        self.assertEqual(chroms, ['chr1', 'chr2', 'chr10'])

        variants = list(reader.iter_vrt(parse_info=True, parse_samples=True))
        self.assertGreater(len(variants), 0)

        # The fourth variant is expected to carry no ID.
        fourth = variants[3]
        self.assertEqual(fourth.attrib['id'], None)
Beispiel #5
0
    def test_sv_types(self):
        """Iterate ``example4.vcf.gz`` and check the warnings it raises.

        Expects 100 variants, more than one recorded warning, and the last
        warning message to mention ``'Structural'``.
        """
        reader = VCFReader(pkg_file('genomvar.test', 'data/example4.vcf.gz'))

        with warnings.catch_warnings(record=True) as wrn:
            # Count explicitly instead of relying on a leftover enumerate()
            # index: an empty iteration then fails the assertion rather than
            # raising NameError on an unbound loop variable.
            n_variants = sum(1 for _ in reader.iter_vrt())
            self.assertEqual(n_variants, 100)
            self.assertGreater(len(wrn), 1)
            self.assertIn('Structural', str(wrn[-1].message))
Beispiel #6
0
    def test_check_getting_vrt_is_sorted(self):
        """Variant starts from ``example_gnomad_2.vcf.gz`` come out sorted."""
        path = pkg_file('genomvar.test', 'data/example_gnomad_2.vcf.gz')
        reader = VCFReader(path, index=True)

        # Full iteration.
        all_starts = [vrt.start for vrt in reader.iter_vrt()]
        self.assertEqual(all_starts, sorted(all_starts))

        # Region query.
        region = reader.find_vrt('chr15', 74719587, 74824401)
        region_starts = [vrt.start for vrt in region]
        self.assertEqual(region_starts, sorted(region_starts))
Beispiel #7
0
    def test_from_variants_with_attributes(self):
        """Attributes parsed from VCF are kept by ``VariantSet.from_variants``."""
        reader = VCFReader(pkg_file('genomvar.test', 'data/example1.vcf'))
        parsed = list(reader.iter_vrt(parse_info=True))
        vset = VariantSet.from_variants(parsed)

        hits = list(vset.find_vrt('chr24', 1200, 1210))
        self.assertEqual(len(hits), 2)
        first, second = hits

        self.assertEqual(first.attrib['info']['NSV'], 1)
        self.assertEqual(first.attrib['id'], '5')
        self.assertEqual(second.attrib['id'], None)

        # Attributes must also be present in the record representation.
        records = vset.to_records()
        self.assertEqual(records[0]['attrib']['info']['NSV'], 2)
Beispiel #8
0
    def test_zip_variants(self):
        """Zip ``example3.vcf`` with itself; inspect first and last match."""
        path = pkg_file('genomvar.test', 'data/example3.vcf')
        with open(path, 'rt') as left, open(path) as right:
            pairs = zip_variants(VCFReader(left).iter_vrt(),
                                 VCFReader(right).iter_vrt())
            # Materialise while both file handles are still open.
            matches = list(pairs)

        first_d, first_vrt, first_ovlp = matches[0]
        self.assertEqual(first_d, 0)
        self.assertEqual(first_vrt.alt, '')
        self.assertEqual([hit.ref for hit in first_ovlp], ['G', ''])

        _, last_vrt, last_ovlp = matches[-1]
        self.assertEqual(last_vrt.alt, 'CC')
        self.assertEqual([hit.ref for hit in last_ovlp], ['TG'])
Beispiel #9
0
    def test_from_variants_vcf(self):
        """Round-trip a VariantSet to VCF with two kinds of INFO spec.

        The INFO specification is given once as a list of tuples and once as
        the ``dtype['info']`` mapping of the source set. In both cases the
        ``DP4`` header line must be written and edits and ``NSV`` values must
        survive re-reading.
        """
        vs0 = varset.VariantSet.from_vcf(pkg_file('genomvar.test',
                                                  'data/example1.vcf'),
                                         parse_info=True)
        variants1 = sorted(vs0.iter_vrt(), key=lambda v: v.key)
        vs = VariantSet.from_variants(variants1)
        _desc = 'Test for multinumber field'
        info_spec_tuples = [('DP4', 4, 'Integer', _desc),
                            ('NSV', 1, 'Integer')]
        info_spec_dict = vs0.dtype['info']
        expected_header = \
            '##INFO=<ID=DP4,Number=4,Type=Integer,Description="{}">'\
            .format(_desc)

        for info_spec in (info_spec_tuples, info_spec_dict):
            # One temporary file per spec; the context manager closes (and
            # thereby removes) it even when an assertion fails.
            with tempfile.NamedTemporaryFile(suffix='.vcf') as tf:
                with open(tf.name, 'wt') as fh:
                    vs.to_vcf(fh, info_spec=info_spec)

                with open(tf.name, 'rt') as fh:
                    self.assertIn(expected_header, fh.read().splitlines())

                variants2 = sorted(
                    VCFReader(tf.name).iter_vrt(parse_info=True),
                    key=lambda v: v.key)

            self.assertEqual(len(variants1), len(variants2))
            for v1, v2 in zip(variants1, variants2):
                self.assertTrue(v1.edit_equal(v2))
                self.assertEqual(v1.attrib['info']['NSV'],
                                 v2.attrib['info']['NSV'])
Beispiel #10
0
 def test_init(self):
     """Check header length and the FORMAT dtype read from ``example1.vcf``."""
     path = pkg_file('genomvar.test', 'data/example1.vcf')
     reader = VCFReader(path)
     self.assertEqual(reader.header_len, 15)

     format_dtype = reader._dtype['format']
     self.assertEqual(len(format_dtype), 1)

     # The single FORMAT field, GT, is expected to map to an object dtype.
     gt_spec = format_dtype['GT']
     is_object = issubclass(gt_spec['dtype'], np.object_)
     self.assertTrue(is_object, msg='Got type' + str(gt_spec['type']))
Beispiel #11
0
    def test_iter_chrom_rows(self):
        """``iter_rows_by_chrom`` yields every chromosome with its rows."""
        reader = VCFReader(pkg_file('genomvar.test', 'data/example2.vcf.gz'))

        # First pass: consume each per-chromosome iterator and count rows.
        row_counts = {
            chrom: len(list(chrom_rows))
            for chrom, chrom_rows in reader.iter_rows_by_chrom()
        }
        self.assertEqual(set(row_counts), {'chr24', 'chr23'})
        self.assertEqual(row_counts['chr23'], 5)
        self.assertEqual(row_counts['chr24'], 4)

        # Second pass: leave the per-chromosome iterators unconsumed.
        seen = {chrom for chrom, _ in reader.iter_rows_by_chrom()}
        self.assertEqual(seen, {'chr24', 'chr23'})
Beispiel #12
0
    def test_from_vcf_missing_values(self):
        """Parse sample fields that contain missing (``.``) values.

        Three data rows with different FORMAT columns are written after a
        rendered header; only the first parsed variant is inspected, whose
        ``./.`` genotype for ``S1`` must come back as ``(None, None)``.
        """
        format_fields = (RESERVED_FORMAT.AD, RESERVED_FORMAT.DP,
                         RESERVED_FORMAT.GT)
        spec_keys = ('name', 'number', 'type', 'description')
        # Each spec exposes its values under upper-case attribute names.
        header = vcf_header.render(
            samples=['S1', 'S2'],
            ctg_len={},
            format=[{k: getattr(spec, k.upper()) for k in spec_keys}
                    for spec in format_fields])

        buf = io.StringIO()
        buf.write(header)
        buf.write('chr15\t17017413\t.\tA\tG\t38\t.\t.\tGT\t./.\t0/1\n')
        buf.write('chr15\t17017413\t.\tA\tG\t38\t.\t.\tGT:AD\t./.\t0/1:10,.\n')
        buf.write('chr15\t17017413\t.\tA\tG\t38\t.\t.\tGT:DP\t./.\t1/1:.\n')
        # Rewind once so the reader starts at the header.
        buf.seek(0)
        vs = VCFReader(buf)

        v = list(vs.iter_vrt(parse_samples=True))[0]
        self.assertEqual(v.attrib['samples']['S1']['GT'], (None, None))
Beispiel #13
0
    def test_from_to_vcf(self):
        """Read a VCF into a VariantSet, write it out, and read it back.

        Checks that the ``DP4`` INFO header line is preserved and that edits
        and ``NSV`` values match those read directly from the source file.
        """
        fl = pkg_file('genomvar.test', 'data/example1.vcf')
        variants1 = sorted(VCFReader(fl).iter_vrt(parse_info=True),
                           key=lambda v: v.key)
        vs = VariantSet.from_vcf(fl, parse_info=True)

        # The context manager guarantees the temporary file is closed (and
        # thereby removed) even when an assertion below fails.
        with tempfile.NamedTemporaryFile(suffix='.vcf') as tf:
            with open(tf.name, 'wt') as fh:
                vs.to_vcf(fh)

            with open(tf.name, 'rt') as fh:
                written_lines = fh.read().splitlines()
            self.assertIn(
                '##INFO=<ID=DP4,Number=4,Type=Integer,Description="Test for multinumber field">',
                written_lines)

            variants2 = sorted(VCFReader(tf.name).iter_vrt(parse_info=True),
                               key=lambda v: v.key)

        self.assertEqual(len(variants1), len(variants2))
        for v1, v2 in zip(variants1, variants2):
            self.assertTrue(v1.edit_equal(v2))
            self.assertEqual(v1.attrib['info']['NSV'],
                             v2.attrib['info']['NSV'])
Beispiel #14
0
    def test_iter_vrt_example1(self):
        """Inspect the first two variants parsed from ``example1.vcf``."""
        reader = VCFReader(pkg_file('genomvar.test', 'data/example1.vcf'))
        self.assertEqual(reader.samples, ['SAMP1'])
        parsed = list(reader.iter_vrt(parse_info=True, parse_samples=True))
        first, second = parsed[:2]
        both = (first, second)

        # Start, REF and ALT of each variant.
        for vrt, edit in ((first, [23, 'G', '']), (second, [24, '', 'G'])):
            self.assertEqual([vrt.start, vrt.ref, vrt.alt], edit)

        # Both variants are expected to report row 0, with allele numbers
        # 0 and 1 respectively.
        for vrt in both:
            self.assertEqual(vrt.attrib['vcf_notation']['row'], 0)
        for vrt, allele_num in ((first, 0), (second, 1)):
            self.assertEqual(vrt.attrib['allele_num'], allele_num)

        # INFO values.
        for vrt in both:
            self.assertEqual(vrt.attrib['info']['AF'], 0.5)

        # Sample genotypes.
        for vrt, genotype in ((first, (0, 1, 0)), (second, (0, 0, 1))):
            self.assertEqual(vrt.attrib['samples']['SAMP1']['GT'], genotype)
Beispiel #15
0
    def test_from_variants_to_vcf_with_info(self):
        """Write a VariantSet with an explicit INFO spec and read it back.

        Invalid INFO specs must raise ``ValueError`` mentioning
        ``'INFO spec'``. With a valid spec the ``NSV`` header line must be
        written and edits and ``NSV`` values must survive the round trip.
        """
        variants1 = sorted(VCFReader(
            pkg_file('genomvar.test',
                     'data/example1.vcf')).iter_vrt(parse_info=True),
                           key=lambda v: v.key)
        vs = VariantSet.from_variants(variants1)

        # Test Invalid specs
        invalid_specs = [('NSV', ), ('NSV', 1, 'Integedr'),
                         ('NSV', 'C', 'Integer', 'Number of Simple Variants')]
        _buf = io.StringIO()
        for spec in invalid_specs:
            with self.assertRaises(ValueError) as cm:
                vs.to_vcf(_buf, info_spec=[spec])
            self.assertIn('INFO spec', cm.exception.args[0])

        # The context manager guarantees the temporary file is closed (and
        # thereby removed) even when an assertion below fails.
        with tempfile.NamedTemporaryFile(suffix='.vcf') as tf:
            with open(tf.name, 'wt') as fh:
                vs.to_vcf(fh,
                          info_spec=[
                              ('NSV', 1, 'Integer',
                               'Number of Simple Variants'),
                              ('AF', 'A', 'Float', '', 'source', 'version')
                          ])

            with open(tf.name, 'rt') as fh:
                written_lines = fh.read().splitlines()
            self.assertIn(
                '##INFO=<ID=NSV,Number=1,Type=Integer,'
                'Description="Number of Simple Variants">',
                written_lines)

            variants2 = sorted(VCFReader(tf.name).iter_vrt(parse_info=True),
                               key=lambda v: v.key)

        self.assertEqual(len(variants1), len(variants2))
        for v1, v2 in zip(variants1, variants2):
            self.assertTrue(v1.edit_equal(v2))
            self.assertEqual(v1.attrib['info']['NSV'],
                             v2.attrib['info']['NSV'])
Beispiel #16
0
 def test_change_of_attributes(self):
     """Rows written for a variant reflect its id/qual/filter attributes.

     Both ``'.'`` and ``None`` attribute values are expected to be written
     as ``.`` in the corresponding VCF columns.
     """
     reader = VCFReader(
         pkg_file('genomvar.test','data/example1.vcf'))
     # try/finally closes the reader exactly once, including when an
     # assertion fails part-way through.
     try:
         vrt = list(reader.iter_vrt())[0]
         self.assertEqual(str(self.writer.get_row(vrt)),
                          'chr24\t23\t1\tAG\tA\t100\tPASS\t.')

         # Explicit '.' values, set on the variant and passed as keywords.
         vrt2 = copy.deepcopy(vrt)
         vrt2.attrib['id'] = '.'
         vrt2.attrib['qual'] = '.'
         vrt2.attrib['filter'] = '.'
         self.assertEqual(str(self.writer.get_row(vrt2)),
                          'chr24\t23\t.\tAG\tA\t.\t.\t.')
         self.assertEqual(
             str(self.writer.get_row(vrt2, id='.', qual='.', filter='.')),
             'chr24\t23\t.\tAG\tA\t.\t.\t.')

         # None values must be rendered the same way.
         vrt3 = copy.deepcopy(vrt)
         vrt3.attrib['id'] = None
         vrt3.attrib['qual'] = None
         vrt3.attrib['filter'] = None
         self.assertEqual(str(self.writer.get_row(vrt3)),
                          'chr24\t23\t.\tAG\tA\t.\t.\t.')
     finally:
         reader.close()
Beispiel #17
0
    def test_to_vcf_row_from_file(self):
        """Rows produced by the writer match the rows read from the file.

        Multiallelic source rows are split into one row per ALT allele, with
        INFO set to ``.`` and FORMAT/SAMPLES dropped, before being compared
        with the rows the writer produces for the parsed variants.
        """
        def _split_multiallelic(rows):
            # Yield one stringified row per ALT allele of every input row.
            for row in rows:
                for alt in row.ALT.split(','):
                    kwds = {field: getattr(row, field)
                            for field in VCF_FIELDS}
                    kwds['ALT'] = alt
                    kwds['INFO'] = '.'
                    kwds['FORMAT'] = None
                    kwds['SAMPLES'] = None
                    yield str(VCFRow(**kwds))

        reader = VCFReader(pkg_file('genomvar.test','data/example1.vcf'))
        # try/finally closes the reader even when a comparison fails.
        try:
            variants = list(reader.iter_vrt(
                parse_info=False, parse_samples=False))
            rows = [str(self.writer.get_row(v)) for v in variants]

            for r1, r2 in zip(
                    _split_multiallelic(reader.iter_rows()), rows):
                # Skip the 'AG' -> 'AGG' row, marked "stripping" in the
                # original test.
                # NOTE(review): presumably its written form differs from
                # the source notation - confirm.
                if 'AG\tAGG' in r1:
                    continue
                self.assertEqual(r1, r2)
        finally:
            reader.close()