def test_minimal_VCF_definition_io(self):
    """Round-trip variants through a VCF reduced to a minimal header.

    Only the ``##fileformat`` line, the ``#CHROM`` column line and the data
    rows of ``example1.vcf`` are copied into an in-memory buffer.  Every
    variant read from that buffer must carry ``GT`` as a plain ``str``.  The
    genotype is then replaced by ``(0, 1)`` (when it was ``'0/1'``) or
    ``None``, the variant is written with ``VCFWriter`` and the written rows
    are read back and compared pairwise with ``edit_equal``.
    """
    kept_header_prefixes = ('##fileformat', '#CHROM')
    buf = io.StringIO()
    with open(pkg_file('genomvar.test', 'data/example1.vcf'), 'rt') as fh:
        for line in fh:
            # Keep the two mandatory header lines and every data row.
            if line.startswith(kept_header_prefixes) \
                    or not line.startswith('#'):
                buf.write(line)
    buf.seek(0)

    reader = VCFReader(buf)
    outbuf = io.StringIO()
    writer = VCFWriter(format_spec=[RESERVED_FORMAT.GT],
                       samples=reader.samples)
    variants1 = []
    for vrt in reader.iter_vrt(parse_samples=True):
        sample = vrt.attrib['samples']['SAMP1']
        self.assertIsInstance(sample['GT'], str)
        # Swap the raw string for the value handed to the writer.
        sample['GT'] = (0, 1) if sample.get('GT') == '0/1' else None
        outbuf.write(str(writer.get_row(vrt)))
        variants1.append(vrt)
    variants1.sort(key=lambda v: v.start)

    outbuf.seek(0)
    variants2 = list(VCFReader(outbuf).iter_vrt())
    variants2.sort(key=lambda v: v.start)
    # NOTE(review): zip() stops at the shorter list, so this loop does not
    # detect variants missing from the re-read output -- consider asserting
    # equal lengths once the expected reader behaviour is confirmed.
    for v1, v2 in zip(variants1, variants2):
        self.assertTrue(v1.edit_equal(v2))
def test_from_variants_to_vcf_with_sampdata(self):
    """Write a variant set with sample data to VCF and read it back.

    Variants of ``example3.vcf`` are loaded with sample parsing, stored in a
    ``VariantSet`` and written with ``GT`` and ``AD`` FORMAT fields for
    sample ``SAMP1``.  The written file must contain the ``AD`` FORMAT header
    line, and the re-read variants must match the originals in number, edit
    and ``AD`` value.
    """
    vcf_path = pkg_file('genomvar.test', 'data/example3.vcf')
    variants1 = sorted(VCFReader(vcf_path).iter_vrt(parse_samples=True),
                       key=lambda v: v.key)
    vs = VariantSet.from_variants(variants1)

    # The context manager closes (and thereby removes) the temporary file
    # even when one of the assertions below fails.
    with tempfile.NamedTemporaryFile(suffix='.vcf') as tf:
        with open(tf.name, 'wt') as fh:
            vs.to_vcf(
                fh,
                format_spec=[RESERVED_FORMAT.GT, ('AD', 'R', 'Integer', '')],
                samples=['SAMP1'])

        with open(tf.name, 'rt') as fh:
            written_lines = fh.read().splitlines()
        self.assertIn(
            '##FORMAT=<ID=AD,Number=R,Type=Integer,'
            'Description="">',
            written_lines)

        variants2 = sorted(VCFReader(tf.name).iter_vrt(parse_samples=True),
                           key=lambda v: v.key)

    self.assertEqual(len(variants1), len(variants2))
    for v1, v2 in zip(variants1, variants2):
        self.assertTrue(v1.edit_equal(v2))
        self.assertEqual(v1.attrib['samples']['SAMP1']['AD'],
                         v2.attrib['samples']['SAMP1']['AD'])
def test_iter_vrt_gzipped(self):
    """Iterate and query ``example2.vcf.gz`` opened with ``index=True``.

    Checks the reported chromosomes, that every iterated variant lies on
    one of them, and the number of variants returned by two ``find_vrt``
    queries.
    """
    gz_path = pkg_file('genomvar.test', 'data/example2.vcf.gz')
    reader = VCFReader(gz_path, index=True)

    self.assertEqual(list(reader.chroms), ['chr23', 'chr24'])

    allowed_chroms = ['chr24', 'chr23']
    for variant in reader.iter_vrt():
        self.assertIn(variant.chrom, allowed_chroms)

    on_chr24 = list(reader.find_vrt(chrom='chr24'))
    self.assertEqual(len(on_chr24), 4)

    in_window = list(reader.find_vrt('chr23', 7464, 7465))
    self.assertEqual(len(in_window), 3)
def test_example3(self):
    """Read ``example3.vcf``: chromosome list, non-empty output, missing ID.

    The chromosomes are requested with ``allow_no_index=True``; the fourth
    variant parsed with INFO and sample data must have ``id`` equal to
    ``None``.
    """
    path = pkg_file('genomvar.test', 'data/example3.vcf')
    reader = VCFReader(path)

    chroms = list(reader.get_chroms(allow_no_index=True))
    self.assertEqual(chroms, ['chr1', 'chr2', 'chr10'])

    variants = list(reader.iter_vrt(parse_info=True, parse_samples=True))
    self.assertGreater(len(variants), 0)

    fourth = variants[3]
    self.assertEqual(fourth.attrib['id'], None)
def test_sv_types(self):
    """Check variant count and warnings when reading ``example4.vcf.gz``.

    Iterating the file must yield exactly 100 variants, record more than
    one warning, and the last recorded warning message must contain
    ``'Structural'``.
    """
    reader = VCFReader(pkg_file('genomvar.test', 'data/example4.vcf.gz'))
    with warnings.catch_warnings(record=True) as wrn:
        # Count explicitly: an empty iterator now fails the assertion below
        # instead of raising NameError on an unbound loop variable.
        n_variants = sum(1 for _ in reader.iter_vrt())

    self.assertEqual(n_variants, 100)
    self.assertGreater(len(wrn), 1)
    self.assertIn('Structural', str(wrn[-1].message))
def test_check_getting_vrt_is_sorted(self):
    """Start positions come out in ascending order.

    Verified both for a full ``iter_vrt`` pass and for a ``find_vrt`` region
    query on ``example_gnomad_2.vcf.gz`` opened with ``index=True``.
    """
    path = pkg_file('genomvar.test', 'data/example_gnomad_2.vcf.gz')
    reader = VCFReader(path, index=True)

    all_starts = [variant.start for variant in reader.iter_vrt()]
    self.assertEqual(all_starts, sorted(all_starts))

    region_hits = reader.find_vrt('chr15', 74719587, 74824401)
    region_starts = [variant.start for variant in region_hits]
    self.assertEqual(region_starts, sorted(region_starts))
def test_from_variants_with_attributes(self):
    """INFO and ID attributes are kept by ``VariantSet.from_variants``.

    Two variants are expected in ``chr24:1200-1210``; the first has
    ``NSV == 1`` and id ``'5'``, the second has id ``None``.  The first
    record of ``to_records()`` must report ``NSV == 2``.
    """
    path = pkg_file('genomvar.test', 'data/example1.vcf')
    parsed = list(VCFReader(path).iter_vrt(parse_info=True))
    vset = VariantSet.from_variants(parsed)

    hits = list(vset.find_vrt('chr24', 1200, 1210))
    self.assertEqual(len(hits), 2)

    first, second = hits
    self.assertEqual(first.attrib['info']['NSV'], 1)
    self.assertEqual(first.attrib['id'], '5')
    self.assertEqual(second.attrib['id'], None)

    records = vset.to_records()
    self.assertEqual(records[0]['attrib']['info']['NSV'], 2)
def test_zip_variants(self):
    """Zip two variant streams read from the same file, ``example3.vcf``.

    Inspects the first and the last tuple produced by ``zip_variants``:
    the leading element, the variant's ``alt`` and the ``ref`` values of
    the overlapping variants.
    """
    path = pkg_file('genomvar.test', 'data/example3.vcf')
    with open(path, 'rt') as fh1, open(path) as fh2:
        left = VCFReader(fh1).iter_vrt()
        right = VCFReader(fh2).iter_vrt()
        # Materialise while both handles are still open.
        matches = list(zip_variants(left, right))

    head_d, head_vrt, head_ovlp = matches[0]
    self.assertEqual(head_d, 0)
    self.assertEqual(head_vrt.alt, '')
    self.assertEqual([o.ref for o in head_ovlp], ['G', ''])

    _tail_d, tail_vrt, tail_ovlp = matches[-1]
    self.assertEqual(tail_vrt.alt, 'CC')
    self.assertEqual([o.ref for o in tail_ovlp], ['TG'])
def test_from_variants_vcf(self):
    """Write a variant set with explicit INFO specs and read it back.

    The INFO specification is given once as a list of tuples and once as
    the ``dtype['info']`` mapping of a set loaded with ``from_vcf``.  For
    each form the written file must contain the ``DP4`` INFO header line
    and the re-read variants must match the originals in number, edit and
    ``NSV`` value.
    """
    vs0 = varset.VariantSet.from_vcf(
        pkg_file('genomvar.test', 'data/example1.vcf'), parse_info=True)
    variants1 = sorted(vs0.iter_vrt(), key=lambda v: v.key)
    vs = VariantSet.from_variants(variants1)

    _desc = 'Test for multinumber field'
    info_spec_tuples = [('DP4', 4, 'Integer', _desc),
                        ('NSV', 1, 'Integer')]
    info_spec_dict = vs0.dtype['info']
    expected_header = \
        '##INFO=<ID=DP4,Number=4,Type=Integer,Description="{}">'\
        .format(_desc)

    for info_spec in (info_spec_tuples, info_spec_dict):
        # One temporary file per spec; the context manager closes (and
        # thereby removes) it even when an assertion fails.
        with tempfile.NamedTemporaryFile(suffix='.vcf') as tf:
            with open(tf.name, 'wt') as fh:
                vs.to_vcf(fh, info_spec=info_spec)

            with open(tf.name, 'rt') as fh:
                written_lines = fh.read().splitlines()
            self.assertIn(expected_header, written_lines)

            variants2 = sorted(
                VCFReader(tf.name).iter_vrt(parse_info=True),
                key=lambda v: v.key)

        self.assertEqual(len(variants1), len(variants2))
        for v1, v2 in zip(variants1, variants2):
            self.assertTrue(v1.edit_equal(v2))
            self.assertEqual(v1.attrib['info']['NSV'],
                             v2.attrib['info']['NSV'])
def test_init(self):
    """Check header length and FORMAT dtype after opening ``example1.vcf``.

    The reader must report a header length of 15 and a single FORMAT entry,
    ``GT``, whose ``dtype`` is a subclass of ``np.object_``.
    """
    path = pkg_file('genomvar.test', 'data/example1.vcf')
    reader = VCFReader(path)
    self.assertEqual(reader.header_len, 15)

    format_dtype = reader._dtype['format']
    self.assertEqual(len(format_dtype), 1)

    gt_spec = format_dtype['GT']
    is_object_dtype = issubclass(gt_spec['dtype'], np.object_)
    failure_msg = 'Got type' + str(gt_spec['type'])
    self.assertTrue(is_object_dtype, msg=failure_msg)
def test_iter_chrom_rows(self):
    """Group rows of ``example2.vcf.gz`` by chromosome.

    The first pass consumes every per-chromosome iterator and checks the
    row counts; the second pass only collects the chromosome names and
    leaves the row iterators untouched.
    """
    reader = VCFReader(pkg_file('genomvar.test', 'data/example2.vcf.gz'))
    expected_chroms = {'chr24', 'chr23'}

    seen = set()
    row_counts = {}
    for chrom, row_iter in reader.iter_rows_by_chrom():
        seen.add(chrom)
        row_counts[chrom] = len(list(row_iter))
    self.assertEqual(seen, expected_chroms)
    self.assertEqual(row_counts['chr23'], 5)
    self.assertEqual(row_counts['chr24'], 4)

    # Same check, but the per-chromosome row iterators are never read.
    seen = set()
    for chrom, _row_iter in reader.iter_rows_by_chrom():
        seen.add(chrom)
    self.assertEqual(seen, expected_chroms)
def test_from_vcf_missing_values(self):
    """Parse sample fields that contain missing (``.``) values.

    A header declaring the ``AD``, ``DP`` and ``GT`` FORMAT fields for
    samples ``S1`` and ``S2`` is rendered into an in-memory buffer, followed
    by three data rows with missing values.  The first variant returned must
    report ``GT`` of ``S1`` (written as ``./.``) as ``(None, None)``.
    """
    format_fields = (RESERVED_FORMAT.AD, RESERVED_FORMAT.DP,
                     RESERVED_FORMAT.GT)
    # Each FORMAT spec exposes its parts as upper-case attributes.
    spec_keys = ('name', 'number', 'type', 'description')
    header = vcf_header.render(
        samples=['S1', 'S2'],
        ctg_len={},
        format=[{k: getattr(spec, k.upper()) for k in spec_keys}
                for spec in format_fields])

    buf = io.StringIO()
    buf.write(header)
    buf.write('chr15\t17017413\t.\tA\tG\t38\t.\t.\tGT\t./.\t0/1\n')
    buf.write('chr15\t17017413\t.\tA\tG\t38\t.\t.\tGT:AD\t./.\t0/1:10,.\n')
    buf.write('chr15\t17017413\t.\tA\tG\t38\t.\t.\tGT:DP\t./.\t1/1:.\n')
    buf.seek(0)

    reader = VCFReader(buf)
    first = list(reader.iter_vrt(parse_samples=True))[0]
    self.assertEqual(first.attrib['samples']['S1']['GT'], (None, None))
def test_from_to_vcf(self):
    """Load ``example1.vcf`` into a ``VariantSet``, write it, read it back.

    The written file must contain the ``DP4`` INFO header line, and the
    re-read variants must match those read directly from the source file in
    number, edit and ``NSV`` value.
    """
    fl = pkg_file('genomvar.test', 'data/example1.vcf')
    variants1 = sorted(VCFReader(fl).iter_vrt(parse_info=True),
                       key=lambda v: v.key)
    vs = VariantSet.from_vcf(fl, parse_info=True)

    # The context manager closes (and thereby removes) the temporary file
    # even when one of the assertions below fails.
    with tempfile.NamedTemporaryFile(suffix='.vcf') as tf:
        with open(tf.name, 'wt') as fh:
            vs.to_vcf(fh)

        with open(tf.name, 'rt') as fh:
            written_lines = fh.read().splitlines()
        self.assertIn(
            '##INFO=<ID=DP4,Number=4,Type=Integer,'
            'Description="Test for multinumber field">',
            written_lines)

        variants2 = sorted(VCFReader(tf.name).iter_vrt(parse_info=True),
                           key=lambda v: v.key)

    self.assertEqual(len(variants1), len(variants2))
    for v1, v2 in zip(variants1, variants2):
        self.assertTrue(v1.edit_equal(v2))
        self.assertEqual(v1.attrib['info']['NSV'],
                         v2.attrib['info']['NSV'])
def test_iter_vrt_example1(self):
    """Inspect the first two variants parsed from ``example1.vcf``.

    Covers the sample list, position/ref/alt, the ``vcf_notation`` row
    number, the allele number, the ``AF`` INFO value and the ``GT`` sample
    value of both variants.
    """
    path = pkg_file('genomvar.test', 'data/example1.vcf')
    reader = VCFReader(path)
    self.assertEqual(reader.samples, ['SAMP1'])

    parsed = list(reader.iter_vrt(parse_info=True, parse_samples=True))
    first, second = parsed[:2]

    # Position, reference and alternative alleles
    self.assertEqual([first.start, first.ref, first.alt], [23, 'G', ''])
    self.assertEqual([second.start, second.ref, second.alt], [24, '', 'G'])

    # Both variants report row 0; they differ in allele number
    first_attrib = first.attrib
    second_attrib = second.attrib
    self.assertEqual(first_attrib['vcf_notation']['row'], 0)
    self.assertEqual(second_attrib['vcf_notation']['row'], 0)
    self.assertEqual(first_attrib['allele_num'], 0)
    self.assertEqual(second_attrib['allele_num'], 1)

    # INFO values
    self.assertEqual(first.attrib['info']['AF'], 0.5)
    self.assertEqual(second.attrib['info']['AF'], 0.5)

    # Per-sample genotype
    self.assertEqual(first.attrib['samples']['SAMP1']['GT'], (0, 1, 0))
    self.assertEqual(second.attrib['samples']['SAMP1']['GT'], (0, 0, 1))
def test_from_variants_to_vcf_with_info(self):
    """Validate INFO specs passed to ``to_vcf`` and round-trip the output.

    Malformed specs must raise ``ValueError`` whose first argument mentions
    ``'INFO spec'``.  With valid specs the written file must contain the
    ``NSV`` INFO header line, and the re-read variants must match the
    originals in number, edit and ``NSV`` value.
    """
    variants1 = sorted(
        VCFReader(pkg_file('genomvar.test', 'data/example1.vcf'))
        .iter_vrt(parse_info=True),
        key=lambda v: v.key)
    vs = VariantSet.from_variants(variants1)

    # Test invalid specs
    invalid_specs = [('NSV', ),
                     ('NSV', 1, 'Integedr'),
                     ('NSV', 'C', 'Integer', 'Number of Simple Variants')]
    _buf = io.StringIO()
    for spec in invalid_specs:
        with self.assertRaises(ValueError) as cm:
            vs.to_vcf(_buf, info_spec=[spec])
        self.assertIn('INFO spec', cm.exception.args[0])

    # The context manager closes (and thereby removes) the temporary file
    # even when one of the assertions below fails.
    with tempfile.NamedTemporaryFile(suffix='.vcf') as tf:
        with open(tf.name, 'wt') as fh:
            vs.to_vcf(fh, info_spec=[
                ('NSV', 1, 'Integer', 'Number of Simple Variants'),
                ('AF', 'A', 'Float', '', 'source', 'version'),
            ])

        with open(tf.name, 'rt') as fh:
            written_lines = fh.read().splitlines()
        self.assertIn(
            '##INFO=<ID=NSV,Number=1,Type=Integer,'
            'Description="Number of Simple Variants">',
            written_lines)

        variants2 = sorted(VCFReader(tf.name).iter_vrt(parse_info=True),
                           key=lambda v: v.key)

    self.assertEqual(len(variants1), len(variants2))
    for v1, v2 in zip(variants1, variants2):
        self.assertTrue(v1.edit_equal(v2))
        self.assertEqual(v1.attrib['info']['NSV'],
                         v2.attrib['info']['NSV'])
def test_change_of_attributes(self):
    """Rows written by ``self.writer`` reflect edited ID/QUAL/FILTER values.

    The first variant of ``example1.vcf`` is written unchanged, then with
    ``id``/``qual``/``filter`` set to ``'.'`` (through the attributes and
    through ``get_row`` keyword arguments) and finally set to ``None``.  All
    modified forms must be rendered with ``.`` in those three columns.
    """
    reader = VCFReader(
        pkg_file('genomvar.test', 'data/example1.vcf'))
    # Close the reader exactly once, even when an assertion below fails.
    self.addCleanup(reader.close)

    vrt = list(reader.iter_vrt())[0]
    self.assertEqual(str(self.writer.get_row(vrt)),
                     'chr24\t23\t1\tAG\tA\t100\tPASS\t.')

    blank_row = 'chr24\t23\t.\tAG\tA\t.\t.\t.'

    vrt2 = copy.deepcopy(vrt)
    vrt2.attrib['id'] = '.'
    vrt2.attrib['qual'] = '.'
    vrt2.attrib['filter'] = '.'
    self.assertEqual(str(self.writer.get_row(vrt2)), blank_row)
    self.assertEqual(
        str(self.writer.get_row(vrt2, id='.', qual='.', filter='.')),
        blank_row)

    vrt3 = copy.deepcopy(vrt)
    vrt3.attrib['id'] = None
    vrt3.attrib['qual'] = None
    vrt3.attrib['filter'] = None
    self.assertEqual(str(self.writer.get_row(vrt3)), blank_row)
def test_to_vcf_row_from_file(self):
    """Rows written from parsed variants match the rows of the source file.

    The file rows of ``example1.vcf`` are split into one row per ALT allele
    with INFO set to ``.`` and FORMAT/SAMPLES dropped, then compared in
    order with the rows ``self.writer`` produces for the parsed variants.
    """
    def _split_multiallelic(rows):
        """Yield one stringified ``VCFRow`` per ALT allele of every row."""
        for row in rows:
            for alt in row.ALT.split(','):
                kwds = {f: getattr(row, f) for f in VCF_FIELDS}
                kwds['ALT'] = alt
                kwds['INFO'] = '.'
                kwds['FORMAT'] = None
                kwds['SAMPLES'] = None
                yield str(VCFRow(**kwds))

    reader = VCFReader(pkg_file('genomvar.test', 'data/example1.vcf'))
    # Close the reader even when an assertion below fails.
    self.addCleanup(reader.close)

    variants = list(reader.iter_vrt(
        parse_info=False, parse_samples=False))
    rows = [str(self.writer.get_row(v)) for v in variants]

    for r1, r2 in zip(_split_multiallelic(reader.iter_rows()), rows):
        # Rows containing this REF/ALT pair are skipped (marked "stripping"
        # in the original); presumably the written row differs from the
        # file row for this allele -- TODO confirm.
        if 'AG\tAGG' in r1:
            continue
        self.assertEqual(r1, r2)