def test_by_chunks(self):
    """Load ``format_def.vcf`` into VariationsArrays with two chunk settings.

    The VCF is parsed twice: once into a default ``VariationsArrays`` and
    once into one created with ``vars_in_chunk=1``.  There are no
    assertions; the test passes when neither load raises.
    """
    vcf_fpath = join(TEST_DATA_DIR, 'format_def.vcf')

    # Default chunk size.  The context manager closes the file even when
    # put_vars raises.
    with open(vcf_fpath, 'rb') as fhand:
        vcf_parser = VCFParser(fhand=fhand, n_threads=None)
        snps = VariationsArrays()
        snps.put_vars(vcf_parser)

    # One variation per chunk.
    with open(vcf_fpath, 'rb') as fhand:
        vcf_parser = VCFParser(fhand=fhand, n_threads=None)
        snps = VariationsArrays(vars_in_chunk=1)
        snps.put_vars(vcf_parser)
def test_vcf_detect_fields(self):
    """Check the metadata produced with ``kept_fields`` / ``ignored_fields``.

    The same VCF is parsed twice: once keeping ``/variations/qual`` and once
    ignoring it.  The test asserts that ``/calls/HQ`` is present in the first
    metadata and that ``/variations/qual`` is absent from the second.
    """
    vcf_fpath = join(TEST_DATA_DIR, 'format_def.vcf')

    # Both handles are released by the with statement, also when an
    # assertion or the parsing itself fails.
    with open(vcf_fpath, 'rb') as vcf_fhand, \
            open(vcf_fpath, 'rb') as vcf_fhand2:
        vcf = VCFParser(vcf_fhand, kept_fields=['/variations/qual'])
        vcf2 = VCFParser(vcf_fhand2, ignored_fields=['/variations/qual'])

        snps = VariationsArrays(ignore_undefined_fields=True)
        snps.put_vars(vcf)
        metadata = snps.metadata

        snps2 = VariationsArrays(ignore_undefined_fields=True)
        snps2.put_vars(vcf2)
        metadata2 = snps2.metadata

        assert '/calls/HQ' in metadata.keys()
        assert '/variations/qual' not in metadata2.keys()
def test_delete_item_from_variationArray(self):
    """Check that ``del snps[path]`` removes the field from ``snps.keys()``."""
    # The with statement closes the VCF even if loading or the assertion
    # fails.
    with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand)
        snps = VariationsArrays(ignore_undefined_fields=True)
        snps.put_vars(vcf)

        del snps['/calls/GT']
        assert '/calls/GT' not in snps.keys()
def test_parser_vcf_filters(self):
    """Check item 6 of every parsed variation for two VCF fixtures.

    For ``format_def_without_info.vcf`` the expected values are lists
    (empty, or holding ``b'q10'``); for ``format_def_without_filter.vcf``
    every value is expected to be ``None``.
    """
    vcf_fpath = join(TEST_DATA_DIR, 'format_def_without_info.vcf')
    with open(vcf_fpath, 'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand)
        # Index 6 is the value the original test collected as "filters".
        filters = [var[6] for var in vcf.variations]
        assert filters == [[], [b'q10'], [], [], []]

    # No filters
    vcf_fpath = join(TEST_DATA_DIR, 'format_def_without_filter.vcf')
    with open(vcf_fpath, 'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand)
        filters = [var[6] for var in vcf.variations]
        assert filters == [None, None, None, None, None]
def test_put_vars_arrays_from_vcf(self):
    """Load ``format_def.vcf`` into a VariationsArrays and check GT and GQ.

    Asserts the shape of ``/calls/GT``, the genotypes stored at index 1 and
    the ``/calls/GQ`` values stored at index 0.
    """
    # The file is closed by the with statement even when an assertion fails.
    with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand)
        snps = VariationsArrays(ignore_undefined_fields=True)
        snps.put_vars(vcf)

        assert snps['/calls/GT'].shape == (5, 3, 2)
        assert numpy.all(snps['/calls/GT'][1] == [[0, 0], [0, 1], [0, 0]])

        expected = numpy.array([48, 48, 43], dtype=numpy.int16)
        assert numpy.all(snps['/calls/GQ'][0, :] == expected)
def _create_var_mat_objs_from_vcf(vcf_fpath, kwargs, kept_fields=None,
                                  ignored_fields=None):
    """Yield one loaded variation matrix per class in ``VAR_MAT_CLASSES``.

    For every class the VCF is reopened (through ``gzip.open`` when the path
    ends in ``.gz``), parsed with ``VCFParser(fhand=..., **kwargs)`` and
    loaded into the object returned by ``_init_var_mat(klass)``.

    Args:
        vcf_fpath: Path of the VCF file, plain or gzip-compressed.
        kwargs: Dict of extra keyword arguments forwarded to ``VCFParser``.
        kept_fields: Not used by this function.
        ignored_fields: Not used by this function.

    Yields:
        The object returned by ``_init_var_mat`` after ``put_vars`` has run.
    """
    # NOTE(review): kept_fields and ignored_fields are accepted but never
    # forwarded; callers that want them applied presumably have to put them
    # in ``kwargs`` -- confirm whether that is intended.
    for klass in VAR_MAT_CLASSES:
        opener = gzip.open if vcf_fpath.endswith('.gz') else open
        # The file is closed before yielding, and also when parsing raises.
        with opener(vcf_fpath, 'rb') as fhand:
            vcf_parser = VCFParser(fhand=fhand, **kwargs)
            out_snps = _init_var_mat(klass)
            out_snps.put_vars(vcf_parser)
        yield out_snps
def _parse_vcf(chrom, vcf_fpath, tmp_dir, kept_fields, ignored_fields):
    """Write the variations of one chromosome to a temporary HDF5 file.

    Args:
        chrom: Chromosome name as bytes; decoded to build the file prefix.
        vcf_fpath: VCF path handed to ``get_vcf_lines_for_chrom``.
        tmp_dir: Directory in which the temporary file name is generated.
        kept_fields: Forwarded to both ``VariationsH5`` and ``VCFParser``.
        ignored_fields: Forwarded to both ``VariationsH5`` and ``VCFParser``.

    Returns:
        The path of the HDF5 file that was written.  The file is not deleted
        here; the caller owns it.
    """
    # Only the unique name is wanted: leaving the with block closes the
    # temporary file, which (with the default delete=True) also removes it,
    # so VariationsH5 can create a fresh file at that path.
    with NamedTemporaryFile(prefix=chrom.decode() + '.', suffix='.tmp.h5',
                            dir=tmp_dir) as tmp_h5_fhand:
        tmp_h5_fpath = tmp_h5_fhand.name

    tmp_h5 = VariationsH5(tmp_h5_fpath, 'w', ignore_undefined_fields=True,
                          kept_fields=kept_fields,
                          ignored_fields=ignored_fields)
    try:
        vcf_parser = VCFParser(get_vcf_lines_for_chrom(chrom, vcf_fpath),
                               kept_fields=kept_fields,
                               ignored_fields=ignored_fields)
        tmp_h5.put_vars(vcf_parser)
    finally:
        # Close the HDF5 file even when parsing or writing fails.
        tmp_h5.close()
    return tmp_h5_fpath
def test_put_vars_hdf5_from_vcf(self):
    """Load ``format_def.vcf`` into a VariationsH5 and check alt, GT and GQ."""
    # The VCF handle is closed by the with statement even when an assertion
    # fails.
    with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand)
        with NamedTemporaryFile(suffix='.hdf5') as fhand:
            # Remove the empty temporary file so VariationsH5 creates the
            # HDF5 file itself at the same path.
            os.remove(fhand.name)
            h5f = VariationsH5(fhand.name, 'w', ignore_undefined_fields=True)
            h5f.put_vars(vcf)

            expected_alt = [[b'A', b''],
                            [b'A', b''],
                            [b'G', b'T'],
                            [b'', b''],
                            [b'G', b'GTACT']]
            assert numpy.all(h5f['/variations/alt'][:] == expected_alt)

            assert h5f['/calls/GT'].shape == (5, 3, 2)
            assert numpy.all(h5f['/calls/GT'][1] == [[0, 0], [0, 1], [0, 0]])

            expected = numpy.array([48, 48, 43], dtype=numpy.int16)
            assert numpy.all(h5f['/calls/GQ'][0, :] == expected)
def test_generated_vcf_feed_outputs_equal_vcfs(self):
    """Check that a VCF written from HDF5 survives a parse/write round trip.

    An HDF5 fixture is exported to a temporary VCF, that VCF is parsed into
    a ``VariationsArrays`` and exported again, and the two VCF files are
    compared line by line.
    """
    h5_vars = VariationsH5(
        join(TEST_DATA_DIR, 'tomato.apeki_gbs.calmd.1stchunk.h5'), "r")
    try:
        with NamedTemporaryFile(mode='wb') as vcf_vars_from_h5:
            write_vcf(h5_vars, vcf_vars_from_h5)
            # Flush so the file can be read back through its name.
            vcf_vars_from_h5.flush()

            with open(vcf_vars_from_h5.name, 'rb') as vcf_fhand:
                vcf = VCFParser(vcf_fhand)
                vcf_vars_parsed = VariationsArrays()
                vcf_vars_parsed.put_vars(vcf)

            with NamedTemporaryFile(mode='wb') as vcf_vars_from_vcf:
                vcf_vars_parsed.write_vcf(vcf_vars_from_vcf)
                vcf_vars_from_vcf.flush()

                with open(vcf_vars_from_h5.name, 'rb') as vcf_from_h5_fhand, \
                        open(vcf_vars_from_vcf.name,
                             'rb') as vcf_from_vcf_fhand:
                    # NOTE(review): zip stops at the shorter file, so extra
                    # trailing lines in one of them are not compared.
                    for line_parsed_from_h5, line_parsed_from_vcf in zip(
                            vcf_from_h5_fhand, vcf_from_vcf_fhand):
                        assert line_parsed_from_h5 == line_parsed_from_vcf, "when importing from a generated VCF and exporting to a new VCF both files must be the same"
    finally:
        # Release the HDF5 fixture whether or not the comparison succeeded.
        h5_vars.close()
def test_count_alleles(self):
    """Check ``allele_count`` for every class in ``VAR_MAT_CLASSES``.

    First the ``/calls/GT`` chunks of ``ril.hdf5`` are copied into a fresh
    matrix and ``allele_count`` is required to contain a truthy value.  Then
    ``format_def.vcf`` is loaded and ``allele_count`` is compared with a
    fixed expected table.
    """
    for klass in VAR_MAT_CLASSES:
        in_snps = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        try:
            var_mat = _init_var_mat(klass)
            chunks = in_snps.iterate_chunks(kept_fields=['/calls/GT'])
            var_mat.put_chunks(chunks)
            assert numpy.any(var_mat.allele_count)
        finally:
            # Close the input in finally so a failure does not leave it open.
            in_snps.close()

    expected = [[3, 3, 0], [5, 1, 0], [0, 2, 4], [6, 0, 0], [2, 3, 1]]
    for klass in VAR_MAT_CLASSES:
        with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as fhand:
            vcf_parser = VCFParser(fhand=fhand)
            var_mat = _init_var_mat(klass)
            var_mat.put_vars(vcf_parser)
            assert numpy.all(var_mat.allele_count == expected)
def main():
    """Command-line entry point: convert a VCF file into an HDF5 file.

    The arguments come from ``_setup_argparse`` / ``_parse_args``.  An input
    path whose last dot-separated component is ``gz`` is read through
    ``read_gzip_file``; any other path is opened as a plain binary file.
    """
    description = 'Transforms VCF file into HDF5 format'
    parser = _setup_argparse(description=description)
    args = _parse_args(parser)

    in_fpath = args['in_fpath']
    is_gzipped = in_fpath.split('.')[-1] == 'gz'
    if is_gzipped:
        fhand = read_gzip_file(in_fpath)
    else:
        fhand = open(in_fpath, 'rb')

    try:
        max_field_lens = {
            'CALLS': {b'AO': args['alt_gt_num']},
            'alt': args['alt_gt_num'],
        }
        vcf_parser = VCFParser(fhand=fhand,
                               pre_read_max_size=args['pre_read_max_size'],
                               ignored_fields=args['ignored_fields'],
                               kept_fields=args['kept_fields'],
                               max_field_lens=max_field_lens)
        h5 = VariationsH5(args['out_fpath'], mode='w')
        try:
            h5.put_vars(vcf_parser)
        finally:
            # Close the output even when the conversion fails.
            h5.close()
    finally:
        # Only the handle opened here is closed; what read_gzip_file returns
        # is not visible from this function, so it is left untouched.
        if not is_gzipped:
            fhand.close()
def test_samples(self):
    """Check that ``samples`` can be assigned and read back.

    The property is exercised on an in-memory ``VariationsArrays`` and then
    on a ``VariationsH5`` loaded from ``phylome.sample.vcf``.
    """
    gts = numpy.array([[[0, 0], [0, 1], [2, 2], [-1, 3]],
                       [[0, 0], [0, 0], [1, 1], [2, 2]],
                       [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
    varis = VariationsArrays()
    varis[GT_FIELD] = gts
    varis.samples = [1, 2, 3, 4]
    assert varis.samples == [1, 2, 3, 4]

    # With another file
    # Only the unique name is kept; the temporary file itself is closed
    # (and thereby removed) on leaving the with block.
    with NamedTemporaryFile() as tmp_fhand:
        path = tmp_fhand.name

    # The with statement closes the VCF even when loading fails.
    vcf_fpath = join(TEST_DATA_DIR, 'phylome.sample.vcf')
    with open(vcf_fpath, 'rb') as fhand:
        vcf_parser = VCFParser(fhand=fhand)
        h5 = VariationsH5(path, mode='w', ignore_undefined_fields=True)
        h5.put_vars(vcf_parser)

    # NOTE(review): the HDF5 file written at ``path`` is neither closed nor
    # removed by this test.
    samples = h5.samples
    samples[0] = '0'
    h5.samples = samples
def test_write_meta_header(self):
    """Check that the written VCF meta/header lines exist in the source VCF.

    For each fixture the ``#`` lines are collected, the file is loaded into
    a ``VariationsArrays``, meta and header are written to a temporary file
    and every written line must be one of the collected lines.
    """
    fnames = [
        'format_def_without_info.vcf',
        'format_def_without_filter.vcf',
        'format_without_flt_info_qual.vcf',
    ]
    for fname in fnames:
        vcf_fpath = join(TEST_DATA_DIR, fname)

        # Lines starting with '#' in the source file.
        with open(vcf_fpath, 'rb') as vcf_fhand:
            header_lines = [
                line for line in vcf_fhand if line.startswith(b'#')
            ]

        with open(vcf_fpath, 'rb') as vcf_fhand:
            vcf = VCFParser(vcf_fhand)
            var_array = VariationsArrays(ignore_undefined_fields=True)
            var_array.put_vars(vcf)

            with NamedTemporaryFile(suffix='.h5') as tmp_fhand:
                _write_vcf_meta(var_array, tmp_fhand, vcf_format='VCFv4.0')
                _write_vcf_header(var_array, tmp_fhand)
                # Flush so the content can be read back through the name.
                tmp_fhand.flush()

                with open(tmp_fhand.name, 'rb') as retmp_fhand:
                    for line in retmp_fhand:
                        assert line in header_lines
def test_set_to_missing(self):
    """Check ``copy_setting_gts_to_missing`` on an HDF5 and an in-memory set.

    With ``ril.hdf5`` and a rate of 0.9 the number of genotypes that differ
    from the original must equal the rounded 90% of the non-missing ones,
    and none of the changed genotypes may have been missing originally.
    With a small hand-made genotype array, a rate of 0.5 and the NumPy seed
    fixed to 1, the exact resulting array is checked.
    """
    orig_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    try:
        noisy_vars = copy_setting_gts_to_missing(orig_vars,
                                                 gt_rate_to_missing=0.9)
        orig_gts = orig_vars[GT_FIELD][...]
        noise_gts = noisy_vars[GT_FIELD]
        assert orig_gts.shape == noise_gts.shape

        mask_different_gts = orig_gts != noise_gts
        expected_num_gts_set_to_missing = int(
            round(numpy.sum(orig_gts != MISSING_INT) * 0.9))
        assert expected_num_gts_set_to_missing == mask_different_gts.sum()
        assert not numpy.sum(orig_gts[mask_different_gts] == MISSING_INT)
    finally:
        # Release the HDF5 fixture even when an assertion fails.
        orig_vars.close()

    with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
        vcf = VCFParser(vcf_fhand)
        snps = VariationsArrays(ignore_undefined_fields=True)
        snps.put_vars(vcf)

    # Fixed seed: the expected array below depends on it.
    numpy.random.seed(1)
    gts = numpy.array([[[0, 1], [1, 0], [-1, 1]],
                       [[0, 0], [0, 1], [-1, 0]],
                       [[-1, 2], [2, 1], [-1, 2]],
                       [[0, 0], [-1, 0], [1, 0]],
                       [[0, 1], [-1, 2], [1, 1]]])
    expected_gts = numpy.array([[[0, 1], [1, 0], [-1, 1]],
                                [[-1, -1], [0, 1], [-1, 0]],
                                [[-1, 2], [2, 1], [-1, 2]],
                                [[0, 0], [-1, 0], [-1, -1]],
                                [[-1, -1], [-1, 2], [-1, -1]]])
    # Replace the genotypes loaded from the VCF with the hand-made ones.
    del snps[GT_FIELD]
    snps[GT_FIELD] = gts

    noisy_vars = copy_setting_gts_to_missing(snps, gt_rate_to_missing=0.5)
    noise_gts = noisy_vars[GT_FIELD]
    assert numpy.all(noise_gts == expected_gts)
def test_vcf_to_hdf5(self):
    """Convert two VCF fixtures to HDF5 and check the stored datasets.

    ``format_def.vcf`` is written with ``vars_in_chunk=2`` and reopened
    through ``VariationsH5``; calls, filter and info datasets are compared
    with fixed expectations.  ``phylome.sample.vcf`` is written with the
    default settings and reopened directly through ``h5py.File``.
    """
    # Only the unique name is kept; the temporary file itself is closed
    # (and thereby removed) on leaving the with block.
    with NamedTemporaryFile() as tmp_fhand:
        path = tmp_fhand.name
    try:
        with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as fhand:
            vcf_parser = VCFParser(fhand=fhand, n_threads=None)
            h5 = VariationsH5(path, mode='w', ignore_undefined_fields=True,
                              vars_in_chunk=2)
            h5.put_vars(vcf_parser)

        h5 = VariationsH5(path, 'r')
        assert h5['/calls/GT'].shape == (5, 3, 2)
        assert numpy.all(h5['/calls/GT'][1] == [[0, 0], [0, 1], [0, 0]])

        expected = numpy.array([[[51, 51], [51, 51], [-1, -1]],
                                [[58, 50], [65, 3], [-1, -1]],
                                [[23, 27], [18, 2], [-1, -1]],
                                [[56, 60], [51, 51], [-1, -1]],
                                [[-1, -1], [-1, -1], [-1, -1]]],
                               dtype=numpy.int16)
        assert numpy.all(h5['/calls/HQ'][:] == expected)

        expected = numpy.array([48, 48, 43], dtype=numpy.int16)
        assert numpy.all(h5['/calls/GQ'][0, :] == expected)

        # Variations filters fields
        expected = numpy.array([1, 0, 1, 1, 1])
        assert numpy.all(h5['/variations/filter/q10'][:] == expected)
        expected = numpy.array([1, 1, 1, 1, 1])
        assert numpy.all(h5['/variations/filter/s50'][:] == expected)

        # Variations info fields
        expected = numpy.array([[0.5, numpy.nan],
                                [0.01699829, numpy.nan],
                                [0.33300781, 0.66699219],
                                [numpy.nan, numpy.nan],
                                [numpy.nan, numpy.nan]])
        af = h5['/variations/info/AF'][:]
        assert numpy.allclose(af, expected, equal_nan=True, atol=0.01)

        expected = numpy.array([3, 3, 2, 3, 3])
        assert numpy.all(h5['/variations/info/NS'][:] == expected)
        expected = numpy.array([14, 11, 10, 13, 9])
        assert numpy.all(h5['/variations/info/DP'][:] == expected)
        expected = numpy.array([True, False, True, False, False])
        assert numpy.all(h5['/variations/info/DB'][:] == expected)
        expected = numpy.array([True, False, False, False, False])
        assert numpy.all(h5['/variations/info/H2'][:] == expected)
    finally:
        # Remove the output even when an assertion fails; the existence
        # check keeps a failed creation from masking the original error.
        if os.path.exists(path):
            os.remove(path)

    # With another file
    with NamedTemporaryFile() as tmp_fhand:
        path = tmp_fhand.name
    try:
        with open(join(TEST_DATA_DIR, 'phylome.sample.vcf'), 'rb') as fhand:
            vcf_parser = VCFParser(fhand=fhand)
            h5 = VariationsH5(path, mode='w', ignore_undefined_fields=True)
            h5.put_vars(vcf_parser)

        h5 = h5py.File(path, 'r')
        assert numpy.all(h5['/calls/GT'].shape == (2, 42, 2))
        assert numpy.all(h5['/calls/GT'][1, 12] == [1, 1])
        assert numpy.all(h5['/calls/GL'][0, 0, 0] == 0)
    finally:
        if os.path.exists(path):
            os.remove(path)