def test_merge_variations(self):
    """Merge two HDF5 variation files in both orders and check the result.

    Each merge is loaded into a VariationsArrays and every resulting field
    is compared with the matching field of a stored expected HDF5 file
    ('expected_merged.h5' for the first order, 'expected_merged2.h5' for
    the swapped order). Float fields are compared after remove_nans.
    """
    h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
    h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
    merger = VarMerger(h5_1, h5_2, max_field_lens={'alt': 3},
                       ignore_complex_overlaps=True,
                       check_ref_matches=False)
    assert merger.ploidy == 2
    assert merger.samples == [b'TS-1', b'TS-11', b'TS-21',
                              b'NA00001', b'NA00002', b'NA00003']
    expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged.h5'),
                               'r')
    new_vars = VariationsArrays(ignore_overflows=True,
                                ignore_undefined_fields=True)
    new_vars.put_vars(merger)
    for field in new_vars.keys():
        if 'float' in str(new_vars[field][:].dtype):
            assert numpy.all(remove_nans(expected_h5[field][:]) ==
                             remove_nans(new_vars[field][:]))
        else:
            result = new_vars[field][:]
            try:
                assert numpy.all(expected_h5[field][:] == result)
            except AssertionError:
                # Show which field differs, then let the test fail.
                # Without the re-raise a mismatch would be swallowed and
                # the test would pass.
                print(field)
                print(expected_h5[field][:])
                print(result)
                raise

    # Change the order
    h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
    h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
    merger = VarMerger(h5_2, h5_1, max_field_lens={'alt': 3},
                       ignore_complex_overlaps=True,
                       check_ref_matches=False)
    assert merger.ploidy == 2
    assert merger.samples == [b'NA00001', b'NA00002', b'NA00003',
                              b'TS-1', b'TS-11', b'TS-21']
    expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged2.h5'),
                               'r')
    new_vars = VariationsArrays(ignore_overflows=True,
                                ignore_undefined_fields=True)
    new_vars.put_vars(merger)
    for field in new_vars.keys():
        if 'float' in str(new_vars[field][:].dtype):
            assert numpy.all(remove_nans(expected_h5[field][:]) ==
                             remove_nans(new_vars[field][:]))
        else:
            result = new_vars[field][:]
            assert numpy.all(expected_h5[field][:] == result)
def _preprocess_format_calls_paths(variations, var_index, calls_paths):
    """Pick the call fields that carry data for a single variation.

    For every path in calls_paths the values stored at var_index are read
    and passed through remove_nans. A path is kept only when what remains
    is non-empty and is not made up entirely of the missing value
    registered in MISSING_VALUES for its dtype.

    Returns a tuple (kept_call_paths, kept_format_names), where each format
    name is the last '/'-separated component of the corresponding path.
    """
    kept_paths = []
    kept_formats = []
    for path in calls_paths:
        row = remove_nans(variations[path][var_index])
        missing = MISSING_VALUES[row.dtype]
        # Skip fields with nothing to report for this variation.
        if np.all(row == missing) or row.shape[0] == 0:
            continue
        kept_paths.append(path)
        kept_formats.append(path.split('/')[-1])
    return kept_paths, kept_formats
def _get_info_value(variations, index, info_paths, metadata, num_alt): info = [] for key in info_paths: dtype = str(variations[key].dtype) value = variations[key][index] field_key = key.split('/')[-1] meta_number = metadata[key]['Number'] if 'bool' in dtype: if value: info.append(field_key) elif meta_number == 'A': try: value = value[:num_alt] except IndexError: if num_alt == 1: value = numpy.array(value) else: raise if '|S' in dtype: value = [val.decode() for val in value] else: value = remove_nans(value) value = [str(val) for val in value] value = ','.join(value) if value else None elif meta_number == 1: if '|S' in dtype: value = value.decode() else: value = str(value) elif meta_number > 1: value = [str(val) for val in value if not numpy.isnan(val)] value = ','.join(value) if value else None elif not value: value = None else: raise(NotImplemented) if 'bool' not in dtype and value and value is not None: info.append('{}={}'.format(field_key, value)) return ';'.join(info)
def _get_info_value(variations, index, info_paths, metadata, num_alt): info = [] for key in info_paths: dtype = str(variations[key].dtype) value = variations[key][index] field_key = key.split('/')[-1] meta_number = metadata[key]['Number'] if 'bool' in dtype: if value: info.append(field_key) elif meta_number == 'A': try: value = value[:num_alt] except IndexError: if num_alt == 1: value = np.array(value) else: raise if '|S' in dtype: value = [val.decode() for val in value] else: value = remove_nans(value) value = [str(val) for val in value] value = ','.join(value) if value else None elif meta_number == 1: if '|S' in dtype: value = value.decode() else: value = str(value) elif meta_number > 1: value = [str(val) for val in value if not np.isnan(val)] value = ','.join(value) if value else None elif not value: value = None else: raise(NotImplemented) if 'bool' not in dtype and value and value is not None: info.append('{}={}'.format(field_key, value)) return ';'.join(info)
def test_merge_variations(self):
    """Merge two HDF5 variation files in both orders and check the result.

    Each merge is loaded into a VariationsArrays and every resulting field
    is compared with the matching field of a stored expected HDF5 file
    ('expected_merged.h5' for the first order, 'expected_merged2.h5' for
    the swapped order). In the first order float fields are compared with
    numpy.allclose and other fields must match in shape and content; in
    the swapped order all fields are compared for exact equality. Float
    fields are passed through remove_nans before comparing.
    """
    h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
    h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
    merger = VarMerger(h5_1, h5_2, max_field_lens={'alt': 3},
                       ignore_complex_overlaps=True,
                       check_ref_matches=False)
    assert merger.ploidy == 2
    assert merger.samples == [b'TS-1', b'TS-11', b'TS-21',
                              b'NA00001', b'NA00002', b'NA00003']
    expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged.h5'),
                               'r')
    new_vars = VariationsArrays(ignore_undefined_fields=True)
    new_vars.put_vars(merger)

    for field in new_vars.keys():
        result = new_vars[field][:]
        expected = expected_h5[field][:]
        if 'float' in str(result.dtype):
            assert numpy.allclose(remove_nans(expected),
                                  remove_nans(result))
            continue
        try:
            if not expected.shape == result.shape:
                raise AssertionError('comparison failed for field: ' +
                                     field)
            assert numpy.all(expected == result)
        except (AssertionError, ValueError, TypeError):
            # Show the offending field before letting the test fail.
            print(field)
            print(expected)
            print(result)
            raise

    # Change the order
    h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
    h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
    merger = VarMerger(h5_2, h5_1, max_field_lens={'alt': 3},
                       ignore_complex_overlaps=True,
                       check_ref_matches=False)
    assert merger.ploidy == 2
    assert merger.samples == [b'NA00001', b'NA00002', b'NA00003',
                              b'TS-1', b'TS-11', b'TS-21']
    expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged2.h5'),
                               'r')
    new_vars = VariationsArrays(ignore_undefined_fields=True)
    new_vars.put_vars(merger)
    for field in new_vars.keys():
        result = new_vars[field][:]
        expected = expected_h5[field][:]
        if 'float' in str(result.dtype):
            assert numpy.all(remove_nans(expected) == remove_nans(result))
        else:
            assert numpy.all(expected == result)
def test_vcf_to_hdf5(self):
    """Write parsed VCF files to HDF5 and check the stored arrays.

    Two inputs are exercised: 'format_def.vcf', whose calls, filter and
    info datasets are compared against literal expected values, and
    'phylome.sample.vcf', which is reopened directly with h5py and spot
    checked. The temporary HDF5 file of each part is removed even when an
    assertion fails.
    """
    # NamedTemporaryFile is used only to obtain an unused path; the file
    # itself is created by VariationsH5 below.
    tmp_fhand = NamedTemporaryFile()
    path = tmp_fhand.name
    tmp_fhand.close()
    try:
        with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as fhand:
            vcf_parser = VCFParser(fhand=fhand, max_field_lens={'alt': 5},
                                   n_threads=None, pre_read_max_size=1000)
            h5 = VariationsH5(path, mode='w', ignore_undefined_fields=True)
            h5.put_vars(vcf_parser)

        h5 = VariationsH5(path, 'r')
        assert h5['/calls/GT'].shape == (5, 3, 2)
        assert numpy.all(h5['/calls/GT'][1] == [[0, 0], [0, 1], [0, 0]])
        expected = numpy.array([[[51, 51], [51, 51], [-1, -1]],
                                [[58, 50], [65, 3], [-1, -1]],
                                [[23, 27], [18, 2], [-1, -1]],
                                [[56, 60], [51, 51], [-1, -1]],
                                [[-1, -1], [-1, -1], [-1, -1]]],
                               dtype=numpy.int16)
        assert numpy.all(h5['/calls/HQ'][:] == expected)
        expected = numpy.array([48, 48, 43], dtype=numpy.int16)
        assert numpy.all(h5['/calls/GQ'][0, :] == expected)

        # Variations filters fields
        expected = numpy.array([False, True, False, False, False])
        assert numpy.all(h5['/variations/filter/q10'][:] == expected)
        expected = numpy.array([False, False, False, False, False])
        assert numpy.all(h5['/variations/filter/s50'][:] == expected)
        expected = [True, False, True, True, True]
        assert numpy.all(h5['/variations/filter/PASS'][:] == expected)

        # Variations info fields
        expected = remove_nans(numpy.array([[0.5, numpy.nan],
                                            [0.01699829, numpy.nan],
                                            [0.33300781, 0.66699219],
                                            [numpy.nan, numpy.nan],
                                            [numpy.nan, numpy.nan]],
                                           dtype=numpy.float16))
        af = remove_nans(h5['/variations/info/AF'][:])
        assert numpy.all(af == expected)
        expected = numpy.array([3, 3, 2, 3, 3])
        assert numpy.all(h5['/variations/info/NS'][:] == expected)
        expected = numpy.array([14, 11, 10, 13, 9])
        assert numpy.all(h5['/variations/info/DP'][:] == expected)
        expected = numpy.array([True, False, True, False, False])
        assert numpy.all(h5['/variations/info/DB'][:] == expected)
        expected = numpy.array([True, False, False, False, False])
        assert numpy.all(h5['/variations/info/H2'][:] == expected)
    finally:
        # Clean up even when an assertion above fails.
        if os.path.exists(path):
            os.remove(path)

    # With another file
    tmp_fhand = NamedTemporaryFile()
    path = tmp_fhand.name
    tmp_fhand.close()
    try:
        with open(join(TEST_DATA_DIR, 'phylome.sample.vcf'), 'rb') as fhand:
            vcf_parser = VCFParser(fhand=fhand, pre_read_max_size=1000)
            h5 = VariationsH5(path, mode='w', ignore_undefined_fields=True)
            h5.put_vars(vcf_parser)

        with h5py.File(path, 'r') as h5:
            assert numpy.all(h5['/calls/GT'].shape == (2, 42, 2))
            assert numpy.all(h5['/calls/GT'][1, 12] == [1, 1])
            assert numpy.all(h5['/calls/GL'][0, 0, 0] == 0)
    finally:
        if os.path.exists(path):
            os.remove(path)