def test_vlen_ascii(self):
    dt = h5py.string_dtype(encoding='ascii')
    string_info = h5py.check_string_dtype(dt)
    assert string_info.encoding == 'ascii'
    assert string_info.length is None
    assert h5py.check_vlen_dtype(dt) is bytes

def test_vlen_utf8(self):
    dt = h5py.string_dtype()
    string_info = h5py.check_string_dtype(dt)
    assert string_info.encoding == 'utf-8'
    assert string_info.length is None
    assert h5py.check_vlen_dtype(dt) is str

def test_fixed_utf8(self):
    dt = h5py.string_dtype(length=10)
    string_info = h5py.check_string_dtype(dt)
    assert string_info.encoding == 'utf-8'
    assert string_info.length == 10
    assert h5py.check_vlen_dtype(dt) is None

def test_fixed_ascii(self):
    dt = h5py.string_dtype(encoding='ascii', length=10)
    string_info = h5py.check_string_dtype(dt)
    assert string_info.encoding == 'ascii'
    assert string_info.length == 10
    assert h5py.check_vlen_dtype(dt) is None

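# The dtype-introspection tests above never touch a file. As an illustration
# (not part of the suite), a minimal round-trip sketch attaching one of these
# dtypes to a real dataset; 'strings.h5' is a scratch file name.

import h5py

with h5py.File('strings.h5', 'w') as f:
    dt = h5py.string_dtype()  # variable-length UTF-8, as in test_vlen_utf8
    ds = f.create_dataset('words', (2,), dtype=dt)
    ds[:] = ['foo', 'bar']

with h5py.File('strings.h5', 'r') as f:
    ds = f['words']
    assert h5py.check_vlen_dtype(ds.dtype) is str
    print(ds.asstr()[:])  # read back as python str rather than bytes
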
def test_vlen_enum(self):
    fname = self.mktemp()
    arr1 = [[1], [1, 2]]
    dt1 = h5py.vlen_dtype(h5py.enum_dtype(dict(foo=1, bar=2), 'i'))
    with h5py.File(fname, 'w') as f:
        df1 = f.create_dataset('test', (len(arr1),), dtype=dt1)
        df1[:] = np.array(arr1)
    with h5py.File(fname, 'r') as f:
        df2 = f['test']
        dt2 = df2.dtype
        arr2 = [e.tolist() for e in df2[:]]
        self.assertEqual(arr1, arr2)
        self.assertEqual(h5py.check_enum_dtype(h5py.check_vlen_dtype(dt1)),
                         h5py.check_enum_dtype(h5py.check_vlen_dtype(dt2)))

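# For reference, the dtype introspection exercised by the enum test's final
# assertion, written out interactively; the values are those asserted above.

import h5py

dt = h5py.vlen_dtype(h5py.enum_dtype(dict(foo=1, bar=2), 'i'))
base = h5py.check_vlen_dtype(dt)  # the enum's integer base dtype, with metadata
h5py.check_enum_dtype(base)       # -> {'foo': 1, 'bar': 2}
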
def _test_dset_val(zobj, hobj, hobj_info):
    if (hobj_info.type == h5py.h5o.TYPE_DATASET
            and self._checkdtype_structobjref(hobj) == (False, False)
            and h5py.check_vlen_dtype(hobj.dtype)):
        hobj = hobj.asstr()  # wrapper to read data as python str
        hval = hobj[()]
        zval = zobj[()]
        assert_array_equal(hval, zval)

def _test_dset_val(zobj, hobj, hobj_info):
    if (hobj_info.type == h5py.h5o.TYPE_DATASET
            and self._checkdtype_structobjref(hobj) == (True, False)
            and not h5py.check_vlen_dtype(hobj.dtype)):
        hval = hobj[()]
        zval = zobj[()]
        for dt_name in hobj.dtype.names:
            assert_array_equal(hval[dt_name], zval[dt_name])

def _test_dset_val(zobj, hobj, hobj_info):
    if (hobj_info.type == h5py.h5o.TYPE_DATASET
            and self._checkdtype_structobjref(hobj) == (False, True)
            and not h5py.check_vlen_dtype(hobj.dtype)):
        hval = hobj[()]
        zval = zobj[()]
        # function to get reference target names
        ref_array_func = np.frompyfunc(
            lambda x: h5py.h5i.get_name(
                h5py.h5r.dereference(x, self.hfile.id)), 1, 1)
        if hobj.shape != ():
            hval_str = ref_array_func(hval).astype(str)
        else:
            hval_str = h5py.h5i.get_name(
                h5py.h5r.dereference(hval, self.hfile.id))
            hval_str = hval_str.decode('utf-8')
        if self.hfile.name == '/':
            assert_array_equal(hval_str, zval)
        else:
            assert_array_equal(
                np.frompyfunc(
                    lambda x: x if x.startswith(self.hfile.name) else '',
                    1, 1)(hval_str),
                zval)

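# The three _test_dset_val variants above branch on a (structured, objref)
# pair returned by _checkdtype_structobjref, which is not shown in this
# section. A plausible sketch of that helper, under the assumption that it
# classifies the dataset's dtype; this is a hypothetical reconstruction, not
# the library's actual implementation.

def _checkdtype_structobjref(self, hobj):
    # Hypothetical reconstruction: (is_structured, is_object_reference),
    # matching the tuples the dispatch code above compares against.
    is_struct = hobj.dtype.names is not None
    is_objref = h5py.check_ref_dtype(hobj.dtype) is not None
    return (is_struct, is_objref)
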
def test_open_as_zarr_dset_values(self, request, capsys, filesbase):
    # get list of files passed by the hdf5files arg;
    # if hdf5files is not specified, file_list contains generated hdf5 files
    num_files = filesbase.num_files
    file_list = filesbase.file_list[0:num_files]

    # find list of datasets in files
    if len(self.objnames) != 0:
        dset_list = [f[name] for name in self.objnames for f in file_list
                     if (name in f and isinstance(f[name], h5py.Dataset))]
        if len(dset_list) == 0:
            message = f"No given file contains {self.objnames}"
            with capsys.disabled():
                print("\n" + message.rjust(len(request.node.nodeid)), end='')
            pytest.skip(message)
    # if objnames arg is not passed, select datasets for each file
    else:
        numdset = request.config.getoption('numdataset')
        if numdset <= 0:
            pytest.skip(f"numdataset: {numdset}")
        dset_names = []

        def _get_dsets(name, info):
            nonlocal dset_names
            if info.type == h5py.h5o.TYPE_DATASET:
                dset_names.append(name.decode('utf-8'))

        dset_list = []
        for hfile in file_list:
            dset_names = []
            h5py.h5o.visit(hfile.id, _get_dsets, info=True)
            names = dset_names[0:numdset]
            for name in names:
                dset_list.append(hfile[name])
                message = (f"objnames not specified. open_as_zarr run with "
                           f"file: {hfile.filename}, dataset: {name}")
                with capsys.disabled():
                    print("\n" + message.rjust(len(request.node.nodeid)), end='')

    for dset in dset_list:
        with capsys.disabled():
            print("\n" + f"file: {dset.file.filename}, dataset: {dset} :".rjust(
                len(request.node.nodeid)), end='')
            print("\n" + f"dataset: {dset.name}, data :".rjust(
                len(request.node.nodeid)), end='')

        # call open_as_zarr
        if not dset.dtype.hasobject:
            zarray = open_as_zarr(dset)  # dataset does not have object references
        else:
            zarray = open_as_zarr(dset, collectrefs=True)  # dataset with object references

        # test values when dtype is variable length
        if h5py.check_vlen_dtype(dset.dtype) is not None:
            dset_str = dset.asstr()  # wrapper to read data as python str
            assert_array_equal(dset_str, zarray)
        # test values when dtype is structured array with object reference
        elif dset.dtype.hasobject and dset.dtype.names is not None:
            hval = dset[()]
            # function to get reference target names
            ref_array_func = np.frompyfunc(
                lambda x: h5py.h5i.get_name(
                    h5py.h5r.dereference(x, dset.file.id)), 1, 1)
            for dtname in dset.dtype.names:
                if dset.dtype[dtname].hasobject:
                    if dset.shape != ():
                        hval_str = ref_array_func(hval[dtname]).astype(str)
                    else:
                        hval_str = h5py.h5i.get_name(
                            h5py.h5r.dereference(hval[dtname], dset.file.id))
                        hval_str = hval_str.decode('utf-8')
                    assert_array_equal(hval_str, zarray[dtname])
        # test values when dtype is object reference
        elif dset.dtype.hasobject and dset.dtype.names is None:
            hval = dset[()]
            # function to get reference target names
            ref_array_func = np.frompyfunc(
                lambda x: h5py.h5i.get_name(
                    h5py.h5r.dereference(x, dset.file.id)), 1, 1)
            if dset.shape != ():
                hval_str = ref_array_func(hval).astype(str)
            else:
                hval_str = h5py.h5i.get_name(
                    h5py.h5r.dereference(hval, dset.file.id))
                hval_str = hval_str.decode('utf-8')
            assert_array_equal(hval_str, zarray)
        # test values when dtype is simple
        else:
            assert_array_equal(dset, zarray)

        with capsys.disabled():
            print("\n" + f"dataset: {dset.name}, attrs :".rjust(
                len(request.node.nodeid)), end='')
        # test attrs
        for key, val in dset.attrs.items():
            assert_array_equal(val, zarray.attrs[key])

        with capsys.disabled():
            print("\n" + f"dataset: {dset.name}, fillvalue :".rjust(
                len(request.node.nodeid)), end='')
        # test fillvalue
        # if dtype is structured array
        if dset.fillvalue is not None and dset.fillvalue.dtype.names is not None:
            if dset.fillvalue.dtype.hasobject:
                message = (f"structured array fillvalue {dset.fillvalue} "
                           f"with object dtype not supported.")
                with capsys.disabled():
                    print(("\n" + message).rjust(len(request.node.nodeid)), end='')
                pytest.xfail(reason=message)
            assert_array_equal(dset.fillvalue, zarray.fill_value)
        # if fillvalue is an object reference
        elif dset.fillvalue is not None and dset.fillvalue.dtype.hasobject:
            hval_str = h5py.h5i.get_name(
                h5py.h5r.dereference(dset.fillvalue, dset.file.id))
            hval_str = hval_str.decode('utf-8')
            assert_array_equal(hval_str, zarray.fill_value)
        # simple fillvalue
        else:
            assert_array_equal(dset.fillvalue, zarray.fill_value)

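# For orientation, a minimal standalone call of the function the test above
# drives through its many dtype cases. Hypothetical names: this assumes
# open_as_zarr is in scope as in the suite and that 'data.h5' contains a
# plain numeric dataset named 'measurements'.

import h5py
from numpy.testing import assert_array_equal

with h5py.File('data.h5', 'r') as f:
    dset = f['measurements']           # hypothetical dataset with a simple dtype
    zarray = open_as_zarr(dset)        # no object references, so no collectrefs
    assert_array_equal(dset, zarray)   # the zarr view mirrors the hdf5 data
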
def _rewrite_vlen_to_fixed(h5py_group, changed_dsets=None):
    """ Scan an hdf5 file or group and recursively convert variable-length
        string datasets to fixed-length
    Args:
        h5py_group: h5py.Group or h5py.File object
    """
    if changed_dsets is None:
        changed_dsets = {}
    if (not isinstance(h5py_group, h5py.File) and
            (not issubclass(h5py_group.file.get(h5py_group.name, getclass=True),
                            h5py.Group) or
             not issubclass(h5py_group.file.get(h5py_group.name, getclass=True,
                                                getlink=True),
                            h5py.HardLink))):
        raise TypeError(
            f"{h5py_group} should be a h5py.File, or a h5py.Group linked as a h5py.HardLink")

    # iterate through group members
    group_iter = [name for name in h5py_group.keys()]
    for name in group_iter:
        obj = h5py_group[name]
        # get group member's link class
        obj_linkclass = h5py_group.get(name, getclass=True, getlink=True)

        # Datasets
        if issubclass(h5py_group.get(name, getclass=True), h5py.Dataset):
            if issubclass(obj_linkclass, h5py.ExternalLink):
                print(f"Skipped rewriting variable-length dataset {obj.name}: External Link")
                continue
            dset = obj

            # variable-length string Datasets
            if h5py.check_vlen_dtype(dset.dtype) and h5py.check_string_dtype(dset.dtype):
                vlen_stringarr = dset[()]
                if dset.shape == ():
                    length_max = len(vlen_stringarr)
                else:
                    length_max = max(len(el) for el in vlen_stringarr.flatten())
                if dset.fillvalue is not None:
                    length_max = max(length_max, len(dset.fillvalue))
                # round the itemsize up to a multiple of 8
                length_max = length_max + (-length_max) % 8
                dt_fixedlen = f'|S{length_max}'

                if isinstance(dset.fillvalue, str):
                    dset_fillvalue = dset.fillvalue.encode('utf-8')
                else:
                    dset_fillvalue = dset.fillvalue

                affix_ = '_fixedlen~'
                dset_name = dset.name
                h5py_group.file.move(dset_name, dset_name + affix_)
                changed_dsets[dset_name + affix_] = dset_name
                dsetf = h5py_group.file.create_dataset_like(
                    dset_name, dset, dtype=dt_fixedlen, fillvalue=dset_fillvalue)

                # TO DO: copy attrs after all string datasets are moved
                for key, val in dset.attrs.items():
                    if isinstance(val, (bytes, np.bool_, str, int, float, np.number)):
                        dsetf.attrs[key] = val
                    else:
                        # TO DO #
                        print(f"Moving variable-length string Datasets: attribute value "
                              f"of type {type(val)} is not processed. "
                              f"Attribute {key} of object {dsetf.name}")

                if dsetf.shape == ():
                    if isinstance(vlen_stringarr, bytes):
                        dsetf[...] = vlen_stringarr
                    else:
                        dsetf[...] = vlen_stringarr.encode('utf-8')
                else:
                    dsetf[...] = vlen_stringarr.astype(dt_fixedlen)

        # Groups
        elif (issubclass(h5py_group.get(name, getclass=True), h5py.Group) and
              not issubclass(obj_linkclass, h5py.SoftLink)):
            if issubclass(obj_linkclass, h5py.ExternalLink):
                print(f"Group {obj.name} is not processed: External Link")
                continue
            changed_dsets = HDF5Zarr._rewrite_vlen_to_fixed(obj, changed_dsets)

    return changed_dsets

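# A usage sketch for the rewrite pass above; hypothetical, assuming
# _rewrite_vlen_to_fixed is a staticmethod-style helper on the HDF5Zarr class
# (as the recursive HDF5Zarr._rewrite_vlen_to_fixed(obj, changed_dsets) call
# suggests) and that 'data.h5' is a placeholder file name.

import h5py

# Convert every variable-length string dataset in a file. The returned dict
# maps the affixed temporary name of each original (still present in the
# file after the move) to the name now holding its fixed-length copy.
with h5py.File('data.h5', 'r+') as f:
    changed = HDF5Zarr._rewrite_vlen_to_fixed(f)
    for tmp_name, orig_name in changed.items():
        del f[tmp_name]  # drop the original vlen dataset once the copy exists
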
def create_zarr_hierarchy(self, h5py_group, zgroup):
    """ Scan hdf5 file and recursively create zarr attributes, groups and
        dataset structures for accessing data
    Args:
        h5py_group: h5py.Group or h5py.File object where information is gathered from
        zgroup:     Zarr Group
    """
    if (not isinstance(h5py_group, h5py.File) and
            (not issubclass(self.file.get(h5py_group.name, getclass=True),
                            h5py.Group) or
             not issubclass(self.file.get(h5py_group.name, getclass=True,
                                          getlink=True),
                            h5py.HardLink))):
        raise TypeError(
            f"{h5py_group} should be a h5py.File, or a h5py.Group linked as a h5py.HardLink")

    self.copy_attrs_data_to_zarr_store(h5py_group, zgroup)

    # add hdf5 group address in file to self._address_dict
    self._address_dict[h5py.h5o.get_info(h5py_group.id).addr] = h5py_group.name

    # iterate through group members
    test_iter = [name for name in h5py_group.keys()]
    for name in test_iter:
        obj = h5py_group[name]
        # get group member's link class
        obj_linkclass = h5py_group.get(name, getclass=True, getlink=True)

        # Datasets
        # TO DO, Soft Links #
        if issubclass(h5py_group.get(name, getclass=True), h5py.Dataset):
            if issubclass(obj_linkclass, h5py.ExternalLink):
                print(f"Dataset {obj.name} is not processed: External Link")
                continue
            dset = obj

            # number of filters
            dcpl = dset.id.get_create_plist()
            nfilters = dcpl.get_nfilters()
            if nfilters > 1:
                # TO DO #
                print(f"Dataset {dset.name} with multiple filters is not processed")
                continue
            elif nfilters == 1:
                # get first filter information
                filter_tuple = dset.id.get_create_plist().get_filter(0)
                filter_code = filter_tuple[0]
                if (filter_code in self._hdf5_regfilters_subset and
                        self._hdf5_regfilters_subset[filter_code] is not None):
                    # TO DO #
                    if filter_code == 32001:
                        # Blosc
                        blosc_names = {0: 'blosclz', 1: 'lz4', 2: 'lz4hc',
                                       3: 'snappy', 4: 'zlib', 5: 'zstd'}
                        clevel, shuffle, cname_id = filter_tuple[2][-3:]
                        cname = blosc_names[cname_id]
                        compression = self._hdf5_regfilters_subset[filter_code](
                            cname=cname, clevel=clevel, shuffle=shuffle)
                    else:
                        compression = self._hdf5_regfilters_subset[filter_code](
                            level=filter_tuple[2])
                else:
                    print(f"Dataset {dset.name} with compression filter {filter_tuple[3]}, "
                          f"hdf5 filter number {filter_tuple[0]} is not processed: "
                          f"no compatible zarr codec")
                    continue
            else:
                compression = None

            object_codec = None
            if dset.dtype.names is not None:
                # Structured array with Reference dtype
                dset_type = dset.id.get_type()
                dt_nmembers = dset_type.get_nmembers()
                dtype_ = []
                dset_fillvalue = list(dset.fillvalue)
                for dt_i in range(dt_nmembers):
                    dtname = dset.dtype.names[dt_i]
                    if dset_type.get_member_class(dt_i) == h5py.h5t.REFERENCE:
                        fcid = dset.file.id.get_create_plist()
                        unit_address_size, _ = fcid.get_sizes()
                        dtype_ += [(dtname, np.dtype(f'uint{unit_address_size*8}'))]
                        if dset.fillvalue[dt_i]:
                            dset_fillvalue[dt_i] = h5py.h5o.get_info(
                                h5py.h5r.dereference(dset.fillvalue[dt_i],
                                                     self.file.id)).addr
                        else:
                            dset_fillvalue[dt_i] = 0
                    else:
                        dtype_ += [(dtname, dset.dtype.base[dt_i])]
                zarray = zgroup.create_dataset(dset.name,
                                               shape=dset.shape,
                                               dtype=dtype_,
                                               chunks=dset.chunks or False,
                                               fill_value=tuple(dset_fillvalue),
                                               compression=compression,
                                               overwrite=True)
                dset_chunks = dset.chunks  # chunk layout passed to storage_info below

            # variable-length Datasets
            elif h5py.check_vlen_dtype(dset.dtype):
                if not h5py.check_string_dtype(dset.dtype):
                    print(f"Dataset {dset.name} is not processed: "
                          f"Variable-length dataset, not string")
                    continue
                else:
                    object_codec = VLenHDF5String()
                    zarray = zgroup.create_dataset(dset.name,
                                                   shape=dset.shape,
                                                   dtype=object,
                                                   chunks=dset.chunks or False,
                                                   fill_value=dset.fillvalue,
                                                   compression=compression,
                                                   overwrite=True,
                                                   object_codec=object_codec)
                    dset_chunks = dset.chunks

            elif dset.dtype.hasobject:
                # TO DO test #
                dset_type = dset.id.get_type()
                if dset_type.get_class() == h5py.h5t.REFERENCE:
                    fcid = dset.file.id.get_create_plist()
                    unit_address_size, _ = fcid.get_sizes()
                    dtype_ = np.dtype(f'uint{unit_address_size*8}')
                    if dset.fillvalue:
                        dset_fillvalue = h5py.h5o.get_info(
                            h5py.h5r.dereference(dset.fillvalue, self.file.id)).addr
                    else:
                        dset_fillvalue = 0
                    zarray = zgroup.create_dataset(dset.name,
                                                   shape=dset.shape,
                                                   dtype=dtype_,
                                                   chunks=dset.chunks or False,
                                                   fill_value=dset_fillvalue,
                                                   compression=compression,
                                                   overwrite=True)
                    dset_chunks = dset.chunks  # chunk layout passed to storage_info below
                elif dset_type.get_class() == h5py.h5t.STD_REF_DSETREG:
                    print(f"Dataset {dset.name} is not processed: Region Reference dtype")
                    continue
                else:
                    print(f"Dataset {dset.name} is not processed: Object dtype")
                    continue

            else:
                if compression is None and (dset.chunks is None or dset.chunks == dset.shape):
                    dset_chunks = dset.chunks if dset.chunks else dset.shape
                    if dset.shape != ():
                        dset_chunks = list(dset_chunks)
                        dim_ = 0
                        ratio_ = self.max_chunksize / (np.prod(dset_chunks) *
                                                       dset.dtype.itemsize)
                        while ratio_ < 1:
                            chunk_dim_ = int(ratio_ * dset_chunks[dim_])
                            chunk_dim_ = chunk_dim_ if chunk_dim_ else 1
                            chunk_dim_ -= np.argmax(
                                dset_chunks[dim_] % np.arange(chunk_dim_,
                                                              chunk_dim_ // 2, -1))
                            dset_chunks[dim_] = int(chunk_dim_)
                            ratio_ = self.max_chunksize / (np.prod(dset_chunks) *
                                                           dset.dtype.itemsize)
                            dim_ += 1
                        dset_chunks = tuple(dset_chunks)
                    dset_chunks = dset_chunks or None
                else:
                    dset_chunks = dset.chunks
                zarray = zgroup.create_dataset(dset.name,
                                               shape=dset.shape,
                                               dtype=dset.dtype,
                                               chunks=dset_chunks or False,
                                               fill_value=dset.fillvalue,
                                               compression=compression,
                                               overwrite=True)

            self.copy_attrs_data_to_zarr_store(dset, zarray)
            info = self.storage_info(dset, dset_chunks)
            if object_codec is not None:
                info = self.vlen_storage_info(dset, info)

            # Store metadata
            if info:
                info['source'] = {'uri': self.uri, 'array_name': dset.name}
                FileChunkStore.chunks_info(zarray, info)

        # Groups
        elif (issubclass(h5py_group.get(name, getclass=True), h5py.Group) and
              not issubclass(obj_linkclass, h5py.SoftLink)):
            if issubclass(obj_linkclass, h5py.ExternalLink):
                print(f"Group {obj.name} is not processed: External Link")
                continue
            group_ = obj
            zgroup_ = self.zgroup.create_group(group_.name, overwrite=True)
            self.create_zarr_hierarchy(group_, zgroup_)

        # Groups, Soft Link
        elif (issubclass(h5py_group.get(name, getclass=True), h5py.Group) and
              issubclass(obj_linkclass, h5py.SoftLink)):
            group_ = obj
            zgroup_ = self.zgroup.create_group(group_.name, overwrite=True)
            self.copy_attrs_data_to_zarr_store(group_, zgroup_)
            zgroup_path = zgroup_.create_group(SYMLINK, overwrite=True)
            zgroup_path.attrs[group_.name] = h5py_group.get(name, getlink=True).path

def index_to_pandas(dset: h5py.Dataset,
                    fields: None | Sequence[str] = None) -> pd.MultiIndex:
    """Construct a MultiIndex from the passed ``index`` dataset.

    Examples
    --------
    .. testsetup:: python

        >>> from dataCAT.testing_utils import HDF5_READ as filename

    .. code:: python

        >>> from dataCAT import index_to_pandas
        >>> import h5py

        >>> filename = str(...)  # doctest: +SKIP

        # Convert the entire dataset
        >>> with h5py.File(filename, "r") as f:
        ...     dset: h5py.Dataset = f["ligand"]["index"]
        ...     index_to_pandas(dset)
        MultiIndex([('O=C=O', 'O1'),
                    ('O=C=O', 'O3'),
                    ( 'CCCO', 'O4')],
                   names=['ligand', 'ligand anchor'])

        # Convert a subset of fields
        >>> with h5py.File(filename, "r") as f:
        ...     dset = f["ligand"]["index"]
        ...     index_to_pandas(dset, fields=["ligand"])
        MultiIndex([('O=C=O',),
                    ('O=C=O',),
                    ( 'CCCO',)],
                   names=['ligand'])

    Parameters
    ----------
    dset : :class:`h5py.Dataset`
        The relevant ``index`` dataset.
    fields : :class:`Sequence[str]<collections.abc.Sequence>`, optional
        The names of the ``index`` fields that are to-be included in the
        returned MultiIndex. If :data:`None`, include all fields.

    Returns
    -------
    :class:`pandas.MultiIndex`
        A multi-index constructed from the passed dataset.

    """
    # Fast-path for non-void-based datasets
    if dset.dtype.fields is None:
        if h5py.check_string_dtype(dset.dtype):
            ar = dset[:].astype(str)
        elif h5py.check_vlen_dtype(dset.dtype):
            ar = _vlen_to_tuples(dset[:])
        else:
            ar = dset[:]
        return pd.MultiIndex.from_arrays([ar])

    # Parse the `fields` parameter
    if fields is None:
        field_names = list(dset.dtype.fields.keys())
        iterator = ((name, f_dtype) for name, (f_dtype, *_) in dset.dtype.fields.items())
    else:
        field_names = list(fields)
        iterator = ((name, dset.dtype.fields[name][0]) for name in fields)

    if len(field_names) == 0:
        raise ValueError("At least one field is required")

    fields_lst = []
    index_ar = dset[:]
    for name, field_dtype in iterator:
        # It's a bytes-string; decode it
        if h5py.check_string_dtype(field_dtype):
            ar = index_ar[name].astype(str)
        # It's a h5py `vlen` dtype; convert it into a list of tuples
        elif h5py.check_vlen_dtype(field_dtype):
            ar = _vlen_to_tuples(index_ar[name])
        else:
            ar = index_ar[name]
        fields_lst.append(ar)

    return pd.MultiIndex.from_arrays(fields_lst, names=field_names)

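# index_to_pandas leans on a _vlen_to_tuples helper that is not defined in
# this section; per the inline comments it turns an array of h5py `vlen`
# sequences into a list of tuples. A minimal sketch under that assumption;
# this is a hypothetical reconstruction, not the package's actual helper.

import numpy as np

def _vlen_to_tuples(ar: np.ndarray) -> list:
    # Each element is a variable-length array read by h5py; tuples are
    # hashable, so pandas can build a MultiIndex level from them.
    return [tuple(el) for el in ar]
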
def _test_dsets_read(zobj, hobj, hobj_info):
    if (hobj_info.type == h5py.h5o.TYPE_DATASET
            and self._checkdtype_structobjref(hobj) == (False, False)
            and h5py.check_vlen_dtype(hobj.dtype)):
        zval = zobj[()]  # read the full array; this test only checks that reading succeeds