def test_fixed_ascii(self):
    """A fixed-length ASCII string dtype reports encoding/length and is not vlen."""
    dtype = h5py.string_dtype(encoding='ascii', length=10)
    info = h5py.check_string_dtype(dtype)
    assert info.encoding == 'ascii'
    assert info.length == 10
    assert h5py.check_vlen_dtype(dtype) is None
def test_fixed_utf8(self):
    """A fixed-length dtype defaults to utf-8 encoding and is not vlen."""
    dtype = h5py.string_dtype(length=10)
    info = h5py.check_string_dtype(dtype)
    assert info.encoding == 'utf-8'
    assert info.length == 10
    assert h5py.check_vlen_dtype(dtype) is None
def test_vlen_ascii(self):
    """A variable-length ASCII dtype has no fixed length and maps to bytes."""
    dtype = h5py.string_dtype(encoding='ascii')
    info = h5py.check_string_dtype(dtype)
    assert info.encoding == 'ascii'
    assert info.length is None
    assert h5py.check_vlen_dtype(dtype) is bytes
def test_vlen_utf8(self):
    """A variable-length utf-8 dtype has no fixed length and maps to ``str``.

    Asserts against the builtin ``str`` instead of ``six.text_type``: on
    Python 3 they are the same object, and the other copy of this test in
    this file already compares against ``str`` directly, so the legacy
    ``six`` indirection is dropped for consistency.
    """
    dt = h5py.string_dtype()
    string_info = h5py.check_string_dtype(dt)
    assert string_info.encoding == 'utf-8'
    assert string_info.length is None
    assert h5py.check_vlen_dtype(dt) is str
def _decode_structured_array(
    arr: np.ndarray, dtype: Optional[np.dtype] = None, copy: bool = False
) -> np.ndarray:
    """Decode utf-8 string fields of a structured array in place.

    h5py 3.0 now reads all strings as bytes. There is a helper method which
    can convert these to strings, but there isn't anything for fields of
    structured dtypes, so each utf-8 string field is decoded here.

    Params
    ------
    arr
        An array with structured dtype.
    dtype
        dtype of the array. This is checked for h5py string data types.
        Passing this is allowed for cases where array may have been processed
        by another function before hand.
    copy
        If True, decode into a copy of ``arr`` instead of mutating it.

    Returns
    -------
    The (possibly copied) array with utf-8 string fields decoded to ``str``.
    """
    if copy:
        arr = arr.copy()
    if dtype is None:
        dtype = arr.dtype
    # codecs.decode is 2x slower than this lambda, go figure
    decode = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)
    for k, (dt, _) in dtype.fields.items():
        check = h5py.check_string_dtype(dt)
        if check is not None and check.encoding == "utf-8":
            # Writes the decoded values back through the field view of `arr`.
            decode(arr[k], out=arr[k])
    return arr
def test_fixed_ascii(self):
    """Fixed-length ASCII: encoding and length round-trip, not variable-length."""
    dt = h5py.string_dtype(encoding='ascii', length=10)
    result = h5py.check_string_dtype(dt)
    assert result.encoding == 'ascii'
    assert result.length == 10
    assert h5py.check_vlen_dtype(dt) is None
def __getitem__(self, key):
    """Return an adapter for the child node at ``key``.

    Groups are wrapped in :class:`HDF5Adapter`; datasets are wrapped in
    :class:`HDF5DatasetAdapter`. Object-dtype datasets (h5py's Python-only
    feature) cannot be served as-is: size-1 variable-length string datasets
    are repackaged into a numpy array, anything else becomes an empty
    placeholder.
    """
    value = self._node[key]
    if isinstance(value, h5py.Group):
        return HDF5Adapter(value)
    else:
        if value.dtype == numpy.dtype("O"):
            warnings.warn(
                f"The dataset {key} is of object type, using a "
                "Python-only feature of h5py that is not supported by "
                "HDF5 in general. Read more about that feature at "
                "https://docs.h5py.org/en/stable/special.html. "
                "Consider using a fixed-length field instead. "
                "Tiled will serve an empty placeholder, unless the "
                "object is of size 1, where it will attempt to repackage "
                "the data into a numpy array."
            )
            check_str_dtype = h5py.check_string_dtype(value.dtype)
            # length is None means a variable-length string dtype.
            if check_str_dtype.length is None:
                # Re-read the dataset through the file to materialize the data.
                dataset_names = value.file[self._node.name + "/" + key][...][()]
                if value.size == 1:
                    arr = MockHDF5Dataset(numpy.array(dataset_names), {})
                    return HDF5DatasetAdapter(arr)
            # Fallback: empty placeholder for unsupported object dtypes.
            return HDF5DatasetAdapter(MockHDF5Dataset(numpy.array([]), {}))
        return HDF5DatasetAdapter(value)
def test_vlen_ascii(self):
    """Variable-length ASCII: no fixed length, and the vlen base type is bytes."""
    dt = h5py.string_dtype(encoding='ascii')
    result = h5py.check_string_dtype(dt)
    assert result.encoding == 'ascii'
    assert result.length is None
    assert h5py.check_vlen_dtype(dt) is bytes
def test_vlen_utf8(self):
    """Default string dtype is variable-length utf-8 whose vlen base is str."""
    dtype = h5py.string_dtype()
    info = h5py.check_string_dtype(dtype)
    assert info.encoding == 'utf-8'
    assert info.length is None
    assert h5py.check_vlen_dtype(dtype) is str
def test_fixed_utf8(self):
    """Fixed-length utf-8: encoding and length round-trip, not variable-length."""
    dt = h5py.string_dtype(length=10)
    result = h5py.check_string_dtype(dt)
    assert result.encoding == 'utf-8'
    assert result.length == 10
    assert h5py.check_vlen_dtype(dt) is None
def _read_attr_hdf5(attrs: h5py.AttributeManager, name: str, default: Optional[Any] = Empty):
    """Read an HDF5 attribute and perform all necessary conversions.

    At the moment, this only implements conversions for string attributes,
    other types are passed through. String conversion is needed for
    compatibility with other languages. For example Julia's HDF5.jl writes
    string attributes as fixed-size strings, which are read as bytes by h5py.

    ``Empty`` acts as a sentinel: when ``name`` is missing and a non-sentinel
    ``default`` was supplied, the default is returned instead of raising.
    """
    if name not in attrs and default is not Empty:
        return default
    # Missing name with no default falls through and raises KeyError here.
    attr = attrs[name]
    attr_id = attrs.get_id(name)
    dtype = h5py.check_string_dtype(attr_id.dtype)
    if dtype is None:
        # Non-string attribute: pass through unchanged.
        return attr
    else:
        if dtype.length is None:  # variable-length string, no problem
            return attr
        elif len(attr_id.shape) == 0:  # Python bytestring
            return attr.decode("utf-8")
        else:  # NumPy array
            # NOTE(review): `decode` is not defined in this block — presumably
            # `from codecs import decode` at module level; confirm the import.
            return [decode(s, "utf-8") for s in attr]
def test_vlen_bytes(self):
    """ Vlen bytes dataset maps to vlen ascii in the file """
    dtype = h5py.string_dtype(encoding='ascii')
    dataset = self.f.create_dataset('x', (100,), dtype=dtype)
    type_id = dataset.id.get_type()
    self.assertEqual(type(type_id), h5py.h5t.TypeStringID)
    self.assertEqual(type_id.get_cset(), h5py.h5t.CSET_ASCII)
    info = h5py.check_string_dtype(dataset.dtype)
    self.assertEqual(info.encoding, 'ascii')
def test_vlen_unicode(self):
    """ Vlen unicode dataset maps to vlen utf-8 in the file """
    dtype = h5py.string_dtype()
    dataset = self.f.create_dataset('x', (100,), dtype=dtype)
    type_id = dataset.id.get_type()
    self.assertEqual(type(type_id), h5py.h5t.TypeStringID)
    self.assertEqual(type_id.get_cset(), h5py.h5t.CSET_UTF8)
    info = h5py.check_string_dtype(dataset.dtype)
    self.assertEqual(info.encoding, 'utf-8')
def h5py_read_string(dataset):
    """Read a string-typed h5py dataset, returning ``str`` data on h5py >= 3.

    In h5py >= 3.0.0, h5py no longer converts string data to ``str``
    automatically, so utf-8 datasets are wrapped with ``asstr()`` before
    reading.

    Parameters
    ----------
    dataset : h5py.Dataset
        The dataset to read.

    Returns
    -------
    The dataset's value, with utf-8 string data decoded on h5py >= 3.
    """
    # Compare the major version numerically: the previous lexicographic
    # comparison (version('h5py') >= '3') would wrongly classify a future
    # '10.0' release as older than '3'.
    if int(version('h5py').split('.')[0]) >= 3:
        string_dtype = h5py.check_string_dtype(dataset.dtype)
        if string_dtype is not None and string_dtype.encoding == 'utf-8':
            dataset = dataset.asstr()
    return dataset[()]
def test_compound(self):
    """A committed compound dtype keeps utf-8 encoding on its string field."""
    dt = np.dtype([('field_1', h5py.string_dtype()), ('field_2', np.int32)])
    self.f['mytype'] = np.dtype(dt)
    dt_out = self.f['mytype'].dtype.fields['field_1'][0]
    info = h5py.check_string_dtype(dt_out)
    self.assertEqual(info.encoding, 'utf-8')
def test_compound(self):
    """Committed compound dtype round-trip preserves the utf-8 string field."""
    field_specs = [('field_1', h5py.string_dtype()), ('field_2', np.int32)]
    dt = np.dtype(field_specs)
    self.f['mytype'] = np.dtype(dt)
    recovered = self.f['mytype'].dtype.fields['field_1'][0]
    string_info = h5py.check_string_dtype(recovered)
    self.assertEqual(string_info.encoding, 'utf-8')
def __getitem__(self, key):
    """Return the attribute ``key``, hiding internal attrs and patching h5py.Empty.

    Hidden (internal bookkeeping) attributes raise KeyError as if absent.
    """
    import h5py

    if key in _HIDDEN_ATTRS:
        raise KeyError(key)

    # see https://github.com/h5netcdf/h5netcdf/issues/94 for details
    if isinstance(self._h5attrs[key], h5py.Empty):
        string_info = h5py.check_string_dtype(self._h5attrs[key].dtype)
        # Empty length-1 string attribute: represent as an empty bytestring.
        if string_info and string_info.length == 1:
            return b""
    return self._h5attrs[key]
def _unpack(key, value):
    """Convert one (key, value) pair from HDF5 objects into plain Python data."""
    if key.isdigit():
        key = int(key)
    if isinstance(value, h5py.Dataset):
        info = h5py.check_string_dtype(value.dtype)
        value = value[()]
        if info is not None:
            value = value.decode(info.encoding)
    elif isinstance(value, h5py.Group):
        # Change key to integer if k is digit, so that we can use the dict
        # like a tuple or list.
        value = OrderedDict(
            HDF5Storage._unpack(k, v) for k, v in value.items()
        )
    return key, value
def test_create_dataset(hfile):
    """Writing a string dataset produces a non-NeXus file with string dtype."""
    with h5py.File(hfile, "w") as root:
        root.create_dataset("item_name", data="the value")
    assert os.path.exists(hfile)

    tree = h5tree.Hdf5TreeView(hfile)
    assert tree.filename.endswith(hfile)
    assert not tree.isNeXus

    with h5py.File(hfile, "r") as h5root:
        dataset = h5root["item_name"]
        assert dataset[()] == b"the value"
        assert h5py.check_string_dtype(dataset.dtype)
def test_fixed_bytes(self):
    """ Fixed-length bytes dataset maps to fixed-length ascii in the file """
    dtype = np.dtype("|S10")
    dataset = self.f.create_dataset('x', (100,), dtype=dtype)
    type_id = dataset.id.get_type()
    self.assertEqual(type(type_id), h5py.h5t.TypeStringID)
    self.assertFalse(type_id.is_variable_str())
    self.assertEqual(type_id.get_size(), 10)
    self.assertEqual(type_id.get_cset(), h5py.h5t.CSET_ASCII)
    info = h5py.check_string_dtype(dataset.dtype)
    self.assertEqual(info.encoding, 'ascii')
    self.assertEqual(info.length, 10)
def _h5py_dataset_iterator(g, prefix=''):
    """Yield (path, data) pairs for every dataset under group ``g``, recursively."""
    import h5py
    for key in g:
        item = g[key]
        path = '{}/{}'.format(prefix, key)
        if isinstance(item, h5py.Dataset):  # test for dataset
            # h5py >= 3 returns bytes for strings; decode utf-8 datasets.
            if Version(h5py.__version__) > Version('2.10.0'):
                string_type = h5py.check_string_dtype(item.dtype)
                if string_type is not None and string_type.encoding == "utf-8":
                    item = item.asstr()[:]
            yield (path, item)
        elif isinstance(item, h5py.Group):  # test for group (go down)
            yield from _h5py_dataset_iterator(item, path)
def _column(self, k):
    """Return column ``k``, decoding strings/enums, with per-key caching."""
    cached = self._cache.get(k)
    if cached is not None:
        return cached

    dset = self._hf.get(k)
    if dset is None:
        py_data = None
    elif h5py.check_string_dtype(dset.dtype):
        py_data = dset.asstr()
    else:
        enum_dict = h5py.check_enum_dtype(dset.dtype)
        if enum_dict:
            # Map stored integer codes back to their enum names.
            inverse = dict((i, name) for name, i in enum_dict.items())
            py_data = [inverse[x] for x in np.array(dset)]
        else:
            py_data = np.array(dset)

    self._cache[k] = py_data
    return py_data
def hdfgetdata(gID, field):
    """Fetch ``field`` from an open HDF5 group/file, decoding to Python values.

    Returns None when the field is absent; a str for single strings; a numpy
    array of str for string vectors; a scalar for single numbers; otherwise
    a numpy array.
    """
    val = gID.get(field)
    if val is None:
        return val
    if h5py.check_string_dtype(val.dtype):
        # string
        if val.len() == 1:
            return val[0].tobytes().decode('ascii')
        return np.array([x.tobytes().decode('ascii') for x in val])
    val = np.array(val)
    if val.ndim == 1 and len(val) == 1:
        val = val[0]
    return val
def _update_pdb_dsets(file: h5py.File, name: str, logger: Optional[Logger] = None) -> Optional[PDBContainer]:
    """Check for and update pre dataCAT 0.3 style databases.

    Returns None when ``file[name]`` is not an old-style Dataset; otherwise
    converts its PDB arrays into molecules, deletes the old dataset, and
    returns a new :class:`PDBContainer` with an appropriately shaped scale.
    """
    if not isinstance(file.get(name), h5py.Dataset):
        return None
    elif logger is not None:
        logger.info(f'Updating h5py Dataset to data-CAT >= 0.3 style: {name!r}')

    mol_list = [from_pdb_array(pdb, rdmol=False, warn=False) for pdb in file[name]]
    m = len(mol_list)
    # The old dataset is removed; its replacement lives in the returned container.
    del file[name]

    dtype = IDX_DTYPE[name]
    scale = np.rec.array(None, dtype=dtype, shape=(m,))
    if dtype.fields is not None and scale.size:
        # Ensure that the sentinel value for vlen strings is an empty string, not `None`
        elem = list(scale.item(0))
        iterator = (v for v, *_ in dtype.fields.values())
        for i, sub_dt in enumerate(iterator):
            if h5py.check_string_dtype(sub_dt) is not None:
                elem[i] = ''
        scale[:] = tuple(elem)
    return PDBContainer.from_molecules(mol_list, scale=scale)
def read_dataset(dataset: h5py.Dataset):
    """Read a dataset, applying string decoding and backwards-compat fixups.

    On h5py >= 3 (``H5PY_V3``) utf-8 string datasets are decoded via
    ``asstr()``. Old-format scalars stored as 1-element 1-d string arrays are
    unwrapped, fixed-length strings in compound dtypes are converted to
    variable-length, and 0-d arrays are unwrapped to scalars.
    """
    if H5PY_V3:
        string_dtype = h5py.check_string_dtype(dataset.dtype)
        if (string_dtype is not None) and (string_dtype.encoding == "utf-8"):
            dataset = dataset.asstr()
    value = dataset[()]
    if not hasattr(value, "dtype"):
        # Plain Python scalar (e.g. decoded str): nothing more to do.
        return value
    elif isinstance(value.dtype, str):
        pass
    elif issubclass(value.dtype.type, np.string_):
        value = value.astype(str)
        # Backwards compat, old datasets have strings as one element 1d arrays
        if len(value) == 1:
            return value[0]
    elif len(value.dtype.descr) > 1:  # Compound dtype
        # For backwards compat, now strings are written as variable length
        dtype = value.dtype
        value = _from_fixed_length_strings(value)
        if H5PY_V3:
            value = _decode_structured_array(value, dtype=dtype)
    if value.shape == ():
        value = value[()]
    return value
def __getitem__(self, key):
    """Index the dataset, decoding vlen strings when the root requests it."""
    if getattr(self._root, "decode_vlen_strings", False):
        info = h5py.check_string_dtype(self._h5ds.dtype)
        # length is None only for variable-length string dtypes.
        if info and info.length is None:
            return self._h5ds.asstr()[key]
    return self._h5ds[key]
def index_to_pandas(dset: h5py.Dataset, fields: None | Sequence[str] = None) -> pd.MultiIndex: """Construct an MultiIndex from the passed ``index`` dataset. Examples -------- .. testsetup:: python >>> from dataCAT.testing_utils import HDF5_READ as filename .. code:: python >>> from dataCAT import index_to_pandas >>> import h5py >>> filename = str(...) # doctest: +SKIP # Convert the entire dataset >>> with h5py.File(filename, "r") as f: ... dset: h5py.Dataset = f["ligand"]["index"] ... index_to_pandas(dset) MultiIndex([('O=C=O', 'O1'), ('O=C=O', 'O3'), ( 'CCCO', 'O4')], names=['ligand', 'ligand anchor']) # Convert a subset of fields >>> with h5py.File(filename, "r") as f: ... dset = f["ligand"]["index"] ... index_to_pandas(dset, fields=["ligand"]) MultiIndex([('O=C=O',), ('O=C=O',), ( 'CCCO',)], names=['ligand']) Parameters ---------- dset : :class:`h5py.Dataset` The relevant ``index`` dataset. fields : :class:`Sequence[str]<collections.abc.Sequence>` The names of the ``index`` fields that are to-be included in the returned MultiIndex. If :data:`None`, include all fields. Returns ------- :class:`pandas.MultiIndex` A multi-index constructed from the passed dataset. 
""" # Fast-path for non-void-based datasets if dset.dtype.fields is None: if h5py.check_string_dtype(dset.dtype): ar = dset[:].astype(str) elif h5py.check_vlen_dtype(dset.dtype): ar = _vlen_to_tuples(dset[:]) else: ar = dset[:] return pd.MultiIndex.from_arrays([ar]) # Parse the `fields` parameter if fields is None: field_names = list(dset.dtype.fields.keys()) iterator = ((name, f_dtype) for name, (f_dtype, *_) in dset.dtype.fields.items()) else: field_names = list(fields) iterator = ((name, dset.dtype.fields[name][0]) for name in fields) if len(field_names) == 0: raise ValueError("At least one field is required") fields_lst = [] index_ar = dset[:] for name, field_dtype in iterator: # It's a bytes-string; decode it if h5py.check_string_dtype(field_dtype): ar = index_ar[name].astype(str) # It's a h5py `vlen` dtype; convert it into a list of tuples elif h5py.check_vlen_dtype(field_dtype): ar = _vlen_to_tuples(index_ar[name]) else: ar = index_ar[name] fields_lst.append(ar) return pd.MultiIndex.from_arrays(fields_lst, names=field_names)
def validate(filename, fileOut=None):
    """Validate a SNIRF HDF5 file, printing a report or writing it to ``fileOut``.

    Walks every dataset in the file, reports its type/shape, checks expected
    dimensionality per field name, and classifies each field as required,
    optional or invalid. Returns True when no invalid entries were found.

    NOTE(review): the original file lost its newlines, so the nesting of the
    branch bodies below was reconstructed from the statement order — confirm
    against the upstream SNIRF validator before relying on exact placement.
    """
    fileID = h5py.File(filename, 'r')
    formatVersion = hdfgetdata(fileID, "/formatVersion")

    def getallnames(gID, lst):
        # Depth-first walk collecting the full path of every dataset.
        if isinstance(gID, h5py.Dataset):
            lst.append(gID.name)
        else:
            for x in gID:
                getallnames(gID[x], lst)

    def checkdim(field, fID, foundInvalid, lstInvalid):
        # Returns False when the dataset's dimension count does not match the
        # dimensionality expected for this field name (None otherwise).
        val = fID.get(field);
        if "Pos2D" in field or "Pos3D" in field:
            dim = 2
        elif "dataTimeSeries" in field:
            dim = 2
        else:
            dim = 1
        if dim != len(val.dims):
            return False

    lst = []
    getallnames(fileID, lst)
    if fileOut == None:
        print('-' * 40)
        print('SNIRF Validator')
        print('Version 1.0')
        print('written by T. Huppert')
        print()
        print('File = {0}'.format(filename))
        print('Version = {0}'.format(formatVersion))
        print('-' * 40)
        foundInvalid = 0;
        lstInvalid = []
        for x in lst:
            print(Fore.WHITE + x)
            val = fileID.get(x)
            if h5py.check_string_dtype(val.dtype):  # string
                if val.len() == 1:
                    val = val[0].tobytes().decode('ascii')
                    print('\tHDF5-STRING: {0}'.format(val))
                else:
                    val2 = [];
                    for y in val:
                        val2.append(y.tobytes().decode('ascii'))
                    val2 = np.array(val2)
                    print('\tHDF5-STRING 1D-Vector: <{0}x1>'.format(len(val2)))
            else:
                val = np.array(val)
                if (val.ndim == 1 and len(val) == 1):
                    val = val[0]
                    print('\tHDF5-FLOAT: {0}'.format(val))
                elif val.ndim == 1:
                    print('\tHDF5-FLOAT 1D-Vector: <{0}x1>'.format(len(val)))
                else:
                    print('\tHDF5-FLOAT 2D-Array: <{0}x{1}>'.format(len(val), int(val.size / len(val))))
            dimcheck = checkdim(x, fileID, foundInvalid, lstInvalid)
            if dimcheck == False:
                val = len(fileID.get(x).dims)
                if val == 1:
                    print(Fore.RED + '\tINVALID dimensions(Expected Number of Dimensions: 2)')
                else:
                    print(Fore.RED + '\tINVALID dimensions(Expected Number of Dimensions: 1)')
                foundInvalid = foundInvalid + 1;
                lstInvalid.append(x)
            # aux/stim subtrees are optional parent objects: their children are
            # only required/optional relative to the parent being present.
            if "/aux" in x or "/stim" in x:
                if isrequired(x) == True:
                    print(Fore.BLUE + '\t\tRequired field when optional parent object is included')
                elif isoptional(x):
                    print(Fore.GREEN + '\t\tOptional field when optional parent object is included')
                else:
                    print(Fore.RED + '\t\tINVALID field')
                    foundInvalid = foundInvalid + 1
                    lstInvalid.append(x)
            else:
                if isrequired(x) == True:
                    print(Fore.BLUE + '\t\tRequired field')
                elif isoptional(x):
                    print(Fore.GREEN + '\t\tOptional field')
                else:
                    print(Fore.RED + '\t\tINVALID field')
                    foundInvalid = foundInvalid + 1
                    lstInvalid.append(x)
        print('-' * 40)
        if (len(lstInvalid) != 0):
            print(Fore.RED + "File is INVALID")
            print(Fore.RED + '\tINVALID ENTRIES FOUND')
            for x in lstInvalid:
                print(Fore.RED + x)
        else:
            print(Fore.WHITE + "File is VALID")
        print(Style.RESET_ALL)
    else:
        # write to file
        text_file = open(fileOut, "w")
        text_file.write('\n' + '\n' + '-' * 40)
        text_file.write('\n' + '\n' + 'SNIRF Validator')
        text_file.write('\n' + '\n' + 'Version 1.0')
        text_file.write('\n' + 'written by T. Huppert')
        text_file.write('\n')
        text_file.write('\n' + 'File = {0}'.format(filename))
        text_file.write('\n' + 'Version = {0}'.format(formatVersion))
        text_file.write('\n' + '-' * 40)
        foundInvalid = 0;
        lstInvalid = []
        for x in lst:
            text_file.write('\n' + x)
            val = fileID.get(x)
            if h5py.check_string_dtype(val.dtype):  # string
                if val.len() == 1:
                    val = val[0].tobytes().decode('ascii')
                    text_file.write('\n' + '\tHDF5-STRING: {0}'.format(val))
                else:
                    val2 = [];
                    for y in val:
                        val2.append(y.tobytes().decode('ascii'))
                    val2 = np.array(val2)
                    text_file.write('\n' + '\tHDF5-STRING 1D-Vector: <{0}x1>'.format(len(val2)))
            else:
                val = np.array(val)
                if (val.ndim == 1 and len(val) == 1):
                    val = val[0]
                    text_file.write('\n' + '\tHDF5-FLOAT: {0}'.format(val))
                elif val.ndim == 1:
                    text_file.write('\n' + '\tHDF5-FLOAT 1D-Vector: <{0}x1>'.format(len(val)))
                else:
                    text_file.write('\n' + '\tHDF5-FLOAT 2D-Array: <{0}x{1}>'.format(len(val), int(val.size / len(val))))
            dimcheck = checkdim(x, fileID, foundInvalid, lstInvalid)
            if dimcheck == False:
                val = len(fileID.get(x).dims)
                if val == 1:
                    text_file.write(Fore.RED + '\tINVALID dimensions(Expected Number of Dimensions: 2)')
                else:
                    text_file.write(Fore.RED + '\tINVALID dimensions(Expected Number of Dimensions: 1)')
                foundInvalid = foundInvalid + 1
                lstInvalid.append(x)
            if isrequired(x) == True:
                text_file.write('\n' + '\t\tRequired field')
            elif isoptional(x):
                text_file.write('\n' + '\t\tOptional field')
            else:
                text_file.write('\n' + '\t\tINVALID field')
                foundInvalid = foundInvalid + 1
                lstInvalid.append(x)
        text_file.write('\n' + '-' * 40)
        if (len(lstInvalid) != 0):
            text_file.write('\n' + "File is INVALID")
            text_file.write('\n' + '\tINVALID ENTRIES FOUND')
            for x in lstInvalid:
                text_file.write('\n' + x)
        else:
            text_file.write('\n' + "File is VALID")
        text_file.close()
    return (foundInvalid == 0)
def validate(filename, fileOut=None):
    """Validate a SNIRF HDF5 file, printing a report or writing it to ``fileOut``.

    Walks every dataset in the file, reports its type/shape, checks the
    expected dimensionality of known field names, and classifies each field
    as required, optional or invalid.

    Parameters
    ----------
    filename : str
        Path of the SNIRF (HDF5) file to validate.
    fileOut : str, optional
        When given, the report is written to this path instead of stdout.

    Returns
    -------
    bool
        True when no invalid entries were found.

    Fixes applied to the original:
    - ``ndarray.tostring()`` (removed from NumPy) replaced by ``tobytes()``.
    - ``(val.ndim != 2 or cols) != N`` rewritten with explicit
      parenthesization (same truth table, no longer relies on bool/int
      coercion).
    - The HDF5 file handle is closed before returning.
    """
    fileID = h5py.File(filename, 'r')
    formatVersion = hdfgetdata(fileID, "/formatVersion")

    def getallnames(gID, lst):
        # Depth-first walk collecting the full path of every dataset.
        if isinstance(gID, h5py.Dataset):
            lst.append(gID.name)
        else:
            for x in gID:
                getallnames(gID[x], lst)

    lst = []
    getallnames(fileID, lst)
    if fileOut is None:
        print('-' * 40)
        print('SNIRF Validator')
        print('Version 1.0')
        print('written by T. Huppert')
        print()
        print('File = {0}'.format(filename))
        print('Version = {0}'.format(formatVersion))
        print('-' * 40)
        foundInvalid = 0
        lstInvalid = []
        for x in lst:
            print(Fore.WHITE + x)
            val = fileID.get(x)
            if h5py.check_string_dtype(val.dtype):  # string
                if val.len() == 1:
                    val = val[0].tobytes().decode('ascii')
                    print('\tHDF5-STRING: {0}'.format(val))
                else:
                    val2 = [y.tobytes().decode('ascii') for y in val]
                    val2 = np.array(val2)
                    print('\tHDF5-STRING 1D-Vector: <{0}x1>'.format(len(val2)))
            else:
                val = np.array(val)
                if val.ndim == 1 and len(val) == 1:
                    val = val[0]
                    print('\tHDF5-FLOAT: {0}'.format(val))
                elif val.ndim == 1:
                    print('\tHDF5-FLOAT 1D-Vector: <{0}x1>'.format(len(val)))
                else:
                    print('\tHDF5-FLOAT 2D-Array: <{0}x{1}>'.format(
                        len(val), int(val.size / len(val))))
                # Dimensionality checks for known field names.
                if "Pos2D" in x:
                    if val.ndim != 2 or int(val.size / len(val)) != 2:
                        print('\tINVALID dimensions')
                        foundInvalid += 1
                        lstInvalid.append(x)
                if "Pos3D" in x:
                    if val.ndim != 2 or int(val.size / len(val)) != 3:
                        print(Fore.RED + '\tINVALID dimensions')
                        foundInvalid += 1
                        lstInvalid.append(x)
                if "dataTimeSeries" in x:
                    # More columns than rows suggests a transposed time series.
                    if int(val.size / len(val)) > len(val):
                        print(Fore.RED + '\tINVALID dimensions')
                        foundInvalid += 1
                        lstInvalid.append(x)
                if ("stim" in x) and ("data" in x):
                    if val.ndim != 2 or int(val.size / len(val)) != 3:
                        print(Fore.RED + '\tPossible transpose. Should be <#trials x [onset, duration, amplitude, ...] >')
                        foundInvalid += 1
                        lstInvalid.append(x)
            if isrequired(x) == True:
                print(Fore.BLUE + '\t\tRequired field')
            elif isoptional(x):
                print(Fore.GREEN + '\t\tOptional field')
            else:
                print(Fore.RED + '\t\tINVALID field')
                foundInvalid += 1
                lstInvalid.append(x)
        print('-' * 40)
        if len(lstInvalid) != 0:
            print(Fore.RED + "File is INVALID")
            print(Fore.RED + '\tINVALID ENTRIES FOUND')
            for x in lstInvalid:
                print(Fore.RED + x)
        else:
            print(Fore.WHITE + "File is VALID")
    else:
        # write to file
        text_file = open(fileOut, "w")
        text_file.write('\n' + '\n' + '-' * 40)
        text_file.write('\n' + '\n' + 'SNIRF Validator')
        text_file.write('\n' + '\n' + 'Version 1.0')
        text_file.write('\n' + 'written by T. Huppert')
        text_file.write('\n')
        text_file.write('\n' + 'File = {0}'.format(filename))
        text_file.write('\n' + 'Version = {0}'.format(formatVersion))
        text_file.write('\n' + '-' * 40)
        foundInvalid = 0
        lstInvalid = []
        for x in lst:
            text_file.write('\n' + x)
            val = fileID.get(x)
            if h5py.check_string_dtype(val.dtype):  # string
                if val.len() == 1:
                    val = val[0].tobytes().decode('ascii')
                    text_file.write('\n' + '\tHDF5-STRING: {0}'.format(val))
                else:
                    val2 = [y.tobytes().decode('ascii') for y in val]
                    val2 = np.array(val2)
                    text_file.write(
                        '\n' + '\tHDF5-STRING 1D-Vector: <{0}x1>'.format(len(val2)))
            else:
                val = np.array(val)
                if val.ndim == 1 and len(val) == 1:
                    val = val[0]
                    text_file.write('\n' + '\tHDF5-FLOAT: {0}'.format(val))
                elif val.ndim == 1:
                    text_file.write(
                        '\n' + '\tHDF5-FLOAT 1D-Vector: <{0}x1>'.format(len(val)))
                else:
                    text_file.write('\n' + '\tHDF5-FLOAT 2D-Array: <{0}x{1}>'.format(
                        len(val), int(val.size / len(val))))
                if "Pos2D" in x:
                    if val.ndim != 2 or int(val.size / len(val)) != 2:
                        text_file.write('\n' + '\tINVALID dimensions')
                        foundInvalid += 1
                        lstInvalid.append(x)
                if "Pos3D" in x:
                    if val.ndim != 2 or int(val.size / len(val)) != 3:
                        text_file.write('\n' + '\tINVALID dimensions')
                        foundInvalid += 1
                        lstInvalid.append(x)
                if "dataTimeSeries" in x:
                    if int(val.size / len(val)) > len(val):
                        text_file.write('\n' + '\tINVALID dimensions')
                        foundInvalid += 1
                        lstInvalid.append(x)
            if isrequired(x) == True:
                text_file.write('\n' + '\t\tRequired field')
            elif isoptional(x):
                text_file.write('\n' + '\t\tOptional field')
            else:
                text_file.write('\n' + '\t\tINVALID field')
                foundInvalid += 1
                lstInvalid.append(x)
        text_file.write('\n' + '-' * 40)
        if len(lstInvalid) != 0:
            text_file.write('\n' + "File is INVALID")
            text_file.write('\n' + '\tINVALID ENTRIES FOUND')
            for x in lstInvalid:
                text_file.write('\n' + x)
        else:
            text_file.write('\n' + "File is VALID")
        text_file.close()
    fileID.close()
    return (foundInvalid == 0)
def _group2dict(self, group, dictionary=None, lazy=False):
    """Recursively convert an HDF5 group into a nested Python dictionary.

    Attributes and sub-items use name prefixes (``_sig_``, ``_list_``,
    ``_tuple_``, ``_bs_``, ``_datetime_``, ``_hspy_AxesManager_``, …) to
    encode their original Python type; each prefix is decoded back here.
    Mutates and returns ``dictionary`` (a fresh dict when None is passed).
    """
    if dictionary is None:
        dictionary = {}
    for key, value in group.attrs.items():
        if isinstance(value, bytes):
            value = value.decode()
        if isinstance(value, (np.string_, str)):
            # '_None_' is the stored placeholder for Python None.
            if value == '_None_':
                value = None
        elif isinstance(value, np.bool_):
            value = bool(value)
        elif isinstance(value, np.ndarray) and value.dtype.char == "S":
            # Convert strings to unicode
            value = value.astype("U")
            if value.dtype.str.endswith("U1"):
                value = value.tolist()
        # skip signals - these are handled below.
        if key.startswith('_sig_'):
            pass
        elif key.startswith('_list_empty_'):
            dictionary[key[len('_list_empty_'):]] = []
        elif key.startswith('_tuple_empty_'):
            dictionary[key[len('_tuple_empty_'):]] = ()
        elif key.startswith('_bs_'):
            dictionary[key[len('_bs_'):]] = value.tobytes()
        # The following two elif statements enable reading date and time from
        # v < 2 of HyperSpy's metadata specifications
        elif key.startswith('_datetime_date'):
            date_iso = datetime.date(
                *ast.literal_eval(value[value.index("("):])).isoformat()
            dictionary[key.replace("_datetime_", "")] = date_iso
        elif key.startswith('_datetime_time'):
            date_iso = datetime.time(
                *ast.literal_eval(value[value.index("("):])).isoformat()
            dictionary[key.replace("_datetime_", "")] = date_iso
        else:
            dictionary[key] = value
    if not isinstance(group, self.Dataset):
        for key in group.keys():
            if key.startswith('_sig_'):
                from hyperspy.io import dict2signal
                dictionary[key[len('_sig_'):]] = (
                    dict2signal(self.group2signaldict(
                        group[key], lazy=lazy)))
            elif isinstance(group[key], self.Dataset):
                dat = group[key]
                kn = key
                if key.startswith("_list_"):
                    if (h5py.check_string_dtype(dat.dtype) and
                            hasattr(dat, 'asstr')):
                        # h5py 3.0 and newer
                        # https://docs.h5py.org/en/3.0.0/strings.html
                        dat = dat.asstr()[:]
                    ans = np.array(dat)
                    ans = ans.tolist()
                    kn = key[6:]
                elif key.startswith("_tuple_"):
                    ans = np.array(dat)
                    ans = tuple(ans.tolist())
                    kn = key[7:]
                elif dat.dtype.char == "S":
                    ans = np.array(dat)
                    try:
                        ans = ans.astype("U")
                    except UnicodeDecodeError:
                        # There are some strings that must stay in binary,
                        # for example dill pickles. This will obviously also
                        # let "wrong" binary string fail somewhere else...
                        pass
                elif lazy:
                    ans = da.from_array(dat, chunks=dat.chunks)
                else:
                    ans = np.array(dat)
                dictionary[kn] = ans
            elif key.startswith('_hspy_AxesManager_'):
                dictionary[key[len('_hspy_AxesManager_'):]] = AxesManager(
                    [i for k, i in sorted(iter(
                        self._group2dict(
                            group[key], lazy=lazy).items()
                    ))])
            elif key.startswith('_list_'):
                # Key layout is '_list_<len>_<name>'; slice past the length.
                dictionary[key[7 + key[6:].find('_'):]] = \
                    [i for k, i in sorted(iter(
                        self._group2dict(
                            group[key], lazy=lazy).items()
                    ))]
            elif key.startswith('_tuple_'):
                dictionary[key[8 + key[7:].find('_'):]] = tuple(
                    [i for k, i in sorted(iter(
                        self._group2dict(
                            group[key], lazy=lazy).items()
                    ))])
            else:
                dictionary[key] = {}
                self._group2dict(
                    group[key], dictionary[key], lazy=lazy)
    return dictionary
def create_zarr_hierarchy(self, h5py_group, zgroup):
    """ Scan hdf5 file and recursively create zarr attributes, groups and
    dataset structures for accessing data

    Args:
      h5py_group: h5py.Group or h5py.File object where information is gathered from
      zgroup:     Zarr Group
    """
    # Only files, or hard-linked groups, are valid entry points.
    if (not isinstance(h5py_group, h5py.File) and
            (not issubclass(self.file.get(h5py_group.name, getclass=True),
                            h5py.Group) or
             not issubclass(self.file.get(h5py_group.name, getclass=True,
                                          getlink=True), h5py.HardLink))):
        raise TypeError(
            f"{h5py_group} should be a h5py.File or h5py.Group as a h5py.HardLink")

    self.copy_attrs_data_to_zarr_store(h5py_group, zgroup)

    # add hdf5 group address in file to self._address_dict
    self._address_dict[h5py.h5o.get_info(h5py_group.id).addr] = h5py_group.name

    # iterate through group members
    test_iter = [name for name in h5py_group.keys()]
    for name in test_iter:
        obj = h5py_group[name]

        # get group member's link class
        obj_linkclass = h5py_group.get(name, getclass=True, getlink=True)

        # Datasets
        # TO DO, Soft Links #
        if issubclass(h5py_group.get(name, getclass=True), h5py.Dataset):
            if issubclass(obj_linkclass, h5py.ExternalLink):
                print(f"Dataset {obj.name} is not processed: External Link")
                continue
            dset = obj

            # number of filters
            dcpl = dset.id.get_create_plist()
            nfilters = dcpl.get_nfilters()
            if nfilters > 1:
                # TO DO #
                print(f"Dataset {dset.name} with multiple filters is not processed")
                continue
            elif nfilters == 1:
                # get first filter information
                filter_tuple = dset.id.get_create_plist().get_filter(0)
                filter_code = filter_tuple[0]
                if filter_code in self._hdf5_regfilters_subset and self._hdf5_regfilters_subset[filter_code] is not None:
                    # TO DO
                    if filter_code == 32001:
                        # Blosc: filter client data encodes (clevel, shuffle,
                        # compressor id) in its trailing values.
                        blosc_names = {0: 'blosclz', 1: 'lz4', 2: 'lz4hc',
                                       3: 'snappy', 4: 'zlib', 5: 'zstd'}
                        clevel, shuffle, cname_id = filter_tuple[2][-3:]
                        cname = blosc_names[cname_id]
                        compression = self._hdf5_regfilters_subset[filter_code](
                            cname=cname, clevel=clevel, shuffle=shuffle)
                    else:
                        compression = self._hdf5_regfilters_subset[filter_code](
                            level=filter_tuple[2])
                else:
                    print(f"Dataset {dset.name} with compression filter {filter_tuple[3]}, hdf5 filter number {filter_tuple[0]} is not processed: no compatible zarr codec")
                    continue
            else:
                compression = None

            object_codec = None
            if dset.dtype.names is not None:
                # Structured array with Reference dtype
                dset_type = dset.id.get_type()
                dt_nmembers = dset_type.get_nmembers()
                dtype_ = []
                dset_fillvalue = list(dset.fillvalue)
                for dt_i in range(dt_nmembers):
                    dtname = dset.dtype.names[dt_i]
                    if dset_type.get_member_class(dt_i) == h5py.h5t.REFERENCE:
                        # References are stored as file addresses (uint of the
                        # file's address size).
                        fcid = dset.file.id.get_create_plist()
                        unit_address_size, _ = fcid.get_sizes()
                        dtype_ += [(dtname, np.dtype(f'uint{unit_address_size*8}'))]
                        if dset.fillvalue[dt_i]:
                            dset_fillvalue[dt_i] = h5py.h5o.get_info([
                                h5py.h5r.dereference(dset.fillvalue[dt_i],
                                                     self.file.id)]).addr
                        else:
                            dset_fillvalue[dt_i] = 0
                    else:
                        dtype_ += [(dtname, dset.dtype.base[dt_i])]
                zarray = zgroup.create_dataset(dset.name,
                                               shape=dset.shape,
                                               dtype=dtype_,
                                               chunks=dset.chunks or False,
                                               fill_value=tuple(dset_fillvalue),
                                               compression=compression,
                                               overwrite=True)
            # variable-length Datasets
            elif h5py.check_vlen_dtype(dset.dtype):
                if not h5py.check_string_dtype(dset.dtype):
                    print(f"Dataset {dset.name} is not processed: Variable-length dataset, not string")
                    continue
                else:
                    object_codec = VLenHDF5String()
                    zarray = zgroup.create_dataset(dset.name,
                                                   shape=dset.shape,
                                                   dtype=object,
                                                   chunks=dset.chunks or False,
                                                   fill_value=dset.fillvalue,
                                                   compression=compression,
                                                   overwrite=True,
                                                   object_codec=object_codec)
                dset_chunks = dset.chunks
            elif dset.dtype.hasobject:
                # TO DO test #
                dset_type = dset.id.get_type()
                if dset_type.get_class() == h5py.h5t.REFERENCE:
                    fcid = dset.file.id.get_create_plist()
                    unit_address_size, _ = fcid.get_sizes()
                    dtype_ = np.dtype(f'uint{unit_address_size*8}')
                    if dset.fillvalue:
                        dset_fillvalue = h5py.h5o.get_info([
                            h5py.h5r.dereference(dset.fillvalue, self.file.id)]).addr
                    else:
                        dset_fillvalue = 0
                    zarray = zgroup.create_dataset(dset.name,
                                                   shape=dset.shape,
                                                   dtype=dtype_,
                                                   chunks=dset.chunks or False,
                                                   fill_value=dset_fillvalue,
                                                   compression=compression,
                                                   overwrite=True)
                elif dset_type.get_class() == h5py.h5t.STD_REF_DSETREG:
                    print(f"Dataset {dset.name} is not processed: Region Reference dtype")
                    continue
                else:
                    print(f"Dataset {dset.name} is not processed: Object dtype")
                    continue
            else:
                if compression is None and (dset.chunks is None or
                                            dset.chunks == dset.shape):
                    # Uncompressed, effectively contiguous data: shrink chunks
                    # until each chunk fits within self.max_chunksize.
                    dset_chunks = dset.chunks if dset.chunks else dset.shape
                    if dset.shape != ():
                        dset_chunks = list(dset_chunks)
                        dim_ = 0
                        ratio_ = self.max_chunksize / (np.prod(dset_chunks) *
                                                       dset.dtype.itemsize)
                        while ratio_ < 1:
                            chunk_dim_ = int(ratio_ * dset_chunks[dim_])
                            chunk_dim_ = chunk_dim_ if chunk_dim_ else 1
                            chunk_dim_ -= np.argmax(
                                dset_chunks[dim_] % np.arange(chunk_dim_,
                                                              chunk_dim_ // 2, -1))
                            dset_chunks[dim_] = int(chunk_dim_)
                            ratio_ = self.max_chunksize / (np.prod(dset_chunks) *
                                                           dset.dtype.itemsize)
                            dim_ += 1
                        dset_chunks = tuple(dset_chunks)
                    dset_chunks = dset_chunks or None
                else:
                    dset_chunks = dset.chunks
                zarray = zgroup.create_dataset(dset.name,
                                               shape=dset.shape,
                                               dtype=dset.dtype,
                                               chunks=dset_chunks or False,
                                               fill_value=dset.fillvalue,
                                               compression=compression,
                                               overwrite=True)

            self.copy_attrs_data_to_zarr_store(dset, zarray)
            # NOTE(review): `dset_chunks` appears unset on the structured-array
            # branch above — confirm storage_info tolerates that path.
            info = self.storage_info(dset, dset_chunks)
            if object_codec is not None:
                info = self.vlen_storage_info(dset, info)

            # Store metadata
            if info:
                info['source'] = {'uri': self.uri, 'array_name': dset.name}
                FileChunkStore.chunks_info(zarray, info)

        # Groups
        elif (issubclass(h5py_group.get(name, getclass=True), h5py.Group) and
              not issubclass(obj_linkclass, h5py.SoftLink)):
            if issubclass(obj_linkclass, h5py.ExternalLink):
                print(f"Group {obj.name} is not processed: External Link")
                continue
            group_ = obj
            zgroup_ = self.zgroup.create_group(group_.name, overwrite=True)
            self.create_zarr_hierarchy(group_, zgroup_)
        # Groups, Soft Link
        elif (issubclass(h5py_group.get(name, getclass=True), h5py.Group) and
              issubclass(obj_linkclass, h5py.SoftLink)):
            group_ = obj
            zgroup_ = self.zgroup.create_group(group_.name, overwrite=True)
            self.copy_attrs_data_to_zarr_store(group_, zgroup_)
            zgroup_path = zgroup_.create_group(SYMLINK, overwrite=True)
            zgroup_path.attrs[group_.name] = h5py_group.get(name, getlink=True).path
def _rewrite_vlen_to_fixed(h5py_group, changed_dsets=None):
    """ Scan hdf5 file or hdf5 group object and recursively convert
    variable-length string datasets to fixed-length

    Args:
      h5py_group: h5py.Group or h5py.File object

    Returns:
      dict mapping each renamed (moved) dataset name to its original name.

    Fix applied to the original: the mutable default argument
    ``changed_dsets={}`` was shared across top-level calls, so renames from
    one file's conversion leaked into the next; the default is now None and
    a fresh dict is created per call.
    """
    if changed_dsets is None:
        changed_dsets = {}
    # Only files, or hard-linked groups, are valid entry points.
    if (not isinstance(h5py_group, h5py.File) and
            (not issubclass(h5py_group.file.get(h5py_group.name, getclass=True),
                            h5py.Group) or
             not issubclass(h5py_group.file.get(h5py_group.name, getclass=True,
                                                getlink=True), h5py.HardLink))):
        raise TypeError(
            f"{h5py_group} should be a h5py.File or h5py.Group as a h5py.HardLink")

    # iterate through group members
    group_iter = [name for name in h5py_group.keys()]
    for name in group_iter:
        obj = h5py_group[name]

        # get group member's link class
        obj_linkclass = h5py_group.get(name, getclass=True, getlink=True)

        # Datasets
        if issubclass(h5py_group.get(name, getclass=True), h5py.Dataset):
            if issubclass(obj_linkclass, h5py.ExternalLink):
                print(f"Skipped rewriting variable-length dataset {obj.name}: External Link")
                continue
            dset = obj

            # variable-length string Datasets
            if h5py.check_vlen_dtype(dset.dtype) and h5py.check_string_dtype(dset.dtype):
                vlen_stringarr = dset[()]
                if dset.shape == ():
                    string_lengths_ = len(vlen_stringarr)
                    length_max = string_lengths_
                else:
                    length_max = max(len(el) for el in vlen_stringarr.flatten())
                if dset.fillvalue is not None:
                    length_max = max(length_max, len(dset.fillvalue))
                # Round the fixed length up to a multiple of 8.
                length_max = length_max + (-length_max) % 8
                dt_fixedlen = f'|S{length_max}'

                if isinstance(dset.fillvalue, str):
                    dset_fillvalue = dset.fillvalue.encode('utf-8')
                else:
                    dset_fillvalue = dset.fillvalue

                # Move the old dataset aside, then recreate it fixed-length
                # under the original name.
                affix_ = '_fixedlen~'
                dset_name = dset.name
                h5py_group.file.move(dset_name, dset_name + affix_)
                changed_dsets[dset_name + affix_] = dset_name
                dsetf = h5py_group.file.create_dataset_like(
                    dset_name, dset, dtype=dt_fixedlen, fillvalue=dset_fillvalue)

                # TO DO, copy attrs after all string dataset are moved
                for key, val in dset.attrs.items():
                    if isinstance(val, (bytes, np.bool_, str, int, float, np.number)):
                        dsetf.attrs[key] = val
                    else:
                        # TO DO #
                        print(f"Moving variable-length string Datasets: attribute value of type {type(val)} is not processed. Attribute {key} of object {dsetf.name}")

                if dsetf.shape == ():
                    if isinstance(vlen_stringarr, bytes):
                        dsetf[...] = vlen_stringarr
                    else:
                        dsetf[...] = vlen_stringarr.encode('utf-8')
                else:
                    dsetf[...] = vlen_stringarr.astype(dt_fixedlen)

        # Groups
        elif (issubclass(h5py_group.get(name, getclass=True), h5py.Group) and
              not issubclass(obj_linkclass, h5py.SoftLink)):
            if issubclass(obj_linkclass, h5py.ExternalLink):
                print(f"Group {obj.name} is not processed: External Link")
                continue
            changed_dsets = HDF5Zarr._rewrite_vlen_to_fixed(obj, changed_dsets)
    return changed_dsets