def test_clean_row_by_row(tmp_path):
    """Test that clean_by_row() drops the rows holding a NaN/NaT value.

    The work file holds three 5-element datasets with a missing value at
    index 4 (``d0``, NaT), index 1 (``d1``, NaN) and index 3 (``d2``, NaN).
    The expected file only holds the values of the indices 0 and 2. Both
    files are compared with the external ``h5diff`` command line tool.
    """
    # Imported locally so that the test does not rely on a module-level import.
    import subprocess

    # ARRANGE
    work_file_path = tmp_path / "test.h5"
    expected_file_path = tmp_path / "expected.h5"

    with h5py.File(work_file_path, "w") as file:
        data = np.array([0, 1, 2, 3, "NaT"], dtype="datetime64[us]")
        file.create_dataset(name="d0",
                            data=data.astype(h5py.opaque_dtype(data.dtype)),
                            chunks=True)
        file.create_dataset(name="d1", data=[0, np.nan, 2, 3, 4], chunks=True)
        file.create_dataset(name="d2", data=[0., 1., 2., np.nan, 4.], chunks=True)

    with h5py.File(expected_file_path, "w") as file:
        data = np.array([0, 2], dtype="datetime64[us]")
        file.create_dataset(name="d0",
                            data=data.astype(h5py.opaque_dtype(data.dtype)))
        file.create_dataset(name="d1", data=[0, 2])
        file.create_dataset(name="d2", data=[0., 2.])

    # ACT
    hdf_tools.clean_by_row(work_file_path)

    # ASSERT
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact; h5diff exits with 0 when it finds no difference.
    completed = subprocess.run(
        ["h5diff", str(work_file_path), str(expected_file_path)], check=False)
    assert completed.returncode == 0
def test__load_dataset(tmpdir):
    """
    Test load_dataset() function

    A context file with 10 events is written and the relative sizes of the
    returned train/valid/test index sets are compared with (0.7, 0.2, 0.1).
    """
    # ARRANGE
    creator = dataset_creator.DatasetCreator()
    path = tmpdir.join("context.hdf")
    n_events = 10

    all_true = np.ones((n_events,), dtype=bool)
    event_times = np.array(["2021-08-18T17:59:09"] * n_events,
                           dtype="datetime64[s]")
    trend_seconds = (3, 2, 2, 6, 0, 4, 2, 6, 0, 4)
    trend_times = np.array(
        [f"2021-08-18T17:59:{sec:02d}" for sec in trend_seconds],
        dtype="datetime64[s]")

    with h5py.File(path, 'w') as f:
        for label in ("is_bd_in_40ms", "is_bd_in_20ms", "is_bd"):
            f.create_dataset(label, data=all_true)
        f.create_dataset("test_data1", data=np.full((n_events,), 4.0))
        f.create_dataset("test_data2", data=np.zeros((n_events,)))
        # datetime64 values have to be stored with an opaque dtype
        f["Timestamp"] = event_times.astype(
            h5py.opaque_dtype(event_times.dtype))
        f["PrevTrendData/Timestamp"] = trend_times.astype(
            h5py.opaque_dtype(trend_times.dtype))
        f.create_dataset("clic_label/is_healthy", data=all_true)
        f.create_dataset("is_healthy", data=all_true)

    splits_expected = (0.7, 0.2, 0.1)

    # ACT
    np.random.seed(42)
    train, valid, test = dataset_creator.load_dataset(
        creator=creator, data_path=tmpdir / "context.hdf")
    subset_sizes = [len(subset.idx) for subset in (train, valid, test)]
    sum_elements = sum(subset_sizes)
    splits = tuple(size / sum_elements for size in subset_sizes)

    # ASSERT
    assert splits == splits_expected
def test_convert_iso8601_to_datetime__without_attrs(tmp_path):
    """Test the conversion of iso strings to datetime for hdf-datasets only.

    ``also_convert_attrs=False`` is passed, so the expected file keeps the
    attributes as byte strings while its datasets hold the converted
    datetime values. Both files are compared with the external ``h5diff``
    command line tool.
    """
    # Imported locally so that the test does not rely on a module-level import.
    import subprocess

    # ARRANGE
    work_file_path = tmp_path / "test.h5"
    expected_file_path = tmp_path / "expected.h5"
    attr_data = np.array([b"2021-01-01T00:00:00.123456789Z"])
    data = np.array([b"2021-01-01T00:00:00.111222333Z",
                     b"2021-01-01T00:00:00.444555666Z"])

    with h5py.File(work_file_path, "w") as file:
        file.attrs.create("at1", data=attr_data)
        file.create_dataset("ds1", data=data)
        grp = file.create_group("test")
        grp.attrs.create("at2", data=attr_data)
        grp.create_dataset("ds2", data=data)

    data_converted = pd.to_datetime(data.astype(str)).to_numpy(np.datetime64)
    data_converted = data_converted.astype(
        h5py.opaque_dtype(data_converted.dtype))

    with h5py.File(expected_file_path, "w") as file:
        file.attrs.create("at1", data=attr_data)
        file.create_dataset("ds1", data=data_converted)
        grp = file.create_group("test")
        grp.attrs.create("at2", data=attr_data)
        grp.create_dataset("ds2", data=data_converted)

    # ACT
    hdf_tools.convert_iso8601_to_datetime(work_file_path,
                                          also_convert_attrs=False)

    # ASSERT
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact; h5diff exits with 0 when it finds no difference.
    completed = subprocess.run(
        ["h5diff", str(work_file_path), str(expected_file_path)], check=False)
    assert completed.returncode == 0
def test__select_events_from_list(tmpdir):
    """
    Test select_events_from_list() function

    A file with 6 events is written and the returned selection is compared
    element-wise with the expected boolean mask.
    """
    # ARRANGE
    path = tmpdir.join("dummy.hdf")
    all_true = np.ones((6, ), dtype=bool)

    event_seconds = (0, 4, 2, 6, 7, 8)
    trend_seconds = (0, 1, 2, 3, 8, 9)
    event_times = np.array(
        [f"2021-08-18T17:59:{sec:02d}" for sec in event_seconds],
        dtype="datetime64[s]")
    trend_times = np.array(
        [f"2021-08-18T17:59:{sec:02d}" for sec in trend_seconds],
        dtype="datetime64[s]")

    with h5py.File(path, 'w') as f:
        # datetime64 values have to be stored with an opaque dtype
        f.create_dataset("Timestamp",
                         data=event_times.astype(
                             h5py.opaque_dtype(event_times.dtype)))
        f.create_dataset("PrevTrendData/Timestamp",
                         data=trend_times.astype(
                             h5py.opaque_dtype(trend_times.dtype)))
        for dataset_name in ("clic_label/is_healthy", "run_no", "test1",
                             "test2", "PSI Amplitude/pulse_amplitude"):
            f.create_dataset(dataset_name, data=all_true)

    selection_list = ["test1", "test2"]
    selection_expected = np.array([False, True, False, True, False, False])

    # ACT
    np.random.seed(42)
    selection_out = dataset_utils.select_events_from_list(path, selection_list)

    # ASSERT
    matches = selection_out == selection_expected
    assert matches.all()
def _get_timestamp(attrs: h5py.AttributeManager):
    """
    reads the "Timestamp" attribute and returns it as an opaque numpy datetime
    :param attrs: the h5py.AttributeManager of an hdf.Group object
    :return: the timestamp as numpy datetime, cast to the opaque 'M8[us]' dtype
    """
    raw_timestamp = attrs["Timestamp"]
    # the last character is cut off before parsing (presumably a trailing
    # time zone designator like "Z" - verify against the data source)
    timestamp = np.datetime64(raw_timestamp[:-1])
    opaque_microseconds = h5py.opaque_dtype('M8[us]')
    return timestamp.astype(opaque_microseconds)
def write_datasets(f):
    """Write two opaque-typed datasets into ``f``, then flush and close it.

    ``timestamp`` holds five datetime64 values (22nd of February, 14:14:14,
    for the years 2017 to 2021) and ``opaque_2d_string`` holds the numbers
    0..34 as byte strings in a 5x7 layout.

    :param f: object providing ``create_dataset``, ``flush`` and ``close``
        (presumably an open, writable h5py file).
    """
    # dataset of special values
    timestamps = np.array([
        np.datetime64(f'{year}-02-22T14:14:14') for year in range(2017, 2022)
    ])
    opaque_timestamps = timestamps.astype(h5py.opaque_dtype(timestamps.dtype))
    f.create_dataset('timestamp', data=opaque_timestamps)

    # 2D String data
    strings_2d = np.arange(35).reshape(5, 7).astype(bytes)
    opaque_strings = strings_2d.astype(h5py.opaque_dtype(strings_2d.dtype))
    f.create_dataset('opaque_2d_string', data=opaque_strings)

    f.flush()
    f.close()
def test_timedelta(self):
    """Round-trip a timedelta64 value through an opaque-typed dataset.

    For every unit in ``self.datetime_units`` and both byte orders a
    one-element array is written to a dataset named "default"; the dataset
    must compare equal to the array and report the same dtype.
    """
    fname = self.mktemp()

    for unit in self.datetime_units:
        for byte_order in ('<', '>'):
            descr = '{}m8[{}]'.format(byte_order, unit)
            opaque = h5py.opaque_dtype(np.dtype(descr))
            expected = np.array([np.timedelta64(500, unit)], dtype=opaque)

            with h5py.File(fname, 'w') as f:
                dset = f.create_dataset("default", data=expected, dtype=opaque)

                self.assertArrayEqual(expected, dset)
                self.assertEqual(expected.dtype, dset.dtype)
def test__select_events(tmpdir, dummy_features, selection_filter_expected):
    """
    Test select_events() function of XBOX2EventAllBD20msSelect

    A file with 4 events is written, using dummy_features for all breakdown
    labels, and the returned filter is compared element-wise with
    selection_filter_expected.
    """
    # ARRANGE
    selector = XBOX2_event_all_bd_20ms.XBOX2EventAllBD20msSelect()
    path = tmpdir.join("dummy.hdf")

    event_times = np.array(
        [f"2021-08-18T17:59:{sec:02d}" for sec in (0, 4, 2, 6)],
        dtype="datetime64[s]")
    trend_times = np.array(
        [f"2021-08-18T17:59:{sec:02d}" for sec in (0, 1, 2, 3)],
        dtype="datetime64[s]")
    healthy_labels = np.array([True, True, False, False])

    with h5py.File(path, 'w') as f:
        for label in ("is_bd_in_40ms", "is_bd_in_20ms", "is_bd"):
            f.create_dataset(label, data=dummy_features)
        # datetime64 values have to be stored with an opaque dtype
        f["Timestamp"] = event_times.astype(
            h5py.opaque_dtype(event_times.dtype))
        f["PrevTrendData/Timestamp"] = trend_times.astype(
            h5py.opaque_dtype(trend_times.dtype))
        f.create_dataset("clic_label/is_healthy", data=healthy_labels)

    # ACT
    np.random.seed(42)
    selection_filter_out = selector.select_events(path)

    # ASSERT
    matches = selection_filter_expected == selection_filter_out
    assert matches.all()
def convert_attrs(_: str, hdf_obj):
    """This visitor function (hdf.File.visititems()) converts all the attributes of the given hdf_obj.

    Every attribute whose value can be parsed with the format
    ``%Y-%m-%dT%H:%M:%S.%f`` is deleted and recreated as numpy datetime with
    an opaque dtype. Attributes that cannot be parsed are left untouched.

    :param _: name of the visited object (unused, required by visititems()).
    :param hdf_obj: object whose ``attrs`` mapping is converted in place.
    """
    # Iterate over a snapshot, as attributes are deleted and recreated below.
    for attrs_key, val in list(hdf_obj.attrs.items()):
        try:
            # A plain python str has no .astype(); pandas can parse it directly.
            as_text = val if isinstance(val, str) else val.astype(str)
            val = pd.to_datetime(as_text, format="%Y-%m-%dT%H:%M:%S.%f")
        except ValueError:
            # not a timestamp in the expected format -> keep the attribute
            continue
        val = val.to_numpy(np.datetime64)
        del hdf_obj.attrs[attrs_key]
        hdf_obj.attrs.create(name=attrs_key,
                             data=np.array(val).astype(
                                 h5py.opaque_dtype(val.dtype)))
def get_trend_data_features(length: int, trend_data_file_path: Path) -> typing.Generator:
    """Yields one TrendDataFeature per top-level key of the trend data file.

    The feature of the key "Timestamp" gets an opaque "M8[us]" output dtype,
    all other features get float.

    :param length: number of values that will be calculated by each feature.
    :param trend_data_file_path: file path of the trend_data_file
    :return: generator of features"""
    with h5py.File(trend_data_file_path, "r") as file:
        for key in file:
            if key == "Timestamp":
                output_dtype = h5py.opaque_dtype("M8[us]")
            else:
                output_dtype = float
            selector = _select(trend_data_file_path, key)
            yield TrendDataFeature(name=key,
                                   func=selector,
                                   output_dtype=output_dtype,
                                   length=length,
                                   hdf_path="PrevTrendData",
                                   info=f"Previous Trend Data of {key}")
def get_event_attribute_features(length: int) -> typing.Generator:
    """This function generates all EventAttributeFeatures for the xbox2 data set.

    The first feature is the "Timestamp" (hdf_path "/", opaque 'M8[us]'
    dtype), followed by one boolean feature for each of "is_healthy",
    "is_bd_in_40ms", "is_bd_in_20ms" and "is_bd" (hdf_path "/clic_label/").

    :param length: number of values that will be calculated by each feature.
    :return: generator of features"""
    yield EventAttributeFeature(
        name="Timestamp",
        func=_get_timestamp,
        length=length,
        hdf_path="/",
        output_dtype=h5py.opaque_dtype('M8[us]'),
        info="The timestamp of the EventData is a property of the event group. It is given in "
        "a datetime format with micro seconds precision.")

    # Adjacent string literals are joined without a separator, so every
    # fragment has to end with its own trailing space.
    log_type_info = (
        "These values originated from the Log_Type assigned by the CLIC-Team. "
        "Originally the Log_Type property had values in {0,1,2,3} where 0 stood for a "
        "healthy or normal log signal, and 3 for a breakdown. The label 1 and 2 stood "
        "for breakdown in 20ms and 40ms, so the signals prior to a breakdown.")

    for is_type in ["is_healthy", "is_bd_in_40ms", "is_bd_in_20ms", "is_bd"]:
        func = _log_type_creator(is_type)
        yield EventAttributeFeature(name=is_type,
                                    func=func,
                                    length=length,
                                    hdf_path="/clic_label/",
                                    output_dtype=bool,
                                    info=log_type_info)
def convert_iso8601_to_datetime(file_path: Path, also_convert_attrs: bool = True) -> None:
    """converts datasets and attributes of strings of iso8601 format to numpy datetime format.

    The file is opened in "r+" mode and modified in place. Every dataset (and,
    if requested, every attribute) that can be parsed with the format
    ``%Y-%m-%dT%H:%M:%S.%f`` is deleted and recreated as numpy datetime with
    an opaque dtype. Everything that cannot be parsed is left untouched.

    :param file_path: Path of the hdf file to convert.
    :param also_convert_attrs: boolean value to define if attrs datetime should be converted too."""
    iso_format = "%Y-%m-%dT%H:%M:%S.%f"

    def convert_attrs(_: str, hdf_obj):
        """This visitor function (hdf.File.visititems()) converts all the attributes of the given hdf_obj."""
        # Iterate over a snapshot, as attributes are deleted and recreated below.
        for attrs_key, val in list(hdf_obj.attrs.items()):
            try:
                # A plain python str has no .astype(); pandas can parse it directly.
                as_text = val if isinstance(val, str) else val.astype(str)
                val = pd.to_datetime(as_text, format=iso_format)
            except ValueError:
                # not a timestamp in the expected format -> keep the attribute
                continue
            val = val.to_numpy(np.datetime64)
            del hdf_obj.attrs[attrs_key]
            hdf_obj.attrs.create(name=attrs_key,
                                 data=np.array(val).astype(
                                     h5py.opaque_dtype(val.dtype)))

    with h5py.File(file_path, mode="r+") as file:
        if also_convert_attrs:
            # visititems() does not visit the root object itself
            convert_attrs("/", file)
            file.visititems(convert_attrs)

        # list() takes a snapshot, as datasets are deleted and recreated below
        for key, channel in list(get_all_dataset_items(file)):
            try:
                data = pd.to_datetime(channel[:].astype(str), format=iso_format)
            except ValueError:
                # not a dataset of timestamps in the expected format -> keep it
                continue
            data = data.to_numpy(np.datetime64)
            del file[key]
            file.create_dataset(name=key,
                                data=data.astype(h5py.opaque_dtype(data.dtype)))
def test__load_dataset(tmpdir):
    """
    Test load_dataset() function with the XBOX2EventAllBD20msSelect selector

    A context file with 10 events is written and the relative sizes of the
    returned train/valid/test index sets are compared with (0.7, 0.2, 0.1).
    """
    # ARRANGE
    selector = XBOX2_event_all_bd_20ms.XBOX2EventAllBD20msSelect()
    path = tmpdir.join("context.hdf")
    n_events = 10

    all_true = np.ones((n_events, ), dtype=bool)
    event_times = np.array(["2021-08-18T17:59:09"] * n_events,
                           dtype="datetime64[s]")
    trend_seconds = (3, 2, 2, 6, 0, 4, 2, 6, 0, 4)
    trend_times = np.array(
        [f"2021-08-18T17:59:{sec:02d}" for sec in trend_seconds],
        dtype="datetime64[s]")
    selection_list = [
        "DC_Down__D1", "DC_Down__D9",
        "DC_Down__tsfresh__mean", "DC_Down__tsfresh__maximum",
        "DC_Down__tsfresh__median", "DC_Down__tsfresh__minimum",
        "DC_Up__D1", "DC_Up__D9",
        "DC_Up__tsfresh__mean", "DC_Up__tsfresh__maximum",
        "DC_Up__tsfresh__median", "DC_Up__tsfresh__minimum",
        "PEI_Amplitude__pulse_length", "PEI_Amplitude__pulse_amplitude",
        "PKI_Amplitude__pulse_length", "PKI_Amplitude__pulse_amplitude",
        "PSI_Amplitude__pulse_length", "PSI_Amplitude__pulse_amplitude",
    ]

    with h5py.File(path, 'w') as f:
        for label in ("is_bd_in_40ms", "is_bd_in_20ms", "is_bd"):
            f.create_dataset(label, data=all_true)
        for name in selection_list:
            f.create_dataset(name, data=np.ones((n_events, )))
        # datetime64 values have to be stored with an opaque dtype
        f["Timestamp"] = event_times.astype(
            h5py.opaque_dtype(event_times.dtype))
        f["PrevTrendData/Timestamp"] = trend_times.astype(
            h5py.opaque_dtype(trend_times.dtype))
        f.create_dataset("clic_label/is_healthy", data=all_true)
        f.create_dataset("is_healthy", data=all_true)

    splits_expected = (0.7, 0.2, 0.1)

    # ACT
    np.random.seed(42)
    train, valid, test = dataset_creator.load_dataset(creator=selector,
                                                      data_path=tmpdir)
    subset_sizes = [len(subset.idx) for subset in (train, valid, test)]
    sum_elements = sum(subset_sizes)
    splits = tuple(size / sum_elements for size in subset_sizes)

    # ASSERT
    assert splits == splits_expected
import h5py
import numpy as np

# numpy datetime64 data is stored by tagging its dtype as opaque first
arr = np.array([np.datetime64('2019-09-22T17:38:30')])
opaque_arr = arr.astype(h5py.opaque_dtype(arr.dtype))

with h5py.File('datetimes.h5', 'w') as f:
    # Create dataset
    f['data'] = opaque_arr

    # Read
    stored = f['data'][:]
    print(stored)
def test__load_dataset(tmpdir):
    """
    Test load_dataset() function with the XBOX2TrendAllBD20msSelect selector

    A context file with 10 events is written and the relative sizes of the
    returned train/valid/test index sets are compared with the requested
    splits (0.7, 0.2, 0.1).
    """
    # ARRANGE
    selector = XBOX2_trend_all_bd_20ms.XBOX2TrendAllBD20msSelect()
    path = tmpdir.join("context.hdf")
    dummy_is_bd_in_40ms_labels = np.ones((10, ), dtype=bool)
    dummy_is_bd_in_20ms_labels = np.ones((10, ), dtype=bool)
    dummy_is_bd_labels = np.ones((10, ), dtype=bool)
    dummy_event_timestamps = np.array(
        [np.datetime64('2021-08-18T17:59:09')] * 10)
    dummy_trend_timestamps = np.array([
        np.datetime64('2021-08-18T17:59:03'),
        np.datetime64('2021-08-18T17:59:02'),
        np.datetime64('2021-08-18T17:59:02'),
        np.datetime64('2021-08-18T17:59:06'),
        np.datetime64('2021-08-18T17:59:00'),
        np.datetime64('2021-08-18T17:59:04'),
        np.datetime64('2021-08-18T17:59:02'),
        np.datetime64('2021-08-18T17:59:06'),
        np.datetime64('2021-08-18T17:59:00'),
        np.datetime64('2021-08-18T17:59:04')
    ])
    dummy_is_healthy_labels = np.ones((10, ), dtype=bool)
    selection_list = [
        "PrevTrendData__Loadside_win", "PrevTrendData__Tubeside_win",
        "PrevTrendData__Collector", "PrevTrendData__Gun",
        "PrevTrendData__IP_before_PC", "PrevTrendData__PC_IP",
        "PrevTrendData__WG_IP", "PrevTrendData__IP_Load",
        "PrevTrendData__IP_before_structure",
        "PrevTrendData__US_Beam_Axis_IP",
        "PrevTrendData__Klystron_Flange_Temp", "PrevTrendData__Load_Temp",
        "PrevTrendData__PC_Left_Cavity_Temp",
        "PrevTrendData__PC_Right_Cavity_Temp",
        "PrevTrendData__Bunker_WG_Temp",
        "PrevTrendData__Structure_Input_Temp", "PrevTrendData__Chiller_1",
        "PrevTrendData__Chiller_2", "PrevTrendData__Chiller_3",
        "PrevTrendData__PKI_FT_avg", "PrevTrendData__PSI_FT_avg",
        "PrevTrendData__PSR_FT_avg", "PrevTrendData__PSI_max",
        "PrevTrendData__PSR_max", "PrevTrendData__PEI_max",
        "PrevTrendData__DC_Down_min", "PrevTrendData__DC_Up_min",
        "PrevTrendData__PSI_Pulse_Width"
    ]

    # The file is opened directly in the with statement, so the handle is
    # closed even if writing one of the datasets fails.
    with h5py.File(path, 'w') as f:
        f.create_dataset("is_bd_in_40ms", data=dummy_is_bd_in_40ms_labels)
        f.create_dataset("is_bd_in_20ms", data=dummy_is_bd_in_20ms_labels)
        f.create_dataset("is_bd", data=dummy_is_bd_labels)
        for name in selection_list:
            f.create_dataset(name, data=np.ones((10, )))
        # datetime64 values have to be stored with an opaque dtype
        f["Timestamp"] = dummy_event_timestamps.astype(
            h5py.opaque_dtype(dummy_event_timestamps.dtype))
        f["PrevTrendData/Timestamp"] = dummy_trend_timestamps.astype(
            h5py.opaque_dtype(dummy_trend_timestamps.dtype))
        f.create_dataset("clic_label/is_healthy", data=dummy_is_healthy_labels)
        f.create_dataset("is_healthy", data=dummy_is_healthy_labels)
        f.create_dataset("run_no", data=dummy_is_bd_labels)

    splits_expected = (0.7, 0.2, 0.1)

    # ACT
    np.random.seed(42)
    train, valid, test = dataset_creator.load_dataset(creator=selector,
                                                      data_path=tmpdir,
                                                      splits=splits_expected)
    sum_elements = len(train.idx) + len(valid.idx) + len(test.idx)
    splits = (len(train.idx) / sum_elements, len(valid.idx) / sum_elements,
              len(test.idx) / sum_elements)

    # ASSERT
    assert splits == splits_expected
def _create_file(cls, name):
    """Create a test hdf5 file with random content and return it still open.

    The file is written in this order: nested groups, soft links to groups,
    datasets, soft links to datasets, scalar attributes and array attributes.
    Structural choices (shapes, link targets, dtype order) are drawn from
    ``cls.srand``; the raw data bytes are drawn from an unseeded
    ``np.random.default_rng()``.

    Reads ``cls.srand``, ``cls.n_groups``, ``cls.depth``,
    ``cls.n_groupsoftlink``, ``cls.n_dsets``, ``cls.n_dsetsoftlink``,
    ``cls.n_attributes_min`` and ``cls.dset_dtypes`` (which is shuffled in
    place) and sets ``cls.temp_file``.

    :param name: prefix of the temporary file name.
    :return: the ``h5py.File`` opened in 'w' mode; it is not closed here and
        the temporary file is created with ``delete=False``.
    """
    srand = cls.srand
    # create hdf5 file
    # only the unique file name is needed, so the handle is closed right away
    cls.temp_file = tempfile.NamedTemporaryFile(suffix=".hdf5",
                                                prefix=name,
                                                delete=False)
    cls.temp_file.close()
    hfile = h5py.File(cls.temp_file.name, 'w')

    # create nested groups
    groupnames_prefix = [chr(65 + i) for i in range(cls.n_groups)
                         ]  # e.g. ['A', 'B', 'C']
    group_list = [hfile]  # list containing all groups

    def _create_groups(obj, d):
        # recursively adds one group per prefix to obj; d counts the levels
        # that are still to be created below the current one
        nonlocal group_list
        for c in groupnames_prefix:
            g_name = c + str(cls.depth - d)
            g = obj.create_group(g_name)
            group_list.append(g)
            if d > 0:
                _create_groups(obj[g_name], d - 1)

    _create_groups(hfile, cls.depth)

    # create softlinks to groups
    for g in group_list:
        for i in range(cls.n_groupsoftlink):
            # do not use rand_rng.choice
            target_str = srand.choice(group_list).name
            g[f"SoftLg{i}"] = h5py.SoftLink(target_str)

    # create datasets
    # TO DO, external dsets
    # TO DO, compression
    srand.shuffle(cls.dset_dtypes)
    iter_dtypes = itertools.cycle(
        cls.dset_dtypes
    )  # shuffle dtypes to cycle over when creating dsets
    iter_chunks = itertools.cycle(
        [True, None])  # True (auto chunking) or None (no chunking) cycle
    iter_track_times = itertools.cycle(
        [False, True])  # True or False cycle for track_times
    iter_track_order = itertools.cycle(
        [False, False, True, True])  # True or False cycle for track_order
    iter_fillvalue = itertools.cycle(
        [None, True, True, None])  # None (no fillvalue) or True (set a fillvalue) cycle
    rand_rng = np.random.default_rng()
    dset_list = []
    for g in group_list:
        # TO DO, add test with datasets with zero in dimensions
        for i in range(cls.n_dsets):
            shape = srand.choices(range(1, 90 // (i or 1)),
                                  k=i)  # dseti has i dimensions
            # NOTE(review): for i == 0 the shape is empty and np.prod([]) is
            # the float 1.0 - confirm that the calls below accept a float size
            size = np.prod(shape)
            dtype = next(iter_dtypes)
            if dtype == np.bool_:
                # random booleans: sign of random 64 bit integers
                data = np.frombuffer(rand_rng.bytes(size * 8),
                                     dtype=np.int64) > 0
            elif dtype == np.datetime64:
                # random nanosecond offsets from the epoch, stored as opaque
                data = np.datetime64(
                    '1970-01-01T00:00:00', 'ns') + np.frombuffer(
                        rand_rng.bytes(size * 8), dtype=np.uint64)
                dtype = h5py.opaque_dtype(data.dtype)
                data = data.astype(dtype)
            else:
                # random bytes reinterpreted as the requested dtype
                data = np.frombuffer(rand_rng.bytes(
                    size * np.dtype(dtype).itemsize),
                                     dtype=dtype)

            # create_dataset options compatibility
            # chunking is only requested for datasets with at least one dimension
            if len(shape) > 0:
                chunks = next(iter_chunks)
            else:
                chunks = None
            # compression = None
            # compression_opts = None
            # shuffle = None
            # fletcher32 = None
            # scaleoffset = None
            # no fillvalue for datetime data (dtype char 'M'); otherwise a
            # random element of the data is used as fillvalue
            fillvalue = None if (
                next(iter_fillvalue) is None or data.dtype.char
                == 'M') else data.reshape(size)[rand_rng.integers(0, size)]
            dset = g.create_dataset(
                name='dset' + str(i),
                shape=shape,
                data=data,
                dtype=dtype,
                chunks=chunks,
                # chunked datasets get a randomly enlarged maxshape
                maxshape=None if chunks is None else tuple(
                    (np.array(shape) + rand_rng.integers(0, 5)) *
                    rand_rng.integers(1, 5, size=len(shape))),
                track_times=next(iter_track_times),
                track_order=next(iter_track_order),
                fillvalue=fillvalue)
            dset_list.append(dset)

    # create softlinks to datasets
    for g in group_list:
        for i in range(cls.n_dsetsoftlink):
            # do not use rand_rng.choice
            target_str = srand.choice(dset_list).name
            g[f"SoftLd{i}"] = h5py.SoftLink(target_str)

    # add attributes
    srand.shuffle(cls.dset_dtypes)
    iter_dtypes = itertools.cycle(
        cls.dset_dtypes
    )  # shuffle dtypes to cycle over when creating attributes
    for obj in itertools.chain(group_list, dset_list):
        # scalar attributes, named 'a', 'b', 'c', ...
        for i in range(
                rand_rng.integers(cls.n_attributes_min, 26, endpoint=True)):
            dtype = next(iter_dtypes)
            attr_name = chr(97 + i)
            if dtype == np.bool_:
                attr = np.frombuffer(rand_rng.bytes(8), dtype=np.int64) > 0
            elif dtype == np.datetime64:
                # no scalar datetime attribute is written
                continue
            else:
                attr = np.frombuffer(rand_rng.bytes(
                    np.dtype(dtype).itemsize),
                                     dtype=dtype)
            obj.attrs[attr_name] = attr[0]

        # add array attributes
        for i in range(
                rand_rng.integers(cls.n_attributes_min, 26, endpoint=True)):
            shape = srand.choices(range(1, 10 // (i // 5 or 1)),
                                  k=i // 5)  # attributes has i//5 dimensions
            size = np.prod(shape)
            dtype = next(iter_dtypes)
            attr_name = chr(65 + i) + '_array_attr'
            if dtype == np.bool_:
                attr = np.frombuffer(rand_rng.bytes(size * 8),
                                     dtype=np.int64) > 0
            elif dtype == np.datetime64:
                attr = np.datetime64(
                    '1970-01-01T00:00:00', 'ns') + np.frombuffer(
                        rand_rng.bytes(size * 8), dtype=np.uint64)
                attr = attr.astype(h5py.opaque_dtype(attr.dtype))
            else:
                attr = np.frombuffer(rand_rng.bytes(
                    size * np.dtype(dtype).itemsize),
                                     dtype=dtype)
            obj.attrs[attr_name] = attr
    return hfile