Ejemplos de opaque_dtype en Python, ejemplos de h5py.opaque_dtype en Python

Ejemplo n.º 1

0

Mostrar archivo

def test_clean_row_by_row(tmp_path):
    """Check clean_by_row against a hand-built expected file.

    The work file holds three datasets of five rows each; rows 1, 3 and 4
    contain a NaN/NaT in one of the datasets.  The expected file holds only
    rows 0 and 2 of every dataset.  Both files are compared with the
    ``h5diff`` command line tool, which must be available on ``PATH``.

    :param tmp_path: pytest fixture providing a temporary directory
    """
    # Local import: only the comparison step at the end needs it.
    import subprocess

    # ARRANGE
    work_file_path = tmp_path / "test.h5"
    expected_file_path = tmp_path / "expected.h5"

    with h5py.File(work_file_path, "w") as file:
        data = np.array([0, 1, 2, 3, "NaT"], dtype="datetime64[us]")
        file.create_dataset(name="d0",
                            data=data.astype(h5py.opaque_dtype(data.dtype)),
                            chunks=True)
        file.create_dataset(name="d1", data=[0, np.nan, 2, 3, 4], chunks=True)
        file.create_dataset(name="d2",
                            data=[0., 1., 2., np.nan, 4.],
                            chunks=True)
    with h5py.File(expected_file_path, "w") as file:
        data = np.array([0, 2], dtype="datetime64[us]")
        file.create_dataset(name="d0",
                            data=data.astype(h5py.opaque_dtype(data.dtype)))
        file.create_dataset(name="d1", data=[0, 2])
        file.create_dataset(name="d2", data=[0., 2.])

    # ACT
    hdf_tools.clean_by_row(work_file_path)

    # ASSERT
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact; h5diff exits with 0 when it finds no difference.
    completed = subprocess.run(
        ["h5diff", str(work_file_path), str(expected_file_path)],
        check=False)
    assert completed.returncode == 0

Ejemplo n.º 2

0

Mostrar archivo

Archivo: test_dataset_creator.py Proyecto: cobermai/rfstudies

def test__load_dataset(tmpdir):
    """
    Check that load_dataset() yields a 70/20/10 split on a dummy context file.
    """
    # ARRANGE
    creator = dataset_creator.DatasetCreator()
    path = tmpdir.join("context.hdf")

    # Ten identical event timestamps and ten trend timestamps that differ
    # only in their seconds.
    event_stamps = np.array([np.datetime64('2021-08-18T17:59:09')] * 10)
    trend_stamps = np.array([
        np.datetime64(f'2021-08-18T17:59:0{second}')
        for second in (3, 2, 2, 6, 0, 4, 2, 6, 0, 4)
    ])
    healthy_labels = np.ones((10,), dtype=bool)

    with h5py.File(path, 'w') as f:
        for label_name in ("is_bd_in_40ms", "is_bd_in_20ms", "is_bd"):
            f.create_dataset(label_name, data=np.ones((10,), dtype=bool))
        f.create_dataset("test_data1", data=4 * np.ones((10,)))
        f.create_dataset("test_data2", data=np.zeros((10,)))
        f["Timestamp"] = event_stamps.astype(
            h5py.opaque_dtype(event_stamps.dtype))
        f["PrevTrendData/Timestamp"] = trend_stamps.astype(
            h5py.opaque_dtype(trend_stamps.dtype))
        f.create_dataset("clic_label/is_healthy", data=healthy_labels)
        f.create_dataset("is_healthy", data=healthy_labels)

    splits_expected = (0.7, 0.2, 0.1)

    # ACT
    np.random.seed(42)

    train, valid, test = dataset_creator.load_dataset(
        creator=creator, data_path=tmpdir / "context.hdf")
    part_sizes = [len(part.idx) for part in (train, valid, test)]
    total = sum(part_sizes)
    splits = tuple(size / total for size in part_sizes)

    # ASSERT
    assert splits == splits_expected

Ejemplo n.º 3

0

Mostrar archivo

def test_convert_iso8601_to_datetime__without_attrs(tmp_path):
    """Check ISO 8601 conversion of datasets while attributes stay untouched.

    The work file and the expected file carry the same byte-string
    attributes; the expected file holds the datasets already converted to
    opaque numpy datetimes.  After calling ``convert_iso8601_to_datetime``
    with ``also_convert_attrs=False`` both files must be identical according
    to the ``h5diff`` command line tool (which must be available on ``PATH``).

    :param tmp_path: pytest fixture providing a temporary directory
    """
    # Local import: only the comparison step at the end needs it.
    import subprocess

    # ARRANGE
    work_file_path = tmp_path / "test.h5"
    expected_file_path = tmp_path / "expected.h5"

    attr_data = np.array([b"2021-01-01T00:00:00.123456789Z"])
    data = np.array(
        [b"2021-01-01T00:00:00.111222333Z", b"2021-01-01T00:00:00.444555666Z"])
    with h5py.File(work_file_path, "w") as file:
        file.attrs.create("at1", data=attr_data)
        file.create_dataset("ds1", data=data)
        grp = file.create_group("test")
        grp.attrs.create("at2", data=attr_data)
        grp.create_dataset("ds2", data=data)

    data_converted = pd.to_datetime(data.astype(str)).to_numpy(np.datetime64)
    data_converted = data_converted.astype(
        h5py.opaque_dtype(data_converted.dtype))
    with h5py.File(expected_file_path, "w") as file:
        file.attrs.create("at1", data=attr_data)
        file.create_dataset("ds1", data=data_converted)
        grp = file.create_group("test")
        grp.attrs.create("at2", data=attr_data)
        grp.create_dataset("ds2", data=data_converted)

    # ACT
    hdf_tools.convert_iso8601_to_datetime(work_file_path,
                                          also_convert_attrs=False)

    # ASSERT
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact; h5diff exits with 0 when it finds no difference.
    completed = subprocess.run(
        ["h5diff", str(work_file_path), str(expected_file_path)],
        check=False)
    assert completed.returncode == 0

Ejemplo n.º 4

0

Mostrar archivo

Archivo: test_dataset_utils.py Proyecto: cobermai/rfstudies

def test__select_events_from_list(tmpdir):
    """
    Check select_events_from_list() against a fixed expected boolean mask.
    """
    # ARRANGE
    path = tmpdir.join("dummy.hdf")
    all_true = np.ones((6, ), dtype=bool)

    # Event and trend timestamps differ only in their seconds.
    event_stamps = np.array([
        np.datetime64(f'2021-08-18T17:59:0{second}')
        for second in (0, 4, 2, 6, 7, 8)
    ])
    trend_stamps = np.array([
        np.datetime64(f'2021-08-18T17:59:0{second}')
        for second in (0, 1, 2, 3, 8, 9)
    ])

    with h5py.File(path, 'w') as f:
        f.create_dataset("Timestamp",
                         data=event_stamps.astype(
                             h5py.opaque_dtype(event_stamps.dtype)))
        f.create_dataset("PrevTrendData/Timestamp",
                         data=trend_stamps.astype(
                             h5py.opaque_dtype(trend_stamps.dtype)))
        for dataset_name in ("clic_label/is_healthy", "run_no", "test1",
                             "test2", "PSI Amplitude/pulse_amplitude"):
            f.create_dataset(dataset_name, data=all_true)

    selection_list = ["test1", "test2"]
    selection_expected = np.array([False, True, False, True, False, False])

    # ACT
    np.random.seed(42)
    selection_out = dataset_utils.select_events_from_list(path, selection_list)

    # ASSERT
    matches = selection_out == selection_expected
    assert matches.all()

Ejemplo n.º 5

0

Mostrar archivo

Archivo: attribute.py Proyecto: cobermai/rfstudies

def _get_timestamp(attrs: h5py.AttributeManager):
    """
    Read the "Timestamp" attribute and return it as an opaque numpy datetime.
    :param attrs: the h5py.AttributeManager of an hdf.Group object
    :return: the timestamp as numpy datetime, cast to the opaque 'M8[us]' dtype
    """
    raw_stamp = attrs["Timestamp"]
    # The last character is dropped before parsing
    # (presumably a trailing "Z" -- confirm against the data).
    trimmed_stamp = raw_stamp[:-1]
    parsed = np.datetime64(trimmed_stamp)
    microsecond_dtype = h5py.opaque_dtype('M8[us]')
    return parsed.astype(microsecond_dtype)

Ejemplo n.º 6

0

Mostrar archivo

def write_datasets(f):
    """Write two opaque-dtype datasets into ``f``, then flush and close it.

    :param f: object offering create_dataset(), flush() and close()
        (presumably an open, writable h5py.File -- confirm against caller)
    """
    # dataset of special values: one timestamp per year, 2017 through 2021
    stamps = np.array([
        np.datetime64(f'{year}-02-22T14:14:14') for year in range(2017, 2022)
    ])
    opaque_stamps = stamps.astype(h5py.opaque_dtype(stamps.dtype))
    f.create_dataset('timestamp', data=opaque_stamps)

    # 2D String data: the numbers 0..34 as byte strings in a 5x7 grid
    text_grid = np.arange(35).reshape(5, 7).astype(bytes)
    opaque_text = text_grid.astype(h5py.opaque_dtype(text_grid.dtype))
    f.create_dataset('opaque_2d_string', data=opaque_text)

    f.flush()
    f.close()

Ejemplo n.º 7

0

Mostrar archivo

Archivo: test_dtype.py Proyecto: bomber8013/h5py

    def test_timedelta(self):
        """Round-trip timedelta64 arrays through opaque-dtype datasets.

        Every unit in ``self.datetime_units`` is tried with both the '<' and
        the '>' byte-order prefix; the file is re-created in 'w' mode for
        each combination.
        """
        fname = self.mktemp()
        byte_orders = ['<', '>']

        for unit in self.datetime_units:
            sample = np.timedelta64(500, unit)
            for order in byte_orders:
                opaque = h5py.opaque_dtype(np.dtype(f'{order}m8[{unit}]'))
                arr = np.array([sample], dtype=opaque)

                with h5py.File(fname, 'w') as f:
                    dset = f.create_dataset("default", data=arr, dtype=opaque)
                    # both the values and the dtype must survive the write
                    self.assertArrayEqual(arr, dset)
                    self.assertEqual(arr.dtype, dset.dtype)

Ejemplo n.º 8

0

Mostrar archivo

Archivo: test_XBOX2_event_all_bd_20ms.py Proyecto: cobermai/rfstudies

def test__select_events(tmpdir, dummy_features, selection_filter_expected):
    """
    Check select_events() of the XBOX2 event selector on a dummy context file.
    """
    # ARRANGE
    selector = XBOX2_event_all_bd_20ms.XBOX2EventAllBD20msSelect()
    path = tmpdir.join("dummy.hdf")

    # Event and trend timestamps differ only in their seconds.
    event_stamps = np.array([
        np.datetime64(f'2021-08-18T17:59:0{second}')
        for second in (0, 4, 2, 6)
    ])
    trend_stamps = np.array([
        np.datetime64(f'2021-08-18T17:59:0{second}')
        for second in (0, 1, 2, 3)
    ])
    healthy_labels = np.array([True, True, False, False])

    with h5py.File(path, 'w') as f:
        for label_name in ("is_bd_in_40ms", "is_bd_in_20ms", "is_bd"):
            f.create_dataset(label_name, data=dummy_features)
        f["Timestamp"] = event_stamps.astype(
            h5py.opaque_dtype(event_stamps.dtype))
        f["PrevTrendData/Timestamp"] = trend_stamps.astype(
            h5py.opaque_dtype(trend_stamps.dtype))
        f.create_dataset("clic_label/is_healthy", data=healthy_labels)

    # ACT
    np.random.seed(42)
    selection_filter_out = selector.select_events(path)

    # ASSERT
    matches = selection_filter_expected == selection_filter_out
    assert matches.all()

Ejemplo n.º 9

0

Mostrar archivo

 def convert_attrs(_: str, hdf_obj):
     """Visitor for hdf.File.visititems(): convert timestamp attributes of hdf_obj.

     Each attribute whose string form parses with the fixed timestamp format
     is deleted and re-created as an opaque numpy datetime; attributes for
     which pd.to_datetime raises ValueError are left as they are.
     """
     iso_format = "%Y-%m-%dT%H:%M:%S.%f"
     for attrs_key, raw in hdf_obj.attrs.items():
         try:
             parsed = pd.to_datetime(raw.astype(str), format=iso_format)
         except ValueError:
             # does not match the timestamp format: keep the attribute
             continue
         stamps = parsed.to_numpy(np.datetime64)
         del hdf_obj.attrs[attrs_key]
         opaque_stamps = np.array(stamps).astype(
             h5py.opaque_dtype(stamps.dtype))
         hdf_obj.attrs.create(name=attrs_key, data=opaque_stamps)

Ejemplo n.º 10

0

Mostrar archivo

def get_trend_data_features(length: int,
                            trend_data_file_path: Path) -> typing.Generator:
    """Yield one TrendDataFeature per top-level key of the trend data file.

    The file stays open in read mode while the generator is being consumed.
    :param length: number of values that will be calculated by each feature.
    :param trend_data_file_path: file path of the trend_data_file
    :return: generator of features"""
    with h5py.File(trend_data_file_path, "r") as file:
        for key in file.keys():
            selector = _select(trend_data_file_path, key)
            # only the "Timestamp" channel is a datetime, all others are float
            if key == "Timestamp":
                dtype = h5py.opaque_dtype("M8[us]")
            else:
                dtype = float
            feature = TrendDataFeature(name=key,
                                       func=selector,
                                       output_dtype=dtype,
                                       length=length,
                                       hdf_path="PrevTrendData",
                                       info=f"Previous Trend Data of {key}")
            yield feature

Ejemplo n.º 11

0

Mostrar archivo

Archivo: attribute.py Proyecto: cobermai/rfstudies

def get_event_attribute_features(length: int) -> typing.Generator:
    """Yield all EventAttributeFeatures for the xbox2 data set.

    The first feature is the event "Timestamp"; it is followed by one boolean
    feature for each of the labels "is_healthy", "is_bd_in_40ms",
    "is_bd_in_20ms" and "is_bd".
    :param length: number of values that will be calculated by each feature.
    :return: generator of features"""
    yield EventAttributeFeature(name="Timestamp",
                                func=_get_timestamp,
                                length=length,
                                hdf_path="/",
                                output_dtype=h5py.opaque_dtype('M8[us]'),
                                info="The timestamp of the EventData is a property of the event group. It is given in "
                                     "a datetime format with micro seconds precision.")

    for is_type in ["is_healthy", "is_bd_in_40ms", "is_bd_in_20ms", "is_bd"]:
        func = _log_type_creator(is_type)
        # Adjacent string literals are concatenated verbatim, so every
        # fragment but the last has to end with a space.
        yield EventAttributeFeature(name=is_type,
                                    func=func,
                                    length=length,
                                    hdf_path="/clic_label/",
                                    output_dtype=bool,
                                    info="These values originated from the Log_Type assigned by the CLIC-Team. "
                                         "Originally the Log_Type property had values in {0,1,2,3} where 0 stood for a "
                                         "healthy or normal log signal, and 3 for a breakdown. The label 1 and 2 stood "
                                         "for breakdown in 20ms and 40ms, so the signals prior to a breakdown.")

Ejemplo n.º 12

0

Mostrar archivo

def convert_iso8601_to_datetime(file_path: Path,
                                also_convert_attrs: bool = True) -> None:
    """Rewrite timestamp strings of an hdf file as opaque numpy datetimes, in place.

    Datasets (and, on request, attributes) whose string form parses with the
    fixed timestamp format are deleted and re-created with the converted
    values; anything for which pd.to_datetime raises ValueError is left as is.
    :param file_path: Path of the hdf file to convert.
    :param also_convert_attrs: boolean value to define if attrs datetime should be converted too."""
    iso_format = "%Y-%m-%dT%H:%M:%S.%f"

    def convert_attrs(_: str, hdf_obj):
        """Visitor for hdf.File.visititems(): convert timestamp attributes of hdf_obj."""
        for attrs_key, raw in hdf_obj.attrs.items():
            try:
                parsed = pd.to_datetime(raw.astype(str), format=iso_format)
            except ValueError:
                # does not match the timestamp format: keep the attribute
                continue
            stamps = parsed.to_numpy(np.datetime64)
            del hdf_obj.attrs[attrs_key]
            opaque_stamps = np.array(stamps).astype(
                h5py.opaque_dtype(stamps.dtype))
            hdf_obj.attrs.create(name=attrs_key, data=opaque_stamps)

    with h5py.File(file_path, mode="r+") as file:
        if also_convert_attrs:
            # visititems() is preceded by an explicit call on the file
            # object itself, so the root attributes are converted as well
            convert_attrs("/", file)
            file.visititems(convert_attrs)

        # Materialise the items first: datasets are deleted inside the loop.
        dataset_items = list(get_all_dataset_items(file))
        for key, channel in dataset_items:
            try:
                parsed = pd.to_datetime(channel[:].astype(str),
                                        format=iso_format)
            except ValueError:
                # does not match the timestamp format: keep the dataset
                continue
            stamps = parsed.to_numpy(np.datetime64)
            del file[key]
            opaque_stamps = stamps.astype(h5py.opaque_dtype(stamps.dtype))
            file.create_dataset(name=key, data=opaque_stamps)

Ejemplo n.º 13

0

Mostrar archivo

Archivo: test_XBOX2_event_all_bd_20ms.py Proyecto: cobermai/rfstudies

def test__load_dataset(tmpdir):
    """
    Check that load_dataset() with the XBOX2 event selector yields a 70/20/10 split.
    """
    # ARRANGE
    selector = XBOX2_event_all_bd_20ms.XBOX2EventAllBD20msSelect()
    path = tmpdir.join("context.hdf")

    # Ten identical event timestamps and ten trend timestamps that differ
    # only in their seconds.
    event_stamps = np.array([np.datetime64('2021-08-18T17:59:09')] * 10)
    trend_stamps = np.array([
        np.datetime64(f'2021-08-18T17:59:0{second}')
        for second in (3, 2, 2, 6, 0, 4, 2, 6, 0, 4)
    ])
    healthy_labels = np.ones((10, ), dtype=bool)
    selection_list = [
        "DC_Down__D1",
        "DC_Down__D9",
        "DC_Down__tsfresh__mean",
        "DC_Down__tsfresh__maximum",
        "DC_Down__tsfresh__median",
        "DC_Down__tsfresh__minimum",
        "DC_Up__D1",
        "DC_Up__D9",
        "DC_Up__tsfresh__mean",
        "DC_Up__tsfresh__maximum",
        "DC_Up__tsfresh__median",
        "DC_Up__tsfresh__minimum",
        "PEI_Amplitude__pulse_length",
        "PEI_Amplitude__pulse_amplitude",
        "PKI_Amplitude__pulse_length",
        "PKI_Amplitude__pulse_amplitude",
        "PSI_Amplitude__pulse_length",
        "PSI_Amplitude__pulse_amplitude",
    ]

    with h5py.File(path, 'w') as f:
        for label_name in ("is_bd_in_40ms", "is_bd_in_20ms", "is_bd"):
            f.create_dataset(label_name, data=np.ones((10, ), dtype=bool))
        for feature_name in selection_list:
            f.create_dataset(feature_name, data=np.ones((10, )))
        f["Timestamp"] = event_stamps.astype(
            h5py.opaque_dtype(event_stamps.dtype))
        f["PrevTrendData/Timestamp"] = trend_stamps.astype(
            h5py.opaque_dtype(trend_stamps.dtype))
        f.create_dataset("clic_label/is_healthy", data=healthy_labels)
        f.create_dataset("is_healthy", data=healthy_labels)

    splits_expected = (0.7, 0.2, 0.1)

    # ACT
    np.random.seed(42)
    train, valid, test = dataset_creator.load_dataset(creator=selector,
                                                      data_path=tmpdir)
    part_sizes = [len(part.idx) for part in (train, valid, test)]
    total = sum(part_sizes)
    splits = tuple(size / total for size in part_sizes)

    # ASSERT
    assert splits == splits_expected

Ejemplo n.º 14

0

Mostrar archivo

import h5py
import numpy as np

# A one-element array holding a single timestamp.
arr = np.array([np.datetime64('2019-09-22T17:38:30')])
# Tag the datetime dtype as opaque before handing the array to h5py.
opaque_arr = arr.astype(h5py.opaque_dtype(arr.dtype))

with h5py.File('datetimes.h5', 'w') as f:
    # Store the array as dataset 'data' ...
    f['data'] = opaque_arr

    # ... then read the whole dataset back and show it.
    stored = f['data'][:]
    print(stored)

Ejemplo n.º 15

0

Mostrar archivo

def test__load_dataset(tmpdir):
    """
    Check that load_dataset() with the XBOX2 trend selector honours the splits.

    A dummy context file with ten events is written into ``tmpdir``; the data
    set is then loaded with ``splits=(0.7, 0.2, 0.1)`` and the relative sizes
    of the train, validation and test parts must equal those fractions.
    :param tmpdir: pytest fixture providing a temporary directory
    """
    # ARRANGE
    selector = XBOX2_trend_all_bd_20ms.XBOX2TrendAllBD20msSelect()
    path = tmpdir.join("context.hdf")

    dummy_is_bd_in_40ms_labels = np.ones((10, ), dtype=bool)
    dummy_is_bd_in_20ms_labels = np.ones((10, ), dtype=bool)
    dummy_is_bd_labels = np.ones((10, ), dtype=bool)
    dummy_is_healthy_labels = np.ones((10, ), dtype=bool)

    # Ten identical event timestamps and ten trend timestamps that differ
    # only in their seconds.
    dummy_event_timestamps = np.array(
        [np.datetime64('2021-08-18T17:59:09')] * 10)
    dummy_trend_timestamps = np.array([
        np.datetime64(f'2021-08-18T17:59:0{second}')
        for second in (3, 2, 2, 6, 0, 4, 2, 6, 0, 4)
    ])

    selection_list = [
        "PrevTrendData__Loadside_win",
        "PrevTrendData__Tubeside_win",
        "PrevTrendData__Collector",
        "PrevTrendData__Gun",
        "PrevTrendData__IP_before_PC",
        "PrevTrendData__PC_IP",
        "PrevTrendData__WG_IP",
        "PrevTrendData__IP_Load",
        "PrevTrendData__IP_before_structure",
        "PrevTrendData__US_Beam_Axis_IP",
        "PrevTrendData__Klystron_Flange_Temp",
        "PrevTrendData__Load_Temp",
        "PrevTrendData__PC_Left_Cavity_Temp",
        "PrevTrendData__PC_Right_Cavity_Temp",
        "PrevTrendData__Bunker_WG_Temp",
        "PrevTrendData__Structure_Input_Temp",
        "PrevTrendData__Chiller_1",
        "PrevTrendData__Chiller_2",
        "PrevTrendData__Chiller_3",
        "PrevTrendData__PKI_FT_avg",
        "PrevTrendData__PSI_FT_avg",
        "PrevTrendData__PSR_FT_avg",
        "PrevTrendData__PSI_max",
        "PrevTrendData__PSR_max",
        "PrevTrendData__PEI_max",
        "PrevTrendData__DC_Down_min",
        "PrevTrendData__DC_Up_min",
        "PrevTrendData__PSI_Pulse_Width",
    ]

    with h5py.File(path, 'w') as f:
        f.create_dataset("is_bd_in_40ms", data=dummy_is_bd_in_40ms_labels)
        f.create_dataset("is_bd_in_20ms", data=dummy_is_bd_in_20ms_labels)
        f.create_dataset("is_bd", data=dummy_is_bd_labels)
        for name in selection_list:
            f.create_dataset(name, data=np.ones((10, )))
        f["Timestamp"] = dummy_event_timestamps.astype(
            h5py.opaque_dtype(dummy_event_timestamps.dtype))
        f["PrevTrendData/Timestamp"] = dummy_trend_timestamps.astype(
            h5py.opaque_dtype(dummy_trend_timestamps.dtype))
        f.create_dataset("clic_label/is_healthy", data=dummy_is_healthy_labels)
        f.create_dataset("is_healthy", data=dummy_is_healthy_labels)
        f.create_dataset("run_no", data=dummy_is_bd_labels)

    splits_expected = (0.7, 0.2, 0.1)

    # ACT
    np.random.seed(42)
    # The directory is handed over, not the path of the file inside it.
    train, valid, test = dataset_creator.load_dataset(creator=selector,
                                                      data_path=tmpdir,
                                                      splits=splits_expected)
    sum_elements = len(train.idx) + len(valid.idx) + len(test.idx)
    splits = (len(train.idx) / sum_elements, len(valid.idx) / sum_elements,
              len(test.idx) / sum_elements)

    # ASSERT
    assert splits == splits_expected

Ejemplo n.º 16

0

Mostrar archivo

Archivo: test_hdf5zarr.py Proyecto: d-sot/HDF5Zarr

    def _create_file(cls, name):
        """Create a randomly populated test hdf5 file and return its open handle.

        A named temporary file (prefix ``name``, suffix ".hdf5", not deleted
        on close) is created and stored in ``cls.temp_file``.  It is then
        filled with nested groups, soft links to groups, datasets of varying
        dtype and rank, soft links to datasets, and scalar as well as array
        attributes.

        :param name: prefix of the temporary file name
        :return: the h5py.File opened in 'w' mode; it is not closed here
        """

        srand = cls.srand

        # create hdf5 file
        cls.temp_file = tempfile.NamedTemporaryFile(suffix=".hdf5",
                                                    prefix=name,
                                                    delete=False)
        # only the file name is needed from here on; h5py reopens the file
        cls.temp_file.close()
        hfile = h5py.File(cls.temp_file.name, 'w')

        # create nested groups
        groupnames_prefix = [chr(65 + i) for i in range(cls.n_groups)
                             ]  # e.g. ['A', 'B', 'C']
        group_list = [hfile]  # list containing all groups

        def _create_groups(obj, d):
            # Creates one group per name prefix below obj, named
            # <prefix><cls.depth - d>, and recurses into it while d > 0.
            nonlocal group_list

            for c in groupnames_prefix:
                g_name = c + str(cls.depth - d)
                g = obj.create_group(g_name)
                group_list.append(g)
                if d > 0:
                    _create_groups(obj[g_name], d - 1)

        _create_groups(hfile, cls.depth)

        # create softlinks to groups
        for g in group_list:
            for i in range(cls.n_groupsoftlink):
                # do not use rand_rng.choice
                target_str = srand.choice(group_list).name
                g[f"SoftLg{i}"] = h5py.SoftLink(target_str)

        # create datasets
        # TODO: external dsets
        # TODO: compression
        srand.shuffle(cls.dset_dtypes)
        iter_dtypes = itertools.cycle(
            cls.dset_dtypes
        )  # shuffle dtypes to cycle over when creating dsets
        iter_chunks = itertools.cycle(
            [True, None])  # True or None cycle: auto chunking or no chunks
        iter_track_times = itertools.cycle(
            [False, True])  # True or False cycle for track_times
        iter_track_order = itertools.cycle(
            [False, False, True, True])  # True or False cycle for track_order
        iter_fillvalue = itertools.cycle(
            [None, True, True, None])  # None or True cycle: whether to set a fillvalue
        rand_rng = np.random.default_rng()
        dset_list = []
        for g in group_list:
            # TODO: add test with datasets with zero in dimensions
            for i in range(cls.n_dsets):
                shape = srand.choices(range(1, 90 // (i or 1)),
                                      k=i)  # dseti has i dimensions
                size = np.prod(shape)
                dtype = next(iter_dtypes)
                # random payload: booleans are derived from the sign of random
                # int64 values, datetimes from random uint64 offsets added to
                # the epoch, everything else straight from random bytes
                if dtype == np.bool_:
                    data = np.frombuffer(rand_rng.bytes(size * 8),
                                         dtype=np.int64) > 0
                elif dtype == np.datetime64:
                    data = np.datetime64(
                        '1970-01-01T00:00:00', 'ns') + np.frombuffer(
                            rand_rng.bytes(size * 8), dtype=np.uint64)
                    dtype = h5py.opaque_dtype(data.dtype)
                    data = data.astype(dtype)
                else:
                    data = np.frombuffer(rand_rng.bytes(
                        size * np.dtype(dtype).itemsize),
                                         dtype=dtype)

                # create_dataset options compatibility
                if len(shape) > 0:
                    chunks = next(iter_chunks)
                else:
                    # no chunking for datasets without dimensions
                    chunks = None
                    # compression = None
                    # compression_opts = None
                    # shuffle = None
                    # fletcher32 = None
                    # scaleoffset = None
                # no fillvalue when the cycle yields None or for datetime data
                # (dtype char 'M'); otherwise a randomly picked data element
                fillvalue = None if (
                    next(iter_fillvalue) is None or data.dtype.char
                    == 'M') else data.reshape(size)[rand_rng.integers(0, size)]

                # maxshape is only passed for chunked datasets: the shape is
                # enlarged by a random offset and scaled by a random factor
                # per dimension
                dset = g.create_dataset(
                    name='dset' + str(i),
                    shape=shape,
                    data=data,
                    dtype=dtype,
                    chunks=chunks,
                    maxshape=None if chunks is None else tuple(
                        (np.array(shape) + rand_rng.integers(0, 5)) *
                        rand_rng.integers(1, 5, size=len(shape))),
                    track_times=next(iter_track_times),
                    track_order=next(iter_track_order),
                    fillvalue=fillvalue)

                dset_list.append(dset)

        # create softlinks to datasets
        for g in group_list:
            for i in range(cls.n_dsetsoftlink):
                # do not use rand_rng.choice
                target_str = srand.choice(dset_list).name
                g[f"SoftLd{i}"] = h5py.SoftLink(target_str)

        # add attributes
        srand.shuffle(cls.dset_dtypes)
        iter_dtypes = itertools.cycle(
            cls.dset_dtypes
        )  # shuffle dtypes to cycle over when creating attributes
        for obj in itertools.chain(group_list, dset_list):
            # scalar attributes, named 'a', 'b', ...
            for i in range(
                    rand_rng.integers(cls.n_attributes_min, 26,
                                      endpoint=True)):
                dtype = next(iter_dtypes)
                attr_name = chr(97 + i)
                if dtype == np.bool_:
                    attr = np.frombuffer(rand_rng.bytes(8), dtype=np.int64) > 0
                elif dtype == np.datetime64:
                    # no scalar datetime attributes are written
                    continue
                else:
                    attr = np.frombuffer(rand_rng.bytes(
                        np.dtype(dtype).itemsize),
                                         dtype=dtype)
                obj.attrs[attr_name] = attr[0]

            # add array attributes, named 'A_array_attr', 'B_array_attr', ...
            for i in range(
                    rand_rng.integers(cls.n_attributes_min, 26,
                                      endpoint=True)):
                shape = srand.choices(range(1, 10 // (i // 5 or 1)), k=i //
                                      5)  # attributes has i//5 dimensions
                size = np.prod(shape)
                dtype = next(iter_dtypes)
                attr_name = chr(65 + i) + '_array_attr'
                if dtype == np.bool_:
                    attr = np.frombuffer(rand_rng.bytes(size * 8),
                                         dtype=np.int64) > 0
                elif dtype == np.datetime64:
                    attr = np.datetime64(
                        '1970-01-01T00:00:00', 'ns') + np.frombuffer(
                            rand_rng.bytes(size * 8), dtype=np.uint64)
                    attr = attr.astype(h5py.opaque_dtype(attr.dtype))
                else:
                    attr = np.frombuffer(rand_rng.bytes(
                        size * np.dtype(dtype).itemsize),
                                         dtype=dtype)
                obj.attrs[attr_name] = attr

        return hfile