def make_data(kind):
    global data
    global dt
    if kind is bytes:
        s = b"xx"
    else:
        s = b"xx".decode('utf8')
    dt = h5py.vlen_dtype(kind)
    data = np.array([s * 100 for _ in range(1000)])
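# A minimal round-trip sketch (not part of the original snippet) showing how the
# `data` and `dt` globals produced by make_data might be written out and read back.
# The file name "vlen_bench.h5" and dataset name "strings" are assumptions.
import h5py
import numpy as np

def roundtrip_make_data(kind=bytes):
    make_data(kind)
    with h5py.File("vlen_bench.h5", "w") as f:
        f.create_dataset("strings", data=data, dtype=dt)
    with h5py.File("vlen_bench.h5", "r") as f:
        # Elements come back as bytes or str depending on `kind`
        return f["strings"][0]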
def test_vlen_enum(self):
    fname = self.mktemp()
    arr1 = [[1], [1, 2]]
    dt1 = h5py.vlen_dtype(h5py.enum_dtype(dict(foo=1, bar=2), 'i'))

    with h5py.File(fname, 'w') as f:
        df1 = f.create_dataset('test', (len(arr1),), dtype=dt1)
        df1[:] = np.array(arr1, dtype=object)

    with h5py.File(fname, 'r') as f:
        df2 = f['test']
        dt2 = df2.dtype
        arr2 = [e.tolist() for e in df2[:]]

    self.assertEqual(arr1, arr2)
    self.assertEqual(h5py.check_enum_dtype(h5py.check_vlen_dtype(dt1)),
                     h5py.check_enum_dtype(h5py.check_vlen_dtype(dt2)))
def test_convert(self):
    dt = h5py.vlen_dtype(int)
    ds = self.f.create_dataset('vlen', (3,), dtype=dt)
    ds[0] = np.array([1.4, 1.2])
    ds[1] = np.array([1.2])
    ds[2] = [1.2, 2, 3]
    self.assertArrayEqual(ds[0], np.array([1, 1]))
    self.assertArrayEqual(ds[1], np.array([1]))
    self.assertArrayEqual(ds[2], np.array([1, 2, 3]))
    ds[0:2] = np.array([[0.1, 1.1, 2.1, 3.1, 4], np.arange(4)], dtype=object)
    self.assertArrayEqual(ds[0], np.arange(5))
    self.assertArrayEqual(ds[1], np.arange(4))
    ds[0:2] = np.array([np.array([0.1, 1.2, 2.2]),
                        np.array([0.2, 1.2, 2.2])])
    self.assertArrayEqual(ds[0], np.arange(3))
    self.assertArrayEqual(ds[1], np.arange(3))
def test_int(self):
    dt = h5py.vlen_dtype(int)
    ds = self.f.create_dataset('vlen', (4,), dtype=dt)
    ds[0] = np.arange(3)
    ds[1] = np.arange(0)
    ds[2] = [1, 2, 3]
    ds[3] = np.arange(1)
    self.assertArrayEqual(ds[0], np.arange(3))
    self.assertArrayEqual(ds[1], np.arange(0))
    self.assertArrayEqual(ds[2], np.array([1, 2, 3]))
    self.assertArrayEqual(ds[1], np.arange(0))
    ds[0:2] = np.array([np.arange(5), np.arange(4)], dtype=object)
    self.assertArrayEqual(ds[0], np.arange(5))
    self.assertArrayEqual(ds[1], np.arange(4))
    ds[0:2] = np.array([np.arange(3), np.arange(3)])
    self.assertArrayEqual(ds[0], np.arange(3))
    self.assertArrayEqual(ds[1], np.arange(3))
def test_compound_vlen_bool(self):
    vidt = h5py.vlen_dtype(np.uint8)

    def a(items):
        return np.array(items, dtype=np.uint8)

    f = self.f

    dt_vb = np.dtype([
        ('foo', vidt),
        ('logical', bool)])
    vb = f.create_dataset('dt_vb', shape=(4,), dtype=dt_vb)
    data = np.array([(a([1, 2, 3]), True),
                     (a([1]), False),
                     (a([1, 5]), True),
                     (a([]), False)], dtype=dt_vb)
    vb[:] = data
    actual = f['dt_vb'][:]
    self.assertVlenArrayEqual(data['foo'], actual['foo'])
    self.assertArrayEqual(data['logical'], actual['logical'])

    dt_vv = np.dtype([
        ('foo', vidt),
        ('bar', vidt)])
    f.create_dataset('dt_vv', shape=(4,), dtype=dt_vv)

    dt_vvb = np.dtype([
        ('foo', vidt),
        ('bar', vidt),
        ('logical', bool)])
    vvb = f.create_dataset('dt_vvb', shape=(2,), dtype=dt_vvb)

    dt_bvv = np.dtype([
        ('logical', bool),
        ('foo', vidt),
        ('bar', vidt)])
    bvv = f.create_dataset('dt_bvv', shape=(2,), dtype=dt_bvv)
    data = np.array([(True, a([1, 2, 3]), a([1, 2])),
                     (False, a([]), a([2, 4, 6]))], dtype=dt_bvv)
    bvv[:] = data
    actual = bvv[:]
    self.assertVlenArrayEqual(data['foo'], actual['foo'])
    self.assertVlenArrayEqual(data['bar'], actual['bar'])
    self.assertArrayEqual(data['logical'], actual['logical'])
def add_brep_from_string(self, name, shape_data):
    """
    Add a brep contained in a string.
    """
    if name not in self._ref:
        shape = self._ref.create_dataset(name, (1,),
                                         dtype=h5py.vlen_dtype(str))
        if type(shape_data) == str:
            # raw str
            shape[:] = shape_data
        else:
            # __getstate__ as with pythonocc
            shape[:] = shape_data[0]
            shape.attrs['occ_indx'] = shape_data[1]

        shape.attrs['id'] = self._number_of_shapes
        shape.attrs['type'] = 'brep'
        self._number_of_shapes += 1
def export_set(output_dir, name, data, labels, classes):
    """Store paired data and labels in a new HDF5 file under ``output_dir``."""
    assert len(data) == len(labels)

    # Variable-length datatypes for encoded png streams and label names
    dt_int = h5py.vlen_dtype(np.dtype('uint8'))
    dt_str = h5py.string_dtype(encoding='utf-8')

    # Initialize hdf5 file pointer
    f = h5py.File(f"{output_dir}/{name}_{len(data)}.h5", "w")

    # Create group and store data/labels
    x = f.create_dataset("data", (len(data),), dtype=dt_int, data=data)
    y = f.create_dataset("label", data=np.array(labels, dtype=int))

    # Store <mapping from (0, 1 ...) to class names> as group attribute
    y.attrs.create("class_names", data=np.array(classes, dtype=dt_str))

    f.close()
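# A hedged companion sketch (not from the original source): reading back a set
# written by export_set and decoding one PNG stream. The file name below and the
# use of OpenCV for decoding are illustrative assumptions.
import cv2  # assumption: OpenCV is available for PNG decoding
import h5py
import numpy as np

with h5py.File("out/train_1000.h5", "r") as f:
    png_bytes = f["data"][0]                      # variable-length uint8 stream
    label = int(f["label"][0])
    class_names = list(f["label"].attrs["class_names"])
    image = cv2.imdecode(np.asarray(png_bytes, dtype=np.uint8), cv2.IMREAD_COLOR)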
def write_contact_map_h5(h5_file, rows, cols):

    # Helper function to create ragged array
    def ragged(data):
        a = np.empty(len(data), dtype=object)
        a[...] = data
        return a

    # Specify variable length arrays
    dt = h5py.vlen_dtype(np.dtype("int16"))

    # list of np arrays of shape (2 * X) where X varies
    data = ragged([np.concatenate(row_col) for row_col in zip(rows, cols)])
    h5_file.create_dataset(
        "contact_map",
        data=data,
        dtype=dt,
        fletcher32=True,
        chunks=(1,) + data.shape[1:],
    )
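# A minimal reading sketch to mirror write_contact_map_h5 (an assumption, not part
# of the original code): each stored entry is the concatenation [rows_i, cols_i] of
# equal-length halves, so it can be split down the middle on the way back out.
import h5py
import numpy as np

def read_contact_map_h5(path):
    with h5py.File(path, "r") as f:
        rows, cols = [], []
        for entry in f["contact_map"][:]:
            half = len(entry) // 2
            rows.append(np.asarray(entry[:half], dtype=np.int16))
            cols.append(np.asarray(entry[half:], dtype=np.int16))
    return rows, cols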
def test_compound_vlen_enum(self):
    eidt = h5py.enum_dtype({'OFF': 0, 'ON': 1}, basetype=np.uint8)
    vidt = h5py.vlen_dtype(np.uint8)

    def a(items):
        return np.array(items, dtype=np.uint8)

    f = self.f

    dt_vve = np.dtype([
        ('foo', vidt),
        ('bar', vidt),
        ('switch', eidt)])
    vve = f.create_dataset('dt_vve', shape=(2,), dtype=dt_vve)
    data = np.array([(a([1, 2, 3]), a([1, 2]), 1),
                     (a([]), a([2, 4, 6]), 0)], dtype=dt_vve)
    vve[:] = data
    actual = vve[:]
    self.assertVlenArrayEqual(data['foo'], actual['foo'])
    self.assertVlenArrayEqual(data['bar'], actual['bar'])
    self.assertArrayEqual(data['switch'], actual['switch'])
def to_hdf5(self, parent: h5py.Group) -> None:
    """Stores this instance in an HDF5 group inside of the provided parent group.

    See also :func:`~qiskit_nature.hdf5.HDF5Storable.to_hdf5` for more details.

    Args:
        parent: the parent HDF5 group.
    """
    group = parent.require_group(self.name)
    group.attrs["__class__"] = self.__class__.__name__
    group.attrs["__module__"] = self.__class__.__module__
    group.attrs["__version__"] = self.VERSION

    group.attrs["num_body_terms"] = self._num_body_terms

    dtype = h5py.vlen_dtype(np.dtype("int32"))

    integrals_dset = group.create_dataset("integrals", (len(self.integrals),), dtype=dtype)
    coeffs_dset = group.create_dataset("coefficients", (len(self.integrals),), dtype=float)

    for idx, ints in enumerate(self.integrals):
        coeffs_dset[idx] = ints[0]
        integrals_dset[idx] = list(ints[1])
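# A hedged sketch (not the library's actual from_hdf5) of how the two datasets
# written above could be read back into (coefficient, index-tuple) pairs; `group`
# is assumed to be the same HDF5 group that to_hdf5 populated.
def read_integrals(group):
    coeffs = group["coefficients"][:]
    indices = group["integrals"][:]
    return [(float(c), tuple(int(i) for i in idx)) for c, idx in zip(coeffs, indices)]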
def __init__(self, root_group: h5py.Group, total_length: int, feature_dim: int):
    self.root_group = root_group

    LOG.debug(f"Creating dataset '{_FEATURE_GROUP}'")
    self._feature_dataset = self.root_group.create_dataset(
        _FEATURE_GROUP,
        (total_length,),
        dtype=h5py.vlen_dtype(np.dtype("float32")),
    )
    self._feature_dataset.attrs["feature_dim"] = feature_dim

    LOG.debug(f"Creating group '{_LABEL_GROUP}'")
    self._label_group = self.root_group.create_group(_LABEL_GROUP)
    self._label_datasets = dict()

    LOG.debug(f"Creating dataset '{_UID_GROUP}'")
    self._uid_dataset = self.root_group.create_dataset(
        _UID_GROUP, (total_length,), dtype=string_dtype)

    self.total_length = total_length
    self.feature_dim = feature_dim
    self._current_index = 0
def run(self) -> None:
    if self.cache_exists() and not self.force_update:
        self.logger.info(
            "Cached version of tokenized data already exists. " +
            "Skipping tokenization.")
        return None

    with h5py.File(self.hdf5_path, "a") as hdf5_store:
        for hdf5_group_name in self.raw_data_group_names.values():
            hdf5_group = hdf5_store.get(hdf5_group_name)
            captions = numpy.array(hdf5_group["caption_cleaned"])
            captions_tokenized = []
            captions_tokenized_id = []
            for caption in tqdm(captions):
                caption_tokenized = (
                    self.tokenizer.encode_with_bos_eos(caption))
                caption_tokenized_id = (
                    self.tokenizer.encode_ids_with_bos_eos(caption))
                captions_tokenized.append(caption_tokenized)
                captions_tokenized_id.append(caption_tokenized_id)

            if "caption_cleaned_tokenized" in hdf5_group.keys():
                del hdf5_group["caption_cleaned_tokenized"]
            if "caption_cleaned_tokenized_id" in hdf5_group.keys():
                del hdf5_group["caption_cleaned_tokenized_id"]

            hdf5_group.create_dataset(
                "caption_cleaned_tokenized",
                data=numpy.array(
                    captions_tokenized,
                    dtype=h5py.string_dtype(encoding="utf-8")))
            token_id_dataset = hdf5_group.create_dataset(
                "caption_cleaned_tokenized_id",
                shape=(len(captions_tokenized_id),),
                dtype=h5py.vlen_dtype(numpy.dtype("int32")))
            token_id_dataset[...] = captions_tokenized_id
def test_compound_vlen(self):
    vidt = h5py.vlen_dtype(np.uint8)
    eidt = h5py.enum_dtype({'OFF': 0, 'ON': 1}, basetype=np.uint8)

    for np_align in (False, True):
        dt = np.dtype([
            ('a', eidt),
            ('foo', vidt),
            ('bar', vidt),
            ('switch', eidt)], align=np_align)
        np_offsets = [dt.fields[i][1] for i in dt.names]

        for logical in (False, True):
            if logical and np_align:
                # Vlen types have different size in the numpy struct
                self.assertRaises(TypeError, h5py.h5t.py_create, dt,
                                  logical=logical)
            else:
                ht = h5py.h5t.py_create(dt, logical=logical)
                offsets = [ht.get_member_offset(i)
                           for i in range(ht.get_nmembers())]
                if np_align:
                    self.assertEqual(np_offsets, offsets)
def _extract_features_mp(timestamps: Sequence[Sequence[float]],
                         sizes: Sequence[Sequence[float]],
                         max_size: int = DEFAULT_NUM_FEATURES,
                         n_jobs: Optional[int] = None) -> np.ndarray:
    features = np.zeros((len(sizes), max_size), float)

    # Serialise the timestamps and sizes to file
    with tempfile.TemporaryDirectory(prefix="kfp-extract-") as directory:
        with h5py.File(f"{directory}/data.hdf", mode="w") as h5file:
            dtype = h5py.vlen_dtype(np.dtype("float"))
            h5file.create_dataset("sizes", data=sizes, dtype=dtype)
            h5file.create_dataset("timestamps", data=timestamps, dtype=dtype)

        offset = 0
        # Use our own splits as imap chunking would yield them one at a time
        chunksize = 5000
        n_chunks = max(len(sizes) // chunksize, 1)
        splits = np.array_split(np.arange(len(sizes)), n_chunks)
        assert n_chunks == len(splits)

        _LOGGER.info("Extracting features in %d batches...", n_chunks)
        with multiprocessing.Pool(n_jobs) as pool:
            # Pass the directory and indices to the background processes
            for i, batch in enumerate(
                    pool.imap(functools.partial(_run_extraction,
                                                directory=directory,
                                                max_size=max_size),
                              splits, chunksize=1)):
                # Recombine the extracted batches in order
                features[offset:offset + len(batch), :] = batch
                offset += len(batch)

                _LOGGER.info("Extraction is %.2f%% complete.",
                             ((i + 1) * 100 / n_chunks))
    return features
def save_commute_hubs_to_hdf5(commute_hubs: CommuteHubs, file_path: str):
    n_hubs = len(commute_hubs)
    dt = h5py.vlen_dtype(np.dtype("int32"))
    with h5py.File(file_path, "a") as f:
        commute_hubs_dset = f.create_group("commute_hubs")
        ids = []
        cities = []
        commute_units_list = []
        for hub in commute_hubs:
            ids.append(hub.id)
            cities.append(hub.city)
            commute_units = []
            for commute_unit in hub.commuteunits:
                commute_units.append(commute_unit.id)
            commute_units_list.append(np.array(commute_units, dtype=np.int64))
        ids = np.array(ids, dtype=np.int64)
        cities = np.array(cities, dtype="S20")
        commute_units_list = np.array(commute_units_list, dtype=dt)
        commute_hubs_dset.attrs["n_commute_hubs"] = n_hubs
        commute_hubs_dset.create_dataset("id", data=ids)
        commute_hubs_dset.create_dataset("city_names", data=cities)
        commute_hubs_dset.create_dataset("commute_units", data=commute_units_list)
def test_reuse_from_other(self):
    dt = h5py.vlen_dtype(int)
    ds = self.f.create_dataset('vlen', (1,), dtype=dt)
    self.f.create_dataset('vlen2', (1,), ds[()].dtype)
import h5py
import numpy as np

from june.groups import Schools, School
from june.world import World

from .utils import read_dataset

nan_integer = -999
int_vlen_type = h5py.vlen_dtype(np.dtype("int64"))


def save_schools_to_hdf5(schools: Schools, file_path: str, chunk_size: int = 50000):
    """
    Saves the schools object to hdf5 format file ``file_path``. Currently for each school,
    the following values are stored:
    - id, n_pupils_max, age_min, age_max, sector, coordinates

    Parameters
    ----------
    schools
        schools object
    file_path
        path of the saved hdf5 file
    chunk_size
        number of schools to save at a time. Note that they have to be copied to be saved,
        so keep the number below 1e6.
    """
    n_schools = len(schools)
def save_hospitals_to_hdf5(hospitals: Hospitals, file_path: str, chunk_size: int = 50000):
    """
    Saves the Hospitals object to hdf5 format file ``file_path``. Currently for each hospital,
    the following values are stored:
    - id, n_beds, n_icu_beds, super_area, coordinates

    Parameters
    ----------
    hospitals
        hospitals object
    file_path
        path of the saved hdf5 file
    chunk_size
        number of hospitals to save at a time. Note that they have to be copied to be saved,
        so keep the number below 1e6.
    """
    n_hospitals = len(hospitals)
    n_chunks = int(np.ceil(n_hospitals / chunk_size))
    vlen_type = h5py.vlen_dtype(np.dtype("float64"))
    with h5py.File(file_path, "a") as f:
        hospitals_dset = f.create_group("hospitals")
        for chunk in range(n_chunks):
            idx1 = chunk * chunk_size
            idx2 = min((chunk + 1) * chunk_size, n_hospitals)
            ids = []
            n_beds = []
            n_icu_beds = []
            super_areas = []
            coordinates = []
            trust_code = []
            for hospital in hospitals[idx1:idx2]:
                ids.append(hospital.id)
                if hospital.super_area is None:
                    super_areas.append(nan_integer)
                else:
                    super_areas.append(hospital.super_area)
                n_beds.append(hospital.n_beds)
                n_icu_beds.append(hospital.n_icu_beds)
                coordinates.append(np.array(hospital.coordinates))
                trust_code.append(hospital.trust_code)
            ids = np.array(ids, dtype=np.int64)
            super_areas = np.array(super_areas, dtype="S20")
            trust_code = np.array(trust_code, dtype="S10")
            n_beds = np.array(n_beds, dtype=np.int64)
            n_icu_beds = np.array(n_icu_beds, dtype=np.int64)
            coordinates = np.array(coordinates, dtype=np.float64)
            if chunk == 0:
                hospitals_dset.attrs["n_hospitals"] = n_hospitals
                hospitals_dset.create_dataset("id", data=ids, maxshape=(None,))
                hospitals_dset.create_dataset("super_area", data=super_areas, maxshape=(None,))
                hospitals_dset.create_dataset("trust_code", data=trust_code, maxshape=(None,))
                hospitals_dset.create_dataset("n_beds", data=n_beds, maxshape=(None,))
                hospitals_dset.create_dataset("n_icu_beds", data=n_icu_beds, maxshape=(None,))
                hospitals_dset.create_dataset(
                    "coordinates", data=coordinates, maxshape=(None, coordinates.shape[1]))
            else:
                newshape = (hospitals_dset["id"].shape[0] + ids.shape[0],)
                hospitals_dset["id"].resize(newshape)
                hospitals_dset["id"][idx1:idx2] = ids
                hospitals_dset["super_area"].resize(newshape)
                hospitals_dset["super_area"][idx1:idx2] = super_areas
                hospitals_dset["trust_code"].resize(newshape)
                hospitals_dset["trust_code"][idx1:idx2] = trust_code
                hospitals_dset["n_beds"].resize(newshape)
                hospitals_dset["n_beds"][idx1:idx2] = n_beds
                hospitals_dset["n_icu_beds"].resize(newshape)
                hospitals_dset["n_icu_beds"][idx1:idx2] = n_icu_beds
                hospitals_dset["coordinates"].resize(newshape[0], axis=0)
                hospitals_dset["coordinates"][idx1:idx2] = coordinates
import h5py
import numpy as np
from collections import defaultdict

from june.groups import ExternalGroup, ExternalSubgroup
from june.geography import Geography, Area, SuperArea, Areas, SuperAreas, Region, Regions
from june.world import World

from .utils import read_dataset

nan_integer = -999

int_vlen_type = h5py.vlen_dtype(np.dtype("int64"))
str_vlen_type = h5py.vlen_dtype(np.dtype("S40"))

social_venues_spec_mapper = {
    "pubs": "pubs",
    "household_visits": "households",
    "care_home_visits": "care_homes",
    "cinemas": "cinemas",
    "groceries": "groceries",
}

super_group_to_group_mapper = {
    "pubs": "pub",
    "groceries": "grocery",
    "cinemas": "cinema",
}


def save_geography_to_hdf5(geography: Geography, file_path: str):
    """
    Saves the geography object to hdf5 format file ``file_path``. Currently for each person,
def write(self, ds: Dataset): import h5py # For data checksums dataset_kwargs = {"chunks": True, "fletcher32": True} n_records = len(ds.data.records) default_shape = (n_records, ) if h5py.__version__ >= distutils.version.StrictVersion("2.10.0"): vlen_double_t = h5py.vlen_dtype(np.dtype("float64")) utf8_t = h5py.string_dtype(encoding="utf-8") bytes_t = h5py.vlen_dtype(np.dtype("uint8")) vlen_utf8_t = h5py.vlen_dtype(utf8_t) else: vlen_double_t = h5py.special_dtype(vlen=np.dtype("float64")) utf8_t = h5py.special_dtype(vlen=str) bytes_t = h5py.special_dtype(vlen=np.dtype("uint8")) vlen_utf8_t = h5py.special_dtype(vlen=utf8_t) driver_dataspec = { "energy": { "dtype": np.dtype("float64"), "shape": default_shape }, "gradient": { "dtype": vlen_double_t, "shape": default_shape }, "hessian": { "dtype": vlen_double_t, "shape": default_shape }, "dipole": { "dtype": np.dtype("float64"), "shape": (n_records, 3) } } def _write_dataset(dataset, column, entry_dset): assert column.shape[1] == 1 for i, name in enumerate(entry_dset): element = column.loc[name][0] if not h5py.check_dtype(vlen=dataset.dtype): dataset[i] = element # Variable length datatypes require flattening of the array and special handling of missing values else: try: dataset[i] = element.ravel() except AttributeError: if np.isnan(element): pass else: raise with self._write_file() as f: # Collection attributes for field in { "name", "collection", "provenance", "tagline", "tags", "id", "history_keys" }: f.attrs[field] = self._serialize_field(getattr(ds.data, field)) if ds.client is not None: f.attrs["server_information"] = self._serialize_field( ds.client.server_information()) f.attrs["server_address"] = self._serialize_field( ds.client.address) # Export molecules molecule_group = f.create_group("molecule") if "stoichiometry" in ds.data.history_keys: molecules = ds.get_molecules(stoich=list(ds.valid_stoich), force=True) else: molecules = ds.get_molecules(force=True) mol_shape = (len(molecules), ) mol_geometry = molecule_group.create_dataset("geometry", shape=mol_shape, dtype=vlen_double_t, **dataset_kwargs) mol_symbols = molecule_group.create_dataset("symbols", shape=mol_shape, dtype=vlen_utf8_t, **dataset_kwargs) mol_schema = molecule_group.create_dataset("schema", shape=mol_shape, dtype=bytes_t, **dataset_kwargs) mol_charge = molecule_group.create_dataset( "charge", shape=mol_shape, dtype=np.dtype('float64'), **dataset_kwargs) mol_spin = molecule_group.create_dataset("multiplicity", shape=mol_shape, dtype=np.dtype('int32'), **dataset_kwargs) mol_id_server_view = {} for i, mol_row in enumerate(molecules.to_dict("records")): molecule = mol_row["molecule"] mol_geometry[i] = molecule.geometry.ravel() mol_schema[i] = self._serialize_data(molecule) mol_symbols[i] = molecule.symbols mol_charge[i] = molecule.molecular_charge mol_spin[i] = molecule.molecular_multiplicity mol_id_server_view[molecule.id] = i # Export entries entry_group = f.create_group("entry") entry_dset = entry_group.create_dataset("entry", shape=default_shape, dtype=utf8_t, **dataset_kwargs) entry_dset[:] = ds.get_index() entries = ds.get_entries(force=True) if isinstance(ds.data.records[0], MoleculeEntry): entry_group.attrs["model"] = "MoleculeEntry" entries["hdf5_molecule_id"] = entries["molecule_id"].map( mol_id_server_view) entry_group.create_dataset("name", data=entries["name"], dtype=utf8_t, **dataset_kwargs) entry_group.create_dataset("molecule_id", data=entries["hdf5_molecule_id"], dtype=np.dtype("int64"), **dataset_kwargs) elif isinstance(ds.data.records[0], 
ReactionEntry): entry_group.attrs["model"] = "ReactionEntry" entries["hdf5_molecule_id"] = entries["molecule"].map( mol_id_server_view) entry_group.create_dataset("name", data=entries["name"], dtype=utf8_t, **dataset_kwargs) entry_group.create_dataset("stoichiometry", data=entries["stoichiometry"], dtype=utf8_t, **dataset_kwargs) entry_group.create_dataset("molecule", data=entries["hdf5_molecule_id"], dtype=np.dtype("int64"), **dataset_kwargs) entry_group.create_dataset("coefficient", data=entries["coefficient"], dtype=np.dtype("float64"), **dataset_kwargs) else: raise ValueError( f"Unknown entry class ({type(ds.data.records[0])}) while writing HDF5 entries." ) # Export native data columns value_group = f.create_group("value") history = ds.list_values( native=True, force=True).reset_index().to_dict("records") for specification in history: gv_spec = specification.copy() name = gv_spec.pop("name") if "stoichiometry" in gv_spec: gv_spec["stoich"] = gv_spec.pop("stoichiometry") dataset_name = self._normalize_hdf5_name(name) df = ds.get_values(**gv_spec, force=True) assert df.shape[1] == 1 driver = specification["driver"] dataspec = driver_dataspec[driver] dataset = value_group.create_dataset(dataset_name, **dataspec, **dataset_kwargs) for key in specification: dataset.attrs[key] = self._serialize_field( specification[key]) dataset.attrs["units"] = self._serialize_field(ds.units) _write_dataset(dataset, df, entry_dset) # Export contributed data columns contributed_group = f.create_group("contributed_value") for cv_name in ds.list_values(force=True, native=False)["name"]: cv_df = ds.get_values(name=cv_name, force=True, native=False) cv_model = ds.data.contributed_values[cv_name.lower()] try: dataspec = driver_dataspec[ cv_model.theory_level_details["driver"]] except (KeyError, TypeError): warnings.warn( f"Contributed values column {cv_name} does not provide driver in theory_level_details. " f"Assuming default driver for the dataset ({ds.data.default_driver})." ) dataspec = driver_dataspec[ds.data.default_driver] dataset = contributed_group.create_dataset( self._normalize_hdf5_name(cv_name), **dataspec, **dataset_kwargs) for field in { "name", "theory_level", "units", "doi", "comments", "theory_level", "theory_level_details" }: dataset.attrs[field] = self._serialize_field( getattr(cv_model, field)) _write_dataset(dataset, cv_df, entry_dset) # Clean up any caches self._entries = None
def log_population( self, population: Population, chunk_size: int = 100000, ): """ Saves the Population object to hdf5 format file ``self.save_path``. Currently for each person, the following values are stored: - id, age, sex, super_area Parameters ---------- population: population object chunk_size: number of people to save at a time. Note that they have to be copied to be saved, so keep the number below 1e6. """ n_people = len(population.people) dt = h5py.vlen_dtype(np.dtype("int32")) # dt = tuple n_chunks = int(np.ceil(n_people / chunk_size)) with h5py.File(self.file_path, "a", libver="latest") as f: people_dset = f.create_group("population") people_dset.attrs["n_people"] = n_people for chunk in range(n_chunks): idx1 = chunk * chunk_size idx2 = min((chunk + 1) * chunk_size, n_people) ids = [] ages = [] sexes = [] ethnicities = [] socioeconomic_indcs = [] super_areas = [] for person in population.people[idx1:idx2]: ids.append(person.id) ages.append(person.age) ethnicities.append( person.ethnicity.encode("ascii", "ignore")) socioeconomic_indcs.append(person.socioecon_index) sexes.append(person.sex.encode("ascii", "ignore")) super_areas.append(person.area.super_area.name) ids = np.array(ids, dtype=np.int) ages = np.array(ages, dtype=np.int16) sexes = np.array(sexes, dtype="S10") super_areas = np.array(super_areas, dtype="S10") ethnicities = np.array(ethnicities, dtype="S10") socioeconomic_indcs = np.array(socioeconomic_indcs, dtype=np.int8) if chunk == 0: people_dset.create_dataset("id", data=ids, maxshape=(None, ), compression="gzip") people_dset.create_dataset("age", data=ages, maxshape=(None, ), compression="gzip") people_dset.create_dataset("sex", data=sexes, maxshape=(None, ), compression="gzip") people_dset.create_dataset( "ethnicity", data=ethnicities, maxshape=(None, ), compression="gzip", ) people_dset.create_dataset( "socioeconomic_index", data=socioeconomic_indcs, maxshape=(None, ), compression="gzip", ) people_dset.create_dataset( "super_area", data=super_areas, maxshape=(None, ), compression="gzip", ) else: newshape = (people_dset["id"].shape[0] + ids.shape[0], ) people_dset["id"].resize(newshape) people_dset["id"][idx1:idx2] = ids people_dset["age"].resize(newshape) people_dset["age"][idx1:idx2] = ages people_dset["sex"].resize(newshape) people_dset["sex"][idx1:idx2] = sexes people_dset["super_area"].resize(newshape) people_dset["super_area"][idx1:idx2] = super_areas people_dset["ethnicity"].resize(newshape) people_dset["ethnicity"][idx1:idx2] = ethnicities people_dset["socioeconomic_index"].resize(newshape) people_dset["socioeconomic_index"][ idx1:idx2] = socioeconomic_indcs
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 22 10:35:01 2019

@author: Vedran Furtula
"""

import h5py, random
import numpy

run_test = 1

if run_test == 0:
    dt_ = h5py.vlen_dtype(numpy.dtype('float32'))
    with h5py.File('resize_dataset.hdf5', 'w') as f:
        d1 = f.create_dataset('dataset1', (0,), maxshape=(None,), dtype=dt_)
        d2 = f.create_dataset('dataset2', (0,), maxshape=(None,))
        #d1[:10] = np.random.randn(10)
        #d2[:5] = np.random.randn(5)
        #d.resize((200,))
        #d[100:200] = np.random.randn(100)

    with h5py.File('resize_dataset.hdf5', 'r') as f:
        dset = f['dataset1']
        print("dset: ", dset[:])

    for tal in range(10):
        with h5py.File('resize_dataset.hdf5', 'a') as f:
def _fixmatlabstruct(fp): # noqa: C901 """Verify MATLAB structs: It cannot load mixed non-scalar structs""" groups = [] def collectgroups(name, obj): """Callback function to collect all suitable struct groups""" if (isinstance(obj, h5py._hl.group.Group) and name != '#refs#' and obj.attrs.get('MATLAB_class', None) != b'struct'): groups.append(obj) def dynamiciterator(): """Dynamically reassessing groups iterator""" while True: fp.visititems(collectgroups) if groups: yield groups[-1] # Start with last else: return # Iterate over all groups to make them MATLAB compatible structs for group in dynamiciterator(): groups = [] # Reset groups for iterator group.attrs['MATLAB_class'] = np.bytes_('struct') # Create struct fields fieldnames = np.empty(len(group.keys()), dtype=h5py.vlen_dtype(np.dtype('|S1'))) fieldnames[:] = [np.fromiter(f, '|S1') for f in group.keys()] group.attrs['MATLAB_fields'] = fieldnames # Recurse into groups to obtain shape (visititems not suitable) def groupshape(obj): """Determine common shape""" if isinstance(obj, h5py._hl.group.Group): # Collect shapes from children dims = [groupshape(chld) for chld in obj.values()] # Obtain first n common dimensions commondim = () for d in zip(*dims): if len(set(d)) != 1: break commondim += (d[0], ) # Pass upward return commondim else: if 'MATLAB_empty' in obj.attrs: return (-np.random.randint(100), ) # Make non-scalar if obj.ndim == 2 and obj.shape[1] == 1: return (obj.shape[0], ) else: # Reversed, because MATLAB transposes return obj.shape[::-1] # Iterate over all children to determine if it should be scalar commondim = groupshape(group) idx = len(commondim) commondim = commondim[::-1] if len(commondim) == 1: commondim += (1, ) # Different shapes = non-scalar: nothing to do if not idx or len(group.keys()) == 1: for child in group.values(): if not isinstance(child, h5py.h5r.Reference): continue # One-sized references can just be resolved into group if child.size == 1: childname = child.name del fp[child.name] group.move(fp[child[()].item()].name, childname) else: # Object arrays might need to be cell arrays child.attrs['MATLAB_class'] = np.bytes_('cell') continue # Turn all children into references to make it non-scalar refs = fp.require_group('#refs#') # Simple loop over all group items. Assumes there are no more # groups within this group that haven't been resolved already. 
# Reshape a dataset/group/reference and turn it into reference for childname, child in group.items(): # Skip references with correct shape if (getattr(child, 'dtype', None) == h5py.h5r.Reference and getattr(child, 'shape', ()) == commondim): continue # Create a new dataset without any filters rf = group.create_dataset('__h5dereftemp__', shape=commondim, dtype=h5py.ref_dtype) # Iterate over dataset entries fi = np.nditer(rf, flags=['refs_ok', 'multi_index'], itershape=commondim) # Datasets are just turned into references, groups are # split into smaller groups referenced by datasets if isinstance(child, h5py._hl.dataset.Dataset): for _ in fi: # Obtain index for dataset if child.ndim == 2 and child.shape[1] == 1: index = fi.multi_index[:idx] + (Ellipsis, ) else: index = (Ellipsis, ) + fi.multi_index[:idx] # Differentiate between data and reference if child.dtype == h5py.h5r.Reference: v = fp[child.name][index] else: v = child[index] # Fix dimensions if v.ndim < 2: v = np.atleast_2d(v).T else: v = v[()] # Create dataset for each element with filters incr = str(len(refs.items())) refs.create_dataset_like(incr, child, shape=v.shape, chunks=None, maxshape=None) refs[incr][()] = v # Copy attributes for atr_key, atr_val in child.attrs.items(): refs[incr].attrs[atr_key] = atr_val rf[fi.multi_index] = refs[incr].ref else: # Get the group names fieldnames = np.empty(len(child.keys()), dtype=h5py.vlen_dtype(np.dtype('|S1'))) fieldnames[:] = [np.fromiter(f, '|S1') for f in child.keys()] for _ in fi: # Create new group for each split incr = str(len(refs.items())) refs.create_group(incr, track_order=True) # Add struct info refs[incr].attrs['MATLAB_class'] = np.bytes_('struct') refs[incr].attrs['MATLAB_fields'] = fieldnames # Iterate over group children for ckdname, ckd in child.items(): # Leave it like this, until needed if isinstance(ckd, h5py._hl.group.Group): raise NotImplementedError('Nested group') # Obtain index for dataset if ckd.ndim == 2 and ckd.shape[1] == 1: index = fi.multi_index[:idx] + (Ellipsis, ) else: index = (Ellipsis, ) + fi.multi_index[:idx] # Differentiate between data and reference if ckd.dtype == h5py.h5r.Reference: v = fp[ckd.name][index] else: v = ckd[index] # Fix dimensions if v.ndim < 2: v = np.atleast_2d(v).T else: v = v[()] # Create dataset for each element with filters refs[incr].create_dataset_like(ckdname, ckd, dtype=v.dtype, shape=v.shape, chunks=None, maxshape=None) refs[incr][ckdname][()] = v # Copy attributes for atr_key, atr_val in ckd.attrs.items(): refs[incr][ckdname].attrs[atr_key] = atr_val rf[fi.multi_index] = refs[incr].ref # Re-add ALL children to maintain tracking order for ckdname, ckd in group.items(): if ckdname == childname: del group[childname] group[childname] = group['__h5dereftemp__'] del group['__h5dereftemp__'] elif ckdname != '__h5dereftemp__': a = group[ckdname] del group[ckdname] group[ckdname] = a del a
def write_compound_datasets(f): utf8 = h5py.special_dtype(vlen=str) gender_enum_dtype = h5py.enum_dtype({"MALE": 0, "FEMALE": 1}, basetype=np.uint8) dt = np.dtype([ ('firstName', utf8), # variable lentgh utf8 ('surname', 'S20'), # fixed length ASCII ('gender', gender_enum_dtype), # enum type ('age', np.uint8), # uint ('fav_number', np.float32), # float ('vector', np.float32, (3,))]) # array data = np.zeros(4, dtype=dt) # Set the example data data[0] = ('Bob', 'Smith', 0, 32, 1.0, [1, 2, 3]) data[1] = ('Peter', 'Fletcher', 0, 43, 2.0, [16.2, 2.2, -32.4]) data[2] = ('James', 'Mudd', 0, 12, 3.0, [-32.1,-774.1,-3.0]) data[3] = ('Ellie', 'Kyle', 1, 22, 4.0, [2.1,74.1,-3.8]) f.create_dataset('contiguous_compound', data=data) f.create_dataset('chunked_compound', data=data, chunks=(1,), compression="gzip") # 2d compound use img number example imgdt = np.dtype([ ('real', np.float32), ('img', np.float32) ]) data = np.zeros((3, 3), dtype=imgdt) data[0][0] = (2.3, -7.3) data[0][1] = (12.3, -17.3) data[0][2] = (-32.3, -0.3) data[1][0] = (2.3, -7.3) data[1][1] = (12.3, -17.3) data[1][2] = (-32.3, -0.3) data[2][0] = (2.3, -7.3) data[2][1] = (12.3, -17.3) data[2][2] = (-32.3, -0.3) f.create_dataset('2d_contiguous_compound', data=data) f.create_dataset('2d_chunked_compound', data=data, chunks=(1,2), compression="gzip") # Compound dataset containing ragged arrays uint8_vlen_type = h5py.vlen_dtype(np.uint8) compound_vlen_dtype = np.dtype([ ('one', uint8_vlen_type), ('two', uint8_vlen_type) ]) data = np.zeros(3, dtype=compound_vlen_dtype) data[0] = (np.array([1]), np.array([2])) data[1] = (np.array([1,1]), np.array([2,2])) data[2] = (np.array([1,1,1]), np.array([2,2,2])) f.create_dataset('vlen_contiguous_compound', data=data, dtype=compound_vlen_dtype) f.create_dataset('vlen_chunked_compound', data=data, dtype=compound_vlen_dtype, chunks=(1,), compression="gzip") # Compound dataset arrays of vlen type compound_vlen_dtype = np.dtype([ ('name', utf8, 2) ]) pointData = np.zeros(2, dtype=utf8) pointData[0] = "James" pointData[1] = "Ellie" data = np.zeros(1, dtype=compound_vlen_dtype) data['name'] = np.array(pointData) f.create_dataset('array_vlen_contiguous_compound', data=data, dtype=compound_vlen_dtype) f.create_dataset('array_vlen_chunked_compound', data=data, dtype=compound_vlen_dtype, chunks=(1,), compression="gzip") # Nested compound datasets use 2 img numbers as an example nested_dt = np.dtype([ ('firstNumber', imgdt), ('secondNumber', imgdt), ]) data = np.zeros(3, dtype=nested_dt) data[1] = ((1,1), (1,1)) data[2] = ((2,2), (2,2)) f.create_dataset('nested_contiguous_compound', data=data, dtype=nested_dt) f.create_dataset('nested_chunked_compound', data=data, dtype=nested_dt, chunks=(2,), compression="gzip") f.flush() f.close()
def save_households_to_hdf5(households: Households, file_path: str, chunk_size: int = 50000):
    """
    Saves the households object to hdf5 format file ``file_path``. Currently for each household,
    the following values are stored:
    - id, area, type, max_size, household_complacency

    Parameters
    ----------
    households
        households object
    file_path
        path of the saved hdf5 file
    chunk_size
        number of households to save at a time. Note that they have to be copied to be saved,
        so keep the number below 1e6.
    """
    n_households = len(households)
    n_chunks = int(np.ceil(n_households / chunk_size))
    int_vlen_type = h5py.vlen_dtype(np.dtype("int64"))
    str_vlen_type = h5py.vlen_dtype(np.dtype("S20"))
    with h5py.File(file_path, "a") as f:
        households_dset = f.create_group("households")
        for chunk in range(n_chunks):
            idx1 = chunk * chunk_size
            idx2 = min((chunk + 1) * chunk_size, n_households)
            ids = []
            areas = []
            types = []
            max_sizes = []
            household_complacencies = []
            for household in households[idx1:idx2]:
                ids.append(household.id)
                if household.area is None:
                    areas.append(nan_integer)
                else:
                    areas.append(household.area.id)
                if household.type is None:
                    types.append(" ".encode("ascii", "ignore"))
                else:
                    types.append(household.type.encode("ascii", "ignore"))
                max_sizes.append(household.max_size)
                household_complacencies.append(household.household_complacency)
            ids = np.array(ids, dtype=np.int64)
            areas = np.array(areas, dtype=np.int64)
            types = np.array(types, dtype="S15")
            max_sizes = np.array(max_sizes, dtype=np.float64)
            household_complacencies = np.array(household_complacencies, dtype=np.float64)
            if chunk == 0:
                households_dset.attrs["n_households"] = n_households
                households_dset.create_dataset("id", data=ids, maxshape=(None,))
                households_dset.create_dataset("area", data=areas, maxshape=(None,))
                households_dset.create_dataset("type", data=types, maxshape=(None,))
                households_dset.create_dataset("max_size", data=max_sizes, maxshape=(None,))
                households_dset.create_dataset(
                    "household_complacency",
                    data=household_complacencies,
                    maxshape=(None,))
            else:
                newshape = (households_dset["id"].shape[0] + ids.shape[0],)
                households_dset["id"].resize(newshape)
                households_dset["id"][idx1:idx2] = ids
                households_dset["area"].resize(newshape)
                households_dset["area"][idx1:idx2] = areas
                households_dset["type"].resize(newshape)
                households_dset["type"][idx1:idx2] = types
                households_dset["max_size"].resize(newshape)
                households_dset["max_size"][idx1:idx2] = max_sizes
                households_dset["household_complacency"].resize(newshape)
                households_dset["household_complacency"][idx1:idx2] = household_complacencies

        # I don't know how to chunk these...
        relatives_in_households = []
        relatives_in_care_homes = []
        social_venues_specs_list = []
        social_venues_ids_list = []
        for household in households:
            if (household.relatives_in_households is None
                    or len(household.relatives_in_households) == 0):
                relatives_in_households.append(np.array([nan_integer], dtype=np.int64))
            else:
                relatives_in_households.append(
                    np.array(
                        [person.id for person in household.relatives_in_households],
                        dtype=np.int64,
                    ))
            if (household.relatives_in_care_homes is None
                    or len(household.relatives_in_care_homes) == 0):
                relatives_in_care_homes.append(np.array([nan_integer], dtype=np.int64))
            else:
                relatives_in_care_homes.append(
                    np.array(
                        [person.id for person in household.relatives_in_care_homes],
                        dtype=np.int64,
                    ))
            social_venues_ids = []
            social_venues_specs = []
            for spec in household.social_venues.keys():
                for social_venue in household.social_venues[spec]:
                    social_venues_specs.append(spec.encode("ascii", "ignore"))
                    social_venues_ids.append(social_venue.id)
            social_venues_specs_list.append(np.array(social_venues_specs, dtype="S20"))
            social_venues_ids_list.append(np.array(social_venues_ids, dtype=np.int64))
        relatives_in_households = np.array(relatives_in_households, dtype=int_vlen_type)
        relatives_in_care_homes = np.array(relatives_in_care_homes, dtype=int_vlen_type)
        social_venues_specs_list = np.array(social_venues_specs_list, dtype=str_vlen_type)
        social_venues_ids_list = np.array(social_venues_ids_list, dtype=int_vlen_type)
        try:
            households_dset.create_dataset(
                "relatives_in_households",
                data=relatives_in_households,
            )
        except Exception:
            relatives_in_households = np.array(relatives_in_households, dtype=np.int64)
            households_dset.create_dataset(
                "relatives_in_households",
                data=relatives_in_households,
            )
        try:
            households_dset.create_dataset(
                "relatives_in_care_homes",
                data=relatives_in_care_homes,
            )
        except Exception:
            relatives_in_care_homes = np.array(relatives_in_care_homes, dtype=np.int64)
            households_dset.create_dataset(
                "relatives_in_care_homes",
                data=relatives_in_care_homes,
            )
        households_dset.create_dataset(
            "social_venues_specs",
            data=social_venues_specs_list,
        )
        households_dset.create_dataset(
            "social_venues_ids",
            data=social_venues_ids_list,
        )
'''
import time
import warnings

import h5py
import numpy as np

#: Most up-to-date raw larpix hdf5 format version.
latest_version = '0.0'

#: Description of the datasets and their dtypes used in each version of the raw larpix hdf5 format.
#:
#: Structured as ``dataset_dtypes['<version>']['<dataset>'] = <dtype>``.
dataset_dtypes = {
    '0.0': {
        'msgs': h5py.vlen_dtype(np.dtype('u1')),
        'msg_headers': np.dtype([('io_groups', 'u1')])
    }
}


def _store_msgs_v0_0(msgs, version):
    msg_dtype = np.dtype('u1')
    arr_dtype = dataset_dtypes[version]['msgs']
    return np.array([np.frombuffer(msg, dtype=msg_dtype) for msg in msgs], dtype=arr_dtype)


def _store_msg_headers_v0_0(msg_headers, version):
    length = len(msg_headers['io_groups'])
    arr = np.zeros((length,), dtype=dataset_dtypes[version]['msg_headers'])
def saveh5(filename, X, ORF, y):
    dt = h5py.vlen_dtype(np.dtype('int32'))
    with h5py.File(filename, 'w') as h5file:
        h5file.create_dataset('X', dtype=dt, data=X)
        h5file.create_dataset('ORF', dtype=dt, data=ORF)
        h5file.create_dataset('y', data=y)
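# A hedged counterpart to saveh5 (not present in the original source): loads the
# three datasets back, returning X and ORF as lists of variable-length int32 arrays.
def loadh5(filename):
    with h5py.File(filename, 'r') as h5file:
        X = list(h5file['X'][:])
        ORF = list(h5file['ORF'][:])
        y = h5file['y'][:]
    return X, ORF, y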
def write_vlen_datasets(f): # Unsigned int uint8_vlen_type = h5py.vlen_dtype(np.uint8) uint8_vlen_dataset = f.create_dataset("vlen_uint8_data", (3, ), dtype=uint8_vlen_type) uint8_vlen_dataset[0] = [0] uint8_vlen_dataset[1] = [1, 2] uint8_vlen_dataset[2] = [3, 4, 5] uint16_vlen_type_chunked = h5py.vlen_dtype(np.uint16) uint16_vlen_dataset = f.create_dataset("vlen_uint16_data", (3, ), dtype=uint16_vlen_type_chunked) uint16_vlen_dataset[0] = [0] uint16_vlen_dataset[1] = [1, 2] uint16_vlen_dataset[2] = [3, 4, 5] uint32_vlen_type = h5py.vlen_dtype(np.uint32) uint32_vlen_dataset = f.create_dataset("vlen_uint32_data", (3, ), dtype=uint32_vlen_type) uint32_vlen_dataset[0] = [0] uint32_vlen_dataset[1] = [1, 2] uint32_vlen_dataset[2] = [3, 4, 5] uint64_vlen_type = h5py.vlen_dtype(np.uint64) uint64_vlen_dataset = f.create_dataset("vlen_uint64_data", (3, ), dtype=uint64_vlen_type) uint64_vlen_dataset[0] = [0] uint64_vlen_dataset[1] = [1, 2] uint64_vlen_dataset[2] = [3, 4, 5] # Signed int int8_vlen_type = h5py.vlen_dtype(np.int8) int8_vlen_dataset = f.create_dataset("vlen_int8_data", (3, ), dtype=int8_vlen_type) int8_vlen_dataset[0] = [0] int8_vlen_dataset[1] = [1, 2] int8_vlen_dataset[2] = [3, 4, 5] int16_vlen_type_chunked = h5py.vlen_dtype(np.int16) int16_vlen_dataset = f.create_dataset("vlen_int16_data", (3, ), dtype=int16_vlen_type_chunked) int16_vlen_dataset[0] = [0] int16_vlen_dataset[1] = [1, 2] int16_vlen_dataset[2] = [3, 4, 5] int32_vlen_type = h5py.vlen_dtype(np.int32) int32_vlen_dataset = f.create_dataset("vlen_int32_data", (3, ), dtype=int32_vlen_type) int32_vlen_dataset[0] = [0] int32_vlen_dataset[1] = [1, 2] int32_vlen_dataset[2] = [3, 4, 5] int64_vlen_type = h5py.vlen_dtype(np.int64) int64_vlen_dataset = f.create_dataset("vlen_int64_data", (3, ), dtype=int64_vlen_type) int64_vlen_dataset[0] = [0] int64_vlen_dataset[1] = [1, 2] int64_vlen_dataset[2] = [3, 4, 5] # Floating point float32_vlen_type = h5py.vlen_dtype(np.float32) float32_vlen_dataset = f.create_dataset("vlen_float32_data", (3, ), dtype=float32_vlen_type) float32_vlen_dataset[0] = [0] float32_vlen_dataset[1] = [1, 2] float32_vlen_dataset[2] = [3, 4, 5] float64_vlen_type = h5py.vlen_dtype(np.float64) float64_vlen_dataset = f.create_dataset("vlen_float64_data", (3, ), dtype=float64_vlen_type) float64_vlen_dataset[0] = [0] float64_vlen_dataset[1] = [1, 2] float64_vlen_dataset[2] = [3, 4, 5] # https://github.com/jamesmudd/jhdf/issues/247 int32_vlen_type = h5py.vlen_dtype(np.dtype(np.int32)) int32_vlen_dataset = f.create_dataset('vlen_issue_247', (3, ), dtype=int32_vlen_type) int32_vlen_dataset[0] = [1, 2, 3] int32_vlen_dataset[1] = [] int32_vlen_dataset[2] = [1, 2, 3, 4, 5] # Chunked # Unsigned int uint8_vlen_type = h5py.vlen_dtype(np.uint8) uint8_vlen_dataset_chunked = f.create_dataset("vlen_uint8_data_chunked", (3, ), dtype=uint8_vlen_type, chunks=(3, )) uint8_vlen_dataset_chunked[0] = [0] uint8_vlen_dataset_chunked[1] = [1, 2] uint8_vlen_dataset_chunked[2] = [3, 4, 5] uint16_vlen_type_chunked = h5py.vlen_dtype(np.uint16) uint16_vlen_dataset = f.create_dataset("vlen_uint16_data_chunked", (3, ), dtype=uint16_vlen_type_chunked, chunks=(3, )) uint16_vlen_dataset[0] = [0] uint16_vlen_dataset[1] = [1, 2] uint16_vlen_dataset[2] = [3, 4, 5] uint32_vlen_type = h5py.vlen_dtype(np.uint32) uint32_vlen_dataset_chunked = f.create_dataset("vlen_uint32_data_chunked", (3, ), dtype=uint32_vlen_type, chunks=(3, )) uint32_vlen_dataset_chunked[0] = [0] uint32_vlen_dataset_chunked[1] = [1, 2] uint32_vlen_dataset_chunked[2] = [3, 4, 5] 
uint64_vlen_type = h5py.vlen_dtype(np.uint64) uint64_vlen_dataset_chunked = f.create_dataset("vlen_uint64_data_chunked", (3, ), dtype=uint64_vlen_type, chunks=(3, )) uint64_vlen_dataset_chunked[0] = [0] uint64_vlen_dataset_chunked[1] = [1, 2] uint64_vlen_dataset_chunked[2] = [3, 4, 5] # Signed int int8_vlen_type = h5py.vlen_dtype(np.int8) int8_vlen_dataset = f.create_dataset("vlen_int8_data_chunked", (3, ), dtype=int8_vlen_type, chunks=(3, )) int8_vlen_dataset[0] = [0] int8_vlen_dataset[1] = [1, 2] int8_vlen_dataset[2] = [3, 4, 5] int16_vlen_type_chunked = h5py.vlen_dtype(np.int16) int16_vlen_dataset = f.create_dataset("vlen_int16_data_chunked", (3, ), dtype=int16_vlen_type_chunked, chunks=(3, )) int16_vlen_dataset[0] = [0] int16_vlen_dataset[1] = [1, 2] int16_vlen_dataset[2] = [3, 4, 5] int32_vlen_type = h5py.vlen_dtype(np.int32) int32_vlen_dataset = f.create_dataset("vlen_int32_data_chunked", (3, ), dtype=int32_vlen_type, chunks=(3, )) int32_vlen_dataset[0] = [0] int32_vlen_dataset[1] = [1, 2] int32_vlen_dataset[2] = [3, 4, 5] int64_vlen_type = h5py.vlen_dtype(np.int64) int64_vlen_dataset = f.create_dataset("vlen_int64_data_chunked", (3, ), dtype=int64_vlen_type, chunks=(3, )) int64_vlen_dataset[0] = [0] int64_vlen_dataset[1] = [1, 2] int64_vlen_dataset[2] = [3, 4, 5] # Floating point float32_vlen_type = h5py.vlen_dtype(np.float32) float32_vlen_dataset_chunked = f.create_dataset( "vlen_float32_data_chunked", (3, ), dtype=float32_vlen_type, chunks=(3, )) float32_vlen_dataset_chunked[0] = [0] float32_vlen_dataset_chunked[1] = [1, 2] float32_vlen_dataset_chunked[2] = [3, 4, 5] float64_vlen_type = h5py.vlen_dtype(np.float64) float64_vlen_dataset_chunked = f.create_dataset( "vlen_float64_data_chunked", (3, ), dtype=float64_vlen_type, chunks=(3, )) float64_vlen_dataset_chunked[0] = [0] float64_vlen_dataset_chunked[1] = [1, 2] float64_vlen_dataset_chunked[2] = [3, 4, 5] # https://github.com/jamesmudd/jhdf/issues/247 int32_vlen_type = h5py.vlen_dtype(np.dtype(np.int32)) int32_vlen_dataset = f.create_dataset('vlen_issue_247_chunked', (3, ), dtype=int32_vlen_type, chunks=(3, )) int32_vlen_dataset[0] = [1, 2, 3] int32_vlen_dataset[1] = [] int32_vlen_dataset[2] = [1, 2, 3, 4, 5] f.flush() f.close()
    map(lambda x: np.array(x, dtype=np.dtype("int32")),
        encoder.transform(df_train["review"])))
train_scores = df_train["userscore"].to_numpy(np.dtype("int32"))

valid_tokens = list(
    map(lambda x: np.array(x, dtype=np.dtype("int32")),
        encoder.transform(df_valid["review"])))
valid_scores = df_valid["userscore"].to_numpy(np.dtype("int32"))

test_tokens = list(
    map(lambda x: np.array(x, dtype=np.dtype("int32")),
        encoder.transform(df_test["review"])))
test_scores = df_test["userscore"].to_numpy(np.dtype("int32"))

with h5py.File("../data/reviews/tokenized.h5", "w") as f:
    dt = h5py.vlen_dtype(np.dtype("int32"))
    f.create_group("data")

    f.create_group("data/train")
    f.create_dataset("data/train/tokens", data=train_tokens, dtype=dt)
    f.create_dataset("data/train/scores", data=train_scores)

    f.create_group("data/valid")
    f.create_dataset("data/valid/tokens", data=valid_tokens, dtype=dt)
    f.create_dataset("data/valid/scores", data=valid_scores)

    f.create_group("data/test")
    f.create_dataset("data/test/tokens", data=test_tokens, dtype=dt)
    f.create_dataset("data/test/scores", data=test_scores)

    dt = h5py.string_dtype(encoding='utf-8')
    f.create_group("metadata")
    f.create_dataset("metadata/encoder",
                     data=json.dumps(encoder.vocabs_to_dict()),
def convert_to_hdf5(base_directory, override):
    batch_size = 1000
    for labels_filepath in sorted(Path(base_directory).rglob('*labels.csv')):
        print(f'Processing {labels_filepath}...')
        basedir = os.path.dirname(labels_filepath)
        dataset_name = os.path.relpath(basedir, base_directory)
        dataset_path = f'{base_directory}/{dataset_name.replace("/", "_")}.hdf5'
        if os.path.isfile(dataset_path):
            if override:
                os.remove(dataset_path)
            else:
                print(f'Dataset already exists, skipping {dataset_name}... \n')
                continue
        dataset = h5py.File(dataset_path, 'a')
        with open(labels_filepath, newline='') as csv_file:
            csv_data = np.asarray(list(csv.reader(csv_file)))
        labels = csv_data[:, 1:5].astype(np.float64)
        image_paths = np.asarray(
            [f'{basedir}/{image_name}' for image_name in csv_data[:, 0]])
        mask_paths = np.asarray([
            f'{os.path.splitext(image_path)[0]}.pgm'
            for image_path in image_paths
        ])
        load_masks = np.all(
            [os.path.isfile(mask_path) for mask_path in mask_paths])
        with tqdm(total=image_paths.shape[0], file=sys.stdout,
                  unit=' Images') as progress:
            dataset.create_dataset(f'{dataset_name}/labels',
                                   data=labels,
                                   maxshape=labels.shape,
                                   dtype=np.float64)
            images_dataset = dataset.create_dataset(
                f'{dataset_name}/images', (image_paths.shape[0],),
                dtype=h5py.vlen_dtype(np.uint8))
            masks_dataset = dataset.create_dataset(
                f'{dataset_name}/masks', (mask_paths.shape[0],),
                dtype=h5py.vlen_dtype(np.uint8)) if load_masks else None
            if os.path.isfile(f'{basedir}/mapping.json'):
                dataset.create_dataset(
                    f'{dataset_name}/mapping',
                    data=json.dumps(json.loads(
                        open(f'{basedir}/mapping.json', 'r+').read()),
                        indent=4))
            for index in range(0, image_paths.shape[0], batch_size):
                if load_masks:
                    images, masks = load_synthetic_data(
                        image_paths[index:index + batch_size],
                        mask_paths[index:index + batch_size])
                    images_dataset[index:index + batch_size] = images
                    masks_dataset[index:index + batch_size] = masks
                else:
                    images_dataset[index:index + batch_size] = [
                        np.frombuffer(open(file, 'rb').read(), dtype=np.uint8)
                        for file in image_paths[index:index + batch_size]
                    ]
                progress.update(image_paths[index:index + batch_size].shape[0])
        dataset.flush()
        dataset.close()
def save_companies_to_hdf5(companies: Companies, file_path: str, chunk_size: int = 500000):
    """
    Saves the Companies object to hdf5 format file ``file_path``. Currently for each company,
    the following values are stored:
    - id, super_area, sector, n_workers_max

    Parameters
    ----------
    companies
        companies object
    file_path
        path of the saved hdf5 file
    chunk_size
        number of companies to save at a time. Note that they have to be copied to be saved,
        so keep the number below 1e6.
    """
    n_companies = len(companies)
    n_chunks = int(np.ceil(n_companies / chunk_size))
    vlen_type = h5py.vlen_dtype(np.dtype("float64"))
    with h5py.File(file_path, "a") as f:
        companies_dset = f.create_group("companies")
        first_company_idx = companies[0].id
        for chunk in range(n_chunks):
            idx1 = chunk * chunk_size
            idx2 = min((chunk + 1) * chunk_size, n_companies)
            ids = []
            super_areas = []
            sectors = []
            n_workers_max = []
            company_idx = [company.id for company in companies[idx1:idx2]]
            # sort companies by id
            companies_sorted = [
                companies[i - first_company_idx] for i in np.sort(company_idx)
            ]
            for company in companies_sorted:
                ids.append(company.id)
                if company.super_area is None:
                    super_areas.append(nan_integer)
                else:
                    super_areas.append(company.super_area.id)
                sectors.append(company.sector.encode("ascii", "ignore"))
                n_workers_max.append(company.n_workers_max)
            ids = np.array(ids, dtype=np.int64)
            super_areas = np.array(super_areas, dtype=np.int64)
            sectors = np.array(sectors, dtype="S10")
            n_workers_max = np.array(n_workers_max, dtype=np.float64)
            if chunk == 0:
                companies_dset.attrs["n_companies"] = n_companies
                companies_dset.create_dataset("id", data=ids, maxshape=(None,))
                companies_dset.create_dataset("super_area", data=super_areas, maxshape=(None,))
                companies_dset.create_dataset("sector", data=sectors, maxshape=(None,))
                companies_dset.create_dataset("n_workers_max", data=n_workers_max, maxshape=(None,))
            else:
                newshape = (companies_dset["id"].shape[0] + ids.shape[0],)
                companies_dset["id"].resize(newshape)
                companies_dset["id"][idx1:idx2] = ids
                companies_dset["super_area"].resize(newshape)
                companies_dset["super_area"][idx1:idx2] = super_areas
                companies_dset["sector"].resize(newshape)
                companies_dset["sector"][idx1:idx2] = sectors
                companies_dset["n_workers_max"].resize(newshape)
                companies_dset["n_workers_max"][idx1:idx2] = n_workers_max