Example #1
def make_data(kind):
    global data
    global dt

    if kind is bytes:
        s = b"xx"
    else:
        s = b"xx".decode('utf8')

    dt = h5py.vlen_dtype(kind)
    data = np.array([s*100 for idx in range(1000)])
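A minimal sketch of how the `data` and `dt` globals produced by make_data might be used to write a variable-length dataset (the file and dataset names here are assumptions, not part of the original snippet):

import h5py
import numpy as np

make_data(bytes)  # fills the globals: `data` (array of byte strings) and `dt` (vlen dtype)
with h5py.File("vlen_demo.h5", "w") as f:
    # h5py converts the fixed-length strings in `data` to variable-length storage
    f.create_dataset("strings", data=data, dtype=dt)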
Example #2
    def test_vlen_enum(self):
        fname = self.mktemp()
        arr1 = [[1], [1, 2]]
        dt1 = h5py.vlen_dtype(h5py.enum_dtype(dict(foo=1, bar=2), 'i'))

        with h5py.File(fname, 'w') as f:
            df1 = f.create_dataset('test', (len(arr1),), dtype=dt1)
            df1[:] = np.array(arr1, dtype=object)

        with h5py.File(fname, 'r') as f:
            df2 = f['test']
            dt2 = df2.dtype
            arr2 = [e.tolist() for e in df2[:]]

        self.assertEqual(arr1, arr2)
        self.assertEqual(h5py.check_enum_dtype(h5py.check_vlen_dtype(dt1)),
                         h5py.check_enum_dtype(h5py.check_vlen_dtype(dt2)))
Example #3
    def test_convert(self):
        dt = h5py.vlen_dtype(int)
        ds = self.f.create_dataset('vlen', (3,), dtype=dt)
        ds[0] = np.array([1.4, 1.2])
        ds[1] = np.array([1.2])
        ds[2] = [1.2, 2, 3]
        self.assertArrayEqual(ds[0], np.array([1, 1]))
        self.assertArrayEqual(ds[1], np.array([1]))
        self.assertArrayEqual(ds[2], np.array([1, 2, 3]))
        ds[0:2] = np.array([[0.1, 1.1, 2.1, 3.1, 4], np.arange(4)],
                           dtype=object)
        self.assertArrayEqual(ds[0], np.arange(5))
        self.assertArrayEqual(ds[1], np.arange(4))
        ds[0:2] = np.array(
            [np.array([0.1, 1.2, 2.2]),
             np.array([0.2, 1.2, 2.2])])
        self.assertArrayEqual(ds[0], np.arange(3))
        self.assertArrayEqual(ds[1], np.arange(3))
Example #4
    def test_int(self):
        dt = h5py.vlen_dtype(int)
        ds = self.f.create_dataset('vlen', (4,), dtype=dt)
        ds[0] = np.arange(3)
        ds[1] = np.arange(0)
        ds[2] = [1, 2, 3]
        ds[3] = np.arange(1)
        self.assertArrayEqual(ds[0], np.arange(3))
        self.assertArrayEqual(ds[1], np.arange(0))
        self.assertArrayEqual(ds[2], np.array([1, 2, 3]))
        self.assertArrayEqual(ds[3], np.arange(1))
        ds[0:2] = np.array([np.arange(5), np.arange(4)], dtype=object)
        self.assertArrayEqual(ds[0], np.arange(5))
        self.assertArrayEqual(ds[1], np.arange(4))
        ds[0:2] = np.array([np.arange(3), np.arange(3)])
        self.assertArrayEqual(ds[0], np.arange(3))
        self.assertArrayEqual(ds[1], np.arange(3))
Example #5
    def test_vlen_enum(self):
        fname = self.mktemp()
        arr1 = [[1],[1,2]]
        dt1 = h5py.vlen_dtype(h5py.enum_dtype(dict(foo=1, bar=2), 'i'))

        with h5py.File(fname,'w') as f:
            df1 = f.create_dataset('test', (len(arr1),), dtype=dt1)
            df1[:] = np.array(arr1, dtype=object)

        with h5py.File(fname,'r') as f:
            df2  = f['test']
            dt2  = df2.dtype
            arr2 = [e.tolist() for e in df2[:]]

        self.assertEqual(arr1, arr2)
        self.assertEqual(h5py.check_enum_dtype(h5py.check_vlen_dtype(dt1)),
                         h5py.check_enum_dtype(h5py.check_vlen_dtype(dt2)))
Example #6
    def test_compound_vlen_bool(self):
        vidt = h5py.vlen_dtype(np.uint8)
        def a(items):
            return np.array(items, dtype=np.uint8)

        f = self.f

        dt_vb = np.dtype([
            ('foo', vidt),
            ('logical', bool)])
        vb = f.create_dataset('dt_vb', shape=(4,), dtype=dt_vb)
        data = np.array([(a([1, 2, 3]), True),
                         (a([1    ]), False),
                         (a([1, 5  ]), True),
                         (a([],), False), ],
                     dtype=dt_vb)
        vb[:] = data
        actual = f['dt_vb'][:]
        self.assertVlenArrayEqual(data['foo'], actual['foo'])
        self.assertArrayEqual(data['logical'], actual['logical'])

        dt_vv = np.dtype([
            ('foo', vidt),
            ('bar', vidt)])
        f.create_dataset('dt_vv', shape=(4,), dtype=dt_vv)

        dt_vvb = np.dtype([
            ('foo', vidt),
            ('bar', vidt),
            ('logical', bool)])
        vvb = f.create_dataset('dt_vvb', shape=(2,), dtype=dt_vvb)

        dt_bvv = np.dtype([
            ('logical', bool),
            ('foo', vidt),
            ('bar', vidt)])
        bvv = f.create_dataset('dt_bvv', shape=(2,), dtype=dt_bvv)
        data = np.array([(True, a([1, 2, 3]), a([1, 2])),
                         (False, a([]), a([2, 4, 6])), ],
                         dtype=dt_bvv)
        bvv[:] = data
        actual = bvv[:]
        self.assertVlenArrayEqual(data['foo'], actual['foo'])
        self.assertVlenArrayEqual(data['bar'], actual['bar'])
        self.assertArrayEqual(data['logical'], actual['logical'])
Example #7
    def test_compound_vlen_bool(self):
        vidt = h5py.vlen_dtype(np.uint8)
        def a(items):
            return np.array(items, dtype=np.uint8)

        f = self.f

        dt_vb = np.dtype([
            ('foo', vidt),
            ('logical', bool)])
        vb = f.create_dataset('dt_vb', shape=(4,), dtype=dt_vb)
        data = np.array([(a([1,2,3]), True),
                         (a([1    ]), False),
                         (a([1,5  ]), True),
                         (a([],    ), False),],
                     dtype=dt_vb)
        vb[:] = data
        actual = f['dt_vb'][:]
        self.assertVlenArrayEqual(data['foo'], actual['foo'])
        self.assertArrayEqual(data['logical'], actual['logical'])

        dt_vv = np.dtype([
            ('foo', vidt),
            ('bar', vidt)])
        f.create_dataset('dt_vv', shape=(4,), dtype=dt_vv)

        dt_vvb = np.dtype([
            ('foo', vidt),
            ('bar', vidt),
            ('logical', bool)])
        vvb = f.create_dataset('dt_vvb', shape=(2,), dtype=dt_vvb)

        dt_bvv = np.dtype([
            ('logical', bool),
            ('foo', vidt),
            ('bar', vidt)])
        bvv = f.create_dataset('dt_bvv', shape=(2,), dtype=dt_bvv)
        data = np.array([(True,  a([1,2,3]), a([1,2]) ),
                         (False, a([]),      a([2,4,6])),],
                         dtype=dt_bvv)
        bvv[:] = data
        actual = bvv[:]
        self.assertVlenArrayEqual(data['foo'], actual['foo'])
        self.assertVlenArrayEqual(data['bar'], actual['bar'])
        self.assertArrayEqual(data['logical'], actual['logical'])
Example #8
    def add_brep_from_string(self, name, shape_data):
        """
        Add a brep contained in a string.
        """
        if name not in self._ref:
            shape = self._ref.create_dataset(name, (1, ),
                                             dtype=h5py.vlen_dtype(str))
            if type(shape_data) == str:
                # raw str
                shape[:] = shape_data
            else:
                # __getstate__ as with pythonocc
                shape[:] = shape_data[0]
                shape.attrs['occ_indx'] = shape_data[1]

            shape.attrs['id'] = self._number_of_shapes
            shape.attrs['type'] = 'brep'

            self._number_of_shapes += 1
Example #9
def export_set(output_dir, name, data, labels, classes):
    """Stores paired data and labels into passed h5 file pointer."""

    assert len(data) == len(labels)

    # Variable-length datatypes for encoded png streams and label names
    dt_int = h5py.vlen_dtype(np.dtype('uint8'))
    dt_str = h5py.string_dtype(encoding='utf-8')

    # Initialize hdf5 file pointer
    f = h5py.File(f"{output_dir}/{name}_{len(data)}.h5", "w")

    # Create group and store data/labels
    x = f.create_dataset("data", (len(data), ), dtype=dt_int, data=data)
    y = f.create_dataset("label", data=np.array(labels, dtype=int))

    # Store <mapping from (0, 1 ...) to class names> as group attribute
    y.attrs.create("class_names", data=np.array(classes, dtype=dt_str))

    f.close()
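A hedged sketch of how the file written by export_set might be read back (the file name below merely follows the naming scheme above and is otherwise made up):

import h5py

with h5py.File("out/train_1000.h5", "r") as f:
    first_png = f["data"][0]                       # 1-D uint8 array holding one encoded png stream
    labels = f["label"][:]                         # integer labels
    class_names = f["label"].attrs["class_names"]  # names for the label indices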
Example #10
def write_contact_map_h5(h5_file, rows, cols):

    # Helper function to create ragged array
    def ragged(data):
        a = np.empty(len(data), dtype=object)
        a[...] = data
        return a

    # Specify variable length arrays
    dt = h5py.vlen_dtype(np.dtype("int16"))

    # list of np arrays of shape (2 * X) where X varies
    data = ragged([np.concatenate(row_col) for row_col in zip(rows, cols)])
    h5_file.create_dataset(
        "contact_map",
        data=data,
        dtype=dt,
        fletcher32=True,
        chunks=(1, ) + data.shape[1:],
    )
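A small usage sketch for write_contact_map_h5 (the row/column index lists and the file name are made-up examples):

import h5py

# One pair of row/column index lists per frame; lengths may differ between frames
rows = [[0, 1, 4], [2, 3]]
cols = [[1, 2, 5], [3, 4]]
with h5py.File("contact_maps.h5", "w") as h5_file:
    write_contact_map_h5(h5_file, rows, cols)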
Example #11
    def test_compound_vlen_enum(self):
        eidt = h5py.enum_dtype({'OFF': 0, 'ON': 1}, basetype=np.uint8)
        vidt = h5py.vlen_dtype(np.uint8)
        def a(items):
            return np.array(items, dtype=np.uint8)

        f = self.f

        dt_vve = np.dtype([
            ('foo', vidt),
            ('bar', vidt),
            ('switch', eidt)])
        vve = f.create_dataset('dt_vve', shape=(2,), dtype=dt_vve)
        data = np.array([(a([1,2,3]), a([1,2]),   1),
                         (a([]),      a([2,4,6]), 0),],
                         dtype=dt_vve)
        vve[:] = data
        actual = vve[:]
        self.assertVlenArrayEqual(data['foo'], actual['foo'])
        self.assertVlenArrayEqual(data['bar'], actual['bar'])
        self.assertArrayEqual(data['switch'], actual['switch'])
Example #12
    def test_compound_vlen_enum(self):
        eidt = h5py.enum_dtype({'OFF': 0, 'ON': 1}, basetype=np.uint8)
        vidt = h5py.vlen_dtype(np.uint8)

        def a(items):
            return np.array(items, dtype=np.uint8)

        f = self.f

        dt_vve = np.dtype([('foo', vidt), ('bar', vidt), ('switch', eidt)])
        vve = f.create_dataset('dt_vve', shape=(2, ), dtype=dt_vve)
        data = np.array([
            (a([1, 2, 3]), a([1, 2]), 1),
            (a([]), a([2, 4, 6]), 0),
        ],
                        dtype=dt_vve)
        vve[:] = data
        actual = vve[:]
        self.assertVlenArrayEqual(data['foo'], actual['foo'])
        self.assertVlenArrayEqual(data['bar'], actual['bar'])
        self.assertArrayEqual(data['switch'], actual['switch'])
Example #13
    def to_hdf5(self, parent: h5py.Group) -> None:
        """Stores this instance in an HDF5 group inside of the provided parent group.

        See also :func:`~qiskit_nature.hdf5.HDF5Storable.to_hdf5` for more details.

        Args:
            parent: the parent HDF5 group.
        """
        group = parent.require_group(self.name)
        group.attrs["__class__"] = self.__class__.__name__
        group.attrs["__module__"] = self.__class__.__module__
        group.attrs["__version__"] = self.VERSION

        group.attrs["num_body_terms"] = self._num_body_terms

        dtype = h5py.vlen_dtype(np.dtype("int32"))
        integrals_dset = group.create_dataset("integrals", (len(self.integrals),), dtype=dtype)
        coeffs_dset = group.create_dataset("coefficients", (len(self.integrals),), dtype=float)

        for idx, ints in enumerate(self.integrals):
            coeffs_dset[idx] = ints[0]
            integrals_dset[idx] = list(ints[1])
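A hedged sketch of how the "coefficients" and "integrals" datasets written above might be read back with plain h5py (the file name and group name are placeholders for whatever parent group was actually used):

import h5py

with h5py.File("property.h5", "r") as f:      # placeholder file name
    group = f["SomeProperty"]                  # placeholder for the group named self.name above
    coeffs = group["coefficients"][:]          # one float coefficient per term
    index_lists = [arr.tolist() for arr in group["integrals"][:]]  # one int32 index list per term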
Example #14
    def __init__(self, root_group: h5py.Group, total_length: int,
                 feature_dim: int):
        self.root_group = root_group
        LOG.debug(f"Creating dataset '{_FEATURE_GROUP}'")
        self._feature_dataset = self.root_group.create_dataset(
            _FEATURE_GROUP,
            (total_length, ),
            dtype=h5py.vlen_dtype(np.dtype("float32")),
        )
        self._feature_dataset.attrs["feature_dim"] = feature_dim

        LOG.debug(f"Creating group '{_LABEL_GROUP}'")
        self._label_group = self.root_group.create_group(_LABEL_GROUP)
        self._label_datasets = dict()

        LOG.debug(f"Creating dataset '{_UID_GROUP}'")
        self._uid_dataset = self.root_group.create_dataset(_UID_GROUP,
                                                           (total_length, ),
                                                           dtype=string_dtype)
        self.total_length = total_length
        self.feature_dim = feature_dim
        self._current_index = 0
Example #15
    def run(self) -> None:
        if self.cache_exists() and not self.force_update:
            self.logger.info(
                "Cached version of tokenized data already exists. " +
                "Skipping tokenization.")
            return None

        with h5py.File(self.hdf5_path, "a") as hdf5_store:
            for hdf5_group_name in self.raw_data_group_names.values():
                hdf5_group = hdf5_store.get(hdf5_group_name)
                captions = numpy.array(hdf5_group["caption_cleaned"])

                captions_tokenized = []
                captions_tokenized_id = []

                for caption in tqdm(captions):
                    caption_tokenized = (
                        self.tokenizer.encode_with_bos_eos(caption))
                    caption_tokenized_id = (
                        self.tokenizer.encode_ids_with_bos_eos(caption))
                    captions_tokenized.append(caption_tokenized)
                    captions_tokenized_id.append(caption_tokenized_id)

                if "caption_cleaned_tokenized" in hdf5_group.keys():
                    del hdf5_group["caption_cleaned_tokenized"]
                if "caption_cleaned_tokenized_id" in hdf5_group.keys():
                    del hdf5_group["caption_cleaned_tokenized_id"]

                hdf5_group.create_dataset(
                    "caption_cleaned_tokenized",
                    data=numpy.array(
                        captions_tokenized,
                        dtype=h5py.string_dtype(encoding="utf-8")))
                token_id_dataset = hdf5_group.create_dataset(
                    "caption_cleaned_tokenized_id",
                    shape=(len(captions_tokenized_id), ),
                    dtype=h5py.vlen_dtype(numpy.dtype("int32")))
                token_id_dataset[...] = captions_tokenized_id
Example #16
    def test_compound_vlen(self):
        vidt = h5py.vlen_dtype(np.uint8)
        eidt = h5py.enum_dtype({'OFF': 0, 'ON': 1}, basetype=np.uint8)

        for np_align in (False, True):
            dt = np.dtype([
                ('a', eidt),
                ('foo', vidt),
                ('bar', vidt),
                ('switch', eidt)], align=np_align)
            np_offsets = [dt.fields[i][1] for i in dt.names]

            for logical in (False, True):
                if logical and np_align:
                    # Vlen types have different size in the numpy struct
                    self.assertRaises(TypeError, h5py.h5t.py_create, dt,
                            logical=logical)
                else:
                    ht = h5py.h5t.py_create(dt, logical=logical)
                    offsets = [ht.get_member_offset(i)
                               for i in range(ht.get_nmembers())]
                    if np_align:
                        self.assertEqual(np_offsets, offsets)
Example #18
def _extract_features_mp(timestamps: Sequence[Sequence[float]],
                         sizes: Sequence[Sequence[float]],
                         max_size: int = DEFAULT_NUM_FEATURES,
                         n_jobs: Optional[int] = None) -> np.ndarray:
    features = np.zeros((len(sizes), max_size), float)

    # Serialise the timestamps and sizes to file
    with tempfile.TemporaryDirectory(prefix="kfp-extract-") as directory:
        with h5py.File(f"{directory}/data.hdf", mode="w") as h5file:
            dtype = h5py.vlen_dtype(np.dtype("float"))
            h5file.create_dataset("sizes", data=sizes, dtype=dtype)
            h5file.create_dataset("timestamps", data=timestamps, dtype=dtype)

        offset = 0
        # Use our own splits as imap chunking would yield them one at a time
        chunksize = 5000
        n_chunks = max(len(sizes) // chunksize, 1)
        splits = np.array_split(np.arange(len(sizes)), n_chunks)
        assert n_chunks == len(splits)
        _LOGGER.info("Extracting features in %d batches...", n_chunks)

        with multiprocessing.Pool(n_jobs) as pool:
            # Pass the filenames and indices to the background process
            for i, batch in enumerate(
                    pool.imap(functools.partial(_run_extraction,
                                                directory=directory,
                                                max_size=max_size),
                              splits,
                              chunksize=1)):
                # Recombine the extracted batches in their original order
                features[offset:offset + len(batch), :] = batch
                offset += len(batch)

                _LOGGER.info("Extraction is %.2f%% complete.",
                             ((i + 1) * 100 / n_chunks))

    return features
Example #19
def save_commute_hubs_to_hdf5(commute_hubs: CommuteHubs, file_path: str):
    n_hubs = len(commute_hubs)
    dt = h5py.vlen_dtype(np.dtype("int32"))
    with h5py.File(file_path, "a") as f:
        commute_hubs_dset = f.create_group("commute_hubs")
        ids = []
        cities = []
        commute_units_list = []
        for hub in commute_hubs:
            ids.append(hub.id)
            cities.append(hub.city)
            commute_units = []
            for commute_unit in hub.commuteunits:
                commute_units.append(commute_unit.id)
                commute_units_list.append(np.array(commute_units, dtype=int))

        ids = np.array(ids, dtype=int)
        cities = np.array(cities, dtype="S20")
        commute_units_list = np.array(commute_units_list, dtype=dt)
        commute_hubs_dset.attrs["n_commute_hubs"] = n_hubs
        commute_hubs_dset.create_dataset("id", data=ids)
        commute_hubs_dset.create_dataset("city_names", data=cities)
        commute_hubs_dset.create_dataset("commute_units",
                                         data=commute_units_list)
Example #20
    def test_reuse_from_other(self):
        dt = h5py.vlen_dtype(int)
        ds = self.f.create_dataset('vlen', (1,), dtype=dt)
        self.f.create_dataset('vlen2', (1,), ds[()].dtype)
Example #21
import h5py
import numpy as np

from june.groups import Schools, School
from june.world import World
from .utils import read_dataset

nan_integer = -999

int_vlen_type = h5py.vlen_dtype(np.dtype("int64"))


def save_schools_to_hdf5(schools: Schools,
                         file_path: str,
                         chunk_size: int = 50000):
    """
    Saves the schools object to the hdf5 file ``file_path``. Currently, for each school
    the following values are stored:
    - id, n_pupils_max, age_min, age_max, sector, coordinates

    Parameters
    ----------
    schools
        schools object
    file_path
        path of the saved hdf5 file
    chunk_size
        number of schools to save at a time. Note that they have to be copied to be saved,
        so keep the number below 1e6.
    """
    n_schools = len(schools)
Example #22
def save_hospitals_to_hdf5(hospitals: Hospitals,
                           file_path: str,
                           chunk_size: int = 50000):
    """
    Saves the Hospitals object to the hdf5 file ``file_path``. Currently, for each hospital
    the following values are stored:
    - id, n_beds, n_icu_beds, super_area, coordinates

    Parameters
    ----------
    hospitals
        hospitals object
    file_path
        path of the saved hdf5 file
    chunk_size
        number of hospitals to save at a time. Note that they have to be copied to be saved,
        so keep the number below 1e6.
    """
    n_hospitals = len(hospitals)
    n_chunks = int(np.ceil(n_hospitals / chunk_size))
    vlen_type = h5py.vlen_dtype(np.dtype("float64"))
    with h5py.File(file_path, "a") as f:
        hospitals_dset = f.create_group("hospitals")
        for chunk in range(n_chunks):
            idx1 = chunk * chunk_size
            idx2 = min((chunk + 1) * chunk_size, n_hospitals)
            ids = []
            n_beds = []
            n_icu_beds = []
            super_areas = []
            coordinates = []
            trust_code = []
            for hospital in hospitals[idx1:idx2]:
                ids.append(hospital.id)
                if hospital.super_area is None:
                    super_areas.append(nan_integer)
                else:
                    super_areas.append(hospital.super_area)
                n_beds.append(hospital.n_beds)
                n_icu_beds.append(hospital.n_icu_beds)
                coordinates.append(np.array(hospital.coordinates))
                trust_code.append(hospital.trust_code)

            ids = np.array(ids, dtype=int)
            super_areas = np.array(super_areas, dtype="S20")
            trust_code = np.array(trust_code, dtype="S10")
            n_beds = np.array(n_beds, dtype=int)
            n_icu_beds = np.array(n_icu_beds, dtype=int)
            coordinates = np.array(coordinates, dtype=float)
            if chunk == 0:
                hospitals_dset.attrs["n_hospitals"] = n_hospitals
                hospitals_dset.create_dataset("id",
                                              data=ids,
                                              maxshape=(None, ))
                hospitals_dset.create_dataset("super_area",
                                              data=super_areas,
                                              maxshape=(None, ))
                hospitals_dset.create_dataset("trust_code",
                                              data=trust_code,
                                              maxshape=(None, ))
                hospitals_dset.create_dataset("n_beds",
                                              data=n_beds,
                                              maxshape=(None, ))
                hospitals_dset.create_dataset("n_icu_beds",
                                              data=n_icu_beds,
                                              maxshape=(None, ))
                hospitals_dset.create_dataset("coordinates",
                                              data=coordinates,
                                              maxshape=(None,
                                                        coordinates.shape[1]))
            else:
                newshape = (hospitals_dset["id"].shape[0] + ids.shape[0], )
                hospitals_dset["id"].resize(newshape)
                hospitals_dset["id"][idx1:idx2] = ids
                hospitals_dset["super_area"].resize(newshape)
                hospitals_dset["super_area"][idx1:idx2] = super_areas
                hospitals_dset["trust_code"].resize(newshape)
                hospitals_dset["trust_code"][idx1:idx2] = trust_code
                hospitals_dset["n_beds"].resize(newshape)
                hospitals_dset["n_beds"][idx1:idx2] = n_beds
                hospitals_dset["n_icu_beds"].resize(newshape)
                hospitals_dset["n_icu_beds"][idx1:idx2] = n_icu_beds
                hospitals_dset["coordinates"].resize(newshape[0], axis=0)
                hospitals_dset["coordinates"][idx1:idx2] = coordinates
Example #23
import h5py
import numpy as np
from collections import defaultdict

from june.groups import ExternalGroup, ExternalSubgroup
from june.geography import Geography, Area, SuperArea, Areas, SuperAreas, Region, Regions
from .utils import read_dataset
from june.world import World

nan_integer = -999
int_vlen_type = h5py.vlen_dtype(np.dtype("int64"))
str_vlen_type = h5py.vlen_dtype(np.dtype("S40"))

social_venues_spec_mapper = {
    "pubs": "pubs",
    "household_visits": "households",
    "care_home_visits": "care_homes",
    "cinemas": "cinemas",
    "groceries": "groceries",
}

super_group_to_group_mapper = {
    "pubs": "pub",
    "groceries": "grocery",
    "cinemas": "cinema",
}


def save_geography_to_hdf5(geography: Geography, file_path: str):
    """
    Saves the geography object to the hdf5 file ``file_path``. Currently, for each geographical unit,
Example #24
    def write(self, ds: Dataset):
        import h5py
        # For data checksums
        dataset_kwargs = {"chunks": True, "fletcher32": True}

        n_records = len(ds.data.records)
        default_shape = (n_records, )

        if h5py.__version__ >= distutils.version.StrictVersion("2.10.0"):
            vlen_double_t = h5py.vlen_dtype(np.dtype("float64"))
            utf8_t = h5py.string_dtype(encoding="utf-8")
            bytes_t = h5py.vlen_dtype(np.dtype("uint8"))
            vlen_utf8_t = h5py.vlen_dtype(utf8_t)
        else:
            vlen_double_t = h5py.special_dtype(vlen=np.dtype("float64"))
            utf8_t = h5py.special_dtype(vlen=str)
            bytes_t = h5py.special_dtype(vlen=np.dtype("uint8"))
            vlen_utf8_t = h5py.special_dtype(vlen=utf8_t)

        driver_dataspec = {
            "energy": {
                "dtype": np.dtype("float64"),
                "shape": default_shape
            },
            "gradient": {
                "dtype": vlen_double_t,
                "shape": default_shape
            },
            "hessian": {
                "dtype": vlen_double_t,
                "shape": default_shape
            },
            "dipole": {
                "dtype": np.dtype("float64"),
                "shape": (n_records, 3)
            }
        }

        def _write_dataset(dataset, column, entry_dset):
            assert column.shape[1] == 1
            for i, name in enumerate(entry_dset):
                element = column.loc[name][0]
                if not h5py.check_dtype(vlen=dataset.dtype):
                    dataset[i] = element
                # Variable length datatypes require flattening of the array and special handling of missing values
                else:
                    try:
                        dataset[i] = element.ravel()
                    except AttributeError:
                        if np.isnan(element):
                            pass
                        else:
                            raise

        with self._write_file() as f:
            # Collection attributes
            for field in {
                    "name", "collection", "provenance", "tagline", "tags",
                    "id", "history_keys"
            }:
                f.attrs[field] = self._serialize_field(getattr(ds.data, field))
            if ds.client is not None:
                f.attrs["server_information"] = self._serialize_field(
                    ds.client.server_information())
                f.attrs["server_address"] = self._serialize_field(
                    ds.client.address)

            # Export molecules
            molecule_group = f.create_group("molecule")

            if "stoichiometry" in ds.data.history_keys:
                molecules = ds.get_molecules(stoich=list(ds.valid_stoich),
                                             force=True)
            else:
                molecules = ds.get_molecules(force=True)
            mol_shape = (len(molecules), )
            mol_geometry = molecule_group.create_dataset("geometry",
                                                         shape=mol_shape,
                                                         dtype=vlen_double_t,
                                                         **dataset_kwargs)
            mol_symbols = molecule_group.create_dataset("symbols",
                                                        shape=mol_shape,
                                                        dtype=vlen_utf8_t,
                                                        **dataset_kwargs)
            mol_schema = molecule_group.create_dataset("schema",
                                                       shape=mol_shape,
                                                       dtype=bytes_t,
                                                       **dataset_kwargs)
            mol_charge = molecule_group.create_dataset(
                "charge",
                shape=mol_shape,
                dtype=np.dtype('float64'),
                **dataset_kwargs)
            mol_spin = molecule_group.create_dataset("multiplicity",
                                                     shape=mol_shape,
                                                     dtype=np.dtype('int32'),
                                                     **dataset_kwargs)
            mol_id_server_view = {}
            for i, mol_row in enumerate(molecules.to_dict("records")):
                molecule = mol_row["molecule"]
                mol_geometry[i] = molecule.geometry.ravel()
                mol_schema[i] = self._serialize_data(molecule)
                mol_symbols[i] = molecule.symbols
                mol_charge[i] = molecule.molecular_charge
                mol_spin[i] = molecule.molecular_multiplicity
                mol_id_server_view[molecule.id] = i

            # Export entries
            entry_group = f.create_group("entry")
            entry_dset = entry_group.create_dataset("entry",
                                                    shape=default_shape,
                                                    dtype=utf8_t,
                                                    **dataset_kwargs)
            entry_dset[:] = ds.get_index()

            entries = ds.get_entries(force=True)
            if isinstance(ds.data.records[0], MoleculeEntry):
                entry_group.attrs["model"] = "MoleculeEntry"
                entries["hdf5_molecule_id"] = entries["molecule_id"].map(
                    mol_id_server_view)
                entry_group.create_dataset("name",
                                           data=entries["name"],
                                           dtype=utf8_t,
                                           **dataset_kwargs)
                entry_group.create_dataset("molecule_id",
                                           data=entries["hdf5_molecule_id"],
                                           dtype=np.dtype("int64"),
                                           **dataset_kwargs)
            elif isinstance(ds.data.records[0], ReactionEntry):
                entry_group.attrs["model"] = "ReactionEntry"
                entries["hdf5_molecule_id"] = entries["molecule"].map(
                    mol_id_server_view)
                entry_group.create_dataset("name",
                                           data=entries["name"],
                                           dtype=utf8_t,
                                           **dataset_kwargs)
                entry_group.create_dataset("stoichiometry",
                                           data=entries["stoichiometry"],
                                           dtype=utf8_t,
                                           **dataset_kwargs)
                entry_group.create_dataset("molecule",
                                           data=entries["hdf5_molecule_id"],
                                           dtype=np.dtype("int64"),
                                           **dataset_kwargs)
                entry_group.create_dataset("coefficient",
                                           data=entries["coefficient"],
                                           dtype=np.dtype("float64"),
                                           **dataset_kwargs)
            else:
                raise ValueError(
                    f"Unknown entry class ({type(ds.data.records[0])}) while writing HDF5 entries."
                )

            # Export native data columns
            value_group = f.create_group("value")
            history = ds.list_values(
                native=True, force=True).reset_index().to_dict("records")
            for specification in history:
                gv_spec = specification.copy()
                name = gv_spec.pop("name")
                if "stoichiometry" in gv_spec:
                    gv_spec["stoich"] = gv_spec.pop("stoichiometry")
                dataset_name = self._normalize_hdf5_name(name)
                df = ds.get_values(**gv_spec, force=True)
                assert df.shape[1] == 1

                driver = specification["driver"]
                dataspec = driver_dataspec[driver]
                dataset = value_group.create_dataset(dataset_name, **dataspec,
                                                     **dataset_kwargs)

                for key in specification:
                    dataset.attrs[key] = self._serialize_field(
                        specification[key])
                dataset.attrs["units"] = self._serialize_field(ds.units)

                _write_dataset(dataset, df, entry_dset)

            # Export contributed data columns
            contributed_group = f.create_group("contributed_value")
            for cv_name in ds.list_values(force=True, native=False)["name"]:
                cv_df = ds.get_values(name=cv_name, force=True, native=False)
                cv_model = ds.data.contributed_values[cv_name.lower()]

                try:
                    dataspec = driver_dataspec[
                        cv_model.theory_level_details["driver"]]
                except (KeyError, TypeError):
                    warnings.warn(
                        f"Contributed values column {cv_name} does not provide driver in theory_level_details. "
                        f"Assuming default driver for the dataset ({ds.data.default_driver})."
                    )
                    dataspec = driver_dataspec[ds.data.default_driver]

                dataset = contributed_group.create_dataset(
                    self._normalize_hdf5_name(cv_name), **dataspec,
                    **dataset_kwargs)
                for field in {
                        "name", "theory_level", "units", "doi", "comments",
                        "theory_level", "theory_level_details"
                }:
                    dataset.attrs[field] = self._serialize_field(
                        getattr(cv_model, field))

                _write_dataset(dataset, cv_df, entry_dset)

        # Clean up any caches
        self._entries = None
Example #25
    def log_population(
        self,
        population: Population,
        chunk_size: int = 100000,
    ):
        """
        Saves the Population object to hdf5 format file ``self.save_path``. Currently for each person,
        the following values are stored:
        - id, age, sex, super_area

        Parameters
        ----------
        population:
            population object
        chunk_size:
            number of people to save at a time. Note that they have to be copied to be saved,
            so keep the number below 1e6.
        """
        n_people = len(population.people)
        dt = h5py.vlen_dtype(np.dtype("int32"))
        # dt = tuple
        n_chunks = int(np.ceil(n_people / chunk_size))
        with h5py.File(self.file_path, "a", libver="latest") as f:
            people_dset = f.create_group("population")
            people_dset.attrs["n_people"] = n_people
            for chunk in range(n_chunks):
                idx1 = chunk * chunk_size
                idx2 = min((chunk + 1) * chunk_size, n_people)
                ids = []
                ages = []
                sexes = []
                ethnicities = []
                socioeconomic_indcs = []
                super_areas = []

                for person in population.people[idx1:idx2]:
                    ids.append(person.id)
                    ages.append(person.age)
                    ethnicities.append(
                        person.ethnicity.encode("ascii", "ignore"))
                    socioeconomic_indcs.append(person.socioecon_index)
                    sexes.append(person.sex.encode("ascii", "ignore"))
                    super_areas.append(person.area.super_area.name)

                ids = np.array(ids, dtype=int)
                ages = np.array(ages, dtype=np.int16)
                sexes = np.array(sexes, dtype="S10")
                super_areas = np.array(super_areas, dtype="S10")
                ethnicities = np.array(ethnicities, dtype="S10")
                socioeconomic_indcs = np.array(socioeconomic_indcs,
                                               dtype=np.int8)

                if chunk == 0:
                    people_dset.create_dataset("id",
                                               data=ids,
                                               maxshape=(None, ),
                                               compression="gzip")
                    people_dset.create_dataset("age",
                                               data=ages,
                                               maxshape=(None, ),
                                               compression="gzip")
                    people_dset.create_dataset("sex",
                                               data=sexes,
                                               maxshape=(None, ),
                                               compression="gzip")
                    people_dset.create_dataset(
                        "ethnicity",
                        data=ethnicities,
                        maxshape=(None, ),
                        compression="gzip",
                    )
                    people_dset.create_dataset(
                        "socioeconomic_index",
                        data=socioeconomic_indcs,
                        maxshape=(None, ),
                        compression="gzip",
                    )
                    people_dset.create_dataset(
                        "super_area",
                        data=super_areas,
                        maxshape=(None, ),
                        compression="gzip",
                    )
                else:
                    newshape = (people_dset["id"].shape[0] + ids.shape[0], )
                    people_dset["id"].resize(newshape)
                    people_dset["id"][idx1:idx2] = ids
                    people_dset["age"].resize(newshape)
                    people_dset["age"][idx1:idx2] = ages
                    people_dset["sex"].resize(newshape)
                    people_dset["sex"][idx1:idx2] = sexes
                    people_dset["super_area"].resize(newshape)
                    people_dset["super_area"][idx1:idx2] = super_areas
                    people_dset["ethnicity"].resize(newshape)
                    people_dset["ethnicity"][idx1:idx2] = ethnicities
                    people_dset["socioeconomic_index"].resize(newshape)
                    people_dset["socioeconomic_index"][
                        idx1:idx2] = socioeconomic_indcs
Example #26
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 22 10:35:01 2019

@author: Vedran Furtula
"""

import h5py, random
import numpy

run_test = 1

if run_test == 0:

    dt_ = h5py.vlen_dtype(numpy.dtype('float32'))

    with h5py.File('resize_dataset.hdf5', 'w') as f:
        d1 = f.create_dataset('dataset1', (0, ), maxshape=(None, ), dtype=dt_)
        d2 = f.create_dataset('dataset2', (0, ), maxshape=(None, ))
        #d1[:10] = np.random.randn(10)
        #d2[:5] = np.random.randn(5)
        #d.resize((200,))
        #d[100:200] = np.random.randn(100)

    with h5py.File('resize_dataset.hdf5', 'r') as f:
        dset = f['dataset1']
        print("dset: ", dset[:])

    for tal in range(10):
        with h5py.File('resize_dataset.hdf5', 'a') as f:
Example #27
def _fixmatlabstruct(fp):  # noqa: C901
    """Verify MATLAB structs: It cannot load mixed non-scalar structs"""
    groups = []

    def collectgroups(name, obj):
        """Callback function to collect all suitable struct groups"""
        if (isinstance(obj, h5py._hl.group.Group) and name != '#refs#'
                and obj.attrs.get('MATLAB_class', None) != b'struct'):
            groups.append(obj)

    def dynamiciterator():
        """Dynamically reassessing groups iterator"""
        while True:
            fp.visititems(collectgroups)
            if groups:
                yield groups[-1]  # Start with last
            else:
                return

    # Iterate over all groups to make them MATLAB compatible structs
    for group in dynamiciterator():
        groups = []  # Reset groups for iterator
        group.attrs['MATLAB_class'] = np.bytes_('struct')

        # Create struct fields
        fieldnames = np.empty(len(group.keys()),
                              dtype=h5py.vlen_dtype(np.dtype('|S1')))
        fieldnames[:] = [np.fromiter(f, '|S1') for f in group.keys()]
        group.attrs['MATLAB_fields'] = fieldnames

        # Recurse into groups to obtain shape (visititems not suitable)
        def groupshape(obj):
            """Determine common shape"""
            if isinstance(obj, h5py._hl.group.Group):
                # Collect shapes from children
                dims = [groupshape(chld) for chld in obj.values()]

                # Obtain first n common dimensions
                commondim = ()
                for d in zip(*dims):
                    if len(set(d)) != 1:
                        break
                    commondim += (d[0], )

                # Pass upward
                return commondim
            else:
                if 'MATLAB_empty' in obj.attrs:
                    return (-np.random.randint(100), )  # Make non-scalar
                if obj.ndim == 2 and obj.shape[1] == 1:
                    return (obj.shape[0], )
                else:
                    # Reversed, because MATLAB transposes
                    return obj.shape[::-1]

        # Iterate over all children to determine if it should be scalar
        commondim = groupshape(group)
        idx = len(commondim)
        commondim = commondim[::-1]
        if len(commondim) == 1:
            commondim += (1, )

        # Different shapes = non-scalar: nothing to do
        if not idx or len(group.keys()) == 1:
            for child in group.values():
                if not isinstance(child, h5py.h5r.Reference):
                    continue

                # One-sized references can just be resolved into group
                if child.size == 1:
                    childname = child.name
                    del fp[child.name]
                    group.move(fp[child[()].item()].name, childname)
                else:
                    # Object arrays might need to be cell arrays
                    child.attrs['MATLAB_class'] = np.bytes_('cell')
            continue

        # Turn all children into references to make it non-scalar
        refs = fp.require_group('#refs#')

        # Simple loop over all group items. Assumes there are no more
        # groups within this group that haven't been resolved already.
        # Reshape a dataset/group/reference and turn it into reference
        for childname, child in group.items():
            # Skip references with correct shape
            if (getattr(child, 'dtype', None) == h5py.h5r.Reference
                    and getattr(child, 'shape', ()) == commondim):
                continue

            # Create a new dataset without any filters
            rf = group.create_dataset('__h5dereftemp__',
                                      shape=commondim,
                                      dtype=h5py.ref_dtype)

            # Iterate over dataset entries
            fi = np.nditer(rf,
                           flags=['refs_ok', 'multi_index'],
                           itershape=commondim)

            # Datasets are just turned into references, groups are
            # split into smaller groups referenced by datasets
            if isinstance(child, h5py._hl.dataset.Dataset):
                for _ in fi:
                    # Obtain index for dataset
                    if child.ndim == 2 and child.shape[1] == 1:
                        index = fi.multi_index[:idx] + (Ellipsis, )
                    else:
                        index = (Ellipsis, ) + fi.multi_index[:idx]

                    # Differentiate between data and reference
                    if child.dtype == h5py.h5r.Reference:
                        v = fp[child.name][index]
                    else:
                        v = child[index]

                    # Fix dimensions
                    if v.ndim < 2:
                        v = np.atleast_2d(v).T
                    else:
                        v = v[()]

                    # Create dataset for each element with filters
                    incr = str(len(refs.items()))
                    refs.create_dataset_like(incr,
                                             child,
                                             shape=v.shape,
                                             chunks=None,
                                             maxshape=None)
                    refs[incr][()] = v

                    # Copy attributes
                    for atr_key, atr_val in child.attrs.items():
                        refs[incr].attrs[atr_key] = atr_val
                    rf[fi.multi_index] = refs[incr].ref
            else:
                # Get the group names
                fieldnames = np.empty(len(child.keys()),
                                      dtype=h5py.vlen_dtype(np.dtype('|S1')))
                fieldnames[:] = [np.fromiter(f, '|S1') for f in child.keys()]

                for _ in fi:
                    # Create new group for each split
                    incr = str(len(refs.items()))
                    refs.create_group(incr, track_order=True)

                    # Add struct info
                    refs[incr].attrs['MATLAB_class'] = np.bytes_('struct')
                    refs[incr].attrs['MATLAB_fields'] = fieldnames

                    # Iterate over group children
                    for ckdname, ckd in child.items():
                        # Leave it like this, until needed
                        if isinstance(ckd, h5py._hl.group.Group):
                            raise NotImplementedError('Nested group')

                        # Obtain index for dataset
                        if ckd.ndim == 2 and ckd.shape[1] == 1:
                            index = fi.multi_index[:idx] + (Ellipsis, )
                        else:
                            index = (Ellipsis, ) + fi.multi_index[:idx]

                        # Differentiate between data and reference
                        if ckd.dtype == h5py.h5r.Reference:
                            v = fp[ckd.name][index]
                        else:
                            v = ckd[index]

                        # Fix dimensions
                        if v.ndim < 2:
                            v = np.atleast_2d(v).T
                        else:
                            v = v[()]

                        # Create dataset for each element with filters
                        refs[incr].create_dataset_like(ckdname,
                                                       ckd,
                                                       dtype=v.dtype,
                                                       shape=v.shape,
                                                       chunks=None,
                                                       maxshape=None)
                        refs[incr][ckdname][()] = v

                        # Copy attributes
                        for atr_key, atr_val in ckd.attrs.items():
                            refs[incr][ckdname].attrs[atr_key] = atr_val

                    rf[fi.multi_index] = refs[incr].ref

            # Re-add ALL children to maintain tracking order
            for ckdname, ckd in group.items():
                if ckdname == childname:
                    del group[childname]
                    group[childname] = group['__h5dereftemp__']
                    del group['__h5dereftemp__']
                elif ckdname != '__h5dereftemp__':
                    a = group[ckdname]
                    del group[ckdname]
                    group[ckdname] = a
                    del a
Example #28
def write_compound_datasets(f):

    utf8 = h5py.special_dtype(vlen=str)
    gender_enum_dtype = h5py.enum_dtype({"MALE": 0, "FEMALE": 1}, basetype=np.uint8)
    dt = np.dtype([
        ('firstName', utf8), # variable length utf8
        ('surname', 'S20'), # fixed length ASCII
        ('gender', gender_enum_dtype), # enum type
        ('age', np.uint8), # uint
        ('fav_number', np.float32), # float
        ('vector', np.float32, (3,))]) # array

    data = np.zeros(4, dtype=dt)

    # Set the example data
    data[0] = ('Bob', 'Smith', 0, 32, 1.0, [1, 2, 3])
    data[1] = ('Peter', 'Fletcher', 0, 43, 2.0, [16.2, 2.2, -32.4])
    data[2] = ('James', 'Mudd', 0, 12, 3.0, [-32.1,-774.1,-3.0])
    data[3] = ('Ellie', 'Kyle', 1, 22, 4.0, [2.1,74.1,-3.8])

    f.create_dataset('contiguous_compound', data=data)
    f.create_dataset('chunked_compound', data=data, chunks=(1,), compression="gzip")

    # 2d compound use img number example
    imgdt = np.dtype([
        ('real', np.float32),
        ('img', np.float32)
    ])
    data = np.zeros((3, 3), dtype=imgdt)
    data[0][0] = (2.3, -7.3)
    data[0][1] = (12.3, -17.3)
    data[0][2] = (-32.3, -0.3)
    data[1][0] = (2.3, -7.3)
    data[1][1] = (12.3, -17.3)
    data[1][2] = (-32.3, -0.3)
    data[2][0] = (2.3, -7.3)
    data[2][1] = (12.3, -17.3)
    data[2][2] = (-32.3, -0.3)

    f.create_dataset('2d_contiguous_compound', data=data)
    f.create_dataset('2d_chunked_compound', data=data, chunks=(1,2), compression="gzip")

    # Compound dataset containing ragged arrays
    uint8_vlen_type = h5py.vlen_dtype(np.uint8)
    compound_vlen_dtype = np.dtype([
        ('one', uint8_vlen_type),
        ('two', uint8_vlen_type)
    ])
    data = np.zeros(3, dtype=compound_vlen_dtype)
    data[0] = (np.array([1]), np.array([2]))
    data[1] = (np.array([1,1]), np.array([2,2]))
    data[2] = (np.array([1,1,1]), np.array([2,2,2]))

    f.create_dataset('vlen_contiguous_compound', data=data, dtype=compound_vlen_dtype)
    f.create_dataset('vlen_chunked_compound', data=data, dtype=compound_vlen_dtype, chunks=(1,), compression="gzip")

    # Compound dataset arrays of vlen type
    compound_vlen_dtype = np.dtype([
        ('name', utf8, 2)
    ])
    pointData = np.zeros(2, dtype=utf8)
    pointData[0] = "James"
    pointData[1] = "Ellie"
    data = np.zeros(1, dtype=compound_vlen_dtype)
    data['name'] = np.array(pointData)

    f.create_dataset('array_vlen_contiguous_compound', data=data, dtype=compound_vlen_dtype)
    f.create_dataset('array_vlen_chunked_compound', data=data, dtype=compound_vlen_dtype, chunks=(1,), compression="gzip")

    # Nested compound datasets use 2 img numbers as an example
    nested_dt = np.dtype([
        ('firstNumber', imgdt),
        ('secondNumber', imgdt),
    ])

    data = np.zeros(3, dtype=nested_dt)
    data[1] = ((1,1), (1,1))
    data[2] = ((2,2), (2,2))
    f.create_dataset('nested_contiguous_compound', data=data, dtype=nested_dt)
    f.create_dataset('nested_chunked_compound', data=data, dtype=nested_dt, chunks=(2,), compression="gzip")

    f.flush()
    f.close()
Example #29
def save_households_to_hdf5(households: Households,
                            file_path: str,
                            chunk_size: int = 50000):
    """
    Saves the households object to the hdf5 file ``file_path``. Currently, for each household
    the following values are stored:
    - id, area, type, max_size, household_complacency

    Parameters
    ----------
    households
        households object
    file_path
        path of the saved hdf5 file
    chunk_size
        number of households to save at a time. Note that they have to be copied to be saved,
        so keep the number below 1e6.
    """
    n_households = len(households)
    n_chunks = int(np.ceil(n_households / chunk_size))
    int_vlen_type = h5py.vlen_dtype(np.dtype("int64"))
    str_vlen_type = h5py.vlen_dtype(np.dtype("S20"))
    with h5py.File(file_path, "a") as f:
        households_dset = f.create_group("households")
        for chunk in range(n_chunks):
            idx1 = chunk * chunk_size
            idx2 = min((chunk + 1) * chunk_size, n_households)
            ids = []
            areas = []
            types = []
            max_sizes = []
            household_complacencies = []
            for household in households[idx1:idx2]:
                ids.append(household.id)
                if household.area is None:
                    areas.append(nan_integer)
                else:
                    areas.append(household.area.id)
                if household.type is None:
                    types.append(" ".encode("ascii", "ignore"))
                else:
                    types.append(household.type.encode("ascii", "ignore"))
                max_sizes.append(household.max_size)
                household_complacencies.append(household.household_complacency)

            ids = np.array(ids, dtype=int)
            areas = np.array(areas, dtype=int)
            types = np.array(types, dtype="S15")
            max_sizes = np.array(max_sizes, dtype=float)
            household_complacencies = np.array(household_complacencies,
                                               dtype=float)
            if chunk == 0:
                households_dset.attrs["n_households"] = n_households
                households_dset.create_dataset("id",
                                               data=ids,
                                               maxshape=(None, ))
                households_dset.create_dataset("area",
                                               data=areas,
                                               maxshape=(None, ))
                households_dset.create_dataset("type",
                                               data=types,
                                               maxshape=(None, ))
                households_dset.create_dataset("max_size",
                                               data=max_sizes,
                                               maxshape=(None, ))
                households_dset.create_dataset("household_complacency",
                                               data=household_complacencies,
                                               maxshape=(None, ))

            else:
                newshape = (households_dset["id"].shape[0] + ids.shape[0], )
                households_dset["id"].resize(newshape)
                households_dset["id"][idx1:idx2] = ids
                households_dset["area"].resize(newshape)
                households_dset["area"][idx1:idx2] = areas
                households_dset["type"].resize(newshape)
                households_dset["type"][idx1:idx2] = types
                households_dset["max_size"].resize(newshape)
                households_dset["max_size"][idx1:idx2] = max_sizes
                households_dset["household_complacency"].resize(newshape)
                households_dset["household_complacency"][
                    idx1:idx2] = household_complacencies

        # I don't know how to chunk these...
        relatives_in_households = []
        relatives_in_care_homes = []
        social_venues_specs_list = []
        social_venues_ids_list = []
        for household in households:
            if (household.relatives_in_households is None
                    or len(household.relatives_in_households) == 0):
                relatives_in_households.append(
                    np.array([nan_integer], dtype=int))
            else:
                relatives_in_households.append(
                    np.array(
                        [
                            person.id
                            for person in household.relatives_in_households
                        ],
                        dtype=int,
                    ))
            if (household.relatives_in_care_homes is None
                    or len(household.relatives_in_care_homes) == 0):
                relatives_in_care_homes.append(
                    np.array([nan_integer], dtype=int))
            else:
                relatives_in_care_homes.append(
                    np.array(
                        [
                            person.id
                            for person in household.relatives_in_care_homes
                        ],
                        dtype=np.int,
                    ))
            social_venues_ids = []
            social_venues_specs = []
            for spec in household.social_venues.keys():
                for social_venue in household.social_venues[spec]:
                    social_venues_specs.append(spec.encode("ascii", "ignore"))
                    social_venues_ids.append(social_venue.id)
            social_venues_specs_list.append(
                np.array(social_venues_specs, dtype="S20"))
            social_venues_ids_list.append(
                np.array(social_venues_ids, dtype=np.int64))
        relatives_in_households = np.array(relatives_in_households,
                                           dtype=int_vlen_type)
        relatives_in_care_homes = np.array(relatives_in_care_homes,
                                           dtype=int_vlen_type)
        social_venues_specs_list = np.array(social_venues_specs_list,
                                            dtype=str_vlen_type)
        social_venues_ids_list = np.array(social_venues_ids_list,
                                          dtype=int_vlen_type)
        try:
            households_dset.create_dataset(
                "relatives_in_households",
                data=relatives_in_households,
            )
        except Exception:
            relatives_in_households = np.array(relatives_in_households,
                                               dtype=np.int64)
            households_dset.create_dataset(
                "relatives_in_households",
                data=relatives_in_households,
            )
        try:
            households_dset.create_dataset(
                "relatives_in_care_homes",
                data=relatives_in_care_homes,
            )
        except Exception:
            relatives_in_care_homes = np.array(relatives_in_care_homes,
                                               dtype=np.int64)
            households_dset.create_dataset(
                "relatives_in_care_homes",
                data=relatives_in_care_homes,
            )
        households_dset.create_dataset(
            "social_venues_specs",
            data=social_venues_specs_list,
        )
        households_dset.create_dataset(
            "social_venues_ids",
            data=social_venues_ids_list,
        )
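One likely reason for the try/except fallbacks above is that, when every household happens to have the same number of relatives, NumPy collapses the list of per-household arrays into a regular 2-D array that the variable-length dtype will not accept. Below is a minimal sketch of a write path that avoids this by building the object array element by element; the helper name, file name and dataset name are made up for illustration:

import h5py
import numpy as np

def write_ragged_int_dataset(group, name, rows):
    # Build a 1-D object array explicitly so it stays ragged even when
    # all rows happen to have the same length.
    data = np.empty(len(rows), dtype=object)
    for i, row in enumerate(rows):
        data[i] = np.asarray(row, dtype=np.int64)
    vlen_int = h5py.vlen_dtype(np.dtype("int64"))
    return group.create_dataset(name, data=data, dtype=vlen_int)

# hypothetical usage with a throwaway file
with h5py.File("households_sketch.hdf5", "w") as f:
    write_ragged_int_dataset(f, "relatives_in_households",
                             [[1, 2, 3], [4], [5, 6]])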
Ejemplo n.º 30
0
import time
import warnings

import h5py
import numpy as np

#: Most up-to-date raw larpix hdf5 format version.
latest_version = '0.0'

#: Description of the datasets and their dtypes used in each version of the raw larpix hdf5 format.
#:
#: Structured as ``dataset_dtypes['<version>']['<dataset>'] = <dtype>``.
dataset_dtypes = {
    '0.0': {
        'msgs': h5py.vlen_dtype(np.dtype('u1')),
        'msg_headers': np.dtype([('io_groups', 'u1')])
    }
}


def _store_msgs_v0_0(msgs, version):
    msg_dtype = np.dtype('u1')
    arr_dtype = dataset_dtypes[version]['msgs']
    return np.array([np.frombuffer(msg, dtype=msg_dtype) for msg in msgs],
                    dtype=arr_dtype)


def _store_msg_headers_v0_0(msg_headers, version):
    length = len(msg_headers['io_groups'])
    arr = np.zeros((length, ), dtype=dataset_dtypes[version]['msg_headers'])
    arr['io_groups'] = msg_headers['io_groups']
    return arr
Ejemplo n.º 31
0
def saveh5(filename, X, ORF, y):
    dt = h5py.vlen_dtype(np.dtype('int32'))
    with h5py.File(filename, 'w') as h5file:
        h5file.create_dataset('X', dtype=dt, data=X)
        h5file.create_dataset('ORF', dtype=dt, data=ORF)
        h5file.create_dataset('y', data=y)
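A minimal call of saveh5, assuming X and ORF are sequences of variable-length int32 arrays and y holds one label per sequence; all names and values below are placeholders for illustration:

import numpy as np

# hypothetical ragged inputs
X = [np.array([1, 2, 3], dtype=np.int32), np.array([4, 5], dtype=np.int32)]
ORF = [np.array([0, 1], dtype=np.int32), np.array([2], dtype=np.int32)]
y = np.array([0, 1], dtype=np.int32)

saveh5("sequences_sketch.h5", X, ORF, y)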
Ejemplo n.º 32
0
def write_vlen_datasets(f):
    # Unsigned int
    uint8_vlen_type = h5py.vlen_dtype(np.uint8)
    uint8_vlen_dataset = f.create_dataset("vlen_uint8_data", (3, ),
                                          dtype=uint8_vlen_type)
    uint8_vlen_dataset[0] = [0]
    uint8_vlen_dataset[1] = [1, 2]
    uint8_vlen_dataset[2] = [3, 4, 5]

    uint16_vlen_type = h5py.vlen_dtype(np.uint16)
    uint16_vlen_dataset = f.create_dataset("vlen_uint16_data", (3, ),
                                           dtype=uint16_vlen_type)
    uint16_vlen_dataset[0] = [0]
    uint16_vlen_dataset[1] = [1, 2]
    uint16_vlen_dataset[2] = [3, 4, 5]

    uint32_vlen_type = h5py.vlen_dtype(np.uint32)
    uint32_vlen_dataset = f.create_dataset("vlen_uint32_data", (3, ),
                                           dtype=uint32_vlen_type)
    uint32_vlen_dataset[0] = [0]
    uint32_vlen_dataset[1] = [1, 2]
    uint32_vlen_dataset[2] = [3, 4, 5]

    uint64_vlen_type = h5py.vlen_dtype(np.uint64)
    uint64_vlen_dataset = f.create_dataset("vlen_uint64_data", (3, ),
                                           dtype=uint64_vlen_type)
    uint64_vlen_dataset[0] = [0]
    uint64_vlen_dataset[1] = [1, 2]
    uint64_vlen_dataset[2] = [3, 4, 5]

    # Signed int
    int8_vlen_type = h5py.vlen_dtype(np.int8)
    int8_vlen_dataset = f.create_dataset("vlen_int8_data", (3, ),
                                         dtype=int8_vlen_type)
    int8_vlen_dataset[0] = [0]
    int8_vlen_dataset[1] = [1, 2]
    int8_vlen_dataset[2] = [3, 4, 5]

    int16_vlen_type = h5py.vlen_dtype(np.int16)
    int16_vlen_dataset = f.create_dataset("vlen_int16_data", (3, ),
                                          dtype=int16_vlen_type)
    int16_vlen_dataset[0] = [0]
    int16_vlen_dataset[1] = [1, 2]
    int16_vlen_dataset[2] = [3, 4, 5]

    int32_vlen_type = h5py.vlen_dtype(np.int32)
    int32_vlen_dataset = f.create_dataset("vlen_int32_data", (3, ),
                                          dtype=int32_vlen_type)
    int32_vlen_dataset[0] = [0]
    int32_vlen_dataset[1] = [1, 2]
    int32_vlen_dataset[2] = [3, 4, 5]

    int64_vlen_type = h5py.vlen_dtype(np.int64)
    int64_vlen_dataset = f.create_dataset("vlen_int64_data", (3, ),
                                          dtype=int64_vlen_type)
    int64_vlen_dataset[0] = [0]
    int64_vlen_dataset[1] = [1, 2]
    int64_vlen_dataset[2] = [3, 4, 5]

    # Floating point
    float32_vlen_type = h5py.vlen_dtype(np.float32)
    float32_vlen_dataset = f.create_dataset("vlen_float32_data", (3, ),
                                            dtype=float32_vlen_type)
    float32_vlen_dataset[0] = [0]
    float32_vlen_dataset[1] = [1, 2]
    float32_vlen_dataset[2] = [3, 4, 5]

    float64_vlen_type = h5py.vlen_dtype(np.float64)
    float64_vlen_dataset = f.create_dataset("vlen_float64_data", (3, ),
                                            dtype=float64_vlen_type)
    float64_vlen_dataset[0] = [0]
    float64_vlen_dataset[1] = [1, 2]
    float64_vlen_dataset[2] = [3, 4, 5]

    # https://github.com/jamesmudd/jhdf/issues/247
    int32_vlen_type = h5py.vlen_dtype(np.dtype(np.int32))
    int32_vlen_dataset = f.create_dataset('vlen_issue_247', (3, ),
                                          dtype=int32_vlen_type)
    int32_vlen_dataset[0] = [1, 2, 3]
    int32_vlen_dataset[1] = []
    int32_vlen_dataset[2] = [1, 2, 3, 4, 5]

    # Chunked
    # Unsigned int
    uint8_vlen_type = h5py.vlen_dtype(np.uint8)
    uint8_vlen_dataset_chunked = f.create_dataset("vlen_uint8_data_chunked",
                                                  (3, ),
                                                  dtype=uint8_vlen_type,
                                                  chunks=(3, ))
    uint8_vlen_dataset_chunked[0] = [0]
    uint8_vlen_dataset_chunked[1] = [1, 2]
    uint8_vlen_dataset_chunked[2] = [3, 4, 5]

    uint16_vlen_type_chunked = h5py.vlen_dtype(np.uint16)
    uint16_vlen_dataset = f.create_dataset("vlen_uint16_data_chunked", (3, ),
                                           dtype=uint16_vlen_type_chunked,
                                           chunks=(3, ))
    uint16_vlen_dataset[0] = [0]
    uint16_vlen_dataset[1] = [1, 2]
    uint16_vlen_dataset[2] = [3, 4, 5]

    uint32_vlen_type = h5py.vlen_dtype(np.uint32)
    uint32_vlen_dataset_chunked = f.create_dataset("vlen_uint32_data_chunked",
                                                   (3, ),
                                                   dtype=uint32_vlen_type,
                                                   chunks=(3, ))
    uint32_vlen_dataset_chunked[0] = [0]
    uint32_vlen_dataset_chunked[1] = [1, 2]
    uint32_vlen_dataset_chunked[2] = [3, 4, 5]

    uint64_vlen_type = h5py.vlen_dtype(np.uint64)
    uint64_vlen_dataset_chunked = f.create_dataset("vlen_uint64_data_chunked",
                                                   (3, ),
                                                   dtype=uint64_vlen_type,
                                                   chunks=(3, ))
    uint64_vlen_dataset_chunked[0] = [0]
    uint64_vlen_dataset_chunked[1] = [1, 2]
    uint64_vlen_dataset_chunked[2] = [3, 4, 5]

    # Signed int
    int8_vlen_type = h5py.vlen_dtype(np.int8)
    int8_vlen_dataset = f.create_dataset("vlen_int8_data_chunked", (3, ),
                                         dtype=int8_vlen_type,
                                         chunks=(3, ))
    int8_vlen_dataset[0] = [0]
    int8_vlen_dataset[1] = [1, 2]
    int8_vlen_dataset[2] = [3, 4, 5]

    int16_vlen_type_chunked = h5py.vlen_dtype(np.int16)
    int16_vlen_dataset = f.create_dataset("vlen_int16_data_chunked", (3, ),
                                          dtype=int16_vlen_type_chunked,
                                          chunks=(3, ))
    int16_vlen_dataset[0] = [0]
    int16_vlen_dataset[1] = [1, 2]
    int16_vlen_dataset[2] = [3, 4, 5]

    int32_vlen_type = h5py.vlen_dtype(np.int32)
    int32_vlen_dataset = f.create_dataset("vlen_int32_data_chunked", (3, ),
                                          dtype=int32_vlen_type,
                                          chunks=(3, ))
    int32_vlen_dataset[0] = [0]
    int32_vlen_dataset[1] = [1, 2]
    int32_vlen_dataset[2] = [3, 4, 5]

    int64_vlen_type = h5py.vlen_dtype(np.int64)
    int64_vlen_dataset = f.create_dataset("vlen_int64_data_chunked", (3, ),
                                          dtype=int64_vlen_type,
                                          chunks=(3, ))
    int64_vlen_dataset[0] = [0]
    int64_vlen_dataset[1] = [1, 2]
    int64_vlen_dataset[2] = [3, 4, 5]

    # Floating point
    float32_vlen_type = h5py.vlen_dtype(np.float32)
    float32_vlen_dataset_chunked = f.create_dataset(
        "vlen_float32_data_chunked", (3, ),
        dtype=float32_vlen_type,
        chunks=(3, ))
    float32_vlen_dataset_chunked[0] = [0]
    float32_vlen_dataset_chunked[1] = [1, 2]
    float32_vlen_dataset_chunked[2] = [3, 4, 5]

    float64_vlen_type = h5py.vlen_dtype(np.float64)
    float64_vlen_dataset_chunked = f.create_dataset(
        "vlen_float64_data_chunked", (3, ),
        dtype=float64_vlen_type,
        chunks=(3, ))
    float64_vlen_dataset_chunked[0] = [0]
    float64_vlen_dataset_chunked[1] = [1, 2]
    float64_vlen_dataset_chunked[2] = [3, 4, 5]

    # https://github.com/jamesmudd/jhdf/issues/247
    int32_vlen_type = h5py.vlen_dtype(np.dtype(np.int32))
    int32_vlen_dataset = f.create_dataset('vlen_issue_247_chunked', (3, ),
                                          dtype=int32_vlen_type,
                                          chunks=(3, ))
    int32_vlen_dataset[0] = [1, 2, 3]
    int32_vlen_dataset[1] = []
    int32_vlen_dataset[2] = [1, 2, 3, 4, 5]

    f.flush()
    f.close()
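A short companion sketch for reading one of the datasets written above back out and checking its variable-length base type; "vlen_test.hdf5" is a placeholder for whatever file was passed to write_vlen_datasets:

import h5py

with h5py.File("vlen_test.hdf5", "r") as f:
    ds = f["vlen_int32_data"]
    print(h5py.check_vlen_dtype(ds.dtype))  # base dtype: int32
    for row in ds[:]:                       # each row is a 1-D int32 array
        print(row)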
Ejemplo n.º 33
0
train_tokens = list(
    map(lambda x: np.array(x, dtype=np.dtype("int32")),
        encoder.transform(df_train["review"])))
train_scores = df_train["userscore"].to_numpy(np.dtype("int32"))

valid_tokens = list(
    map(lambda x: np.array(x, dtype=np.dtype("int32")),
        encoder.transform(df_valid["review"])))
valid_scores = df_valid["userscore"].to_numpy(np.dtype("int32"))

test_tokens = list(
    map(lambda x: np.array(x, dtype=np.dtype("int32")),
        encoder.transform(df_test["review"])))
test_scores = df_test["userscore"].to_numpy(np.dtype("int32"))

with h5py.File("../data/reviews/tokenized.h5", "w") as f:
    dt = h5py.vlen_dtype(np.dtype("int32"))
    f.create_group("data")
    f.create_group("data/train")
    f.create_dataset("data/train/tokens", data=train_tokens, dtype=dt)
    f.create_dataset("data/train/scores", data=train_scores)
    f.create_group("data/valid")
    f.create_dataset("data/valid/tokens", data=valid_tokens, dtype=dt)
    f.create_dataset("data/valid/scores", data=valid_scores)
    f.create_group("data/test")
    f.create_dataset("data/test/tokens", data=test_tokens, dtype=dt)
    f.create_dataset("data/test/scores", data=test_scores)

    dt = h5py.string_dtype(encoding='utf-8')
    f.create_group("metadata")
    f.create_dataset("metadata/encoder",
                     data=json.dumps(encoder.vocabs_to_dict()),
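Reading the ragged token arrays back is symmetric to the write above; a minimal sketch, assuming the file just written exists:

import json

import h5py

with h5py.File("../data/reviews/tokenized.h5", "r") as f:
    train_tokens = list(f["data/train/tokens"][:])  # list of 1-D int32 arrays
    train_scores = f["data/train/scores"][:]
    # h5py 3.x returns variable-length strings as bytes; json.loads accepts bytes
    encoder_dict = json.loads(f["metadata/encoder"][()])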
Ejemplo n.º 34
0
def convert_to_hdf5(base_directory, override):
    batch_size = 1000

    for labels_filepath in sorted(Path(base_directory).rglob('*labels.csv')):
        print(f'Processing {labels_filepath}...')
        basedir = os.path.dirname(labels_filepath)
        dataset_name = os.path.relpath(basedir, base_directory)
        dataset_path = f'{base_directory}/{dataset_name.replace("/", "_")}.hdf5'

        if os.path.isfile(dataset_path):
            if override:
                os.remove(dataset_path)
            else:
                print(f'Dataset already exists, skipping {dataset_name}... \n')
                continue

        dataset = h5py.File(dataset_path, 'a')
        with open(labels_filepath, newline='') as csv_file:
            csv_data = np.asarray(list(csv.reader(csv_file)))
        labels = csv_data[:, 1:5].astype(np.float64)
        image_paths = np.asarray(
            [f'{basedir}/{image_name}' for image_name in csv_data[:, 0]])
        mask_paths = np.asarray([
            f'{os.path.splitext(image_path)[0]}.pgm'
            for image_path in image_paths
        ])
        load_masks = np.all(
            [os.path.isfile(mask_path) for mask_path in mask_paths])

        with tqdm(total=image_paths.shape[0], file=sys.stdout,
                  unit=' Images') as progress:
            dataset.create_dataset(f'{dataset_name}/labels',
                                   data=labels,
                                   maxshape=labels.shape,
                                   dtype=np.float64)
            images_dataset = dataset.create_dataset(
                f'{dataset_name}/images', (image_paths.shape[0], ),
                dtype=h5py.vlen_dtype(np.uint8))
            masks_dataset = dataset.create_dataset(
                f'{dataset_name}/masks', (mask_paths.shape[0], ),
                dtype=h5py.vlen_dtype(np.uint8)) if load_masks else None
            if os.path.isfile(f'{basedir}/mapping.json'):
                dataset.create_dataset(f'{dataset_name}/mapping',
                                       data=json.dumps(json.loads(
                                           open(f'{basedir}/mapping.json',
                                                'r+').read()),
                                                       indent=4))

            for index in range(0, image_paths.shape[0], batch_size):
                if load_masks:
                    images, masks = load_synthetic_data(
                        image_paths[index:index + batch_size],
                        mask_paths[index:index + batch_size])
                    images_dataset[index:index + batch_size] = images
                    masks_dataset[index:index + batch_size] = masks
                else:
                    images_dataset[index:index + batch_size] = [
                        np.frombuffer(open(file, 'rb').read(), dtype=np.uint8)
                        for file in image_paths[index:index + batch_size]
                    ]
                progress.update(image_paths[index:index + batch_size].shape[0])

        dataset.flush()
        dataset.close()
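Since each image is stored as its raw encoded bytes (np.frombuffer over the file contents), restoring a file only requires writing those bytes back out. A minimal sketch with placeholder paths and dataset names:

import h5py

with h5py.File("some_dataset.hdf5", "r") as dataset:          # placeholder path
    first_image = dataset["some_dataset_name/images"][0]      # 1-D uint8 array
    with open("restored_image.jpg", "wb") as out_file:        # extension depends on the source image
        out_file.write(first_image.tobytes())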
Ejemplo n.º 35
0
def save_companies_to_hdf5(companies: Companies,
                           file_path: str,
                           chunk_size: int = 500000):
    """
    Saves the Population object to hdf5 format file ``file_path``. Currently for each person,
    the following values are stored:
    - id, super_area, sector, n_workers_max, 

    Parameters
    ----------
    companies 
        population object
    file_path
        path of the saved hdf5 file
    chunk_size
        number of people to save at a time. Note that they have to be copied to be saved,
        so keep the number below 1e6.
    """
    n_companies = len(companies)
    n_chunks = int(np.ceil(n_companies / chunk_size))
    vlen_type = h5py.vlen_dtype(np.dtype("float64"))
    with h5py.File(file_path, "a") as f:
        companies_dset = f.create_group("companies")
        first_company_idx = companies[0].id
        for chunk in range(n_chunks):
            idx1 = chunk * chunk_size
            idx2 = min((chunk + 1) * chunk_size, n_companies)
            ids = []
            super_areas = []
            sectors = []
            n_workers_max = []
            company_idx = [company.id for company in companies[idx1:idx2]]
            # sort companies by id
            companies_sorted = [
                companies[i - first_company_idx] for i in np.sort(company_idx)
            ]
            for company in companies_sorted:
                ids.append(company.id)
                if company.super_area is None:
                    super_areas.append(nan_integer)
                else:
                    super_areas.append(company.super_area.id)
                sectors.append(company.sector.encode("ascii", "ignore"))
                n_workers_max.append(company.n_workers_max)

            ids = np.array(ids, dtype=np.int64)
            super_areas = np.array(super_areas, dtype=np.int64)
            sectors = np.array(sectors, dtype="S10")
            n_workers_max = np.array(n_workers_max, dtype=np.float64)
            if chunk == 0:
                companies_dset.attrs["n_companies"] = n_companies
                companies_dset.create_dataset("id",
                                              data=ids,
                                              maxshape=(None, ))
                companies_dset.create_dataset("super_area",
                                              data=super_areas,
                                              maxshape=(None, ))
                companies_dset.create_dataset("sector",
                                              data=sectors,
                                              maxshape=(None, ))
                companies_dset.create_dataset("n_workers_max",
                                              data=n_workers_max,
                                              maxshape=(None, ))
            else:
                newshape = (companies_dset["id"].shape[0] + ids.shape[0], )
                companies_dset["id"].resize(newshape)
                companies_dset["id"][idx1:idx2] = ids
                companies_dset["super_area"].resize(newshape)
                companies_dset["super_area"][idx1:idx2] = super_areas
                companies_dset["sector"].resize(newshape)
                companies_dset["sector"][idx1:idx2] = sectors
                companies_dset["n_workers_max"].resize(newshape)
                companies_dset["n_workers_max"][idx1:idx2] = n_workers_max