def make_data(kind):
    global data
    global dt
    if kind is bytes:
        s = b"xx"
    else:
        s = b"xx".decode('utf8')
    dt = h5py.vlen_dtype(kind)
    data = np.array([s * 100 for _ in range(1000)])
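# A minimal round-trip sketch (not part of the original snippet) showing how the
# `data` and `dt` globals produced by make_data might be written out and read back.
# The file name "vlen_bench.h5" and dataset name "strings" are assumptions.
import h5py
import numpy as np

def roundtrip_make_data(kind=bytes):
    make_data(kind)
    with h5py.File("vlen_bench.h5", "w") as f:
        f.create_dataset("strings", data=data, dtype=dt)
    with h5py.File("vlen_bench.h5", "r") as f:
        # Elements come back as bytes or str depending on `kind`
        return f["strings"][0]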
def test_vlen_enum(self):
    fname = self.mktemp()
    arr1 = [[1], [1, 2]]
    dt1 = h5py.vlen_dtype(h5py.enum_dtype(dict(foo=1, bar=2), 'i'))

    with h5py.File(fname, 'w') as f:
        df1 = f.create_dataset('test', (len(arr1),), dtype=dt1)
        df1[:] = np.array(arr1, dtype=object)

    with h5py.File(fname, 'r') as f:
        df2 = f['test']
        dt2 = df2.dtype
        arr2 = [e.tolist() for e in df2[:]]

    self.assertEqual(arr1, arr2)
    self.assertEqual(h5py.check_enum_dtype(h5py.check_vlen_dtype(dt1)),
                     h5py.check_enum_dtype(h5py.check_vlen_dtype(dt2)))
def test_convert(self):
    dt = h5py.vlen_dtype(int)
    ds = self.f.create_dataset('vlen', (3,), dtype=dt)
    ds[0] = np.array([1.4, 1.2])
    ds[1] = np.array([1.2])
    ds[2] = [1.2, 2, 3]
    self.assertArrayEqual(ds[0], np.array([1, 1]))
    self.assertArrayEqual(ds[1], np.array([1]))
    self.assertArrayEqual(ds[2], np.array([1, 2, 3]))
    ds[0:2] = np.array([[0.1, 1.1, 2.1, 3.1, 4], np.arange(4)], dtype=object)
    self.assertArrayEqual(ds[0], np.arange(5))
    self.assertArrayEqual(ds[1], np.arange(4))
    ds[0:2] = np.array([np.array([0.1, 1.2, 2.2]),
                        np.array([0.2, 1.2, 2.2])])
    self.assertArrayEqual(ds[0], np.arange(3))
    self.assertArrayEqual(ds[1], np.arange(3))
def test_int(self):
    dt = h5py.vlen_dtype(int)
    ds = self.f.create_dataset('vlen', (4,), dtype=dt)
    ds[0] = np.arange(3)
    ds[1] = np.arange(0)
    ds[2] = [1, 2, 3]
    ds[3] = np.arange(1)
    self.assertArrayEqual(ds[0], np.arange(3))
    self.assertArrayEqual(ds[1], np.arange(0))
    self.assertArrayEqual(ds[2], np.array([1, 2, 3]))
    self.assertArrayEqual(ds[1], np.arange(0))
    ds[0:2] = np.array([np.arange(5), np.arange(4)], dtype=object)
    self.assertArrayEqual(ds[0], np.arange(5))
    self.assertArrayEqual(ds[1], np.arange(4))
    ds[0:2] = np.array([np.arange(3), np.arange(3)])
    self.assertArrayEqual(ds[0], np.arange(3))
    self.assertArrayEqual(ds[1], np.arange(3))
def test_compound_vlen_bool(self):
    vidt = h5py.vlen_dtype(np.uint8)

    def a(items):
        return np.array(items, dtype=np.uint8)

    f = self.f

    dt_vb = np.dtype([
        ('foo', vidt),
        ('logical', bool)])
    vb = f.create_dataset('dt_vb', shape=(4,), dtype=dt_vb)
    data = np.array([(a([1, 2, 3]), True),
                     (a([1]), False),
                     (a([1, 5]), True),
                     (a([]), False)], dtype=dt_vb)
    vb[:] = data
    actual = f['dt_vb'][:]
    self.assertVlenArrayEqual(data['foo'], actual['foo'])
    self.assertArrayEqual(data['logical'], actual['logical'])

    dt_vv = np.dtype([
        ('foo', vidt),
        ('bar', vidt)])
    f.create_dataset('dt_vv', shape=(4,), dtype=dt_vv)

    dt_vvb = np.dtype([
        ('foo', vidt),
        ('bar', vidt),
        ('logical', bool)])
    vvb = f.create_dataset('dt_vvb', shape=(2,), dtype=dt_vvb)

    dt_bvv = np.dtype([
        ('logical', bool),
        ('foo', vidt),
        ('bar', vidt)])
    bvv = f.create_dataset('dt_bvv', shape=(2,), dtype=dt_bvv)
    data = np.array([(True, a([1, 2, 3]), a([1, 2])),
                     (False, a([]), a([2, 4, 6]))], dtype=dt_bvv)
    bvv[:] = data
    actual = bvv[:]
    self.assertVlenArrayEqual(data['foo'], actual['foo'])
    self.assertVlenArrayEqual(data['bar'], actual['bar'])
    self.assertArrayEqual(data['logical'], actual['logical'])
def add_brep_from_string(self, name, shape_data):
    """
    Add a brep contained in a string.
    """
    if name not in self._ref:
        shape = self._ref.create_dataset(name, (1,),
                                         dtype=h5py.vlen_dtype(str))
        if type(shape_data) == str:
            # raw str
            shape[:] = shape_data
        else:
            # __getstate__ as with pythonocc
            shape[:] = shape_data[0]
            shape.attrs['occ_indx'] = shape_data[1]

        shape.attrs['id'] = self._number_of_shapes
        shape.attrs['type'] = 'brep'
        self._number_of_shapes += 1
def export_set(output_dir, name, data, labels, classes):
    """Store paired data and labels in a new HDF5 file under ``output_dir``."""
    assert len(data) == len(labels)

    # Variable-length datatypes for encoded png streams and label names
    dt_int = h5py.vlen_dtype(np.dtype('uint8'))
    dt_str = h5py.string_dtype(encoding='utf-8')

    # Initialize hdf5 file pointer
    f = h5py.File(f"{output_dir}/{name}_{len(data)}.h5", "w")

    # Create group and store data/labels
    x = f.create_dataset("data", (len(data),), dtype=dt_int, data=data)
    y = f.create_dataset("label", data=np.array(labels, dtype=int))

    # Store <mapping from (0, 1 ...) to class names> as group attribute
    y.attrs.create("class_names", data=np.array(classes, dtype=dt_str))

    f.close()
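# A hedged companion sketch (not from the original source): reading back a set
# written by export_set and decoding one PNG stream. The file name below and the
# use of OpenCV for decoding are illustrative assumptions.
import cv2  # assumption: OpenCV is available for PNG decoding
import h5py
import numpy as np

with h5py.File("out/train_1000.h5", "r") as f:
    png_bytes = f["data"][0]                      # variable-length uint8 stream
    label = int(f["label"][0])
    class_names = list(f["label"].attrs["class_names"])
    image = cv2.imdecode(np.asarray(png_bytes, dtype=np.uint8), cv2.IMREAD_COLOR)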
def write_contact_map_h5(h5_file, rows, cols):

    # Helper function to create ragged array
    def ragged(data):
        a = np.empty(len(data), dtype=object)
        a[...] = data
        return a

    # Specify variable length arrays
    dt = h5py.vlen_dtype(np.dtype("int16"))

    # list of np arrays of shape (2 * X) where X varies
    data = ragged([np.concatenate(row_col) for row_col in zip(rows, cols)])
    h5_file.create_dataset(
        "contact_map",
        data=data,
        dtype=dt,
        fletcher32=True,
        chunks=(1,) + data.shape[1:],
    )
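# A minimal reading sketch to mirror write_contact_map_h5 (an assumption, not part
# of the original code): each stored entry is the concatenation [rows_i, cols_i] of
# equal-length halves, so it can be split down the middle on the way back out.
import h5py
import numpy as np

def read_contact_map_h5(path):
    with h5py.File(path, "r") as f:
        rows, cols = [], []
        for entry in f["contact_map"][:]:
            half = len(entry) // 2
            rows.append(np.asarray(entry[:half], dtype=np.int16))
            cols.append(np.asarray(entry[half:], dtype=np.int16))
    return rows, cols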
def test_compound_vlen_enum(self):
    eidt = h5py.enum_dtype({'OFF': 0, 'ON': 1}, basetype=np.uint8)
    vidt = h5py.vlen_dtype(np.uint8)

    def a(items):
        return np.array(items, dtype=np.uint8)

    f = self.f

    dt_vve = np.dtype([
        ('foo', vidt),
        ('bar', vidt),
        ('switch', eidt)])
    vve = f.create_dataset('dt_vve', shape=(2,), dtype=dt_vve)
    data = np.array([(a([1, 2, 3]), a([1, 2]), 1),
                     (a([]), a([2, 4, 6]), 0)], dtype=dt_vve)
    vve[:] = data
    actual = vve[:]
    self.assertVlenArrayEqual(data['foo'], actual['foo'])
    self.assertVlenArrayEqual(data['bar'], actual['bar'])
    self.assertArrayEqual(data['switch'], actual['switch'])
def to_hdf5(self, parent: h5py.Group) -> None:
    """Stores this instance in an HDF5 group inside of the provided parent group.

    See also :func:`~qiskit_nature.hdf5.HDF5Storable.to_hdf5` for more details.

    Args:
        parent: the parent HDF5 group.
    """
    group = parent.require_group(self.name)
    group.attrs["__class__"] = self.__class__.__name__
    group.attrs["__module__"] = self.__class__.__module__
    group.attrs["__version__"] = self.VERSION

    group.attrs["num_body_terms"] = self._num_body_terms

    dtype = h5py.vlen_dtype(np.dtype("int32"))

    integrals_dset = group.create_dataset("integrals", (len(self.integrals),), dtype=dtype)
    coeffs_dset = group.create_dataset("coefficients", (len(self.integrals),), dtype=float)

    for idx, ints in enumerate(self.integrals):
        coeffs_dset[idx] = ints[0]
        integrals_dset[idx] = list(ints[1])
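# A hedged sketch (not the library's actual from_hdf5) of how the two datasets
# written above could be read back into (coefficient, index-tuple) pairs; `group`
# is assumed to be the same HDF5 group that to_hdf5 populated.
def read_integrals(group):
    coeffs = group["coefficients"][:]
    indices = group["integrals"][:]
    return [(float(c), tuple(int(i) for i in idx)) for c, idx in zip(coeffs, indices)]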
def __init__(self, root_group: h5py.Group, total_length: int, feature_dim: int):
    self.root_group = root_group

    LOG.debug(f"Creating dataset '{_FEATURE_GROUP}'")
    self._feature_dataset = self.root_group.create_dataset(
        _FEATURE_GROUP,
        (total_length,),
        dtype=h5py.vlen_dtype(np.dtype("float32")),
    )
    self._feature_dataset.attrs["feature_dim"] = feature_dim

    LOG.debug(f"Creating group '{_LABEL_GROUP}'")
    self._label_group = self.root_group.create_group(_LABEL_GROUP)
    self._label_datasets = dict()

    LOG.debug(f"Creating dataset '{_UID_GROUP}'")
    self._uid_dataset = self.root_group.create_dataset(
        _UID_GROUP, (total_length,), dtype=string_dtype)

    self.total_length = total_length
    self.feature_dim = feature_dim
    self._current_index = 0
def run(self) -> None:
    if self.cache_exists() and not self.force_update:
        self.logger.info(
            "Cached version of tokenized data already exists. " +
            "Skipping tokenization.")
        return None

    with h5py.File(self.hdf5_path, "a") as hdf5_store:
        for hdf5_group_name in self.raw_data_group_names.values():
            hdf5_group = hdf5_store.get(hdf5_group_name)
            captions = numpy.array(hdf5_group["caption_cleaned"])
            captions_tokenized = []
            captions_tokenized_id = []
            for caption in tqdm(captions):
                caption_tokenized = (
                    self.tokenizer.encode_with_bos_eos(caption))
                caption_tokenized_id = (
                    self.tokenizer.encode_ids_with_bos_eos(caption))
                captions_tokenized.append(caption_tokenized)
                captions_tokenized_id.append(caption_tokenized_id)

            if "caption_cleaned_tokenized" in hdf5_group.keys():
                del hdf5_group["caption_cleaned_tokenized"]
            if "caption_cleaned_tokenized_id" in hdf5_group.keys():
                del hdf5_group["caption_cleaned_tokenized_id"]

            hdf5_group.create_dataset(
                "caption_cleaned_tokenized",
                data=numpy.array(
                    captions_tokenized,
                    dtype=h5py.string_dtype(encoding="utf-8")))
            token_id_dataset = hdf5_group.create_dataset(
                "caption_cleaned_tokenized_id",
                shape=(len(captions_tokenized_id),),
                dtype=h5py.vlen_dtype(numpy.dtype("int32")))
            token_id_dataset[...] = captions_tokenized_id
def test_compound_vlen(self):
    vidt = h5py.vlen_dtype(np.uint8)
    eidt = h5py.enum_dtype({'OFF': 0, 'ON': 1}, basetype=np.uint8)

    for np_align in (False, True):
        dt = np.dtype([
            ('a', eidt),
            ('foo', vidt),
            ('bar', vidt),
            ('switch', eidt)], align=np_align)
        np_offsets = [dt.fields[i][1] for i in dt.names]

        for logical in (False, True):
            if logical and np_align:
                # Vlen types have different size in the numpy struct
                self.assertRaises(TypeError, h5py.h5t.py_create, dt,
                                  logical=logical)
            else:
                ht = h5py.h5t.py_create(dt, logical=logical)
                offsets = [ht.get_member_offset(i)
                           for i in range(ht.get_nmembers())]
                if np_align:
                    self.assertEqual(np_offsets, offsets)
def _extract_features_mp(timestamps: Sequence[Sequence[float]],
                         sizes: Sequence[Sequence[float]],
                         max_size: int = DEFAULT_NUM_FEATURES,
                         n_jobs: Optional[int] = None) -> np.ndarray:
    features = np.zeros((len(sizes), max_size), float)

    # Serialise the timestamps and sizes to file
    with tempfile.TemporaryDirectory(prefix="kfp-extract-") as directory:
        with h5py.File(f"{directory}/data.hdf", mode="w") as h5file:
            dtype = h5py.vlen_dtype(np.dtype("float"))
            h5file.create_dataset("sizes", data=sizes, dtype=dtype)
            h5file.create_dataset("timestamps", data=timestamps, dtype=dtype)

        offset = 0
        # Use our own splits as imap chunking would yield them one at a time
        chunksize = 5000
        n_chunks = max(len(sizes) // chunksize, 1)
        splits = np.array_split(np.arange(len(sizes)), n_chunks)
        assert n_chunks == len(splits)

        _LOGGER.info("Extracting features in %d batches...", n_chunks)
        with multiprocessing.Pool(n_jobs) as pool:
            # Pass the directory and indices to the background processes
            for i, batch in enumerate(
                    pool.imap(functools.partial(_run_extraction,
                                                directory=directory,
                                                max_size=max_size),
                              splits, chunksize=1)):
                # Recombine the extracted batches in order
                features[offset:offset + len(batch), :] = batch
                offset += len(batch)

                _LOGGER.info("Extraction is %.2f%% complete.",
                             ((i + 1) * 100 / n_chunks))
    return features
def save_commute_hubs_to_hdf5(commute_hubs: CommuteHubs, file_path: str):
    n_hubs = len(commute_hubs)
    dt = h5py.vlen_dtype(np.dtype("int32"))
    with h5py.File(file_path, "a") as f:
        commute_hubs_dset = f.create_group("commute_hubs")
        ids = []
        cities = []
        commute_units_list = []
        for hub in commute_hubs:
            ids.append(hub.id)
            cities.append(hub.city)
            commute_units = []
            for commute_unit in hub.commuteunits:
                commute_units.append(commute_unit.id)
            commute_units_list.append(np.array(commute_units, dtype=np.int64))
        ids = np.array(ids, dtype=np.int64)
        cities = np.array(cities, dtype="S20")
        commute_units_list = np.array(commute_units_list, dtype=dt)
        commute_hubs_dset.attrs["n_commute_hubs"] = n_hubs
        commute_hubs_dset.create_dataset("id", data=ids)
        commute_hubs_dset.create_dataset("city_names", data=cities)
        commute_hubs_dset.create_dataset("commute_units", data=commute_units_list)
def test_reuse_from_other(self):
    dt = h5py.vlen_dtype(int)
    ds = self.f.create_dataset('vlen', (1,), dtype=dt)
    self.f.create_dataset('vlen2', (1,), ds[()].dtype)
import h5py
import numpy as np

from june.groups import Schools, School
from june.world import World

from .utils import read_dataset

nan_integer = -999
int_vlen_type = h5py.vlen_dtype(np.dtype("int64"))


def save_schools_to_hdf5(schools: Schools, file_path: str, chunk_size: int = 50000):
    """
    Saves the schools object to hdf5 format file ``file_path``. Currently for each school,
    the following values are stored:
    - id, n_pupils_max, age_min, age_max, sector, coordinates

    Parameters
    ----------
    schools
        schools object
    file_path
        path of the saved hdf5 file
    chunk_size
        number of schools to save at a time. Note that they have to be copied to be saved,
        so keep the number below 1e6.
    """
    n_schools = len(schools)
def save_hospitals_to_hdf5(hospitals: Hospitals, file_path: str, chunk_size: int = 50000):
    """
    Saves the Hospitals object to hdf5 format file ``file_path``. Currently for each hospital,
    the following values are stored:
    - id, n_beds, n_icu_beds, super_area, coordinates

    Parameters
    ----------
    hospitals
        hospitals object
    file_path
        path of the saved hdf5 file
    chunk_size
        number of hospitals to save at a time. Note that they have to be copied to be saved,
        so keep the number below 1e6.
    """
    n_hospitals = len(hospitals)
    n_chunks = int(np.ceil(n_hospitals / chunk_size))
    vlen_type = h5py.vlen_dtype(np.dtype("float64"))
    with h5py.File(file_path, "a") as f:
        hospitals_dset = f.create_group("hospitals")
        for chunk in range(n_chunks):
            idx1 = chunk * chunk_size
            idx2 = min((chunk + 1) * chunk_size, n_hospitals)
            ids = []
            n_beds = []
            n_icu_beds = []
            super_areas = []
            coordinates = []
            trust_code = []
            for hospital in hospitals[idx1:idx2]:
                ids.append(hospital.id)
                if hospital.super_area is None:
                    super_areas.append(nan_integer)
                else:
                    super_areas.append(hospital.super_area)
                n_beds.append(hospital.n_beds)
                n_icu_beds.append(hospital.n_icu_beds)
                coordinates.append(np.array(hospital.coordinates))
                trust_code.append(hospital.trust_code)
            ids = np.array(ids, dtype=np.int64)
            super_areas = np.array(super_areas, dtype="S20")
            trust_code = np.array(trust_code, dtype="S10")
            n_beds = np.array(n_beds, dtype=np.int64)
            n_icu_beds = np.array(n_icu_beds, dtype=np.int64)
            coordinates = np.array(coordinates, dtype=np.float64)
            if chunk == 0:
                hospitals_dset.attrs["n_hospitals"] = n_hospitals
                hospitals_dset.create_dataset("id", data=ids, maxshape=(None,))
                hospitals_dset.create_dataset("super_area", data=super_areas, maxshape=(None,))
                hospitals_dset.create_dataset("trust_code", data=trust_code, maxshape=(None,))
                hospitals_dset.create_dataset("n_beds", data=n_beds, maxshape=(None,))
                hospitals_dset.create_dataset("n_icu_beds", data=n_icu_beds, maxshape=(None,))
                hospitals_dset.create_dataset(
                    "coordinates", data=coordinates, maxshape=(None, coordinates.shape[1]))
            else:
                newshape = (hospitals_dset["id"].shape[0] + ids.shape[0],)
                hospitals_dset["id"].resize(newshape)
                hospitals_dset["id"][idx1:idx2] = ids
                hospitals_dset["super_area"].resize(newshape)
                hospitals_dset["super_area"][idx1:idx2] = super_areas
                hospitals_dset["trust_code"].resize(newshape)
                hospitals_dset["trust_code"][idx1:idx2] = trust_code
                hospitals_dset["n_beds"].resize(newshape)
                hospitals_dset["n_beds"][idx1:idx2] = n_beds
                hospitals_dset["n_icu_beds"].resize(newshape)
                hospitals_dset["n_icu_beds"][idx1:idx2] = n_icu_beds
                hospitals_dset["coordinates"].resize(newshape[0], axis=0)
                hospitals_dset["coordinates"][idx1:idx2] = coordinates
import h5py
import numpy as np
from collections import defaultdict

from june.groups import ExternalGroup, ExternalSubgroup
from june.geography import Geography, Area, SuperArea, Areas, SuperAreas, Region, Regions
from june.world import World

from .utils import read_dataset

nan_integer = -999

int_vlen_type = h5py.vlen_dtype(np.dtype("int64"))
str_vlen_type = h5py.vlen_dtype(np.dtype("S40"))

social_venues_spec_mapper = {
    "pubs": "pubs",
    "household_visits": "households",
    "care_home_visits": "care_homes",
    "cinemas": "cinemas",
    "groceries": "groceries",
}

super_group_to_group_mapper = {
    "pubs": "pub",
    "groceries": "grocery",
    "cinemas": "cinema",
}


def save_geography_to_hdf5(geography: Geography, file_path: str):
    """
    Saves the geography object to hdf5 format file ``file_path``. Currently for each person,
def write(self, ds: Dataset): import h5py # For data checksums dataset_kwargs = {"chunks": True, "fletcher32": True} n_records = len(ds.data.records) default_shape = (n_records, ) if h5py.__version__ >= distutils.version.StrictVersion("2.10.0"): vlen_double_t = h5py.vlen_dtype(np.dtype("float64")) utf8_t = h5py.string_dtype(encoding="utf-8") bytes_t = h5py.vlen_dtype(np.dtype("uint8")) vlen_utf8_t = h5py.vlen_dtype(utf8_t) else: vlen_double_t = h5py.special_dtype(vlen=np.dtype("float64")) utf8_t = h5py.special_dtype(vlen=str) bytes_t = h5py.special_dtype(vlen=np.dtype("uint8")) vlen_utf8_t = h5py.special_dtype(vlen=utf8_t) driver_dataspec = { "energy": { "dtype": np.dtype("float64"), "shape": default_shape }, "gradient": { "dtype": vlen_double_t, "shape": default_shape }, "hessian": { "dtype": vlen_double_t, "shape": default_shape }, "dipole": { "dtype": np.dtype("float64"), "shape": (n_records, 3) } } def _write_dataset(dataset, column, entry_dset): assert column.shape[1] == 1 for i, name in enumerate(entry_dset): element = column.loc[name][0] if not h5py.check_dtype(vlen=dataset.dtype): dataset[i] = element # Variable length datatypes require flattening of the array and special handling of missing values else: try: dataset[i] = element.ravel() except AttributeError: if np.isnan(element): pass else: raise with self._write_file() as f: # Collection attributes for field in { "name", "collection", "provenance", "tagline", "tags", "id", "history_keys" }: f.attrs[field] = self._serialize_field(getattr(ds.data, field)) if ds.client is not None: f.attrs["server_information"] = self._serialize_field( ds.client.server_information()) f.attrs["server_address"] = self._serialize_field( ds.client.address) # Export molecules molecule_group = f.create_group("molecule") if "stoichiometry" in ds.data.history_keys: molecules = ds.get_molecules(stoich=list(ds.valid_stoich), force=True) else: molecules = ds.get_molecules(force=True) mol_shape = (len(molecules), ) mol_geometry = molecule_group.create_dataset("geometry", shape=mol_shape, dtype=vlen_double_t, **dataset_kwargs) mol_symbols = molecule_group.create_dataset("symbols", shape=mol_shape, dtype=vlen_utf8_t, **dataset_kwargs) mol_schema = molecule_group.create_dataset("schema", shape=mol_shape, dtype=bytes_t, **dataset_kwargs) mol_charge = molecule_group.create_dataset( "charge", shape=mol_shape, dtype=np.dtype('float64'), **dataset_kwargs) mol_spin = molecule_group.create_dataset("multiplicity", shape=mol_shape, dtype=np.dtype('int32'), **dataset_kwargs) mol_id_server_view = {} for i, mol_row in enumerate(molecules.to_dict("records")): molecule = mol_row["molecule"] mol_geometry[i] = molecule.geometry.ravel() mol_schema[i] = self._serialize_data(molecule) mol_symbols[i] = molecule.symbols mol_charge[i] = molecule.molecular_charge mol_spin[i] = molecule.molecular_multiplicity mol_id_server_view[molecule.id] = i # Export entries entry_group = f.create_group("entry") entry_dset = entry_group.create_dataset("entry", shape=default_shape, dtype=utf8_t, **dataset_kwargs) entry_dset[:] = ds.get_index() entries = ds.get_entries(force=True) if isinstance(ds.data.records[0], MoleculeEntry): entry_group.attrs["model"] = "MoleculeEntry" entries["hdf5_molecule_id"] = entries["molecule_id"].map( mol_id_server_view) entry_group.create_dataset("name", data=entries["name"], dtype=utf8_t, **dataset_kwargs) entry_group.create_dataset("molecule_id", data=entries["hdf5_molecule_id"], dtype=np.dtype("int64"), **dataset_kwargs) elif isinstance(ds.data.records[0], 
ReactionEntry): entry_group.attrs["model"] = "ReactionEntry" entries["hdf5_molecule_id"] = entries["molecule"].map( mol_id_server_view) entry_group.create_dataset("name", data=entries["name"], dtype=utf8_t, **dataset_kwargs) entry_group.create_dataset("stoichiometry", data=entries["stoichiometry"], dtype=utf8_t, **dataset_kwargs) entry_group.create_dataset("molecule", data=entries["hdf5_molecule_id"], dtype=np.dtype("int64"), **dataset_kwargs) entry_group.create_dataset("coefficient", data=entries["coefficient"], dtype=np.dtype("float64"), **dataset_kwargs) else: raise ValueError( f"Unknown entry class ({type(ds.data.records[0])}) while writing HDF5 entries." ) # Export native data columns value_group = f.create_group("value") history = ds.list_values( native=True, force=True).reset_index().to_dict("records") for specification in history: gv_spec = specification.copy() name = gv_spec.pop("name") if "stoichiometry" in gv_spec: gv_spec["stoich"] = gv_spec.pop("stoichiometry") dataset_name = self._normalize_hdf5_name(name) df = ds.get_values(**gv_spec, force=True) assert df.shape[1] == 1 driver = specification["driver"] dataspec = driver_dataspec[driver] dataset = value_group.create_dataset(dataset_name, **dataspec, **dataset_kwargs) for key in specification: dataset.attrs[key] = self._serialize_field( specification[key]) dataset.attrs["units"] = self._serialize_field(ds.units) _write_dataset(dataset, df, entry_dset) # Export contributed data columns contributed_group = f.create_group("contributed_value") for cv_name in ds.list_values(force=True, native=False)["name"]: cv_df = ds.get_values(name=cv_name, force=True, native=False) cv_model = ds.data.contributed_values[cv_name.lower()] try: dataspec = driver_dataspec[ cv_model.theory_level_details["driver"]] except (KeyError, TypeError): warnings.warn( f"Contributed values column {cv_name} does not provide driver in theory_level_details. " f"Assuming default driver for the dataset ({ds.data.default_driver})." ) dataspec = driver_dataspec[ds.data.default_driver] dataset = contributed_group.create_dataset( self._normalize_hdf5_name(cv_name), **dataspec, **dataset_kwargs) for field in { "name", "theory_level", "units", "doi", "comments", "theory_level", "theory_level_details" }: dataset.attrs[field] = self._serialize_field( getattr(cv_model, field)) _write_dataset(dataset, cv_df, entry_dset) # Clean up any caches self._entries = None
def log_population( self, population: Population, chunk_size: int = 100000, ): """ Saves the Population object to hdf5 format file ``self.save_path``. Currently for each person, the following values are stored: - id, age, sex, super_area Parameters ---------- population: population object chunk_size: number of people to save at a time. Note that they have to be copied to be saved, so keep the number below 1e6. """ n_people = len(population.people) dt = h5py.vlen_dtype(np.dtype("int32")) # dt = tuple n_chunks = int(np.ceil(n_people / chunk_size)) with h5py.File(self.file_path, "a", libver="latest") as f: people_dset = f.create_group("population") people_dset.attrs["n_people"] = n_people for chunk in range(n_chunks): idx1 = chunk * chunk_size idx2 = min((chunk + 1) * chunk_size, n_people) ids = [] ages = [] sexes = [] ethnicities = [] socioeconomic_indcs = [] super_areas = [] for person in population.people[idx1:idx2]: ids.append(person.id) ages.append(person.age) ethnicities.append( person.ethnicity.encode("ascii", "ignore")) socioeconomic_indcs.append(person.socioecon_index) sexes.append(person.sex.encode("ascii", "ignore")) super_areas.append(person.area.super_area.name) ids = np.array(ids, dtype=np.int) ages = np.array(ages, dtype=np.int16) sexes = np.array(sexes, dtype="S10") super_areas = np.array(super_areas, dtype="S10") ethnicities = np.array(ethnicities, dtype="S10") socioeconomic_indcs = np.array(socioeconomic_indcs, dtype=np.int8) if chunk == 0: people_dset.create_dataset("id", data=ids, maxshape=(None, ), compression="gzip") people_dset.create_dataset("age", data=ages, maxshape=(None, ), compression="gzip") people_dset.create_dataset("sex", data=sexes, maxshape=(None, ), compression="gzip") people_dset.create_dataset( "ethnicity", data=ethnicities, maxshape=(None, ), compression="gzip", ) people_dset.create_dataset( "socioeconomic_index", data=socioeconomic_indcs, maxshape=(None, ), compression="gzip", ) people_dset.create_dataset( "super_area", data=super_areas, maxshape=(None, ), compression="gzip", ) else: newshape = (people_dset["id"].shape[0] + ids.shape[0], ) people_dset["id"].resize(newshape) people_dset["id"][idx1:idx2] = ids people_dset["age"].resize(newshape) people_dset["age"][idx1:idx2] = ages people_dset["sex"].resize(newshape) people_dset["sex"][idx1:idx2] = sexes people_dset["super_area"].resize(newshape) people_dset["super_area"][idx1:idx2] = super_areas people_dset["ethnicity"].resize(newshape) people_dset["ethnicity"][idx1:idx2] = ethnicities people_dset["socioeconomic_index"].resize(newshape) people_dset["socioeconomic_index"][ idx1:idx2] = socioeconomic_indcs
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 22 10:35:01 2019

@author: Vedran Furtula
"""

import h5py, random
import numpy

run_test = 1

if run_test == 0:
    dt_ = h5py.vlen_dtype(numpy.dtype('float32'))
    with h5py.File('resize_dataset.hdf5', 'w') as f:
        d1 = f.create_dataset('dataset1', (0,), maxshape=(None,), dtype=dt_)
        d2 = f.create_dataset('dataset2', (0,), maxshape=(None,))
        #d1[:10] = np.random.randn(10)
        #d2[:5] = np.random.randn(5)
        #d.resize((200,))
        #d[100:200] = np.random.randn(100)

    with h5py.File('resize_dataset.hdf5', 'r') as f:
        dset = f['dataset1']
        print("dset: ", dset[:])

    for tal in range(10):
        with h5py.File('resize_dataset.hdf5', 'a') as f:
def _fixmatlabstruct(fp): # noqa: C901 """Verify MATLAB structs: It cannot load mixed non-scalar structs""" groups = [] def collectgroups(name, obj): """Callback function to collect all suitable struct groups""" if (isinstance(obj, h5py._hl.group.Group) and name != '#refs#' and obj.attrs.get('MATLAB_class', None) != b'struct'): groups.append(obj) def dynamiciterator(): """Dynamically reassessing groups iterator""" while True: fp.visititems(collectgroups) if groups: yield groups[-1] # Start with last else: return # Iterate over all groups to make them MATLAB compatible structs for group in dynamiciterator(): groups = [] # Reset groups for iterator group.attrs['MATLAB_class'] = np.bytes_('struct') # Create struct fields fieldnames = np.empty(len(group.keys()), dtype=h5py.vlen_dtype(np.dtype('|S1'))) fieldnames[:] = [np.fromiter(f, '|S1') for f in group.keys()] group.attrs['MATLAB_fields'] = fieldnames # Recurse into groups to obtain shape (visititems not suitable) def groupshape(obj): """Determine common shape""" if isinstance(obj, h5py._hl.group.Group): # Collect shapes from children dims = [groupshape(chld) for chld in obj.values()] # Obtain first n common dimensions commondim = () for d in zip(*dims): if len(set(d)) != 1: break commondim += (d[0], ) # Pass upward return commondim else: if 'MATLAB_empty' in obj.attrs: return (-np.random.randint(100), ) # Make non-scalar if obj.ndim == 2 and obj.shape[1] == 1: return (obj.shape[0], ) else: # Reversed, because MATLAB transposes return obj.shape[::-1] # Iterate over all children to determine if it should be scalar commondim = groupshape(group) idx = len(commondim) commondim = commondim[::-1] if len(commondim) == 1: commondim += (1, ) # Different shapes = non-scalar: nothing to do if not idx or len(group.keys()) == 1: for child in group.values(): if not isinstance(child, h5py.h5r.Reference): continue # One-sized references can just be resolved into group if child.size == 1: childname = child.name del fp[child.name] group.move(fp[child[()].item()].name, childname) else: # Object arrays might need to be cell arrays child.attrs['MATLAB_class'] = np.bytes_('cell') continue # Turn all children into references to make it non-scalar refs = fp.require_group('#refs#') # Simple loop over all group items. Assumes there are no more # groups within this group that haven't been resolved already. 
# Reshape a dataset/group/reference and turn it into reference for childname, child in group.items(): # Skip references with correct shape if (getattr(child, 'dtype', None) == h5py.h5r.Reference and getattr(child, 'shape', ()) == commondim): continue # Create a new dataset without any filters rf = group.create_dataset('__h5dereftemp__', shape=commondim, dtype=h5py.ref_dtype) # Iterate over dataset entries fi = np.nditer(rf, flags=['refs_ok', 'multi_index'], itershape=commondim) # Datasets are just turned into references, groups are # split into smaller groups referenced by datasets if isinstance(child, h5py._hl.dataset.Dataset): for _ in fi: # Obtain index for dataset if child.ndim == 2 and child.shape[1] == 1: index = fi.multi_index[:idx] + (Ellipsis, ) else: index = (Ellipsis, ) + fi.multi_index[:idx] # Differentiate between data and reference if child.dtype == h5py.h5r.Reference: v = fp[child.name][index] else: v = child[index] # Fix dimensions if v.ndim < 2: v = np.atleast_2d(v).T else: v = v[()] # Create dataset for each element with filters incr = str(len(refs.items())) refs.create_dataset_like(incr, child, shape=v.shape, chunks=None, maxshape=None) refs[incr][()] = v # Copy attributes for atr_key, atr_val in child.attrs.items(): refs[incr].attrs[atr_key] = atr_val rf[fi.multi_index] = refs[incr].ref else: # Get the group names fieldnames = np.empty(len(child.keys()), dtype=h5py.vlen_dtype(np.dtype('|S1'))) fieldnames[:] = [np.fromiter(f, '|S1') for f in child.keys()] for _ in fi: # Create new group for each split incr = str(len(refs.items())) refs.create_group(incr, track_order=True) # Add struct info refs[incr].attrs['MATLAB_class'] = np.bytes_('struct') refs[incr].attrs['MATLAB_fields'] = fieldnames # Iterate over group children for ckdname, ckd in child.items(): # Leave it like this, until needed if isinstance(ckd, h5py._hl.group.Group): raise NotImplementedError('Nested group') # Obtain index for dataset if ckd.ndim == 2 and ckd.shape[1] == 1: index = fi.multi_index[:idx] + (Ellipsis, ) else: index = (Ellipsis, ) + fi.multi_index[:idx] # Differentiate between data and reference if ckd.dtype == h5py.h5r.Reference: v = fp[ckd.name][index] else: v = ckd[index] # Fix dimensions if v.ndim < 2: v = np.atleast_2d(v).T else: v = v[()] # Create dataset for each element with filters refs[incr].create_dataset_like(ckdname, ckd, dtype=v.dtype, shape=v.shape, chunks=None, maxshape=None) refs[incr][ckdname][()] = v # Copy attributes for atr_key, atr_val in ckd.attrs.items(): refs[incr][ckdname].attrs[atr_key] = atr_val rf[fi.multi_index] = refs[incr].ref # Re-add ALL children to maintain tracking order for ckdname, ckd in group.items(): if ckdname == childname: del group[childname] group[childname] = group['__h5dereftemp__'] del group['__h5dereftemp__'] elif ckdname != '__h5dereftemp__': a = group[ckdname] del group[ckdname] group[ckdname] = a del a
def write_compound_datasets(f): utf8 = h5py.special_dtype(vlen=str) gender_enum_dtype = h5py.enum_dtype({"MALE": 0, "FEMALE": 1}, basetype=np.uint8) dt = np.dtype([ ('firstName', utf8), # variable lentgh utf8 ('surname', 'S20'), # fixed length ASCII ('gender', gender_enum_dtype), # enum type ('age', np.uint8), # uint ('fav_number', np.float32), # float ('vector', np.float32, (3,))]) # array data = np.zeros(4, dtype=dt) # Set the example data data[0] = ('Bob', 'Smith', 0, 32, 1.0, [1, 2, 3]) data[1] = ('Peter', 'Fletcher', 0, 43, 2.0, [16.2, 2.2, -32.4]) data[2] = ('James', 'Mudd', 0, 12, 3.0, [-32.1,-774.1,-3.0]) data[3] = ('Ellie', 'Kyle', 1, 22, 4.0, [2.1,74.1,-3.8]) f.create_dataset('contiguous_compound', data=data) f.create_dataset('chunked_compound', data=data, chunks=(1,), compression="gzip") # 2d compound use img number example imgdt = np.dtype([ ('real', np.float32), ('img', np.float32) ]) data = np.zeros((3, 3), dtype=imgdt) data[0][0] = (2.3, -7.3) data[0][1] = (12.3, -17.3) data[0][2] = (-32.3, -0.3) data[1][0] = (2.3, -7.3) data[1][1] = (12.3, -17.3) data[1][2] = (-32.3, -0.3) data[2][0] = (2.3, -7.3) data[2][1] = (12.3, -17.3) data[2][2] = (-32.3, -0.3) f.create_dataset('2d_contiguous_compound', data=data) f.create_dataset('2d_chunked_compound', data=data, chunks=(1,2), compression="gzip") # Compound dataset containing ragged arrays uint8_vlen_type = h5py.vlen_dtype(np.uint8) compound_vlen_dtype = np.dtype([ ('one', uint8_vlen_type), ('two', uint8_vlen_type) ]) data = np.zeros(3, dtype=compound_vlen_dtype) data[0] = (np.array([1]), np.array([2])) data[1] = (np.array([1,1]), np.array([2,2])) data[2] = (np.array([1,1,1]), np.array([2,2,2])) f.create_dataset('vlen_contiguous_compound', data=data, dtype=compound_vlen_dtype) f.create_dataset('vlen_chunked_compound', data=data, dtype=compound_vlen_dtype, chunks=(1,), compression="gzip") # Compound dataset arrays of vlen type compound_vlen_dtype = np.dtype([ ('name', utf8, 2) ]) pointData = np.zeros(2, dtype=utf8) pointData[0] = "James" pointData[1] = "Ellie" data = np.zeros(1, dtype=compound_vlen_dtype) data['name'] = np.array(pointData) f.create_dataset('array_vlen_contiguous_compound', data=data, dtype=compound_vlen_dtype) f.create_dataset('array_vlen_chunked_compound', data=data, dtype=compound_vlen_dtype, chunks=(1,), compression="gzip") # Nested compound datasets use 2 img numbers as an example nested_dt = np.dtype([ ('firstNumber', imgdt), ('secondNumber', imgdt), ]) data = np.zeros(3, dtype=nested_dt) data[1] = ((1,1), (1,1)) data[2] = ((2,2), (2,2)) f.create_dataset('nested_contiguous_compound', data=data, dtype=nested_dt) f.create_dataset('nested_chunked_compound', data=data, dtype=nested_dt, chunks=(2,), compression="gzip") f.flush() f.close()
def save_households_to_hdf5(households: Households, file_path: str, chunk_size: int = 50000):
    """
    Saves the households object to hdf5 format file ``file_path``. Currently for each household,
    the following values are stored:
    - id, area, type, max_size, household_complacency

    Parameters
    ----------
    households
        households object
    file_path
        path of the saved hdf5 file
    chunk_size
        number of households to save at a time. Note that they have to be copied to be saved,
        so keep the number below 1e6.
    """
    n_households = len(households)
    n_chunks = int(np.ceil(n_households / chunk_size))
    int_vlen_type = h5py.vlen_dtype(np.dtype("int64"))
    str_vlen_type = h5py.vlen_dtype(np.dtype("S20"))
    with h5py.File(file_path, "a") as f:
        households_dset = f.create_group("households")
        for chunk in range(n_chunks):
            idx1 = chunk * chunk_size
            idx2 = min((chunk + 1) * chunk_size, n_households)
            ids = []
            areas = []
            types = []
            max_sizes = []
            household_complacencies = []
            for household in households[idx1:idx2]:
                ids.append(household.id)
                if household.area is None:
                    areas.append(nan_integer)
                else:
                    areas.append(household.area.id)
                if household.type is None:
                    types.append(" ".encode("ascii", "ignore"))
                else:
                    types.append(household.type.encode("ascii", "ignore"))
                max_sizes.append(household.max_size)
                household_complacencies.append(household.household_complacency)
            ids = np.array(ids, dtype=np.int64)
            areas = np.array(areas, dtype=np.int64)
            types = np.array(types, dtype="S15")
            max_sizes = np.array(max_sizes, dtype=np.float64)
            household_complacencies = np.array(household_complacencies, dtype=np.float64)
            if chunk == 0:
                households_dset.attrs["n_households"] = n_households
                households_dset.create_dataset("id", data=ids, maxshape=(None,))
                households_dset.create_dataset("area", data=areas, maxshape=(None,))
                households_dset.create_dataset("type", data=types, maxshape=(None,))
                households_dset.create_dataset("max_size", data=max_sizes, maxshape=(None,))
                households_dset.create_dataset(
                    "household_complacency",
                    data=household_complacencies,
                    maxshape=(None,))
            else:
                newshape = (households_dset["id"].shape[0] + ids.shape[0],)
                households_dset["id"].resize(newshape)
                households_dset["id"][idx1:idx2] = ids
                households_dset["area"].resize(newshape)
                households_dset["area"][idx1:idx2] = areas
                households_dset["type"].resize(newshape)
                households_dset["type"][idx1:idx2] = types
                households_dset["max_size"].resize(newshape)
                households_dset["max_size"][idx1:idx2] = max_sizes
                households_dset["household_complacency"].resize(newshape)
                households_dset["household_complacency"][idx1:idx2] = household_complacencies

        # I don't know how to chunk these...
        relatives_in_households = []
        relatives_in_care_homes = []
        social_venues_specs_list = []
        social_venues_ids_list = []
        for household in households:
            if (household.relatives_in_households is None
                    or len(household.relatives_in_households) == 0):
                relatives_in_households.append(np.array([nan_integer], dtype=np.int64))
            else:
                relatives_in_households.append(
                    np.array(
                        [person.id for person in household.relatives_in_households],
                        dtype=np.int64,
                    ))
            if (household.relatives_in_care_homes is None
                    or len(household.relatives_in_care_homes) == 0):
                relatives_in_care_homes.append(np.array([nan_integer], dtype=np.int64))
            else:
                relatives_in_care_homes.append(
                    np.array(
                        [person.id for person in household.relatives_in_care_homes],
                        dtype=np.int64,
                    ))
            social_venues_ids = []
            social_venues_specs = []
            for spec in household.social_venues.keys():
                for social_venue in household.social_venues[spec]:
                    social_venues_specs.append(spec.encode("ascii", "ignore"))
                    social_venues_ids.append(social_venue.id)
            social_venues_specs_list.append(np.array(social_venues_specs, dtype="S20"))
            social_venues_ids_list.append(np.array(social_venues_ids, dtype=np.int64))
        relatives_in_households = np.array(relatives_in_households, dtype=int_vlen_type)
        relatives_in_care_homes = np.array(relatives_in_care_homes, dtype=int_vlen_type)
        social_venues_specs_list = np.array(social_venues_specs_list, dtype=str_vlen_type)
        social_venues_ids_list = np.array(social_venues_ids_list, dtype=int_vlen_type)
        try:
            households_dset.create_dataset(
                "relatives_in_households",
                data=relatives_in_households,
            )
        except Exception:
            relatives_in_households = np.array(relatives_in_households, dtype=np.int64)
            households_dset.create_dataset(
                "relatives_in_households",
                data=relatives_in_households,
            )
        try:
            households_dset.create_dataset(
                "relatives_in_care_homes",
                data=relatives_in_care_homes,
            )
        except Exception:
            relatives_in_care_homes = np.array(relatives_in_care_homes, dtype=np.int64)
            households_dset.create_dataset(
                "relatives_in_care_homes",
                data=relatives_in_care_homes,
            )
        households_dset.create_dataset(
            "social_venues_specs",
            data=social_venues_specs_list,
        )
        households_dset.create_dataset(
            "social_venues_ids",
            data=social_venues_ids_list,
        )
'''
import time
import warnings

import h5py
import numpy as np

#: Most up-to-date raw larpix hdf5 format version.
latest_version = '0.0'

#: Description of the datasets and their dtypes used in each version of the raw larpix hdf5 format.
#:
#: Structured as ``dataset_dtypes['<version>']['<dataset>'] = <dtype>``.
dataset_dtypes = {
    '0.0': {
        'msgs': h5py.vlen_dtype(np.dtype('u1')),
        'msg_headers': np.dtype([('io_groups', 'u1')])
    }
}


def _store_msgs_v0_0(msgs, version):
    msg_dtype = np.dtype('u1')
    arr_dtype = dataset_dtypes[version]['msgs']
    return np.array([np.frombuffer(msg, dtype=msg_dtype) for msg in msgs], dtype=arr_dtype)


def _store_msg_headers_v0_0(msg_headers, version):
    length = len(msg_headers['io_groups'])
    arr = np.zeros((length,), dtype=dataset_dtypes[version]['msg_headers'])
def saveh5(filename, X, ORF, y):
    dt = h5py.vlen_dtype(np.dtype('int32'))
    with h5py.File(filename, 'w') as h5file:
        h5file.create_dataset('X', dtype=dt, data=X)
        h5file.create_dataset('ORF', dtype=dt, data=ORF)
        h5file.create_dataset('y', data=y)
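# A hedged counterpart to saveh5 (not present in the original source): loads the
# three datasets back, returning X and ORF as lists of variable-length int32 arrays.
def loadh5(filename):
    with h5py.File(filename, 'r') as h5file:
        X = list(h5file['X'][:])
        ORF = list(h5file['ORF'][:])
        y = h5file['y'][:]
    return X, ORF, y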
def write_vlen_datasets(f): # Unsigned int uint8_vlen_type = h5py.vlen_dtype(np.uint8) uint8_vlen_dataset = f.create_dataset("vlen_uint8_data", (3, ), dtype=uint8_vlen_type) uint8_vlen_dataset[0] = [0] uint8_vlen_dataset[1] = [1, 2] uint8_vlen_dataset[2] = [3, 4, 5] uint16_vlen_type_chunked = h5py.vlen_dtype(np.uint16) uint16_vlen_dataset = f.create_dataset("vlen_uint16_data", (3, ), dtype=uint16_vlen_type_chunked) uint16_vlen_dataset[0] = [0] uint16_vlen_dataset[1] = [1, 2] uint16_vlen_dataset[2] = [3, 4, 5] uint32_vlen_type = h5py.vlen_dtype(np.uint32) uint32_vlen_dataset = f.create_dataset("vlen_uint32_data", (3, ), dtype=uint32_vlen_type) uint32_vlen_dataset[0] = [0] uint32_vlen_dataset[1] = [1, 2] uint32_vlen_dataset[2] = [3, 4, 5] uint64_vlen_type = h5py.vlen_dtype(np.uint64) uint64_vlen_dataset = f.create_dataset("vlen_uint64_data", (3, ), dtype=uint64_vlen_type) uint64_vlen_dataset[0] = [0] uint64_vlen_dataset[1] = [1, 2] uint64_vlen_dataset[2] = [3, 4, 5] # Signed int int8_vlen_type = h5py.vlen_dtype(np.int8) int8_vlen_dataset = f.create_dataset("vlen_int8_data", (3, ), dtype=int8_vlen_type) int8_vlen_dataset[0] = [0] int8_vlen_dataset[1] = [1, 2] int8_vlen_dataset[2] = [3, 4, 5] int16_vlen_type_chunked = h5py.vlen_dtype(np.int16) int16_vlen_dataset = f.create_dataset("vlen_int16_data", (3, ), dtype=int16_vlen_type_chunked) int16_vlen_dataset[0] = [0] int16_vlen_dataset[1] = [1, 2] int16_vlen_dataset[2] = [3, 4, 5] int32_vlen_type = h5py.vlen_dtype(np.int32) int32_vlen_dataset = f.create_dataset("vlen_int32_data", (3, ), dtype=int32_vlen_type) int32_vlen_dataset[0] = [0] int32_vlen_dataset[1] = [1, 2] int32_vlen_dataset[2] = [3, 4, 5] int64_vlen_type = h5py.vlen_dtype(np.int64) int64_vlen_dataset = f.create_dataset("vlen_int64_data", (3, ), dtype=int64_vlen_type) int64_vlen_dataset[0] = [0] int64_vlen_dataset[1] = [1, 2] int64_vlen_dataset[2] = [3, 4, 5] # Floating point float32_vlen_type = h5py.vlen_dtype(np.float32) float32_vlen_dataset = f.create_dataset("vlen_float32_data", (3, ), dtype=float32_vlen_type) float32_vlen_dataset[0] = [0] float32_vlen_dataset[1] = [1, 2] float32_vlen_dataset[2] = [3, 4, 5] float64_vlen_type = h5py.vlen_dtype(np.float64) float64_vlen_dataset = f.create_dataset("vlen_float64_data", (3, ), dtype=float64_vlen_type) float64_vlen_dataset[0] = [0] float64_vlen_dataset[1] = [1, 2] float64_vlen_dataset[2] = [3, 4, 5] # https://github.com/jamesmudd/jhdf/issues/247 int32_vlen_type = h5py.vlen_dtype(np.dtype(np.int32)) int32_vlen_dataset = f.create_dataset('vlen_issue_247', (3, ), dtype=int32_vlen_type) int32_vlen_dataset[0] = [1, 2, 3] int32_vlen_dataset[1] = [] int32_vlen_dataset[2] = [1, 2, 3, 4, 5] # Chunked # Unsigned int uint8_vlen_type = h5py.vlen_dtype(np.uint8) uint8_vlen_dataset_chunked = f.create_dataset("vlen_uint8_data_chunked", (3, ), dtype=uint8_vlen_type, chunks=(3, )) uint8_vlen_dataset_chunked[0] = [0] uint8_vlen_dataset_chunked[1] = [1, 2] uint8_vlen_dataset_chunked[2] = [3, 4, 5] uint16_vlen_type_chunked = h5py.vlen_dtype(np.uint16) uint16_vlen_dataset = f.create_dataset("vlen_uint16_data_chunked", (3, ), dtype=uint16_vlen_type_chunked, chunks=(3, )) uint16_vlen_dataset[0] = [0] uint16_vlen_dataset[1] = [1, 2] uint16_vlen_dataset[2] = [3, 4, 5] uint32_vlen_type = h5py.vlen_dtype(np.uint32) uint32_vlen_dataset_chunked = f.create_dataset("vlen_uint32_data_chunked", (3, ), dtype=uint32_vlen_type, chunks=(3, )) uint32_vlen_dataset_chunked[0] = [0] uint32_vlen_dataset_chunked[1] = [1, 2] uint32_vlen_dataset_chunked[2] = [3, 4, 5] 
uint64_vlen_type = h5py.vlen_dtype(np.uint64) uint64_vlen_dataset_chunked = f.create_dataset("vlen_uint64_data_chunked", (3, ), dtype=uint64_vlen_type, chunks=(3, )) uint64_vlen_dataset_chunked[0] = [0] uint64_vlen_dataset_chunked[1] = [1, 2] uint64_vlen_dataset_chunked[2] = [3, 4, 5] # Signed int int8_vlen_type = h5py.vlen_dtype(np.int8) int8_vlen_dataset = f.create_dataset("vlen_int8_data_chunked", (3, ), dtype=int8_vlen_type, chunks=(3, )) int8_vlen_dataset[0] = [0] int8_vlen_dataset[1] = [1, 2] int8_vlen_dataset[2] = [3, 4, 5] int16_vlen_type_chunked = h5py.vlen_dtype(np.int16) int16_vlen_dataset = f.create_dataset("vlen_int16_data_chunked", (3, ), dtype=int16_vlen_type_chunked, chunks=(3, )) int16_vlen_dataset[0] = [0] int16_vlen_dataset[1] = [1, 2] int16_vlen_dataset[2] = [3, 4, 5] int32_vlen_type = h5py.vlen_dtype(np.int32) int32_vlen_dataset = f.create_dataset("vlen_int32_data_chunked", (3, ), dtype=int32_vlen_type, chunks=(3, )) int32_vlen_dataset[0] = [0] int32_vlen_dataset[1] = [1, 2] int32_vlen_dataset[2] = [3, 4, 5] int64_vlen_type = h5py.vlen_dtype(np.int64) int64_vlen_dataset = f.create_dataset("vlen_int64_data_chunked", (3, ), dtype=int64_vlen_type, chunks=(3, )) int64_vlen_dataset[0] = [0] int64_vlen_dataset[1] = [1, 2] int64_vlen_dataset[2] = [3, 4, 5] # Floating point float32_vlen_type = h5py.vlen_dtype(np.float32) float32_vlen_dataset_chunked = f.create_dataset( "vlen_float32_data_chunked", (3, ), dtype=float32_vlen_type, chunks=(3, )) float32_vlen_dataset_chunked[0] = [0] float32_vlen_dataset_chunked[1] = [1, 2] float32_vlen_dataset_chunked[2] = [3, 4, 5] float64_vlen_type = h5py.vlen_dtype(np.float64) float64_vlen_dataset_chunked = f.create_dataset( "vlen_float64_data_chunked", (3, ), dtype=float64_vlen_type, chunks=(3, )) float64_vlen_dataset_chunked[0] = [0] float64_vlen_dataset_chunked[1] = [1, 2] float64_vlen_dataset_chunked[2] = [3, 4, 5] # https://github.com/jamesmudd/jhdf/issues/247 int32_vlen_type = h5py.vlen_dtype(np.dtype(np.int32)) int32_vlen_dataset = f.create_dataset('vlen_issue_247_chunked', (3, ), dtype=int32_vlen_type, chunks=(3, )) int32_vlen_dataset[0] = [1, 2, 3] int32_vlen_dataset[1] = [] int32_vlen_dataset[2] = [1, 2, 3, 4, 5] f.flush() f.close()
    map(lambda x: np.array(x, dtype=np.dtype("int32")),
        encoder.transform(df_train["review"])))
train_scores = df_train["userscore"].to_numpy(np.dtype("int32"))

valid_tokens = list(
    map(lambda x: np.array(x, dtype=np.dtype("int32")),
        encoder.transform(df_valid["review"])))
valid_scores = df_valid["userscore"].to_numpy(np.dtype("int32"))

test_tokens = list(
    map(lambda x: np.array(x, dtype=np.dtype("int32")),
        encoder.transform(df_test["review"])))
test_scores = df_test["userscore"].to_numpy(np.dtype("int32"))

with h5py.File("../data/reviews/tokenized.h5", "w") as f:
    dt = h5py.vlen_dtype(np.dtype("int32"))
    f.create_group("data")

    f.create_group("data/train")
    f.create_dataset("data/train/tokens", data=train_tokens, dtype=dt)
    f.create_dataset("data/train/scores", data=train_scores)

    f.create_group("data/valid")
    f.create_dataset("data/valid/tokens", data=valid_tokens, dtype=dt)
    f.create_dataset("data/valid/scores", data=valid_scores)

    f.create_group("data/test")
    f.create_dataset("data/test/tokens", data=test_tokens, dtype=dt)
    f.create_dataset("data/test/scores", data=test_scores)

    dt = h5py.string_dtype(encoding='utf-8')
    f.create_group("metadata")
    f.create_dataset("metadata/encoder",
                     data=json.dumps(encoder.vocabs_to_dict()),
def convert_to_hdf5(base_directory, override):
    batch_size = 1000
    for labels_filepath in sorted(Path(base_directory).rglob('*labels.csv')):
        print(f'Processing {labels_filepath}...')
        basedir = os.path.dirname(labels_filepath)
        dataset_name = os.path.relpath(basedir, base_directory)
        dataset_path = f'{base_directory}/{dataset_name.replace("/", "_")}.hdf5'
        if os.path.isfile(dataset_path):
            if override:
                os.remove(dataset_path)
            else:
                print(f'Dataset already exists, skipping {dataset_name}... \n')
                continue
        dataset = h5py.File(dataset_path, 'a')
        with open(labels_filepath, newline='') as csv_file:
            csv_data = np.asarray(list(csv.reader(csv_file)))
        labels = csv_data[:, 1:5].astype(np.float64)
        image_paths = np.asarray(
            [f'{basedir}/{image_name}' for image_name in csv_data[:, 0]])
        mask_paths = np.asarray([
            f'{os.path.splitext(image_path)[0]}.pgm'
            for image_path in image_paths
        ])
        load_masks = np.all(
            [os.path.isfile(mask_path) for mask_path in mask_paths])
        with tqdm(total=image_paths.shape[0], file=sys.stdout,
                  unit=' Images') as progress:
            dataset.create_dataset(f'{dataset_name}/labels',
                                   data=labels,
                                   maxshape=labels.shape,
                                   dtype=np.float64)
            images_dataset = dataset.create_dataset(
                f'{dataset_name}/images', (image_paths.shape[0],),
                dtype=h5py.vlen_dtype(np.uint8))
            masks_dataset = dataset.create_dataset(
                f'{dataset_name}/masks', (mask_paths.shape[0],),
                dtype=h5py.vlen_dtype(np.uint8)) if load_masks else None
            if os.path.isfile(f'{basedir}/mapping.json'):
                dataset.create_dataset(
                    f'{dataset_name}/mapping',
                    data=json.dumps(json.loads(
                        open(f'{basedir}/mapping.json', 'r+').read()),
                        indent=4))
            for index in range(0, image_paths.shape[0], batch_size):
                if load_masks:
                    images, masks = load_synthetic_data(
                        image_paths[index:index + batch_size],
                        mask_paths[index:index + batch_size])
                    images_dataset[index:index + batch_size] = images
                    masks_dataset[index:index + batch_size] = masks
                else:
                    images_dataset[index:index + batch_size] = [
                        np.frombuffer(open(file, 'rb').read(), dtype=np.uint8)
                        for file in image_paths[index:index + batch_size]
                    ]
                progress.update(image_paths[index:index + batch_size].shape[0])
        dataset.flush()
        dataset.close()
def save_companies_to_hdf5(companies: Companies, file_path: str, chunk_size: int = 500000):
    """
    Saves the Companies object to hdf5 format file ``file_path``. Currently for each company,
    the following values are stored:
    - id, super_area, sector, n_workers_max

    Parameters
    ----------
    companies
        companies object
    file_path
        path of the saved hdf5 file
    chunk_size
        number of companies to save at a time. Note that they have to be copied to be saved,
        so keep the number below 1e6.
    """
    n_companies = len(companies)
    n_chunks = int(np.ceil(n_companies / chunk_size))
    vlen_type = h5py.vlen_dtype(np.dtype("float64"))
    with h5py.File(file_path, "a") as f:
        companies_dset = f.create_group("companies")
        first_company_idx = companies[0].id
        for chunk in range(n_chunks):
            idx1 = chunk * chunk_size
            idx2 = min((chunk + 1) * chunk_size, n_companies)
            ids = []
            super_areas = []
            sectors = []
            n_workers_max = []
            company_idx = [company.id for company in companies[idx1:idx2]]
            # sort companies by id
            companies_sorted = [
                companies[i - first_company_idx] for i in np.sort(company_idx)
            ]
            for company in companies_sorted:
                ids.append(company.id)
                if company.super_area is None:
                    super_areas.append(nan_integer)
                else:
                    super_areas.append(company.super_area.id)
                sectors.append(company.sector.encode("ascii", "ignore"))
                n_workers_max.append(company.n_workers_max)
            ids = np.array(ids, dtype=np.int64)
            super_areas = np.array(super_areas, dtype=np.int64)
            sectors = np.array(sectors, dtype="S10")
            n_workers_max = np.array(n_workers_max, dtype=np.float64)
            if chunk == 0:
                companies_dset.attrs["n_companies"] = n_companies
                companies_dset.create_dataset("id", data=ids, maxshape=(None,))
                companies_dset.create_dataset("super_area", data=super_areas, maxshape=(None,))
                companies_dset.create_dataset("sector", data=sectors, maxshape=(None,))
                companies_dset.create_dataset("n_workers_max", data=n_workers_max, maxshape=(None,))
            else:
                newshape = (companies_dset["id"].shape[0] + ids.shape[0],)
                companies_dset["id"].resize(newshape)
                companies_dset["id"][idx1:idx2] = ids
                companies_dset["super_area"].resize(newshape)
                companies_dset["super_area"][idx1:idx2] = super_areas
                companies_dset["sector"].resize(newshape)
                companies_dset["sector"][idx1:idx2] = sectors
                companies_dset["n_workers_max"].resize(newshape)
                companies_dset["n_workers_max"][idx1:idx2] = n_workers_max