Code example #1
File: wrappers.py Project: melissawm/versioned-hdf5
    def resize(self, size, axis=None):
        self.parent._check_committed()
        if axis is not None:
            if not (axis >= 0 and axis < self.ndim):
                raise ValueError("Invalid axis (0 to %s allowed)" %
                                 (self.ndim - 1))
            try:
                newlen = int(size)
            except TypeError:
                raise TypeError(
                    "Argument must be a single int if axis is specified")
            size = list(self.shape)
            size[axis] = newlen

        old_shape = self.shape
        size = tuple(size)
        if all(new <= old for new, old in zip(size, old_shape)):
            # Don't create a new array if the old one can just be sliced in
            # memory.
            idx = tuple(slice(0, i) for i in size)
            self.array = self.array[idx]
        else:
            old_shape_idx = Tuple(*[Slice(0, i) for i in old_shape])
            new_shape_idx = Tuple(*[Slice(0, i) for i in size])
            new_array = np.full(size, self.fillvalue, dtype=self.dtype)
            new_array[old_shape_idx.as_subindex(new_shape_idx).raw] = \
                self.array[new_shape_idx.as_subindex(old_shape_idx).raw]
            self.array = new_array
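The grow branch above leans on ndindex's as_subindex, which re-expresses the overlap of two indices relative to one of them. A minimal sketch of that primitive in isolation (assuming only NumPy and ndindex; the shapes are made up):

import numpy as np
from ndindex import Slice, Tuple

old_idx = Tuple(Slice(0, 3))  # covers the old shape (3,)
new_idx = Tuple(Slice(0, 5))  # covers the new shape (5,)

a = np.array([1, 2, 3])
b = np.full(5, -1)  # -1 stands in for the fillvalue
# Copy the overlapping region: each as_subindex call expresses the
# overlap relative to the other array's coordinates.
b[old_idx.as_subindex(new_idx).raw] = a[new_idx.as_subindex(old_idx).raw]
print(b)  # [ 1  2  3 -1 -1]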
Code example #2
def spaceid_to_slice(space):
    """
    Convert an h5py spaceid object into an ndindex index

    The resulting index is always a Tuple index.
    """

    from h5py import h5s

    sel_type = space.get_select_type()

    if sel_type == h5s.SEL_ALL:
        return Tuple()
    elif sel_type == h5s.SEL_HYPERSLABS:
        slices = []
        starts, strides, counts, blocks = space.get_regular_hyperslab()
        for _start, _stride, count, block in zip(starts, strides, counts, blocks):
            start = _start
            if not (block == 1 or count == 1):
                raise NotImplementedError("Nontrivial blocks are not yet supported")
            end = _start + (_stride*(count - 1) + 1)*block
            stride = _stride if block == 1 else 1
            slices.append(Slice(start, end, stride))
        return Tuple(*slices)
    elif sel_type == h5s.SEL_NONE:
        return Tuple(Slice(0, 0),)
    else:
        raise NotImplementedError("Point selections are not yet supported")
Code example #3
def test_write_dataset_offset_chunk_size(h5file):
    chunk_size = 2**10
    chunks = (chunk_size, )
    slices1 = write_dataset(h5file,
                            'test_data',
                            1 * np.ones((2 * chunk_size, )),
                            chunks=chunks)
    slices2 = write_dataset(
        h5file, 'test_data',
        np.concatenate((2 * np.ones(chunks), 2 * np.ones(chunks), 3 * np.ones(
            (chunk_size - 2, )))))

    assert slices1 == {
        Tuple(Slice(0 * chunk_size, 1 * chunk_size, 1)):
        slice(0 * chunk_size, 1 * chunk_size),
        Tuple(Slice(1 * chunk_size, 2 * chunk_size, 1)):
        slice(0 * chunk_size, 1 * chunk_size),
    }
    assert slices2 == {
        Tuple(Slice(0 * chunk_size, 1 * chunk_size, 1)):
        slice(1 * chunk_size, 2 * chunk_size),
        Tuple(Slice(1 * chunk_size, 2 * chunk_size, 1)):
        slice(1 * chunk_size, 2 * chunk_size),
        Tuple(Slice(2 * chunk_size, 3 * chunk_size - 2, 1)):
        slice(2 * chunk_size, 3 * chunk_size - 2),
    }

    ds = h5file['/_version_data/test_data/raw_data']
    assert ds.shape == (3 * chunk_size, )
    assert_equal(ds[0 * chunk_size:1 * chunk_size], 1.0)
    assert_equal(ds[1 * chunk_size:2 * chunk_size], 2.0)
    assert_equal(ds[2 * chunk_size:3 * chunk_size - 2], 3.0)
    assert_equal(ds[3 * chunk_size - 2:4 * chunk_size], 0.0)
Code example #4
def create_virtual_dataset(f,
                           version_name,
                           name,
                           slices,
                           attrs=None,
                           fillvalue=None):
    raw_data = f['_version_data'][name]['raw_data']
    chunks = tuple(raw_data.attrs['chunks'])
    slices = {c: s.reduce() for c, s in slices.items()}

    shape = tuple(
        [max(c.args[i].stop for c in slices) for i in range(len(chunks))])
    # Chunks in the raw dataset are expanded along the first dimension only.
    # Since the chunks are pointed to by virtual datasets, it doesn't make
    # sense to expand the chunks in the raw dataset along multiple dimensions
    # (the true layout of the chunks in the raw dataset is irrelevant).
    for c, s in slices.items():
        if len(c.args[0]) != len(s):
            raise ValueError(
                f"Inconsistent slices dictionary ({c.args[0]}, {s})")

    layout = VirtualLayout(shape, dtype=raw_data.dtype)
    vs = VirtualSource('.',
                       name=raw_data.name,
                       shape=raw_data.shape,
                       dtype=raw_data.dtype)

    for c, s in slices.items():
        # TODO: This needs to handle more than one dimension
        idx = Tuple(
            s,
            *Tuple(*[slice(0, i) for i in shape]).as_subindex(c).args[1:])
        assert c.newshape(shape) == vs[idx.raw].shape, (c, shape, s)
        layout[c.raw] = vs[idx.raw]

    dtype = raw_data.dtype
    if dtype.metadata and ('vlen' in dtype.metadata
                           or 'h5py_encoding' in dtype.metadata):
        # Variable length string dtype
        # (https://h5py.readthedocs.io/en/2.10.0/strings.html). Setting the
        # fillvalue in this case doesn't work
        # (https://github.com/h5py/h5py/issues/941).
        if fillvalue not in [0, '', b'', None]:
            raise ValueError(
                "Non-default fillvalue not supported for variable length strings"
            )
        fillvalue = None

    virtual_data = f['_version_data/versions'][
        version_name].create_virtual_dataset(name, layout, fillvalue=fillvalue)

    if attrs:
        for k, v in attrs.items():
            virtual_data.attrs[k] = v
    return virtual_data
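For context, VirtualLayout and VirtualSource are h5py's standard virtual-dataset (VDS) machinery: a layout maps regions of the virtual dataset onto regions of source datasets, which is how each chunk above is pointed at a slice of raw_data. A minimal standalone sketch (the file and dataset names are made up):

import h5py
import numpy as np

with h5py.File('raw.h5', 'w') as f:
    f.create_dataset('raw_data', data=np.arange(6.0))

layout = h5py.VirtualLayout(shape=(6,), dtype='f8')
src = h5py.VirtualSource('raw.h5', 'raw_data', shape=(6,), dtype='f8')
layout[0:3] = src[3:6]  # the virtual dataset may reorder source regions
layout[3:6] = src[0:3]

with h5py.File('vds.h5', 'w') as f:
    vds = f.create_virtual_dataset('data', layout, fillvalue=0)
    print(vds[:])  # [3. 4. 5. 0. 1. 2.]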
Code example #5
def split_chunks(shape, chunks):
    """
    Yield a set of ndindex indices for chunks over shape

    If the shape is not a multiple of the chunk size, some chunks will be
    truncated.

    For example, if an array has shape (10, 19) and is chunked into chunks
    of shape (5, 5):

    >>> from versioned_hdf5.slicetools import split_chunks
    >>> for i in split_chunks((10, 19), (5, 5)):
    ...     print(i)
    Tuple(slice(0, 5, 1), slice(0, 5, 1))
    Tuple(slice(0, 5, 1), slice(5, 10, 1))
    Tuple(slice(0, 5, 1), slice(10, 15, 1))
    Tuple(slice(0, 5, 1), slice(15, 19, 1))
    Tuple(slice(5, 10, 1), slice(0, 5, 1))
    Tuple(slice(5, 10, 1), slice(5, 10, 1))
    Tuple(slice(5, 10, 1), slice(10, 15, 1))
    Tuple(slice(5, 10, 1), slice(15, 19, 1))

    """
    if len(shape) != len(chunks):
        raise ValueError("chunks shape must equal the array shape")
    if len(shape) == 0:
        raise NotImplementedError("Scalar datasets")

    d = [math.ceil(i/c) for i, c in zip(shape, chunks)]
    for c in product(*[range(i) for i in d]):
        # c = (0, 0, 0), (0, 0, 1), ...
        yield Tuple(*[Slice(chunk_size*i, min(chunk_size*(i + 1), n), 1)
                      for n, chunk_size, i in zip(shape, chunks, c)])
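Recent versions of ndindex ship this iteration as ChunkSize.indices (the deshaw backend in example #10 below uses it); under that assumption, the following is expected to yield the same chunks as the doctest above:

from ndindex import ChunkSize

for i in ChunkSize((5, 5)).indices((10, 19)):
    print(i)  # same sequence of Tuple(slice(...), slice(...)) chunks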
Code example #6
def test_create_virtual_dataset(h5file):
    with h5file as f:
        slices1 = write_dataset(f, 'test_data',
                                np.ones((2 * DEFAULT_CHUNK_SIZE, )))
        slices2 = write_dataset(
            f, 'test_data',
            np.concatenate((2 * np.ones((DEFAULT_CHUNK_SIZE, )), 3 * np.ones(
                (DEFAULT_CHUNK_SIZE, )))))

        virtual_data = create_virtual_dataset(
            f, 'test_version', 'test_data', (3 * DEFAULT_CHUNK_SIZE, ),
            {
                **slices1,
                Tuple(Slice(2 * DEFAULT_CHUNK_SIZE, 3 * DEFAULT_CHUNK_SIZE, 1)):
                    slices2[(Slice(1 * DEFAULT_CHUNK_SIZE, 2 * DEFAULT_CHUNK_SIZE, 1), )],
            })

        assert virtual_data.shape == (3 * DEFAULT_CHUNK_SIZE, )
        assert_equal(virtual_data[0:2 * DEFAULT_CHUNK_SIZE], 1.0)
        assert_equal(
            virtual_data[2 * DEFAULT_CHUNK_SIZE:3 * DEFAULT_CHUNK_SIZE], 3.0)
        assert virtual_data.dtype == np.float64
Code example #7
def test_create_virtual_dataset_attrs(h5file):
    with h5file as f:
        slices1 = write_dataset(f, 'test_data',
                                np.ones((2 * DEFAULT_CHUNK_SIZE, )))
        slices2 = write_dataset(
            f, 'test_data',
            np.concatenate((2 * np.ones((DEFAULT_CHUNK_SIZE, )), 3 * np.ones(
                (DEFAULT_CHUNK_SIZE, )))))

        attrs = {"attribute": "value"}
        virtual_data = create_virtual_dataset(
            f, 'test_version', 'test_data', (3 * DEFAULT_CHUNK_SIZE, ),
            {
                **slices1,
                Tuple(Slice(2 * DEFAULT_CHUNK_SIZE, 3 * DEFAULT_CHUNK_SIZE, 1)):
                    slices2[(Slice(1 * DEFAULT_CHUNK_SIZE, 2 * DEFAULT_CHUNK_SIZE, 1), )],
            },
            attrs=attrs)

        assert dict(virtual_data.attrs) == {
            **attrs, "raw_data": '/_version_data/test_data/raw_data',
            "chunks": np.array([DEFAULT_CHUNK_SIZE])
        }
Code example #8
def write_dataset(f,
                  name,
                  data,
                  chunks=None,
                  compression=None,
                  compression_opts=None,
                  fillvalue=None):
    if name not in f['_version_data']:
        return create_base_dataset(f,
                                   name,
                                   data=data,
                                   chunks=chunks,
                                   compression=compression,
                                   compression_opts=compression_opts,
                                   fillvalue=fillvalue)

    ds = f['_version_data'][name]['raw_data']
    if isinstance(chunks, int) and not isinstance(chunks, bool):
        chunks = (chunks, )
    if chunks is None:
        chunks = tuple(ds.attrs['chunks'])
    else:
        if chunks != tuple(ds.attrs['chunks']):
            raise ValueError(
                "Chunk size specified but doesn't match already existing chunk size"
            )

    if compression or compression_opts:
        raise ValueError(
            "Compression options can only be specified for the first version of a dataset"
        )
    if fillvalue is not None and fillvalue != ds.fillvalue:
        raise ValueError(
            f"fillvalues do not match ({fillvalue} != {ds.fillvalue})")
    if data.dtype != ds.dtype:
        raise ValueError(f"dtypes do not match ({data.dtype} != {ds.dtype})")
    # TODO: Handle more than one dimension
    old_shape = ds.shape
    hashtable = Hashtable(f, name)
    slices = {}
    slices_to_write = {}
    chunk_size = chunks[0]
    for s in split_chunks(data.shape, chunks):
        idx = hashtable.largest_index
        data_s = data[s.raw]
        raw_slice = Slice(idx * chunk_size, idx * chunk_size + data_s.shape[0])
        data_hash = hashtable.hash(data_s)
        raw_slice2 = hashtable.setdefault(data_hash, raw_slice)
        if raw_slice2 == raw_slice:
            slices_to_write[raw_slice] = s
        slices[s] = raw_slice2

    ds.resize((old_shape[0] + len(slices_to_write) * chunk_size, ) +
              chunks[1:])
    for raw_slice, s in slices_to_write.items():
        data_s = data[s.raw]
        idx = Tuple(raw_slice, *[slice(0, i) for i in data_s.shape[1:]])
        ds[idx.raw] = data[s.raw]
    return slices
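Hashtable here is project-specific, but the underlying idea is plain content-addressed deduplication: hash each chunk and append only chunks whose hash has not been seen, mapping duplicates to the first occurrence. A toy dict-based sketch of that idea (a hypothetical helper, not the project's Hashtable):

import hashlib
import numpy as np

def dedup_chunks(chunk_list):
    """Map each chunk to a storage slot, reusing slots for identical chunks."""
    seen = {}    # hash -> slot of first occurrence
    store = []   # unique chunks, in append order
    slots = []
    for chunk in chunk_list:
        h = hashlib.sha256(chunk.tobytes()).hexdigest()
        if h not in seen:
            seen[h] = len(store)
            store.append(chunk)
        slots.append(seen[h])
    return store, slots

store, slots = dedup_chunks([np.ones(4), np.zeros(4), np.ones(4)])
print(len(store), slots)  # 2 [0, 1, 0]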
Code example #9
File: wrappers.py Project: asmeurer/versioned-hdf5
    def data_dict(self):
        if self._data_dict is None:
            self._data_dict = {}

            dcpl = self.get_create_plist()
            is_virtual = dcpl.get_layout() == h5d.VIRTUAL


            if not is_virtual:
                # A dataset created with only a fillvalue will be nonvirtual,
                # since create_virtual_dataset makes a nonvirtual dataset when
                # there are no virtual sources.
                slice_map = {}
            # Same as dataset.get_virtual_sources
            elif 0 in self._shape:
                # Work around https://github.com/h5py/h5py/issues/1660
                empty_idx = Tuple().expand(self._shape)
                slice_map = {empty_idx: empty_idx}
            else:
                virtual_sources = [
                        VDSmap(dcpl.get_virtual_vspace(j),
                               dcpl.get_virtual_filename(j),
                               dcpl.get_virtual_dsetname(j),
                               dcpl.get_virtual_srcspace(j))
                        for j in range(dcpl.get_virtual_count())]

                slice_map = {spaceid_to_slice(i.vspace): spaceid_to_slice(i.src_space)
                             for i in virtual_sources}
                assert self.raw_data.name == virtual_sources[0].dset_name
                assert all(i.dset_name == self.raw_data.name for i in virtual_sources)

            # slice_map = {i.args[0]: j.args[0] for i, j in slice_map.items()}

            for s in slice_map:
                src_idx = slice_map[s]
                if isinstance(src_idx, Tuple):
                    # The pointers to the raw data should only be slices, since
                    # the raw data chunks are extended in the first dimension
                    # only.
                    assert src_idx != Tuple()
                    assert len(src_idx.args) == len(self.chunks)
                    src_idx = src_idx.args[0]
                assert isinstance(src_idx, Slice)
                self._data_dict[s] = src_idx

        return self._data_dict
Code example #10
File: backend.py Project: deshaw/versioned-hdf5
def write_dataset(f, name, data, chunks=None, dtype=None, compression=None,
                  compression_opts=None, fillvalue=None):

    if name not in f['_version_data']:
        return create_base_dataset(f, name, data=data, dtype=dtype,
                                   chunks=chunks, compression=compression,
                                   compression_opts=compression_opts, fillvalue=fillvalue)

    ds = f['_version_data'][name]['raw_data']
    if isinstance(chunks, int) and not isinstance(chunks, bool):
        chunks = (chunks,)
    if chunks is None:
        chunks = tuple(ds.attrs['chunks'])
    else:
        if chunks != tuple(ds.attrs['chunks']):
            raise ValueError("Chunk size specified but doesn't match already existing chunk size")

    if dtype is not None:
        if dtype != ds.dtype:
            raise ValueError("dtype specified but doesn't match already existing dtype")

    if (compression and compression != ds.compression) or \
            (compression_opts and compression_opts != ds.compression_opts):
        raise ValueError("Compression options can only be specified for the first version of a dataset")
    if fillvalue is not None and fillvalue != ds.fillvalue:
        dtype = ds.dtype
        if dtype.metadata and ('vlen' in dtype.metadata or 'h5py_encoding' in dtype.metadata):
            # Variable length string dtype. The ds.fillvalue will be None in
            # this case (see create_virtual_dataset() below)
            pass
        else:
            raise ValueError(f"fillvalues do not match ({fillvalue} != {ds.fillvalue})")
    if data.dtype != ds.dtype:
        raise ValueError(f"dtypes do not match ({data.dtype} != {ds.dtype})")
    # TODO: Handle more than one dimension
    old_shape = ds.shape
    slices = {}
    slices_to_write = {}
    chunk_size = chunks[0]

    with Hashtable(f, name) as hashtable:
        if len(data.shape) != 0:
            for s in ChunkSize(chunks).indices(data.shape):
                idx = hashtable.largest_index
                data_s = data[s.raw]
                raw_slice = Slice(idx*chunk_size, idx*chunk_size + data_s.shape[0])
                data_hash = hashtable.hash(data_s)
                raw_slice2 = hashtable.setdefault(data_hash, raw_slice)
                if raw_slice2 == raw_slice:
                    slices_to_write[raw_slice] = s
                slices[s] = raw_slice2

            ds.resize((old_shape[0] + len(slices_to_write)*chunk_size,) + chunks[1:])
            for raw_slice, s in slices_to_write.items():
                # idx = raw_slice.expand(ds.shape[:1] + s.newshape(data.shape)[1:])
                data_s = data[s.raw]
                idx = Tuple(raw_slice, *[slice(0, i) for i in data_s.shape[1:]])
                ds[idx.raw] = data[s.raw]
    return slices
Code example #11
File: ndindex.py Project: joelostblom/ndindex
 def setup(self):
     from ndindex import (Slice, Tuple, Integer, ellipsis, Newaxis,
                          IntegerArray, BooleanArray)
     self.slice = Slice(0, 4, 2)
     self.integer = Integer(1)
     self.tuple = Tuple(self.slice, ..., 0)
     self.ellipsis = ellipsis()
     self.newaxis = Newaxis()
     self.integer_array = IntegerArray([[1, 2], [-1, 2]])
     self.boolean_array = BooleanArray([[True, False], [False, False]])
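The setup/time_* pattern in this and the tuple.py examples below follows the airspeed velocity (asv) benchmark convention: setup runs before each timed method, and each time_* method times one operation. The index objects built here behave as follows (a small sketch against ndindex's public API):

from ndindex import Slice, Tuple

t = Tuple(Slice(0, 4, 2), ..., 0)
print(t.raw)   # (slice(0, 4, 2), Ellipsis, 0) -- round-trips to plain indices
print(t.args)  # the constituent ndindex objects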
Code example #12
def spaceid_to_slice(space):
    """
    Convert an h5py spaceid object into an ndindex index

    The resulting index is always a Tuple index.
    """

    from h5py import h5s

    sel_type = space.get_select_type()

    if sel_type == h5s.SEL_ALL:
        return Tuple()
    elif sel_type == h5s.SEL_HYPERSLABS:
        slices = []
        starts, strides, counts, blocks = space.get_regular_hyperslab()
        for start, stride, count, block in zip(starts, strides, counts,
                                               blocks):
            slices.append(hyperslab_to_slice(start, stride, count, block))
        return Tuple(*slices)
    elif sel_type == h5s.SEL_NONE:
        return Tuple(Slice(0, 0), )
    else:
        raise NotImplementedError("Point selections are not yet supported")
Code example #13
File: wrappers.py Project: melissawm/versioned-hdf5
    def resize(self, size, axis=None):
        """ Resize the dataset, or the specified axis.

        The rank of the dataset cannot be changed.

        "Size" should be a shape tuple, or if an axis is specified, an integer.

        BEWARE: This functions differently than the NumPy resize() method!
        The data is not "reshuffled" to fit in the new shape; each axis is
        grown or shrunk independently.  The coordinates of existing data are
        fixed.
        """
        self.parent._check_committed()
        # This boilerplate code is based on h5py.Dataset.resize
        if axis is not None:
            if not (axis >= 0 and axis < self.id.rank):
                raise ValueError("Invalid axis (0 to %s allowed)" %
                                 (self.id.rank - 1))
            try:
                newlen = int(size)
            except TypeError:
                raise TypeError(
                    "Argument must be a single int if axis is specified")
            size = list(self.shape)
            size[axis] = newlen

        size = tuple(size)
        # === END CODE FROM h5py.Dataset.resize ===

        old_shape = self.shape
        data_dict = self.id.data_dict
        chunks = self.chunks

        old_shape_idx = Tuple(*[Slice(0, i) for i in old_shape])
        new_data_dict = {}
        for c in set(split_chunks(size, chunks)):
            if c in data_dict:
                new_data_dict[c] = data_dict[c]
            else:
                a = self[c.raw]
                data = np.full(c.newshape(size),
                               self.fillvalue,
                               dtype=self.dtype)
                data[old_shape_idx.as_subindex(c).raw] = a
                new_data_dict[c] = data

        self.id.data_dict = new_data_dict
        self.id.shape = size
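As the docstring warns, this is not NumPy's resize: np.resize recycles the existing data to fill the new shape, whereas this method keeps the coordinates of existing data fixed and pads new space with the fillvalue. A quick illustration of the NumPy side of the contrast:

import numpy as np

a = np.array([1, 2, 3])
print(np.resize(a, (5,)))  # [1 2 3 1 2] -- the data is repeated
# A versioned-hdf5 dataset resized from (3,) to (5,) would instead read
# back as [1 2 3 f f], where f is the dataset's fillvalue.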
Code example #14
File: wrappers.py Project: melissawm/versioned-hdf5
    def __init__(self, _id):
        # super __init__ is handled by DatasetID.__cinit__ automatically
        self.data_dict = {}
        with phil:
            sid = self.get_space()
            self._shape = sid.get_simple_extent_dims()

        dcpl = self.get_create_plist()
        # Same as dataset.get_virtual_sources
        virtual_sources = [
            VDSmap(dcpl.get_virtual_vspace(j), dcpl.get_virtual_filename(j),
                   dcpl.get_virtual_dsetname(j), dcpl.get_virtual_srcspace(j))
            for j in range(dcpl.get_virtual_count())
        ]

        slice_map = {
            spaceid_to_slice(i.vspace): spaceid_to_slice(i.src_space)
            for i in virtual_sources
        }

        # slice_map = {i.args[0]: j.args[0] for i, j in slice_map.items()}
        fid = h5i.get_file_id(self)
        g = Group(fid)
        raw_data_name = virtual_sources[0].dset_name
        assert all(i.dset_name == raw_data_name for i in virtual_sources)
        self.raw_data = g[raw_data_name]
        self.chunks = tuple(self.raw_data.attrs['chunks'])

        for s in slice_map:
            src_idx = slice_map[s]
            if isinstance(src_idx, Tuple):
                # The pointers to the raw data should only be slices, since
                # the raw data chunks are extended in the first dimension
                # only.
                assert src_idx != Tuple()
                assert len(src_idx.args) == len(self.chunks)
                src_idx = src_idx.args[0]
            assert isinstance(src_idx, Slice)
            self.data_dict[s] = src_idx

        fillvalue_a = np.empty((1, ), dtype=self.dtype)
        dcpl.get_fill_value(fillvalue_a)
        self.fillvalue = fillvalue_a[0]
Code example #15
File: tuple.py Project: joelostblom/ndindex
 def time_constructor_ndindex(self):
     Tuple(*self.t.args)
Code example #16
File: tuple.py Project: joelostblom/ndindex
 def time_constructor_invalid_ellipses(self):
     try:
         Tuple(0, ..., ...)
     except IndexError:
         pass
Code example #17
def test_write_dataset_offset_multidimension(h5file):
    chunks = ChunkSize(3 * (CHUNK_SIZE_3D, ))
    shape = (2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D)
    data = np.zeros(shape)
    slices1 = write_dataset(h5file, 'test_data', data, chunks=chunks)
    shape2 = (2 * CHUNK_SIZE_3D - 2, 2 * CHUNK_SIZE_3D - 2,
              2 * CHUNK_SIZE_3D - 2)
    data2 = np.empty(shape2)
    for n, c in enumerate(chunks.indices(shape)):
        data2[c.raw] = n

    slices2 = write_dataset(h5file, 'test_data', data2, chunks=chunks)

    assert slices1 == {
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ):
        slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
        ):
        slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ):
        slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
        ):
        slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ):
        slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
        ):
        slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ):
        slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
        ):
        slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
    }

    assert slices2 == {
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ):
        slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
        ):
        slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D),
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ):
        slice(2 * CHUNK_SIZE_3D, 3 * CHUNK_SIZE_3D),
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
        ):
        slice(3 * CHUNK_SIZE_3D, 4 * CHUNK_SIZE_3D),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ):
        slice(4 * CHUNK_SIZE_3D, 5 * CHUNK_SIZE_3D - 2),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
        ):
        slice(5 * CHUNK_SIZE_3D, 6 * CHUNK_SIZE_3D - 2),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ):
        slice(6 * CHUNK_SIZE_3D, 7 * CHUNK_SIZE_3D - 2),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
        ):
        slice(7 * CHUNK_SIZE_3D, 8 * CHUNK_SIZE_3D - 2),
    }

    ds = h5file['/_version_data/test_data/raw_data']
    assert ds.shape == (8 * CHUNK_SIZE_3D, CHUNK_SIZE_3D, CHUNK_SIZE_3D)
    for n, c in enumerate(chunks.indices(shape2)):
        a = np.zeros(chunks)
        a[Tuple(*[slice(0, i) for i in shape2]).as_subindex(c).raw] = n
        assert_equal(ds[n * CHUNK_SIZE_3D:(n + 1) * CHUNK_SIZE_3D], a)
    assert ds.dtype == np.float64
Code example #18
def test_write_dataset_chunks(h5file):
    slices1 = write_dataset(h5file, 'test_data',
                            np.ones((2 * DEFAULT_CHUNK_SIZE, )))
    slices2 = write_dataset_chunks(
        h5file, 'test_data', {
            Tuple(Slice(0 * DEFAULT_CHUNK_SIZE, 1 * DEFAULT_CHUNK_SIZE, 1)):
            slices1[Tuple(
                Slice(0 * DEFAULT_CHUNK_SIZE, 1 * DEFAULT_CHUNK_SIZE, 1))],
            Tuple(Slice(1 * DEFAULT_CHUNK_SIZE, 2 * DEFAULT_CHUNK_SIZE, 1)):
            2 * np.ones((DEFAULT_CHUNK_SIZE, )),
            Tuple(Slice(2 * DEFAULT_CHUNK_SIZE, 3 * DEFAULT_CHUNK_SIZE, 1)):
            2 * np.ones((DEFAULT_CHUNK_SIZE, )),
            Tuple(Slice(3 * DEFAULT_CHUNK_SIZE, 4 * DEFAULT_CHUNK_SIZE, 1)):
            3 * np.ones((DEFAULT_CHUNK_SIZE, )),
        })

    assert slices1 == {
        Tuple(Slice(0 * DEFAULT_CHUNK_SIZE, 1 * DEFAULT_CHUNK_SIZE, 1)):
        slice(0 * DEFAULT_CHUNK_SIZE, 1 * DEFAULT_CHUNK_SIZE),
        Tuple(Slice(1 * DEFAULT_CHUNK_SIZE, 2 * DEFAULT_CHUNK_SIZE, 1)):
        slice(0 * DEFAULT_CHUNK_SIZE, 1 * DEFAULT_CHUNK_SIZE),
    }
    assert slices2 == {
        Tuple(Slice(0 * DEFAULT_CHUNK_SIZE, 1 * DEFAULT_CHUNK_SIZE, 1)):
        slice(0 * DEFAULT_CHUNK_SIZE, 1 * DEFAULT_CHUNK_SIZE),
        Tuple(Slice(1 * DEFAULT_CHUNK_SIZE, 2 * DEFAULT_CHUNK_SIZE, 1)):
        slice(1 * DEFAULT_CHUNK_SIZE, 2 * DEFAULT_CHUNK_SIZE),
        Tuple(Slice(2 * DEFAULT_CHUNK_SIZE, 3 * DEFAULT_CHUNK_SIZE, 1)):
        slice(1 * DEFAULT_CHUNK_SIZE, 2 * DEFAULT_CHUNK_SIZE),
        Tuple(Slice(3 * DEFAULT_CHUNK_SIZE, 4 * DEFAULT_CHUNK_SIZE, 1)):
        slice(2 * DEFAULT_CHUNK_SIZE, 3 * DEFAULT_CHUNK_SIZE),
    }

    ds = h5file['/_version_data/test_data/raw_data']
    assert ds.shape == (3 * DEFAULT_CHUNK_SIZE, )
    assert_equal(ds[0:1 * DEFAULT_CHUNK_SIZE], 1.0)
    assert_equal(ds[1 * DEFAULT_CHUNK_SIZE:2 * DEFAULT_CHUNK_SIZE], 2.0)
    assert_equal(ds[2 * DEFAULT_CHUNK_SIZE:3 * DEFAULT_CHUNK_SIZE], 3.0)
    assert_equal(ds[3 * DEFAULT_CHUNK_SIZE:4 * DEFAULT_CHUNK_SIZE], 0.0)
    assert ds.dtype == np.float64
Code example #19
File: tuple.py Project: joelostblom/ndindex
 def time_constructor_invalid_array_broadcast(self):
     try:
         Tuple([0, 1], [0, 1, 2])
     except IndexError:
         pass
Code example #20
File: wrappers.py Project: melissawm/versioned-hdf5
    def __setitem__(self, args, val):
        """ Write to the HDF5 dataset from a Numpy array.

        NumPy's broadcasting rules are honored, for "simple" indexing
        (slices and integers).  For advanced indexing, the shapes must
        match.
        """
        self.parent._check_committed()
        # This boilerplate code is based on h5py.Dataset.__setitem__
        args = args if isinstance(args, tuple) else (args, )

        # Sort field indices from the slicing
        names = tuple(x for x in args if isinstance(x, str))
        args = tuple(x for x in args if not isinstance(x, str))

        # Generally we try to avoid converting the arrays on the Python
        # side.  However, for compound literals this is unavoidable.
        vlen = h5t.check_vlen_dtype(self.dtype)
        if vlen is not None and vlen not in (bytes, str):
            try:
                val = np.asarray(val, dtype=vlen)
            except ValueError:
                try:
                    val = np.array([np.array(x, dtype=vlen) for x in val],
                                   dtype=self.dtype)
                except ValueError:
                    pass
            if vlen == val.dtype:
                if val.ndim > 1:
                    tmp = np.empty(shape=val.shape[:-1], dtype=object)
                    tmp.ravel()[:] = [
                        i for i in val.reshape((
                            np.product(val.shape[:-1], dtype=np.ulonglong),
                            val.shape[-1]))
                    ]
                else:
                    tmp = np.array([None], dtype=object)
                    tmp[0] = val
                val = tmp
        elif self.dtype.kind == "O" or \
          (self.dtype.kind == 'V' and \
          (not isinstance(val, np.ndarray) or val.dtype.kind != 'V') and \
          (self.dtype.subdtype == None)):
            if len(names) == 1 and self.dtype.fields is not None:
                # Single field selected for write, from a non-array source
                if not names[0] in self.dtype.fields:
                    raise ValueError("No such field for indexing: %s" %
                                     names[0])
                dtype = self.dtype.fields[names[0]][0]
                cast_compound = True
            else:
                dtype = self.dtype
                cast_compound = False

            val = np.asarray(val, dtype=dtype.base, order='C')
            if cast_compound:
                val = val.view(np.dtype([(names[0], dtype)]))
                val = val.reshape(val.shape[:len(val.shape) -
                                            len(dtype.shape)])
        else:
            val = np.asarray(val, order='C')

        # Check for array dtype compatibility and convert
        if self.dtype.subdtype is not None:
            shp = self.dtype.subdtype[1]
            valshp = val.shape[-len(shp):]
            if valshp != shp:  # Last dimension has to match
                raise TypeError(
                    "When writing to array types, last N dimensions have to match (got %s, but should be %s)"
                    % (
                        valshp,
                        shp,
                    ))
            mtype = h5t.py_create(np.dtype((val.dtype, shp)))
            # mshape = val.shape[0:len(val.shape)-len(shp)]

        # Make a compound memory type if field-name slicing is required
        elif len(names) != 0:

            # mshape = val.shape

            # Catch common errors
            if self.dtype.fields is None:
                raise TypeError(
                    "Illegal slicing argument (not a compound dataset)")
            mismatch = [x for x in names if x not in self.dtype.fields]
            if len(mismatch) != 0:
                mismatch = ", ".join('"%s"' % x for x in mismatch)
                raise ValueError(
                    "Illegal slicing argument (fields %s not in dataset type)"
                    % mismatch)

            # Write non-compound source into a single dataset field
            if len(names) == 1 and val.dtype.fields is None:
                subtype = h5t.py_create(val.dtype)
                mtype = h5t.create(h5t.COMPOUND, subtype.get_size())
                mtype.insert(self._e(names[0]), 0, subtype)

            # Make a new source type keeping only the requested fields
            else:
                fieldnames = [x for x in val.dtype.names
                              if x in names]  # Keep source order
                mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize)
                for fieldname in fieldnames:
                    subtype = h5t.py_create(val.dtype.fields[fieldname][0])
                    offset = val.dtype.fields[fieldname][1]
                    mtype.insert(self._e(fieldname), offset, subtype)

        # Use mtype derived from array (let DatasetID.write figure it out)
        else:
            mtype = None

        # === END CODE FROM h5py.Dataset.__setitem__ ===

        idx = ndindex(args).reduce(self.shape)

        val = np.broadcast_to(val, idx.newshape(self.shape))

        for c, index in as_subchunks(idx, self.shape, self.chunks):
            if isinstance(self.id.data_dict[c], (slice, Slice, tuple, Tuple)):
                raw_idx = Tuple(self.id.data_dict[c],
                                *[slice(0, len(i)) for i in c.args[1:]]).raw
                a = self.id._read_chunk(raw_idx)
                self.id.data_dict[c] = a

            if self.id.data_dict[c].size != 0:
                val_idx = c.as_subindex(idx)
                self.id.data_dict[c][index.raw] = val[val_idx.raw]
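The loop at the end pairs every chunk c that intersects the requested index with the part of the index falling inside that chunk. ndindex's ChunkSize.as_subchunks provides the same pairing (the as_subchunks helper used above appears to be a project-level wrapper around it); a minimal sketch under that assumption:

import numpy as np
from ndindex import ChunkSize, ndindex

shape, chunks = (10,), ChunkSize((4,))
arr = np.zeros(shape)
idx = ndindex(slice(3, 9)).expand(shape)

for c in chunks.as_subchunks(idx, shape):
    index = idx.as_subindex(c)  # the part of idx inside chunk c, relative
                                # to the chunk's own coordinates
    arr[c.raw][index.raw] = 1   # arr[c.raw] is a view, so this writes through

print(arr)  # [0. 0. 0. 1. 1. 1. 1. 1. 1. 0.]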
Code example #21
File: wrappers.py Project: melissawm/versioned-hdf5
    def __getitem__(self, args, new_dtype=None):
        """ Read a slice from the HDF5 dataset.

        Takes slices and recarray-style field names (more than one is
        allowed!) in any order.  Obeys basic NumPy rules, including
        broadcasting.

        """
        # This boilerplate code is based on h5py.Dataset.__getitem__
        args = args if isinstance(args, tuple) else (args, )

        if new_dtype is None:
            new_dtype = getattr(self._local, 'astype', None)

        # Sort field names from the rest of the args.
        names = tuple(x for x in args if isinstance(x, str))

        if names:
            # Read a subset of the fields in this structured dtype
            if len(names) == 1:
                names = names[0]  # Read with simpler dtype of this field
            args = tuple(x for x in args if not isinstance(x, str))
            return self.fields(names, _prior_dtype=new_dtype)[args]

        if new_dtype is None:
            new_dtype = self.dtype
        mtype = h5t.py_create(new_dtype)

        # === Special-case region references ====

        if len(args) == 1 and isinstance(args[0], h5r.RegionReference):

            obj = h5r.dereference(args[0], self.id)
            if obj != self.id:
                raise ValueError("Region reference must point to this dataset")

            sid = h5r.get_region(args[0], self.id)
            mshape = guess_shape(sid)
            if mshape is None:
                # 0D with no data (NULL or deselected SCALAR)
                return Empty(new_dtype)
            out = np.empty(mshape, dtype=new_dtype)
            if out.size == 0:
                return out

            sid_out = h5s.create_simple(mshape)
            sid_out.select_all()
            self.id.read(sid_out, sid, out, mtype)
            return out

        # === END CODE FROM h5py.Dataset.__getitem__ ===

        idx = ndindex(args).reduce(self.shape)

        arr = np.ndarray(idx.newshape(self.shape), new_dtype, order='C')

        for c, index in as_subchunks(idx, self.shape, self.chunks):
            if isinstance(self.id.data_dict[c], (slice, Slice, tuple, Tuple)):
                raw_idx = Tuple(self.id.data_dict[c],
                                *[slice(0, len(i)) for i in c.args[1:]]).raw
                a = self.id._read_chunk(raw_idx)
                self.id.data_dict[c] = a

            if self.id.data_dict[c].size != 0:
                arr_idx = c.as_subindex(idx)
                arr[arr_idx.raw] = self.id.data_dict[c][index.raw]

        return arr
Code example #22
File: tuple.py Project: joelostblom/ndindex
 def time_constructor_arrays(self):
     Tuple([[0, 1], [1, 2]], 2, [0, 1])
Code example #23
File: tuple.py Project: joelostblom/ndindex
 def setup(self):
     self.t = Tuple(slice(0, 10), ..., 1)
     self.t_arrays = Tuple([[0, 1], [1, 2]], 2, [0, 1])
     self.t_boolean_scalars = Tuple(True, 0, False)
Code example #24
def test_write_dataset_chunk_size(h5file):
    chunk_size = 2**10
    chunks = (chunk_size, )
    slices1 = write_dataset(h5file,
                            'test_data',
                            np.ones((2 * chunk_size, )),
                            chunks=chunks)
    raises(
        ValueError, lambda: write_dataset(
            h5file, 'test_data', np.ones(chunks), chunks=(2**9, )))
    slices2 = write_dataset_chunks(
        h5file, 'test_data', {
            Tuple(Slice(0 * chunk_size, 1 * chunk_size, 1)):
            slices1[Tuple(Slice(0 * chunk_size, 1 * chunk_size, 1))],
            Tuple(Slice(1 * chunk_size, 2 * chunk_size, 1)):
            2 * np.ones((chunk_size, )),
            Tuple(Slice(2 * chunk_size, 3 * chunk_size, 1)):
            2 * np.ones((chunk_size, )),
            Tuple(Slice(3 * chunk_size, 4 * chunk_size, 1)):
            3 * np.ones((chunk_size, )),
        })

    assert slices1 == {
        Tuple(Slice(0 * chunk_size, 1 * chunk_size, 1)):
        slice(0 * chunk_size, 1 * chunk_size),
        Tuple(Slice(1 * chunk_size, 2 * chunk_size, 1)):
        slice(0 * chunk_size, 1 * chunk_size),
    }
    assert slices2 == {
        Tuple(Slice(0 * chunk_size, 1 * chunk_size, 1)):
        slice(0 * chunk_size, 1 * chunk_size),
        Tuple(Slice(1 * chunk_size, 2 * chunk_size, 1)):
        slice(1 * chunk_size, 2 * chunk_size),
        Tuple(Slice(2 * chunk_size, 3 * chunk_size, 1)):
        slice(1 * chunk_size, 2 * chunk_size),
        Tuple(Slice(3 * chunk_size, 4 * chunk_size, 1)):
        slice(2 * chunk_size, 3 * chunk_size),
    }

    ds = h5file['/_version_data/test_data/raw_data']
    assert ds.shape == (3 * chunk_size, )
    assert_equal(ds[0:1 * chunk_size], 1.0)
    assert_equal(ds[1 * chunk_size:2 * chunk_size], 2.0)
    assert_equal(ds[2 * chunk_size:3 * chunk_size], 3.0)
    assert_equal(ds[3 * chunk_size:4 * chunk_size], 0.0)
    assert ds.dtype == np.float64
Code example #25
File: tuple.py Project: joelostblom/ndindex
 def time_constructor_builtin(self):
     Tuple(slice(0, 10), ..., 1)
Code example #26
def _recreate_raw_data(f, name, versions_to_delete, tmp=False):
    """
    Return a new raw data set for a dataset without the chunks from
    versions_to_delete.

    If no chunks would be left, i.e., the dataset does not appear in any
    version that is not in versions_to_delete, an empty dict is returned.

    If tmp is True, the new raw dataset is called '_tmp_raw_data' and is
    placed alongside the existing raw dataset. Otherwise the existing raw
    dataset is replaced.

    """
    chunks_map = defaultdict(dict)

    for version_name in all_versions(f):
        if (version_name in versions_to_delete
                or name not in f['_version_data/versions'][version_name]):
            continue

        dataset = f['_version_data/versions'][version_name][name]

        virtual_sources = dataset.virtual_sources()
        slice_map = {
            spaceid_to_slice(i.vspace): spaceid_to_slice(i.src_space)
            for i in virtual_sources
        }
        chunks_map[version_name].update(slice_map)

    chunks_to_keep = set().union(
        *[map.values() for map in chunks_map.values()])

    chunks_to_keep = sorted(chunks_to_keep, key=lambda i: i.args[0].args[0])

    if not chunks_to_keep:
        return {}

    raw_data = f['_version_data'][name]['raw_data']
    chunks = ChunkSize(raw_data.chunks)
    new_shape = (len(chunks_to_keep) * chunks[0], *chunks[1:])

    new_raw_data = f['_version_data'][name].create_dataset(
        '_tmp_raw_data',
        shape=new_shape,
        maxshape=(None, ) + chunks[1:],
        chunks=raw_data.chunks,
        dtype=raw_data.dtype,
        compression=raw_data.compression,
        compression_opts=raw_data.compression_opts,
        fillvalue=raw_data.fillvalue)
    for key, val in raw_data.attrs.items():
        new_raw_data.attrs[key] = val

    r = raw_data[:]
    n = np.full(new_raw_data.shape,
                new_raw_data.fillvalue,
                dtype=new_raw_data.dtype)
    raw_data_chunks_map = {}
    for new_chunk, chunk in zip(chunks.indices(new_shape), chunks_to_keep):
        # Shrink new_chunk to the size of chunk, in case chunk isn't a full
        # chunk in one of the dimensions.
        # TODO: Implement something in ndindex to do this.
        new_chunk = Tuple(*[
            Slice(new_chunk.args[i].start, new_chunk.args[i].start +
                  len(chunk.args[i])) for i in range(len(new_chunk.args))
        ])
        raw_data_chunks_map[chunk] = new_chunk
        n[new_chunk.raw] = r[chunk.raw]

    new_raw_data[:] = n
    if not tmp:
        del f['_version_data'][name]['raw_data']
        f['_version_data'][name].move('_tmp_raw_data', 'raw_data')

    return raw_data_chunks_map
Code example #27
def test_create_version_chunks(h5file):
    chunk_size = 2**10
    chunks = (chunk_size,)
    data = np.concatenate((np.ones((2*chunk_size,)),
                           2*np.ones(chunks),
                           3*np.ones(chunks)))
    # TODO: Support creating the initial version with chunks
    version1 = create_version_group(h5file, 'version1')
    commit_version(version1, {'test_data': data},
                   chunks={'test_data': chunks},
                   compression={'test_data': 'gzip'},
                   compression_opts={'test_data': 3})
    version_bad = create_version_group(h5file, 'version_bad', '')
    raises(ValueError, lambda: commit_version(version_bad,
                                              {'test_data': data},
                                              chunks={'test_data': (2**9,)}))
    delete_version(h5file, 'version_bad', 'version1')

    version_bad = create_version_group(h5file, 'version_bad', '')
    raises(ValueError, lambda: commit_version(version_bad,
                                              {'test_data': data},
                                              compression={'test_data':'lzf'}))
    delete_version(h5file, 'version_bad', 'version1')

    version_bad = create_version_group(h5file, 'version_bad', '')
    raises(ValueError, lambda: commit_version(version_bad,
                                              {'test_data': data},
                                              compression_opts={'test_data':4}))
    delete_version(h5file, 'version_bad', 'version1')

    assert_equal(h5file['_version_data/versions/version1/test_data'], data)

    ds = h5file['/_version_data/test_data/raw_data']

    assert ds.shape == (3*chunk_size,)
    assert_equal(ds[0:1*chunk_size], 1.0)
    assert_equal(ds[1*chunk_size:2*chunk_size], 2.0)
    assert_equal(ds[2*chunk_size:3*chunk_size], 3.0)
    assert ds.compression == 'gzip'
    assert ds.compression_opts == 3

    data2_chunks = {
        Tuple(Slice(0*chunk_size, 1*chunk_size, 1)): np.ones(chunks),
        Tuple(Slice(1*chunk_size, 2*chunk_size, 1)): np.ones(chunks),
        Tuple(Slice(2*chunk_size, 3*chunk_size, 1)): 2*np.ones(chunks),
        Tuple(Slice(3*chunk_size, 4*chunk_size, 1)): 3*np.ones(chunks),
    }
    data2_chunks[Tuple(Slice(0*chunk_size, 1*chunk_size, 1))][0] = 0.0
    data[0] = 0.0

    version2 = create_version_group(h5file, 'version2')
    commit_version(version2, {'test_data':  data2_chunks})
    assert_equal(h5file['_version_data/versions/version2/test_data'], data)

    assert ds.shape == (4*chunk_size,)
    assert_equal(ds[0:1*chunk_size], 1.0)
    assert_equal(ds[1*chunk_size:2*chunk_size], 2.0)
    assert_equal(ds[2*chunk_size:3*chunk_size], 3.0)
    assert_equal(ds[3*chunk_size], 0.0)
    assert_equal(ds[3*chunk_size+1:4*chunk_size], 1.0)
    assert ds.compression == 'gzip'
    assert ds.compression_opts == 3

    data3_chunks = {
        Tuple(Slice(0*chunk_size, 1*chunk_size, 1)): np.ones(chunks),
        Tuple(Slice(1*chunk_size, 2*chunk_size, 1)): Slice(0*chunk_size, 1*chunk_size),
        Tuple(Slice(2*chunk_size, 3*chunk_size, 1)): Slice(1*chunk_size, 2*chunk_size),
        Tuple(Slice(3*chunk_size, 4*chunk_size, 1)): Slice(2*chunk_size, 3*chunk_size),
    }
    data3_chunks[Tuple(Slice(0*chunk_size, 1*chunk_size, 1))][0] = 2.0
    data[0] = 2.0

    version3 = create_version_group(h5file, 'version3')
    commit_version(version3, {'test_data':  data3_chunks})
    assert_equal(h5file['_version_data/versions/version3/test_data'], data)

    assert ds.shape == (5*chunk_size,)
    assert_equal(ds[0:1*chunk_size], 1.0)
    assert_equal(ds[1*chunk_size:2*chunk_size], 2.0)
    assert_equal(ds[2*chunk_size:3*chunk_size], 3.0)
    assert_equal(ds[3*chunk_size], 0.0)
    assert_equal(ds[3*chunk_size+1:4*chunk_size], 1.0)
    assert_equal(ds[4*chunk_size], 2.0)
    assert_equal(ds[4*chunk_size+1:5*chunk_size], 1.0)

    assert set(all_versions(h5file)) == {'version1', 'version2', 'version3'}
Code example #28
File: wrappers.py Project: asmeurer/versioned-hdf5
 def __setitem__(self, index, value):
     if isinstance(self.dataset, InMemoryDataset) and \
             ndindex(index).expand(self.shape) == Tuple().expand(self.shape):
         new_dataset = InMemoryArrayDataset(self.name,
                                            np.broadcast_to(value, self.shape).astype(self.dtype),
                                            self.parent,
                                            fillvalue=self.fillvalue, chunks=self.chunks)
         new_dataset.attrs = self.dataset.attrs
         self.dataset = new_dataset
         return
     self.dataset.__setitem__(index, value)
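The guard here detects a whole-dataset assignment: expanding any index against the shape normalizes it to a canonical Tuple, and the empty Tuple() expands to an index selecting everything, so equality means the write covers the entire dataset. A small check of that equivalence (assuming only ndindex):

from ndindex import Tuple, ndindex

shape = (4, 5)
print(ndindex(slice(None)).expand(shape) == Tuple().expand(shape))  # True
print(ndindex(slice(1, 3)).expand(shape) == Tuple().expand(shape))  # False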
Code example #29
File: backend.py Project: deshaw/versioned-hdf5
def create_virtual_dataset(f, version_name, name, shape, slices, attrs=None, fillvalue=None):
    from h5py._hl.selections import select
    from h5py._hl.vds import VDSmap

    raw_data = f['_version_data'][name]['raw_data']
    raw_data_shape = raw_data.shape
    slices = {c: s.reduce() for c, s in slices.items()}

    if len(raw_data) == 0:
        shape = ()
        layout = VirtualLayout((1,), dtype=raw_data.dtype)
        vs = VirtualSource('.', name=raw_data.name, shape=(1,), dtype=raw_data.dtype)
        layout[0] = vs[()]
    else:
        # Chunks in the raw dataset are expanded along the first dimension only.
        # Since the chunks are pointed to by virtual datasets, it doesn't make
        # sense to expand the chunks in the raw dataset along multiple dimensions
        # (the true layout of the chunks in the raw dataset is irrelevant).
        for c, s in slices.items():
            if len(c.args[0]) != len(s):
                raise ValueError(f"Inconsistent slices dictionary ({c.args[0]}, {s})")

        # h5py 3.3 changed the VirtualLayout code. See
        # https://github.com/h5py/h5py/pull/1905.
        layout = VirtualLayout(shape, dtype=raw_data.dtype)
        layout_has_sources = hasattr(layout, 'sources')
        if not layout_has_sources:
            vs = VirtualSource('.', name=raw_data.name, shape=raw_data.shape, dtype=raw_data.dtype)

        for c, s in slices.items():
            if c.isempty():
                continue
            # idx = Tuple(s, *Tuple(*[slice(0, i) for i in shape[1:]]).as_subindex(Tuple(*c.args[1:])).args)
            S = [Slice(0, len(c.args[i])) for i in range(1, len(shape))]
            idx = Tuple(s, *S)
            # assert c.newshape(shape) == vs[idx.raw].shape, (c, shape, s)

            if not layout_has_sources:
                # TODO: Use a faster workaround here too
                layout[c.raw] = vs[idx.raw]
            else:
                # This is equivalent, but it is faster because vs[idx.raw] does a deepcopy(vs), which
                # is slow.
                vs_sel = select(raw_data_shape, idx.raw, None)
                layout_sel = select(shape, c.raw, None)
                layout.sources.append(VDSmap(layout_sel.id,
                                       '.',
                                       raw_data.name,
                                       vs_sel.id))

    dtype = raw_data.dtype
    if dtype.metadata and ('vlen' in dtype.metadata or 'h5py_encoding' in dtype.metadata):
        # Variable length string dtype
        # (https://h5py.readthedocs.io/en/2.10.0/strings.html). Setting the
        # fillvalue in this case doesn't work
        # (https://github.com/h5py/h5py/issues/941).
        if fillvalue not in [0, '', b'', None]:
            raise ValueError("Non-default fillvalue not supported for variable length strings")
        fillvalue = None

    virtual_data = f['_version_data/versions'][version_name].create_virtual_dataset(name, layout, fillvalue=fillvalue)

    if attrs:
        for k, v in attrs.items():
            virtual_data.attrs[k] = v
    virtual_data.attrs['raw_data'] = raw_data.name
    virtual_data.attrs['chunks'] = raw_data.chunks
    return virtual_data
Code example #30
File: tuple.py Project: joelostblom/ndindex
 def time_constructor_boolean_scalars(self):
     Tuple(True, 0, False)