def resize(self, size, axis=None):
    self.parent._check_committed()
    if axis is not None:
        if not (axis >= 0 and axis < self.ndim):
            raise ValueError("Invalid axis (0 to %s allowed)" % (self.ndim - 1))
        try:
            newlen = int(size)
        except TypeError:
            raise TypeError("Argument must be a single int if axis is specified")
        size = list(self.shape)
        size[axis] = newlen

    old_shape = self.shape
    size = tuple(size)
    if all(new <= old for new, old in zip(size, old_shape)):
        # Don't create a new array if the old one can just be sliced in
        # memory.
        idx = tuple(slice(0, i) for i in size)
        self.array = self.array[idx]
    else:
        old_shape_idx = Tuple(*[Slice(0, i) for i in old_shape])
        new_shape_idx = Tuple(*[Slice(0, i) for i in size])
        new_array = np.full(size, self.fillvalue, dtype=self.dtype)
        new_array[old_shape_idx.as_subindex(new_shape_idx).raw] = \
            self.array[new_shape_idx.as_subindex(old_shape_idx).raw]
        self.array = new_array
def spaceid_to_slice(space):
    """
    Convert an h5py spaceid object into an ndindex index

    The resulting index is always a Tuple index.
    """
    from h5py import h5s

    sel_type = space.get_select_type()

    if sel_type == h5s.SEL_ALL:
        return Tuple()
    elif sel_type == h5s.SEL_HYPERSLABS:
        slices = []
        starts, strides, counts, blocks = space.get_regular_hyperslab()
        for _start, _stride, count, block in zip(starts, strides, counts, blocks):
            start = _start
            if not (block == 1 or count == 1):
                raise NotImplementedError("Nontrivial blocks are not yet supported")
            end = _start + (_stride*(count - 1) + 1)*block
            stride = _stride if block == 1 else 1
            slices.append(Slice(start, end, stride))
        return Tuple(*slices)
    elif sel_type == h5s.SEL_NONE:
        return Tuple(Slice(0, 0),)
    else:
        raise NotImplementedError("Point selections are not yet supported")
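# --- Added usage sketch (not part of the original source) ---
# A minimal, hedged example of spaceid_to_slice: it assumes an already-written
# HDF5 file containing a 1-D dataset named 'data' with at least 19 elements
# (both names are hypothetical). A regular hyperslab selection made on the
# low-level dataspace converts to an ndindex Tuple of Slices.
def _demo_spaceid_to_slice(path='example.h5'):
    import h5py
    from ndindex import Tuple, Slice

    with h5py.File(path, 'r') as f:
        space = f['data'].id.get_space()            # low-level dataspace
        # start=(0,), count=(10,), stride=(2,), block=(1,): every other
        # element of the first 19.
        space.select_hyperslab((0,), (10,), (2,), (1,))
        idx = spaceid_to_slice(space)
        # end = start + (stride*(count - 1) + 1)*block = 19
        assert idx == Tuple(Slice(0, 19, 2))
        return idx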
def test_write_dataset_offset_chunk_size(h5file):
    chunk_size = 2**10
    chunks = (chunk_size,)
    slices1 = write_dataset(h5file, 'test_data', 1 * np.ones((2 * chunk_size,)),
                            chunks=chunks)
    slices2 = write_dataset(
        h5file, 'test_data',
        np.concatenate((2 * np.ones(chunks), 2 * np.ones(chunks),
                        3 * np.ones((chunk_size - 2,)))))

    assert slices1 == {
        Tuple(Slice(0 * chunk_size, 1 * chunk_size, 1)):
            slice(0 * chunk_size, 1 * chunk_size),
        Tuple(Slice(1 * chunk_size, 2 * chunk_size, 1)):
            slice(0 * chunk_size, 1 * chunk_size),
    }
    assert slices2 == {
        Tuple(Slice(0 * chunk_size, 1 * chunk_size, 1)):
            slice(1 * chunk_size, 2 * chunk_size),
        Tuple(Slice(1 * chunk_size, 2 * chunk_size, 1)):
            slice(1 * chunk_size, 2 * chunk_size),
        Tuple(Slice(2 * chunk_size, 3 * chunk_size - 2, 1)):
            slice(2 * chunk_size, 3 * chunk_size - 2),
    }

    ds = h5file['/_version_data/test_data/raw_data']
    assert ds.shape == (3 * chunk_size,)
    assert_equal(ds[0 * chunk_size:1 * chunk_size], 1.0)
    assert_equal(ds[1 * chunk_size:2 * chunk_size], 2.0)
    assert_equal(ds[2 * chunk_size:3 * chunk_size - 2], 3.0)
    assert_equal(ds[3 * chunk_size - 2:4 * chunk_size], 0.0)
def create_virtual_dataset(f, version_name, name, slices, attrs=None, fillvalue=None):
    raw_data = f['_version_data'][name]['raw_data']
    chunks = tuple(raw_data.attrs['chunks'])
    slices = {c: s.reduce() for c, s in slices.items()}

    shape = tuple(
        [max(c.args[i].stop for c in slices) for i in range(len(chunks))])

    # Chunks in the raw dataset are expanded along the first dimension only.
    # Since the chunks are pointed to by virtual datasets, it doesn't make
    # sense to expand the chunks in the raw dataset along multiple dimensions
    # (the true layout of the chunks in the raw dataset is irrelevant).
    for c, s in slices.items():
        if len(c.args[0]) != len(s):
            raise ValueError(
                f"Inconsistent slices dictionary ({c.args[0]}, {s})")

    layout = VirtualLayout(shape, dtype=raw_data.dtype)
    vs = VirtualSource('.', name=raw_data.name, shape=raw_data.shape,
                       dtype=raw_data.dtype)

    for c, s in slices.items():
        # TODO: This needs to handle more than one dimension
        idx = Tuple(
            s, *Tuple(*[slice(0, i) for i in shape]).as_subindex(c).args[1:])
        assert c.newshape(shape) == vs[idx.raw].shape, (c, shape, s)
        layout[c.raw] = vs[idx.raw]

    dtype = raw_data.dtype
    if dtype.metadata and ('vlen' in dtype.metadata or 'h5py_encoding' in dtype.metadata):
        # Variable length string dtype
        # (https://h5py.readthedocs.io/en/2.10.0/strings.html). Setting the
        # fillvalue in this case doesn't work
        # (https://github.com/h5py/h5py/issues/941).
        if fillvalue not in [0, '', b'', None]:
            raise ValueError(
                "Non-default fillvalue not supported for variable length strings")
        fillvalue = None

    virtual_data = f['_version_data/versions'][version_name].create_virtual_dataset(
        name, layout, fillvalue=fillvalue)

    if attrs:
        for k, v in attrs.items():
            virtual_data.attrs[k] = v
    return virtual_data
def split_chunks(shape, chunks):
    """
    Yield a set of ndindex indices for chunks over shape

    If the shape is not a multiple of the chunk size, some chunks will be
    truncated. For example, if a has shape (10, 19) and is chunked into
    chunks of shape (5, 5):

    >>> from versioned_hdf5.slicetools import split_chunks
    >>> for i in split_chunks((10, 19), (5, 5)):
    ...     print(i)
    Tuple(slice(0, 5, 1), slice(0, 5, 1))
    Tuple(slice(0, 5, 1), slice(5, 10, 1))
    Tuple(slice(0, 5, 1), slice(10, 15, 1))
    Tuple(slice(0, 5, 1), slice(15, 19, 1))
    Tuple(slice(5, 10, 1), slice(0, 5, 1))
    Tuple(slice(5, 10, 1), slice(5, 10, 1))
    Tuple(slice(5, 10, 1), slice(10, 15, 1))
    Tuple(slice(5, 10, 1), slice(15, 19, 1))
    """
    if len(shape) != len(chunks):
        raise ValueError("chunks shape must equal the array shape")
    if len(shape) == 0:
        raise NotImplementedError("Scalar datasets")

    d = [math.ceil(i/c) for i, c in zip(shape, chunks)]
    for c in product(*[range(i) for i in d]):
        # c = (0, 0, 0), (0, 0, 1), ...
        yield Tuple(*[Slice(chunk_size*i, min(chunk_size*(i + 1), n), 1)
                      for n, chunk_size, i in zip(shape, chunks, c)])
def test_create_virtual_dataset(h5file):
    with h5file as f:
        slices1 = write_dataset(f, 'test_data', np.ones((2 * DEFAULT_CHUNK_SIZE,)))
        slices2 = write_dataset(
            f, 'test_data',
            np.concatenate((2 * np.ones((DEFAULT_CHUNK_SIZE,)),
                            3 * np.ones((DEFAULT_CHUNK_SIZE,)))))

        virtual_data = create_virtual_dataset(
            f, 'test_version', 'test_data', (3 * DEFAULT_CHUNK_SIZE,), {
                **slices1,
                Tuple(Slice(2 * DEFAULT_CHUNK_SIZE, 3 * DEFAULT_CHUNK_SIZE, 1),):
                    slices2[(Slice(1 * DEFAULT_CHUNK_SIZE, 2 * DEFAULT_CHUNK_SIZE, 1),)]
            })

        assert virtual_data.shape == (3 * DEFAULT_CHUNK_SIZE,)
        assert_equal(virtual_data[0:2 * DEFAULT_CHUNK_SIZE], 1.0)
        assert_equal(virtual_data[2 * DEFAULT_CHUNK_SIZE:3 * DEFAULT_CHUNK_SIZE], 3.0)
        assert virtual_data.dtype == np.float64
def test_create_virtual_dataset_attrs(h5file):
    with h5file as f:
        slices1 = write_dataset(f, 'test_data', np.ones((2 * DEFAULT_CHUNK_SIZE,)))
        slices2 = write_dataset(
            f, 'test_data',
            np.concatenate((2 * np.ones((DEFAULT_CHUNK_SIZE,)),
                            3 * np.ones((DEFAULT_CHUNK_SIZE,)))))

        attrs = {"attribute": "value"}
        virtual_data = create_virtual_dataset(
            f, 'test_version', 'test_data', (3 * DEFAULT_CHUNK_SIZE,), {
                **slices1,
                Tuple(Slice(2 * DEFAULT_CHUNK_SIZE, 3 * DEFAULT_CHUNK_SIZE, 1),):
                    slices2[(Slice(1 * DEFAULT_CHUNK_SIZE, 2 * DEFAULT_CHUNK_SIZE, 1),)]
            },
            attrs=attrs)

        assert dict(virtual_data.attrs) == {
            **attrs,
            "raw_data": '/_version_data/test_data/raw_data',
            "chunks": np.array([DEFAULT_CHUNK_SIZE])
        }
def write_dataset(f, name, data, chunks=None, compression=None,
                  compression_opts=None, fillvalue=None):
    if name not in f['_version_data']:
        return create_base_dataset(f, name, data=data, chunks=chunks,
                                   compression=compression,
                                   compression_opts=compression_opts,
                                   fillvalue=fillvalue)

    ds = f['_version_data'][name]['raw_data']
    if isinstance(chunks, int) and not isinstance(chunks, bool):
        chunks = (chunks,)
    if chunks is None:
        chunks = tuple(ds.attrs['chunks'])
    else:
        if chunks != tuple(ds.attrs['chunks']):
            raise ValueError(
                "Chunk size specified but doesn't match already existing chunk size")

    if compression or compression_opts:
        raise ValueError(
            "Compression options can only be specified for the first version of a dataset")
    if fillvalue is not None and fillvalue != ds.fillvalue:
        raise ValueError(
            f"fillvalues do not match ({fillvalue} != {ds.fillvalue})")
    if data.dtype != ds.dtype:
        raise ValueError(f"dtypes do not match ({data.dtype} != {ds.dtype})")

    # TODO: Handle more than one dimension
    old_shape = ds.shape
    hashtable = Hashtable(f, name)
    slices = {}
    slices_to_write = {}
    chunk_size = chunks[0]
    for s in split_chunks(data.shape, chunks):
        idx = hashtable.largest_index
        data_s = data[s.raw]
        raw_slice = Slice(idx * chunk_size, idx * chunk_size + data_s.shape[0])
        data_hash = hashtable.hash(data_s)
        raw_slice2 = hashtable.setdefault(data_hash, raw_slice)
        if raw_slice2 == raw_slice:
            slices_to_write[raw_slice] = s
        slices[s] = raw_slice2

    ds.resize((old_shape[0] + len(slices_to_write) * chunk_size,) + chunks[1:])
    for raw_slice, s in slices_to_write.items():
        data_s = data[s.raw]
        idx = Tuple(raw_slice, *[slice(0, i) for i in data_s.shape[1:]])
        ds[idx.raw] = data[s.raw]
    return slices
def data_dict(self):
    if self._data_dict is None:
        self._data_dict = {}

        dcpl = self.get_create_plist()
        is_virtual = dcpl.get_layout() == h5d.VIRTUAL

        if not is_virtual:
            # A dataset created with only a fillvalue will be nonvirtual,
            # since create_virtual_dataset makes a nonvirtual dataset when
            # there are no virtual sources.
            slice_map = {}
        # Same as dataset.get_virtual_sources
        elif 0 in self._shape:
            # Work around https://github.com/h5py/h5py/issues/1660
            empty_idx = Tuple().expand(self._shape)
            slice_map = {empty_idx: empty_idx}
        else:
            virtual_sources = [
                VDSmap(dcpl.get_virtual_vspace(j),
                       dcpl.get_virtual_filename(j),
                       dcpl.get_virtual_dsetname(j),
                       dcpl.get_virtual_srcspace(j))
                for j in range(dcpl.get_virtual_count())]

            slice_map = {spaceid_to_slice(i.vspace): spaceid_to_slice(i.src_space)
                         for i in virtual_sources}
            assert self.raw_data.name == virtual_sources[0].dset_name
            assert all(i.dset_name == self.raw_data.name for i in virtual_sources)

        # slice_map = {i.args[0]: j.args[0] for i, j in slice_map.items()}

        for s in slice_map:
            src_idx = slice_map[s]
            if isinstance(src_idx, Tuple):
                # The pointers to the raw data should only be slices, since
                # the raw data chunks are extended in the first dimension
                # only.
                assert src_idx != Tuple()
                assert len(src_idx.args) == len(self.chunks)
                src_idx = src_idx.args[0]
            assert isinstance(src_idx, Slice)
            self._data_dict[s] = src_idx

    return self._data_dict
def write_dataset(f, name, data, chunks=None, dtype=None, compression=None,
                  compression_opts=None, fillvalue=None):
    if name not in f['_version_data']:
        return create_base_dataset(f, name, data=data, dtype=dtype,
                                   chunks=chunks, compression=compression,
                                   compression_opts=compression_opts,
                                   fillvalue=fillvalue)

    ds = f['_version_data'][name]['raw_data']

    if isinstance(chunks, int) and not isinstance(chunks, bool):
        chunks = (chunks,)
    if chunks is None:
        chunks = tuple(ds.attrs['chunks'])
    else:
        if chunks != tuple(ds.attrs['chunks']):
            raise ValueError("Chunk size specified but doesn't match already existing chunk size")

    if dtype is not None:
        if dtype != ds.dtype:
            raise ValueError("dtype specified but doesn't match already existing dtype")

    if compression and compression != ds.compression or compression_opts and compression_opts != ds.compression_opts:
        raise ValueError("Compression options can only be specified for the first version of a dataset")
    if fillvalue is not None and fillvalue != ds.fillvalue:
        dtype = ds.dtype
        if dtype.metadata and ('vlen' in dtype.metadata or 'h5py_encoding' in dtype.metadata):
            # Variable length string dtype. The ds.fillvalue will be None in
            # this case (see create_virtual_dataset() below)
            pass
        else:
            raise ValueError(f"fillvalues do not match ({fillvalue} != {ds.fillvalue})")
    if data.dtype != ds.dtype:
        raise ValueError(f"dtypes do not match ({data.dtype} != {ds.dtype})")

    # TODO: Handle more than one dimension
    old_shape = ds.shape
    slices = {}
    slices_to_write = {}
    chunk_size = chunks[0]
    with Hashtable(f, name) as hashtable:
        if len(data.shape) != 0:
            for s in ChunkSize(chunks).indices(data.shape):
                idx = hashtable.largest_index
                data_s = data[s.raw]
                raw_slice = Slice(idx*chunk_size, idx*chunk_size + data_s.shape[0])
                data_hash = hashtable.hash(data_s)
                raw_slice2 = hashtable.setdefault(data_hash, raw_slice)
                if raw_slice2 == raw_slice:
                    slices_to_write[raw_slice] = s
                slices[s] = raw_slice2

            ds.resize((old_shape[0] + len(slices_to_write)*chunk_size,) + chunks[1:])
            for raw_slice, s in slices_to_write.items():
                # idx = raw_slice.expand(ds.shape[:1] + s.newshape(data.shape)[1:])
                data_s = data[s.raw]
                idx = Tuple(raw_slice, *[slice(0, i) for i in data_s.shape[1:]])
                ds[idx.raw] = data[s.raw]
    return slices
def setup(self):
    from ndindex import (Slice, Tuple, Integer, ellipsis, Newaxis,
                         IntegerArray, BooleanArray)
    self.slice = Slice(0, 4, 2)
    self.integer = Integer(1)
    self.tuple = Tuple(self.slice, ..., 0)
    self.ellipsis = ellipsis()
    self.newaxis = Newaxis()
    self.integer_array = IntegerArray([[1, 2], [-1, 2]])
    self.boolean_array = BooleanArray([[True, False], [False, False]])
def spaceid_to_slice(space):
    """
    Convert an h5py spaceid object into an ndindex index

    The resulting index is always a Tuple index.
    """
    from h5py import h5s

    sel_type = space.get_select_type()

    if sel_type == h5s.SEL_ALL:
        return Tuple()
    elif sel_type == h5s.SEL_HYPERSLABS:
        slices = []
        starts, strides, counts, blocks = space.get_regular_hyperslab()
        for start, stride, count, block in zip(starts, strides, counts, blocks):
            slices.append(hyperslab_to_slice(start, stride, count, block))
        return Tuple(*slices)
    elif sel_type == h5s.SEL_NONE:
        return Tuple(Slice(0, 0),)
    else:
        raise NotImplementedError("Point selections are not yet supported")
def resize(self, size, axis=None):
    """ Resize the dataset, or the specified axis.

    The rank of the dataset cannot be changed.

    "Size" should be a shape tuple, or if an axis is specified, an integer.

    BEWARE: This functions differently than the NumPy resize() method!
    The data is not "reshuffled" to fit in the new shape; each axis is
    grown or shrunk independently.  The coordinates of existing data are
    fixed.
    """
    self.parent._check_committed()
    # This boilerplate code is based on h5py.Dataset.resize
    if axis is not None:
        if not (axis >= 0 and axis < self.id.rank):
            raise ValueError("Invalid axis (0 to %s allowed)" % (self.id.rank - 1))
        try:
            newlen = int(size)
        except TypeError:
            raise TypeError("Argument must be a single int if axis is specified")
        size = list(self.shape)
        size[axis] = newlen

    size = tuple(size)
    # === END CODE FROM h5py.Dataset.resize ===

    old_shape = self.shape
    data_dict = self.id.data_dict
    chunks = self.chunks

    old_shape_idx = Tuple(*[Slice(0, i) for i in old_shape])
    new_data_dict = {}
    for c in set(split_chunks(size, chunks)):
        if c in data_dict:
            new_data_dict[c] = data_dict[c]
        else:
            a = self[c.raw]
            data = np.full(c.newshape(size), self.fillvalue, dtype=self.dtype)
            data[old_shape_idx.as_subindex(c).raw] = a
            new_data_dict[c] = data

    self.id.data_dict = new_data_dict
    self.id.shape = size
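# --- Added usage sketch (not part of the original source) ---
# A hedged illustration of the resize semantics described in the docstring
# above, exercised through the public versioned_hdf5 API: data is not
# reshuffled and existing coordinates stay fixed. `vf` is assumed to be a
# VersionedHDF5File; the version and dataset names are hypothetical.
def _demo_resize(vf):
    import numpy as np
    with vf.stage_version('grow') as sv:
        ds = sv.create_dataset('counts', data=np.arange(4), chunks=(2,),
                               fillvalue=0)
        ds.resize((6,))     # grow along axis 0
        # The original four values keep their coordinates; the new entries
        # take the fillvalue.
        assert list(ds[:]) == [0, 1, 2, 3, 0, 0]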
def __init__(self, _id):
    # super __init__ is handled by DatasetID.__cinit__ automatically
    self.data_dict = {}
    with phil:
        sid = self.get_space()
        self._shape = sid.get_simple_extent_dims()

        dcpl = self.get_create_plist()
        # Same as dataset.get_virtual_sources
        virtual_sources = [
            VDSmap(dcpl.get_virtual_vspace(j),
                   dcpl.get_virtual_filename(j),
                   dcpl.get_virtual_dsetname(j),
                   dcpl.get_virtual_srcspace(j))
            for j in range(dcpl.get_virtual_count())
        ]

        slice_map = {
            spaceid_to_slice(i.vspace): spaceid_to_slice(i.src_space)
            for i in virtual_sources
        }
        # slice_map = {i.args[0]: j.args[0] for i, j in slice_map.items()}

        fid = h5i.get_file_id(self)
        g = Group(fid)
        raw_data_name = virtual_sources[0].dset_name
        assert all(i.dset_name == raw_data_name for i in virtual_sources)
        self.raw_data = g[raw_data_name]
        self.chunks = tuple(self.raw_data.attrs['chunks'])

        for s in slice_map:
            src_idx = slice_map[s]
            if isinstance(src_idx, Tuple):
                # The pointers to the raw data should only be slices, since
                # the raw data chunks are extended in the first dimension
                # only.
                assert src_idx != Tuple()
                assert len(src_idx.args) == len(self.chunks)
                src_idx = src_idx.args[0]
            assert isinstance(src_idx, Slice)
            self.data_dict[s] = src_idx

        fillvalue_a = np.empty((1,), dtype=self.dtype)
        dcpl.get_fill_value(fillvalue_a)
        self.fillvalue = fillvalue_a[0]
def time_constructor_ndindex(self):
    Tuple(*self.t.args)
def time_constructor_invalid_ellipses(self):
    try:
        Tuple(0, ..., ...)
    except IndexError:
        pass
def test_write_dataset_offset_multidimension(h5file):
    chunks = ChunkSize(3 * (CHUNK_SIZE_3D,))
    shape = (2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D)
    data = np.zeros(shape)
    slices1 = write_dataset(h5file, 'test_data', data, chunks=chunks)

    shape2 = (2 * CHUNK_SIZE_3D - 2, 2 * CHUNK_SIZE_3D - 2, 2 * CHUNK_SIZE_3D - 2)
    data2 = np.empty(shape2)
    for n, c in enumerate(chunks.indices(shape)):
        data2[c.raw] = n
    slices2 = write_dataset(h5file, 'test_data', data2, chunks=chunks)

    assert slices1 == {
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ): slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
        ): slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ): slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
        ): slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ): slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
        ): slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ): slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
        ): slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
    }
    assert slices2 == {
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ): slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
        ): slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D),
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ): slice(2 * CHUNK_SIZE_3D, 3 * CHUNK_SIZE_3D),
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
        ): slice(3 * CHUNK_SIZE_3D, 4 * CHUNK_SIZE_3D),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ): slice(4 * CHUNK_SIZE_3D, 5 * CHUNK_SIZE_3D - 2),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
        ): slice(5 * CHUNK_SIZE_3D, 6 * CHUNK_SIZE_3D - 2),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ): slice(6 * CHUNK_SIZE_3D, 7 * CHUNK_SIZE_3D - 2),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
        ): slice(7 * CHUNK_SIZE_3D, 8 * CHUNK_SIZE_3D - 2),
    }

    ds = h5file['/_version_data/test_data/raw_data']
    assert ds.shape == (8 * CHUNK_SIZE_3D, CHUNK_SIZE_3D, CHUNK_SIZE_3D)
    for n, c in enumerate(chunks.indices(shape2)):
        a = np.zeros(chunks)
        a[Tuple(*[slice(0, i) for i in shape2]).as_subindex(c).raw] = n
        assert_equal(ds[n * CHUNK_SIZE_3D:(n + 1) * CHUNK_SIZE_3D], a)
    assert ds.dtype == np.float64
def test_write_dataset_chunks(h5file):
    slices1 = write_dataset(h5file, 'test_data', np.ones((2 * DEFAULT_CHUNK_SIZE,)))
    slices2 = write_dataset_chunks(h5file, 'test_data', {
        Tuple(Slice(0 * DEFAULT_CHUNK_SIZE, 1 * DEFAULT_CHUNK_SIZE, 1)):
            slices1[Tuple(Slice(0 * DEFAULT_CHUNK_SIZE, 1 * DEFAULT_CHUNK_SIZE, 1))],
        Tuple(Slice(1 * DEFAULT_CHUNK_SIZE, 2 * DEFAULT_CHUNK_SIZE, 1)):
            2 * np.ones((DEFAULT_CHUNK_SIZE,)),
        Tuple(Slice(2 * DEFAULT_CHUNK_SIZE, 3 * DEFAULT_CHUNK_SIZE, 1)):
            2 * np.ones((DEFAULT_CHUNK_SIZE,)),
        Tuple(Slice(3 * DEFAULT_CHUNK_SIZE, 4 * DEFAULT_CHUNK_SIZE, 1)):
            3 * np.ones((DEFAULT_CHUNK_SIZE,)),
    })

    assert slices1 == {
        Tuple(Slice(0 * DEFAULT_CHUNK_SIZE, 1 * DEFAULT_CHUNK_SIZE, 1)):
            slice(0 * DEFAULT_CHUNK_SIZE, 1 * DEFAULT_CHUNK_SIZE),
        Tuple(Slice(1 * DEFAULT_CHUNK_SIZE, 2 * DEFAULT_CHUNK_SIZE, 1)):
            slice(0 * DEFAULT_CHUNK_SIZE, 1 * DEFAULT_CHUNK_SIZE),
    }
    assert slices2 == {
        Tuple(Slice(0 * DEFAULT_CHUNK_SIZE, 1 * DEFAULT_CHUNK_SIZE, 1)):
            slice(0 * DEFAULT_CHUNK_SIZE, 1 * DEFAULT_CHUNK_SIZE),
        Tuple(Slice(1 * DEFAULT_CHUNK_SIZE, 2 * DEFAULT_CHUNK_SIZE, 1)):
            slice(1 * DEFAULT_CHUNK_SIZE, 2 * DEFAULT_CHUNK_SIZE),
        Tuple(Slice(2 * DEFAULT_CHUNK_SIZE, 3 * DEFAULT_CHUNK_SIZE, 1)):
            slice(1 * DEFAULT_CHUNK_SIZE, 2 * DEFAULT_CHUNK_SIZE),
        Tuple(Slice(3 * DEFAULT_CHUNK_SIZE, 4 * DEFAULT_CHUNK_SIZE, 1)):
            slice(2 * DEFAULT_CHUNK_SIZE, 3 * DEFAULT_CHUNK_SIZE),
    }

    ds = h5file['/_version_data/test_data/raw_data']
    assert ds.shape == (3 * DEFAULT_CHUNK_SIZE,)
    assert_equal(ds[0:1 * DEFAULT_CHUNK_SIZE], 1.0)
    assert_equal(ds[1 * DEFAULT_CHUNK_SIZE:2 * DEFAULT_CHUNK_SIZE], 2.0)
    assert_equal(ds[2 * DEFAULT_CHUNK_SIZE:3 * DEFAULT_CHUNK_SIZE], 3.0)
    assert_equal(ds[3 * DEFAULT_CHUNK_SIZE:4 * DEFAULT_CHUNK_SIZE], 0.0)
    assert ds.dtype == np.float64
def time_constructor_invalid_array_broadcast(self):
    try:
        Tuple([0, 1], [0, 1, 2])
    except IndexError:
        pass
def __setitem__(self, args, val):
    """ Write to the HDF5 dataset from a Numpy array.

    NumPy's broadcasting rules are honored, for "simple" indexing
    (slices and integers).  For advanced indexing, the shapes must
    match.
    """
    self.parent._check_committed()
    # This boilerplate code is based on h5py.Dataset.__setitem__
    args = args if isinstance(args, tuple) else (args,)

    # Sort field indices from the slicing
    names = tuple(x for x in args if isinstance(x, str))
    args = tuple(x for x in args if not isinstance(x, str))

    # Generally we try to avoid converting the arrays on the Python
    # side.  However, for compound literals this is unavoidable.
    vlen = h5t.check_vlen_dtype(self.dtype)
    if vlen is not None and vlen not in (bytes, str):
        try:
            val = np.asarray(val, dtype=vlen)
        except ValueError:
            try:
                val = np.array([np.array(x, dtype=vlen) for x in val],
                               dtype=self.dtype)
            except ValueError:
                pass
        if vlen == val.dtype:
            if val.ndim > 1:
                tmp = np.empty(shape=val.shape[:-1], dtype=object)
                tmp.ravel()[:] = [i for i in val.reshape(
                    (np.product(val.shape[:-1], dtype=np.ulonglong),
                     val.shape[-1]))]
            else:
                tmp = np.array([None], dtype=object)
                tmp[0] = val
            val = tmp

    elif self.dtype.kind == "O" or \
        (self.dtype.kind == 'V' and
         (not isinstance(val, np.ndarray) or val.dtype.kind != 'V') and
         (self.dtype.subdtype == None)):
        if len(names) == 1 and self.dtype.fields is not None:
            # Single field selected for write, from a non-array source
            if not names[0] in self.dtype.fields:
                raise ValueError("No such field for indexing: %s" % names[0])
            dtype = self.dtype.fields[names[0]][0]
            cast_compound = True
        else:
            dtype = self.dtype
            cast_compound = False

        val = np.asarray(val, dtype=dtype.base, order='C')
        if cast_compound:
            val = val.view(np.dtype([(names[0], dtype)]))
            val = val.reshape(val.shape[:len(val.shape) - len(dtype.shape)])
    else:
        val = np.asarray(val, order='C')

    # Check for array dtype compatibility and convert
    if self.dtype.subdtype is not None:
        shp = self.dtype.subdtype[1]
        valshp = val.shape[-len(shp):]
        if valshp != shp:
            # Last dimension has to match
            raise TypeError(
                "When writing to array types, last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,))
        mtype = h5t.py_create(np.dtype((val.dtype, shp)))
        # mshape = val.shape[0:len(val.shape)-len(shp)]

    # Make a compound memory type if field-name slicing is required
    elif len(names) != 0:
        # mshape = val.shape

        # Catch common errors
        if self.dtype.fields is None:
            raise TypeError("Illegal slicing argument (not a compound dataset)")
        mismatch = [x for x in names if x not in self.dtype.fields]
        if len(mismatch) != 0:
            mismatch = ", ".join('"%s"' % x for x in mismatch)
            raise ValueError(
                "Illegal slicing argument (fields %s not in dataset type)" % mismatch)

        # Write non-compound source into a single dataset field
        if len(names) == 1 and val.dtype.fields is None:
            subtype = h5t.py_create(val.dtype)
            mtype = h5t.create(h5t.COMPOUND, subtype.get_size())
            mtype.insert(self._e(names[0]), 0, subtype)

        # Make a new source type keeping only the requested fields
        else:
            fieldnames = [x for x in val.dtype.names if x in names]  # Keep source order
            mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize)
            for fieldname in fieldnames:
                subtype = h5t.py_create(val.dtype.fields[fieldname][0])
                offset = val.dtype.fields[fieldname][1]
                mtype.insert(self._e(fieldname), offset, subtype)

    # Use mtype derived from array (let DatasetID.write figure it out)
    else:
        mtype = None
    # === END CODE FROM h5py.Dataset.__setitem__ ===

    idx = ndindex(args).reduce(self.shape)

    val = np.broadcast_to(val, idx.newshape(self.shape))

    for c, index in as_subchunks(idx, self.shape, self.chunks):
        if isinstance(self.id.data_dict[c], (slice, Slice, tuple, Tuple)):
            raw_idx = Tuple(self.id.data_dict[c],
                            *[slice(0, len(i)) for i in c.args[1:]]).raw
            a = self.id._read_chunk(raw_idx)
            self.id.data_dict[c] = a

        if self.id.data_dict[c].size != 0:
            val_idx = c.as_subindex(idx)
            self.id.data_dict[c][index.raw] = val[val_idx.raw]
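# --- Added usage sketch (not part of the original source) ---
# A hedged example of the broadcasting behaviour for "simple" indexing that
# the __setitem__ docstring above describes, driven through the public
# versioned_hdf5 API. `vf` is assumed to be a VersionedHDF5File; the version
# and dataset names are hypothetical.
def _demo_setitem(vf):
    import numpy as np
    with vf.stage_version('base') as sv:
        sv.create_dataset('grid', data=np.zeros((4, 4)), chunks=(2, 2))
    with vf.stage_version('edit') as sv:
        ds = sv['grid']
        ds[1:3, :] = 7          # a scalar broadcasts over the sliced region
        assert np.all(np.asarray(ds[1:3, :]) == 7)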
def __getitem__(self, args, new_dtype=None):
    """ Read a slice from the HDF5 dataset.

    Takes slices and recarray-style field names (more than one is
    allowed!) in any order.  Obeys basic NumPy rules, including
    broadcasting.
    """
    # This boilerplate code is based on h5py.Dataset.__getitem__
    args = args if isinstance(args, tuple) else (args,)

    if new_dtype is None:
        new_dtype = getattr(self._local, 'astype', None)

    # Sort field names from the rest of the args.
    names = tuple(x for x in args if isinstance(x, str))

    if names:
        # Read a subset of the fields in this structured dtype
        if len(names) == 1:
            names = names[0]  # Read with simpler dtype of this field
        args = tuple(x for x in args if not isinstance(x, str))
        return self.fields(names, _prior_dtype=new_dtype)[args]

    if new_dtype is None:
        new_dtype = self.dtype
    mtype = h5t.py_create(new_dtype)

    # === Special-case region references ====

    if len(args) == 1 and isinstance(args[0], h5r.RegionReference):

        obj = h5r.dereference(args[0], self.id)
        if obj != self.id:
            raise ValueError("Region reference must point to this dataset")

        sid = h5r.get_region(args[0], self.id)
        mshape = guess_shape(sid)
        if mshape is None:
            # 0D with no data (NULL or deselected SCALAR)
            return Empty(new_dtype)
        out = np.empty(mshape, dtype=new_dtype)
        if out.size == 0:
            return out

        sid_out = h5s.create_simple(mshape)
        sid_out.select_all()
        self.id.read(sid_out, sid, out, mtype)
        return out
    # === END CODE FROM h5py.Dataset.__getitem__ ===

    idx = ndindex(args).reduce(self.shape)

    arr = np.ndarray(idx.newshape(self.shape), new_dtype, order='C')

    for c, index in as_subchunks(idx, self.shape, self.chunks):
        if isinstance(self.id.data_dict[c], (slice, Slice, tuple, Tuple)):
            raw_idx = Tuple(self.id.data_dict[c],
                            *[slice(0, len(i)) for i in c.args[1:]]).raw
            a = self.id._read_chunk(raw_idx)
            self.id.data_dict[c] = a

        if self.id.data_dict[c].size != 0:
            arr_idx = c.as_subindex(idx)
            arr[arr_idx.raw] = self.id.data_dict[c][index.raw]

    return arr
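# --- Added usage sketch (not part of the original source) ---
# A hedged read example for the __getitem__ docstring above: slicing a
# versioned dataset returns an in-memory NumPy array shaped by the index.
# `vf` is assumed to be a VersionedHDF5File; the version and dataset names
# are hypothetical.
def _demo_getitem(vf):
    import numpy as np
    with vf.stage_version('readback') as sv:
        sv.create_dataset('values', data=np.arange(10), chunks=(4,))
    arr = vf['readback']['values'][2:7]
    assert isinstance(arr, np.ndarray) and arr.shape == (5,)
    assert list(arr) == [2, 3, 4, 5, 6]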
def time_constructor_arrays(self):
    Tuple([[0, 1], [1, 2]], 2, [0, 1])
def setup(self):
    self.t = Tuple(slice(0, 10), ..., 1)
    self.t_arrays = Tuple([[0, 1], [1, 2]], 2, [0, 1])
    self.t_boolean_scalars = Tuple(True, 0, False)
def test_write_dataset_chunk_size(h5file):
    chunk_size = 2**10
    chunks = (chunk_size,)
    slices1 = write_dataset(h5file, 'test_data', np.ones((2 * chunk_size,)),
                            chunks=chunks)
    raises(ValueError,
           lambda: write_dataset(h5file, 'test_data', np.ones(chunks),
                                 chunks=(2**9,)))
    slices2 = write_dataset_chunks(h5file, 'test_data', {
        Tuple(Slice(0 * chunk_size, 1 * chunk_size, 1)):
            slices1[Tuple(Slice(0 * chunk_size, 1 * chunk_size, 1))],
        Tuple(Slice(1 * chunk_size, 2 * chunk_size, 1)):
            2 * np.ones((chunk_size,)),
        Tuple(Slice(2 * chunk_size, 3 * chunk_size, 1)):
            2 * np.ones((chunk_size,)),
        Tuple(Slice(3 * chunk_size, 4 * chunk_size, 1)):
            3 * np.ones((chunk_size,)),
    })

    assert slices1 == {
        Tuple(Slice(0 * chunk_size, 1 * chunk_size, 1)):
            slice(0 * chunk_size, 1 * chunk_size),
        Tuple(Slice(1 * chunk_size, 2 * chunk_size, 1)):
            slice(0 * chunk_size, 1 * chunk_size),
    }
    assert slices2 == {
        Tuple(Slice(0 * chunk_size, 1 * chunk_size, 1)):
            slice(0 * chunk_size, 1 * chunk_size),
        Tuple(Slice(1 * chunk_size, 2 * chunk_size, 1)):
            slice(1 * chunk_size, 2 * chunk_size),
        Tuple(Slice(2 * chunk_size, 3 * chunk_size, 1)):
            slice(1 * chunk_size, 2 * chunk_size),
        Tuple(Slice(3 * chunk_size, 4 * chunk_size, 1)):
            slice(2 * chunk_size, 3 * chunk_size),
    }

    ds = h5file['/_version_data/test_data/raw_data']
    assert ds.shape == (3 * chunk_size,)
    assert_equal(ds[0:1 * chunk_size], 1.0)
    assert_equal(ds[1 * chunk_size:2 * chunk_size], 2.0)
    assert_equal(ds[2 * chunk_size:3 * chunk_size], 3.0)
    assert_equal(ds[3 * chunk_size:4 * chunk_size], 0.0)
    assert ds.dtype == np.float64
def time_constructor_builtin(self):
    Tuple(slice(0, 10), ..., 1)
def _recreate_raw_data(f, name, versions_to_delete, tmp=False):
    """
    Return a new raw data set for a dataset without the chunks from
    versions_to_delete.

    If no chunks would be left, i.e., the dataset does not appear in any
    version not in versions_to_delete, an empty dict is returned.

    If tmp is True, the new raw dataset is called '_tmp_raw_data' and is
    placed alongside the existing raw dataset. Otherwise the existing raw
    dataset is replaced.
    """
    chunks_map = defaultdict(dict)

    for version_name in all_versions(f):
        if (version_name in versions_to_delete
                or name not in f['_version_data/versions'][version_name]):
            continue

        dataset = f['_version_data/versions'][version_name][name]

        virtual_sources = dataset.virtual_sources()
        slice_map = {spaceid_to_slice(i.vspace): spaceid_to_slice(i.src_space)
                     for i in virtual_sources}
        chunks_map[version_name].update(slice_map)

    chunks_to_keep = set().union(*[map.values() for map in chunks_map.values()])
    chunks_to_keep = sorted(chunks_to_keep, key=lambda i: i.args[0].args[0])

    if not chunks_to_keep:
        return {}

    raw_data = f['_version_data'][name]['raw_data']
    chunks = ChunkSize(raw_data.chunks)
    new_shape = (len(chunks_to_keep) * chunks[0], *chunks[1:])

    new_raw_data = f['_version_data'][name].create_dataset(
        '_tmp_raw_data', shape=new_shape, maxshape=(None,) + chunks[1:],
        chunks=raw_data.chunks, dtype=raw_data.dtype,
        compression=raw_data.compression,
        compression_opts=raw_data.compression_opts,
        fillvalue=raw_data.fillvalue)
    for key, val in raw_data.attrs.items():
        new_raw_data.attrs[key] = val

    r = raw_data[:]
    n = np.full(new_raw_data.shape, new_raw_data.fillvalue,
                dtype=new_raw_data.dtype)
    raw_data_chunks_map = {}
    for new_chunk, chunk in zip(chunks.indices(new_shape), chunks_to_keep):
        # Shrink new_chunk to the size of chunk, in case chunk isn't a full
        # chunk in one of the dimensions.
        # TODO: Implement something in ndindex to do this.
        new_chunk = Tuple(*[
            Slice(new_chunk.args[i].start,
                  new_chunk.args[i].start + len(chunk.args[i]))
            for i in range(len(new_chunk.args))
        ])
        raw_data_chunks_map[chunk] = new_chunk
        n[new_chunk.raw] = r[chunk.raw]

    new_raw_data[:] = n
    if not tmp:
        del f['_version_data'][name]['raw_data']
        f['_version_data'][name].move('_tmp_raw_data', 'raw_data')

    return raw_data_chunks_map
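# --- Added usage sketch (not part of the original source) ---
# A hedged sketch of the contract documented above. _recreate_raw_data is an
# internal helper, so this only illustrates how it might be exercised on a
# file that already has committed versions. `f` is the underlying h5py File
# and `vf` a VersionedHDF5File around it; the version and dataset names are
# hypothetical.
def _demo_recreate_raw_data(f, vf):
    import numpy as np
    with vf.stage_version('v1') as sv:
        sv.create_dataset('values', data=np.arange(8), chunks=(4,))
    with vf.stage_version('v2') as sv:
        sv['values'][:4] = -1
    # Rebuild the raw data as if 'v1' were being deleted; tmp=True leaves the
    # existing raw_data in place and returns a map from old raw chunks to
    # their new locations.
    chunks_map = _recreate_raw_data(f, 'values', ['v1'], tmp=True)
    assert all(hasattr(k, 'raw') for k in chunks_map)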
def test_create_version_chunks(h5file):
    chunk_size = 2**10
    chunks = (chunk_size,)
    data = np.concatenate((np.ones((2*chunk_size,)),
                           2*np.ones(chunks),
                           3*np.ones(chunks)))
    # TODO: Support creating the initial version with chunks
    version1 = create_version_group(h5file, 'version1')
    commit_version(version1, {'test_data': data},
                   chunks={'test_data': chunks},
                   compression={'test_data': 'gzip'},
                   compression_opts={'test_data': 3})
    version_bad = create_version_group(h5file, 'version_bad', '')
    raises(ValueError, lambda: commit_version(version_bad, {'test_data': data},
                                              chunks={'test_data': (2**9,)}))
    delete_version(h5file, 'version_bad', 'version1')

    version_bad = create_version_group(h5file, 'version_bad', '')
    raises(ValueError, lambda: commit_version(version_bad, {'test_data': data},
                                              compression={'test_data': 'lzf'}))
    delete_version(h5file, 'version_bad', 'version1')

    version_bad = create_version_group(h5file, 'version_bad', '')
    raises(ValueError, lambda: commit_version(version_bad, {'test_data': data},
                                              compression_opts={'test_data': 4}))
    delete_version(h5file, 'version_bad', 'version1')

    assert_equal(h5file['_version_data/versions/version1/test_data'], data)

    ds = h5file['/_version_data/test_data/raw_data']
    assert ds.shape == (3*chunk_size,)
    assert_equal(ds[0:1*chunk_size], 1.0)
    assert_equal(ds[1*chunk_size:2*chunk_size], 2.0)
    assert_equal(ds[2*chunk_size:3*chunk_size], 3.0)
    assert ds.compression == 'gzip'
    assert ds.compression_opts == 3

    data2_chunks = {
        Tuple(Slice(0*chunk_size, 1*chunk_size, 1)): np.ones(chunks),
        Tuple(Slice(1*chunk_size, 2*chunk_size, 1)): np.ones(chunks),
        Tuple(Slice(2*chunk_size, 3*chunk_size, 1)): 2*np.ones(chunks),
        Tuple(Slice(3*chunk_size, 4*chunk_size, 1)): 3*np.ones(chunks),
    }
    data2_chunks[Tuple(Slice(0*chunk_size, 1*chunk_size, 1))][0] = 0.0
    data[0] = 0.0

    version2 = create_version_group(h5file, 'version2')
    commit_version(version2, {'test_data': data2_chunks})
    assert_equal(h5file['_version_data/versions/version2/test_data'], data)

    assert ds.shape == (4*chunk_size,)
    assert_equal(ds[0:1*chunk_size], 1.0)
    assert_equal(ds[1*chunk_size:2*chunk_size], 2.0)
    assert_equal(ds[2*chunk_size:3*chunk_size], 3.0)
    assert_equal(ds[3*chunk_size], 0.0)
    assert_equal(ds[3*chunk_size+1:4*chunk_size], 1.0)
    assert ds.compression == 'gzip'
    assert ds.compression_opts == 3

    data3_chunks = {
        Tuple(Slice(0*chunk_size, 1*chunk_size, 1)): np.ones(chunks),
        Tuple(Slice(1*chunk_size, 2*chunk_size, 1)): Slice(0*chunk_size, 1*chunk_size),
        Tuple(Slice(2*chunk_size, 3*chunk_size, 1)): Slice(1*chunk_size, 2*chunk_size),
        Tuple(Slice(3*chunk_size, 4*chunk_size, 1)): Slice(2*chunk_size, 3*chunk_size),
    }
    data3_chunks[Tuple(Slice(0*chunk_size, 1*chunk_size, 1))][0] = 2.0
    data[0] = 2.0

    version3 = create_version_group(h5file, 'version3')
    commit_version(version3, {'test_data': data3_chunks})
    assert_equal(h5file['_version_data/versions/version3/test_data'], data)

    assert ds.shape == (5*chunk_size,)
    assert_equal(ds[0:1*chunk_size], 1.0)
    assert_equal(ds[1*chunk_size:2*chunk_size], 2.0)
    assert_equal(ds[2*chunk_size:3*chunk_size], 3.0)
    assert_equal(ds[3*chunk_size], 0.0)
    assert_equal(ds[3*chunk_size+1:4*chunk_size], 1.0)
    assert_equal(ds[4*chunk_size], 2.0)
    assert_equal(ds[4*chunk_size+1:5*chunk_size], 1.0)

    assert set(all_versions(h5file)) == {'version1', 'version2', 'version3'}
def __setitem__(self, index, value):
    if isinstance(self.dataset, InMemoryDataset) and \
            ndindex(index).expand(self.shape) == Tuple().expand(self.shape):
        new_dataset = InMemoryArrayDataset(
            self.name,
            np.broadcast_to(value, self.shape).astype(self.dtype),
            self.parent, fillvalue=self.fillvalue, chunks=self.chunks)
        new_dataset.attrs = self.dataset.attrs
        self.dataset = new_dataset
        return
    self.dataset.__setitem__(index, value)
def create_virtual_dataset(f, version_name, name, shape, slices, attrs=None, fillvalue=None):
    from h5py._hl.selections import select
    from h5py._hl.vds import VDSmap

    raw_data = f['_version_data'][name]['raw_data']
    raw_data_shape = raw_data.shape
    slices = {c: s.reduce() for c, s in slices.items()}

    if len(raw_data) == 0:
        shape = ()
        layout = VirtualLayout((1,), dtype=raw_data.dtype)
        vs = VirtualSource('.', name=raw_data.name, shape=(1,), dtype=raw_data.dtype)
        layout[0] = vs[()]
    else:
        # Chunks in the raw dataset are expanded along the first dimension only.
        # Since the chunks are pointed to by virtual datasets, it doesn't make
        # sense to expand the chunks in the raw dataset along multiple dimensions
        # (the true layout of the chunks in the raw dataset is irrelevant).
        for c, s in slices.items():
            if len(c.args[0]) != len(s):
                raise ValueError(f"Inconsistent slices dictionary ({c.args[0]}, {s})")

        # h5py 3.3 changed the VirtualLayout code. See
        # https://github.com/h5py/h5py/pull/1905.
        layout = VirtualLayout(shape, dtype=raw_data.dtype)
        layout_has_sources = hasattr(layout, 'sources')
        if not layout_has_sources:
            vs = VirtualSource('.', name=raw_data.name, shape=raw_data.shape,
                               dtype=raw_data.dtype)

        for c, s in slices.items():
            if c.isempty():
                continue
            # idx = Tuple(s, *Tuple(*[slice(0, i) for i in shape[1:]]).as_subindex(Tuple(*c.args[1:])).args)
            S = [Slice(0, len(c.args[i])) for i in range(1, len(shape))]
            idx = Tuple(s, *S)
            # assert c.newshape(shape) == vs[idx.raw].shape, (c, shape, s)

            if not layout_has_sources:
                # TODO: Use a faster workaround here too
                layout[c.raw] = vs[idx.raw]
            else:
                # This is equivalent, but it is faster because vs[idx.raw] does
                # a deepcopy(vs), which is slow.
                vs_sel = select(raw_data_shape, idx.raw, None)
                layout_sel = select(shape, c.raw, None)
                layout.sources.append(VDSmap(layout_sel.id, '.', raw_data.name,
                                             vs_sel.id))

    dtype = raw_data.dtype
    if dtype.metadata and ('vlen' in dtype.metadata or 'h5py_encoding' in dtype.metadata):
        # Variable length string dtype
        # (https://h5py.readthedocs.io/en/2.10.0/strings.html). Setting the
        # fillvalue in this case doesn't work
        # (https://github.com/h5py/h5py/issues/941).
        if fillvalue not in [0, '', b'', None]:
            raise ValueError("Non-default fillvalue not supported for variable length strings")
        fillvalue = None

    virtual_data = f['_version_data/versions'][version_name].create_virtual_dataset(
        name, layout, fillvalue=fillvalue)

    if attrs:
        for k, v in attrs.items():
            virtual_data.attrs[k] = v
    virtual_data.attrs['raw_data'] = raw_data.name
    virtual_data.attrs['chunks'] = raw_data.chunks
    return virtual_data
def time_constructor_boolean_scalars(self):
    Tuple(True, 0, False)