def test_write_dataset_chunks_multidimension(h5file):
    chunks = ChunkSize(3 * (CHUNK_SIZE_3D,))
    shape = (2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D)
    data = np.zeros(shape)
    slices1 = write_dataset(h5file, 'test_data', data, chunks=chunks)

    data_dict = {}
    for n, c in enumerate(chunks.indices(shape)):
        if n == 0:
            # Reuse the raw slice already written for the first chunk
            data_dict[c] = slices1[c]
        else:
            data_dict[c] = n * np.ones(chunks)
    slices2 = write_dataset_chunks(h5file, 'test_data', data_dict)

    assert slices1 == {
        c: slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D)
        for c in chunks.indices(shape)
    }
    assert slices2 == {
        c: slice(i * CHUNK_SIZE_3D, (i + 1) * CHUNK_SIZE_3D)
        for i, c in enumerate(chunks.indices(shape))
    }

    ds = h5file['/_version_data/test_data/raw_data']
    assert ds.shape == (8 * CHUNK_SIZE_3D, CHUNK_SIZE_3D, CHUNK_SIZE_3D)
    for n in range(8):
        assert_equal(ds[n * CHUNK_SIZE_3D:(n + 1) * CHUNK_SIZE_3D], n)
    assert ds.dtype == np.float64

def write_dataset(f, name, data, chunks=None, dtype=None, compression=None,
                  compression_opts=None, fillvalue=None):
    if name not in f['_version_data']:
        return create_base_dataset(f, name, data=data, dtype=dtype,
                                   chunks=chunks, compression=compression,
                                   compression_opts=compression_opts,
                                   fillvalue=fillvalue)

    ds = f['_version_data'][name]['raw_data']
    if isinstance(chunks, int) and not isinstance(chunks, bool):
        chunks = (chunks,)
    if chunks is None:
        chunks = tuple(ds.attrs['chunks'])
    else:
        if chunks != tuple(ds.attrs['chunks']):
            raise ValueError("Chunk size specified but doesn't match already existing chunk size")

    if dtype is not None:
        if dtype != ds.dtype:
            raise ValueError("dtype specified but doesn't match already existing dtype")

    if (compression and compression != ds.compression
            or compression_opts and compression_opts != ds.compression_opts):
        raise ValueError("Compression options can only be specified for the first version of a dataset")
    if fillvalue is not None and fillvalue != ds.fillvalue:
        dtype = ds.dtype
        if dtype.metadata and ('vlen' in dtype.metadata or 'h5py_encoding' in dtype.metadata):
            # Variable length string dtype. The ds.fillvalue will be None in
            # this case (see create_virtual_dataset() below)
            pass
        else:
            raise ValueError(f"fillvalues do not match ({fillvalue} != {ds.fillvalue})")
    if data.dtype != ds.dtype:
        raise ValueError(f"dtypes do not match ({data.dtype} != {ds.dtype})")

    # TODO: Handle more than one dimension
    old_shape = ds.shape
    slices = {}
    slices_to_write = {}
    chunk_size = chunks[0]

    with Hashtable(f, name) as hashtable:
        if len(data.shape) != 0:
            for s in ChunkSize(chunks).indices(data.shape):
                idx = hashtable.largest_index
                data_s = data[s.raw]
                raw_slice = Slice(idx * chunk_size, idx * chunk_size + data_s.shape[0])
                data_hash = hashtable.hash(data_s)
                raw_slice2 = hashtable.setdefault(data_hash, raw_slice)
                if raw_slice2 == raw_slice:
                    slices_to_write[raw_slice] = s
                slices[s] = raw_slice2

            ds.resize((old_shape[0] + len(slices_to_write) * chunk_size,) + chunks[1:])
            for raw_slice, s in slices_to_write.items():
                # idx = raw_slice.expand(ds.shape[:1] + s.newshape(data.shape)[1:])
                data_s = data[s.raw]
                idx = Tuple(raw_slice, *[slice(0, i) for i in data_s.shape[1:]])
                ds[idx.raw] = data[s.raw]
    return slices

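# Illustrative sketch (not part of the library): how the hashtable-based
# deduplication in write_dataset() behaves. This assumes an open h5py file
# that already contains a '_version_data' group (as the tests' h5file
# fixture provides) and reuses the test-suite names CHUNK_SIZE_3D and
# ChunkSize; the function name _example_write_dataset_dedup is hypothetical.
def _example_write_dataset_dedup(f):
    chunks = ChunkSize(3 * (CHUNK_SIZE_3D,))
    data = np.zeros((2 * CHUNK_SIZE_3D,) * 3)
    slices_a = write_dataset(f, 'example_data', data, chunks=chunks)
    # Writing the same data again: every chunk hashes to a raw slice that
    # already exists, so nothing is appended to raw_data.
    slices_b = write_dataset(f, 'example_data', data, chunks=chunks)
    assert slices_a == slices_b
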
def test_create_virtual_dataset_offset_multidimension(h5file):
    chunks = ChunkSize(3 * (CHUNK_SIZE_3D,))
    shape = (2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D)
    data = np.zeros(shape)
    write_dataset(h5file, 'test_data', data, chunks=chunks)

    shape2 = (2 * CHUNK_SIZE_3D - 2, 2 * CHUNK_SIZE_3D - 2, 2 * CHUNK_SIZE_3D - 2)
    data2 = np.empty(shape2)
    for n, c in enumerate(chunks.indices(shape2)):
        data2[c.raw] = n
    slices2 = write_dataset(h5file, 'test_data', data2, chunks=chunks)

    virtual_data = create_virtual_dataset(h5file, 'test_version', 'test_data',
                                          shape2, slices2)
    assert virtual_data.shape == shape2
    assert_equal(virtual_data[()], data2)
    assert virtual_data.dtype == np.float64

@classmethod
def from_raw_data(cls, f, name, chunk_size=None, hash_table_name='hash_table'):
    if hash_table_name in f['_version_data'][name]:
        raise ValueError(
            f"a hash table {hash_table_name!r} for {name!r} already exists"
        )

    hashtable = cls(f, name, chunk_size=chunk_size,
                    hash_table_name=hash_table_name)

    raw_data = f['_version_data'][name]['raw_data']
    chunks = ChunkSize(raw_data.chunks)
    for c in chunks.indices(raw_data.shape):
        data_hash = hashtable.hash(raw_data[c.raw])
        hashtable.setdefault(data_hash, c.args[0])

    hashtable.write()
    return hashtable

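# Illustrative sketch (not part of the library): rebuilding a hash table from
# existing raw data, e.g. to create a second table alongside the default one.
# Assumes 'test_data' was previously written into f with write_dataset();
# the name 'alt_hash_table' and the helper name are hypothetical.
def _example_rebuild_hashtable(f):
    hashtable = Hashtable.from_raw_data(f, 'test_data',
                                        hash_table_name='alt_hash_table')
    # Every raw chunk has been re-hashed, so content lookups work again.
    return hashtable
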
def test_write_dataset_offset_multidimension(h5file):
    chunks = ChunkSize(3 * (CHUNK_SIZE_3D,))
    shape = (2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D)
    data = np.zeros(shape)
    slices1 = write_dataset(h5file, 'test_data', data, chunks=chunks)

    shape2 = (2 * CHUNK_SIZE_3D - 2, 2 * CHUNK_SIZE_3D - 2, 2 * CHUNK_SIZE_3D - 2)
    data2 = np.empty(shape2)
    for n, c in enumerate(chunks.indices(shape2)):
        data2[c.raw] = n
    slices2 = write_dataset(h5file, 'test_data', data2, chunks=chunks)

    assert slices1 == {
        (Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1)):
            slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1)):
            slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1)):
            slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1)):
            slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1)):
            slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1)):
            slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1)):
            slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1)):
            slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
    }
    assert slices2 == {
        (Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1)):
            slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1)):
            slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D),
        (Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1)):
            slice(2 * CHUNK_SIZE_3D, 3 * CHUNK_SIZE_3D),
        (Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1)):
            slice(3 * CHUNK_SIZE_3D, 4 * CHUNK_SIZE_3D),
        (Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1)):
            slice(4 * CHUNK_SIZE_3D, 5 * CHUNK_SIZE_3D - 2),
        (Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1)):
            slice(5 * CHUNK_SIZE_3D, 6 * CHUNK_SIZE_3D - 2),
        (Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1)):
            slice(6 * CHUNK_SIZE_3D, 7 * CHUNK_SIZE_3D - 2),
        (Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1)):
            slice(7 * CHUNK_SIZE_3D, 8 * CHUNK_SIZE_3D - 2),
    }

    ds = h5file['/_version_data/test_data/raw_data']
    assert ds.shape == (8 * CHUNK_SIZE_3D, CHUNK_SIZE_3D, CHUNK_SIZE_3D)
    for n, c in enumerate(chunks.indices(shape2)):
        a = np.zeros(chunks)
        a[Tuple(*[slice(0, i) for i in shape2]).as_subindex(c).raw] = n
        assert_equal(ds[n * CHUNK_SIZE_3D:(n + 1) * CHUNK_SIZE_3D], a)
    assert ds.dtype == np.float64

def _recreate_raw_data(f, name, versions_to_delete, tmp=False):
    """
    Return a new raw dataset for a dataset without the chunks from
    versions_to_delete.

    If no chunks would be left, i.e., the dataset does not appear in any
    version not in versions_to_delete, an empty dict is returned.

    If tmp is True, the new raw dataset is called '_tmp_raw_data' and is
    placed alongside the existing raw dataset. Otherwise the existing raw
    dataset is replaced.
    """
    chunks_map = defaultdict(dict)

    for version_name in all_versions(f):
        if (version_name in versions_to_delete
                or name not in f['_version_data/versions'][version_name]):
            continue

        dataset = f['_version_data/versions'][version_name][name]
        virtual_sources = dataset.virtual_sources()
        slice_map = {
            spaceid_to_slice(i.vspace): spaceid_to_slice(i.src_space)
            for i in virtual_sources
        }
        chunks_map[version_name].update(slice_map)

    chunks_to_keep = set().union(
        *[map.values() for map in chunks_map.values()])
    chunks_to_keep = sorted(chunks_to_keep, key=lambda i: i.args[0].args[0])

    if not chunks_to_keep:
        return {}

    raw_data = f['_version_data'][name]['raw_data']
    chunks = ChunkSize(raw_data.chunks)
    new_shape = (len(chunks_to_keep) * chunks[0], *chunks[1:])
    new_raw_data = f['_version_data'][name].create_dataset(
        '_tmp_raw_data', shape=new_shape, maxshape=(None,) + chunks[1:],
        chunks=raw_data.chunks, dtype=raw_data.dtype,
        compression=raw_data.compression,
        compression_opts=raw_data.compression_opts,
        fillvalue=raw_data.fillvalue)
    for key, val in raw_data.attrs.items():
        new_raw_data.attrs[key] = val

    r = raw_data[:]
    n = np.full(new_raw_data.shape, new_raw_data.fillvalue,
                dtype=new_raw_data.dtype)
    raw_data_chunks_map = {}
    for new_chunk, chunk in zip(chunks.indices(new_shape), chunks_to_keep):
        # Shrink new_chunk to the size of chunk, in case chunk isn't a full
        # chunk in one of the dimensions.
        # TODO: Implement something in ndindex to do this.
        new_chunk = Tuple(*[
            Slice(new_chunk.args[i].start,
                  new_chunk.args[i].start + len(chunk.args[i]))
            for i in range(len(new_chunk.args))
        ])
        raw_data_chunks_map[chunk] = new_chunk
        n[new_chunk.raw] = r[chunk.raw]

    new_raw_data[:] = n
    if not tmp:
        del f['_version_data'][name]['raw_data']
        f['_version_data'][name].move('_tmp_raw_data', 'raw_data')

    return raw_data_chunks_map

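# Illustrative sketch (not part of the library): the mapping returned by
# _recreate_raw_data() sends each surviving raw-data chunk to its location in
# the rebuilt raw dataset, which callers use to rewrite the virtual datasets
# of the remaining versions. The helper name below is hypothetical, and with
# tmp=True the rebuilt data is left in '_tmp_raw_data' for inspection.
def _example_inspect_chunks_map(f, name, versions_to_delete):
    raw_data_chunks_map = _recreate_raw_data(f, name, versions_to_delete,
                                             tmp=True)
    for old_chunk, new_chunk in raw_data_chunks_map.items():
        # Keys and values are ndindex Tuples of Slices into raw_data.
        print(f'{old_chunk} -> {new_chunk}')
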
class InMemorySparseDataset(DatasetLike):
    """
    Class that looks like a Dataset but stores no data (only the fillvalue)
    """
    def __init__(self, name, *, shape, dtype, parent, chunks=None,
                 fillvalue=None):
        if shape is None:
            raise TypeError("shape must be specified for sparse datasets")
        self.name = name
        self.shape = shape
        self.dtype = np.dtype(dtype)
        self.attrs = {}
        self._fillvalue = fillvalue
        if chunks in [True, None]:
            if len(shape) == 1:
                chunks = (DEFAULT_CHUNK_SIZE,)
            else:
                raise NotImplementedError("chunks must be specified for multi-dimensional datasets")
        self.chunks = ChunkSize(chunks)
        self.parent = parent

        # This works like a mix between InMemoryArrayDataset and
        # InMemoryDatasetID. Explicit array data is stored in a data_dict like
        # with InMemoryDatasetID, but unlike it, missing data (which equals
        # the fill value) is omitted.
        self.data_dict = {}

    def as_dtype(self, name, dtype, parent, casting='unsafe'):
        """
        Return a copy of `self` as a new dataset with the given `name` and
        `dtype` in the group `parent`.

        `casting` should be as in the numpy astype() method.
        """
        if self.fillvalue is not None:
            new_fillvalue = self.fillvalue.astype(dtype, casting=casting)
        else:
            new_fillvalue = None

        new_data_dict = {}
        for c, chunk_data in self.data_dict.items():
            new_data_dict[c] = chunk_data.astype(dtype, casting=casting)

        return self.from_data_dict(name, new_data_dict, dtype=dtype,
                                   parent=parent, fillvalue=new_fillvalue,
                                   chunks=self.chunks, shape=self.shape)

    @classmethod
    def from_data_dict(cls, name, data_dict, *, shape, dtype, parent,
                       chunks, fillvalue=None):
        """
        Create an InMemorySparseDataset from a data dict.

        This does not do any consistency checks against the provided
        metadata.
        """
        dataset = cls(name, shape=shape, dtype=dtype, parent=parent,
                      chunks=chunks, fillvalue=fillvalue)
        dataset.data_dict = data_dict
        return dataset

    @classmethod
    def from_dataset(cls, dataset, parent=None):
        # np.testing.assert_equal(dataset[()], dataset.fillvalue)
        return cls(dataset.name, shape=dataset.shape, dtype=dataset.dtype,
                   parent=parent or dataset.parent, chunks=dataset.chunks,
                   fillvalue=dataset.fillvalue)

    def resize(self, size, axis=None):
        if axis is not None:
            if not (0 <= axis < self.ndim):
                raise ValueError("Invalid axis (0 to %s allowed)" % (self.ndim - 1))
            try:
                newlen = int(size)
            except TypeError:
                raise TypeError("Argument must be a single int if axis is specified")
            size = list(self.shape)
            size[axis] = newlen

        size = tuple(size)
        if len(size) > 1:
            raise NotImplementedError("More than one dimension is not yet supported")
        self.shape = size

    def __getitem__(self, index):
        idx = ndindex(index).reduce(self.shape)
        newshape = idx.newshape(self.shape)
        arr = np.full(newshape, self.fillvalue, dtype=self.dtype)

        for c in self.chunks.as_subchunks(idx, self.shape):
            if c not in self.data_dict:
                fill = np.broadcast_to(self.fillvalue, c.newshape(self.shape))
                self.data_dict[c] = fill

            if self.data_dict[c].size != 0:
                arr_idx = c.as_subindex(idx)
                chunk_idx = idx.as_subindex(c)
                arr[arr_idx.raw] = self.data_dict[c][chunk_idx.raw]

        # Return arr as a scalar if it is shape () (matching h5py)
        return arr[()]

    def __setitem__(self, index, value):
        self.parent._check_committed()
        idx = ndindex(index).reduce(self.shape)
        val = np.broadcast_to(value, idx.newshape(self.shape))

        for c in self.chunks.as_subchunks(idx, self.shape):
            if c not in self.data_dict:
                # Broadcasted arrays do not actually consume memory
                fill = np.broadcast_to(self.fillvalue, c.newshape(self.shape))
                self.data_dict[c] = fill

            if self.data_dict[c].size != 0:
                val_idx = c.as_subindex(idx)
                if not self.data_dict[c].flags.writeable:
                    # self.data_dict[c] is a broadcasted array from above
                    self.data_dict[c] = self.data_dict[c].copy()
                chunk_idx = idx.as_subindex(c)
                self.data_dict[c][chunk_idx.raw] = val[val_idx.raw]

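# Illustrative sketch (not part of the library): only chunks that are touched
# appear in data_dict, and untouched regions read back as the fill value.
# Reads insert zero-memory broadcast views; a chunk is only copied into real
# memory when it is written. The `some_group` parent is hypothetical here,
# since __setitem__ calls parent._check_committed().
#
#   ds = InMemorySparseDataset('example', shape=(3 * DEFAULT_CHUNK_SIZE,),
#                              dtype=np.float64, parent=some_group,
#                              fillvalue=0.0)
#   ds[0:5] = 1.0           # materializes a copy of the first chunk only
#   ds[DEFAULT_CHUNK_SIZE]  # -> 0.0, served from the broadcast fill value
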
@property
def chunks(self):
    return ChunkSize(self.id.chunks)