def test_write_dataset_chunks_multidimension(h5file):
    chunks = ChunkSize(3 * (CHUNK_SIZE_3D,))
    shape = (2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D)
    data = np.zeros(shape)
    slices1 = write_dataset(h5file, 'test_data', data, chunks=chunks)

    # Reuse the raw slice recorded by the first write for chunk 0 and supply
    # fresh data for every other chunk.
    data_dict = {}
    for n, c in enumerate(chunks.indices(shape)):
        if n == 0:
            data_dict[c] = slices1[c]
        else:
            data_dict[c] = n * np.ones(chunks)

    slices2 = write_dataset_chunks(h5file, 'test_data', data_dict)

    # The first write stores eight identical all-zero chunks, which are
    # deduplicated to a single raw chunk, so every chunk index maps to the
    # same raw slice.
    assert slices1 == {c: slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D)
                       for c in chunks.indices(shape)}
    assert slices2 == {c: slice(i * CHUNK_SIZE_3D, (i + 1) * CHUNK_SIZE_3D)
                       for i, c in enumerate(chunks.indices(shape))}

    ds = h5file['/_version_data/test_data/raw_data']
    assert ds.shape == (8 * CHUNK_SIZE_3D, CHUNK_SIZE_3D, CHUNK_SIZE_3D)
    for n in range(8):
        assert_equal(ds[n * CHUNK_SIZE_3D:(n + 1) * CHUNK_SIZE_3D], n)
    assert ds.dtype == np.float64

def test_create_virtual_dataset_offset_multidimension(h5file):
    chunks = ChunkSize(3 * (CHUNK_SIZE_3D,))
    shape = (2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D)
    data = np.zeros(shape)
    write_dataset(h5file, 'test_data', data, chunks=chunks)

    # shape2 is two elements short of a whole number of chunks in every
    # dimension, so the trailing chunks of data2 are partial. Fill each chunk
    # of data2 with its chunk number (the out-of-bounds slice ends clamp).
    shape2 = (2 * CHUNK_SIZE_3D - 2, 2 * CHUNK_SIZE_3D - 2,
              2 * CHUNK_SIZE_3D - 2)
    data2 = np.empty(shape2)
    for n, c in enumerate(chunks.indices(shape)):
        data2[c.raw] = n
    slices2 = write_dataset(h5file, 'test_data', data2, chunks=chunks)

    virtual_data = create_virtual_dataset(h5file, 'test_version', 'test_data',
                                          shape2, slices2)
    assert virtual_data.shape == shape2
    assert_equal(virtual_data[()], data2)
    assert virtual_data.dtype == np.float64

@classmethod
def from_raw_data(cls, f, name, chunk_size=None, hash_table_name='hash_table'):
    if hash_table_name in f['_version_data'][name]:
        raise ValueError(
            f"a hash table {hash_table_name!r} for {name!r} already exists")

    hashtable = cls(f, name, chunk_size=chunk_size,
                    hash_table_name=hash_table_name)

    raw_data = f['_version_data'][name]['raw_data']
    chunks = ChunkSize(raw_data.chunks)
    # Hash every chunk of the raw data, keyed on the chunk's extent along the
    # first axis; setdefault keeps the first occurrence of a repeated hash.
    for c in chunks.indices(raw_data.shape):
        data_hash = hashtable.hash(raw_data[c.raw])
        hashtable.setdefault(data_hash, c.args[0])

    hashtable.write()
    return hashtable

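# A minimal usage sketch for from_raw_data (the file name 'data.h5' is
# hypothetical; assumes this classmethod lives on versioned_hdf5's Hashtable
# class): rebuild the chunk-hash -> raw-slice mapping for a dataset whose
# hash table is missing, e.g. after it was deleted.
def _rebuild_hash_table_sketch():
    import h5py
    from versioned_hdf5.hashtable import Hashtable
    with h5py.File('data.h5', 'r+') as f:
        # Raises ValueError if 'hash_table' already exists for this dataset.
        return Hashtable.from_raw_data(f, 'test_data')
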
def test_write_dataset_offset_multidimension(h5file):
    chunks = ChunkSize(3 * (CHUNK_SIZE_3D,))
    shape = (2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D)
    data = np.zeros(shape)
    slices1 = write_dataset(h5file, 'test_data', data, chunks=chunks)

    # shape2 is two elements short of a whole number of chunks in every
    # dimension. Fill each chunk of data2 with its chunk number (the
    # out-of-bounds slice ends clamp).
    shape2 = (2 * CHUNK_SIZE_3D - 2, 2 * CHUNK_SIZE_3D - 2,
              2 * CHUNK_SIZE_3D - 2)
    data2 = np.empty(shape2)
    for n, c in enumerate(chunks.indices(shape)):
        data2[c.raw] = n
    slices2 = write_dataset(h5file, 'test_data', data2, chunks=chunks)

    # All eight chunks of the first write are identical zeros, so they
    # deduplicate to the single raw chunk at offset 0.
    assert slices1 == {
        (Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1)):
            slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1)):
            slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1)):
            slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1)):
            slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1)):
            slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1)):
            slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1)):
            slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1)):
            slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
    }

    # In the second write only chunk 0 deduplicates (it is all zeros); the
    # other seven chunks are appended in order. Chunks that are partial along
    # the first axis are stored as partial raw chunks, so their raw slices
    # end two elements short.
    assert slices2 == {
        (Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1)):
            slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1)):
            slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D),
        (Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1)):
            slice(2 * CHUNK_SIZE_3D, 3 * CHUNK_SIZE_3D),
        (Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1)):
            slice(3 * CHUNK_SIZE_3D, 4 * CHUNK_SIZE_3D),
        (Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1)):
            slice(4 * CHUNK_SIZE_3D, 5 * CHUNK_SIZE_3D - 2),
        (Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1)):
            slice(5 * CHUNK_SIZE_3D, 6 * CHUNK_SIZE_3D - 2),
        (Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
         Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1)):
            slice(6 * CHUNK_SIZE_3D, 7 * CHUNK_SIZE_3D - 2),
        (Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
         Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1)):
            slice(7 * CHUNK_SIZE_3D, 8 * CHUNK_SIZE_3D - 2),
    }

    ds = h5file['/_version_data/test_data/raw_data']
    assert ds.shape == (8 * CHUNK_SIZE_3D, CHUNK_SIZE_3D, CHUNK_SIZE_3D)
    # Each stored chunk holds n in the region actually covered by data2 and
    # the fill value elsewhere; as_subindex re-expresses the full extent of
    # data2 in chunk-relative coordinates.
    for n, c in enumerate(chunks.indices(shape2)):
        a = np.zeros(chunks)
        a[Tuple(*[slice(0, i) for i in shape2]).as_subindex(c).raw] = n
        assert_equal(ds[n * CHUNK_SIZE_3D:(n + 1) * CHUNK_SIZE_3D], a)
    assert ds.dtype == np.float64

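# A minimal standalone sketch (pure ndindex; names hypothetical) of the
# as_subindex idiom used in the verification loop above: an index covering
# the full array is re-expressed relative to one chunk, yielding the
# chunk-local region that real data occupies.
def _as_subindex_sketch():
    from ndindex import Slice, Tuple
    full = Tuple(Slice(0, 6))   # the whole of a length-6 axis
    chunk = Tuple(Slice(4, 8))  # its second, partial chunk of nominal size 4
    # Elements 4 and 5 fall in this chunk, at chunk-local offsets 0:2.
    assert full.as_subindex(chunk) == Tuple(Slice(0, 2))
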
def _recreate_raw_data(f, name, versions_to_delete, tmp=False):
    """
    Create a new raw dataset for name containing only the chunks still
    referenced by versions outside versions_to_delete.

    If no chunks would be left, i.e., the dataset does not appear in any
    version outside versions_to_delete, an empty dict is returned and no new
    dataset is created.

    If tmp is True, the new raw dataset is called '_tmp_raw_data' and is
    placed alongside the existing raw dataset. Otherwise the existing raw
    dataset is replaced.

    Returns a dict mapping each kept chunk of the old raw dataset to its
    location in the new one.
    """
    chunks_map = defaultdict(dict)

    for version_name in all_versions(f):
        if (version_name in versions_to_delete
                or name not in f['_version_data/versions'][version_name]):
            continue

        dataset = f['_version_data/versions'][version_name][name]
        virtual_sources = dataset.virtual_sources()
        slice_map = {spaceid_to_slice(i.vspace): spaceid_to_slice(i.src_space)
                     for i in virtual_sources}
        chunks_map[version_name].update(slice_map)

    chunks_to_keep = set().union(*[slice_map.values()
                                   for slice_map in chunks_map.values()])
    # Keep the surviving chunks in their original order (by offset along the
    # first axis of the raw data).
    chunks_to_keep = sorted(chunks_to_keep, key=lambda i: i.args[0].args[0])

    if not chunks_to_keep:
        return {}

    raw_data = f['_version_data'][name]['raw_data']
    chunks = ChunkSize(raw_data.chunks)
    new_shape = (len(chunks_to_keep) * chunks[0], *chunks[1:])
    new_raw_data = f['_version_data'][name].create_dataset(
        '_tmp_raw_data', shape=new_shape, maxshape=(None,) + chunks[1:],
        chunks=raw_data.chunks, dtype=raw_data.dtype,
        compression=raw_data.compression,
        compression_opts=raw_data.compression_opts,
        fillvalue=raw_data.fillvalue)
    for key, val in raw_data.attrs.items():
        new_raw_data.attrs[key] = val

    r = raw_data[:]
    n = np.full(new_raw_data.shape, new_raw_data.fillvalue,
                dtype=new_raw_data.dtype)
    raw_data_chunks_map = {}
    for new_chunk, chunk in zip(chunks.indices(new_shape), chunks_to_keep):
        # Shrink new_chunk to the size of chunk, in case chunk isn't a full
        # chunk in one of the dimensions.
        # TODO: Implement something in ndindex to do this.
        new_chunk = Tuple(*[Slice(new_chunk.args[i].start,
                                  new_chunk.args[i].start + len(chunk.args[i]))
                            for i in range(len(new_chunk.args))])
        raw_data_chunks_map[chunk] = new_chunk
        n[new_chunk.raw] = r[chunk.raw]

    new_raw_data[:] = n
    if not tmp:
        del f['_version_data'][name]['raw_data']
        f['_version_data'][name].move('_tmp_raw_data', 'raw_data')

    return raw_data_chunks_map

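# A minimal standalone sketch (pure ndindex; the helper name is hypothetical)
# of the chunk-shrinking step in the loop above, which the TODO asks to move
# into ndindex: a full destination chunk keeps its start along each axis but
# takes the possibly shorter length of the source chunk.
def _shrink_chunk_sketch():
    from ndindex import Slice, Tuple

    def shrink(new_chunk, chunk):
        return Tuple(*[Slice(n.start, n.start + len(c))
                       for n, c in zip(new_chunk.args, chunk.args)])

    # A 10-wide destination chunk paired with an 8-wide partial source chunk.
    assert shrink(Tuple(Slice(10, 20)),
                  Tuple(Slice(30, 38))) == Tuple(Slice(10, 18))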