Code example #1
def test_write_dataset_chunks_multidimension(h5file):
    chunks = ChunkSize(3 * (CHUNK_SIZE_3D, ))
    shape = (2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D)
    data = np.zeros(shape)
    slices1 = write_dataset(h5file, 'test_data', data, chunks=chunks)
    data_dict = {}
    for n, c in enumerate(chunks.indices(shape)):
        if n == 0:
            data_dict[c] = slices1[c]
        else:
            data_dict[c] = n * np.ones(chunks)

    slices1 = write_dataset(h5file, 'test_data', data, chunks=chunks)
    slices2 = write_dataset_chunks(h5file, 'test_data', data_dict)

    assert slices1 == {
        c: slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D)
        for c in chunks.indices(shape)
    }
    assert slices2 == {
        c: slice(i * CHUNK_SIZE_3D, (i + 1) * CHUNK_SIZE_3D)
        for i, c in enumerate(chunks.indices(shape))
    }

    ds = h5file['/_version_data/test_data/raw_data']
    assert ds.shape == (8 * CHUNK_SIZE_3D, CHUNK_SIZE_3D, CHUNK_SIZE_3D)
    for n in range(8):
        assert_equal(ds[n * CHUNK_SIZE_3D:(n + 1) * CHUNK_SIZE_3D], n)
    assert ds.dtype == np.float64
Code example #2
File: backend.py, Project: deshaw/versioned-hdf5
def write_dataset(f, name, data, chunks=None, dtype=None, compression=None,
                  compression_opts=None, fillvalue=None):

    if name not in f['_version_data']:
        return create_base_dataset(f, name, data=data, dtype=dtype,
                                   chunks=chunks, compression=compression,
                                   compression_opts=compression_opts, fillvalue=fillvalue)

    ds = f['_version_data'][name]['raw_data']
    if isinstance(chunks, int) and not isinstance(chunks, bool):
        chunks = (chunks,)
    if chunks is None:
        chunks = tuple(ds.attrs['chunks'])
    else:
        if chunks != tuple(ds.attrs['chunks']):
            raise ValueError("Chunk size specified but doesn't match already existing chunk size")

    if dtype is not None:
        if dtype != ds.dtype:
            raise ValueError("dtype specified but doesn't match already existing dtype")

    if (compression and compression != ds.compression) or (
            compression_opts and compression_opts != ds.compression_opts):
        raise ValueError("Compression options can only be specified for the first version of a dataset")
    if fillvalue is not None and fillvalue != ds.fillvalue:
        dtype = ds.dtype
        if dtype.metadata and ('vlen' in dtype.metadata or 'h5py_encoding' in dtype.metadata):
            # Variable length string dtype. The ds.fillvalue will be None in
            # this case (see create_virtual_dataset() below)
            pass
        else:
            raise ValueError(f"fillvalues do not match ({fillvalue} != {ds.fillvalue})")
    if data.dtype != ds.dtype:
        raise ValueError(f"dtypes do not match ({data.dtype} != {ds.dtype})")
    # TODO: Handle more than one dimension
    old_shape = ds.shape
    slices = {}
    slices_to_write = {}
    chunk_size = chunks[0]

    with Hashtable(f, name) as hashtable:
        if len(data.shape) != 0:
            for s in ChunkSize(chunks).indices(data.shape):
                idx = hashtable.largest_index
                data_s = data[s.raw]
                raw_slice = Slice(idx*chunk_size, idx*chunk_size + data_s.shape[0])
                data_hash = hashtable.hash(data_s)
                raw_slice2 = hashtable.setdefault(data_hash, raw_slice)
                if raw_slice2 == raw_slice:
                    slices_to_write[raw_slice] = s
                slices[s] = raw_slice2

            ds.resize((old_shape[0] + len(slices_to_write)*chunk_size,) + chunks[1:])
            for raw_slice, s in slices_to_write.items():
                # idx = raw_slice.expand(ds.shape[:1] + s.newshape(data.shape)[1:])
                data_s = data[s.raw]
                idx = Tuple(raw_slice, *[slice(0, i) for i in data_s.shape[1:]])
                ds[idx.raw] = data[s.raw]
    return slices
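
The dedup logic above can be summarized with a short, self-contained sketch: each incoming chunk is hashed, and only chunks whose hash has not been seen before get new space in the raw dataset. This is only an illustration, using hashlib and a plain dict in place of the library's Hashtable (which also persists the table inside the HDF5 file); chunk_size and the data are made-up values.

import hashlib

import numpy as np
from ndindex import ChunkSize

chunk_size = 4
data = np.zeros((3 * chunk_size,))

table = {}       # hash -> raw slice, standing in for Hashtable
slices = {}      # chunk index -> raw slice, like write_dataset's return value
next_chunk = 0
for s in ChunkSize((chunk_size,)).indices(data.shape):
    chunk = data[s.raw]
    h = hashlib.sha256(chunk.tobytes()).digest()
    if h not in table:
        # Unseen content: append a new chunk to the raw dataset.
        table[h] = slice(next_chunk * chunk_size,
                         next_chunk * chunk_size + len(chunk))
        next_chunk += 1
    slices[s] = table[h]

# Only one raw chunk was allocated: all three all-zero chunks dedup to it.
assert next_chunk == 1
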
Code example #3
def test_create_virtual_dataset_offset_multidimension(h5file):
    chunks = ChunkSize(3 * (CHUNK_SIZE_3D, ))
    shape = (2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D)
    data = np.zeros(shape)
    write_dataset(h5file, 'test_data', data, chunks=chunks)
    shape2 = (2 * CHUNK_SIZE_3D - 2, 2 * CHUNK_SIZE_3D - 2,
              2 * CHUNK_SIZE_3D - 2)
    data2 = np.empty(shape2)
    for n, c in enumerate(chunks.indices(shape)):
        data2[c.raw] = n

    slices2 = write_dataset(h5file, 'test_data', data2, chunks=chunks)

    virtual_data = create_virtual_dataset(h5file, 'test_version', 'test_data',
                                          shape2, slices2)

    assert virtual_data.shape == shape2
    assert_equal(virtual_data[()], data2)
    assert virtual_data.dtype == np.float64
Code example #4
File: wrappers.py, Project: asmeurer/versioned-hdf5
    def __init__(self, name, *, shape, dtype, parent, chunks=None, fillvalue=None):
        if shape is None:
            raise TypeError("shape must be specified for sparse datasets")
        self.name = name
        self.shape = shape
        self.dtype = np.dtype(dtype)
        self.attrs = {}
        self._fillvalue = fillvalue
        if chunks in [True, None]:
            if len(shape) == 1:
                chunks = (DEFAULT_CHUNK_SIZE,)
            else:
                raise NotImplementedError("chunks must be specified for multi-dimensional datasets")
        self.chunks = ChunkSize(chunks)
        self.parent = parent

        # This works like a mix between InMemoryArrayDataset and
        # InMemoryDatasetID. Explicit array data is stored in a data_dict like
        # with InMemoryDatasetID, but unlike it, missing data (which equals
        # the fill value) is omitted.
        self.data_dict = {}
Code example #5
    @classmethod
    def from_raw_data(cls,
                      f,
                      name,
                      chunk_size=None,
                      hash_table_name='hash_table'):
        if hash_table_name in f['_version_data'][name]:
            raise ValueError(
                f"a hash table {hash_table_name!r} for {name!r} already exists"
            )

        hashtable = cls(f,
                        name,
                        chunk_size=chunk_size,
                        hash_table_name=hash_table_name)

        raw_data = f['_version_data'][name]['raw_data']
        chunks = ChunkSize(raw_data.chunks)
        for c in chunks.indices(raw_data.shape):
            data_hash = hashtable.hash(raw_data[c.raw])
            hashtable.setdefault(data_hash, c.args[0])

        hashtable.write()
        return hashtable
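
A hedged usage sketch of the classmethod above: rebuilding a hash table directly from the raw chunks, for instance after the original table has been discarded. The import path (versioned_hdf5.hashtable), the file name, and the new table name are assumptions for illustration; `f` must already contain the '_version_data/<name>/raw_data' layout created by write_dataset.

import h5py
from versioned_hdf5.hashtable import Hashtable   # assumed import path

with h5py.File('data.h5', 'r+') as f:            # hypothetical file
    # A fresh table name is required: reusing 'hash_table' raises
    # ValueError because that table already exists for the dataset.
    ht = Hashtable.from_raw_data(f, 'test_data',
                                 hash_table_name='hash_table_rebuilt')
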
Code example #6
def test_write_dataset_offset_multidimension(h5file):
    chunks = ChunkSize(3 * (CHUNK_SIZE_3D, ))
    shape = (2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D)
    data = np.zeros(shape)
    slices1 = write_dataset(h5file, 'test_data', data, chunks=chunks)
    shape2 = (2 * CHUNK_SIZE_3D - 2, 2 * CHUNK_SIZE_3D - 2,
              2 * CHUNK_SIZE_3D - 2)
    data2 = np.empty(shape2)
    for n, c in enumerate(chunks.indices(shape)):
        data2[c.raw] = n

    slices2 = write_dataset(h5file, 'test_data', data2, chunks=chunks)

    assert slices1 == {
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ):
        slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
        ):
        slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ):
        slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
        ):
        slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ):
        slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
        ):
        slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ):
        slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D, 1),
        ):
        slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
    }

    assert slices2 == {
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ):
        slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D),
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
        ):
        slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D),
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ):
        slice(2 * CHUNK_SIZE_3D, 3 * CHUNK_SIZE_3D),
        (
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
        ):
        slice(3 * CHUNK_SIZE_3D, 4 * CHUNK_SIZE_3D),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ):
        slice(4 * CHUNK_SIZE_3D, 5 * CHUNK_SIZE_3D - 2),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
        ):
        slice(5 * CHUNK_SIZE_3D, 6 * CHUNK_SIZE_3D - 2),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(0 * CHUNK_SIZE_3D, 1 * CHUNK_SIZE_3D, 1),
        ):
        slice(6 * CHUNK_SIZE_3D, 7 * CHUNK_SIZE_3D - 2),
        (
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
            Slice(1 * CHUNK_SIZE_3D, 2 * CHUNK_SIZE_3D - 2, 1),
        ):
        slice(7 * CHUNK_SIZE_3D, 8 * CHUNK_SIZE_3D - 2),
    }

    ds = h5file['/_version_data/test_data/raw_data']
    assert ds.shape == (8 * CHUNK_SIZE_3D, CHUNK_SIZE_3D, CHUNK_SIZE_3D)
    for n, c in enumerate(chunks.indices(shape2)):
        a = np.zeros(chunks)
        a[Tuple(*[slice(0, i) for i in shape2]).as_subindex(c).raw] = n
        assert_equal(ds[n * CHUNK_SIZE_3D:(n + 1) * CHUNK_SIZE_3D], a)
    assert ds.dtype == np.float64
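
The expression `Tuple(*[slice(0, i) for i in shape2]).as_subindex(c)` in the last loop computes, for each chunk c, the part of the (smaller) shape2 region that falls inside that chunk, expressed relative to the chunk's own origin. A minimal ndindex-only illustration with made-up sizes:

from ndindex import ChunkSize, Tuple

chunks = ChunkSize((4, 4))
shape = (6, 6)                    # 2 partial chunks along each axis
full = Tuple(*[slice(0, i) for i in shape])
for c in chunks.indices(shape):
    # as_subindex(c) re-expresses `full` relative to chunk c's origin,
    # i.e. which part of a full-size chunk actually holds data.
    # .raw converts the ndindex objects back to plain tuples of slices.
    print(c.raw, '->', full.as_subindex(c).raw)
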
Code example #7
def _recreate_raw_data(f, name, versions_to_delete, tmp=False):
    """
    Return a new raw data set for a dataset without the chunks from
    versions_to_delete.

    If no chunks would be left, i.e., the dataset does not appear in any
    version not in versions_to_delete, None is returned.

    If tmp is True, the new raw dataset is called '_tmp_raw_data' and is
    placed alongside the existing raw dataset. Otherwise the existing raw
    dataset is replaced.

    """
    chunks_map = defaultdict(dict)

    for version_name in all_versions(f):
        if (version_name in versions_to_delete
                or name not in f['_version_data/versions'][version_name]):
            continue

        dataset = f['_version_data/versions'][version_name][name]

        virtual_sources = dataset.virtual_sources()
        slice_map = {
            spaceid_to_slice(i.vspace): spaceid_to_slice(i.src_space)
            for i in virtual_sources
        }
        chunks_map[version_name].update(slice_map)

    chunks_to_keep = set().union(
        *[map.values() for map in chunks_map.values()])

    chunks_to_keep = sorted(chunks_to_keep, key=lambda i: i.args[0].args[0])

    if not chunks_to_keep:
        return {}

    raw_data = f['_version_data'][name]['raw_data']
    chunks = ChunkSize(raw_data.chunks)
    new_shape = (len(chunks_to_keep) * chunks[0], *chunks[1:])

    new_raw_data = f['_version_data'][name].create_dataset(
        '_tmp_raw_data',
        shape=new_shape,
        maxshape=(None, ) + chunks[1:],
        chunks=raw_data.chunks,
        dtype=raw_data.dtype,
        compression=raw_data.compression,
        compression_opts=raw_data.compression_opts,
        fillvalue=raw_data.fillvalue)
    for key, val in raw_data.attrs.items():
        new_raw_data.attrs[key] = val

    r = raw_data[:]
    n = np.full(new_raw_data.shape,
                new_raw_data.fillvalue,
                dtype=new_raw_data.dtype)
    raw_data_chunks_map = {}
    for new_chunk, chunk in zip(chunks.indices(new_shape), chunks_to_keep):
        # Shrink new_chunk to the size of chunk, in case chunk isn't a full
        # chunk in one of the dimensions.
        # TODO: Implement something in ndindex to do this.
        new_chunk = Tuple(*[
            Slice(new_chunk.args[i].start, new_chunk.args[i].start +
                  len(chunk.args[i])) for i in range(len(new_chunk.args))
        ])
        raw_data_chunks_map[chunk] = new_chunk
        n[new_chunk.raw] = r[chunk.raw]

    new_raw_data[:] = n
    if not tmp:
        del f['_version_data'][name]['raw_data']
        f['_version_data'][name].move('_tmp_raw_data', 'raw_data')

    return raw_data_chunks_map
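
The copy loop above compacts the surviving chunks into a new, smaller raw dataset and records where each one moved. A numpy-only sketch of that remapping (made-up chunk size and "kept" slices, not the library code):

import numpy as np

chunk = 4
raw = np.arange(5 * chunk)                    # pretend raw_data with 5 chunks
keep = [slice(1 * chunk, 2 * chunk),          # chunks still referenced by
        slice(3 * chunk, 4 * chunk)]          # the surviving versions
new_raw = np.empty(len(keep) * chunk, dtype=raw.dtype)
raw_data_chunks_map = {}
for n, old in enumerate(keep):
    new = slice(n * chunk, (n + 1) * chunk)
    new_raw[new] = raw[old]
    # The old -> new map is what callers use to rewrite virtual datasets.
    raw_data_chunks_map[(old.start, old.stop)] = (new.start, new.stop)
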
Code example #8
File: wrappers.py, Project: asmeurer/versioned-hdf5
class InMemorySparseDataset(DatasetLike):
    """
    Class that looks like a Dataset that has no data (only the fillvalue)
    """
    def __init__(self, name, *, shape, dtype, parent, chunks=None, fillvalue=None):
        if shape is None:
            raise TypeError("shape must be specified for sparse datasets")
        self.name = name
        self.shape = shape
        self.dtype = np.dtype(dtype)
        self.attrs = {}
        self._fillvalue = fillvalue
        if chunks in [True, None]:
            if len(shape) == 1:
                chunks = (DEFAULT_CHUNK_SIZE,)
            else:
                raise NotImplementedError("chunks must be specified for multi-dimensional datasets")
        self.chunks = ChunkSize(chunks)
        self.parent = parent

        # This works like a mix between InMemoryArrayDataset and
        # InMemoryDatasetID. Explicit array data is stored in a data_dict like
        # with InMemoryDatasetID, but unlike it, missing data (which equals
        # the fill value) is omitted.
        self.data_dict = {}

    def as_dtype(self, name, dtype, parent, casting='unsafe'):
        """
        Return a copy of `self` as a new dataset with the given `name` and `dtype`
        in the group `parent`.

        `casting` should be as in the numpy astype() method.

        """
        if self.fillvalue is not None:
            new_fillvalue = self.fillvalue.astype(dtype, casting=casting)
        else:
            new_fillvalue = None
        new_data_dict = {}
        for c, chunk_data in self.data_dict.items():
            new_data_dict[c] = chunk_data.astype(dtype, casting=casting)

        return self.from_data_dict(name, new_data_dict, dtype=dtype,
                                   parent=parent, fillvalue=new_fillvalue,
                                   chunks=self.chunks, shape=self.shape)

    @classmethod
    def from_data_dict(cls, name, data_dict, *, shape, dtype, parent,
                       chunks, fillvalue=None):
        """
        Create an InMemorySparseDataset from a data dict.

        This does not do any consistency checks with the provided metadata.
        """
        dataset = cls(name, shape=shape, dtype=dtype, parent=parent, chunks=chunks, fillvalue=fillvalue)
        dataset.data_dict = data_dict
        return dataset

    @classmethod
    def from_dataset(cls, dataset, parent=None):
        # np.testing.assert_equal(dataset[()], dataset.fillvalue)
        return cls(dataset.name, shape=dataset.shape, dtype=dataset.dtype,
                   parent=parent or dataset.parent, chunks=dataset.chunks,
                   fillvalue=dataset.fillvalue)

    def resize(self, size, axis=None):
        if axis is not None:
            if not (0 <= axis < self.ndim):
                raise ValueError("Invalid axis (0 to %s allowed)" % (self.ndim-1))
            try:
                newlen = int(size)
            except TypeError:
                raise TypeError("Argument must be a single int if axis is specified")
            size = list(self.shape)
            size[axis] = newlen

        size = tuple(size)
        if len(size) > 1:
            raise NotImplementedError("More than one dimension is not yet supported")
        self.shape = size

    def __getitem__(self, index):
        idx = ndindex(index).reduce(self.shape)

        newshape = idx.newshape(self.shape)
        arr = np.full(newshape, self.fillvalue, dtype=self.dtype)

        for c in self.chunks.as_subchunks(idx, self.shape):
            if c not in self.data_dict:
                fill = np.broadcast_to(self.fillvalue, c.newshape(self.shape))
                self.data_dict[c] = fill

            if self.data_dict[c].size != 0:
                arr_idx = c.as_subindex(idx)
                chunk_idx = idx.as_subindex(c)
                arr[arr_idx.raw] = self.data_dict[c][chunk_idx.raw]

        # Return arr as a scalar if it is shape () (matching h5py)
        return arr[()]

    def __setitem__(self, index, value):
        self.parent._check_committed()

        idx = ndindex(index).reduce(self.shape)

        val = np.broadcast_to(value, idx.newshape(self.shape))

        for c in self.chunks.as_subchunks(idx, self.shape):
            if c not in self.data_dict:
                # Broadcasted arrays do not actually consume memory
                fill = np.broadcast_to(self.fillvalue, c.newshape(self.shape))
                self.data_dict[c] = fill

            if self.data_dict[c].size != 0:
                val_idx = c.as_subindex(idx)
                if not self.data_dict[c].flags.writeable:
                    # self.data_dict[c] is a broadcasted array from above
                    self.data_dict[c] = self.data_dict[c].copy()
                chunk_idx = idx.as_subindex(c)
                self.data_dict[c][chunk_idx.raw] = val[val_idx.raw]
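
The np.broadcast_to trick used in __getitem__ and __setitem__ above is worth spelling out: a scalar broadcast to a chunk shape is a read-only, zero-copy view, so untouched chunks can "hold" the fill value essentially for free, and a real array is only allocated when a chunk is written to. A small self-contained demonstration:

import numpy as np

fill = np.broadcast_to(0.0, (4, 4))   # zero-copy, read-only view of a scalar
print(fill.flags.writeable)           # False: in-place assignment would fail
chunk = fill.copy()                   # materialize only when the chunk is written
chunk[0, 0] = 7.0
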
Code example #9
File: wrappers.py, Project: asmeurer/versioned-hdf5
    @property
    def chunks(self):
        return ChunkSize(self.id.chunks)