def test_consolidated_with_chunk_store():
    # setup initial data
    store = MemoryStore()
    chunk_store = MemoryStore()
    z = group(store, chunk_store=chunk_store)
    z.create_group('g1')
    g2 = z.create_group('g2')
    g2.attrs['hello'] = 'world'
    arr = g2.create_dataset('arr', shape=(20, 20), chunks=(5, 5), dtype='f8')
    assert 16 == arr.nchunks
    assert 0 == arr.nchunks_initialized
    arr.attrs['data'] = 1
    arr[:] = 1.0
    assert 16 == arr.nchunks_initialized

    # perform consolidation
    out = consolidate_metadata(store)
    assert isinstance(out, Group)
    assert '.zmetadata' in store
    for key in [
            '.zgroup', 'g1/.zgroup', 'g2/.zgroup', 'g2/.zattrs',
            'g2/arr/.zarray', 'g2/arr/.zattrs'
    ]:
        del store[key]
    # open consolidated
    z2 = open_consolidated(store, chunk_store=chunk_store)
    assert ['g1', 'g2'] == list(z2)
    assert 'world' == z2.g2.attrs['hello']
    assert 1 == z2.g2.arr.attrs['data']
    assert (z2.g2.arr[:] == 1.0).all()
    assert 16 == z2.g2.arr.nchunks
    assert 16 == z2.g2.arr.nchunks_initialized

    # test the data are writeable
    z2.g2.arr[:] = 2
    assert (z2.g2.arr[:] == 2).all()

    # test invalid modes
    with pytest.raises(ValueError):
        open_consolidated(store, mode='a', chunk_store=chunk_store)
    with pytest.raises(ValueError):
        open_consolidated(store, mode='w', chunk_store=chunk_store)

    # make sure keyword arguments are passed through without error
    open_consolidated(store,
                      cache_attrs=True,
                      synchronizer=None,
                      chunk_store=chunk_store)


def test_consolidate_metadata():

    # setup initial data
    store = MemoryStore()
    z = group(store)
    z.create_group('g1')
    g2 = z.create_group('g2')
    g2.attrs['hello'] = 'world'
    arr = g2.create_dataset('arr', shape=(20, 20), chunks=(5, 5), dtype='f8')
    assert 16 == arr.nchunks
    assert 0 == arr.nchunks_initialized
    arr.attrs['data'] = 1
    arr[:] = 1.0
    assert 16 == arr.nchunks_initialized

    # perform consolidation
    out = consolidate_metadata(store)
    assert isinstance(out, Group)
    assert '.zmetadata' in store
    for key in [
            '.zgroup', 'g1/.zgroup', 'g2/.zgroup', 'g2/.zattrs',
            'g2/arr/.zarray', 'g2/arr/.zattrs'
    ]:
        del store[key]

    # open consolidated
    z2 = open_consolidated(store)
    assert ['g1', 'g2'] == list(z2)
    assert 'world' == z2.g2.attrs['hello']
    assert 1 == z2.g2.arr.attrs['data']
    assert (z2.g2.arr[:] == 1.0).all()
    assert 16 == z2.g2.arr.nchunks
    assert 16 == z2.g2.arr.nchunks_initialized

    # tests del/write on the store
    cmd = ConsolidatedMetadataStore(store)
    with pytest.raises(PermissionError):
        del cmd['.zgroup']
    with pytest.raises(PermissionError):
        cmd['.zgroup'] = None

    # test getsize on the store
    assert isinstance(getsize(cmd), Integral)

    # test new metadata are not writeable
    with pytest.raises(PermissionError):
        z2.create_group('g3')
    with pytest.raises(PermissionError):
        z2.create_dataset('spam', shape=42, chunks=7, dtype='i4')
    with pytest.raises(PermissionError):
        del z2['g2']

    # test consolidated metadata are not writeable
    with pytest.raises(PermissionError):
        z2.g2.attrs['hello'] = 'universe'
    with pytest.raises(PermissionError):
        z2.g2.arr.attrs['foo'] = 'bar'

    # test the data are writeable
    z2.g2.arr[:] = 2
    assert (z2.g2.arr[:] == 2).all()

    # test invalid modes
    with pytest.raises(ValueError):
        open_consolidated(store, mode='a')
    with pytest.raises(ValueError):
        open_consolidated(store, mode='w')

    # make sure keyword arguments are passed through without error
    open_consolidated(store, cache_attrs=True, synchronizer=None)
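
The two tests above exercise the same basic round trip. As a minimal standalone sketch (illustrative only, not part of the test suite), the consolidation workflow looks roughly like this:

import zarr
from zarr.storage import MemoryStore

# Illustrative sketch of the consolidation workflow exercised above.
store = MemoryStore()
root = zarr.group(store=store)
arr = root.create_dataset('arr', shape=(20, 20), chunks=(5, 5), dtype='f8')
arr[:] = 1.0

zarr.consolidate_metadata(store)        # writes a single '.zmetadata' key
root2 = zarr.open_consolidated(store)   # metadata becomes read-only
root2['arr'][:] = 2.0                   # chunk data remain writeable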
Example #3
def xds_from_zarr(store, columns=None, chunks=None, **kwargs):
    """
    Reads the zarr data store in `store` and returns list of
    Dataset's containing the data.

    Parameters
    ----------
    store : str or Path
        Path containing the data
    columns : list of str or str or None
        Columns to read. `None` or `"ALL"` stores all columns on each dataset.
        Otherwise, a list of columns should be supplied.
    chunks: dict or list of dicts
        chunking schema for each dataset
    **kwargs: optional

    Returns
    -------
    writes : Dataset or list of Datasets
        Dataset(s) representing write operations
    """

    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (Path, str)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_from_zarr: {kwargs}", UserWarning)

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if not all(isinstance(v, dict) for v in chunks):
            raise TypeError("chunks must be None, a dict or a list of dicts")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None, a dict or a list of dicts")

    datasets = []
    numpy_vars = []

    # NOTE(JSKenyon): Iterating over all the zarr groups/arrays is VERY
    # expensive if the metadata has not been consolidated.
    zc.consolidate_metadata(store.map)
    table_path = store.table if store.table else "MAIN"
    table_group = zarr.open_consolidated(store.map)[table_path]

    for g, (group_name,
            group) in enumerate(sorted(table_group.groups(),
                                       key=group_sortkey)):
        group_attrs = decode_attr(dict(group.attrs))
        dask_ms_attrs = group_attrs.pop(DASKMS_ATTR_KEY)
        natural_chunks = dask_ms_attrs["chunks"]
        group_chunks = {d: tuple(dc) for d, dc in natural_chunks.items()}

        if chunks:
            # Defer to user-supplied chunking strategy
            try:
                group_chunks.update(chunks[g])
            except IndexError:
                group_chunks.update(chunks[-1])  # Reuse the last chunking.

        data_vars = {}
        coords = {}

        for name, zarray in column_iterator(group, columns):
            attrs = decode_attr(dict(zarray.attrs[DASKMS_ATTR_KEY]))
            dims = attrs["dims"]
            coordinate = attrs.get("coordinate", False)
            array_chunks = tuple(
                group_chunks.get(d, s) for d, s in zip(dims, zarray.shape))

            array_chunks = da.core.normalize_chunks(array_chunks, zarray.shape)
            ext_args = extent_args(dims, array_chunks)
            token_name = f"read~{name}-{tokenize(zarray, *ext_args)}"

            read = da.blockwise(zarr_getter,
                                dims,
                                zarray,
                                None,
                                *ext_args,
                                concatenate=False,
                                name=token_name,
                                meta=np.empty((0, ) * zarray.ndim,
                                              zarray.dtype))

            read = inlined_array(read, ext_args[::2])
            var = Variable(dims, read, attrs)
            (coords if coordinate else data_vars)[name] = var

            # Save numpy arrays for reification
            typ = decode_type(attrs["array_type"])

            if typ is np.ndarray:
                numpy_vars.append(var)
            elif typ is da.Array:
                pass
            else:
                raise TypeError(f"Unknown array_type '{attrs['array_type']}'")

        datasets.append(Dataset(data_vars, coords=coords, attrs=group_attrs))

    # Reify any numpy arrays directly into their variables
    for v, a in zip(numpy_vars, dask.compute(v.data for v in numpy_vars)[0]):
        v.data = a

    return datasets
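
A hedged usage sketch for `xds_from_zarr` (the path, column names and chunk sizes below are assumptions for illustration, not taken from the source):

# Hypothetical call: read two columns from a zarr-backed dataset,
# overriding the row chunking stored on disk.
datasets = xds_from_zarr(
    "/path/to/example.zarr",       # str/Path is wrapped in a DaskMSStore internally
    columns=["DATA", "FLAG"],      # or None / "ALL" to read every column
    chunks=[{"row": 10000}],       # one dict per dataset; a single dict is also accepted
)

for ds in datasets:
    print(ds)                      # each element is a Dataset built above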
Example #4
def test_consolidate_metadata(with_chunk_store,
                              zarr_version,
                              listable,
                              monkeypatch,
                              stores_from_path):

    # setup initial data
    if stores_from_path:
        store = tempfile.mkdtemp()
        atexit.register(atexit_rmtree, store)
        if with_chunk_store:
            chunk_store = tempfile.mkdtemp()
            atexit.register(atexit_rmtree, chunk_store)
        else:
            chunk_store = None
        version_kwarg = {'zarr_version': zarr_version}
    else:
        if zarr_version == 2:
            store = MemoryStore()
            chunk_store = MemoryStore() if with_chunk_store else None
        elif zarr_version == 3:
            store = MemoryStoreV3()
            chunk_store = MemoryStoreV3() if with_chunk_store else None
        version_kwarg = {}
    path = 'dataset' if zarr_version == 3 else None
    z = group(store, chunk_store=chunk_store, path=path, **version_kwarg)

    # Keep a reference to the actual store implementation (relevant when
    # `store` was supplied as a path string) so its contents can be copied
    # to an unlistable store later in the test.
    store_to_copy = z.store

    z.create_group('g1')
    g2 = z.create_group('g2')
    g2.attrs['hello'] = 'world'
    arr = g2.create_dataset('arr', shape=(20, 20), chunks=(5, 5), dtype='f8')
    assert 16 == arr.nchunks
    assert 0 == arr.nchunks_initialized
    arr.attrs['data'] = 1
    arr[:] = 1.0
    assert 16 == arr.nchunks_initialized

    if stores_from_path:
        # get the actual store class for use with consolidate_metadata
        store_class = z._store
    else:
        store_class = store

    if zarr_version == 3:
        # error on v3 if path not provided
        with pytest.raises(ValueError):
            consolidate_metadata(store_class, path=None)

        with pytest.raises(ValueError):
            consolidate_metadata(store_class, path='')

    # perform consolidation
    out = consolidate_metadata(store_class, path=path)
    assert isinstance(out, Group)
    assert ['g1', 'g2'] == list(out)
    if not stores_from_path:
        if zarr_version == 2:
            assert isinstance(out._store, ConsolidatedMetadataStore)
            assert '.zmetadata' in store
            meta_keys = ['.zgroup',
                         'g1/.zgroup',
                         'g2/.zgroup',
                         'g2/.zattrs',
                         'g2/arr/.zarray',
                         'g2/arr/.zattrs']
        else:
            assert isinstance(out._store, ConsolidatedMetadataStoreV3)
            assert 'meta/root/consolidated/.zmetadata' in store
            meta_keys = ['zarr.json',
                         meta_root + 'dataset.group.json',
                         meta_root + 'dataset/g1.group.json',
                         meta_root + 'dataset/g2.group.json',
                         meta_root + 'dataset/g2/arr.array.json',
                         'meta/root/consolidated.group.json']
        for key in meta_keys:
            del store[key]

    # https://github.com/zarr-developers/zarr-python/issues/993
    # Make sure we can still open consolidated on an unlistable store:
    if not listable:
        fs_memory = pytest.importorskip("fsspec.implementations.memory")
        monkeypatch.setattr(fs_memory.MemoryFileSystem, "isdir", lambda x, y: False)
        monkeypatch.delattr(fs_memory.MemoryFileSystem, "ls")
        fs = fs_memory.MemoryFileSystem()
        if zarr_version == 2:
            store_to_open = FSStore("", fs=fs)
        else:
            store_to_open = FSStoreV3("", fs=fs)

        # copy original store to new unlistable store
        store_to_open.update(store_to_copy)

    else:
        store_to_open = store

    # open consolidated
    z2 = open_consolidated(store_to_open, chunk_store=chunk_store, path=path, **version_kwarg)
    assert ['g1', 'g2'] == list(z2)
    assert 'world' == z2.g2.attrs['hello']
    assert 1 == z2.g2.arr.attrs['data']
    assert (z2.g2.arr[:] == 1.0).all()
    assert 16 == z2.g2.arr.nchunks
    if listable:
        assert 16 == z2.g2.arr.nchunks_initialized
    else:
        with pytest.raises(NotImplementedError):
            _ = z2.g2.arr.nchunks_initialized

    if stores_from_path:
        # A path string is not a BaseStore subclass, so it cannot be used to
        # initialize a ConsolidatedMetadataStore.
        if zarr_version == 2:
            with pytest.raises(ValueError):
                cmd = ConsolidatedMetadataStore(store)
        elif zarr_version == 3:
            with pytest.raises(ValueError):
                cmd = ConsolidatedMetadataStoreV3(store)
    else:
        # tests del/write on the store
        if zarr_version == 2:
            cmd = ConsolidatedMetadataStore(store)
            with pytest.raises(PermissionError):
                del cmd['.zgroup']
            with pytest.raises(PermissionError):
                cmd['.zgroup'] = None
        else:
            cmd = ConsolidatedMetadataStoreV3(store)
            with pytest.raises(PermissionError):
                del cmd[meta_root + 'dataset.group.json']
            with pytest.raises(PermissionError):
                cmd[meta_root + 'dataset.group.json'] = None

        # test getsize on the store
        assert isinstance(getsize(cmd), Integral)

    # test new metadata are not writeable
    with pytest.raises(PermissionError):
        z2.create_group('g3')
    with pytest.raises(PermissionError):
        z2.create_dataset('spam', shape=42, chunks=7, dtype='i4')
    with pytest.raises(PermissionError):
        del z2['g2']

    # test consolidated metadata are not writeable
    with pytest.raises(PermissionError):
        z2.g2.attrs['hello'] = 'universe'
    with pytest.raises(PermissionError):
        z2.g2.arr.attrs['foo'] = 'bar'

    # test the data are writeable
    z2.g2.arr[:] = 2
    assert (z2.g2.arr[:] == 2).all()

    # test invalid modes
    with pytest.raises(ValueError):
        open_consolidated(store, chunk_store=chunk_store, mode='a', path=path)
    with pytest.raises(ValueError):
        open_consolidated(store, chunk_store=chunk_store, mode='w', path=path)
    with pytest.raises(ValueError):
        open_consolidated(store, chunk_store=chunk_store, mode='w-', path=path)

    # make sure keyword arguments are passed through without error
    open_consolidated(
        store, chunk_store=chunk_store, path=path, cache_attrs=True, synchronizer=None,
        **version_kwarg,
    )