# Imports assumed for the tests in this module (zarr-python v2-era layout;
# exact module paths may differ between releases).
from numbers import Integral

import pytest

from zarr.convenience import consolidate_metadata, open_consolidated
from zarr.hierarchy import Group, group
from zarr.storage import ConsolidatedMetadataStore, MemoryStore, getsize


def test_consolidated_with_chunk_store():
    # setup initial data
    store = MemoryStore()
    chunk_store = MemoryStore()
    z = group(store, chunk_store=chunk_store)
    z.create_group('g1')
    g2 = z.create_group('g2')
    g2.attrs['hello'] = 'world'
    arr = g2.create_dataset('arr', shape=(20, 20), chunks=(5, 5), dtype='f8')
    assert 16 == arr.nchunks
    assert 0 == arr.nchunks_initialized
    arr.attrs['data'] = 1
    arr[:] = 1.0
    assert 16 == arr.nchunks_initialized

    # perform consolidation
    out = consolidate_metadata(store)
    assert isinstance(out, Group)
    assert '.zmetadata' in store
    for key in ['.zgroup',
                'g1/.zgroup',
                'g2/.zgroup', 'g2/.zattrs',
                'g2/arr/.zarray', 'g2/arr/.zattrs']:
        del store[key]

    # open consolidated
    z2 = open_consolidated(store, chunk_store=chunk_store)
    assert ['g1', 'g2'] == list(z2)
    assert 'world' == z2.g2.attrs['hello']
    assert 1 == z2.g2.arr.attrs['data']
    assert (z2.g2.arr[:] == 1.0).all()
    assert 16 == z2.g2.arr.nchunks
    assert 16 == z2.g2.arr.nchunks_initialized

    # test the data are writeable
    z2.g2.arr[:] = 2
    assert (z2.g2.arr[:] == 2).all()

    # test invalid modes
    with pytest.raises(ValueError):
        open_consolidated(store, mode='a', chunk_store=chunk_store)
    with pytest.raises(ValueError):
        open_consolidated(store, mode='w', chunk_store=chunk_store)

    # make sure keyword arguments are passed through without error
    open_consolidated(store, cache_attrs=True, synchronizer=None,
                      chunk_store=chunk_store)

def test_consolidate_metadata():
    # setup initial data
    store = MemoryStore()
    z = group(store)
    z.create_group('g1')
    g2 = z.create_group('g2')
    g2.attrs['hello'] = 'world'
    arr = g2.create_dataset('arr', shape=(20, 20), chunks=(5, 5), dtype='f8')
    assert 16 == arr.nchunks
    assert 0 == arr.nchunks_initialized
    arr.attrs['data'] = 1
    arr[:] = 1.0
    assert 16 == arr.nchunks_initialized

    # perform consolidation
    out = consolidate_metadata(store)
    assert isinstance(out, Group)
    assert '.zmetadata' in store
    for key in ['.zgroup',
                'g1/.zgroup',
                'g2/.zgroup', 'g2/.zattrs',
                'g2/arr/.zarray', 'g2/arr/.zattrs']:
        del store[key]

    # open consolidated
    z2 = open_consolidated(store)
    assert ['g1', 'g2'] == list(z2)
    assert 'world' == z2.g2.attrs['hello']
    assert 1 == z2.g2.arr.attrs['data']
    assert (z2.g2.arr[:] == 1.0).all()
    assert 16 == z2.g2.arr.nchunks
    assert 16 == z2.g2.arr.nchunks_initialized

    # tests del/write on the store
    cmd = ConsolidatedMetadataStore(store)
    with pytest.raises(PermissionError):
        del cmd['.zgroup']
    with pytest.raises(PermissionError):
        cmd['.zgroup'] = None

    # test getsize on the store
    assert isinstance(getsize(cmd), Integral)

    # test new metadata are not writeable
    with pytest.raises(PermissionError):
        z2.create_group('g3')
    with pytest.raises(PermissionError):
        z2.create_dataset('spam', shape=42, chunks=7, dtype='i4')
    with pytest.raises(PermissionError):
        del z2['g2']

    # test consolidated metadata are not writeable
    with pytest.raises(PermissionError):
        z2.g2.attrs['hello'] = 'universe'
    with pytest.raises(PermissionError):
        z2.g2.arr.attrs['foo'] = 'bar'

    # test the data are writeable
    z2.g2.arr[:] = 2
    assert (z2.g2.arr[:] == 2).all()

    # test invalid modes
    with pytest.raises(ValueError):
        open_consolidated(store, mode='a')
    with pytest.raises(ValueError):
        open_consolidated(store, mode='w')

    # make sure keyword arguments are passed through without error
    open_consolidated(store, cache_attrs=True, synchronizer=None)
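
# A minimal sketch (not from the source) of the consolidation workflow the
# two tests above exercise, using only the public zarr-python v2 API. The
# store contents and array shape are illustrative.
def example_consolidation_workflow():
    import zarr

    # Build a small hierarchy in a memory store.
    store = zarr.MemoryStore()
    root = zarr.group(store)
    arr = root.create_dataset('arr', shape=(20, 20), chunks=(5, 5),
                              dtype='f8')
    arr[:] = 1.0

    # Collapse every .zgroup/.zattrs/.zarray document into a single
    # '.zmetadata' key, then reopen using only that key. This is why the
    # tests can delete the per-node metadata keys and still open the group.
    zarr.consolidate_metadata(store)
    root2 = zarr.open_consolidated(store)
    assert (root2['arr'][:] == 1.0).all()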

# External imports assumed for xds_from_zarr; `zc` is taken to be
# zarr.convenience. The remaining helpers (DaskMSStore, promote_columns,
# column_iterator, extent_args, zarr_getter, decode_attr, decode_type,
# inlined_array, group_sortkey, DASKMS_ATTR_KEY, Dataset, Variable) are
# dask-ms internals defined elsewhere in the codebase.
import warnings
from pathlib import Path

import dask
import dask.array as da
import numpy as np
import zarr
import zarr.convenience as zc
from dask.base import tokenize


def xds_from_zarr(store, columns=None, chunks=None, **kwargs):
    """
    Reads the zarr data store in `store` and returns a list of Datasets
    containing the data.

    Parameters
    ----------
    store : str or Path
        Path containing the data
    columns : list of str or str or None
        Columns to read. `None` or `"ALL"` reads all columns on each
        dataset. Otherwise, a list of columns should be supplied.
    chunks : dict or list of dicts
        Chunking schema for each dataset
    **kwargs : optional
        Only `storage_options` is consumed; any other keyword arguments
        are ignored with a warning.

    Returns
    -------
    datasets : list of Datasets
        Datasets representing the data in the store
    """
    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (Path, str)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_from_zarr: {kwargs}",
            UserWarning)

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if not all(isinstance(v, dict) for v in chunks):
            raise TypeError("chunks must be None, a dict or a list of dicts")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None, a dict or a list of dicts")

    datasets = []
    numpy_vars = []

    # NOTE(JSKenyon): Iterating over all the zarr groups/arrays is VERY
    # expensive if the metadata has not been consolidated.
    zc.consolidate_metadata(store.map)
    table_path = store.table if store.table else "MAIN"
    table_group = zarr.open_consolidated(store.map)[table_path]

    for g, (group_name, group) in enumerate(sorted(table_group.groups(),
                                                   key=group_sortkey)):
        group_attrs = decode_attr(dict(group.attrs))
        dask_ms_attrs = group_attrs.pop(DASKMS_ATTR_KEY)
        natural_chunks = dask_ms_attrs["chunks"]
        group_chunks = {d: tuple(dc) for d, dc in natural_chunks.items()}

        if chunks:
            # Defer to user-supplied chunking strategy
            try:
                group_chunks.update(chunks[g])
            except IndexError:
                group_chunks.update(chunks[-1])  # Reuse last chunking.

        data_vars = {}
        coords = {}

        for name, zarray in column_iterator(group, columns):
            attrs = decode_attr(dict(zarray.attrs[DASKMS_ATTR_KEY]))
            dims = attrs["dims"]
            coordinate = attrs.get("coordinate", False)
            array_chunks = tuple(group_chunks.get(d, s)
                                 for d, s in zip(dims, zarray.shape))

            array_chunks = da.core.normalize_chunks(array_chunks,
                                                    zarray.shape)
            ext_args = extent_args(dims, array_chunks)
            token_name = f"read~{name}-{tokenize(zarray, *ext_args)}"

            read = da.blockwise(zarr_getter, dims,
                                zarray, None,
                                *ext_args,
                                concatenate=False,
                                name=token_name,
                                meta=np.empty((0,) * zarray.ndim,
                                              zarray.dtype))

            read = inlined_array(read, ext_args[::2])
            var = Variable(dims, read, attrs)
            (coords if coordinate else data_vars)[name] = var

            # Save numpy arrays for reification
            typ = decode_type(attrs["array_type"])

            if typ is np.ndarray:
                numpy_vars.append(var)
            elif typ is da.Array:
                pass
            else:
                raise TypeError(f"Unknown array_type '{attrs['array_type']}'")

        datasets.append(Dataset(data_vars, coords=coords, attrs=group_attrs))

    # Reify any numpy arrays directly into their variables
    for v, a in zip(numpy_vars, dask.compute(v.data for v in numpy_vars)[0]):
        v.data = a

    return datasets
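
# A hypothetical usage sketch (not from the source) for xds_from_zarr. The
# path "observations.ms.zarr" is illustrative; any store written by the
# corresponding dask-ms zarr writer should work.
def example_xds_from_zarr_usage():
    # Read every dataset with its natural (on-disk) chunking.
    datasets = xds_from_zarr("observations.ms.zarr")

    # Override the row chunking for all datasets; a list of dicts would
    # instead address datasets positionally, reusing the last entry once
    # the list is exhausted.
    rechunked = xds_from_zarr("observations.ms.zarr",
                              chunks={"row": 10000})

    for ds in rechunked:
        print(list(ds.data_vars.keys()))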

# Additional imports assumed for the v2/v3-parameterized variant below
# (zarr-python 2.13-era internals; exact module paths may differ).
import atexit
import tempfile

from zarr._storage.store import meta_root
from zarr._storage.v3 import (ConsolidatedMetadataStoreV3, FSStoreV3,
                              MemoryStoreV3)
from zarr.storage import FSStore, atexit_rmtree


def test_consolidate_metadata(with_chunk_store, zarr_version, listable,
                              monkeypatch, stores_from_path):
    # setup initial data
    if stores_from_path:
        store = tempfile.mkdtemp()
        atexit.register(atexit_rmtree, store)
        if with_chunk_store:
            chunk_store = tempfile.mkdtemp()
            atexit.register(atexit_rmtree, chunk_store)
        else:
            chunk_store = None
        version_kwarg = {'zarr_version': zarr_version}
    else:
        if zarr_version == 2:
            store = MemoryStore()
            chunk_store = MemoryStore() if with_chunk_store else None
        elif zarr_version == 3:
            store = MemoryStoreV3()
            chunk_store = MemoryStoreV3() if with_chunk_store else None
        version_kwarg = {}
    path = 'dataset' if zarr_version == 3 else None
    z = group(store, chunk_store=chunk_store, path=path, **version_kwarg)

    # Reload the actual store implementation in case store is a str path
    store_to_copy = z.store

    z.create_group('g1')
    g2 = z.create_group('g2')
    g2.attrs['hello'] = 'world'
    arr = g2.create_dataset('arr', shape=(20, 20), chunks=(5, 5), dtype='f8')
    assert 16 == arr.nchunks
    assert 0 == arr.nchunks_initialized
    arr.attrs['data'] = 1
    arr[:] = 1.0
    assert 16 == arr.nchunks_initialized

    if stores_from_path:
        # get the actual store class for use with consolidate_metadata
        store_class = z._store
    else:
        store_class = store

    if zarr_version == 3:
        # error on v3 if path not provided
        with pytest.raises(ValueError):
            consolidate_metadata(store_class, path=None)

        with pytest.raises(ValueError):
            consolidate_metadata(store_class, path='')

    # perform consolidation
    out = consolidate_metadata(store_class, path=path)
    assert isinstance(out, Group)
    assert ['g1', 'g2'] == list(out)
    if not stores_from_path:
        if zarr_version == 2:
            assert isinstance(out._store, ConsolidatedMetadataStore)
            assert '.zmetadata' in store
            meta_keys = ['.zgroup',
                         'g1/.zgroup',
                         'g2/.zgroup', 'g2/.zattrs',
                         'g2/arr/.zarray', 'g2/arr/.zattrs']
        else:
            assert isinstance(out._store, ConsolidatedMetadataStoreV3)
            assert 'meta/root/consolidated/.zmetadata' in store
            meta_keys = ['zarr.json',
                         meta_root + 'dataset.group.json',
                         meta_root + 'dataset/g1.group.json',
                         meta_root + 'dataset/g2.group.json',
                         meta_root + 'dataset/g2/arr.array.json',
                         'meta/root/consolidated.group.json']
        for key in meta_keys:
            del store[key]

    # https://github.com/zarr-developers/zarr-python/issues/993
    # Make sure we can still open consolidated on an unlistable store:
    if not listable:
        fs_memory = pytest.importorskip("fsspec.implementations.memory")
        monkeypatch.setattr(fs_memory.MemoryFileSystem, "isdir",
                            lambda x, y: False)
        monkeypatch.delattr(fs_memory.MemoryFileSystem, "ls")
        fs = fs_memory.MemoryFileSystem()
        if zarr_version == 2:
            store_to_open = FSStore("", fs=fs)
        else:
            store_to_open = FSStoreV3("", fs=fs)
        # copy original store to new unlistable store
        store_to_open.update(store_to_copy)
    else:
        store_to_open = store

    # open consolidated
    z2 = open_consolidated(store_to_open, chunk_store=chunk_store, path=path,
                           **version_kwarg)
    assert ['g1', 'g2'] == list(z2)
    assert 'world' == z2.g2.attrs['hello']
    assert 1 == z2.g2.arr.attrs['data']
    assert (z2.g2.arr[:] == 1.0).all()
    assert 16 == z2.g2.arr.nchunks
    if listable:
        assert 16 == z2.g2.arr.nchunks_initialized
    else:
        with pytest.raises(NotImplementedError):
            _ = z2.g2.arr.nchunks_initialized

    if stores_from_path:
        # path string is not a BaseStore subclass so cannot be used to
        # initialize a ConsolidatedMetadataStore.
        if zarr_version == 2:
            with pytest.raises(ValueError):
                cmd = ConsolidatedMetadataStore(store)
        elif zarr_version == 3:
            with pytest.raises(ValueError):
                cmd = ConsolidatedMetadataStoreV3(store)
    else:
        # tests del/write on the store
        if zarr_version == 2:
            cmd = ConsolidatedMetadataStore(store)
            with pytest.raises(PermissionError):
                del cmd['.zgroup']
            with pytest.raises(PermissionError):
                cmd['.zgroup'] = None
        else:
            cmd = ConsolidatedMetadataStoreV3(store)
            with pytest.raises(PermissionError):
                del cmd[meta_root + 'dataset.group.json']
            with pytest.raises(PermissionError):
                cmd[meta_root + 'dataset.group.json'] = None

        # test getsize on the store
        assert isinstance(getsize(cmd), Integral)

    # test new metadata are not writeable
    with pytest.raises(PermissionError):
        z2.create_group('g3')
    with pytest.raises(PermissionError):
        z2.create_dataset('spam', shape=42, chunks=7, dtype='i4')
    with pytest.raises(PermissionError):
        del z2['g2']

    # test consolidated metadata are not writeable
    with pytest.raises(PermissionError):
        z2.g2.attrs['hello'] = 'universe'
    with pytest.raises(PermissionError):
        z2.g2.arr.attrs['foo'] = 'bar'

    # test the data are writeable
    z2.g2.arr[:] = 2
    assert (z2.g2.arr[:] == 2).all()

    # test invalid modes
    with pytest.raises(ValueError):
        open_consolidated(store, chunk_store=chunk_store, mode='a',
                          path=path)
    with pytest.raises(ValueError):
        open_consolidated(store, chunk_store=chunk_store, mode='w',
                          path=path)
    with pytest.raises(ValueError):
        open_consolidated(store, chunk_store=chunk_store, mode='w-',
                          path=path)

    # make sure keyword arguments are passed through without error
    open_consolidated(
        store,
        chunk_store=chunk_store,
        path=path,
        cache_attrs=True,
        synchronizer=None,
        **version_kwarg,
    )
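
# A plausible parametrization (an assumption, not taken from the source) for
# the fixtures the test above consumes. `zarr_version` would typically be
# supplied by a separate fixture parameterized over [2, 3]. Shown commented
# out so it does not redefine the function above.
#
# @pytest.mark.parametrize('stores_from_path', [False, True])
# @pytest.mark.parametrize(
#     'with_chunk_store,listable',
#     [(False, True), (True, True), (False, False)],
#     ids=['default', 'with_chunk_store', 'unlistable'],
# )
# def test_consolidate_metadata(with_chunk_store, zarr_version, listable,
#                               monkeypatch, stores_from_path):
#     ...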