def _extract_dataset_zattrs(dataset: xr.Dataset):
    """helper function to create zattrs dictionary from Dataset global attrs"""
    zattrs = {key: encode_zarr_attr_value(value) for key, value in dataset.attrs.items()}
    # remove xpublish internal attribute
    zattrs.pop(DATASET_ID_ATTR_KEY, None)
    return zattrs
def _extract_dataarray_zattrs(da):
    """helper function to extract zattrs dictionary from DataArray"""
    zattrs = {key: encode_zarr_attr_value(value) for key, value in da.attrs.items()}
    zattrs[DIMENSION_KEY] = list(da.dims)

    # `_FillValue` belongs in the `fill_value` section of `.zarray`,
    # not in `.zattrs`, so drop it here if present.
    zattrs.pop('_FillValue', None)

    return zattrs
def _encode_zarr_attributes(attrs):
    """Return a copy of *attrs* with every value made zarr-serializable."""
    encoded = {}
    for key, value in attrs.items():
        encoded[key] = encode_zarr_attr_value(value)
    return encoded
def rechunk_dataset(
    source: xarray.Dataset,
    encoding: Mapping,
    max_mem,
    target_store,
    temp_store=None,
    executor: Union[str, Executor] = "dask",
):
    """Build a rechunking plan for every variable of an xarray Dataset.

    Parameters
    ----------
    source : xarray.Dataset
        Dataset whose variables will be rechunked.
    encoding : Mapping
        Per-variable encoding overrides; ``encoding[name]`` is merged into
        the variable's existing encoding (including ``"chunks"``, which
        determines the target chunking).
    max_mem
        Memory budget forwarded to ``_setup_array_rechunk``.
    target_store
        Zarr store (or path) where the rechunked output is written.
    temp_store : optional
        Zarr store for intermediate arrays; if not given, a temporary
        ``.zarr`` directory is created (NOTE(review): it is never cleaned
        up here — presumably handled elsewhere or left to the OS).
    executor : str or Executor, default "dask"
        Executor name (resolved via ``_get_executor``) or instance.

    Returns
    -------
    Rechunked
        Wrapper bundling the executor, plan, source and zarr groups.
    """
    # Fixed: this function previously re-defined `_encode_zarr_attributes`
    # locally, shadowing the identical module-level helper — use that one.
    if isinstance(executor, str):
        executor = _get_executor(executor)

    if temp_store:
        temp_group = zarr.group(temp_store)
    else:
        temp_group = zarr.group(
            tempfile.mkdtemp(".zarr", "temp_store_")
        )  # pragma: no cover
    target_group = zarr.group(target_store)
    target_group.attrs.update(_encode_zarr_attributes(source.attrs))

    copy_specs = []
    for variable in source:
        array = source[variable].copy()

        # Update the array encoding with provided parameters and apply it
        has_chunk_encoding = "chunks" in array.encoding
        array.encoding.update(encoding.get(variable, {}))
        array = encode_zarr_variable(array)

        # Determine target chunking for array and remove it prior to
        # validation/extraction ONLY if the array isn't also coming
        # from a Zarr store (otherwise blocks need to be checked for overlap)
        target_chunks = array.encoding.get("chunks")
        if not has_chunk_encoding:
            array.encoding.pop("chunks", None)
        array_encoding = extract_zarr_variable_encoding(
            array, raise_on_invalid=True, name=variable
        )

        # Default to chunking based on array shape if not explicitly provided
        default_chunks = array_encoding.pop("chunks")
        target_chunks = target_chunks or default_chunks

        # Extract array attributes along with reserved property for
        # xarray dimension names
        array_attrs = _encode_zarr_attributes(array.attrs)
        array_attrs[DIMENSION_KEY] = encode_zarr_attr_value(array.dims)

        copy_spec = _setup_array_rechunk(
            dask.array.asarray(array),
            target_chunks,
            max_mem,
            target_group,
            target_options=array_encoding,
            temp_store_or_group=temp_group,
            temp_options=array_encoding,
            name=variable,
        )
        copy_spec.write.array.attrs.update(array_attrs)  # type: ignore
        copy_specs.append(copy_spec)

    plan = executor.prepare_plan(copy_specs)
    return Rechunked(executor, plan, source, temp_group, target_group)
def _setup_rechunk(
    source,
    target_chunks,
    max_mem,
    target_store,
    target_options=None,
    temp_store=None,
    temp_options=None,
):
    """Dispatch rechunk setup based on the type of *source*.

    Supports three kinds of input:

    * ``xarray.Dataset`` — one copy spec per encoded variable; returns
      ``(copy_specs, temp_group, target_group)``.
    * ``zarr.hierarchy.Group`` — one copy spec per entry in
      ``target_chunks``; returns ``(copy_specs, temp_group, target_group)``.
    * ``zarr.core.Array`` / ``dask.array.Array`` — a single copy spec;
      returns ``([copy_spec], intermediate_array, target_array)``.

    Raises
    ------
    ValueError
        If *source* is an unsupported type, if ``target_chunks`` is not a
        dict for Dataset/Group sources, or if a per-variable options dict
        contains ``"chunks"``.
    """
    # When no separate temp options were given, mirror the target options
    # so intermediate arrays are written the same way as final ones.
    if temp_options is None:
        temp_options = target_options
    target_options = target_options or {}
    temp_options = temp_options or {}

    if isinstance(source, xarray.Dataset):
        if not isinstance(target_chunks, dict):
            raise ValueError(
                "You must specify ``target-chunks`` as a dict when rechunking a dataset."
            )

        # Fold coordinate metadata into variable encodings/attrs first.
        variables, attrs = encode_dataset_coordinates(source)
        attrs = _encode_zarr_attributes(attrs)

        if temp_store:
            temp_group = zarr.group(temp_store)
        else:
            temp_group = None
        target_group = zarr.group(target_store)
        target_group.attrs.update(attrs)

        copy_specs = []
        for name, variable in variables.items():
            # This isn't strictly necessary because a shallow copy
            # also occurs in `encode_dataset_coordinates` but do it
            # anyways in case the coord encoding function changes
            variable = variable.copy()

            # Update the array encoding with provided options and apply it;
            # note that at this point the `options` may contain any valid property
            # applicable for the `encoding` parameter in Dataset.to_zarr other than "chunks"
            options = target_options.get(name, {})
            if "chunks" in options:
                raise ValueError(
                    f"Chunks must be provided in ``target_chunks`` rather than options (variable={name})"
                )
            variable.encoding.update(options)
            variable = encode_zarr_variable(variable)

            # Extract the array encoding to get a default chunking, a step
            # which will also ensure that the target chunking is compatible
            # with the current chunking (only necessary for on-disk arrays)
            variable_encoding = extract_zarr_variable_encoding(
                variable, raise_on_invalid=False, name=name)
            variable_chunks = target_chunks.get(name, variable_encoding["chunks"])

            # Restrict options to only those that are specific to zarr and
            # not managed internally
            options = {k: v for k, v in options.items() if k in ZARR_OPTIONS}
            _validate_options(options)

            # Extract array attributes along with reserved property for
            # xarray dimension names
            variable_attrs = _encode_zarr_attributes(variable.attrs)
            variable_attrs[DIMENSION_KEY] = encode_zarr_attr_value(
                variable.dims)

            copy_spec = _setup_array_rechunk(
                dask.array.asarray(variable),
                variable_chunks,
                max_mem,
                target_group,
                target_options=options,
                temp_store_or_group=temp_group,
                temp_options=options,
                name=name,
            )
            copy_spec.write.array.attrs.update(variable_attrs)  # type: ignore
            copy_specs.append(copy_spec)
        return copy_specs, temp_group, target_group

    elif isinstance(source, zarr.hierarchy.Group):
        if not isinstance(target_chunks, dict):
            raise ValueError(
                "You must specify ``target-chunks`` as a dict when rechunking a group."
            )

        if temp_store:
            temp_group = zarr.group(temp_store)
        else:
            temp_group = None
        target_group = zarr.group(target_store)
        target_group.attrs.update(source.attrs)

        # Only arrays named in `target_chunks` are rechunked.
        copy_specs = []
        for array_name, array_target_chunks in target_chunks.items():
            copy_spec = _setup_array_rechunk(
                source[array_name],
                array_target_chunks,
                max_mem,
                target_group,
                target_options=target_options.get(array_name),
                temp_store_or_group=temp_group,
                temp_options=temp_options.get(array_name),
                name=array_name,
            )
            copy_specs.append(copy_spec)
        return copy_specs, temp_group, target_group

    elif isinstance(source, (zarr.core.Array, dask.array.Array)):
        # Single-array case: options apply directly (not keyed by name),
        # and the raw temp store (not a group) is handed through.
        copy_spec = _setup_array_rechunk(
            source,
            target_chunks,
            max_mem,
            target_store,
            target_options=target_options,
            temp_store_or_group=temp_store,
            temp_options=temp_options,
        )
        intermediate = copy_spec.intermediate.array
        target = copy_spec.write.array
        return [copy_spec], intermediate, target

    else:
        raise ValueError(
            f"Source must be a Zarr Array, Zarr Group, Dask Array or Xarray Dataset (not {type(source)})."
        )
def _get_zattrs(self):
    """helper method to create zattrs dictionary"""
    return {key: encode_zarr_attr_value(value) for key, value in self._obj.attrs.items()}
def _extract_dataset_zattrs(dataset: xr.Dataset):
    """helper function to create zattrs dictionary from Dataset global attrs"""
    return {key: encode_zarr_attr_value(value) for key, value in dataset.attrs.items()}