def finalize_target(target: CacheFSSpecTarget, consolidate_zarr: bool) -> None:
    if target is None:
        raise ValueError("target has not been set.")
    if consolidate_zarr:
        logger.info("Consolidating Zarr metadata")
        target_mapper = target.get_mapper()
        zarr.consolidate_metadata(target_mapper)
def finalize_target(self) -> None:
    if self.target is None:
        raise ValueError("target has not been set.")
    if self.consolidate_zarr:
        logger.info("Consolidating Zarr metadata")
        target_mapper = self.target.get_mapper()
        zarr.consolidate_metadata(target_mapper)
def store(
    self,
    variables,
    attributes,
    check_encoding_set=frozenset(),
    writer=None,
    unlimited_dims=None,
):
    """
    Top level method for putting data on this store. This method:
      - encodes variables/attributes
      - sets dimensions
      - sets variables

    Parameters
    ----------
    variables : dict-like
        Dictionary of key/value (variable name / xr.Variable) pairs
    attributes : dict-like
        Dictionary of key/value (attribute name / attribute) pairs
    check_encoding_set : list-like
        List of variables that should be checked for invalid encoding
        values
    writer : ArrayWriter
    unlimited_dims : list-like
        List of dimension names that should be treated as unlimited
        dimensions. This is the dimension on which the zarray will be
        appended; only needed in append mode.
    """
    import zarr

    existing_variables = {
        vn for vn in variables if _encode_variable_name(vn) in self.ds
    }
    new_variables = set(variables) - existing_variables
    variables_without_encoding = {vn: variables[vn] for vn in new_variables}
    variables_encoded, attributes = self.encode(
        variables_without_encoding, attributes
    )

    if len(existing_variables) > 0:
        # there are variables to append
        # their encoding must be the same as in the store
        ds = open_zarr(self.ds.store, group=self.ds.path, chunks=None)
        variables_with_encoding = {}
        for vn in existing_variables:
            variables_with_encoding[vn] = variables[vn].copy(deep=False)
            variables_with_encoding[vn].encoding = ds[vn].encoding
        variables_with_encoding, _ = self.encode(variables_with_encoding, {})
        variables_encoded.update(variables_with_encoding)

    if self._write_region is None:
        self.set_attributes(attributes)
        self.set_dimensions(variables_encoded, unlimited_dims=unlimited_dims)
    self.set_variables(
        variables_encoded, check_encoding_set, writer, unlimited_dims=unlimited_dims
    )
    if self._consolidate_on_close:
        zarr.consolidate_metadata(self.ds.store)
def make_annual(model, scenario, member, method):

    if 'hist' in scenario:
        tslice = slice(None, '2014-12')
    else:
        tslice = slice('2015', '2100')

    monthly_mapper = zarr.storage.ABSStore(
        'carbonplan-downscaling',
        prefix=f'cmip6/{method}/conus/4000m/monthly/{model}.{scenario}.{member}.zarr',
        account_name="carbonplan",
        account_key=account_key,
    )

    annual_mapper = zarr.storage.ABSStore(
        'carbonplan-downscaling',
        prefix=f'cmip6/{method}/conus/4000m/annual/{model}.{scenario}.{member}.zarr',
        account_name="carbonplan",
        account_key=account_key,
    )

    if skip_existing and '.zmetadata' in annual_mapper:
        return 'skipped'

    ds_monthly = xr.open_zarr(monthly_mapper, consolidated=True).sel(time=tslice).chunk(chunks)

    template = _annual(ds_monthly, compute=False).chunk(chunks)
    ds_annual = ds_monthly.map_blocks(_annual, template=template)

    annual_mapper.clear()
    task = ds_annual.to_zarr(annual_mapper, mode='w', compute=False)
    dask.compute(task, retries=task_retries)
    zarr.consolidate_metadata(annual_mapper)
    return 'done'
def split_and_write(model, scenario, member, method):
    ds = get_scratch_ds(model, scenario, member, method)

    scen_mapper = zarr.storage.ABSStore(
        'carbonplan-downscaling',
        prefix=f'cmip6/{method}/conus/4000m/monthly/{model}.{scenario}.{member}.zarr',
        account_name="carbonplan",
        account_key=account_key,
    )
    if not (skip_existing and 'pdsi/.zarray' in scen_mapper):
        print('writing scen')
        ds.sel(time=slice('2015-01', None)).to_zarr(scen_mapper, mode='a')
        zarr.consolidate_metadata(scen_mapper)

    hist_mapper = zarr.storage.ABSStore(
        'carbonplan-downscaling',
        prefix=f'cmip6/{method}/conus/4000m/monthly/{model}.historical.{member}.zarr',
        account_name="carbonplan",
        account_key=account_key,
    )
    if not (skip_existing and 'pdsi/.zarray' in hist_mapper):
        print('writing hist')
        ds.sel(time=slice(None, '2014-12')).to_zarr(hist_mapper, mode='a')
        zarr.consolidate_metadata(hist_mapper)
def _unchunk_vars(dataset_path: str, var_names: List[str]):
    for var_name in var_names:
        var_path = os.path.join(dataset_path, var_name)

        # Optimization: if "shape" and "chunks" are equal in ${var}/.zarray, we are done
        var_array_info_path = os.path.join(var_path, '.zarray')
        with open(var_array_info_path, 'r') as fp:
            var_array_info = json.load(fp)
            if var_array_info.get('shape') == var_array_info.get('chunks'):
                continue

        # Open array and remove chunks from the data
        var_array = zarr.convenience.open_array(var_path, 'r+')
        if var_array.shape != var_array.chunks:
            # TODO (forman): Fully loading data is inefficient and dangerous for large arrays.
            #                Instead save unchunked to temp and replace existing chunked array dir with temp.
            # Fully load data and attrs so we no longer depend on files
            data = np.array(var_array)
            attributes = var_array.attrs.asdict()
            # Save array data
            zarr.convenience.save_array(var_path, data, chunks=False, fill_value=var_array.fill_value)
            # zarr.convenience.save_array() does not seem to save user attributes
            # (file ".zattrs" not written), therefore we must modify attrs explicitly:
            var_array = zarr.convenience.open_array(var_path, 'r+')
            var_array.attrs.update(attributes)

    zarr.consolidate_metadata(dataset_path)
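A minimal usage sketch for the helper above; the store path and variable names are hypothetical, and the dataset is assumed to be a local directory store so that its ".zarray" files can be read directly:

# Remove chunking from small coordinate-style variables of a local Zarr dataset,
# then rely on the final zarr.consolidate_metadata() call to refresh '.zmetadata'.
_unchunk_vars("out/cube.zarr", ["time", "time_bnds"])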
def write_group_to_zarr(consolidated=False):
    path = str(tmpdir.join("test.zarr"))
    z = zarr.open_group(path)
    arr = z.create_dataset("var1", shape=(3, 5))
    arr[:] = 1.0
    if consolidated:
        zarr.consolidate_metadata(path)
    return z, path
def update(self, output_path: str, global_attrs: Dict[str, Any] = None, **kwargs):
    if global_attrs:
        import zarr
        ds = zarr.open_group(output_path, mode='r+', **kwargs)
        ds.attrs.update(global_attrs)
        zarr.consolidate_metadata(output_path)
def consolidate_metadata(self, metadata_key='.zmetadata'):
    '''
    Wrapper over zarr.consolidate_metadata to pass chunk store when
    opening the zarr store
    '''
    zarr.consolidate_metadata(self.store, metadata_key=metadata_key)
    store_mode_cons = 'r' if self.store_mode == 'r' else 'r+'
    self.zgroup = zarr.open_consolidated(self.store,
                                         metadata_key=metadata_key,
                                         mode=store_mode_cons,
                                         chunk_store=self.zgroup.chunk_store,
                                         path=self.store_path)
    return self.zgroup
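For context, a minimal standalone sketch of the same idea: when metadata and chunks live in separate stores, consolidation runs against the metadata store and the chunk store is passed through when reopening. The store paths here are illustrative only:

import zarr

meta_store = zarr.DirectoryStore("example_meta.zarr")     # holds .zgroup/.zarray/.zattrs
chunk_store = zarr.DirectoryStore("example_chunks.zarr")  # holds the chunk data

root = zarr.open_group(meta_store, mode="w", chunk_store=chunk_store)
root.create_dataset("x", shape=(4,), chunks=(2,), dtype="i4")[:] = [1, 2, 3, 4]

# Writes '.zmetadata' into the metadata store only.
zarr.consolidate_metadata(meta_store)

# Reopen from the consolidated metadata, still reading chunks from chunk_store.
reopened = zarr.open_consolidated(meta_store, mode="r", chunk_store=chunk_store)
print(reopened["x"][:])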
def main():
    parser = argparse.ArgumentParser(
        description=("Ensure that value of calendar attribute of time "
                     "variable in a Zarr store is in lower case."))
    parser.add_argument("--config", "-c", type=str, action="append",
                        help="Read S3 configuration from this/these config(s)")
    parser.add_argument("--force", "-f", action="store_true",
                        help="Rename calendar even if it's already lower-case")
    parser.add_argument("--dry-run", "-s", action="store_true",
                        help="Don't rename, just show what would have been done")
    parser.add_argument("zarr_store", type=str,
                        help="Zarr store specifier (path or URL)")
    args = parser.parse_args()
    store_arg = args.zarr_store

    if args.config and store_arg.lower().startswith("s3://"):
        s3_config = {}
        config = nc2zarr.config.load_config(args.config, return_kwargs=True)
        if "output_s3" in config:
            s3_config = config["output_s3"]
        # We have to create this store manually to set normalize_keys=False,
        # because normalize_keys=True can break consolidate_metadata.
        store = zarr.storage.FSStore(store_arg, mode="r+", **s3_config,
                                     normalize_keys=False)
    else:
        store = zarr.creation.normalize_store_arg(store_arg)

    z = zarr.open_group(store, mode="r+")
    calendar = z.time.attrs["calendar"]
    log(f"Current calendar: \"{calendar}\"")
    if calendar.islower() and not args.force:
        log("Already lower case; leaving unchanged.")
    else:
        new_calendar = calendar.lower()
        if args.dry_run:
            log(f"New name: \"{new_calendar}\"")
            log("Dry run requested -- not actually renaming.")
        else:
            log(f"Renaming to \"{new_calendar}\"...")
            z.time.attrs["calendar"] = new_calendar
            log("Consolidating...")
            zarr.consolidate_metadata(store)
            log("Done.")
def consolidate_metadata(target):
    """
    Consolidate Zarr metadata

    Parameters
    ----------
    target : str
        Path or url of the Zarr store.
    """
    mapper = fsspec.get_mapper(target)
    zarr.consolidate_metadata(mapper)
def consolidate_metadata(writes: List[str], target: str) -> None:
    """
    Consolidate the metadata of the Zarr group at `target`.

    Parameters
    ----------
    writes : List[str]
        The URLs the combined stores were written to. This is only a
        parameter to introduce a dependency. The actual value isn't used.
    target : str
        The URL for the (combined) Zarr group.
    """
    mapper = fsspec.get_mapper(target)
    zarr.consolidate_metadata(mapper)
def consolidate_metadata(target, writes: Optional[List[str]] = None) -> None:
    """
    Consolidate the metadata of the Zarr group at `target`.

    Parameters
    ----------
    target : str
        The URL for the (combined) Zarr group.
    writes : list of strings, optional
        The URLs the combined stores were written to. This is only a
        parameter to introduce a dependency in the pipeline execution
        graph. The actual value isn't used.
    """
    mapper = fsspec.get_mapper(target)
    zarr.consolidate_metadata(mapper)
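A short usage sketch for these fsspec-based helpers; the store URL below is a placeholder for any path or URL fsspec can resolve:

# Consolidate an existing (already written) Zarr group.
consolidate_metadata("out/combined.zarr")

# Which is equivalent to calling zarr directly on an fsspec mapper:
import fsspec
import zarr
zarr.consolidate_metadata(fsspec.get_mapper("out/combined.zarr"))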
def update_hydat_database(path):
    project_root = '/tmp'
    data_dir = os.path.join(project_root, 'data')

    stations_list = get_available_stations_from_hydat()
    #
    # results = []
    for station_number in stations_list:
        if verify_data_type_exists(station_number, 'Flow'):
            import_hydat_to_parquet(station_number)

    storage_options = {"client_kwargs": {'endpoint_url': 'https://s3.us-east-2.wasabisys.com',
                                         'region_name': 'us-east-2'}}

    df = pd.read_parquet(os.path.join(data_dir, 'basin.parquet'), engine='pyarrow')
    df.to_parquet('s3://hydrology/timeseries/sources/hydat/basin.parquet',
                  engine='fastparquet',
                  compression='gzip',
                  storage_options=storage_options)

    df = pd.read_parquet(os.path.join(data_dir, 'context.parquet'), engine='pyarrow')
    df.to_parquet('s3://hydrology/timeseries/sources/hydat/context.parquet',
                  engine='fastparquet',
                  compression='gzip',
                  storage_options=storage_options)

    client_kwargs = {'endpoint_url': 'https://s3.us-east-2.wasabisys.com',
                     'region_name': 'us-east-2'}
    config_kwargs = {'max_pool_connections': 30}

    bucket_source = os.path.join(data_dir, 'zarr')
    bucket_sink = "s3://hydrology/timeseries/sources/hydat/values.zarr"
    endpoint_url = 'https://s3.us-east-2.wasabisys.com'
    region = 'us-east-2'

    s3 = s3fs.S3FileSystem(client_kwargs=client_kwargs, config_kwargs=config_kwargs)
    store = s3fs.S3Map(root=bucket_sink, s3=s3)

    aws_command = "aws s3 sync {} {} --endpoint-url={} --region={}".format(
        bucket_source, bucket_sink, endpoint_url, region)
    print(aws_command)
    subprocess.call(aws_command, shell=True)

    zarr.consolidate_metadata(store)
def create_vcfzarr(
    shared_datadir, tmpdir, *, fields=None, grouped_by_contig=False, consolidated=False
):
    """Create a vcfzarr file using scikit-allel"""
    vcf_path = shared_datadir / "sample.vcf"
    output_path = tmpdir / "sample.vcf.zarr"
    if grouped_by_contig:
        for contig in ["19", "20", "X"]:
            allel.vcf_to_zarr(
                str(vcf_path),
                str(output_path),
                fields=fields,
                group=contig,
                region=contig,
            )
    else:
        allel.vcf_to_zarr(str(vcf_path), str(output_path), fields=fields)
    if consolidated:
        zarr.consolidate_metadata(str(output_path))
    return output_path
def main(products=['GLAH01', 'GLAH14']):
    # convert hdf5 granules to zarr
    print('granules_h5_to_zarr')
    results = granules_h5_to_zarr(products)

    for p, presults in results.items():
        uris = presults['skipped'] + presults['converted']

        print('open xarray datasets')
        ds_list = dask.compute([lazy_open(uri) for uri in uris])[0]

        print('concat list of datasets')
        with dask.config.set(**{'array.slicing.split_large_chunks': True}):
            ds = xr.concat(ds_list, dim='record_index').chunk({'record_index': chunksize})

        for k in ds:
            _ = ds[k].encoding.pop('chunks', None)

        # write
        print(f'writing {p}')
        print(ds)
        print(f'ds.nbytes: {ds.nbytes / 1e9}')

        mapper = fsspec.get_mapper(f'gs://carbonplan-climatetrace/intermediates/{p.lower()}.zarr')
        mapper.clear()

        # print('writing zarr dataset')
        ds.to_zarr(mapper, compute=False, mode='w')

        stepsize = chunksize * 1
        recs = ds.dims['record_index']
        print('writing zarr dataset chunks')
        for left, right in zip(
            range(0, recs, stepsize), range(stepsize, recs + stepsize, stepsize)
        ):
            s = slice(left, right)
            print(s, flush=True)
            ds.isel(record_index=s).drop(drop_keys[p]).to_zarr(mapper, region={'record_index': s})

        zarr.consolidate_metadata(mapper)
def main():
    parser = argparse.ArgumentParser(
        description="Update a Zarr's start_date and "
                    "stop_date attributes to match its data.")
    parser.add_argument("zarr", metavar="PATH_OR_URL")
    parser.add_argument("--dry-run", "-d", action="store_true",
                        help="Don't actually write metadata")
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Report progress to standard output")
    args = parser.parse_args()

    ds = xr.open_zarr(args.zarr)
    z = zarr.open(args.zarr)
    t0 = ds.time[0].values
    t1 = ds.time[-1].values
    if args.verbose:
        print("First/last times:", t0, t1)
    new_attrs = dict(start_date=pd.to_datetime(t0).strftime("%Y-%m-%d"),
                     stop_date=pd.to_datetime(t1).strftime("%Y-%m-%d"))
    if args.verbose:
        for title, dic in ("Old", z.attrs), ("New", new_attrs):
            print(f"{title} attributes:")
            for key in "start_date", "stop_date":
                print(f'  {key}: ' +
                      (dic[key] if key in dic else "not present"))
    if args.dry_run:
        if args.verbose:
            print("Dry run -- not updating.")
    else:
        z.attrs.update(new_attrs)
        zarr.consolidate_metadata(args.zarr)
        if args.verbose:
            print("Attributes updated.")
def concat_zarrs_optimized(
    zarr_files: Sequence[str],
    output: Union[PathType, MutableMapping[str, bytes]],
    vars_to_rechunk: List[Hashable],
    vars_to_copy: List[Hashable],
    fix_strings: bool = False,
) -> None:
    if isinstance(output, Path):
        output = str(output)

    zarr_groups = [zarr.open_group(f) for f in zarr_files]

    first_zarr_group = zarr_groups[0]

    # create the top-level group
    zarr.open_group(output, mode="w")

    # copy variables that are to be rechunked
    # NOTE: that this uses _to_zarr function defined here that is needed to avoid
    # race conditions between writing the array contents and its metadata
    # see https://github.com/pystatgen/sgkit/pull/486
    delayed = []  # do all the rechunking operations in one computation
    for var in vars_to_rechunk:
        dtype = None
        if fix_strings and var in {"variant_id", "variant_allele"}:
            max_len = _get_max_len(zarr_groups, f"max_length_{var}")
            dtype = f"S{max_len}"
        arr = concatenate_and_rechunk([group[var] for group in zarr_groups], dtype=dtype)

        _to_zarr_kwargs = dict(
            compressor=first_zarr_group[var].compressor,
            filters=first_zarr_group[var].filters,
            fill_value=None,
        )
        if not fix_strings and arr.dtype == "O":
            # We assume that all object dtypes are variable length strings
            var_len_str_codec = numcodecs.VLenUTF8()
            _to_zarr_kwargs["object_codec"] = var_len_str_codec
            # Remove from filters to avoid double encoding error
            if var_len_str_codec in first_zarr_group[var].filters:
                filters = list(first_zarr_group[var].filters)
                filters.remove(var_len_str_codec)
                _to_zarr_kwargs["filters"] = filters

        d = _to_zarr(  # type: ignore[no-untyped-call]
            arr,
            output,
            component=var,
            overwrite=True,
            compute=False,
            attrs=first_zarr_group[var].attrs.asdict(),
            **_to_zarr_kwargs,
        )
        delayed.append(d)
    da.compute(*delayed)

    # copy unchanged variables and top-level metadata
    with zarr.open_group(output) as output_zarr:

        # copy variables that are not rechunked (e.g. sample_id)
        for var in vars_to_copy:
            output_zarr[var] = first_zarr_group[var]
            output_zarr[var].attrs.update(first_zarr_group[var].attrs)

        # copy top-level attributes
        group_attrs = dict(first_zarr_group.attrs)
        if "max_alt_alleles_seen" in group_attrs:
            max_alt_alleles_seen = _get_max_len(zarr_groups, "max_alt_alleles_seen")
            group_attrs["max_alt_alleles_seen"] = max_alt_alleles_seen
        output_zarr.attrs.update(group_attrs)

    # consolidate metadata
    zarr.consolidate_metadata(output)
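A hedged usage sketch for the function above; the input store paths are hypothetical, and the variable names follow the sgkit naming convention the function already references (e.g. sample_id):

# Concatenate per-chromosome Zarr stores into one output store, rechunking the
# large call-level variables and copying the per-sample ones verbatim.
concat_zarrs_optimized(
    ["vcf_parts/chr1.zarr", "vcf_parts/chr2.zarr"],   # hypothetical input stores
    "vcf_combined.zarr",
    vars_to_rechunk=["call_genotype", "variant_position"],
    vars_to_copy=["sample_id"],
)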
def update_slice(store: Union[str, MutableMapping],
                 insert_index: int,
                 dataslice: xr.Dataset,
                 mode: str,
                 dimension: str = "time") -> None:
    """
    Update existing Zarr dataset with new data slice.

    :param store: A Zarr store.
    :param insert_index: index at which to insert
    :param dataslice: slice to insert
    :param mode: Update mode, 'insert' or 'replace'
    :param dimension: name of dimension perpendicular to slice
    """

    if mode not in ('insert', 'replace'):
        raise ValueError(f'illegal mode value: {mode!r}')

    insert_mode = mode == 'insert'

    append_dim_var_names = []
    encoding = {}

    # Neither Zarr nor xarray offer an explicit API function to check whether
    # a Zarr is consolidated. Here we use the workaround of attempting to
    # open as consolidated, and catching the resulting exception if this
    # isn't possible. In the case of a consolidated Zarr, there is a slight
    # inefficiency, since the consolidated metadata object is fetched twice
    # (by Zarr and thereafter by xarray). See comments on PR #48 for
    # discussion of possible optimizations.
    consolidated = True
    try:
        _ = zarr.open_consolidated(store)
    except KeyError:
        consolidated = False

    with xr.open_zarr(store, consolidated=consolidated) as ds:
        for var_name in ds.variables:
            var = ds[var_name]
            if var.ndim >= 1 and dimension in var.dims:
                if var.dims[0] != dimension:
                    # TODO: Remove this restriction -- it's not fundamentally
                    #   necessary. Removal should be accompanied by appropriate
                    #   unit tests and the addition of a warning to the user
                    #   about potential slowness / inefficiency.
                    raise ValueError(f"dimension '{dimension}' of variable "
                                     f"{var_name!r} must be first dimension")
                append_dim_var_names.append(var_name)
                enc = dict(ds[var_name].encoding)
                # xarray 0.17+ supports engine preferred chunks if exposed by
                # the backend. zarr does that, but when we use the new
                # 'preferred_chunks' when writing to zarr it raises and says,
                # 'preferred_chunks' is an unsupported encoding
                if 'preferred_chunks' in enc:
                    del enc['preferred_chunks']
                encoding[var_name] = enc

    temp_dir = tempfile.TemporaryDirectory(prefix='nc2zarr-slice-',
                                           suffix='.zarr')
    dataslice.to_zarr(temp_dir.name, encoding=encoding)
    slice_root_group = zarr.open(temp_dir.name, mode='r')
    slice_arrays = dict(slice_root_group.arrays())

    root_group = zarr.open(store, mode='r+')
    for var_name, var_array in root_group.arrays():
        if var_name in append_dim_var_names:
            slice_array = slice_arrays[var_name]
            if insert_mode:
                # Add one empty step
                empty = zarr.creation.empty(slice_array.shape,
                                            dtype=var_array.dtype)
                var_array.append(empty, axis=0)
                # Shift contents
                var_array[insert_index + 1:, ...] = \
                    var_array[insert_index:-1, ...]
            # Replace slice
            var_array[insert_index, ...] = slice_array[0]

    if consolidated:
        zarr.consolidate_metadata(store)
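A brief usage sketch for update_slice; the store path, file name, and index are hypothetical and the slice dataset is assumed to contain exactly one step along the target dimension:

import xarray as xr

# Hypothetical single-time-step dataset to splice into an existing cube.
new_slice = xr.open_dataset("slice_2021-01-06.nc")
update_slice("out/cube.zarr", insert_index=5, dataslice=new_slice, mode="insert")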
def _finalize_target():
    if self.consolidate_zarr:
        logger.info("Consolidating Zarr metadata")
        target_mapper = self.target.get_mapper()
        zarr.consolidate_metadata(target_mapper)
def write_zarr(ds, mapper):
    task = ds.to_zarr(mapper, mode='a', compute=False)
    task.compute(retries=task_retries)
    zarr.consolidate_metadata(mapper)
def consolidate(self):
    zarr.consolidate_metadata(self.zgroup.store)
    self.consolidated = True
def store(
    self,
    variables,
    attributes,
    check_encoding_set=frozenset(),
    writer=None,
    unlimited_dims=None,
):
    """
    Top level method for putting data on this store. This method:
      - encodes variables/attributes
      - sets dimensions
      - sets variables

    Parameters
    ----------
    variables : dict-like
        Dictionary of key/value (variable name / xr.Variable) pairs
    attributes : dict-like
        Dictionary of key/value (attribute name / attribute) pairs
    check_encoding_set : list-like
        List of variables that should be checked for invalid encoding
        values
    writer : ArrayWriter
    unlimited_dims : list-like
        List of dimension names that should be treated as unlimited
        dimensions. This is the dimension on which the zarray will be
        appended; only needed in append mode.
    """

    existing_variable_names = {
        vn for vn in variables if _encode_variable_name(vn) in self.zarr_group
    }
    new_variables = set(variables) - existing_variable_names
    variables_without_encoding = {vn: variables[vn] for vn in new_variables}
    variables_encoded, attributes = self.encode(
        variables_without_encoding, attributes
    )

    if existing_variable_names:
        # Decode variables directly, without going via xarray.Dataset to
        # avoid needing to load index variables into memory.
        # TODO: consider making loading indexes lazy again?
        existing_vars, _, _ = conventions.decode_cf_variables(
            self.get_variables(), self.get_attrs()
        )
        # Modified variables must use the same encoding as the store.
        vars_with_encoding = {}
        for vn in existing_variable_names:
            vars_with_encoding[vn] = variables[vn].copy(deep=False)
            vars_with_encoding[vn].encoding = existing_vars[vn].encoding
        vars_with_encoding, _ = self.encode(vars_with_encoding, {})
        variables_encoded.update(vars_with_encoding)

        for var_name in existing_variable_names:
            new_var = variables_encoded[var_name]
            existing_var = existing_vars[var_name]
            _validate_existing_dims(
                var_name,
                new_var,
                existing_var,
                self._write_region,
                self._append_dim,
            )

    if self._mode not in ["r", "r+"]:
        self.set_attributes(attributes)
        self.set_dimensions(variables_encoded, unlimited_dims=unlimited_dims)

    self.set_variables(
        variables_encoded, check_encoding_set, writer, unlimited_dims=unlimited_dims
    )
    if self._consolidate_on_close:
        zarr.consolidate_metadata(self.zarr_group.store)
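For orientation, a minimal sketch of how this internal code path is normally reached from the public xarray API: passing consolidated=True to to_zarr is what arranges for the metadata to be consolidated when the store is closed. The file name here is illustrative:

import numpy as np
import xarray as xr

ds = xr.Dataset({"a": ("x", np.arange(4))})
# Write the dataset and consolidate its Zarr metadata on close.
ds.to_zarr("example_consolidated.zarr", mode="w", consolidated=True)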
def consolidate_metadata(self):
    mapper = fsspec.get_mapper(self.targets)
    zarr.consolidate_metadata(mapper)
def write_vis(mxds, outfile, chunks_on_disk=None, partition=None, consolidated=True,
              compressor=None, graph_name='write_zarr'):
    """
    Write xarray dataset to zarr format on disk. When chunks_on_disk is not specified the chunking
    in the input dataset is used. When chunks_on_disk is specified that dataset is saved using that chunking.

    Parameters
    ----------
    mxds : xarray.core.dataset.Dataset
        Dataset of datasets to write to disk
    outfile : str
        outfile filename, generally ends in .zarr
    chunks_on_disk : dict of int
        A dictionary with the chunk size that will be used when writing to disk. For example {'time': 20, 'chan': 6}.
        If chunks_on_disk is not specified the chunking of dataset will be used.
    partition : str or list
        Name of partition xds to write into outfile (from the mxds attributes section).
        Overwrites existing partition of same name. Default None writes entire mxds.
    compressor : numcodecs.blosc.Blosc
        The blosc compressor to use when saving the converted data to disk using zarr.
        If None, the zstd compression algorithm is used with compression level 2.
    graph_name : string
        The time taken to execute the graph and save the dataset is measured and saved as an attribute in the zarr file.
        The graph_name is the label for this timing information.

    Returns
    -------
    """
    import xarray as xr
    import zarr
    import time
    from numcodecs import Blosc
    from itertools import cycle
    import os
    import numpy as np

    if compressor is None:
        compressor = Blosc(cname='zstd', clevel=2, shuffle=0)

    if partition is None:
        partition = list(mxds.attrs.keys())
    partition = list(np.atleast_1d(partition))

    os.system("rm -fr " + outfile)
    os.system("mkdir " + outfile)

    for xds_name in partition:
        if "xds" in xds_name:
            xds_outfile = outfile + '/' + xds_name
            xds_for_disk = mxds.attrs[xds_name]
            if chunks_on_disk is not None:
                xds_for_disk = xds_for_disk.chunk(chunks=chunks_on_disk)
        else:
            xds_outfile = outfile + '/global/' + xds_name
            xds_for_disk = mxds.attrs[xds_name]

        # Create compression encoding for each datavariable
        encoding = dict(zip(list(xds_for_disk.data_vars), cycle([{'compressor': compressor}])))

        start = time.time()

        # Consolidated is set to False so that the timing information is included in the consolidated metadata.
        xr.Dataset.to_zarr(xds_for_disk, store=xds_outfile, mode='w', encoding=encoding, consolidated=False)
        time_to_calc_and_store = time.time() - start
        print('Time to store and execute graph for ', xds_name, graph_name, time_to_calc_and_store)

        # Add timing information
        dataset_group = zarr.open_group(xds_outfile, mode='a')
        dataset_group.attrs[graph_name + '_time'] = time_to_calc_and_store

        if consolidated == True:
            zarr.consolidate_metadata(xds_outfile)
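A hedged usage sketch, assuming mxds was produced by a cngi-style reader and carries its visibility partitions in mxds.attrs under names such as 'xds0' (those names and the output path are assumptions, not taken from the code above):

# Write a single partition of the multi-xarray dataset with explicit on-disk chunking.
write_vis(mxds, "mydata.vis.zarr", chunks_on_disk={"time": 20, "chan": 6}, partition="xds0")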
def append_zarr(list_xarray_data_variables, outfile, chunks_return={}, compressor=None, graph_name='append_zarr'):
    """
    Append a list of dask arrays to a zarr file on disk. If a data variable with the same name is found it will be overwritten.
    Data will probably be corrupted if append_zarr overwrites the data variable from which the dask array gets its data.
    All data variables that share dimensions and coordinates with data variables already on disk must have the same values
    (chunking can be different).

    Parameters
    ----------
    list_xarray_data_variables : list of dask arrays
        List of xarray datavariables to append.
    outfile : str
        The file name of the dataset on disk, generally ends in .zarr
    chunks_return : dict of int
        A dictionary with the chunk size that will be returned. For example {'time': 20, 'chan': 6}.
        If chunks_return is not specified the chunking on disk will be used.
    compressor : numcodecs.blosc.Blosc
        The blosc compressor to use when saving the converted data to disk using zarr.
        If None, the zstd compression algorithm is used with compression level 2.
    graph_name : string
        The time taken to execute the graph and save the dataset is measured and saved as an attribute in the zarr file.
        The graph_name is the label for this timing information.

    Returns
    -------
    """
    # Why this function is needed:
    # https://stackoverflow.com/questions/58042559/adding-new-xarray-dataarray-to-an-existing-zarr-store-without-re-writing-the-who
    # To understand this function go over dask/array/core.py and xarray/backends/common.py.
    from fsspec import get_mapper
    import xarray as xr
    import zarr
    import dask
    import dask.array as da
    import time
    from numcodecs import Blosc

    start = time.time()
    n_arrays = len(list_xarray_data_variables)

    try:
        disk_dataset = xr.open_zarr(outfile)
    except ValueError:
        print("######### ERROR: Could not open " + outfile)

    if compressor is None:
        compressor = Blosc(cname='zstd', clevel=2, shuffle=0)

    ######################################################################################
    # Create a list of delayed zarr.create commands (list_target_zarr) for each dask array in list_dask_array.
    ######################################################################################
    list_target_zarr = []
    list_dask_array = []

    for i in range(n_arrays):
        list_new_dim_name = []
        list_new_coord_name = []
        list_new_coord_dim_names = []

        # Create list of dimension chunk sizes on disk
        chunksize_on_disk = []

        # Get array chunksize on disk and add new dimensions.
        # The order of the for loop is important, since chunksize_on_disk must have the correct dimension ordering.
        for dim_name in list_xarray_data_variables[i].dims:
            if dim_name in disk_dataset.dims:
                chunksize_on_disk.append(disk_dataset.chunks[dim_name][0])
            else:
                # Since the dimension does not exist on disk use chunking in list_xarray_data_variables[i]
                chunksize_on_disk.append(list_xarray_data_variables[i].to_dataset().chunks[dim_name][0])

                # Add dim to be stored
                dask_dim_array = da.from_array(list_xarray_data_variables[i][dim_name].data, chunks=(1,))
                mapper = get_mapper(outfile + '/' + dim_name)
                list_target_zarr.append(
                    dask.delayed(zarr.create)(
                        shape=dask_dim_array.shape,
                        compressor=compressor,
                        chunks=(1,),
                        dtype=dask_dim_array.dtype,
                        store=mapper,
                        overwrite=True))
                list_dask_array.append(dask_dim_array)
                list_new_dim_name.append(dim_name)

        # Add all other non-dimensional coordinates. Order does not matter.
        for coord_name in list_xarray_data_variables[i].coords._names:
            if coord_name not in list_xarray_data_variables[i].dims:
                if coord_name not in disk_dataset.coords._names:
                    coord_dim_names = list_xarray_data_variables[i][coord_name].dims

                    coord_chunksize_on_disk = []
                    for coord_dim_name in coord_dim_names:
                        if coord_dim_name in disk_dataset.dims:
                            coord_chunksize_on_disk.append(disk_dataset.chunks[coord_dim_name][0])
                        else:
                            # Since the dimension does not exist on disk use chunking in list_xarray_data_variables[i]
                            coord_chunksize_on_disk.append(
                                list_xarray_data_variables[i].to_dataset().chunks[coord_dim_name][0])

                    # Add coord to be stored
                    dask_coord_array = list_xarray_data_variables[i][coord_name].data.rechunk(
                        chunks=coord_chunksize_on_disk)
                    mapper = get_mapper(outfile + '/' + coord_name)
                    list_target_zarr.append(
                        dask.delayed(zarr.create)(
                            shape=dask_coord_array.shape,
                            compressor=compressor,
                            chunks=coord_chunksize_on_disk,
                            dtype=dask_coord_array.dtype,
                            store=mapper,
                            overwrite=True))
                    list_dask_array.append(dask_coord_array)

                    list_new_coord_dim_names.append(coord_dim_names)
                    list_new_coord_name.append(coord_name)

        # Rechunk the dask arrays to match the chunking on disk
        dask_array = list_xarray_data_variables[i].data.rechunk(chunksize_on_disk)
        list_dask_array.append(dask_array)

        # Create list of delayed objects
        mapper = get_mapper(outfile + '/' + list_xarray_data_variables[i].name)
        list_target_zarr.append(
            dask.delayed(zarr.create)(
                shape=dask_array.shape,
                compressor=compressor,
                chunks=chunksize_on_disk,
                dtype=dask_array.dtype,
                store=mapper,
                overwrite=True))
        # Cannot specify the zarr file attributes at creation, e.g. attrs={'_ARRAY_DIMENSIONS': array_dimensions[i]}
        # (last checked on May 2020).

    # Trigger compute of delayed zarr.create functions in list_target_zarr.
    da.store(list_dask_array, list_target_zarr, compute=True, flush=True, lock=False)

    # Open zarr to add array dimension labels so that xarray.open_zarr works.
    # This is the magic that allows xarray to understand zarr.
    dataset_group = zarr.open_group(outfile, mode='a')
    # This should one day be done during zarr.create. See https://github.com/zarr-developers/zarr-python/issues/538.
    # Data variable labels
    for i in range(n_arrays):
        dataset_group[list_xarray_data_variables[i].name].attrs['_ARRAY_DIMENSIONS'] = \
            list_xarray_data_variables[i].dims
    # Dimension labels
    for new_dim_name in list_new_dim_name:
        dataset_group[new_dim_name].attrs['_ARRAY_DIMENSIONS'] = new_dim_name
    # Coord labels
    for new_coord_name, new_coord_dim_names in zip(list_new_coord_name, list_new_coord_dim_names):
        dataset_group[new_coord_name].attrs['_ARRAY_DIMENSIONS'] = new_coord_dim_names

    time_to_calc_and_store = time.time() - start
    print('Time to append and execute graph ', graph_name, time_to_calc_and_store)
    dataset_group.attrs[graph_name + '_time'] = time_to_calc_and_store

    # Consolidate metadata (can be improved by only adding appended metadata)
    zarr.consolidate_metadata(outfile)

    if bool(chunks_return):
        return xr.open_zarr(outfile, chunks=chunks_return, overwrite_encoded_chunks=True, consolidated=True)
    else:
        return xr.open_zarr(outfile, overwrite_encoded_chunks=True, consolidated=True)
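A hedged usage sketch for append_zarr; the store path and the DATA variable name are assumptions about what the existing store contains, and the new variable deliberately uses a name not already on disk so no existing data variable is overwritten:

import xarray as xr

ds = xr.open_zarr("mydata.vis.zarr", consolidated=True)
# Hypothetical derived data variable sharing dims/coords with the stored DATA variable.
new_var = (ds.DATA.real ** 2).rename("DATA_POWER")
ds_updated = append_zarr([new_var], "mydata.vis.zarr", chunks_return={"time": 20})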
def close(self):
    if self._consolidate_on_close:
        import zarr
        zarr.consolidate_metadata(self.ds.store)
    }
    with tempfile.TemporaryDirectory(
        prefix="bgen_to_zarr_", suffix=".zarr", dir=tempdir
    ) as tmpdir:
        rechunked = rechunker_api.rechunk(
            ds,
            max_mem=max_mem,
            target_chunks=target_chunks,
            target_store=output,
            target_options=target_options,
            temp_store=tmpdir,
            executor="dask",
        )
        rechunked.execute()

    zarr.consolidate_metadata(output)
    ds: Dataset = xr.open_zarr(output, concat_characters=False)  # type: ignore[no-untyped-call]
    if pack:
        ds = unpack_variables(ds)

    return ds


def bgen_to_zarr(
    input: PathType,
    output: Union[PathType, MutableMapping[str, bytes]],
    region: Optional[Mapping[Hashable, Any]] = None,
    chunk_length: int = 10_000,
    chunk_width: int = 1_000,
    temp_chunk_length: int = 100,
def main(model, scenario, member):
    print('---------->', model, scenario, member)

    # get the output store
    key = f'{model}.{scenario}.{member}'
    target_uri = target.format(key=key)
    print(target_uri)
    store = get_store(target_uri)

    if skip_existing and '.zmetadata' in store:
        print(f'{key} in store, skipping...')
        return 'skipped'

    y_hist = get_obs().pipe(load_coords)
    if xy_region:
        y_hist = y_hist.isel(**xy_region)
    print('y_hist:\n', y_hist)

    x_hist = open_single(model, 'historical', member).pipe(process_cmip)
    if xy_region:
        x_hist = x_hist.isel(**xy_region)
    print('x_hist:\n', x_hist)

    x_scen = open_single(model, scenario, member).pipe(process_cmip)
    if xy_region:
        x_scen = x_scen.isel(**xy_region)

    if 'hist' in scenario:
        x_scen = x_scen.sel(time=hist_time)
    else:
        x_scen = x_scen.sel(time=future_time)
    print('x_scen:\n', x_scen)

    print('fitting models')
    models = {}
    y_scen = xr.Dataset()

    for v in bc_vars:
        print(v)

        models[v] = PointWiseDownscaler(
            TrendAwareQuantileMappingRegressor(QuantileMappingReressor(extrapolate='1to1'))
        )

        # train models with historical data
        models[v].fit(x_hist[v].sel(time=train_time), y_hist[v].sel(time=train_time))

        # predict this ensemble member
        y_scen[v] = models[v].predict(x_scen[v])

    y_scen = y_scen.chunk(chunks)
    print('y_scen:\n', y_scen)

    if dry_run:
        print('skipping write of ... dry_run=True')
        return 'skipped'
    else:
        store.clear()
        write = y_scen.to_zarr(store, compute=False, mode='w')
        write.compute(retries=3)
        zarr.consolidate_metadata(store)
        return 'done'
def write_zarr(dataset, outfile, chunks_return={}, chunks_on_disk={}, compressor=None, graph_name='write_zarr'):
    """
    Write xarray dataset to zarr format on disk. When chunks_on_disk is not specified the chunking in the
    input dataset is used. When chunks_on_disk is specified that dataset is saved using that chunking.
    The dataset on disk is then opened and rechunked using chunks_return or the chunking of dataset.

    Parameters
    ----------
    dataset : xarray.core.dataset.Dataset
        Dataset to write to disk
    outfile : str
        outfile filename, generally ends in .zarr
    chunks_return : dict of int
        A dictionary with the chunk size that will be returned. For example {'time': 20, 'chan': 6}.
        If chunks_return is not specified the chunking of dataset will be used.
    chunks_on_disk : dict of int
        A dictionary with the chunk size that will be used when writing to disk. For example {'time': 20, 'chan': 6}.
        If chunks_on_disk is not specified the chunking of dataset will be used.
    compressor : numcodecs.blosc.Blosc
        The blosc compressor to use when saving the converted data to disk using zarr.
        If None, the zstd compression algorithm is used with compression level 2.
    graph_name : string
        The time taken to execute the graph and save the dataset is measured and saved as an attribute in the zarr file.
        The graph_name is the label for this timing information.

    Returns
    -------
    """
    import xarray as xr
    import zarr
    import time
    from numcodecs import Blosc
    from itertools import cycle
    from zarr.meta import json_dumps, json_loads
    from zarr.creation import normalize_store_arg, open_array

    # Check if disk chunking is specified
    if bool(chunks_on_disk):
        dataset_for_disk = dataset.chunk(chunks=chunks_on_disk)
    else:
        dataset_for_disk = dataset

    if compressor is None:
        compressor = Blosc(cname='zstd', clevel=2, shuffle=0)

    # Create compression encoding for each datavariable
    encoding = dict(zip(list(dataset_for_disk.data_vars), cycle([{'compressor': compressor}])))

    start = time.time()
    # Consolidated is set to False so that the timing information is included in the consolidated metadata.
    xr.Dataset.to_zarr(dataset_for_disk, store=outfile, mode='w', encoding=encoding, consolidated=False)
    time_to_calc_and_store = time.time() - start
    print('Time to store and execute graph ', graph_name, time_to_calc_and_store)

    # Add timing information
    dataset_group = zarr.open_group(outfile, mode='a')
    dataset_group.attrs[graph_name + '_time'] = time_to_calc_and_store

    # Consolidate metadata
    zarr.consolidate_metadata(outfile)

    if bool(chunks_return):
        return xr.open_zarr(outfile, consolidated=True, overwrite_encoded_chunks=True)
    else:
        # Get input dataset chunking
        for dim_key in dataset.chunks:
            chunks_return[dim_key] = dataset.chunks[dim_key][0]
        return xr.open_zarr(outfile, chunks=chunks_return, consolidated=True, overwrite_encoded_chunks=True)
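A brief usage sketch for write_zarr; the input and output store names are hypothetical, and the input is assumed to be a dask-backed xarray Dataset:

import xarray as xr

ds = xr.open_zarr("input.zarr")  # hypothetical dask-backed dataset
# Persist with explicit on-disk chunking and get back a handle opened from the
# consolidated store.
ds_on_disk = write_zarr(ds, "output.zarr", chunks_on_disk={"time": 20, "chan": 6})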