def finalize_target(target: CacheFSSpecTarget, consolidate_zarr: bool) -> None:
    if target is None:
        raise ValueError("target has not been set.")
    if consolidate_zarr:
        logger.info("Consolidating Zarr metadata")
        target_mapper = target.get_mapper()
        zarr.consolidate_metadata(target_mapper)
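For context, here is a minimal sketch (illustrative local path, zarr v2 API) of what the consolidation step above buys the reader: once zarr.consolidate_metadata has run, the whole hierarchy can be opened from the single ".zmetadata" key.

import zarr

# Illustrative local store; an fsspec mapper (s3://, gs://, ...) behaves the same way.
root = zarr.open_group("example.zarr", mode="w")
root.create_dataset("temperature", shape=(4, 4), chunks=(2, 2), dtype="f8")
zarr.consolidate_metadata("example.zarr")

# Readers now fetch all group/array metadata from one ".zmetadata" key instead of
# issuing a request per .zgroup/.zarray/.zattrs entry.
consolidated = zarr.open_consolidated("example.zarr", mode="r")
print(consolidated["temperature"].chunks)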
Example #2
 def finalize_target(self) -> None:
     if self.target is None:
         raise ValueError("target has not been set.")
     if self.consolidate_zarr:
         logger.info("Consolidating Zarr metadata")
         target_mapper = self.target.get_mapper()
         zarr.consolidate_metadata(target_mapper)
Example #3
    def store(
        self,
        variables,
        attributes,
        check_encoding_set=frozenset(),
        writer=None,
        unlimited_dims=None,
    ):
        """
        Top level method for putting data on this store, this method:
          - encodes variables/attributes
          - sets dimensions
          - sets variables

        Parameters
        ----------
        variables : dict-like
            Dictionary of key/value (variable name / xr.Variable) pairs
        attributes : dict-like
            Dictionary of key/value (attribute name / attribute) pairs
        check_encoding_set : list-like
            List of variables that should be checked for invalid encoding
            values
        writer : ArrayWriter
        unlimited_dims : list-like
            List of dimension names that should be treated as unlimited
            dimensions (dimension on which the zarray will be appended;
            only needed in append mode).
        """
        import zarr

        existing_variables = {
            vn for vn in variables if _encode_variable_name(vn) in self.ds
        }
        new_variables = set(variables) - existing_variables
        variables_without_encoding = {vn: variables[vn] for vn in new_variables}
        variables_encoded, attributes = self.encode(
            variables_without_encoding, attributes
        )

        if len(existing_variables) > 0:
            # there are variables to append
            # their encoding must be the same as in the store
            ds = open_zarr(self.ds.store, group=self.ds.path, chunks=None)
            variables_with_encoding = {}
            for vn in existing_variables:
                variables_with_encoding[vn] = variables[vn].copy(deep=False)
                variables_with_encoding[vn].encoding = ds[vn].encoding
            variables_with_encoding, _ = self.encode(variables_with_encoding, {})
            variables_encoded.update(variables_with_encoding)

        if self._write_region is None:
            self.set_attributes(attributes)
            self.set_dimensions(variables_encoded, unlimited_dims=unlimited_dims)
        self.set_variables(
            variables_encoded, check_encoding_set, writer, unlimited_dims=unlimited_dims
        )
        if self._consolidate_on_close:
            zarr.consolidate_metadata(self.ds.store)
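For orientation, the _consolidate_on_close flag in the store method above is what the user-facing consolidated=True option of xarray.Dataset.to_zarr ultimately toggles. A minimal round-trip sketch with synthetic data and an illustrative local path:

import numpy as np
import xarray as xr

ds = xr.Dataset({"t2m": (("time", "x"), np.zeros((3, 4)))})
# Write the data, then consolidate metadata when the store is closed.
ds.to_zarr("t2m_example.zarr", mode="w", consolidated=True)
# Reading with consolidated=True avoids per-array metadata requests.
roundtrip = xr.open_zarr("t2m_example.zarr", consolidated=True)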
Example #4
def make_annual(model, scenario, member, method):

    if 'hist' in scenario:
        tslice = slice(None, '2014-12')
    else:
        tslice = slice('2015', '2100')
    monthly_mapper = zarr.storage.ABSStore(
        'carbonplan-downscaling',
        prefix=f'cmip6/{method}/conus/4000m/monthly/{model}.{scenario}.{member}.zarr',
        account_name="carbonplan",
        account_key=account_key,
    )

    annual_mapper = zarr.storage.ABSStore(
        'carbonplan-downscaling',
        prefix=f'cmip6/{method}/conus/4000m/annual/{model}.{scenario}.{member}.zarr',
        account_name="carbonplan",
        account_key=account_key,
    )

    if skip_existing and '.zmetadata' in annual_mapper:
        return 'skipped'

    ds_monthly = xr.open_zarr(monthly_mapper, consolidated=True).sel(time=tslice).chunk(chunks)
    template = _annual(ds_monthly, compute=False).chunk(chunks)
    ds_annual = ds_monthly.map_blocks(_annual, template=template)
    annual_mapper.clear()
    task = ds_annual.to_zarr(annual_mapper, mode='w', compute=False)
    dask.compute(task, retries=task_retries)
    zarr.consolidate_metadata(annual_mapper)
    return 'done'
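The '.zmetadata' in annual_mapper guard above works because consolidation writes exactly that key into the store. A small sketch of the same skip-if-done idiom against a local store (path is illustrative):

import zarr

store = zarr.DirectoryStore("annual_example.zarr")
if ".zmetadata" in store:
    print("already consolidated, skipping")
else:
    root = zarr.open_group(store, mode="a")
    root.attrs["note"] = "freshly written"
    zarr.consolidate_metadata(store)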
Example #5
def split_and_write(model, scenario, member, method):

    ds = get_scratch_ds(model, scenario, member, method)

    scen_mapper = zarr.storage.ABSStore(
        'carbonplan-downscaling',
        prefix=f'cmip6/{method}/conus/4000m/monthly/{model}.{scenario}.{member}.zarr',
        account_name="carbonplan",
        account_key=account_key,
    )

    if not (skip_existing and 'pdsi/.zarray' in scen_mapper):
        print('writing scen')
        ds.sel(time=slice('2015-01', None)).to_zarr(scen_mapper, mode='a')
        zarr.consolidate_metadata(scen_mapper)

    hist_mapper = zarr.storage.ABSStore(
        'carbonplan-downscaling',
        prefix=f'cmip6/{method}/conus/4000m/monthly/{model}.historical.{member}.zarr',
        account_name="carbonplan",
        account_key=account_key,
    )
    if not (skip_existing and 'pdsi/.zarray' in hist_mapper):
        print('writing hist')
        ds.sel(time=slice(None, '2014-12')).to_zarr(hist_mapper, mode='a')
        zarr.consolidate_metadata(hist_mapper)
Example #6
def _unchunk_vars(dataset_path: str, var_names: List[str]):
    for var_name in var_names:
        var_path = os.path.join(dataset_path, var_name)

        # Optimization: if "shape" and "chunks" are equal in ${var}/.zarray, we are done
        var_array_info_path = os.path.join(var_path, '.zarray')
        with open(var_array_info_path, 'r') as fp:
            var_array_info = json.load(fp)
            if var_array_info.get('shape') == var_array_info.get('chunks'):
                continue

        # Open array and remove chunks from the data
        var_array = zarr.convenience.open_array(var_path, 'r+')
        if var_array.shape != var_array.chunks:
            # TODO (forman): Fully loading data is inefficient and dangerous for large arrays.
            #                Instead save unchunked to temp and replace existing chunked array dir with temp.
            # Fully load data and attrs so we no longer depend on files
            data = np.array(var_array)
            attributes = var_array.attrs.asdict()
            # Save array data
            zarr.convenience.save_array(var_path,
                                        data,
                                        chunks=False,
                                        fill_value=var_array.fill_value)
            # zarr.convenience.save_array() does not seem to save user attributes (file ".zattrs" not written),
            # therefore we must modify attrs explicitly:
            var_array = zarr.convenience.open_array(var_path, 'r+')
            var_array.attrs.update(attributes)

    zarr.consolidate_metadata(dataset_path)
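A minimal usage sketch for the helper above, assuming _unchunk_vars and its imports (os, json, numpy, zarr, typing.List) are in scope; the dataset path and variable name are illustrative:

import numpy as np
import zarr

root = zarr.open_group("unchunk_demo.zarr", mode="w")
root.create_dataset("temperature", data=np.random.rand(100, 100), chunks=(10, 10))

_unchunk_vars("unchunk_demo.zarr", ["temperature"])

arr = zarr.open_array("unchunk_demo.zarr/temperature", mode="r")
assert arr.shape == arr.chunks  # the variable is now stored as a single chunk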
Example #7
 def write_group_to_zarr(consolidated=False):
     path = str(tmpdir.join("test.zarr"))
     z = zarr.open_group(path)
     arr = z.create_dataset("var1", shape=(3, 5))
     arr[:] = 1.0
     if consolidated:
         zarr.consolidate_metadata(path)
     return z, path
Example #8
 def update(self,
            output_path: str,
            global_attrs: Dict[str, Any] = None,
            **kwargs):
     if global_attrs:
         import zarr
         ds = zarr.open_group(output_path, mode='r+', **kwargs)
         ds.attrs.update(global_attrs)
         zarr.consolidate_metadata(output_path)
Example #9
 def consolidate_metadata(self, metadata_key='.zmetadata'):
     '''
     Wrapper over zarr.consolidate_metadata to pass chunk store when opening the zarr store
     '''
     zarr.consolidate_metadata(self.store, metadata_key=metadata_key)
     store_mode_cons = 'r' if self.store_mode == 'r' else 'r+'
     self.zgroup = zarr.open_consolidated(self.store, metadata_key=metadata_key,
                                          mode=store_mode_cons, chunk_store=self.zgroup.chunk_store,
                                          path=self.store_path)
     return self.zgroup
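A sketch of why passing chunk_store matters in the wrapper above: when metadata and chunk bytes live in separate mappings, open_consolidated needs to be pointed at both. In-memory stores are used here purely for illustration.

import zarr

meta_store = zarr.storage.MemoryStore()
chunk_store = zarr.storage.MemoryStore()

g = zarr.open_group(meta_store, mode="w", chunk_store=chunk_store)
g.create_dataset("a", shape=(4,), chunks=(2,), dtype="i4")

zarr.consolidate_metadata(meta_store)
# Reopen from the consolidated metadata while still directing chunk reads at chunk_store.
reopened = zarr.open_consolidated(meta_store, mode="r", chunk_store=chunk_store)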
Example #10
def main():
    parser = argparse.ArgumentParser(
        description=("Ensure that value of calendar attribute of time "
                     "variable in a Zarr store is in lower case."))
    parser.add_argument("--config",
                        "-c",
                        type=str,
                        action="append",
                        help="Read S3 configuration from this/these config(s)")
    parser.add_argument("--force",
                        "-f",
                        action="store_true",
                        help="Rename calendar even if it's already lower-case")
    parser.add_argument("--dry-run",
                        "-s",
                        action="store_true",
                        help="Don't rename, "
                        "just show what would have been done")
    parser.add_argument("zarr_store",
                        type=str,
                        help="Zarr store specifier (path or URL)")
    args = parser.parse_args()
    store_arg = args.zarr_store

    if args.config and store_arg.lower().startswith("s3://"):
        s3_config = {}
        config = nc2zarr.config.load_config(args.config, return_kwargs=True)
        if "output_s3" in config:
            s3_config = config["output_s3"]
        # We have to create this store manually to set normalize_keys=False,
        # because normalize_keys=True can break consolidate_metadata.
        store = zarr.storage.FSStore(store_arg,
                                     mode="r+",
                                     **s3_config,
                                     normalize_keys=False)
    else:
        store = zarr.creation.normalize_store_arg(store_arg)

    z = zarr.open_group(store, mode="r+")
    calendar = z.time.attrs["calendar"]

    log(f"Current calendar: \"{calendar}\"")
    if calendar.islower() and not args.force:
        log("Already lower case; leaving unchanged.")
    else:
        new_calendar = calendar.lower()
        if args.dry_run:
            log(f"New name: \"{new_calendar}\"")
            log("Dry run requested -- not actually renaming.")
        else:
            log(f"Renaming to \"{new_calendar}\"...")
            z.time.attrs["calendar"] = new_calendar
            log("Consolidating...")
            zarr.consolidate_metadata(store)
            log("Done.")
Example #11
def consolidate_metadata(target):
    """
    Consolidate Zarr metadata

    Parameters
    ----------
    target : str
         Path or url of the Zarr store.
    """
    mapper = fsspec.get_mapper(target)
    zarr.consolidate_metadata(mapper)
Example #12
def consolidate_metadata(writes: List[str], target: str) -> None:
    """
    Consolidate the metadata of the Zarr group at `target`.

    Parameters
    ----------
    writes : List[str]
        The URLs the combined stores were written to. This is only a
        parameter to introduce a dependency. The actual value isn't used.
    target : str
        The URL for the (combined) Zarr group.
    """
    mapper = fsspec.get_mapper(target)
    zarr.consolidate_metadata(mapper)
Example #13
def consolidate_metadata(target, writes: Optional[List[str]] = None) -> None:
    """
    Consolidate the metadata of the Zarr group at `target`.

    Parameters
    ----------
    target : str
        The URL for the (combined) Zarr group.
    writes : list of strings, optional
        The URLs the combined stores were written to. This is only a
        parameter to introduce a dependency in the pipeline execution graph.
        The actual value isn't used.
    """
    mapper = fsspec.get_mapper(target)
    zarr.consolidate_metadata(mapper)
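A sketch of the dependency-injection pattern that the writes parameter enables, assuming the consolidate_metadata function above is in scope. The target URL, the write_piece helper, and the in-memory filesystem are all illustrative; the only point is that passing writes makes consolidation run after the write tasks in the task graph.

import dask
import fsspec
import zarr

target = "memory://combined.zarr"
zarr.open_group(fsspec.get_mapper(target), mode="w")  # create the root group up front

@dask.delayed
def write_piece(i):
    # Stand-in for a task that writes one shard of the combined store.
    group = zarr.open_group(fsspec.get_mapper(target), mode="r+")
    group.create_dataset(f"var{i}", shape=(2,), dtype="i4")
    return f"{target}/var{i}"

writes = [write_piece(i) for i in range(3)]
# consolidate_metadata ignores the values in `writes`; they only order the graph.
dask.compute(dask.delayed(consolidate_metadata)(target, writes))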
Example #14
def update_hydat_database(path):
    project_root = '/tmp'
    data_dir = os.path.join(project_root, 'data')
    stations_list = get_available_stations_from_hydat()
    #
    # results = []
    for station_number in stations_list:
        if verify_data_type_exists(station_number, 'Flow'):
            import_hydat_to_parquet(station_number)

    storage_options = {"client_kwargs": {'endpoint_url': 'https://s3.us-east-2.wasabisys.com',
                                         'region_name': 'us-east-2'}}

    df = pd.read_parquet(os.path.join(data_dir, 'basin.parquet'), engine='pyarrow')
    df.to_parquet('s3://hydrology/timeseries/sources/hydat/basin.parquet',
                  engine='fastparquet',
                  compression='gzip',
                  storage_options=storage_options)
    df = pd.read_parquet(os.path.join(data_dir, 'context.parquet'), engine='pyarrow')
    df.to_parquet('s3://hydrology/timeseries/sources/hydat/context.parquet',
                  engine='fastparquet',
                  compression='gzip',
                  storage_options=storage_options)

    client_kwargs = {'endpoint_url': 'https://s3.us-east-2.wasabisys.com',
                     'region_name': 'us-east-2'}
    config_kwargs = {'max_pool_connections': 30}

    bucket_source = os.path.join(data_dir, 'zarr')
    bucket_sink = "s3://hydrology/timeseries/sources/hydat/values.zarr "
    endpoint_url = 'https://s3.us-east-2.wasabisys.com'
    region='us-east-2'

    s3 = s3fs.S3FileSystem(client_kwargs=client_kwargs,
                           config_kwargs=config_kwargs)
    store = s3fs.S3Map(root=bucket_sink,
                       s3=s3)


    aws_command = "aws s3 sync {} {} --endpoint-url={} --region={}".format(bucket_source,
                                                                           bucket_sink,
                                                                           endpoint_url,
                                                                           region)
    print(aws_command)
    subprocess.call(aws_command, shell=True)

    zarr.consolidate_metadata(store)
Example #15
def create_vcfzarr(
    shared_datadir, tmpdir, *, fields=None, grouped_by_contig=False, consolidated=False
):
    """Create a vcfzarr file using scikit-allel"""
    vcf_path = shared_datadir / "sample.vcf"
    output_path = tmpdir / "sample.vcf.zarr"
    if grouped_by_contig:
        for contig in ["19", "20", "X"]:
            allel.vcf_to_zarr(
                str(vcf_path),
                str(output_path),
                fields=fields,
                group=contig,
                region=contig,
            )
    else:
        allel.vcf_to_zarr(str(vcf_path), str(output_path), fields=fields)
    if consolidated:
        zarr.consolidate_metadata(str(output_path))
    return output_path
Example #16
def main(products=['GLAH01', 'GLAH14']):

    # convert hdf5 granules to zarr
    print('granules_h5_to_zarr')
    results = granules_h5_to_zarr(products)

    for p, presults in results.items():
        uris = presults['skipped'] + presults['converted']

        print('open xarray datasets')
        ds_list = dask.compute([lazy_open(uri) for uri in uris])[0]

        print('concat list of datasets')
        with dask.config.set(**{'array.slicing.split_large_chunks': True}):
            ds = xr.concat(ds_list, dim='record_index').chunk({'record_index': chunksize})
            for k in ds:
                _ = ds[k].encoding.pop('chunks', None)

        # write
        print(f'writing {p}')
        print(ds)
        print(f'ds.nbytes: {ds.nbytes / 1e9}')
        mapper = fsspec.get_mapper(f'gs://carbonplan-climatetrace/intermediates/{p.lower()}.zarr')
        mapper.clear()

        # print('writing zarr dataset')
        ds.to_zarr(mapper, compute=False, mode='w')
        stepsize = chunksize * 1
        recs = ds.dims['record_index']
        print('writing zarr dataset chunks')
        for left, right in zip(
            range(0, recs, stepsize), range(stepsize, recs + stepsize, stepsize)
        ):
            s = slice(left, right)
            print(s, flush=True)
            ds.isel(record_index=s).drop(drop_keys[p]).to_zarr(mapper, region={'record_index': s})

        zarr.consolidate_metadata(mapper)
Example #17
def main():
    parser = argparse.ArgumentParser(
        description="Update a Zarr's start_date and "
                    "stop_date attributes to match its data.")
    parser.add_argument("zarr",
                        metavar="PATH_OR_URL",
                        help="Path or URL of the Zarr store")
    parser.add_argument("--dry-run",
                        "-d",
                        action="store_true",
                        help="Don't actually write metadata")
    parser.add_argument("--verbose",
                        "-v",
                        action="store_true",
                        help="Report progress to standard output")
    args = parser.parse_args()
    ds = xr.open_zarr(args.zarr)
    z = zarr.open(args.zarr)
    t0 = ds.time[0].values
    t1 = ds.time[-1].values
    if args.verbose:
        print("First/last times:", t0, t1)
    new_attrs = dict(start_date=pd.to_datetime(t0).strftime("%Y-%m-%d"),
                     stop_date=pd.to_datetime(t1).strftime("%Y-%m-%d"))
    if args.verbose:
        for title, dic in ("Old", z.attrs), ("New", new_attrs):
            print(f"{title} attributes:")
            for key in "start_date", "stop_date":
                print(f'    {key}: ' +
                      (dic[key] if key in dic else "not present"))
    if args.dry_run:
        if args.verbose:
            print("Dry run -- not updating.")
    else:
        z.attrs.update(new_attrs)
        zarr.consolidate_metadata(args.zarr)
        if args.verbose:
            print("Attributes updated.")
Example #18
def concat_zarrs_optimized(
    zarr_files: Sequence[str],
    output: Union[PathType, MutableMapping[str, bytes]],
    vars_to_rechunk: List[Hashable],
    vars_to_copy: List[Hashable],
    fix_strings: bool = False,
) -> None:
    if isinstance(output, Path):
        output = str(output)

    zarr_groups = [zarr.open_group(f) for f in zarr_files]

    first_zarr_group = zarr_groups[0]

    # create the top-level group
    zarr.open_group(output, mode="w")

    # copy variables that are to be rechunked
    # NOTE: this uses the _to_zarr function defined here, which is needed to avoid
    # race conditions between writing the array contents and its metadata
    # see https://github.com/pystatgen/sgkit/pull/486
    delayed = []  # do all the rechunking operations in one computation
    for var in vars_to_rechunk:
        dtype = None
        if fix_strings and var in {"variant_id", "variant_allele"}:
            max_len = _get_max_len(zarr_groups, f"max_length_{var}")
            dtype = f"S{max_len}"
        arr = concatenate_and_rechunk([group[var] for group in zarr_groups],
                                      dtype=dtype)

        _to_zarr_kwargs = dict(
            compressor=first_zarr_group[var].compressor,
            filters=first_zarr_group[var].filters,
            fill_value=None,
        )
        if not fix_strings and arr.dtype == "O":
            # We assume that all object dtypes are variable length strings
            var_len_str_codec = numcodecs.VLenUTF8()
            _to_zarr_kwargs["object_codec"] = var_len_str_codec
            # Remove from filters to avoid double encoding error
            if var_len_str_codec in first_zarr_group[var].filters:
                filters = list(first_zarr_group[var].filters)
                filters.remove(var_len_str_codec)
                _to_zarr_kwargs["filters"] = filters

        d = _to_zarr(  # type: ignore[no-untyped-call]
            arr,
            output,
            component=var,
            overwrite=True,
            compute=False,
            attrs=first_zarr_group[var].attrs.asdict(),
            **_to_zarr_kwargs,
        )
        delayed.append(d)
    da.compute(*delayed)

    # copy unchanged variables and top-level metadata
    with zarr.open_group(output) as output_zarr:

        # copy variables that are not rechunked (e.g. sample_id)
        for var in vars_to_copy:
            output_zarr[var] = first_zarr_group[var]
            output_zarr[var].attrs.update(first_zarr_group[var].attrs)

        # copy top-level attributes
        group_attrs = dict(first_zarr_group.attrs)
        if "max_alt_alleles_seen" in group_attrs:
            max_alt_alleles_seen = _get_max_len(zarr_groups,
                                                "max_alt_alleles_seen")
            group_attrs["max_alt_alleles_seen"] = max_alt_alleles_seen
        output_zarr.attrs.update(group_attrs)

    # consolidate metadata
    zarr.consolidate_metadata(output)
Example #19
def update_slice(store: Union[str, MutableMapping],
                 insert_index: int,
                 dataslice: xr.Dataset,
                 mode: str,
                 dimension: str = "time") -> None:
    """
    Update existing Zarr dataset with new data slice.

    :param store: A Zarr store.
    :param insert_index: index at which to insert
    :param dataslice: slice to insert
    :param mode: Update mode, 'insert' or 'replace'
    :param dimension: name of dimension perpendicular to slice
    """

    if mode not in ('insert', 'replace'):
        raise ValueError(f'illegal mode value: {mode!r}')

    insert_mode = mode == 'insert'

    append_dim_var_names = []
    encoding = {}

    # Neither Zarr nor xarray offer an explicit API function to check whether
    # a Zarr is consolidated. Here we use the workaround of attempting to
    # open as consolidated, and catching the resulting exception if this
    # isn't possible. In the case of a consolidated Zarr, there is a slight
    # inefficiency, since the consolidated metadata object is fetched twice
    # (by Zarr and thereafter by xarray). See comments on PR #48 for
    # discussion of possible optimizations.
    consolidated = True
    try:
        _ = zarr.open_consolidated(store)
    except KeyError:
        consolidated = False

    with xr.open_zarr(store, consolidated=consolidated) as ds:
        for var_name in ds.variables:
            var = ds[var_name]
            if var.ndim >= 1 and dimension in var.dims:
                if var.dims[0] != dimension:
                    # TODO: Remove this restriction -- it's not fundamentally
                    #   necessary. Removal should be accompanied by appropriate
                    #   unit tests and the addition of a warning to the user
                    #   about potential slowness / inefficiency.
                    raise ValueError(f"dimension '{dimension}' of variable "
                                     f"{var_name!r} must be first dimension")
                append_dim_var_names.append(var_name)
                enc = dict(ds[var_name].encoding)
                # xarray 0.17+ supports engine-preferred chunks if exposed by
                # the backend, and the zarr backend does so; but if the new
                # 'preferred_chunks' entry is passed back when writing to zarr,
                # it raises "'preferred_chunks' is an unsupported encoding".
                if 'preferred_chunks' in enc:
                    del enc['preferred_chunks']
                encoding[var_name] = enc

    temp_dir = tempfile.TemporaryDirectory(prefix='nc2zarr-slice-',
                                           suffix='.zarr')
    dataslice.to_zarr(temp_dir.name, encoding=encoding)
    slice_root_group = zarr.open(temp_dir.name, mode='r')
    slice_arrays = dict(slice_root_group.arrays())

    root_group = zarr.open(store, mode='r+')
    for var_name, var_array in root_group.arrays():
        if var_name in append_dim_var_names:
            slice_array = slice_arrays[var_name]
            if insert_mode:
                # Add one empty step
                empty = zarr.creation.empty(slice_array.shape,
                                            dtype=var_array.dtype)
                var_array.append(empty, axis=0)
                # Shift contents
                var_array[insert_index + 1:, ...] = \
                    var_array[insert_index:-1, ...]
            # Replace slice
            var_array[insert_index, ...] = slice_array[0]

    if consolidated:
        zarr.consolidate_metadata(store)
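A minimal usage sketch for update_slice, assuming the function above and its imports (tempfile, xarray, zarr) are in scope; the store path and data are illustrative:

import numpy as np
import pandas as pd
import xarray as xr

times = pd.date_range("2000-01-01", periods=3)
ds = xr.Dataset({"v": (("time",), np.arange(3.0))}, coords={"time": times})
ds.to_zarr("series_example.zarr", mode="w", consolidated=True)

# Insert a copy of the second time step at index 1, shifting later steps to the right.
update_slice("series_example.zarr", insert_index=1,
             dataslice=ds.isel(time=[1]), mode="insert")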
Example #20
 def _finalize_target(self):
     if self.consolidate_zarr:
         logger.info("Consolidating Zarr metadata")
         target_mapper = self.target.get_mapper()
         zarr.consolidate_metadata(target_mapper)
Example #21
def write_zarr(ds, mapper):
    task = ds.to_zarr(mapper, mode='a', compute=False)
    task.compute(retries=task_retries)
    zarr.consolidate_metadata(mapper)
Example #22
 def consolidate(self):
     zarr.consolidate_metadata(self.zgroup.store)
     self.consolidated = True
Example #23
    def store(
        self,
        variables,
        attributes,
        check_encoding_set=frozenset(),
        writer=None,
        unlimited_dims=None,
    ):
        """
        Top level method for putting data on this store, this method:
          - encodes variables/attributes
          - sets dimensions
          - sets variables

        Parameters
        ----------
        variables : dict-like
            Dictionary of key/value (variable name / xr.Variable) pairs
        attributes : dict-like
            Dictionary of key/value (attribute name / attribute) pairs
        check_encoding_set : list-like
            List of variables that should be checked for invalid encoding
            values
        writer : ArrayWriter
        unlimited_dims : list-like
            List of dimension names that should be treated as unlimited
            dimensions (dimension on which the zarray will be appended;
            only needed in append mode).
        """
        existing_variable_names = {
            vn for vn in variables if _encode_variable_name(vn) in self.zarr_group
        }
        new_variables = set(variables) - existing_variable_names
        variables_without_encoding = {vn: variables[vn] for vn in new_variables}
        variables_encoded, attributes = self.encode(
            variables_without_encoding, attributes
        )

        if existing_variable_names:
            # Decode variables directly, without going via xarray.Dataset to
            # avoid needing to load index variables into memory.
            # TODO: consider making loading indexes lazy again?
            existing_vars, _, _ = conventions.decode_cf_variables(
                self.get_variables(), self.get_attrs()
            )
            # Modified variables must use the same encoding as the store.
            vars_with_encoding = {}
            for vn in existing_variable_names:
                vars_with_encoding[vn] = variables[vn].copy(deep=False)
                vars_with_encoding[vn].encoding = existing_vars[vn].encoding
            vars_with_encoding, _ = self.encode(vars_with_encoding, {})
            variables_encoded.update(vars_with_encoding)

            for var_name in existing_variable_names:
                new_var = variables_encoded[var_name]
                existing_var = existing_vars[var_name]
                _validate_existing_dims(
                    var_name,
                    new_var,
                    existing_var,
                    self._write_region,
                    self._append_dim,
                )

        if self._mode not in ["r", "r+"]:
            self.set_attributes(attributes)
            self.set_dimensions(variables_encoded, unlimited_dims=unlimited_dims)

        self.set_variables(
            variables_encoded, check_encoding_set, writer, unlimited_dims=unlimited_dims
        )
        if self._consolidate_on_close:
            zarr.consolidate_metadata(self.zarr_group.store)
Example #24
 def consolidate_metadata(self):
     mapper = fsspec.get_mapper(self.targets)
     zarr.consolidate_metadata(mapper)
Example #25
def write_vis(mxds, outfile, chunks_on_disk=None, partition=None, consolidated=True, compressor=None, graph_name='write_zarr'):
    """
    Write xarray dataset to zarr format on disk. When chunks_on_disk is not specified the chunking in the input dataset is used.
    When chunks_on_disk is specified that dataset is saved using that chunking.

    Parameters
    ----------
    mxds : xarray.core.dataset.Dataset
        Dataset of datasets to write to disk
    outfile : str
        outfile filename, generally ends in .zarr
    chunks_on_disk : dict of int
        A dictionary with the chunk size that will be used when writing to disk. For example {'time': 20, 'chan': 6}.
        If chunks_on_disk is not specified the chunking of dataset will be used.
    partition : str or list
        Name of partition xds to write into outfile (from the mxds attributes section). Overwrites existing partition of same name.
        Default None writes entire mxds
    compressor : numcodecs.blosc.Blosc
        The blosc compressor to use when saving the converted data to disk using zarr.
        If None, the zstd compression algorithm is used with compression level 2.
    graph_name : string
        The time taken to execute the graph and save the dataset is measured and saved as an attribute in the zarr file.
        The graph_name is the label for this timing information.

    Returns
    -------
    """
    import xarray as xr
    import zarr
    import time
    from numcodecs import Blosc
    from itertools import cycle
    import os
    import numpy as np

    if compressor is None:
        compressor = Blosc(cname='zstd', clevel=2, shuffle=0)

    if partition is None:
        partition = list(mxds.attrs.keys())
    partition = list(np.atleast_1d(partition))
        
    os.system("rm -fr " + outfile)
    os.system("mkdir " + outfile)
        
    for xds_name in partition:
        if "xds" in xds_name:
            xds_outfile = outfile + '/' + xds_name
            xds_for_disk = mxds.attrs[xds_name]
            if chunks_on_disk is not None:
                xds_for_disk = xds_for_disk.chunk(chunks=chunks_on_disk)
        else:
            xds_outfile = outfile + '/global/' + xds_name
            xds_for_disk = mxds.attrs[xds_name]
            
        # Create compression encoding for each datavariable
        encoding = dict(zip(list(xds_for_disk.data_vars), cycle([{'compressor': compressor}])))
        start = time.time()

        # Consolidated is set to False so that the timing information is included in the consolidated metadata.
        xr.Dataset.to_zarr(xds_for_disk, store=xds_outfile, mode='w', encoding=encoding,consolidated=False)
        time_to_calc_and_store = time.time() - start
        print('Time to store and execute graph for ', xds_name, graph_name, time_to_calc_and_store)

        #Add timing information
        dataset_group = zarr.open_group(xds_outfile, mode='a')
        dataset_group.attrs[graph_name+'_time'] = time_to_calc_and_store
            
        if consolidated:
            zarr.consolidate_metadata(xds_outfile)
Example #26
def append_zarr(list_xarray_data_variables,
                outfile,
                chunks_return={},
                compressor=None,
                graph_name='append_zarr'):
    """
    Append a list of dask arrays to a zarr file on disk. If a data variable with the same name is found it will be overwritten.
    Data will probably be corrupted if append_zarr overwrites the data variable from which the dask array gets its data.
    All data variables that share dimensions and coordinates with data variables already on disk must have the same values (chunking can be different).
    
    Parameters
    ----------
    list_xarray_data_variables: list of dask arrays
        List of xarray datavariables to append.
    outfile : str
        The file name of the dataset on disk, generally ends in .zarr
    chunks_return : dict of int
        A dictionary with the chunk size that will be returned. For example {'time': 20, 'chan': 6}.
        If chunks_return is not specified the chunking on disk will be used.
    compressor : numcodecs.blosc.Blosc
        The blosc compressor to use when saving the converted data to disk using zarr.
        If None, the zstd compression algorithm is used with compression level 2.
    graph_name : string
        The time taken to execute the graph and save the dataset is measured and saved as an attribute in the zarr file.
        The graph_name is the label for this timing information.
    Returns
    -------
    """
    #Why this function is needed https://stackoverflow.com/questions/58042559/adding-new-xarray-dataarray-to-an-existing-zarr-store-without-re-writing-the-who
    #To understand this function go over dask/array/core.py and xarray/backends/common.py.

    from fsspec import get_mapper
    import xarray as xr
    import zarr
    import dask
    import dask.array as da
    import time
    from numcodecs import Blosc

    start = time.time()
    n_arrays = len(list_xarray_data_variables)
    try:
        disk_dataset = xr.open_zarr(outfile)
    except ValueError:
        print("######### ERROR: Could not open " + outfile)

    if compressor is None:
        compressor = Blosc(cname='zstd', clevel=2, shuffle=0)
    ######################################################################################
    #Create a list of delayed zarr.create commands (list_target_zarr) for each dask array in list_dask_array.
    ######################################################################################

    list_target_zarr = []
    list_dask_array = []

    for i in range(n_arrays):
        list_new_dim_name = []
        list_new_coord_name = []
        list_new_coord_dim_names = []

        #Create list of dimension chunk sizes on disk
        chunksize_on_disk = []

        #Get array chunksize on disk and add new dimensions
        #The order of the for loop is important, since chunksize_on_disk must have the correct dimension ordering.
        for dim_name in list_xarray_data_variables[i].dims:
            if dim_name in disk_dataset.dims:
                chunksize_on_disk.append(disk_dataset.chunks[dim_name][0])
            else:
                #Since the dimension does not exist on disk, use chunking in list_xarray_data_variables[i]
                chunksize_on_disk.append(
                    list_xarray_data_variables[i].to_dataset().chunks[dim_name]
                    [0])

                #Add dim to be stored
                dask_dim_array = da.from_array(
                    list_xarray_data_variables[i][dim_name].data, chunks=(1, ))
                mapper = get_mapper(outfile + '/' + dim_name)
                list_target_zarr.append(
                    dask.delayed(zarr.create)(shape=dask_dim_array.shape,
                                              compressor=compressor,
                                              chunks=(1, ),
                                              dtype=dask_dim_array.dtype,
                                              store=mapper,
                                              overwrite=True))

                list_dask_array.append(dask_dim_array)
                list_new_dim_name.append(dim_name)

        #Add all other non-dimensional coordinates. Order does not matter
        for coord_name in list_xarray_data_variables[i].coords._names:
            if coord_name not in list_xarray_data_variables[i].dims:
                if coord_name not in disk_dataset.coords._names:
                    coord_dim_names = list_xarray_data_variables[i][
                        coord_name].dims

                    coord_chunksize_on_disk = []
                    for coord_dim_name in coord_dim_names:
                        if coord_dim_name in disk_dataset.dims:
                            coord_chunksize_on_disk.append(
                                disk_dataset.chunks[coord_dim_name][0])
                        else:
                            #Since the dimension does not exist on disk, use chunking in list_xarray_data_variables[i]
                            coord_chunksize_on_disk.append(
                                list_xarray_data_variables[i].to_dataset(
                                ).chunks[coord_dim_name][0])

                    #Add coord to be stored
                    dask_coord_array = list_xarray_data_variables[i][
                        coord_name].data.rechunk(
                            chunks=coord_chunksize_on_disk)
                    mapper = get_mapper(outfile + '/' + coord_name)
                    list_target_zarr.append(
                        dask.delayed(zarr.create)(
                            shape=dask_coord_array.shape,
                            compressor=compressor,
                            chunks=coord_chunksize_on_disk,
                            dtype=dask_coord_array.dtype,
                            store=mapper,
                            overwrite=True))

                    list_dask_array.append(dask_coord_array)

                    list_new_coord_dim_names.append(coord_dim_names)
                    list_new_coord_name.append(coord_name)

        #Rechunk the dask arrays to match the chunking on disk
        dask_array = list_xarray_data_variables[i].data.rechunk(
            chunksize_on_disk)
        list_dask_array.append(dask_array)

        #Create list of delayed objects
        mapper = get_mapper(outfile + '/' + list_xarray_data_variables[i].name)
        list_target_zarr.append(
            dask.delayed(zarr.create)(shape=dask_array.shape,
                                      compressor=compressor,
                                      chunks=chunksize_on_disk,
                                      dtype=dask_array.dtype,
                                      store=mapper,
                                      overwrite=True)
        )  #Cannot specify the zarr file attributes at creation, attrs={'_ARRAY_DIMENSIONS':array_dimensions[i]} (last checked May 2020)

    #Trigger compute of delayed zarr.create functions in list_target_zarr.
    da.store(list_dask_array,
             list_target_zarr,
             compute=True,
             flush=True,
             lock=False)

    # Open zarr to add array dimension labels so that xarray.open_zarr works. This is the magic that allows xarray to understand zarr.
    dataset_group = zarr.open_group(outfile, mode='a')
    #This should one day be done during zarr.create. See https://github.com/zarr-developers/zarr-python/issues/538.
    #Data variables labels
    for i in range(n_arrays):
        dataset_group[list_xarray_data_variables[i].name].attrs[
            '_ARRAY_DIMENSIONS'] = list_xarray_data_variables[i].dims

    #Dimension labels
    for new_dim_name in list_new_dim_name:
        dataset_group[new_dim_name].attrs['_ARRAY_DIMENSIONS'] = new_dim_name

    #Coord labels
    for new_coord_name, new_coord_dim_names in zip(list_new_coord_name,
                                                   list_new_coord_dim_names):
        dataset_group[new_coord_name].attrs[
            '_ARRAY_DIMENSIONS'] = new_coord_dim_names

    time_to_calc_and_store = time.time() - start
    print('Time to append and execute graph ', graph_name,
          time_to_calc_and_store)
    dataset_group.attrs[graph_name + '_time'] = time_to_calc_and_store

    #Consolidate metadata #Can be improved by only adding appended metadata
    zarr.consolidate_metadata(outfile)

    if bool(chunks_return):
        return xr.open_zarr(outfile,
                            chunks=chunks_return,
                            overwrite_encoded_chunks=True,
                            consolidated=True)
    else:
        return xr.open_zarr(outfile,
                            overwrite_encoded_chunks=True,
                            consolidated=True)
Example #27
    def close(self):
        if self._consolidate_on_close:
            import zarr

            zarr.consolidate_metadata(self.ds.store)
Example #28
 def close(self):
     if self._consolidate_on_close:
         import zarr
         zarr.consolidate_metadata(self.ds.store)
Example #29
    }
    with tempfile.TemporaryDirectory(
        prefix="bgen_to_zarr_", suffix=".zarr", dir=tempdir
    ) as tmpdir:
        rechunked = rechunker_api.rechunk(
            ds,
            max_mem=max_mem,
            target_chunks=target_chunks,
            target_store=output,
            target_options=target_options,
            temp_store=tmpdir,
            executor="dask",
        )
        rechunked.execute()

    zarr.consolidate_metadata(output)

    ds: Dataset = xr.open_zarr(output, concat_characters=False)  # type: ignore[no-untyped-call]
    if pack:
        ds = unpack_variables(ds)

    return ds


def bgen_to_zarr(
    input: PathType,
    output: Union[PathType, MutableMapping[str, bytes]],
    region: Optional[Mapping[Hashable, Any]] = None,
    chunk_length: int = 10_000,
    chunk_width: int = 1_000,
    temp_chunk_length: int = 100,
Example #30
def main(model, scenario, member):
    print('---------->', model, scenario, member)

    # get the output store
    key = f'{model}.{scenario}.{member}'

    target_uri = target.format(key=key)
    print(target_uri)
    store = get_store(target_uri)

    if skip_existing and '.zmetadata' in store:
        print(f'{key} in store, skipping...')
        return 'skipped'

    y_hist = get_obs().pipe(load_coords)

    if xy_region:
        y_hist = y_hist.isel(**xy_region)

    print('y_hist:\n', y_hist)

    x_hist = open_single(model, 'historical', member).pipe(process_cmip)

    if xy_region:
        x_hist = x_hist.isel(**xy_region)

    print('x_hist:\n', x_hist)

    x_scen = open_single(model, scenario, member).pipe(process_cmip)

    if xy_region:
        x_scen = x_scen.isel(**xy_region)
    if 'hist' in scenario:
        x_scen = x_scen.sel(time=hist_time)
    else:
        x_scen = x_scen.sel(time=future_time)
    print('x_scen:\n', x_scen)

    print('fitting models')

    models = {}
    y_scen = xr.Dataset()

    for v in bc_vars:
        print(v)

        models[v] = PointWiseDownscaler(
            TrendAwareQuantileMappingRegressor(QuantileMappingReressor(extrapolate='1to1'))
        )

        # train models with historical data
        models[v].fit(x_hist[v].sel(time=train_time), y_hist[v].sel(time=train_time))

        # predict this ensemble member
        y_scen[v] = models[v].predict(x_scen[v])

    y_scen = y_scen.chunk(chunks)
    print('y_scen:\n', y_scen)

    if dry_run:
        print('skipping write of ... dry_run=True')
        return 'skipped'
    else:
        store.clear()
        write = y_scen.to_zarr(store, compute=False, mode='w')
        write.compute(retries=3)
        zarr.consolidate_metadata(store)
        return 'done'
Example #31
def write_zarr(dataset,
               outfile,
               chunks_return={},
               chunks_on_disk={},
               compressor=None,
               graph_name='write_zarr'):
    """
    Write xarray dataset to zarr format on disk. When chunks_on_disk is not specified the chunking in the input dataset is used.
    When chunks_on_disk is specified that dataset is saved using that chunking. The dataset on disk is then opened and rechunked using chunks_return or the chunking of dataset.
    Parameters
    ----------
    dataset : xarray.core.dataset.Dataset
        Dataset to write to disk
    outfile : str
        outfile filename, generally ends in .zarr
    chunks_return : dict of int
        A dictionary with the chunk size that will be returned. For example {'time': 20, 'chan': 6}.
        If chunks_return is not specified the chunking of dataset will be used.
    chunks_on_disk : dict of int
        A dictionary with the chunk size that will be used when writing to disk. For example {'time': 20, 'chan': 6}.
        If chunks_on_disk is not specified the chunking of dataset will be used.
    compressor : numcodecs.blosc.Blosc
        The blosc compressor to use when saving the converted data to disk using zarr.
        If None, the zstd compression algorithm is used with compression level 2.
    graph_name : string
        The time taken to execute the graph and save the dataset is measured and saved as an attribute in the zarr file.
        The graph_name is the label for this timing information.
    Returns
    -------
    """
    import xarray as xr
    import zarr
    import time
    from numcodecs import Blosc
    from itertools import cycle
    from zarr.meta import json_dumps, json_loads
    from zarr.creation import normalize_store_arg, open_array

    #Check if disk chunking is specified
    if bool(chunks_on_disk):
        dataset_for_disk = dataset.chunk(chunks=chunks_on_disk)
    else:
        dataset_for_disk = dataset

    if compressor is None:
        compressor = Blosc(cname='zstd', clevel=2, shuffle=0)

    #Create compression encoding for each datavariable
    encoding = dict(
        zip(list(dataset_for_disk.data_vars),
            cycle([{
                'compressor': compressor
            }])))
    start = time.time()
    #Consolidated is set to False so that the timing information is included in the consolidated metadata.
    xr.Dataset.to_zarr(dataset_for_disk,
                       store=outfile,
                       mode='w',
                       encoding=encoding,
                       consolidated=False)
    time_to_calc_and_store = time.time() - start
    print('Time to store and execute graph ', graph_name,
          time_to_calc_and_store)

    #Add timing information
    dataset_group = zarr.open_group(outfile, mode='a')
    dataset_group.attrs[graph_name + '_time'] = time_to_calc_and_store

    #Consolidate metadata
    zarr.consolidate_metadata(outfile)

    if bool(chunks_return):
        return xr.open_zarr(outfile,
                            consolidated=True,
                            overwrite_encoded_chunks=True)
    else:
        #Get input dataset chunking
        for dim_key in dataset.chunks:
            chunks_return[dim_key] = dataset.chunks[dim_key][0]
        return xr.open_zarr(outfile,
                            chunks=chunks_return,
                            consolidated=True,
                            overwrite_encoded_chunks=True)
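A short usage sketch for the helper above (assumes write_zarr is in scope and dask is installed); the dataset and output path are illustrative:

import numpy as np
import xarray as xr

ds = xr.Dataset({"amp": (("time", "chan"), np.zeros((10, 4)))}).chunk({"time": 5})
# Writes with per-variable Blosc compression, records timing as a zarr attribute,
# consolidates metadata, and returns the reopened (consolidated) dataset.
ds_disk = write_zarr(ds, "amp_example.zarr", chunks_on_disk={"time": 5, "chan": 2})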