def chunk(cube, output, format=None, params=None, chunks=None):
    """
    (Re-)chunk xcube dataset.

    Changes the external chunking of all variables of CUBE according to CHUNKS
    and writes the result to OUTPUT.
    """
    # Parse the requested chunk sizes, if any, and validate them up front.
    chunk_sizes = None
    if chunks:
        chunk_sizes = parse_cli_kwargs(chunks, metavar="CHUNKS")
        for size in chunk_sizes.values():
            if not isinstance(size, int) or size <= 0:
                raise click.ClickException("Invalid value for CHUNKS, "
                                           f"chunk sizes must be positive integers: {chunks}")

    # Optional format-specific writer arguments.
    write_kwargs = parse_cli_kwargs(params, metavar="PARAMS") if params else dict()

    from xcube.util.dsio import guess_dataset_format
    format_name = format if format else guess_dataset_format(output)

    from xcube.api import open_dataset, chunk_dataset, write_dataset

    with open_dataset(input_path=cube) as ds:
        if chunk_sizes:
            # Every dimension named in CHUNKS must exist in the dataset.
            for dim_name in chunk_sizes:
                if dim_name not in ds.dims:
                    raise click.ClickException("Invalid value for CHUNKS, "
                                               f"{dim_name!r} is not the name of any dimension: {chunks}")

        chunked_dataset = chunk_dataset(ds, chunk_sizes=chunk_sizes, format_name=format_name)
        write_dataset(chunked_dataset, output_path=output, format_name=format_name, **write_kwargs)
def _prune(input_path: str = None, dry_run: bool = False, monitor=None):
    """
    Delete empty block files of a ZARR cube at *input_path* in place.

    Raises a ClickException unless the guessed input format is ZARR.
    With *dry_run* set, nothing is actually deleted; *monitor* receives
    progress messages.
    """
    from xcube.api import open_cube
    from xcube.util.dsio import guess_dataset_format

    # Pruning operates on the chunk files directly, so only ZARR is supported.
    if guess_dataset_format(input_path) != FORMAT_NAME_ZARR:
        raise click.ClickException("input must be a cube in ZARR format")

    monitor(f'Opening cube from {input_path!r}...')
    with open_cube(input_path) as cube:
        monitor('Identifying empty blocks...')
        empty_chunks = get_empty_dataset_chunks(cube)

        num_deleted = 0
        for var_name, chunk_indices in empty_chunks.items():
            monitor(
                f'Deleting {len(chunk_indices)} empty block file(s) for variable {var_name!r}...'
            )
            for chunk_index in chunk_indices:
                # _delete_block_file honours dry_run and reports via monitor.
                if _delete_block_file(input_path, var_name, chunk_index, dry_run, monitor):
                    num_deleted += 1

        monitor(f'Done, {num_deleted} block file(s) deleted.')
def read_dataset(input_path: str, format_name: str = None, is_cube: bool = False, **kwargs) -> xr.Dataset:
    """
    Read a dataset from *input_path*.

    If *format_name* is not provided it will be guessed from *input_path*.

    :param input_path: input path
    :param format_name: format, e.g. "zarr" or "netcdf4"
    :param is_cube: Whether a ValueError will be raised, if the dataset read
        from *input_path* is not a data cube.
    :param kwargs: format-specific keyword arguments
    :return: dataset object
    :raise ValueError: if the format is unknown or the dataset is not a cube
        while *is_cube* is set
    """
    if not format_name:
        format_name = guess_dataset_format(input_path)
    if format_name is None:
        raise ValueError("Unknown input format")

    reader = find_dataset_io(format_name, modes=["r"])
    if reader is None:
        raise ValueError(
            f"Unknown input format {format_name!r} for {input_path}")

    ds = reader.read(input_path, **kwargs)
    if is_cube:
        assert_cube(ds)
    return ds
def write_dataset(dataset: xr.Dataset, output_path: str, format_name: str = None, **kwargs) -> xr.Dataset:
    """
    Write *dataset* to *output_path*.

    If *format_name* is not provided it will be guessed from *output_path*.

    :param dataset: Dataset to be written.
    :param output_path: output path
    :param format_name: format, e.g. "zarr" or "netcdf4"
    :param kwargs: format-specific keyword arguments
    :return: the input dataset
    :raise ValueError: if the output format is unknown
    """
    if not format_name:
        format_name = guess_dataset_format(output_path)
    if format_name is None:
        raise ValueError("Unknown output format")

    writer = find_dataset_io(format_name, modes=["w"])
    if writer is None:
        raise ValueError(
            f"Unknown output format {format_name!r} for {output_path}")

    writer.write(dataset, output_path, **kwargs)
    return dataset
def vars2dim(cube, var_name, dim_name, output=None, format=None):
    """
    Convert cube variables into new dimension.

    Moves all variables of <cube> into a single new variable <var-name>
    with a new dimension <dim-name> and writes the results to <output>.
    """
    import os

    from xcube.api import open_dataset, vars_to_dim, write_dataset
    from xcube.util.dsio import guess_dataset_format

    if not output:
        # Default output: "<cube-basename>-vars2dim<ext>" next to the input.
        dirname = os.path.dirname(cube)
        basename, ext = os.path.splitext(os.path.basename(cube))
        output = os.path.join(dirname, basename + '-vars2dim' + ext)

    format_name = format if format else guess_dataset_format(output)

    with open_dataset(input_path=cube) as ds:
        converted = vars_to_dim(ds, dim_name=dim_name, var_name=var_name)
        write_dataset(converted, output_path=output, format_name=format_name)
def _resample_in_time(input_path: str = None,
                      variables: Sequence[str] = None,
                      metadata: Dict[str, Any] = None,
                      output_path: str = DEFAULT_OUTPUT_PATH,
                      output_format: str = None,
                      methods: Sequence[str] = (DEFAULT_RESAMPLING_METHOD,),
                      frequency: str = DEFAULT_RESAMPLING_FREQUENCY,
                      offset: str = None,
                      base: str = DEFAULT_RESAMPLING_BASE,
                      interp_kind: str = DEFAULT_INTERPOLATION_KIND,
                      tolerance: str = None,
                      dry_run: bool = False,
                      monitor=None):
    """
    Resample the cube at *input_path* in time and write the result to
    *output_path*.

    The output format is guessed from *output_path* unless *output_format*
    is given. With *dry_run* set, nothing is written; *monitor* receives
    progress messages.
    """
    from xcube.api import open_cube
    from xcube.api.readwrite import write_cube
    from xcube.api.resample import resample_in_time
    from xcube.util.dsio import guess_dataset_format

    output_format = output_format or guess_dataset_format(output_path)

    monitor(f'Opening cube from {input_path!r}...')
    with open_cube(input_path) as ds:
        monitor('Resampling...')
        agg_ds = resample_in_time(ds,
                                  frequency=frequency,
                                  method=methods,
                                  offset=offset,
                                  base=base,
                                  interp_kind=interp_kind,
                                  tolerance=tolerance,
                                  var_names=variables,
                                  metadata=metadata)

        monitor(f'Writing resampled cube to {output_path!r}...')
        if not dry_run:
            write_cube(agg_ds, output_path, output_format, cube_asserted=True)

        monitor('Done.')
def apply(output: str, script: str, input: str, params: str, vars: str, dask: str, format: str, dtype: str):
    """
    Apply a function to data cubes. The function is used to transform N chunks of equal shape to a new chunk of
    same shape. N is the number of variables from all data cubes.

    Uses the Python program <script> to transform data cubes given by <inputs> into a new data cube given
    by <output>.

    The <script> must define a function ``apply(*variables, **params)`` where variables are numpy arrays
    (chunks) in the order given by <vars> or given by the variables returned by an optional ``init()`` function
    that my be defined in <script>. If neither <vars> nor an ``init()`` function is defined, all variables are
    passed in arbitrary order.

    The optional ``init(*cubes, **params)`` function can be used to validate the data cubes, extract the desired
    variables in desired order and to provide some extra processing parameters passed to the ``apply()``
    function. The ``init()`` argument *cubes* are the ``xarray.Dataset`` objects according to <input> and
    *params* are according to <params>. The return value of ``init()`` is a tuple (*variables*, *new_params*)
    where *variables* is a list of ``xarray.DataArray`` objects and *new_params* are newly computed parameters
    passed to ``apply()``.
    """
    input_paths = input
    output_path = output

    apply_function_name = "apply"
    init_function_name = "init"

    with open(script, "r") as fp:
        code = fp.read()

    locals_dict = dict()
    # NOTE(security): executes arbitrary user-provided Python code; <script>
    # must come from a trusted source.
    exec(code, globals(), locals_dict)

    var_names = list(map(lambda s: s.strip(), vars.split(","))) if vars else None

    # The script may optionally define init(); if present it must be callable.
    init_function = locals_dict.get(init_function_name)
    if init_function is not None and not callable(init_function):
        raise click.ClickException(f"{init_function_name!r} in {script} is not a callable")

    # The script must define a callable apply().
    apply_function = locals_dict.get(apply_function_name)
    if apply_function is None:
        raise click.ClickException(f"missing function {apply_function_name!r} in {script}")
    if not callable(apply_function):
        # Report the function *name*, consistent with the init() check above,
        # rather than the object's repr.
        raise click.ClickException(f"{apply_function_name!r} in {script} is not a callable")

    from xcube.api import read_cube
    from xcube.util.cliutil import parse_cli_kwargs
    from xcube.util.dsio import guess_dataset_format, find_dataset_io

    kwargs = parse_cli_kwargs(params, "<params>")

    # Open all input cubes; the first one serves as the compatibility reference.
    input_cube_0 = None
    input_cubes = []
    for input_path in input_paths:
        input_cube = read_cube(input_path=input_path)
        if input_cube_0 is None:
            input_cube_0 = input_cube
        else:
            # TODO (forman): make sure input_cube's and input_cube_0's coords and chunking are compatible
            pass
        input_cubes.append(input_cube)

    # Restrict every cube to the requested variables, if any were named.
    if var_names:
        input_cubes = [input_cube.drop(labels=set(input_cube.data_vars).difference(set(var_names)))
                       for input_cube in input_cubes]

    import xarray as xr
    if init_function:
        # init() may reorder/select variables and rewrite the parameters.
        variables, params = init_function(*input_cubes, **kwargs)
    else:
        variables, params = xr.merge(input_cubes).data_vars.values(), kwargs

    output_variable = xr.apply_ufunc(apply_function,
                                     *variables,
                                     dask=dask,
                                     output_dtypes=[dtype] if dask == "parallelized" else None)

    format = format or guess_dataset_format(output_path)
    dataset_io = find_dataset_io(format, {"w"})
    if dataset_io is None:
        # Fail with a clear message instead of an AttributeError on None below
        # (consistent with read_dataset()/write_dataset() error handling).
        raise click.ClickException(f"Unknown output format {format!r} for {output_path}")
    dataset_io.write(xr.Dataset(dict(output=output_variable)), output_path)