Example 1
def _prune(input_path: str = None, dry_run: bool = False, monitor=None):
    from xcube.core.chunk import get_empty_dataset_chunks
    from xcube.core.dsio import guess_dataset_format
    from xcube.core.dsio import open_cube

    input_format = guess_dataset_format(input_path)
    if input_format != FORMAT_NAME_ZARR:
        raise click.ClickException("input must be a cube in ZARR format")

    monitor(f'Opening cube from {input_path!r}...')
    with open_cube(input_path) as cube:
        monitor('Identifying empty blocks...')
        empty_chunks = get_empty_dataset_chunks(cube)

    num_deleted = 0
    for var_name, chunk_indices in empty_chunks.items():
        monitor(
            f'Deleting {len(chunk_indices)} empty block file(s) for variable {var_name!r}...'
        )
        for chunk_index in chunk_indices:
            ok = _delete_block_file(input_path, var_name, chunk_index, dry_run,
                                    monitor)
            if ok:
                num_deleted += 1

    monitor(f'Done, {num_deleted} block file(s) deleted.')
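
The helper `_delete_block_file` is used here and in Example 7 but is not shown. Below is a minimal sketch of what it could look like, assuming the Zarr v2 on-disk layout in which each chunk of a variable is stored as a file named by its indices joined with dots (e.g. `cube.zarr/precip/0.0.0`); the actual xcube implementation may differ:

import os
from typing import Tuple


def _delete_block_file(input_path: str, var_name: str,
                       chunk_index: Tuple[int, ...],
                       dry_run: bool, monitor) -> bool:
    # Zarr v2 stores each chunk as "<i>.<j>..." inside the variable's directory.
    block_path = os.path.join(input_path, str(var_name),
                              '.'.join(map(str, chunk_index)))
    if not os.path.isfile(block_path):
        return False
    if not dry_run:
        try:
            os.remove(block_path)
        except OSError as e:
            monitor(f'Failed to delete block file {block_path!r}: {e}')
            return False
    return True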
Example 2
def vars2dim(cube, variable, dim_name, output=None, format=None):
    """
    Convert cube variables into new dimension.
    Moves all variables of CUBE into a single new variable <var-name>
    with a new dimension DIM-NAME and writes the results to OUTPUT.
    """

    from xcube.core.dsio import guess_dataset_format
    from xcube.core.dsio import open_dataset, write_dataset
    from xcube.core.vars2dim import vars_to_dim
    import os

    if not output:
        dirname = os.path.dirname(cube)
        basename = os.path.basename(cube)
        basename, ext = os.path.splitext(basename)
        output = os.path.join(dirname, basename + '-vars2dim' + ext)

    format_name = format if format else guess_dataset_format(output)

    with open_dataset(input_path=cube) as ds:
        converted_dataset = vars_to_dim(ds,
                                        dim_name=dim_name,
                                        var_name=variable)
        write_dataset(converted_dataset,
                      output_path=output,
                      format_name=format_name)
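
A usage sketch; the cube path and the names below are placeholders:

# Moves all variables of 'cube.zarr' into a single variable 'data' with a
# new dimension 'var'; output defaults to 'cube-vars2dim.zarr'.
vars2dim('cube.zarr', variable='data', dim_name='var')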
Example 3
def _rectify(input_path: str,
             xy_names: Optional[Tuple[str, str]],
             var_names: Optional[Sequence[str]],
             output_path: str,
             output_format: Optional[str],
             output_size: Optional[Tuple[int, int]],
             output_tile_size: Optional[Tuple[int, int]],
             output_point: Optional[Tuple[float, float]],
             output_res: Optional[float],
             delta: float,
             dry_run: bool,
             monitor):
    from xcube.core.dsio import guess_dataset_format
    from xcube.core.dsio import open_dataset
    from xcube.core.dsio import write_dataset
    from xcube.core.rectify import rectify_dataset
    from xcube.core.rectify import ImageGeom
    from xcube.core.sentinel3 import is_sentinel3_product
    from xcube.core.sentinel3 import open_sentinel3_product

    if not output_format:
        output_format = guess_dataset_format(output_path)

    output_geom = None
    if output_size is not None and output_point is not None and output_res is not None:
        output_geom = ImageGeom(size=output_size,
                                x_min=output_point[0],
                                y_min=output_point[1],
                                xy_res=output_res)
    elif output_size is not None or output_point is not None or output_res is not None:
        raise click.ClickException(
            'SIZE, POINT, and RES must all be given or none of them.')

    monitor(f'Opening dataset from {input_path!r}...')

    if is_sentinel3_product(input_path):
        src_ds = open_sentinel3_product(input_path)
    else:
        src_ds = open_dataset(input_path)

    monitor('Rectifying...')
    reproj_ds = rectify_dataset(src_ds,
                                xy_names=xy_names,
                                var_names=var_names,
                                output_geom=output_geom,
                                tile_size=output_tile_size,
                                uv_delta=delta)

    if reproj_ds is None:
        monitor(
            f'Dataset {input_path} does not seem to have an intersection with the bounding box'
        )
        return

    monitor(f'Writing rectified dataset to {output_path!r}...')
    if not dry_run:
        write_dataset(reproj_ds, output_path, output_format)

    monitor('Done.')
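
A hypothetical call sketch; all paths and grid values are made up, and `print` stands in for a real progress monitor. Note that SIZE, POINT, and RES must be given together:

_rectify('olci-scene.nc', xy_names=('lon', 'lat'), var_names=None,
         output_path='rectified.zarr', output_format=None,
         output_size=(2000, 1000), output_tile_size=None,
         output_point=(0.0, 40.0), output_res=0.01,
         delta=1e-3, dry_run=False, monitor=print)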
Example 4
def guess_ml_dataset_format(path: str) -> str:
    """
    Guess a multi-level dataset format for a file system path or URL given by *path*.

    :param path: A file system path or URL.
    :return: The name of a dataset format guessed from *path*.
    """
    if path.endswith('.levels'):
        return FORMAT_NAME_LEVELS
    if path.endswith('.py'):
        return FORMAT_NAME_SCRIPT
    return guess_dataset_format(path)
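
The three possible outcomes, for illustration:

guess_ml_dataset_format('cube.levels')  # FORMAT_NAME_LEVELS
guess_ml_dataset_format('model.py')     # FORMAT_NAME_SCRIPT
guess_ml_dataset_format('cube.zarr')    # falls through to guess_dataset_format()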
Example 5
def _resample_in_time(input_path: str = None,
                      variables: Sequence[str] = None,
                      metadata: Dict[str, Any] = None,
                      output_path: str = DEFAULT_OUTPUT_PATH,
                      output_format: str = None,
                      methods: Sequence[str] = (DEFAULT_RESAMPLING_METHOD,),
                      frequency: str = DEFAULT_RESAMPLING_FREQUENCY,
                      offset: str = None,
                      base: int = DEFAULT_RESAMPLING_BASE,
                      interp_kind: str = DEFAULT_INTERPOLATION_KIND,
                      tolerance: str = None,
                      dry_run: bool = False,
                      monitor=None):
    from xcube.core.dsio import guess_dataset_format
    from xcube.core.dsio import open_cube
    from xcube.core.dsio import write_cube
    from xcube.core.resample import resample_in_time
    from xcube.core.update import update_dataset_chunk_encoding

    if not output_format:
        output_format = guess_dataset_format(output_path)

    monitor(f'Opening cube from {input_path!r}...')
    with open_cube(input_path) as ds:

        monitor('Resampling...')
        agg_ds = resample_in_time(ds,
                                  frequency=frequency,
                                  method=methods,
                                  offset=offset,
                                  base=base,
                                  interp_kind=interp_kind,
                                  tolerance=tolerance,
                                  time_chunk_size=1,
                                  var_names=variables,
                                  metadata=metadata)

        agg_ds = update_dataset_chunk_encoding(agg_ds,
                                               chunk_sizes={},
                                               format_name=output_format,
                                               in_place=True)

        monitor(f'Writing resampled cube to {output_path!r}...')
        if not dry_run:
            write_cube(agg_ds, output_path, output_format, cube_asserted=True)

        monitor('Done.')
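
A usage sketch computing weekly means; the paths are placeholders, and the frequency string follows pandas offset aliases:

_resample_in_time(input_path='cube.zarr',
                  output_path='cube-weekly.zarr',
                  methods=('mean',),
                  frequency='1W',
                  monitor=print)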
Example 6
def chunk(cube, output, format=None, params=None, chunks=None):
    """
    (Re-)chunk xcube dataset.
    Changes the external chunking of all variables of CUBE according to CHUNKS and writes
    the result to OUTPUT.

    Note: There is a possibly more efficient way to (re-)chunk datasets through the
    dedicated tool "rechunker", see https://rechunker.readthedocs.io.
    """
    chunk_sizes = None
    if chunks:
        chunk_sizes = parse_cli_kwargs(chunks, metavar="CHUNKS")
        for k, v in chunk_sizes.items():
            if not isinstance(v, int) or v <= 0:
                raise click.ClickException(
                    "Invalid value for CHUNKS, "
                    f"chunk sizes must be positive integers: {chunks}")

    write_kwargs = dict()
    if params:
        write_kwargs = parse_cli_kwargs(params, metavar="PARAMS")

    from xcube.core.chunk import chunk_dataset
    from xcube.core.dsio import guess_dataset_format
    from xcube.core.dsio import open_dataset, write_dataset

    format_name = format if format else guess_dataset_format(output)

    with open_dataset(input_path=cube) as ds:
        if chunk_sizes:
            for k in chunk_sizes:
                if k not in ds.dims:
                    raise click.ClickException(
                        "Invalid value for CHUNKS, "
                        f"{k!r} is not the name of any dimension: {chunks}")

        chunked_dataset = chunk_dataset(ds,
                                        chunk_sizes=chunk_sizes,
                                        format_name=format_name)
        write_dataset(chunked_dataset,
                      output_path=output,
                      format_name=format_name,
                      **write_kwargs)
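
A usage sketch; `parse_cli_kwargs` accepts a comma-separated list of `name=value` pairs, and every name must be a dimension of the dataset:

# Re-chunks so that each chunk holds one time step and a 256 x 256 tile.
chunk('input.zarr', 'output.zarr', chunks='time=1,lat=256,lon=256')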
Example 7
def _prune(input_path: str, dry_run: bool, monitor: Monitor):
    from xcube.core.chunk import get_empty_dataset_chunks
    from xcube.core.dsio import guess_dataset_format
    from xcube.core.dsio import open_dataset

    input_format = guess_dataset_format(input_path)
    if input_format != FORMAT_NAME_ZARR:
        raise click.ClickException("input must be a dataset in Zarr format")

    num_deleted_total = 0

    monitor(f'Opening dataset from {input_path!r}...', 1)
    with open_dataset(input_path) as dataset:
        monitor('Identifying empty chunks...', 1)
        for var_name, chunk_indices in get_empty_dataset_chunks(dataset):
            num_empty_chunks = 0
            num_deleted = 0
            for chunk_index in chunk_indices:
                num_empty_chunks += 1
                if num_empty_chunks == 1:
                    monitor(
                        f'Found empty chunks in variable {var_name!r}, '
                        f'deleting block files...', 2)

                ok = _delete_block_file(input_path, var_name, chunk_index,
                                        dry_run, monitor)
                if ok:
                    num_deleted += 1
            if num_deleted > 0:
                monitor(
                    f'Deleted {num_deleted} block file(s) '
                    f'for variable {var_name!r}.', 2)
            elif num_empty_chunks > 0:
                monitor(
                    f'No block files for variable {var_name!r} '
                    f'could be deleted.', 2)
            num_deleted_total += num_deleted

    monitor(f'Done, {num_deleted_total} block file(s) deleted in total.', 1)
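
Unlike Example 1, this version calls `monitor` with both a message and an integer nesting level. A minimal monitor compatible with both calling conventions (an assumption; xcube's actual `Monitor` type may differ):

def monitor(message: str, level: int = 0):
    # Indents each message according to its nesting level.
    print('  ' * level + message)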
Example 8
def gen_cube(input_paths: Sequence[str] = None,
             input_processor_name: str = None,
             input_processor_params: Dict = None,
             input_reader_name: str = None,
             input_reader_params: Dict[str, Any] = None,
             output_region: Tuple[float, float, float, float] = None,
             output_size: Tuple[int, int] = DEFAULT_OUTPUT_SIZE,
             output_resampling: str = DEFAULT_OUTPUT_RESAMPLING,
             output_path: str = DEFAULT_OUTPUT_PATH,
             output_writer_name: str = None,
             output_writer_params: Dict[str, Any] = None,
             output_metadata: NameAnyDict = None,
             output_variables: NameDictPairList = None,
             processed_variables: NameDictPairList = None,
             profile_mode: bool = False,
             no_sort_mode: bool = False,
             append_mode: bool = None,
             dry_run: bool = False,
             monitor: Callable[..., None] = None) -> bool:
    """
    Generate an xcube dataset from one or more input files.

    :param no_sort_mode: If True, input paths are processed in the given order;
        otherwise they are sorted before processing.
    :param input_paths: The input paths.
    :param input_processor_name: Name of a registered input processor
        (xcube.core.gen.inputprocessor.InputProcessor) to be used to transform the inputs.
    :param input_processor_params: Parameters to be passed to the input processor.
    :param input_reader_name: Name of a registered input reader (xcube.core.util.dsio.DatasetIO).
    :param input_reader_params: Parameters passed to the input reader.
    :param output_region: Output region as tuple of floats: (lon_min, lat_min, lon_max, lat_max).
    :param output_size: The spatial dimensions of the output as tuple of ints: (width, height).
    :param output_resampling: The resampling method for the output.
    :param output_path: The output directory.
    :param output_writer_name: Name of an output writer
        (xcube.core.util.dsio.DatasetIO) used to write the cube.
    :param output_writer_params: Parameters passed to the output writer.
    :param output_metadata: Extra metadata passed to output cube.
    :param output_variables: Output variables.
    :param processed_variables: Processed variables computed on-the-fly.
    :param profile_mode: Whether profiling should be enabled.
    :param append_mode: Deprecated. The function will always either insert, replace, or append new time slices.
    :param dry_run: Doesn't write any data. For testing.
    :param monitor: A progress monitor.
    :return: True for success.
    """

    if append_mode is not None:
        warnings.warn(
            'append_mode in gen_cube() is deprecated, '
            'time slices will now always be inserted, replaced, or appended.')

    if input_processor_name is None:
        input_processor_name = 'default'
    elif input_processor_name == '':
        raise ValueError('input_processor_name must not be empty')

    input_processor_class = find_input_processor_class(input_processor_name)
    if not input_processor_class:
        raise ValueError(
            f'Unknown input_processor_name {input_processor_name!r}')

    if not issubclass(input_processor_class, InputProcessor):
        raise ValueError(
            f'Invalid input_processor_name {input_processor_name!r}: '
            f'must name a sub-class of {InputProcessor.__qualname__}')

    try:
        input_processor = input_processor_class(
            **(input_processor_params or {}))
    except (ValueError, TypeError) as e:
        raise ValueError(
            f'Invalid input_processor_name or input_processor_params: {e}'
        ) from e

    input_reader = find_dataset_io(input_reader_name
                                   or input_processor.input_reader)
    if not input_reader:
        raise ValueError(f'Unknown input_reader_name {input_reader_name!r}')

    if not output_path:
        raise ValueError('Missing output_path')

    output_writer_name = output_writer_name or guess_dataset_format(
        output_path)
    if not output_writer_name:
        raise ValueError(
            f'Failed to guess output_writer_name from path {output_path}')
    output_writer = find_dataset_io(output_writer_name, modes={'w', 'a'})
    if not output_writer:
        raise ValueError(f'Unknown output_writer_name {output_writer_name!r}')

    if monitor is None:
        # noinspection PyUnusedLocal
        def monitor(*args):
            pass

    input_paths = [
        input_file for f in input_paths
        for input_file in glob.glob(f, recursive=True)
    ]

    if not no_sort_mode and len(input_paths) > 1:
        input_paths = _get_sorted_input_paths(input_processor, input_paths)

    if not dry_run:
        output_dir = os.path.abspath(os.path.dirname(output_path))
        os.makedirs(output_dir, exist_ok=True)

    effective_input_reader_params = dict(input_processor.input_reader_params
                                         or {})
    effective_input_reader_params.update(input_reader_params or {})

    effective_output_writer_params = output_writer_params or {}

    status = False

    ds_count = len(input_paths)
    ds_count_ok = 0
    ds_index = 0
    for input_file in input_paths:
        monitor(
            f'processing dataset {ds_index + 1} of {ds_count}: {input_file!r}...'
        )
        # noinspection PyTypeChecker
        status = _process_input(input_processor, input_reader,
                                effective_input_reader_params, output_writer,
                                effective_output_writer_params, input_file,
                                output_size, output_region, output_resampling,
                                output_path, output_metadata, output_variables,
                                processed_variables, profile_mode, dry_run,
                                monitor)
        ds_index += 1
        if status:
            ds_count_ok += 1

    monitor(f'{ds_count_ok} of {ds_count} datasets processed successfully, '
            f'{ds_count - ds_count_ok} were dropped due to errors')

    return status
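
A hedged call sketch; the glob pattern, region, and size are placeholders (input paths are expanded with glob.glob(..., recursive=True)):

ok = gen_cube(input_paths=['inputs/**/*.nc'],
              output_region=(0.0, 40.0, 10.0, 50.0),  # lon_min, lat_min, lon_max, lat_max
              output_size=(1000, 1000),               # width, height
              output_path='out/cube.zarr',
              monitor=print)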
Example 9
def guess_cube_format(path: str) -> str:
    if path.endswith('.levels'):
        return FORMAT_NAME_LEVELS
    return guess_dataset_format(path)
Example 10
def _rectify(input_path: str,
             xy_names: Optional[Tuple[str, str]],
             var_names: Optional[Sequence[str]],
             output_path: str,
             output_format: Optional[str],
             output_size: Optional[Tuple[int, int]],
             output_tile_size: Optional[Tuple[int, int]],
             output_point: Optional[Tuple[float, float]],
             output_res: Optional[float],
             output_crs: Optional[str],
             delta: float,
             dry_run: bool,
             monitor):
    import pyproj.crs

    from xcube.core.dsio import guess_dataset_format
    from xcube.core.dsio import open_dataset
    from xcube.core.dsio import write_dataset
    from xcube.core.gridmapping import GridMapping
    from xcube.core.resampling import rectify_dataset
    from xcube.core.sentinel3 import is_sentinel3_product
    from xcube.core.sentinel3 import open_sentinel3_product

    if not output_format:
        output_format = guess_dataset_format(output_path)

    output_gm = None
    output_gm_given = (output_size is not None,
                       output_point is not None,
                       output_res is not None,
                       output_crs is not None)
    if all(output_gm_given):
        output_gm = GridMapping.regular(size=output_size,
                                        xy_min=output_point,
                                        xy_res=output_res,
                                        crs=pyproj.crs.CRS.from_user_input(output_crs))
    elif any(output_gm_given):
        raise click.ClickException('SIZE, POINT, RES, and CRS must all be given or none of them.')

    monitor(f'Opening dataset from {input_path!r}...')

    if is_sentinel3_product(input_path):
        src_ds = open_sentinel3_product(input_path)
    else:
        src_ds = open_dataset(input_path)

    monitor('Rectifying...')
    rectified_ds = rectify_dataset(src_ds,
                                   xy_var_names=xy_names,
                                   var_names=var_names,
                                   target_gm=output_gm,
                                   tile_size=output_tile_size,
                                   uv_delta=delta)

    if rectified_ds is None:
        monitor(f'Dataset {input_path} does not seem to have an intersection with the bounding box')
        return

    monitor(f'Writing rectified dataset to {output_path!r}...')
    if not dry_run:
        write_dataset(rectified_ds, output_path, output_format)

    monitor('Done.')
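
For reference, the regular grid mapping built above can also be constructed stand-alone; a sketch assuming a 0.01° WGS-84 grid (the values are placeholders):

import pyproj.crs
from xcube.core.gridmapping import GridMapping

output_gm = GridMapping.regular(size=(2000, 1000),
                                xy_min=(0.0, 40.0),
                                xy_res=0.01,
                                crs=pyproj.crs.CRS.from_user_input('EPSG:4326'))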
Example 11
def compute(script: str,
            cube: List[str],
            input_var_names: str,
            input_params: str,
            output_path: str,
            output_format: str,
            output_var_name: str,
            output_var_dtype: str):
    """
    Compute a cube from one or more other cubes.

    The command computes a cube variable from other cube variables in CUBEs
    using a user-provided Python function in SCRIPT.

    The SCRIPT must define a function named "compute":

    \b
        def compute(*input_vars: numpy.ndarray,
                    input_params: Mapping[str, Any] = None,
                    dim_coords: Mapping[str, np.ndarray] = None,
                    dim_ranges: Mapping[str, Tuple[int, int]] = None) \\
                    -> numpy.ndarray:
            # Compute new numpy array from inputs
            # output_array = ...
            return output_array

    where input_vars are numpy arrays (chunks) in the order given by VARIABLES or given by the variable names returned
    by an optional "initialize" function that my be defined in SCRIPT too, see below. input_params is a mapping of
    parameter names to values according to PARAMS or the ones returned by the aforesaid "initialize" function.
    dim_coords is a mapping from dimension name to coordinate labels for the current chunk to be computed.
    dim_ranges is a mapping from dimension name to index ranges into coordinate arrays of the cube.

    The SCRIPT may define a function named "initialize":

    \b
        def initialize(input_cubes: Sequence[xr.Dataset],
                       input_var_names: Sequence[str],
                       input_params: Mapping[str, Any]) \\
                       -> Tuple[Sequence[str], Mapping[str, Any]]:
            # Compute new variable names and/or new parameters
            # new_input_var_names = ...
            # new_input_params = ...
            return new_input_var_names, new_input_params

    where input_cubes are the respective CUBEs, input_var_names the respective VARIABLES, and input_params
    are the respective PARAMS. The "initialize" function can be used to validate the data cubes, extract
    the desired variables in desired order and to provide some extra processing parameters passed to the
    "compute" function.

    Note that if no input variable names are specified, no variables are passed to the "compute" function.

    The SCRIPT may also define a function named "finalize":

    \b
        def finalize(output_cube: xr.Dataset,
                     input_params: Mapping[str, Any]) \\
                     -> Optional[xr.Dataset]:
            # Optionally modify output_cube and return it or return None
            return output_cube

    If defined, the "finalize" function will be called before the command writes the
    new cube and then exists. The functions may perform a cleaning up or perform side effects such
    as write the cube to some sink. If the functions returns None, the CLI will *not* write
    any cube data.

    """
    from xcube.cli.common import parse_cli_kwargs
    from xcube.core.compute import compute_cube
    from xcube.core.dsio import open_cube
    from xcube.core.dsio import guess_dataset_format, find_dataset_io

    input_paths = cube

    compute_function_name = "compute"
    initialize_function_name = "initialize"
    finalize_function_name = "finalize"

    with open(script, "r") as fp:
        code = fp.read()

    locals_dict = dict()
    exec(code, globals(), locals_dict)

    input_var_names = list(map(lambda s: s.strip(), input_var_names.split(","))) if input_var_names else None

    compute_function = _get_function(locals_dict, compute_function_name, script, force=True)
    initialize_function = _get_function(locals_dict, initialize_function_name, script, force=False)
    finalize_function = _get_function(locals_dict, finalize_function_name, script, force=False)

    input_params = parse_cli_kwargs(input_params, "PARAMS")

    input_cubes = []
    for input_path in input_paths:
        input_cubes.append(open_cube(input_path=input_path))

    if initialize_function:
        input_var_names, input_params = initialize_function(input_cubes, input_var_names, input_params)

    output_cube = compute_cube(compute_function,
                               *input_cubes,
                               input_var_names=input_var_names,
                               input_params=input_params,
                               output_var_name=output_var_name,
                               output_var_dtype=output_var_dtype)

    if finalize_function:
        output_cube = finalize_function(output_cube, input_params)

    if output_cube is not None:
        output_format = output_format or guess_dataset_format(output_path)
        dataset_io = find_dataset_io(output_format, {"w"})
        dataset_io.write(output_cube, output_path)
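
To make the SCRIPT protocol concrete, here is a minimal hypothetical script; the variable names and the normalized-difference formula are made up:

# script.py
import numpy as np


def initialize(input_cubes, input_var_names, input_params):
    # Falls back to two hypothetical variable names if none were given.
    return input_var_names or ['band_1', 'band_2'], input_params


def compute(band_1: np.ndarray, band_2: np.ndarray,
            input_params=None, dim_coords=None, dim_ranges=None) -> np.ndarray:
    # Computes a normalized difference of the two input chunks.
    return (band_1 - band_2) / (band_1 + band_2)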