Example 1
 def test_mask_dataset_for_chunked_input(self):
     """Masking a chunked cube keeps its basic properties and chunking."""
     chunked = chunk_dataset(self.cube,
                             chunk_sizes=dict(time=1, lat=90, lon=90))
     masked = mask_dataset_by_geometry(chunked, self.triangle)
     self._assert_clipped_dataset_has_basic_props(masked)
     # Five single-step time chunks; lat/lon each collapse to one chunk.
     expected_chunks = ((1, 1, 1, 1, 1), (4,), (7,))
     self.assertEqual(expected_chunks, masked.temp.chunks)
     self.assertEqual(expected_chunks, masked.precip.chunks)
Example 2
 def setUp(self) -> None:
     """Create a global 360x180 SST test cube, chunked 3/90/90."""
     variables = dict(analysed_sst=275.3, analysis_error=2.1)
     dataset = new_cube(width=360,
                        height=180,
                        time_periods=6,
                        variables=variables)
     self.cube = chunk_dataset(dataset, dict(time=3, lat=90, lon=90))
Example 3
def append_time_slice(store: Union[str, MutableMapping],
                      time_slice: xr.Dataset,
                      chunk_sizes: Dict[str, int] = None):
    """
    Append time slice to existing zarr dataset.

    :param store: A zarr store.
    :param time_slice: Time slice to insert
    :param chunk_sizes: desired chunk sizes
    """
    if chunk_sizes:
        time_slice = chunk_dataset(time_slice, chunk_sizes, format_name='zarr')

    # xarray's to_zarr(store, mode='a', append_dim='time') replaces the store's
    # global attributes with those of time_slice (xarray bug?), and the slice's
    # attributes are usually empty here. Carry the store's attributes over to
    # a copy of the slice before writing.
    existing = zarr.open_group(store, mode='r')
    time_slice = time_slice.copy()
    time_slice.attrs.update(existing.attrs)
    # Drop 'coordinates' if present, otherwise the to_zarr() call below raises
    # "ValueError: cannot serialize coordinates because the global attribute
    # 'coordinates' already exists".
    time_slice.attrs.pop('coordinates', None)

    time_slice.to_zarr(store, mode='a', append_dim='time')
    unchunk_dataset(store, coords_only=True)
Example 4
def update_time_slice(store: Union[str, MutableMapping],
                      insert_index: int,
                      time_slice: xr.Dataset,
                      mode: str,
                      chunk_sizes: Dict[str, int] = None):
    """
    Update existing zarr dataset by new time slice.

    :param store: A zarr store.
    :param insert_index: Time index
    :param time_slice: Time slice to insert
    :param mode: Update mode, 'insert' or 'replace'
    :param chunk_sizes: desired chunk sizes
    :raise ValueError: if *mode* is invalid or a time-dependent variable
        does not have 'time' as its first dimension
    """

    if mode not in ('insert', 'replace'):
        raise ValueError(f'illegal mode value: {mode!r}')

    insert_mode = mode == 'insert'

    # Collect names and encodings of all time-dependent variables in the cube.
    time_var_names = []
    encoding = {}
    with xr.open_zarr(store) as cube:
        for var_name in cube.variables:
            var = cube[var_name]
            if var.ndim >= 1 and 'time' in var.dims:
                if var.dims[0] != 'time':
                    raise ValueError(
                        f"dimension 'time' of variable {var_name!r} must be first dimension"
                    )
                time_var_names.append(var_name)
                encoding[var_name] = cube[var_name].encoding

    if chunk_sizes:
        time_slice = chunk_dataset(time_slice, chunk_sizes, format_name='zarr')

    # Write the slice to a temporary zarr store so its raw arrays can be
    # copied into the target. Using the context manager guarantees the
    # directory is removed even if writing or copying fails (previously the
    # TemporaryDirectory was never cleaned up explicitly and removal relied
    # on garbage-collection finalization).
    with tempfile.TemporaryDirectory(prefix='xcube-time-slice-',
                                     suffix='.zarr') as temp_path:
        time_slice.to_zarr(temp_path, encoding=encoding)
        slice_root_group = zarr.open(temp_path, mode='r')
        slice_arrays = dict(slice_root_group.arrays())

        cube_root_group = zarr.open(store, mode='r+')
        for var_name, var_array in cube_root_group.arrays():
            if var_name in time_var_names:
                slice_array = slice_arrays[var_name]
                if insert_mode:
                    # Grow the array by one empty time step ...
                    empty = zarr.creation.empty(slice_array.shape,
                                                dtype=var_array.dtype)
                    var_array.append(empty, axis=0)
                    # ... then shift all steps at insert_index and beyond
                    # one position towards the end to make room.
                    var_array[insert_index + 1:, ...] = \
                        var_array[insert_index:-1, ...]
                # Write the new slice data at insert_index.
                var_array[insert_index, ...] = slice_array[0]

    unchunk_dataset(store, coords_only=True)
Example 5
 def make_cube(self, start_date, num_days: int) -> xr.Dataset:
     """Create a daily cube of *num_days* steps starting at *start_date*."""
     variables = dict(precipitation=0.1,
                      temperature=270.5,
                      soil_moisture=0.2)
     cube = new_cube(time_periods=num_days,
                     time_freq='1D',
                     time_start=start_date,
                     variables=variables)
     return chunk_dataset(cube,
                          dict(time=1, lat=90, lon=90),
                          format_name='zarr')
Example 6
    def chunk(self, chunk_sizes: Dict[str, int] = None, format_name: str = None) -> xr.Dataset:
        """
        Re-chunk the wrapped dataset and adjust its encodings for *format_name*.

        :param chunk_sizes: mapping from dimension name to new chunk size
        :param format_name: format, e.g. "zarr" or "netcdf4"
        :return: the re-chunked dataset
        """
        # Delegate to the core chunking operation on the wrapped dataset.
        rechunked = chunk_dataset(self._dataset,
                                  chunk_sizes=chunk_sizes,
                                  format_name=format_name)
        return rechunked
Example 7
    def test_chunk_dataset(self):
        """chunk_dataset() must write format-specific chunk encodings."""
        times = ["2010-01-01", "2010-01-02", "2010-01-03",
                 "2010-01-04", "2010-01-05"]
        dataset = new_test_dataset(times,
                                   precipitation=0.4,
                                   temperature=275.2)

        def assert_encodings(ds, expected):
            # Both data variables must carry the same encoding.
            self.assertEqual(expected, ds.precipitation.encoding)
            self.assertEqual(expected, ds.temperature.encoding)

        # zarr format uses the 'chunks' encoding key.
        assert_encodings(
            chunk_dataset(dataset,
                          chunk_sizes=dict(time=1, lat=10, lon=20),
                          format_name="zarr"),
            {'chunks': (1, 10, 20)})

        # netcdf4 format uses the 'chunksizes' encoding key.
        assert_encodings(
            chunk_dataset(dataset,
                          chunk_sizes=dict(time=1, lat=20, lon=40),
                          format_name="netcdf4"),
            {'chunksizes': (1, 20, 40)})

        # Without a format, no chunk encoding is written at all.
        assert_encodings(
            chunk_dataset(dataset,
                          chunk_sizes=dict(time=1, lat=20, lon=40)),
            {})

        dataset = dataset.chunk(dict(time=2, lat=10, lon=20))

        # chunk_sizes=None removes chunk encodings.
        assert_encodings(
            chunk_dataset(dataset, chunk_sizes=None, format_name="zarr"),
            {})

        # An empty mapping keeps the dataset's current dask chunking.
        assert_encodings(
            chunk_dataset(dataset, chunk_sizes={}, format_name="zarr"),
            {'chunks': (2, 10, 20)})

        # A partial mapping re-chunks only the given dimension.
        assert_encodings(
            chunk_dataset(dataset, chunk_sizes=dict(time=1),
                          format_name="zarr"),
            {'chunks': (1, 10, 20)})
Example 8
    def setUp(self) -> None:
        """Build a 30-step cube with irregular time spacing, chunked by time."""
        num_times = 30
        period_cycle = ['1D', '1D', '3D', '4D', '2D']

        timestamps = []
        current = pd.to_datetime('2017-07-01T10:30:15Z', utc=True)
        for i in range(num_times):
            timestamps.append(current.isoformat())
            # Advance by the cycling, irregular period lengths.
            current += pd.to_timedelta(period_cycle[i % len(period_cycle)])

        temperature = tuple(272 + 0.1 * i for i in range(num_times))
        precipitation = tuple(120 - 0.2 * i for i in range(num_times))

        cube = new_test_dataset(timestamps,
                                temperature=temperature,
                                precipitation=precipitation)
        self.input_cube = chunk_dataset(cube,
                                        chunk_sizes=dict(time=1,
                                                         lat=90,
                                                         lon=180))
Example 9
    def test_local(self):
        """A zarr cube must be readable through a logging DiagnosticStore."""
        variables = dict(precipitation=0.1,
                         temperature=270.5,
                         soil_moisture=0.2)
        cube = new_cube(time_periods=10,
                        time_start='2019-01-01',
                        variables=variables)
        cube = chunk_dataset(cube, dict(time=1, lat=90, lon=90),
                             format_name='zarr')
        cube.to_zarr(self.CUBE_PATH)
        cube.close()

        # Wrap the on-disk store so every access is logged.
        observer = logging_observer(log_path='local-cube.log')
        diagnostic_store = DiagnosticStore(
            zarr.DirectoryStore(self.CUBE_PATH), observer)
        xr.open_zarr(diagnostic_store)
Example 10
    def test_unchunk_dataset(self):
        """Without chunk_sizes, 'chunks' encodings are dropped; others stay."""
        times = ["2010-01-01", "2010-01-02", "2010-01-03",
                 "2010-01-04", "2010-01-05"]
        dataset = new_test_dataset(times,
                                   precipitation=0.4,
                                   temperature=275.2)

        # Simulate variables that came from a chunked zarr with a fill value.
        for var in dataset.data_vars.values():
            var.encoding.update({"chunks": (5, 180, 360),
                                 "_FillValue": -999.0})

        result = chunk_dataset(dataset, format_name="zarr")
        # 'chunks' is removed, unrelated encodings survive.
        self.assertEqual({"_FillValue": -999.0}, result.precipitation.encoding)
        self.assertEqual({"_FillValue": -999.0}, result.temperature.encoding)
Example 11
def chunk(cube, output, format=None, params=None, chunks=None):
    """
    (Re-)chunk xcube dataset.
    Changes the external chunking of all variables of CUBE according to CHUNKS and writes
    the result to OUTPUT.

    Note: There is a possibly more efficient way to (re-)chunk datasets through the
    dedicated tool "rechunker", see https://rechunker.readthedocs.io.
    """
    chunk_sizes = None
    if chunks:
        chunk_sizes = parse_cli_kwargs(chunks, metavar="CHUNKS")
        # Every chunk size must be a positive integer.
        if any(not isinstance(size, int) or size <= 0
               for size in chunk_sizes.values()):
            raise click.ClickException(
                "Invalid value for CHUNKS, "
                f"chunk sizes must be positive integers: {chunks}")

    write_kwargs = parse_cli_kwargs(params, metavar="PARAMS") if params else dict()

    from xcube.core.chunk import chunk_dataset
    from xcube.core.dsio import guess_dataset_format
    from xcube.core.dsio import open_dataset, write_dataset

    # Fall back to guessing the format from the output path.
    format_name = format or guess_dataset_format(output)

    with open_dataset(input_path=cube) as ds:
        if chunk_sizes:
            # Every chunked dimension must actually exist in the dataset.
            for dim_name in chunk_sizes:
                if dim_name not in ds.dims:
                    raise click.ClickException(
                        "Invalid value for CHUNKS, "
                        f"{dim_name!r} is not the name of any dimension: {chunks}")

        write_dataset(chunk_dataset(ds,
                                    chunk_sizes=chunk_sizes,
                                    format_name=format_name),
                      output_path=output,
                      format_name=format_name,
                      **write_kwargs)
Example 12
    def setUp(self):
        """Write a freshly chunked two-variable cube to TEST_ZARR."""
        rimraf(self.TEST_ZARR)
        cube = chunk_dataset(new_cube(variables=dict(A=0.5, B=-1.5)),
                             chunk_sizes=dict(time=1, lat=90, lon=90),
                             format_name=FORMAT_NAME_ZARR)
        cube.to_zarr(self.TEST_ZARR)

        # Chunk grid for A/B is 5 (time) x 2 (lat) x 4 (lon):
        # one chunk file per grid cell plus the zarr metadata files.
        self.chunked_a_files = {'.zarray', '.zattrs'} | {
            f'{t}.{y}.{x}'
            for t in range(5)
            for y in range(2)
            for x in range(4)
        }
        self.chunked_b_files = self.chunked_a_files
        self.chunked_time_files = {
            '.zarray', '.zattrs', '0', '1', '2', '3', '4'
        }
        self.chunked_lat_files = {'.zattrs', '.zarray', '0', '1'}
        self.chunked_lon_files = {'.zattrs', '.zarray', '0', '1', '2', '3'}
Example 13
import os
import os.path
import unittest
from typing import Set, Union, Sequence

from xcube.constants import FORMAT_NAME_ZARR
from xcube.core.chunk import chunk_dataset
from xcube.core.dsio import rimraf
from xcube.core.new import new_cube
from xcube.core.optimize import optimize_dataset

# Small 3-step test cube with two variables, chunked one time step per chunk
# and encoded for the zarr format.
TEST_CUBE = chunk_dataset(new_cube(time_periods=3,
                                   variables=dict(A=0.5, B=-1.5)),
                          chunk_sizes=dict(time=1, lat=180, lon=360),
                          format_name=FORMAT_NAME_ZARR)

# Paths used by the optimize tests; the pattern derives the output path
# from the input path's stem.
INPUT_CUBE_PATH = 'test.zarr'
OUTPUT_CUBE_PATH = 'test_opt.zarr'
OUTPUT_CUBE_PATTERN = '{input}_opt.zarr'

# Expected file layout of TEST_CUBE written to INPUT_CUBE_PATH:
# one chunk file per time step for A, B, time and time_bnds (time chunk
# size 1), a single chunk for each spatial coordinate array.
INPUT_CUBE_FILE_SET = {
    '.zattrs', '.zgroup', 'A/.zarray', 'A/.zattrs', 'A/0.0.0', 'A/1.0.0',
    'A/2.0.0', 'B/.zarray', 'B/.zattrs', 'B/0.0.0', 'B/1.0.0', 'B/2.0.0',
    'lat/.zarray', 'lat/.zattrs', 'lat/0', 'lat_bnds/.zarray',
    'lat_bnds/.zattrs', 'lat_bnds/0.0', 'lon/.zarray', 'lon/.zattrs', 'lon/0',
    'lon_bnds/.zarray', 'lon_bnds/.zattrs', 'lon_bnds/0.0', 'time/.zarray',
    'time/.zattrs', 'time/0', 'time/1', 'time/2', 'time_bnds/.zarray',
    'time_bnds/.zattrs', 'time_bnds/0.0', 'time_bnds/1.0', 'time_bnds/2.0'
}