def test_mask_dataset_for_chunked_input(self):
    """Masking a chunked cube must preserve the per-variable chunking."""
    chunked = chunk_dataset(self.cube, chunk_sizes=dict(time=1, lat=90, lon=90))
    masked = mask_dataset_by_geometry(chunked, self.triangle)
    self._assert_clipped_dataset_has_basic_props(masked)
    # Five single-step time chunks, one 4-row lat chunk, one 7-column lon chunk.
    expected_chunks = ((1, 1, 1, 1, 1), (4,), (7,))
    for variable in (masked.temp, masked.precip):
        self.assertEqual(expected_chunks, variable.chunks)
def setUp(self) -> None:
    """Create a global 360x180 SST test cube chunked into 90x90 tiles."""
    sst_cube = new_cube(
        width=360,
        height=180,
        time_periods=6,
        variables=dict(analysed_sst=275.3, analysis_error=2.1),
    )
    self.cube = chunk_dataset(sst_cube, dict(time=3, lat=90, lon=90))
def append_time_slice(store: Union[str, MutableMapping],
                      time_slice: xr.Dataset,
                      chunk_sizes: Dict[str, int] = None):
    """
    Append time slice to existing zarr dataset.

    :param store: A zarr store.
    :param time_slice: Time slice to insert
    :param chunk_sizes: desired chunk sizes
    """
    if chunk_sizes:
        time_slice = chunk_dataset(time_slice, chunk_sizes, format_name='zarr')

    # xarray's to_zarr(..., mode='a', append_dim='time') replaces the store's
    # global attributes with those of time_slice (xarray bug?), which are
    # usually empty in our case. Carry the store's attributes over on a copy
    # of the slice before appending.
    existing_group = zarr.open_group(store, mode='r')
    time_slice = time_slice.copy()
    time_slice.attrs.update(existing_group.attrs)
    # A leftover 'coordinates' attribute makes the to_zarr() call below fail
    # with "ValueError: cannot serialize coordinates because the global
    # attribute 'coordinates' already exists" - drop it if present.
    time_slice.attrs.pop('coordinates', None)

    time_slice.to_zarr(store, mode='a', append_dim='time')
    unchunk_dataset(store, coords_only=True)
def update_time_slice(store: Union[str, MutableMapping],
                      insert_index: int,
                      time_slice: xr.Dataset,
                      mode: str,
                      chunk_sizes: Dict[str, int] = None):
    """
    Update existing zarr dataset by new time slice.

    :param store: A zarr store.
    :param insert_index: Time index
    :param time_slice: Time slice to insert
    :param mode: Update mode, 'insert' or 'replace'
    :param chunk_sizes: desired chunk sizes
    :raise ValueError: if *mode* is invalid, or if a time-dependent variable
        does not have 'time' as its first dimension
    """
    if mode not in ('insert', 'replace'):
        raise ValueError(f'illegal mode value: {mode!r}')

    insert_mode = mode == 'insert'

    # Collect names and encodings of all variables that depend on 'time'.
    # 'time' must be the outermost dimension so whole time steps can be
    # shifted/replaced via axis-0 indexing below.
    time_var_names = []
    encoding = {}
    with xr.open_zarr(store) as cube:
        for var_name in cube.variables:
            var = cube[var_name]
            if var.ndim >= 1 and 'time' in var.dims:
                if var.dims[0] != 'time':
                    raise ValueError(
                        f"dimension 'time' of variable {var_name!r} must be first dimension"
                    )
                time_var_names.append(var_name)
                encoding[var_name] = cube[var_name].encoding

    if chunk_sizes:
        time_slice = chunk_dataset(time_slice, chunk_sizes, format_name='zarr')

    # Write the slice into a temporary zarr first, then copy its raw arrays
    # into the target store. Using the TemporaryDirectory as a context
    # manager removes the directory deterministically when done, instead of
    # relying on garbage collection of the local variable (the original code
    # never cleaned it up explicitly).
    with tempfile.TemporaryDirectory(prefix='xcube-time-slice-',
                                     suffix='.zarr') as temp_path:
        time_slice.to_zarr(temp_path, encoding=encoding)
        slice_root_group = zarr.open(temp_path, mode='r')
        slice_arrays = dict(slice_root_group.arrays())

        cube_root_group = zarr.open(store, mode='r+')
        for var_name, var_array in cube_root_group.arrays():
            if var_name in time_var_names:
                slice_array = slice_arrays[var_name]
                if insert_mode:
                    # Add one empty time step at the end ...
                    empty = zarr.creation.empty(slice_array.shape,
                                                dtype=var_array.dtype)
                    var_array.append(empty, axis=0)
                    # ... then shift all steps from insert_index onwards up by one.
                    var_array[insert_index + 1:, ...] = var_array[insert_index:-1, ...]
                # Write the new slice at the target index.
                var_array[insert_index, ...] = slice_array[0]

    unchunk_dataset(store, coords_only=True)
def make_cube(self, start_date, num_days: int) -> xr.Dataset:
    """Create a daily test cube of *num_days* steps starting at *start_date*,
    chunked for zarr with one time step per chunk and 90x90 spatial tiles."""
    raw_cube = new_cube(
        time_periods=num_days,
        time_freq='1D',
        time_start=start_date,
        variables=dict(
            precipitation=0.1,
            temperature=270.5,
            soil_moisture=0.2,
        ),
    )
    return chunk_dataset(raw_cube, dict(time=1, lat=90, lon=90), format_name='zarr')
def chunk(self, chunk_sizes: Dict[str, int] = None, format_name: str = None) -> xr.Dataset:
    """
    Chunk this dataset and update encodings for given format.

    :param chunk_sizes: mapping from dimension name to new chunk size
    :param format_name: format, e.g. "zarr" or "netcdf4"
    :return: the re-chunked dataset
    """
    rechunked = chunk_dataset(self._dataset,
                              chunk_sizes=chunk_sizes,
                              format_name=format_name)
    return rechunked
def test_chunk_dataset(self):
    """chunk_dataset() must translate chunk sizes into format-specific encodings."""
    dataset = new_test_dataset(
        ["2010-01-01", "2010-01-02", "2010-01-03", "2010-01-04", "2010-01-05"],
        precipitation=0.4,
        temperature=275.2,
    )

    # zarr format: chunk sizes land in the 'chunks' encoding key.
    result = chunk_dataset(dataset,
                           chunk_sizes=dict(time=1, lat=10, lon=20),
                           format_name="zarr")
    for variable in (result.precipitation, result.temperature):
        self.assertEqual({'chunks': (1, 10, 20)}, variable.encoding)

    # netcdf4 format: chunk sizes land in the 'chunksizes' encoding key.
    result = chunk_dataset(dataset,
                           chunk_sizes=dict(time=1, lat=20, lon=40),
                           format_name="netcdf4")
    for variable in (result.precipitation, result.temperature):
        self.assertEqual({'chunksizes': (1, 20, 40)}, variable.encoding)

    # No format given: no chunking-related encoding is written.
    result = chunk_dataset(dataset, chunk_sizes=dict(time=1, lat=20, lon=40))
    for variable in (result.precipitation, result.temperature):
        self.assertEqual({}, variable.encoding)

    # From here on the dataset itself carries dask chunking.
    dataset = dataset.chunk(dict(time=2, lat=10, lon=20))

    # chunk_sizes=None: chunking encodings are removed.
    result = chunk_dataset(dataset, chunk_sizes=None, format_name="zarr")
    for variable in (result.precipitation, result.temperature):
        self.assertEqual({}, variable.encoding)

    # chunk_sizes={}: the dataset's current chunking is written to the encoding.
    result = chunk_dataset(dataset, chunk_sizes={}, format_name="zarr")
    for variable in (result.precipitation, result.temperature):
        self.assertEqual({'chunks': (2, 10, 20)}, variable.encoding)

    # Partial chunk_sizes: unspecified dimensions keep their current chunking.
    result = chunk_dataset(dataset, chunk_sizes=dict(time=1), format_name="zarr")
    for variable in (result.precipitation, result.temperature):
        self.assertEqual({'chunks': (1, 10, 20)}, variable.encoding)
def setUp(self) -> None:
    """Build a 30-step input cube with irregular time spacing (1/1/3/4/2-day gaps)."""
    num_times = 30
    periods = ['1D', '1D', '3D', '4D', '2D']
    timestamps = []
    current = pd.to_datetime('2017-07-01T10:30:15Z', utc=True)
    for step in range(num_times):
        timestamps.append(current.isoformat())
        # Advance by the repeating 1/1/3/4/2-day period pattern.
        current += pd.to_timedelta(periods[step % len(periods)])
    temperature = tuple(272 + 0.1 * i for i in range(num_times))
    precipitation = tuple(120 - 0.2 * i for i in range(num_times))
    cube = new_test_dataset(timestamps,
                            temperature=temperature,
                            precipitation=precipitation)
    self.input_cube = chunk_dataset(cube, chunk_sizes=dict(time=1, lat=90, lon=180))
def test_local(self):
    """Open a local zarr cube through a DiagnosticStore that logs all store accesses."""
    cube = new_cube(
        time_periods=10,
        time_start='2019-01-01',
        variables=dict(precipitation=0.1, temperature=270.5, soil_moisture=0.2),
    )
    cube = chunk_dataset(cube, dict(time=1, lat=90, lon=90), format_name='zarr')
    cube.to_zarr(self.CUBE_PATH)
    cube.close()
    observed_store = DiagnosticStore(
        zarr.DirectoryStore(self.CUBE_PATH),
        logging_observer(log_path='local-cube.log'))
    xr.open_zarr(observed_store)
def test_unchunk_dataset(self):
    """chunk_dataset() without chunk sizes must strip the 'chunks' encoding
    while leaving other encoding entries (e.g. _FillValue) untouched."""
    dates = ["2010-01-01", "2010-01-02", "2010-01-03", "2010-01-04", "2010-01-05"]
    dataset = new_test_dataset(dates, precipitation=0.4, temperature=275.2)
    for variable in dataset.data_vars.values():
        variable.encoding.update({"chunks": (5, 180, 360), "_FillValue": -999.0})
    unchunked = chunk_dataset(dataset, format_name="zarr")
    for variable in (unchunked.precipitation, unchunked.temperature):
        self.assertEqual({"_FillValue": -999.0}, variable.encoding)
def chunk(cube, output, format=None, params=None, chunks=None):
    """
    (Re-)chunk xcube dataset.
    Changes the external chunking of all variables of CUBE according to CHUNKS
    and writes the result to OUTPUT.

    Note: There is a possibly more efficient way to (re-)chunk datasets through
    the dedicated tool "rechunker", see https://rechunker.readthedocs.io.
    """
    # Parse and validate the CHUNKS option, if given.
    chunk_sizes = parse_cli_kwargs(chunks, metavar="CHUNKS") if chunks else None
    if chunk_sizes:
        for dim_name, size in chunk_sizes.items():
            if not isinstance(size, int) or size <= 0:
                raise click.ClickException(
                    "Invalid value for CHUNKS, "
                    f"chunk sizes must be positive integers: {chunks}")

    write_kwargs = parse_cli_kwargs(params, metavar="PARAMS") if params else dict()

    from xcube.core.chunk import chunk_dataset
    from xcube.core.dsio import guess_dataset_format
    from xcube.core.dsio import open_dataset, write_dataset

    format_name = format if format else guess_dataset_format(output)

    with open_dataset(input_path=cube) as ds:
        if chunk_sizes:
            # All requested dimensions must exist in the dataset.
            for dim_name in chunk_sizes:
                if dim_name not in ds.dims:
                    raise click.ClickException(
                        "Invalid value for CHUNKS, "
                        f"{dim_name!r} is not the name of any dimension: {chunks}")

        chunked = chunk_dataset(ds,
                                chunk_sizes=chunk_sizes,
                                format_name=format_name)
        write_dataset(chunked,
                      output_path=output,
                      format_name=format_name,
                      **write_kwargs)
def setUp(self):
    """Write a freshly chunked test cube and record the zarr files expected
    per variable directory."""
    rimraf(self.TEST_ZARR)
    cube = new_cube(variables=dict(A=0.5, B=-1.5))
    cube = chunk_dataset(cube,
                         chunk_sizes=dict(time=1, lat=90, lon=90),
                         format_name=FORMAT_NAME_ZARR)
    cube.to_zarr(self.TEST_ZARR)
    # Data variables: 5 time steps x 2 lat tiles x 4 lon tiles,
    # one '<time>.<lat>.<lon>' chunk file each, plus zarr metadata.
    self.chunked_a_files = {'.zarray', '.zattrs'} | {
        f'{t}.{i}.{j}'
        for t in range(5)
        for i in range(2)
        for j in range(4)
    }
    self.chunked_b_files = self.chunked_a_files
    # 1-D coordinates: one chunk file per chunk, plus zarr metadata.
    self.chunked_time_files = {'.zarray', '.zattrs'} | {str(t) for t in range(5)}
    self.chunked_lat_files = {'.zarray', '.zattrs', '0', '1'}
    self.chunked_lon_files = {'.zarray', '.zattrs', '0', '1', '2', '3'}
import os
import os.path
import unittest
from typing import Set, Union, Sequence

from xcube.constants import FORMAT_NAME_ZARR
from xcube.core.chunk import chunk_dataset
from xcube.core.dsio import rimraf
from xcube.core.new import new_cube
from xcube.core.optimize import optimize_dataset

# Three-time-step cube with two data variables, chunked to one time step per
# chunk and a single full-extent (180 x 360) spatial chunk, with zarr
# encodings applied.
TEST_CUBE = chunk_dataset(new_cube(time_periods=3, variables=dict(A=0.5, B=-1.5)),
                          chunk_sizes=dict(time=1, lat=180, lon=360),
                          format_name=FORMAT_NAME_ZARR)
# Input/output zarr paths used by the tests below.
INPUT_CUBE_PATH = 'test.zarr'
OUTPUT_CUBE_PATH = 'test_opt.zarr'
# NOTE(review): presumably '{input}' is substituted with the input path stem
# by the code under test - confirm against its usage.
OUTPUT_CUBE_PATTERN = '{input}_opt.zarr'
# Expected zarr files after writing TEST_CUBE: per-variable chunk files named
# '<time>.<lat>.<lon>' (one per time step for A and B), single-chunk coordinate
# and bounds arrays, and the zarr metadata files.
INPUT_CUBE_FILE_SET = {
    '.zattrs',
    '.zgroup',
    'A/.zarray', 'A/.zattrs', 'A/0.0.0', 'A/1.0.0', 'A/2.0.0',
    'B/.zarray', 'B/.zattrs', 'B/0.0.0', 'B/1.0.0', 'B/2.0.0',
    'lat/.zarray', 'lat/.zattrs', 'lat/0',
    'lat_bnds/.zarray', 'lat_bnds/.zattrs', 'lat_bnds/0.0',
    'lon/.zarray', 'lon/.zattrs', 'lon/0',
    'lon_bnds/.zarray', 'lon_bnds/.zattrs', 'lon_bnds/0.0',
    'time/.zarray', 'time/.zattrs', 'time/0', 'time/1', 'time/2',
    'time_bnds/.zarray', 'time_bnds/.zattrs',
    'time_bnds/0.0', 'time_bnds/1.0', 'time_bnds/2.0'
}