Example #1
def append_time_slice(store: Union[str, MutableMapping],
                      time_slice: xr.Dataset,
                      chunk_sizes: Dict[str, int] = None):
    """
    Append time slice to existing zarr dataset.

    :param store: A zarr store.
    :param time_slice: Time slice to insert
    :param chunk_sizes: desired chunk sizes
    """
    if chunk_sizes:
        time_slice = chunk_dataset(time_slice, chunk_sizes, format_name='zarr')

    # Unfortunately, time_slice.to_zarr(store, mode='a', append_dim='time') replaces the global attributes of store
    # with the attributes of time_slice (an xarray bug?), which are usually empty in our case.
    # Hence, we preserve the store's original attributes in a copy of time_slice.
    ds = zarr.open_group(store, mode='r')
    time_slice = time_slice.copy()
    time_slice.attrs.update(ds.attrs)
    if 'coordinates' in time_slice.attrs:
        # Remove 'coordinates', otherwise we get
        # ValueError: cannot serialize coordinates because the global attribute 'coordinates' already exists
        # from next time_slice.to_zarr(...) call.
        time_slice.attrs.pop('coordinates')

    time_slice.to_zarr(store, mode='a', append_dim='time')
    unchunk_dataset(store, coords_only=True)
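A minimal usage sketch, not part of the original example: the cube path, slice file, and chunk sizes below are hypothetical, and append_time_slice refers to the function shown above.

import xarray as xr

# Open a single-time-step dataset (hypothetical file) and append it to an existing cube.
new_slice = xr.open_dataset('slice_2021-01-02.nc')
append_time_slice('cube.zarr', new_slice,
                  chunk_sizes={'time': 1, 'lat': 180, 'lon': 360})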
Example #2
    def test_unchunk_coord_var(self):
        unchunk_dataset(self.TEST_ZARR, var_names=['time'], coords_only=True)
        self._assert_cube_files(
            expected_a_files=self.chunked_a_files,
            expected_b_files=self.chunked_b_files,
            expected_time_files={'.zarray', '.zattrs', '0'},
            expected_lat_files=self.chunked_lat_files,
            expected_lon_files=self.chunked_lon_files)
Example #3
    def test_unchunk_data_var(self):
        unchunk_dataset(self.TEST_ZARR, var_names=['B'])
        self._assert_cube_files(
            expected_a_files=self.chunked_a_files,
            expected_b_files={'.zarray', '.zattrs', '0.0.0'},
            expected_time_files=self.chunked_time_files,
            expected_lat_files=self.chunked_lat_files,
            expected_lon_files=self.chunked_lon_files)
Example #4
    def test_unchunk_all(self):
        unchunk_dataset(self.TEST_ZARR)
        self._assert_cube_files(
            expected_a_files={'.zarray', '.zattrs', '0.0.0'},
            expected_b_files={'.zarray', '.zattrs', '0.0.0'},
            expected_time_files={'.zarray', '.zattrs', '0'},
            expected_lat_files={'.zarray', '.zattrs', '0'},
            expected_lon_files={'.zarray', '.zattrs', '0'})
Example #5
def update_time_slice(store: Union[str, MutableMapping],
                      insert_index: int,
                      time_slice: xr.Dataset,
                      mode: str,
                      chunk_sizes: Dict[str, int] = None):
    """
    Update existing zarr dataset by new time slice.

    :param store: A zarr store.
    :param insert_index: Time index
    :param time_slice: Time slice to insert
    :param mode: Update mode, 'insert' or 'replace'
    :param chunk_sizes: desired chunk sizes
    """

    if mode not in ('insert', 'replace'):
        raise ValueError(f'illegal mode value: {mode!r}')

    insert_mode = mode == 'insert'

    time_var_names = []
    encoding = {}
    with xr.open_zarr(store) as cube:
        for var_name in cube.variables:
            var = cube[var_name]
            if var.ndim >= 1 and 'time' in var.dims:
                if var.dims[0] != 'time':
                    raise ValueError(
                        f"dimension 'time' of variable {var_name!r} must be first dimension"
                    )
                time_var_names.append(var_name)
                encoding[var_name] = cube[var_name].encoding

    if chunk_sizes:
        time_slice = chunk_dataset(time_slice, chunk_sizes, format_name='zarr')
    temp_dir = tempfile.TemporaryDirectory(prefix='xcube-time-slice-',
                                           suffix='.zarr')
    time_slice.to_zarr(temp_dir.name, encoding=encoding)
    slice_root_group = zarr.open(temp_dir.name, mode='r')
    slice_arrays = dict(slice_root_group.arrays())

    cube_root_group = zarr.open(store, mode='r+')
    for var_name, var_array in cube_root_group.arrays():
        if var_name in time_var_names:
            slice_array = slice_arrays[var_name]
            if insert_mode:
                # Add one empty time step
                empty = zarr.creation.empty(slice_array.shape,
                                            dtype=var_array.dtype)
                var_array.append(empty, axis=0)
                # Shift contents
                var_array[insert_index + 1:, ...] = var_array[insert_index:-1,
                                                              ...]
            # Replace slice
            var_array[insert_index, ...] = slice_array[0]

    unchunk_dataset(store, coords_only=True)
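A minimal usage sketch with hypothetical paths and indices; update_time_slice refers to the function shown above.

import xarray as xr

# Replace the time step at index 4 with a corrected slice (hypothetical file),
# then insert another slice before index 7.
fixed_slice = xr.open_dataset('slice_fixed.nc')
update_time_slice('cube.zarr', 4, fixed_slice, mode='replace')
update_time_slice('cube.zarr', 7, fixed_slice, mode='insert', chunk_sizes={'time': 1})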
Example #6
    def test_unchunk_data_var_coords_only(self):
        with self.assertRaises(ValueError) as cm:
            unchunk_dataset(self.TEST_ZARR, var_names=['B'], coords_only=True)
        self.assertEqual(
            "variable 'B' is not a coordinate variable in 'test.zarr'",
            f'{cm.exception}')

        with self.assertRaises(ValueError) as cm:
            unchunk_dataset(self.TEST_ZARR, var_names=['C'], coords_only=False)
        self.assertEqual("variable 'C' is not a variable in 'test.zarr'",
                         f'{cm.exception}')
Example #7
def optimize_dataset(input_path: str,
                     output_path: str = None,
                     in_place: bool = False,
                     unchunk_coords: bool = False,
                     exception_type: Type[Exception] = ValueError):
    """
    Optimize a dataset for faster access.

    Reduces the number of metadata and coordinate data files in xcube dataset given by given by *dataset_path*.
    Consolidated cubes open much faster from remote locations, e.g. in object storage,
    because obviously much less HTTP requests are required to fetch initial cube meta
    information. That is, it merges all metadata files into a single top-level JSON file ".zmetadata".
    If *unchunk_coords* is set, it also removes any chunking of coordinate variables
    so they comprise a single binary data file instead of one file per data chunk.
    The primary usage of this function is to optimize data cubes for cloud object storage.
    The function currently works only for data cubes using ZARR format.

    :param input_path: Path to input dataset with ZARR format.
    :param output_path: Path to output dataset with ZARR format. May contain "{input}" template string,
           which is replaced by the input path's file name without file name extentsion.
    :param in_place: Whether to modify the dataset in place.
           If False, a copy is made and *output_path* must be given.
    :param unchunk_coords: Whether to also consolidate coordinate chunk files.
    :param exception_type: Type of exception to be used on value errors.
    """

    if not os.path.isfile(os.path.join(input_path, '.zgroup')):
        raise exception_type(
            'Input path must point to ZARR dataset directory.')

    input_path = os.path.abspath(os.path.normpath(input_path))

    if in_place:
        output_path = input_path
    else:
        if not output_path:
            raise exception_type(f'Output path must be given.')
        if '{input}' in output_path:
            base_name, _ = os.path.splitext(os.path.basename(input_path))
            output_path = output_path.format(input=base_name)
        output_path = os.path.abspath(os.path.normpath(output_path))
        if os.path.exists(output_path):
            raise exception_type(f'Output path already exists.')

    if not in_place:
        shutil.copytree(input_path, output_path)

    if unchunk_coords:
        unchunk_dataset(output_path, coords_only=True)

    zarr.convenience.consolidate_metadata(output_path)
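A minimal usage sketch with hypothetical paths; optimize_dataset refers to the function shown above.

# Write an optimized copy next to the input; '{input}' expands to 'cube',
# so the result is 'cube-optimized.zarr'.
optimize_dataset('cube.zarr', output_path='{input}-optimized.zarr',
                 unchunk_coords=True)

# Or consolidate the metadata of an existing cube in place.
optimize_dataset('other-cube.zarr', in_place=True, unchunk_coords=True)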
Example #8
    def test_unchunk_invalid_path(self):
        with self.assertRaises(ValueError) as cm:
            unchunk_dataset(self.TEST_ZARR + '.zip')
        self.assertEqual("'test.zarr.zip' is not a valid ZARR directory",
                         f'{cm.exception}')