def test_optimize_dataset_in_place(self):
    self.assertEqual(TEST_CUBE_FILE_SET, list_file_set(TEST_CUBE_ZARR))

    optimize_dataset(TEST_CUBE_ZARR, in_place=True)

    expected_files = set(TEST_CUBE_FILE_SET)
    expected_files.add('.zmetadata')
    self.assertEqual(expected_files, list_file_set(TEST_CUBE_ZARR))
def test_edit_zmetadata(self):
    optimize_dataset(TEST_CUBE_ZARR,
                     unchunk_coords=True,
                     output_path=TEST_CUBE_ZARR_OPTIMIZED)
    edit_metadata(TEST_CUBE_ZARR_OPTIMIZED,
                  metadata_path=TEST_NEW_META_YML,
                  in_place=False,
                  output_path=TEST_CUBE_ZARR_OPTIMIZED_EDIT,
                  monitor=print)

    ds1 = zarr.open(TEST_CUBE_ZARR)
    ds2 = zarr.convenience.open_consolidated(TEST_CUBE_ZARR_OPTIMIZED_EDIT)

    self.assertEqual(len(ds1), len(ds2))
    self.assertEqual(ds1.attrs['start_date'], ds2.attrs['start_date'])
    self.assertEqual('happiness', ds2['conc_chl'].attrs['units'])
    self.assertNotIn('creator_name', ds1.attrs.keys())
    self.assertIn('creator_name', ds2.attrs.keys())
def test_optimize_dataset_in_place_unchunk_coords(self):
    self.assertEqual(TEST_CUBE_FILE_SET, list_file_set(TEST_CUBE_ZARR))

    optimize_dataset(TEST_CUBE_ZARR, in_place=True, unchunk_coords=True)

    expected_files = set(TEST_CUBE_FILE_SET)
    expected_files.add('.zmetadata')
    expected_files.remove('time/1')
    expected_files.remove('time/2')
    expected_files.remove('time_bnds/1.0')
    expected_files.remove('time_bnds/2.0')
    self.assertEqual(expected_files, list_file_set(TEST_CUBE_ZARR))
def _update_cube(output_writer: DatasetIO,
                 output_path: str,
                 global_attrs: Dict = None,
                 temporal_only: bool = False,
                 consolidate: bool = True):
    # Consolidate the Zarr cube's metadata and unchunk its coordinate
    # variables before updating any attributes.
    if consolidate and os.path.isfile(os.path.join(output_path, '.zgroup')):
        optimize_dataset(input_path=output_path, in_place=True, unchunk_coords=True)

    # Recompute the dataset's (temporal) attributes, merge in any given
    # global attributes, and write the result back.
    cube = output_writer.read(output_path)
    if temporal_only:
        cube = update_dataset_temporal_attrs(cube, update_existing=True, in_place=True)
    else:
        cube = update_dataset_attrs(cube, update_existing=True, in_place=True)
    cube_attrs = dict(cube.attrs)
    cube.close()

    if global_attrs:
        cube_attrs.update(global_attrs)

    output_writer.update(output_path, global_attrs=cube_attrs)
def _test_optimize_dataset(self,
                           unchunk_coords: Union[bool, str, Sequence[str]],
                           in_place: bool,
                           expected_output_path: str,
                           expected_cons_time: bool = False,
                           expected_cons_time_bnds: bool = False):
    if not in_place:
        optimize_dataset(INPUT_CUBE_PATH,
                         output_path=OUTPUT_CUBE_PATTERN,
                         in_place=in_place,
                         unchunk_coords=unchunk_coords)
    else:
        optimize_dataset(INPUT_CUBE_PATH,
                         in_place=in_place,
                         unchunk_coords=unchunk_coords)
    self._assert_consolidated(expected_output_path,
                              expected_cons_time,
                              expected_cons_time_bnds)
def optimize(cube, output, in_place, unchunk_coords):
    """
    Optimize xcube dataset for faster access.

    Reduces the number of metadata and coordinate data files in the xcube dataset
    given by CUBE. Consolidated cubes open much faster, especially from remote
    locations such as object storage, because far fewer HTTP requests are required
    to fetch the initial cube metadata. To that end, the command merges all metadata
    files into a single top-level JSON file ".zmetadata". Optionally, it removes any
    chunking of coordinate variables so that each comprises a single binary data
    file instead of one file per data chunk.

    The primary use of this command is to optimize data cubes for cloud object
    storage. The command currently works only for data cubes using the ZARR format.
    """
    from xcube.core.optimize import optimize_dataset
    optimize_dataset(cube, output_path=output, in_place=in_place,
                     unchunk_coords=unchunk_coords)
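# A minimal usage sketch of the API wrapped by the CLI command above, assuming a
# hypothetical local cube directory "my_cube.zarr": consolidate the cube's metadata
# in place and unchunk its coordinate variables.
from xcube.core.optimize import optimize_dataset

optimize_dataset('my_cube.zarr', in_place=True, unchunk_coords=True)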
def test_failures(self):
    with self.assertRaises(RuntimeError) as cm:
        optimize_dataset('pippo', in_place=True, exception_type=RuntimeError)
    self.assertEqual('Input path must point to ZARR dataset directory.',
                     f'{cm.exception}')

    with self.assertRaises(RuntimeError) as cm:
        optimize_dataset(INPUT_CUBE_PATH, exception_type=RuntimeError)
    self.assertEqual('Output path must be given.', f'{cm.exception}')

    with self.assertRaises(RuntimeError) as cm:
        optimize_dataset(INPUT_CUBE_PATH, output_path=INPUT_CUBE_PATH,
                         exception_type=RuntimeError)
    self.assertEqual('Output path already exists.', f'{cm.exception}')

    with self.assertRaises(RuntimeError) as cm:
        optimize_dataset(INPUT_CUBE_PATH, output_path='./' + INPUT_CUBE_PATH,
                         exception_type=RuntimeError)
    self.assertEqual('Output path already exists.', f'{cm.exception}')
def edit_metadata(input_path: str,
                  output_path: str = None,
                  metadata_path: str = None,
                  update_coords: bool = False,
                  in_place: bool = False,
                  monitor: Callable[..., None] = None,
                  exception_type: Type[Exception] = ValueError):
    """
    Edit the metadata of an xcube dataset.

    The metadata may need editing because it is incorrect, inconsistent, or
    incomplete. The metadata attributes to be edited should be given in a YAML file.
    The function currently works only for data cubes using the ZARR format.

    :param input_path: Path to the input dataset in ZARR format.
    :param output_path: Path to the output dataset in ZARR format. May contain the
        "{input}" template string, which is replaced by the input path's file name
        without file name extension.
    :param metadata_path: Path to the metadata file used to edit the existing metadata.
    :param update_coords: Whether to update the metadata about the coordinates.
    :param in_place: Whether to modify the dataset in place. If False, a copy is made
        and *output_path* must be given.
    :param monitor: A progress monitor.
    :param exception_type: Type of exception to be raised on value errors.
    """
    input_path = os.path.abspath(os.path.normpath(input_path))
    if not os.path.isfile(os.path.join(input_path, '.zgroup')):
        raise exception_type('Input path must point to ZARR dataset directory.')

    if in_place:
        output_path = input_path
    else:
        if not output_path:
            raise exception_type('Output path must be given.')
        if '{input}' in output_path:
            base_name, _ = os.path.splitext(os.path.basename(input_path))
            output_path = output_path.format(input=base_name)
        output_path = os.path.abspath(os.path.normpath(output_path))
        if os.path.exists(output_path):
            raise exception_type('Output path already exists.')

    if not in_place:
        shutil.copytree(input_path, output_path)

    if monitor is None:
        # noinspection PyUnusedLocal
        def monitor(*args):
            pass

    cube = zarr.open(output_path)

    if update_coords:
        with xr.open_zarr(output_path) as ds:
            ds_attrs = update_dataset_attrs(ds, update_existing=False, in_place=True).attrs
        for key in ds_attrs:
            cube.attrs.update({key: ds_attrs[key]})

    if metadata_path:
        new_metadata = load_configs(metadata_path)
        for element in new_metadata:
            if 'output_metadata' in element:
                _edit_keyvalue_in_metadata(cube, new_metadata, element, monitor)
            else:
                if element in cube:
                    _edit_keyvalue_in_metadata(cube[element], new_metadata, element, monitor)
                else:
                    warnings.warn(f'The variable "{element}" could not be found in the '
                                  f'xcube dataset. Please check its spelling.')

    # The metadata attrs of a consolidated xcube dataset may not be changed
    # (https://zarr.readthedocs.io/en/stable/api/convenience.html#zarr.convenience.consolidate_metadata),
    # therefore the dataset needs to be consolidated once more after its metadata has changed.
    if os.path.exists(os.path.join(output_path, '.zmetadata')):
        optimize_dataset(output_path, in_place=True)
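# A minimal usage sketch of edit_metadata(), assuming a hypothetical cube directory
# "my_cube.zarr" and a hypothetical metadata file "new_metadata.yml". Judging from the
# handling of new_metadata above, the YAML's top-level key "output_metadata" holds
# global attributes, while any other top-level key (e.g. "conc_chl") names a variable
# whose attributes are to be edited. The module path in the import is an assumption.
from xcube.core.edit import edit_metadata

edit_metadata('my_cube.zarr',
              metadata_path='new_metadata.yml',
              output_path='my_cube_edited.zarr',
              monitor=print)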