def test_to_dict(self):
    self.assertEqual({}, DataStorePool().to_dict())
    self.assertEqual(
        {
            'ram': {'store_id': 'memory'},
            'dir': {'store_id': 'file',
                    'store_params': {'base_dir': 'bibo'}}
        },
        DataStorePool({
            'ram': DataStoreConfig(store_id='memory'),
            'dir': DataStoreConfig(store_id='file',
                                   store_params=dict(base_dir="bibo"))
        }).to_dict())
def main(gen_config_path: str,
         store_configs_path: str = None,
         verbose: bool = False):
    """
    Generator tool for data cubes.

    Creates cube views from one or more cube stores, resamples them to
    a common grid, optionally performs some cube transformation, and
    writes the resulting cube to some target cube store.

    *gen_config_path* is the cube generator configuration. It may be
    provided as a JSON or YAML file (file extensions ".json" or ".yaml").
    If the *gen_config_path* argument is omitted, it is expected that the
    cube generator configuration is piped as a JSON string.

    *store_configs_path* is a path to a JSON file with data store
    configurations. It is a mapping of names to configured stores.
    Entries are dictionaries that have a mandatory "store_id" property,
    which is the name of a registered xcube data store. The optional
    "store_params" property may define data store specific parameters.

    :param gen_config_path: Cube generation configuration. It may be
        provided as a JSON or YAML file (file extensions ".json" or
        ".yaml"). If the REQUEST file argument is omitted, it is expected
        that the cube generator configuration is piped as a JSON string.
    :param store_configs_path: A JSON file that maps store names to
        parameterized stores.
    :param verbose: Whether to output progress information to stdout.
    """
    store_pool = DataStorePool.from_file(store_configs_path) \
        if store_configs_path else DataStorePool()
    gen_config = GenConfig.from_file(gen_config_path, verbose=verbose)

    if gen_config.callback_config:
        ApiProgressCallbackObserver(gen_config.callback_config).activate()

    if verbose:
        ConsoleProgressObserver().activate()

    with observe_progress('Generating cube', 100) as cm:
        cm.will_work(10)
        cubes = open_cubes(gen_config.input_configs,
                           cube_config=gen_config.cube_config,
                           store_pool=store_pool)
        cm.will_work(10)
        cube = resample_and_merge_cubes(cubes,
                                        cube_config=gen_config.cube_config)
        cm.will_work(80)
        data_id = write_cube(cube,
                             output_config=gen_config.output_config,
                             store_pool=store_pool)

    if verbose:
        print('Cube "{}" generated within {:.2f} seconds'.format(
            str(data_id), cm.state.total_time))
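# A minimal sketch (not part of the tool above) of what a store
# configurations JSON file for *store_configs_path* could look like.
# The store names "my-input" and "my-output" and the local path are
# made-up examples; "store_id" must name a registered xcube data store
# such as "file" or "memory", and "store_params" is store-specific.
import json

example_store_configs = {
    "my-input": {
        "store_id": "file",
        "store_params": {"root": "/data/inputs"}   # hypothetical path
    },
    "my-output": {
        "store_id": "memory"
    }
}

with open("stores.json", "w") as fp:                # hypothetical file name
    json.dump(example_store_configs, fp, indent=2)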
def test_from_dict_with_bad_dicts(self):
    store_configs = {"dir": {}}
    with self.assertRaises(jsonschema.exceptions.ValidationError) as cm:
        DataStorePool.from_dict(store_configs)
    self.assertTrue("'store_id' is a required property" in f'{cm.exception}',
                    msg=f'{cm.exception}')

    store_configs = {"dir": {"store_id": 10}}
    with self.assertRaises(jsonschema.exceptions.ValidationError) as cm:
        DataStorePool.from_dict(store_configs)
    self.assertTrue("Failed validating 'type' in schema" in f'{cm.exception}',
                    msg=f'{cm.exception}')
def test_multi_stores_with_params(self):
    """Just test many stores at once"""
    store_configs = {
        "ram-1": {
            "store_id": "memory",
        },
        "ram-2": {
            "store_id": "memory",
        },
        "local-1": {
            "store_id": "file",
            "store_params": {
                "root": "/home/bibo/datacubes-1",
            }
        },
        "local-2": {
            "store_id": "file",
            "store_params": {
                "root": "/home/bibo/datacubes-2",
            }
        },
    }
    pool = DataStorePool.from_dict(store_configs)
    self.assertIsInstance(pool, DataStorePool)
    self.assertEqual(["local-1", "local-2", "ram-1", "ram-2"],
                     pool.store_instance_ids)
    for instance_id in pool.store_instance_ids:
        self.assertTrue(pool.has_store_instance(instance_id))
        self.assertIsInstance(pool.get_store_config(instance_id),
                              DataStoreConfig)
        self.assertIsInstance(pool.get_store(instance_id), DataStore)
def test_get_store_instance_id(self):
    store_params_1 = {
        "root": "./bibo"
    }
    ds_config_1 = DataStoreConfig(store_id='file',
                                  store_params=store_params_1)
    ds_configs = {'dir-1': ds_config_1}
    pool = DataStorePool(ds_configs)

    store_params_2 = {
        "root": "./babo"
    }
    ds_config_2 = DataStoreConfig(store_id='file',
                                  store_params=store_params_2)

    ds_config_3 = DataStoreConfig(store_id='file',
                                  store_params=store_params_1,
                                  title='A third configuration')

    self.assertEqual('dir-1', pool.get_store_instance_id(ds_config_1))
    self.assertEqual('dir-1', pool.get_store_instance_id(ds_config_1,
                                                         strict_check=True))
    self.assertIsNone(pool.get_store_instance_id(ds_config_2))
    self.assertEqual('dir-1', pool.get_store_instance_id(ds_config_3))
    self.assertIsNone(pool.get_store_instance_id(ds_config_3,
                                                 strict_check=True))
def test_close_all_stores(self):
    store_configs = {
        "ram-1": {
            "store_id": "memory",
        },
    }
    pool = DataStorePool.from_dict(store_configs)
    # Smoke test; we do not expect any visible state changes
    # after close_all_stores()
    pool.close_all_stores()
def test_get_data_store_instance_from_pool(self):
    pool = DataStorePool({
        'dir': DataStoreConfig('directory',
                               store_params=dict(base_dir='.'))
    })
    instance = get_data_store_instance('@dir', store_pool=pool)
    self.assertIsInstance(instance.store, DirectoryDataStore)
    instance2 = get_data_store_instance('@dir', store_pool=pool)
    self.assertIs(instance, instance2)
def test_from_dict_no_store_params(self):
    store_configs = {
        "ram-1": {
            "store_id": "memory"
        }
    }
    pool = DataStorePool.from_dict(store_configs)
    self.assertIsInstance(pool, DataStorePool)
    self.assertEqual(["ram-1"], pool.store_instance_ids)
    self.assertIsInstance(pool.get_store_config('ram-1'), DataStoreConfig)
def test_get_data_store_instance_from_pool(self):
    pool = DataStorePool({
        'dir': DataStoreConfig('file', store_params=dict(root='.'))
    })
    instance = get_data_store_instance('@dir', store_pool=pool)
    self.assertTrue(hasattr(instance.store, 'root'))
    # noinspection PyUnresolvedReferences
    self.assertTrue(os.path.isabs(instance.store.root))
    self.assertTrue(os.path.isdir(instance.store.root))
    instance2 = get_data_store_instance('@dir', store_pool=pool)
    self.assertIs(instance, instance2)
def test_get_data_store_instance_from_pool_with_params(self):
    pool = DataStorePool({
        '@dir': DataStoreConfig('directory',
                                store_params=dict(base_dir='.'))
    })
    with self.assertRaises(ValueError) as cm:
        get_data_store_instance('@dir', store_pool=pool,
                                store_params={'thres': 5})
    self.assertEqual(
        'store_params cannot be given,'
        ' with store_id ("@dir") referring to a configured store',
        f'{cm.exception}')
def test_get_data_store_instance_from_pool_with_params(self):
    pool = DataStorePool({
        '@dir': DataStoreConfig('file', store_params=dict(root='.'))
    })
    with self.assertRaises(ValueError) as cm:
        get_data_store_instance(
            '@dir', store_pool=pool, store_params={'auto_mkdir': True}
        )
    self.assertEqual('store_params cannot be given,'
                     ' with store_id ("@dir") referring'
                     ' to a configured store',
                     f'{cm.exception}')
def __init__(self,
             store_pool: DataStorePool = None,
             raise_on_error: bool = False,
             verbosity: int = 0):
    super().__init__(raise_on_error=raise_on_error, verbosity=verbosity)
    if store_pool is not None:
        assert_instance(store_pool, DataStorePool, 'store_pool')
    self._store_pool = store_pool if store_pool is not None \
        else DataStorePool()
    self._generated_data_id: Optional[str] = None
    self._generated_cube: Optional[xr.Dataset] = None
    self._generated_gm: Optional[GridMapping] = None
def test_get_store(self):
    store_configs = {
        "dir-1": {
            "store_id": "directory",
            "store_params": {
                "base_dir": "bibo"
            }
        },
    }
    pool = DataStorePool.from_dict(store_configs)
    store = pool.get_store('dir-1')
    self.assertIsInstance(store, DirectoryDataStore)
    self.assertEqual('bibo', store.base_dir)
    # Should stay same instance
    self.assertIs(store, pool.get_store('dir-1'))
    self.assertIs(store, pool.get_store('dir-1'))
def get_data_store_pool(self) -> Optional[DataStorePool]:
    data_store_configs = self._config.get('DataStores', [])
    if not data_store_configs or self._data_store_pool:
        return self._data_store_pool
    if not isinstance(data_store_configs, list):
        raise ServiceConfigError('DataStores must be a list')
    store_configs: Dict[str, DataStoreConfig] = {}
    for data_store_config_dict in data_store_configs:
        store_instance_id = data_store_config_dict.get('Identifier')
        store_id = data_store_config_dict.get('StoreId')
        store_params = data_store_config_dict.get('StoreParams', {})
        dataset_configs = data_store_config_dict.get('Datasets')
        store_config = DataStoreConfig(store_id,
                                       store_params=store_params,
                                       user_data=dataset_configs)
        store_configs[store_instance_id] = store_config
    self._data_store_pool = DataStorePool(store_configs)
    return self._data_store_pool
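# A minimal sketch of the 'DataStores' configuration section consumed by
# get_data_store_pool() above. The identifier, store id, parameters, and
# dataset entry are made-up examples; only the keys ('Identifier',
# 'StoreId', 'StoreParams', 'Datasets') are taken from the parsing code.
example_config = {
    'DataStores': [
        {
            'Identifier': 'local-data',           # hypothetical instance id
            'StoreId': 'file',                    # registered xcube store
            'StoreParams': {'root': '/data'},     # hypothetical store params
            'Datasets': [
                {'Path': 'cube.zarr'}             # hypothetical dataset entry
            ]
        }
    ]
}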
def test_get_store(self):
    store_configs = {
        "dir-1": {
            "store_id": "file",
            "store_params": {
                "root": "./bibo"
            }
        },
    }
    pool = DataStorePool.from_dict(store_configs)
    store = pool.get_store('dir-1')
    self.assertTrue(hasattr(store, 'root'))
    # noinspection PyUnresolvedReferences
    self.assertTrue(os.path.isabs(store.root))
    self.assertFalse(os.path.exists(store.root))
    # Should stay same instance
    self.assertIs(store, pool.get_store('dir-1'))
    self.assertIs(store, pool.get_store('dir-1'))
def test_from_yaml_file(self):
    store_configs = {
        "ram-1": {
            "store_id": "memory"
        },
        "ram-2": {
            "store_id": "memory"
        }
    }
    path = 'test-store-configs.yaml'
    with open(path, 'w') as fp:
        yaml.dump(store_configs, fp, indent=2)
    try:
        pool = DataStorePool.from_file(path)
        self.assertIsInstance(pool, DataStorePool)
        self.assertEqual(['ram-1', 'ram-2'], pool.store_instance_ids)
    finally:
        import os
        os.remove(path)
def test_normalize(self):
    pool = DataStorePool({
        '@dir': DataStoreConfig('directory', store_params=dict(root='.'))
    })
    file_path = '_test-data-stores-pool.json'
    with open(file_path, 'w') as fp:
        json.dump(pool.to_dict(), fp)
    try:
        pool_1 = DataStorePool.normalize(file_path)
        self.assertIsInstance(pool_1, DataStorePool)
        pool_2 = DataStorePool.normalize(pool_1)
        self.assertIs(pool_2, pool_1)
        pool_3 = DataStorePool.normalize(pool_2.to_dict())
        self.assertIsInstance(pool_3, DataStorePool)
    finally:
        os.remove(file_path)
    with self.assertRaises(TypeError):
        # noinspection PyTypeChecker
        DataStorePool.normalize(42)
def _assert_file_ok(self, format_name: str,
                    root_1="/root1", root_2="/root2",
                    use_env_vars=False):
    if use_env_vars:
        store_configs = self._get_test_config(
            root_1='${_TEST_ROOT_1}',
            root_2='${_TEST_ROOT_2}'
        )
        import os
        os.environ['_TEST_ROOT_1'] = root_1
        os.environ['_TEST_ROOT_2'] = root_2
    else:
        store_configs = self._get_test_config(
            root_1=root_1,
            root_2=root_2
        )
    path = 'test-store-configs.' + format_name
    with open(path, 'w') as fp:
        mod = yaml if format_name == 'yaml' else json
        mod.dump(store_configs, fp, indent=2)
    try:
        pool = DataStorePool.from_file(path)
        self.assertIsInstance(pool, DataStorePool)
        self.assertEqual(['ram-1', 'ram-2'], pool.store_instance_ids)
        config_1 = pool.get_store_config('ram-1')
        self.assertIsInstance(config_1, DataStoreConfig)
        self.assertEqual(
            {'store_id': 'memory', 'store_params': {'root': root_1}},
            config_1.to_dict())
        config_2 = pool.get_store_config('ram-2')
        self.assertIsInstance(config_2, DataStoreConfig)
        self.assertEqual(
            {'store_id': 'memory', 'store_params': {'root': root_2}},
            config_2.to_dict())
    finally:
        import os
        os.remove(path)
def _maybe_assign_store_instance_ids(self):
    assignable_dataset_configs = [
        dc for dc in self._dataset_configs
        if 'StoreInstanceId' not in dc
        and dc.get('FileSystem', 'file') in NON_MEMORY_FILE_SYSTEMS
    ]
    # Split into sublists according to file system
    # and non-root store params
    config_lists = []
    for config in assignable_dataset_configs:
        store_params = self._get_other_store_params_than_root(config)
        file_system = config.get('FileSystem', 'file')
        appended = False
        for config_list in config_lists:
            if config_list[0] == file_system and \
                    config_list[1] == store_params:
                config_list[2].append(config)
                appended = True
                break
        if not appended:
            config_lists.append((file_system, store_params, [config]))

    data_store_pool = self.get_data_store_pool()
    if not data_store_pool:
        data_store_pool = self._data_store_pool = DataStorePool()

    for file_system, store_params, config_list in config_lists:
        # Retrieve paths per configuration
        paths = [dc['Path'] for dc in config_list]
        list.sort(paths)
        # Determine common prefixes of paths (and call them roots)
        prefixes = _get_common_prefixes(paths)
        if len(prefixes) < 1:
            roots = ['']
        else:
            # Perform further step to merge prefixes with same start
            prefixes = list(set(prefixes))
            prefixes.sort()
            roots = []
            root_candidate = prefixes[0]
            for root in prefixes[1:]:
                common_root = os.path.commonprefix([root_candidate, root])
                if _is_not_empty(common_root):
                    root_candidate = common_root
                else:
                    roots.append(root_candidate)
                    root_candidate = root
            roots.append(root_candidate)
        for root in roots:
            # Ensure root does not end with full or partial directory
            # or file name
            while not root.endswith("/") and not root.endswith("\\") and \
                    len(root) > 0:
                root = root[:-1]
            if root.endswith("/") or root.endswith("\\"):
                root = root[:-1]
            abs_root = root
            # For local file systems: determine absolute root from base dir
            fs_protocol = FS_TYPE_TO_PROTOCOL.get(file_system, file_system)
            if fs_protocol == 'file' and not os.path.isabs(abs_root):
                abs_root = os.path.join(self._base_dir, abs_root)
                abs_root = os.path.normpath(abs_root)
            store_params_for_root = store_params.copy()
            store_params_for_root['root'] = abs_root
            # See if there already is a store with this configuration
            data_store_config = DataStoreConfig(
                store_id=fs_protocol,
                store_params=store_params_for_root)
            store_instance_id = data_store_pool. \
                get_store_instance_id(data_store_config)
            if not store_instance_id:
                # Create new store with new unique store instance id
                counter = 1
                while data_store_pool.has_store_instance(
                        f'{fs_protocol}_{counter}'):
                    counter += 1
                store_instance_id = f'{fs_protocol}_{counter}'
                data_store_pool.add_store_config(store_instance_id,
                                                 data_store_config)
            for config in config_list:
                if config['Path'].startswith(root):
                    config['StoreInstanceId'] = store_instance_id
                    new_path = config['Path'][len(root):]
                    while new_path.startswith("/") or \
                            new_path.startswith("\\"):
                        new_path = new_path[1:]
                    config['Path'] = new_path
def new(cls,
        service_config: Optional[ServiceConfigLike] = None,
        stores_config: Optional[DataStorePoolLike] = None,
        raise_on_error: bool = False,
        verbosity: int = 0,
        **kwargs) -> 'CubeGenerator':
    """
    Create a new cube generator from given configurations.

    If *service_config* is given, it describes a remote xcube generator
    service, otherwise a local cube generator is configured using the
    optional *stores_config*.

    The *service_config* parameter can be passed in different ways:

    * An instance of :class:`ServiceConfig`.
    * A ``str``. Then it is interpreted as a path to a YAML or JSON file
      and the service configuration is loaded from this file. The file
      content may include template variables that are interpolated by
      environment variables, e.g. "${XCUBE_GEN_CLIENT_SECRET}".
    * A ``dict``. Then it is interpreted as a service configuration
      JSON object.

    If *stores_config* is given, it describes a pool of data stores to
    be used as input and output for the cube generator.
    *stores_config* is a mapping of store instance identifiers to
    configured store instances. A store instance is a dictionary that
    has a mandatory "store_id" property, which is the name of a
    registered xcube data store, as well as an optional "store_params"
    property that may define data store specific parameters.

    Similar to *service_config*, the *stores_config* parameter can be
    passed in different ways:

    * An instance of :class:`DataStorePool`.
    * A ``str``. Then it is interpreted as a YAML or JSON file path and
      the stores configuration is loaded from this file.
    * A ``dict``. Then it is interpreted as a stores configuration
      JSON object.

    The *service_config* and *stores_config* parameters cannot both
    be given.

    :param service_config: Service configuration.
    :param stores_config: Data stores configuration.
    :param raise_on_error: Whether to raise a CubeGeneratorError
        exception on generator failures. If False, the default, the
        returned result will have the "status" field set to "error"
        while other fields such as "message", "traceback", "output"
        provide more failure details.
    :param verbosity: Level of verbosity, 0 means off.
    :param kwargs: Extra arguments passed to the generator constructors.
    """
    if service_config is not None:
        from .remote.config import ServiceConfig
        from .remote.generator import RemoteCubeGenerator
        assert_true(stores_config is None,
                    'service_config and stores_config cannot be'
                    ' given at the same time.')
        assert_instance(service_config,
                        (str, dict, ServiceConfig, type(None)),
                        'service_config')
        service_config = ServiceConfig.normalize(service_config) \
            if service_config is not None else None
        return RemoteCubeGenerator(service_config=service_config,
                                   raise_on_error=raise_on_error,
                                   verbosity=verbosity,
                                   **kwargs)
    else:
        from .local.generator import LocalCubeGenerator
        assert_instance(stores_config,
                        (str, dict, DataStorePool, type(None)),
                        'stores_config')
        store_pool = DataStorePool.normalize(stores_config) \
            if stores_config is not None else None
        return LocalCubeGenerator(store_pool=store_pool,
                                  raise_on_error=raise_on_error,
                                  verbosity=verbosity)
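# A minimal usage sketch for CubeGenerator.new() with a local stores
# configuration. The instance names and the local path are made-up
# examples; passing a plain dict (instead of a DataStorePool instance or
# a file path) is one of the accepted forms described in the docstring.
generator = CubeGenerator.new(
    stores_config={
        'input-store': {
            'store_id': 'file',
            'store_params': {'root': '/data/inputs'}   # hypothetical path
        },
        'output-store': {
            'store_id': 'memory'
        }
    },
    verbosity=1
)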
def test_default_constr(self):
    pool = DataStorePool()
    self.assertEqual([], pool.store_instance_ids)
    self.assertEqual([], pool.store_configs)
def test_from_dict_empty(self):
    pool = DataStorePool.from_dict({})
    self.assertIsInstance(pool, DataStorePool)
    self.assertEqual([], pool.store_instance_ids)
    self.assertEqual([], pool.store_configs)
def test_add_remove_store_config(self):
    pool = DataStorePool()
    self.assertEqual([], pool.store_instance_ids)
    pool.add_store_config('mem-1', DataStoreConfig('memory'))
    self.assertEqual(['mem-1'], pool.store_instance_ids)
    pool.add_store_config('mem-2', DataStoreConfig('memory'))
    self.assertEqual(['mem-1', 'mem-2'], pool.store_instance_ids)
    pool.add_store_config('mem-1', DataStoreConfig('memory'))
    self.assertEqual(['mem-1', 'mem-2'], pool.store_instance_ids)
    pool.remove_store_config('mem-1')
    self.assertEqual(['mem-2'], pool.store_instance_ids)
    pool.remove_store_config('mem-2')
    self.assertEqual([], pool.store_instance_ids)
def dump(output_file_path: str,
         config_file_path: Optional[str],
         type_specifier: Optional[str]):
    """
    Dump metadata of given data stores.

    Dumps data store metadata and metadata for a store's data resources
    for given data stores into a JSON file.
    Data stores may be selected and configured by a configuration file
    CONFIG, which may have JSON or YAML format.
    For example, this YAML configuration configures a single directory
    data store:

    \b
    this_dir:
      title: Current Dir
      description: A store that represents my current directory
      store_id: "directory"
      store_params:
        base_dir: "."
    """
    from xcube.core.store import DataStoreConfig
    from xcube.core.store import DataStorePool
    import time

    if config_file_path:
        store_pool = DataStorePool.from_file(config_file_path)
    else:
        extensions = get_extension_registry().find_extensions(
            EXTENSION_POINT_DATA_STORES)
        store_configs = {
            extension.name: DataStoreConfig(
                extension.name,
                title=extension.metadata.get('title'),
                description=extension.metadata.get('description'))
            for extension in extensions
            if extension.name not in ('memory', 'directory', 's3')
        }
        store_pool = DataStorePool(store_configs)

    stores = []
    for store_instance_id in store_pool.store_instance_ids:
        t0 = time.perf_counter()
        print(f'Generating entries for store "{store_instance_id}"...')
        try:
            store_instance = store_pool.get_store(store_instance_id)
        except BaseException as error:
            print(f'error: cannot open store "{store_instance_id}": {error}',
                  file=sys.stderr)
            continue
        try:
            search_result = [
                dsd.to_dict()
                for dsd in store_instance.search_data(
                    type_specifier=type_specifier)
            ]
        except BaseException as error:
            print(f'error: cannot search store "{store_instance_id}": {error}',
                  file=sys.stderr)
            continue
        store_config = store_pool.get_store_config(store_instance_id)
        stores.append(
            dict(store_instance_id=store_instance_id,
                 store_id=store_instance_id,
                 title=store_config.title,
                 description=store_config.description,
                 type_specifier=type_specifier,
                 datasets=search_result))
        print('Done after {:.2f} seconds'.format(time.perf_counter() - t0))

    with open(output_file_path, 'w') as fp:
        json.dump(dict(stores=stores), fp, indent=2)

    print(f'Dumped {len(stores)} store(s) to {output_file_path}.')
def dump(output_file_path: Optional[str],
         config_file_path: Optional[str],
         data_type: Optional[str],
         short_form: bool,
         include_props: str,
         exclude_props: str,
         csv_format: bool,
         yaml_format: bool,
         json_format: bool):
    """
    Dump metadata of given data stores.

    Dumps data store metadata and metadata for a store's data resources
    for given data stores into a JSON file.
    Data stores may be selected and configured by a configuration file
    CONFIG, which may have JSON or YAML format.
    For example, this YAML configuration configures a single directory
    data store:

    \b
    this_dir:
      title: Current Dir
      description: A store that represents my current directory
      store_id: "directory"
      store_params:
        base_dir: "."
    """
    from xcube.core.store import DataStoreConfig
    from xcube.core.store import DataStorePool
    import yaml
    import json
    import os.path

    if csv_format:
        output_format = 'csv'
        ext = '.csv'
    elif yaml_format:
        output_format = 'yaml'
        ext = '.yml'
    elif json_format:
        output_format = 'json'
        ext = '.json'
    elif output_file_path is not None:
        path_no_ext, ext = os.path.splitext(output_file_path)
        if ext in ('.csv', '.txt'):
            output_format = 'csv'
        elif ext in ('.yaml', '.yml'):
            output_format = 'yaml'
        else:
            output_format = 'json'
    else:
        output_format = 'json'
        ext = '.json'

    if output_file_path is None:
        path_no_ext, _ = os.path.splitext(_DEFAULT_DUMP_OUTPUT)
        output_file_path = path_no_ext + ext

    include_props = _parse_props(include_props) if include_props else None
    exclude_props = _parse_props(exclude_props) if exclude_props else None

    if short_form:
        short_include_props = _parse_props(_SHORT_INCLUDE)
        include_props = include_props or {}
        for data_key in ('store', 'data', 'var'):
            include_props[data_key] = include_props.get(
                data_key, set()).union(short_include_props[data_key])

    if config_file_path:
        store_pool = DataStorePool.from_file(config_file_path)
    else:
        extensions = get_extension_registry().find_extensions(
            EXTENSION_POINT_DATA_STORES)
        store_configs = {
            extension.name: DataStoreConfig(
                extension.name,
                title=extension.metadata.get('title'),
                description=extension.metadata.get('description'))
            for extension in extensions
            if extension.name not in ('memory', 'directory', 's3')
        }
        store_pool = DataStorePool(store_configs)

    dump_data = _get_store_data_var_tuples(store_pool,
                                           data_type,
                                           include_props,
                                           exclude_props)

    if output_format == 'csv':
        column_names = None
        column_names_set = None
        rows = []
        for store_dict, data_dict, var_dict in dump_data:
            if store_dict is None:
                break
            row = {}
            row.update({'store.' + k: v for k, v in store_dict.items()})
            row.update({'data.' + k: v for k, v in data_dict.items()})
            row.update({'var.' + k: v for k, v in var_dict.items()})
            rows.append(row)
            if column_names_set is None:
                column_names = list(row.keys())
                column_names_set = set(column_names)
            else:
                for k in row.keys():
                    if k not in column_names_set:
                        column_names.append(k)
                        column_names_set.add(k)

        def format_cell_value(value: Any) -> str:
            return str(value) if value is not None else ''

        sep = '\t'
        with open(output_file_path, 'w') as fp:
            if column_names:
                fp.write(sep.join(column_names) + '\n')
            for row in rows:
                fp.write(
                    sep.join(
                        map(format_cell_value,
                            tuple(row.get(k) for k in column_names)))
                    + '\n')

        print(f'Dumped {len(rows)} store entry/ies to {output_file_path}.')
    else:
        last_store_dict = None
        last_data_dict = None
        vars_list = []
        data_list = []
        store_list = []
        for store_dict, data_dict, var_dict in dump_data:
            if data_dict is not last_data_dict or data_dict is None:
                if last_data_dict is not None:
                    last_data_dict['data_vars'] = vars_list
                    vars_list = []
                    data_list.append(last_data_dict)
                last_data_dict = data_dict
            if store_dict is not last_store_dict or store_dict is None:
                if last_store_dict is not None:
                    last_store_dict['data'] = data_list
                    data_list = []
                    store_list.append(last_store_dict)
                last_store_dict = store_dict
            if var_dict:
                vars_list.append(var_dict)

        with open(output_file_path, 'w') as fp:
            if output_format == 'json':
                json.dump(dict(stores=store_list), fp, indent=2)
            else:
                yaml.dump(dict(stores=store_list), fp, indent=2)

        print(
            f'Dumped entries of {len(store_list)} store(s)'
            f' to {output_file_path}.'
        )
def test_get_store_error(self):
    pool = DataStorePool()
    with self.assertRaises(DataStoreError) as cm:
        pool.get_store('dir-1')
    self.assertEqual('Configured data store instance "dir-1" not found.',
                     f'{cm.exception}')