def test_get_store_instance_id(self):
    store_params_1 = {"root": "./bibo"}
    ds_config_1 = DataStoreConfig(store_id='file',
                                  store_params=store_params_1)
    ds_configs = {'dir-1': ds_config_1}
    pool = DataStorePool(ds_configs)

    store_params_2 = {"root": "./babo"}
    ds_config_2 = DataStoreConfig(store_id='file',
                                  store_params=store_params_2)

    ds_config_3 = DataStoreConfig(store_id='file',
                                  store_params=store_params_1,
                                  title='A third configuration')

    self.assertEqual('dir-1', pool.get_store_instance_id(ds_config_1))
    self.assertEqual('dir-1', pool.get_store_instance_id(ds_config_1,
                                                         strict_check=True))

    self.assertIsNone(pool.get_store_instance_id(ds_config_2))

    # Without strict_check, a config that differs only in title
    # still matches; with strict_check, it does not.
    self.assertEqual('dir-1', pool.get_store_instance_id(ds_config_3))
    self.assertIsNone(pool.get_store_instance_id(ds_config_3,
                                                 strict_check=True))
def test_to_dict(self):
    self.assertEqual({}, DataStorePool().to_dict())
    self.assertEqual(
        {
            'ram': {
                'store_id': 'memory'
            },
            'dir': {
                'store_id': 'file',
                'store_params': {'base_dir': 'bibo'}
            }
        },
        DataStorePool({
            'ram': DataStoreConfig(store_id='memory'),
            'dir': DataStoreConfig(store_id='file',
                                   store_params=dict(base_dir="bibo"))
        }).to_dict())
def test_to_dict(self):
    store_config = DataStoreConfig('directory',
                                   store_params={'base_dir': '.'},
                                   title='Local',
                                   description='Local files')
    self.assertEqual({'description': 'Local files',
                      'title': 'Local',
                      'store_id': 'directory',
                      'store_params': {'base_dir': '.'}},
                     store_config.to_dict())
def test_constructor_asserts(self):
    with self.assertRaises(ValueError) as cm:
        DataStoreConfig('')
    self.assertEqual('store_id must be given', f'{cm.exception}')

    with self.assertRaises(TypeError) as cm:
        # noinspection PyTypeChecker
        DataStoreConfig('directory', store_params=[1, 'B'])
    self.assertEqual("store_params must be an instance of <class 'dict'>",
                     f'{cm.exception}')
def test_from_dict_with_invalid_cost_params(self):
    with self.assertRaises(DataStoreError):
        DataStoreConfig.from_dict({
            'description': 'Local files',
            'title': 'Local',
            'store_id': 'file',
            'store_params': {'root': '.'},
            'cost_params': {
                # Required:
                # 'input_pixels_per_punit': 10,
                # 'output_pixels_per_punit': 20,
            }
        })
def test_add_remove_store_config(self):
    pool = DataStorePool()
    self.assertEqual([], pool.store_instance_ids)
    pool.add_store_config('mem-1', DataStoreConfig('memory'))
    self.assertEqual(['mem-1'], pool.store_instance_ids)
    pool.add_store_config('mem-2', DataStoreConfig('memory'))
    self.assertEqual(['mem-1', 'mem-2'], pool.store_instance_ids)
    # Re-adding an existing instance id replaces its config;
    # it does not duplicate the entry.
    pool.add_store_config('mem-1', DataStoreConfig('memory'))
    self.assertEqual(['mem-1', 'mem-2'], pool.store_instance_ids)
    pool.remove_store_config('mem-1')
    self.assertEqual(['mem-2'], pool.store_instance_ids)
    pool.remove_store_config('mem-2')
    self.assertEqual([], pool.store_instance_ids)
def test_get_data_store_instance_from_pool(self):
    pool = DataStorePool({
        'dir': DataStoreConfig('directory', store_params=dict(base_dir='.'))
    })
    instance = get_data_store_instance('@dir', store_pool=pool)
    self.assertIsInstance(instance.store, DirectoryDataStore)
    instance2 = get_data_store_instance('@dir', store_pool=pool)
    self.assertIs(instance, instance2)
def test_constructor_and_instance_props(self):
    store_config = DataStoreConfig('file',
                                   store_params={'root': '.'},
                                   title='Local',
                                   description='Local files')
    self.assertEqual('file', store_config.store_id)
    self.assertEqual({'root': '.'}, store_config.store_params)
    self.assertEqual('Local', store_config.title)
    self.assertEqual('Local files', store_config.description)
def test_from_dict(self):
    store_config = DataStoreConfig.from_dict({
        'description': 'Local files',
        'title': 'Local',
        'store_id': 'file',
        'store_params': {'root': '.'}
    })
    self.assertIsInstance(store_config, DataStoreConfig)
    self.assertEqual('file', store_config.store_id)
    self.assertEqual({'root': '.'}, store_config.store_params)
    self.assertEqual('Local', store_config.title)
    self.assertEqual('Local files', store_config.description)
def test_get_data_store_instance_from_pool_with_params(self):
    pool = DataStorePool({
        '@dir': DataStoreConfig('directory', store_params=dict(base_dir='.'))
    })
    with self.assertRaises(ValueError) as cm:
        get_data_store_instance('@dir', store_pool=pool,
                                store_params={'thres': 5})
    self.assertEqual('store_params cannot be given,'
                     ' with store_id ("@dir") referring'
                     ' to a configured store',
                     f'{cm.exception}')
def test_get_data_store_instance_from_pool(self):
    pool = DataStorePool({
        'dir': DataStoreConfig('file', store_params=dict(root='.'))
    })
    instance = get_data_store_instance('@dir', store_pool=pool)
    self.assertTrue(hasattr(instance.store, 'root'))
    # noinspection PyUnresolvedReferences
    self.assertTrue(os.path.isabs(instance.store.root))
    self.assertTrue(os.path.isdir(instance.store.root))
    instance2 = get_data_store_instance('@dir', store_pool=pool)
    self.assertIs(instance, instance2)
def test_get_data_store_instance_from_pool_with_params(self):
    pool = DataStorePool({
        '@dir': DataStoreConfig('file', store_params=dict(root='.'))
    })
    with self.assertRaises(ValueError) as cm:
        get_data_store_instance('@dir', store_pool=pool,
                                store_params={'auto_mkdir': True})
    self.assertEqual('store_params cannot be given,'
                     ' with store_id ("@dir") referring'
                     ' to a configured store',
                     f'{cm.exception}')
def test_from_dict_with_valid_cost_params(self):
    store_config = DataStoreConfig.from_dict({
        'description': 'Local files',
        'title': 'Local',
        'store_id': 'file',
        'store_params': {'root': '.'},
        'cost_params': {
            'input_pixels_per_punit': 500,
            'output_pixels_per_punit': 100,
            'input_punits_weight': 1.1,
        }
    })
    self.assertIsInstance(store_config, DataStoreConfig)
    self.assertEqual('file', store_config.store_id)
    self.assertEqual({'root': '.'}, store_config.store_params)
    self.assertEqual('Local', store_config.title)
    self.assertEqual('Local files', store_config.description)
def get_data_store_pool(self) -> Optional[DataStorePool]:
    data_store_configs = self._config.get('DataStores', [])
    if not data_store_configs or self._data_store_pool:
        return self._data_store_pool
    if not isinstance(data_store_configs, list):
        raise ServiceConfigError('DataStores must be a list')
    store_configs: Dict[str, DataStoreConfig] = {}
    for data_store_config_dict in data_store_configs:
        store_instance_id = data_store_config_dict.get('Identifier')
        store_id = data_store_config_dict.get('StoreId')
        store_params = data_store_config_dict.get('StoreParams', {})
        dataset_configs = data_store_config_dict.get('Datasets')
        store_config = DataStoreConfig(store_id,
                                       store_params=store_params,
                                       user_data=dataset_configs)
        store_configs[store_instance_id] = store_config
    self._data_store_pool = DataStorePool(store_configs)
    return self._data_store_pool
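# For orientation, a minimal sketch of the 'DataStores' configuration
# shape that get_data_store_pool() above consumes. The keys
# ('Identifier', 'StoreId', 'StoreParams', 'Datasets') are exactly the
# ones read in the method; the name _EXAMPLE_CONFIG and the store id
# and parameter values are illustrative only, not part of the service
# code.
_EXAMPLE_CONFIG = {
    'DataStores': [
        {
            'Identifier': 'local',          # becomes the store instance id
            'StoreId': 'file',              # data store to instantiate
            'StoreParams': {'root': '.'},   # passed through as store_params
            'Datasets': [                   # kept as DataStoreConfig user_data
                {'Path': 'cube.zarr'},
            ],
        },
    ],
}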
def test_normalize(self):
    pool = DataStorePool({
        '@dir': DataStoreConfig('directory', store_params=dict(root='.'))
    })
    file_path = '_test-data-stores-pool.json'
    with open(file_path, 'w') as fp:
        json.dump(pool.to_dict(), fp)
    try:
        pool_1 = DataStorePool.normalize(file_path)
        self.assertIsInstance(pool_1, DataStorePool)
        pool_2 = DataStorePool.normalize(pool_1)
        self.assertIs(pool_2, pool_1)
        pool_3 = DataStorePool.normalize(pool_2.to_dict())
        self.assertIsInstance(pool_3, DataStorePool)
    finally:
        os.remove(file_path)
    with self.assertRaises(TypeError):
        # noinspection PyTypeChecker
        DataStorePool.normalize(42)
def dump(output_file_path: Optional[str],
         config_file_path: Optional[str],
         data_type: Optional[str],
         short_form: bool,
         include_props: str,
         exclude_props: str,
         csv_format: bool,
         yaml_format: bool,
         json_format: bool):
    """
    Dump metadata of given data stores.

    Dumps data store metadata and metadata for a store's data resources
    for given data stores into a JSON file.
    Data stores may be selected and configured by a configuration file
    CONFIG, which may have JSON or YAML format.
    For example, this YAML configuration configures a single directory
    data store:

    \b
    this_dir:
        title: Current Dir
        description: A store that represents my current directory
        store_id: "directory"
        store_params:
            base_dir: "."
    """
    from xcube.core.store import DataStoreConfig
    from xcube.core.store import DataStorePool
    import yaml
    import json
    import os.path

    if csv_format:
        output_format = 'csv'
        ext = '.csv'
    elif yaml_format:
        output_format = 'yaml'
        ext = '.yml'
    elif json_format:
        output_format = 'json'
        ext = '.json'
    elif output_file_path is not None:
        path_no_ext, ext = os.path.splitext(output_file_path)
        if ext in ('.csv', '.txt'):
            output_format = 'csv'
        elif ext in ('.yaml', '.yml'):
            output_format = 'yaml'
        else:
            output_format = 'json'
    else:
        output_format = 'json'
        ext = '.json'

    if output_file_path is None:
        path_no_ext, _ = os.path.splitext(_DEFAULT_DUMP_OUTPUT)
        output_file_path = path_no_ext + ext

    include_props = _parse_props(include_props) if include_props else None
    exclude_props = _parse_props(exclude_props) if exclude_props else None

    if short_form:
        short_include_props = _parse_props(_SHORT_INCLUDE)
        include_props = include_props or {}
        for data_key in ('store', 'data', 'var'):
            include_props[data_key] = include_props.get(
                data_key, set()).union(short_include_props[data_key])

    if config_file_path:
        store_pool = DataStorePool.from_file(config_file_path)
    else:
        extensions = get_extension_registry().find_extensions(
            EXTENSION_POINT_DATA_STORES)
        store_configs = {
            extension.name: DataStoreConfig(
                extension.name,
                title=extension.metadata.get('title'),
                description=extension.metadata.get('description'))
            for extension in extensions
            if extension.name not in ('memory', 'directory', 's3')
        }
        store_pool = DataStorePool(store_configs)

    dump_data = _get_store_data_var_tuples(
        store_pool, data_type, include_props, exclude_props)

    if output_format == 'csv':
        column_names = None
        column_names_set = None
        rows = []
        for store_dict, data_dict, var_dict in dump_data:
            if store_dict is None:
                break
            row = {}
            row.update({'store.' + k: v for k, v in store_dict.items()})
            row.update({'data.' + k: v for k, v in data_dict.items()})
            row.update({'var.' + k: v for k, v in var_dict.items()})
            rows.append(row)
            if column_names_set is None:
                column_names = list(row.keys())
                column_names_set = set(column_names)
            else:
                for k in row.keys():
                    if k not in column_names_set:
                        column_names.append(k)
                        column_names_set.add(k)

        def format_cell_value(value: Any) -> str:
            return str(value) if value is not None else ''

        sep = '\t'
        with open(output_file_path, 'w') as fp:
            if column_names:
                fp.write(sep.join(column_names) + '\n')
                for row in rows:
                    fp.write(sep.join(map(format_cell_value,
                                          tuple(row.get(k)
                                                for k in column_names)))
                             + '\n')
        print(f'Dumped {len(rows)} store entry/ies to {output_file_path}.')
    else:
        last_store_dict = None
        last_data_dict = None
        vars_list = []
        data_list = []
        store_list = []
        for store_dict, data_dict, var_dict in dump_data:
            if data_dict is not last_data_dict or data_dict is None:
                if last_data_dict is not None:
                    last_data_dict['data_vars'] = vars_list
                    vars_list = []
                    data_list.append(last_data_dict)
                last_data_dict = data_dict
            if store_dict is not last_store_dict or store_dict is None:
                if last_store_dict is not None:
                    last_store_dict['data'] = data_list
                    data_list = []
                    store_list.append(last_store_dict)
                last_store_dict = store_dict
            if var_dict:
                vars_list.append(var_dict)
        with open(output_file_path, 'w') as fp:
            if output_format == 'json':
                json.dump(dict(stores=store_list), fp, indent=2)
            else:
                yaml.dump(dict(stores=store_list), fp, indent=2)
        print(f'Dumped entries of {len(store_list)} store(s)'
              f' to {output_file_path}.')
def dump(output_file_path: str,
         config_file_path: Optional[str],
         type_specifier: Optional[str]):
    """
    Dump metadata of given data stores.

    Dumps data store metadata and metadata for a store's data resources
    for given data stores into a JSON file.
    Data stores may be selected and configured by a configuration file
    CONFIG, which may have JSON or YAML format.
    For example, this YAML configuration configures a single directory
    data store:

    \b
    this_dir:
        title: Current Dir
        description: A store that represents my current directory
        store_id: "directory"
        store_params:
            base_dir: "."
    """
    from xcube.core.store import DataStoreConfig
    from xcube.core.store import DataStorePool
    import json
    import sys
    import time

    if config_file_path:
        store_pool = DataStorePool.from_file(config_file_path)
    else:
        extensions = get_extension_registry().find_extensions(
            EXTENSION_POINT_DATA_STORES)
        store_configs = {
            extension.name: DataStoreConfig(
                extension.name,
                title=extension.metadata.get('title'),
                description=extension.metadata.get('description'))
            for extension in extensions
            if extension.name not in ('memory', 'directory', 's3')
        }
        store_pool = DataStorePool(store_configs)

    stores = []
    for store_instance_id in store_pool.store_instance_ids:
        t0 = time.perf_counter()
        print(f'Generating entries for store "{store_instance_id}"...')
        try:
            store_instance = store_pool.get_store(store_instance_id)
        except BaseException as error:
            print(f'error: cannot open store "{store_instance_id}": {error}',
                  file=sys.stderr)
            continue

        try:
            search_result = [
                dsd.to_dict()
                for dsd in store_instance.search_data(
                    type_specifier=type_specifier)
            ]
        except BaseException as error:
            print(f'error: cannot search store'
                  f' "{store_instance_id}": {error}',
                  file=sys.stderr)
            continue

        store_config = store_pool.get_store_config(store_instance_id)
        stores.append(dict(store_instance_id=store_instance_id,
                           store_id=store_instance_id,
                           title=store_config.title,
                           description=store_config.description,
                           type_specifier=type_specifier,
                           datasets=search_result))
        print('Done after {:.2f} seconds'.format(time.perf_counter() - t0))

    with open(output_file_path, 'w') as fp:
        json.dump(dict(stores=stores), fp, indent=2)

    print(f'Dumped {len(stores)} store(s) to {output_file_path}.')
def _maybe_assign_store_instance_ids(self):
    assignable_dataset_configs = [
        dc for dc in self._dataset_configs
        if 'StoreInstanceId' not in dc
        and dc.get('FileSystem', 'file') in NON_MEMORY_FILE_SYSTEMS
    ]
    # Split into sublists according to file system
    # and non-root store params
    config_lists = []
    for config in assignable_dataset_configs:
        store_params = self._get_other_store_params_than_root(config)
        file_system = config.get('FileSystem', 'file')
        appended = False
        for config_list in config_lists:
            if config_list[0] == file_system \
                    and config_list[1] == store_params:
                config_list[2].append(config)
                appended = True
                break
        if not appended:
            config_lists.append((file_system, store_params, [config]))

    data_store_pool = self.get_data_store_pool()
    if not data_store_pool:
        data_store_pool = self._data_store_pool = DataStorePool()

    for file_system, store_params, config_list in config_lists:
        # Retrieve paths per configuration
        paths = [dc['Path'] for dc in config_list]
        paths.sort()
        # Determine common prefixes of paths (and call them roots)
        prefixes = _get_common_prefixes(paths)
        if len(prefixes) < 1:
            roots = ['']
        else:
            # Perform a further step to merge prefixes
            # with the same start
            prefixes = list(set(prefixes))
            prefixes.sort()
            roots = []
            root_candidate = prefixes[0]
            for root in prefixes[1:]:
                common_root = os.path.commonprefix([root_candidate, root])
                if _is_not_empty(common_root):
                    root_candidate = common_root
                else:
                    roots.append(root_candidate)
                    root_candidate = root
            roots.append(root_candidate)
        for root in roots:
            # Ensure root does not end with a full or partial
            # directory or file name
            while not root.endswith("/") and not root.endswith("\\") \
                    and len(root) > 0:
                root = root[:-1]
            if root.endswith("/") or root.endswith("\\"):
                root = root[:-1]
            abs_root = root
            # For local file systems:
            # determine absolute root from base dir
            fs_protocol = FS_TYPE_TO_PROTOCOL.get(file_system, file_system)
            if fs_protocol == 'file' and not os.path.isabs(abs_root):
                abs_root = os.path.join(self._base_dir, abs_root)
                abs_root = os.path.normpath(abs_root)
            store_params_for_root = store_params.copy()
            store_params_for_root['root'] = abs_root
            # See if there already is a store with this configuration
            data_store_config = DataStoreConfig(
                store_id=fs_protocol,
                store_params=store_params_for_root)
            store_instance_id = data_store_pool.get_store_instance_id(
                data_store_config)
            if not store_instance_id:
                # Create new store with new unique store instance id
                counter = 1
                while data_store_pool.has_store_instance(
                        f'{fs_protocol}_{counter}'):
                    counter += 1
                store_instance_id = f'{fs_protocol}_{counter}'
                data_store_pool.add_store_config(store_instance_id,
                                                 data_store_config)
            for config in config_list:
                if config['Path'].startswith(root):
                    config['StoreInstanceId'] = store_instance_id
                    new_path = config['Path'][len(root):]
                    while new_path.startswith("/") \
                            or new_path.startswith("\\"):
                        new_path = new_path[1:]
                    config['Path'] = new_path
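# A minimal, self-contained sketch of the prefix-merging step in
# _maybe_assign_store_instance_ids() above: prefixes that share a
# non-empty common start collapse into a single root candidate, while
# all others start a new root. The helper name and the input list are
# illustrative only; in the method, prefixes come from
# _get_common_prefixes(paths), and _is_not_empty() plays the role of
# the truthiness test used here.
import os.path
from typing import List


def merge_prefixes_into_roots(prefixes: List[str]) -> List[str]:
    prefixes = sorted(set(prefixes))
    roots = []
    root_candidate = prefixes[0]
    for prefix in prefixes[1:]:
        common_root = os.path.commonprefix([root_candidate, prefix])
        if common_root:  # the method uses _is_not_empty(common_root)
            root_candidate = common_root
        else:
            roots.append(root_candidate)
            root_candidate = prefix
    roots.append(root_candidate)
    return roots


# 'data/a/' and 'data/b/' collapse into 'data/'; 'other/' stays separate.
# Trailing partial directory or file names in a merged root are later
# stripped back to a directory boundary by the method above.
print(merge_prefixes_into_roots(['data/a/', 'data/b/', 'other/']))
# -> ['data/', 'other/']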