def get_type_specifiers_for_data(self, data_id: str) -> Tuple[str, ...]:
    if self.has_data(data_id, type_specifier=str(TYPE_SPECIFIER_CUBE)):
        return str(TYPE_SPECIFIER_DATASET), str(TYPE_SPECIFIER_CUBE)
    if self.has_data(data_id, type_specifier=str(TYPE_SPECIFIER_DATASET)):
        return str(TYPE_SPECIFIER_DATASET),
    raise DataStoreError(
        f'Data resource "{data_id}" does not exist in store')
def delete_data(self, data_id: str):
    path = self._resolve_data_id_to_path(data_id)
    try:
        self._s3.delete(path, recursive=True)
        self.deregister_data(data_id)
    except ValueError as e:
        raise DataStoreError(f'{e}') from e
def write_data(self,
               data: Any,
               data_id: str = None,
               writer_id: str = None,
               replace: bool = False,
               **write_params) -> str:
    assert_instance(data, (xr.Dataset, MultiLevelDataset))
    if not writer_id:
        if isinstance(data, MultiLevelDataset):
            predicate = get_data_accessor_predicate(
                type_specifier=TYPE_SPECIFIER_MULTILEVEL_DATASET,
                format_id='levels',
                storage_id=_STORAGE_ID)
        elif isinstance(data, xr.Dataset):
            predicate = get_data_accessor_predicate(
                type_specifier=TYPE_SPECIFIER_DATASET,
                format_id='zarr',
                storage_id=_STORAGE_ID)
        else:
            raise DataStoreError(f'Unsupported data type "{type(data)}"')
        extensions = find_data_writer_extensions(predicate=predicate)
        writer_id = extensions[0].name
    data_id = self._ensure_valid_data_id(data_id, data)
    path = self._resolve_data_id_to_path(data_id)
    self._new_s3_writer(writer_id).write_data(data,
                                              data_id=path,
                                              replace=replace,
                                              **write_params)
    self.register_data(data_id, data)
    return data_id
def describe_data(self, data_id: str, type_specifier: str = None) \
        -> DataDescriptor:
    self._assert_valid_data_id(data_id)
    actual_type_specifier = self._get_type_specifier_for_data_id(data_id)
    if actual_type_specifier is not None:
        if type_specifier is None or actual_type_specifier.satisfies(
                type_specifier):
            data = self.open_data(data_id)
            return new_data_descriptor(data_id, data, require=True)
        else:
            raise DataStoreError(
                f'Type specifier "{type_specifier}" cannot be satisfied'
                f' by type specifier "{actual_type_specifier}"'
                f' of data resource "{data_id}"')
    else:
        raise DataStoreError(f'Data resource "{data_id}" not found')
def describe_data(self, data_id: str) -> DatasetDescriptor:
    self._assert_valid_data_id(data_id)
    try:
        ds_metadata = self._cci_odp.get_dataset_metadata(data_id)
        return self._get_data_descriptor_from_metadata(data_id, ds_metadata)
    except ValueError:
        raise DataStoreError(
            f'Cannot describe metadata. "{data_id}" does not seem to be'
            f' a valid identifier.')
def _get_accessor_id_parts(cls, data_id: str, require=True) \
        -> Optional[Tuple[str, str, str]]:
    assert_given(data_id, 'data_id')
    _, ext = os.path.splitext(data_id)
    accessor_id_parts = _FILENAME_EXT_TO_ACCESSOR_ID_PARTS.get(ext)
    if not accessor_id_parts and require:
        raise DataStoreError(f'A dataset named "{data_id}" is not supported')
    return accessor_id_parts
def get_data_opener_ids(self,
                        data_id: str = None,
                        type_specifier: str = None) -> Tuple[str, ...]:
    self._assert_valid_type_specifier(type_specifier)
    if data_id is not None and not self.has_data(data_id):
        raise DataStoreError(f'Data Resource "{data_id}" is not available.')
    may_be_cube = data_id is None or self.has_data(data_id,
                                                   str(TYPE_SPECIFIER_CUBE))
    if type_specifier:
        if TYPE_SPECIFIER_CUBE.is_satisfied_by(type_specifier):
            if not may_be_cube:
                raise DataStoreError(
                    f'Data Resource "{data_id}" is not available '
                    f'as specified type "{type_specifier}".')
            return CUBE_OPENER_ID,
    if may_be_cube:
        return DATASET_OPENER_ID, CUBE_OPENER_ID
    return DATASET_OPENER_ID,
def describe_data(self, data_id: str, type_specifier: str = None) \
        -> DataDescriptor:
    self._assert_valid_data_id(data_id)
    if type_specifier is not None:
        data_type_specifier = get_type_specifier(self._data_dict[data_id])
        if data_type_specifier is None or \
                not data_type_specifier.satisfies(type_specifier):
            raise DataStoreError(
                f'Type specifier "{type_specifier}" cannot be satisfied'
                f' by type specifier "{data_type_specifier}"'
                f' of data resource "{data_id}"')
    return new_data_descriptor(data_id, self._data_dict[data_id])
def write_data(self,
               data: Any,
               data_id: str = None,
               writer_id: str = None,
               replace: bool = False,
               **write_params) -> str:
    self._assert_empty_params(write_params, 'write_params')
    data_id = self._ensure_valid_data_id(data_id)
    if data_id in self._data_dict and not replace:
        raise DataStoreError(
            f'Data resource "{data_id}" already exists in store')
    self._data_dict[data_id] = data
    return data_id
def open_data(self, data_id: str, **open_params) -> xr.Dataset:
    s3 = self._s3
    if s3 is None:
        s3, open_params = self.consume_s3fs_params(open_params)
    bucket_name, open_params = self.consume_bucket_name_param(open_params)
    try:
        return xr.open_zarr(
            s3fs.S3Map(root=f'{bucket_name}/{data_id}'
                       if bucket_name else data_id,
                       s3=s3,
                       check=False),
            **open_params)
    except ValueError as e:
        raise DataStoreError(f'{e}') from e
def _get_dataset_and_collection_metadata(self, data_id: str) \
        -> Tuple[Dict[str, Any], Optional[Dict[str, Any]]]:
    dataset_metadata = SentinelHubMetadata().datasets.get(data_id)
    if dataset_metadata is None:
        raise DataStoreError(f'Dataset "{data_id}" not found.')
    if self._sentinel_hub is not None:
        # If we are connected to the API,
        # we may also have collection metadata
        collection_name = dataset_metadata.get('collection_name')
        if collection_name is not None:
            for collection_metadata in self._sentinel_hub.collections():
                if collection_name == collection_metadata.get('id'):
                    return dataset_metadata, collection_metadata
    return dataset_metadata, None
def test_get_filename_ext(self):
    import xarray as xr
    import geopandas as gpd
    from xcube.core.mldataset import BaseMultiLevelDataset

    dataset = xr.Dataset()
    self.assertEqual('.zarr', self.store._get_filename_ext(dataset))
    frame = gpd.GeoDataFrame()
    self.assertEqual('.geojson', self.store._get_filename_ext(frame))
    mldataset = BaseMultiLevelDataset(base_dataset=dataset)
    self.assertEqual('.levels', self.store._get_filename_ext(mldataset))
    self.assertIsNone(self.store._get_filename_ext(None))
    self.assertIsNone(
        self.store._get_filename_ext(DataStoreError('A nonsense object')))
def _get_opener(self, opener_id: str = None,
                type_specifier: str = None) -> CciOdpDataOpener:
    self._assert_valid_opener_id(opener_id)
    self._assert_valid_type_specifier(type_specifier)
    if type_specifier:
        if TYPE_SPECIFIER_CUBE.is_satisfied_by(type_specifier):
            type_opener_id = CUBE_OPENER_ID
        else:
            type_opener_id = DATASET_OPENER_ID
        if opener_id and opener_id != type_opener_id:
            raise DataStoreError(
                f'Invalid combination of opener_id "{opener_id}" '
                f'and type_specifier "{type_specifier}"')
        opener_id = type_opener_id
    if opener_id == CUBE_OPENER_ID:
        return self._cube_opener
    return self._dataset_opener
def open_cubes(input_configs: Sequence[InputConfig],
               cube_config: CubeConfig,
               store_pool: DataStorePool = None):
    cubes = []
    all_cube_params = cube_config.to_dict()
    with observe_progress('Opening input(s)', len(input_configs)) as progress:
        for input_config in input_configs:
            open_params = {}
            opener_id = input_config.opener_id
            if input_config.store_id:
                store_instance = get_data_store_instance(
                    input_config.store_id,
                    store_params=input_config.store_params,
                    store_pool=store_pool)
                store = store_instance.store
                if opener_id is None:
                    opener_ids = store.get_data_opener_ids(
                        data_id=input_config.data_id,
                        type_specifier=TYPE_SPECIFIER_CUBE)
                    if not opener_ids:
                        raise DataStoreError(
                            f'Data store "{input_config.store_id}"'
                            f' does not support data cubes')
                    opener_id = opener_ids[0]
                opener = store
                open_params.update(opener_id=opener_id,
                                   **input_config.open_params)
            else:
                opener = new_data_opener(opener_id)
                open_params.update(**input_config.store_params,
                                   **input_config.open_params)
            open_params_schema = opener.get_open_data_params_schema(
                input_config.data_id)
            cube_params = {k: v
                           for k, v in all_cube_params.items()
                           if k in open_params_schema.properties}
            cube = opener.open_data(input_config.data_id,
                                    **open_params,
                                    **cube_params)
            cubes.append(cube)
            progress.worked(1)
    return cubes
def _get_accessor_extensions(self, data_id: str, get_data_accessor_extensions,
                             require=True) -> List[Extension]:
    accessor_id_parts = self._get_accessor_id_parts(data_id, require=require)
    if not accessor_id_parts:
        return []
    type_specifier, format_id, storage_id = accessor_id_parts
    predicate = get_data_accessor_predicate(type_specifier=type_specifier,
                                            format_id=format_id,
                                            storage_id=storage_id)
    extensions = get_data_accessor_extensions(predicate)
    if not extensions:
        if require:
            raise DataStoreError(
                f'No accessor found for data resource "{data_id}"')
        return []
    return extensions
def write_data(self, data: xr.Dataset, data_id: str, replace=False,
               **write_params):
    assert_instance(data, xr.Dataset, 'data')
    s3 = self._s3
    if s3 is None:
        s3, write_params = self.consume_s3fs_params(write_params)
    bucket_name, write_params = self.consume_bucket_name_param(write_params)
    try:
        data.to_zarr(s3fs.S3Map(root=f'{bucket_name}/{data_id}'
                                if bucket_name else data_id,
                                s3=s3,
                                check=False),
                     mode='w' if replace else None,
                     **write_params)
    except ValueError as e:
        raise DataStoreError(f'{e}') from e
def search_data(self, type_specifier: str = None, **search_params) \
        -> Iterator[DataDescriptor]:
    """
    Search the data store.

    The default implementation returns all data resources that
    may be filtered using the optional *type_specifier*.

    :param type_specifier: Type specifier to filter returned data resources.
    :param search_params: Not supported (yet)
    :return: an iterator of :class:`DataDescriptor` instances
    """
    if search_params:
        raise DataStoreError(
            f'Unsupported search parameters:'
            f' {", ".join(search_params.keys())}')
    for data_id, _ in self.get_data_ids(type_specifier=type_specifier,
                                        include_titles=False):
        yield self.describe_data(data_id)
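# A minimal usage sketch for the search_data() default implementation shown
# above, not part of the original examples. Assumptions: `store` is any
# xcube-style data store instance, DataStoreError is importable from
# xcube.core.store, DataDescriptor exposes a `data_id` attribute, and the
# helper name `list_dataset_ids` plus the 'dataset' type specifier are
# illustrative only.
from typing import List

from xcube.core.store import DataStoreError


def list_dataset_ids(store, type_specifier: str = 'dataset') -> List[str]:
    """Collect the identifiers of data resources matching *type_specifier*."""
    try:
        return [descriptor.data_id
                for descriptor in store.search_data(
                    type_specifier=type_specifier)]
    except DataStoreError as error:
        # search_data() raises DataStoreError for unsupported search
        # parameters or other store-specific failures.
        raise RuntimeError(f'Search failed: {error}') from error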
def delete_data(self, data_id: str):
    if not os.path.exists(data_id):
        raise DataStoreError(f'A dataset named "{data_id}" does not exist')
    rimraf(data_id)
def _assert_empty_params(self, params: Optional[Mapping[str, Any]],
                         name: str):
    if params:
        param_names = ', '.join(map(lambda k: f'"{k}"', params.keys()))
        raise DataStoreError(f'Unsupported {name} {param_names}')
def _assert_valid_opener_id(self, opener_id):
    if opener_id is not None \
            and opener_id != DATASET_OPENER_ID \
            and opener_id != CUBE_OPENER_ID:
        raise DataStoreError(
            f'Data opener identifier must be'
            f' "{DATASET_OPENER_ID}" or "{CUBE_OPENER_ID}",'
            f' but got "{opener_id}"')
def get_type_specifiers_for_data(self, data_id: str) -> Tuple[str, ...]:
    if not self.has_data(data_id):
        raise DataStoreError(
            f'"{data_id}" is not provided by this data store')
    data_type_specifier, _, _ = self._get_accessor_id_parts(data_id)
    return data_type_specifier,
def _assert_not_closed(self):
    if self._s3 is None:
        raise DataStoreError('Data store already closed.')
def _assert_valid_data_id(self, data_id):
    if not self.has_data(data_id):
        raise DataStoreError(
            f'Data resource "{data_id}" does not exist in store')
def _assert_valid_data_id(self, data_id: str):
    if data_id not in self.dataset_names:
        raise DataStoreError(
            f'Cannot describe metadata of data resource "{data_id}", '
            f'as it cannot be accessed by data accessor "{self._id}".')
def _assert_valid_data_id(self, data_id):
    assert_given(data_id, 'data_id')
    if data_id not in self._data_dict:
        raise DataStoreError(
            f'Data resource "{data_id}" does not exist in store')
def _validate_type_specifier(type_specifier: Union[str, TypeSpecifier]):
    if not CDSDataStore._is_type_specifier_satisfied(type_specifier):
        raise DataStoreError(
            f'Supplied type specifier "{type_specifier}" is not compatible'
            f' with "{TYPE_SPECIFIER_CUBE}".')
def _assert_valid_type_specifier(cls, type_specifier):
    if not cls._is_valid_type_specifier(type_specifier):
        raise DataStoreError(
            f'Type Specifier must be "{TYPE_SPECIFIER_DATASET}"'
            f' or "{TYPE_SPECIFIER_CUBE}", but got "{type_specifier}"')
def _assert_valid_opener_id(opener_id):
    if opener_id is not None and opener_id != CDS_DATA_OPENER_ID:
        raise DataStoreError(
            f'Data opener identifier must be "{CDS_DATA_OPENER_ID}",'
            f' but got "{opener_id}"')