def open_ml_dataset_from_local_fs(
        ctx: ServiceContext,
        dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
    ds_id = dataset_descriptor.get('Identifier')

    path = dataset_descriptor.get('Path')
    if not path:
        raise ServiceConfigError(
            f"Missing 'path' entry in dataset descriptor {ds_id}")
    if not os.path.isabs(path):
        path = os.path.join(ctx.base_dir, path)

    data_format = dataset_descriptor.get('Format', guess_cube_format(path))

    if data_format == FORMAT_NAME_NETCDF4:
        with measure_time(tag=f"opened local NetCDF dataset {path}"):
            ds = assert_cube(xr.open_dataset(path))
            return BaseMultiLevelDataset(ds)

    if data_format == FORMAT_NAME_ZARR:
        with measure_time(tag=f"opened local zarr dataset {path}"):
            ds = assert_cube(xr.open_zarr(path))
            return BaseMultiLevelDataset(ds)

    if data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened local levels dataset {path}"):
            return FileStorageMultiLevelDataset(path)

    raise ServiceConfigError(
        f"Illegal data format {data_format!r} for dataset {ds_id}")
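# Example (illustrative sketch, not taken from the source): a 'local' dataset
# descriptor as open_ml_dataset_from_local_fs() above would expect it. The keys
# 'Identifier', 'FileSystem', 'Path' and 'Format' appear in the code above; the
# concrete values are made-up placeholders.
example_local_dataset_descriptor = {
    'Identifier': 'demo',
    'FileSystem': 'local',
    # Relative paths are resolved against ctx.base_dir (see above).
    'Path': 'cube.zarr',
    # Optional; if omitted, the format is guessed from the path.
    'Format': 'zarr',
}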
def open_ml_dataset_from_python_code(
        ctx: ServiceContext,
        dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
    ds_id = dataset_descriptor.get('Identifier')

    path = dataset_descriptor.get('Path')
    if not path:
        raise ServiceConfigError(
            f"Missing 'path' entry in dataset descriptor {ds_id}")
    if not os.path.isabs(path):
        path = os.path.join(ctx.base_dir, path)

    callable_name = dataset_descriptor.get('Function', COMPUTE_DATASET)
    input_dataset_ids = dataset_descriptor.get('InputDatasets', [])
    input_parameters = dataset_descriptor.get('InputParameters', {})

    for input_dataset_id in input_dataset_ids:
        if not ctx.get_dataset_descriptor(input_dataset_id):
            raise ServiceConfigError(
                f"Invalid dataset descriptor {ds_id!r}: "
                f"Input dataset {input_dataset_id!r} of callable {callable_name!r} "
                f"must reference another dataset")

    with measure_time(tag=f"opened memory dataset {path}"):
        return ComputedMultiLevelDataset(ds_id,
                                         path,
                                         callable_name,
                                         input_dataset_ids,
                                         ctx.get_ml_dataset,
                                         input_parameters,
                                         exception_type=ServiceConfigError)
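# Example (sketch): a descriptor for a dataset computed from a user script,
# matching the keys read by open_ml_dataset_from_python_code() above
# ('Path', 'Function', 'InputDatasets', 'InputParameters'). The script name,
# function name and parameter values are hypothetical placeholders.
example_computed_dataset_descriptor = {
    'Identifier': 'demo-computed',
    'FileSystem': 'memory',
    'Path': 'compute_dataset.py',       # resolved against ctx.base_dir
    'Function': 'compute_dataset',      # defaults to COMPUTE_DATASET
    'InputDatasets': ['demo'],          # must reference configured datasets
    'InputParameters': {'factor': 2.0},
}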
def get_chunk_cache_capacity(cls,
                             config: Dict[str, Any],
                             cache_size_key: str) -> Optional[int]:
    cache_size = config.get(cache_size_key, None)
    if not cache_size:
        return None
    elif isinstance(cache_size, str):
        try:
            cache_size = parse_mem_size(cache_size)
        except ValueError:
            raise ServiceConfigError(f'Invalid {cache_size_key}')
    elif not isinstance(cache_size, int) or cache_size < 0:
        raise ServiceConfigError(f'Invalid {cache_size_key}')
    return cache_size
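# Usage sketch for get_chunk_cache_capacity(): the cache size may be given
# either as a non-negative integer (bytes) or as a memory-size string handled
# by parse_mem_size(). The key name 'DatasetChunkCacheSize' and the exact
# string syntax ('256M') are assumptions made for illustration only.
example_cache_configs = [
    {'DatasetChunkCacheSize': 268435456},   # plain byte count
    {'DatasetChunkCacheSize': '256M'},      # string form, parsed by parse_mem_size()
    {},                                     # missing or empty -> None
]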
def from_config(cls, config: Dict[str, Any]) -> Optional['AuthConfig']:
    authentication = config.get('Authentication')
    if not authentication:
        return None
    domain = authentication.get('Domain')
    if not domain:
        raise ServiceConfigError(
            'Missing key "Domain" in section "Authentication"')
    audience = authentication.get('Audience')
    if not audience:
        raise ServiceConfigError(
            'Missing key "Audience" in section "Authentication"')
    algorithms = authentication.get('Algorithms', ['RS256'])
    if not algorithms:
        raise ServiceConfigError(
            'Value for key "Algorithms" in section "Authentication"'
            ' must not be empty')
    return AuthConfig(domain, audience, algorithms)
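# Example (sketch): an 'Authentication' section as from_config() above reads
# it. The keys 'Domain', 'Audience' and 'Algorithms' come from the code above;
# the domain and audience values are placeholders.
example_auth_config = {
    'Authentication': {
        'Domain': 'my-tenant.example-idp.com',
        'Audience': 'https://my-service.example.com/api',
        # Optional; defaults to ['RS256'] when omitted.
        'Algorithms': ['RS256'],
    }
}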
def test_same_base_type(self):
    self.assertIsInstance(ServiceError(''), HTTPError)
    self.assertEqual(500, ServiceError('').status_code)
    self.assertEqual(503, ServiceError('', status_code=503).status_code)

    self.assertIsInstance(ServiceConfigError(''), ServiceError)
    self.assertEqual(500, ServiceConfigError('').status_code)

    self.assertIsInstance(ServiceBadRequestError(''), ServiceError)
    self.assertEqual(400, ServiceBadRequestError('').status_code)

    self.assertIsInstance(ServiceResourceNotFoundError(''), ServiceError)
    self.assertEqual(404, ServiceResourceNotFoundError('').status_code)
def _open_ml_dataset_from_python_code(
        ctx: ServiceContext,
        dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
    ds_id = dataset_descriptor.get('Identifier')
    path = ctx.get_descriptor_path(dataset_descriptor,
                                   f"dataset descriptor {ds_id}")
    callable_name = dataset_descriptor.get('Function', COMPUTE_DATASET)
    input_dataset_ids = dataset_descriptor.get('InputDatasets', [])
    input_parameters = dataset_descriptor.get('InputParameters', {})
    for input_dataset_id in input_dataset_ids:
        if not ctx.get_dataset_descriptor(input_dataset_id):
            raise ServiceConfigError(
                f"Invalid dataset descriptor {ds_id!r}: "
                f"Input dataset {input_dataset_id!r} of callable {callable_name!r} "
                f"must reference another dataset")
    return open_ml_dataset_from_python_code(
        path,
        callable_name=callable_name,
        input_ml_dataset_ids=input_dataset_ids,
        input_ml_dataset_getter=ctx.get_ml_dataset,
        input_parameters=input_parameters,
        ds_id=ds_id,
        exception_type=ServiceConfigError)
def _open_ml_dataset(
        self, dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
    ds_id = dataset_descriptor.get('Identifier')
    fs_type = dataset_descriptor.get('FileSystem', 'local')
    if self._ml_dataset_openers and fs_type in self._ml_dataset_openers:
        ml_dataset_opener = self._ml_dataset_openers[fs_type]
    elif fs_type in _DEFAULT_MULTI_LEVEL_DATASET_OPENERS:
        ml_dataset_opener = _DEFAULT_MULTI_LEVEL_DATASET_OPENERS[fs_type]
    else:
        raise ServiceConfigError(
            f"Invalid fs={fs_type!r} in dataset descriptor {ds_id!r}")
    ml_dataset = ml_dataset_opener(self, dataset_descriptor)
    augmentation = dataset_descriptor.get('Augmentation')
    if augmentation:
        script_path = self.get_descriptor_path(
            augmentation,
            f"'Augmentation' of dataset descriptor {ds_id}")
        input_parameters = augmentation.get('InputParameters')
        callable_name = augmentation.get('Function', COMPUTE_VARIABLES)
        ml_dataset = augment_ml_dataset(ml_dataset,
                                        script_path,
                                        callable_name,
                                        self.get_ml_dataset,
                                        self.set_ml_dataset,
                                        input_parameters=input_parameters,
                                        exception_type=ServiceConfigError)
    return ml_dataset
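# Sketch of how a custom multi-level dataset opener could be registered so
# that _open_ml_dataset() above finds it via the descriptor's 'FileSystem'
# value. The registry shape (fs_type -> callable(ctx, dataset_descriptor)) is
# implied by the lookup code above; the 'custom' key and the opener body are
# hypothetical.
def open_ml_dataset_from_custom_fs(ctx, dataset_descriptor):
    raise NotImplementedError  # placeholder: return a MultiLevelDataset here

example_ml_dataset_openers = {
    'custom': open_ml_dataset_from_custom_fs,
}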
def _open_ml_dataset_from_python_code(
        ctx: ServiceContext,
        dataset_config: DatasetConfigDict) -> MultiLevelDataset:
    ds_id = dataset_config.get('Identifier')
    path = ctx.get_config_path(dataset_config,
                               f"dataset configuration {ds_id}")
    callable_name = dataset_config.get('Function', COMPUTE_DATASET)
    input_dataset_ids = dataset_config.get('InputDatasets', [])
    input_parameters = dataset_config.get('InputParameters', {})
    chunk_cache_capacity = ctx.get_dataset_chunk_cache_capacity(dataset_config)
    if chunk_cache_capacity:
        warnings.warn(
            'chunk cache size is not effective for'
            ' datasets computed from scripts')
    for input_dataset_id in input_dataset_ids:
        if not ctx.get_dataset_config(input_dataset_id):
            raise ServiceConfigError(
                f"Invalid dataset configuration {ds_id!r}: "
                f"Input dataset {input_dataset_id!r} of callable {callable_name!r} "
                f"must reference another dataset")
    return open_ml_dataset_from_python_code(
        path,
        callable_name=callable_name,
        input_ml_dataset_ids=input_dataset_ids,
        input_ml_dataset_getter=ctx.get_ml_dataset,
        input_parameters=input_parameters,
        ds_id=ds_id,
        exception_type=ServiceConfigError)
def get_dataset_descriptor(self, ds_id: str) -> Dict[str, Any]:
    dataset_descriptors = self.get_dataset_descriptors()
    if not dataset_descriptors:
        raise ServiceConfigError("No datasets configured")
    dataset_descriptor = self.find_dataset_descriptor(dataset_descriptors,
                                                      ds_id)
    if dataset_descriptor is None:
        raise ServiceResourceNotFoundError(f'Dataset "{ds_id}" not found')
    return dataset_descriptor
def _open_ml_dataset(self, dataset_config: DatasetConfigDict) \
        -> MultiLevelDataset:
    ds_id: str = dataset_config.get('Identifier')
    store_instance_id = dataset_config.get('StoreInstanceId')
    if store_instance_id:
        data_store_pool = self.get_data_store_pool()
        data_store = data_store_pool.get_store(store_instance_id)
        data_id = dataset_config.get('Path')
        open_params = dataset_config.get('StoreOpenParams') or {}
        # Inject chunk_cache_capacity into open parameters
        chunk_cache_capacity = self.get_dataset_chunk_cache_capacity(
            dataset_config)
        if chunk_cache_capacity \
                and (data_id.endswith('.zarr')
                     or data_id.endswith('.levels')) \
                and 'cache_size' not in open_params:
            open_params['cache_size'] = chunk_cache_capacity
        with self.measure_time(tag=f"opened dataset {ds_id!r}"
                                   f" from data store"
                                   f" {store_instance_id!r}"):
            dataset = data_store.open_data(data_id, **open_params)
        if isinstance(dataset, MultiLevelDataset):
            ml_dataset = dataset
        else:
            cube, _, _ = decode_cube(dataset,
                                     normalize=True,
                                     force_non_empty=True,
                                     force_geographic=True)
            ml_dataset = BaseMultiLevelDataset(cube, ds_id=ds_id)
    else:
        fs_type = dataset_config.get('FileSystem')
        if fs_type != 'memory':
            raise ServiceConfigError(f"Invalid FileSystem {fs_type!r}"
                                     f" in dataset configuration"
                                     f" {ds_id!r}")
        with self.measure_time(tag=f"opened dataset {ds_id!r}"
                                   f" from {fs_type!r}"):
            ml_dataset = _open_ml_dataset_from_python_code(
                self, dataset_config)
    augmentation = dataset_config.get('Augmentation')
    if augmentation:
        script_path = self.get_config_path(
            augmentation,
            f"'Augmentation' of dataset configuration {ds_id}")
        input_parameters = augmentation.get('InputParameters')
        callable_name = augmentation.get('Function', COMPUTE_VARIABLES)
        ml_dataset = augment_ml_dataset(ml_dataset,
                                        script_path,
                                        callable_name,
                                        self.get_ml_dataset,
                                        self.set_ml_dataset,
                                        input_parameters=input_parameters,
                                        exception_type=ServiceConfigError)
    return ml_dataset
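# Example (sketch): a dataset configuration resolved through a data store, as
# handled by the StoreInstanceId branch above. The keys 'StoreInstanceId',
# 'Path' and 'StoreOpenParams' appear in the code above; the store name and
# data id are placeholders. Note that 'cache_size' is only injected for
# '.zarr'/'.levels' data ids when it is not already present in the open params.
example_store_dataset_config = {
    'Identifier': 'demo-from-store',
    'StoreInstanceId': 'my-s3-store',
    'Path': 'cubes/demo.zarr',
    'StoreOpenParams': {
        # opener-specific parameters, passed through to data_store.open_data()
    },
}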
def _open_ml_dataset(
        self, dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
    fs_type = dataset_descriptor.get('FileSystem', 'local')
    if self._ml_dataset_openers and fs_type in self._ml_dataset_openers:
        ml_dataset_opener = self._ml_dataset_openers[fs_type]
    elif fs_type in _DEFAULT_MULTI_LEVEL_DATASET_OPENERS:
        ml_dataset_opener = _DEFAULT_MULTI_LEVEL_DATASET_OPENERS[fs_type]
    else:
        ds_id = dataset_descriptor.get('Identifier')
        raise ServiceConfigError(
            f"Invalid fs={fs_type!r} in dataset descriptor {ds_id!r}")
    return ml_dataset_opener(self, dataset_descriptor)
def get_data_store_pool(self) -> Optional[DataStorePool]:
    data_store_configs = self._config.get('DataStores', [])
    if not data_store_configs or self._data_store_pool:
        return self._data_store_pool
    if not isinstance(data_store_configs, list):
        raise ServiceConfigError('DataStores must be a list')
    store_configs: Dict[str, DataStoreConfig] = {}
    for data_store_config_dict in data_store_configs:
        store_instance_id = data_store_config_dict.get('Identifier')
        store_id = data_store_config_dict.get('StoreId')
        store_params = data_store_config_dict.get('StoreParams', {})
        dataset_configs = data_store_config_dict.get('Datasets')
        store_config = DataStoreConfig(store_id,
                                       store_params=store_params,
                                       user_data=dataset_configs)
        store_configs[store_instance_id] = store_config
    self._data_store_pool = DataStorePool(store_configs)
    return self._data_store_pool
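# Example (sketch): a 'DataStores' list as get_data_store_pool() above parses
# it. The keys 'Identifier', 'StoreId', 'StoreParams' and 'Datasets' come from
# the code above; the store ids and parameter contents are placeholders.
example_data_stores_config = {
    'DataStores': [
        {
            'Identifier': 'my-s3-store',
            'StoreId': 's3',
            'StoreParams': {
                # store-specific parameters, passed through as given
            },
            'Datasets': [
                # per-store dataset configurations, kept as user_data
            ],
        }
    ]
}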
def open_ml_dataset_from_object_storage(
        ctx: ServiceContext,
        dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
    ds_id = dataset_descriptor.get('Identifier')

    path = dataset_descriptor.get('Path')
    if not path:
        raise ServiceConfigError(
            f"Missing 'path' entry in dataset descriptor {ds_id}")

    data_format = dataset_descriptor.get('Format', FORMAT_NAME_ZARR)

    s3_client_kwargs = {}
    if 'Endpoint' in dataset_descriptor:
        s3_client_kwargs['endpoint_url'] = dataset_descriptor['Endpoint']
    if 'Region' in dataset_descriptor:
        s3_client_kwargs['region_name'] = dataset_descriptor['Region']
    obs_file_system = s3fs.S3FileSystem(anon=True,
                                        client_kwargs=s3_client_kwargs)

    if data_format == FORMAT_NAME_ZARR:
        store = s3fs.S3Map(root=path, s3=obs_file_system, check=False)
        cached_store = zarr.LRUStoreCache(store, max_size=2 ** 28)
        with measure_time(tag=f"opened remote zarr dataset {path}"):
            consolidated = obs_file_system.exists(f'{path}/.zmetadata')
            ds = assert_cube(xr.open_zarr(cached_store,
                                          consolidated=consolidated))
            return BaseMultiLevelDataset(ds)

    if data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened remote levels dataset {path}"):
            return ObjectStorageMultiLevelDataset(
                ds_id,
                obs_file_system,
                path,
                exception_type=ServiceConfigError)

    raise ServiceConfigError(
        f"Illegal data format {data_format!r} for dataset {ds_id}")
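# Example (sketch): a dataset descriptor for data in object storage, matching
# the keys read by open_ml_dataset_from_object_storage() above ('Path',
# 'Format', 'Endpoint', 'Region'). The bucket, endpoint and region values are
# placeholders, and the 'FileSystem' value shown here is an assumption not
# confirmed by the code above.
example_obs_dataset_descriptor = {
    'Identifier': 'demo-remote',
    'FileSystem': 'obs',
    'Path': 'my-bucket/cubes/demo.zarr',
    'Format': 'zarr',
    'Endpoint': 'https://s3.example.com',
    'Region': 'eu-central-1',
}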
def get_dataset_descriptors(self):
    dataset_descriptors = self._config.get('Datasets')
    if not dataset_descriptors:
        raise ServiceConfigError("No datasets configured")
    return dataset_descriptors
def _load_place_group(self,
                      place_group_descriptor: Dict[str, Any],
                      base_url: str,
                      is_global: bool = False,
                      load_features: bool = False) -> Dict[str, Any]:
    place_group_id = place_group_descriptor.get("PlaceGroupRef")
    if place_group_id:
        if is_global:
            raise ServiceConfigError(
                "'PlaceGroupRef' cannot be used in a global place group")
        if len(place_group_descriptor) > 1:
            raise ServiceConfigError(
                "'PlaceGroupRef', if present, must be the only entry"
                " in a 'PlaceGroups' item")
        return self.get_global_place_group(place_group_id,
                                           base_url,
                                           load_features=load_features)

    place_group_id = place_group_descriptor.get("Identifier")
    if not place_group_id:
        raise ServiceConfigError(
            "Missing 'Identifier' entry in a 'PlaceGroups' item")

    if place_group_id in self._place_group_cache:
        place_group = self._place_group_cache[place_group_id]
    else:
        place_group_title = place_group_descriptor.get("Title",
                                                       place_group_id)
        place_path_wc = self.get_descriptor_path(place_group_descriptor,
                                                 "'PlaceGroups' item")
        source_paths = glob.glob(place_path_wc)
        source_encoding = place_group_descriptor.get("CharacterEncoding",
                                                     "utf-8")

        join = None
        place_join = place_group_descriptor.get("Join")
        if isinstance(place_join, dict):
            join_path = self.get_descriptor_path(
                place_join, "'Join' of a 'PlaceGroups' item")
            join_property = place_join.get("Property")
            if not join_property:
                raise ServiceError(
                    "Missing 'Property' entry in 'Join'"
                    " of a 'PlaceGroups' item")
            join_encoding = place_join.get("CharacterEncoding", "utf-8")
            join = dict(path=join_path,
                        property=join_property,
                        encoding=join_encoding)

        property_mapping = place_group_descriptor.get("PropertyMapping")
        if property_mapping:
            property_mapping = dict(property_mapping)
            for key, value in property_mapping.items():
                if isinstance(value, str) and '${base_url}' in value:
                    property_mapping[key] = value.replace('${base_url}',
                                                          base_url)

        place_group = dict(type="FeatureCollection",
                           features=None,
                           id=place_group_id,
                           title=place_group_title,
                           propertyMapping=property_mapping,
                           sourcePaths=source_paths,
                           sourceEncoding=source_encoding,
                           join=join)

        sub_place_group_configs = place_group_descriptor.get("Places")
        if sub_place_group_configs:
            raise ServiceConfigError(
                "Invalid 'Places' entry in a 'PlaceGroups' item:"
                " not implemented yet")

        # sub_place_group_descriptors = place_group_config.get("Places")
        # if sub_place_group_descriptors:
        #     sub_place_groups = self._load_place_groups(sub_place_group_descriptors)
        #     place_group["placeGroups"] = sub_place_groups

        self._place_group_cache[place_group_id] = place_group

    if load_features:
        self._load_place_group_features(place_group)

    return place_group
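# Example (sketch): a 'PlaceGroups' item as _load_place_group() above expects
# it. The keys 'Identifier', 'Title', 'CharacterEncoding', 'Join' (with
# 'Property' and 'CharacterEncoding') and 'PropertyMapping' are read by the
# code above. The 'Path' key names and all values are illustrative
# placeholders; the actual source path is resolved via get_descriptor_path().
example_place_group_descriptor = {
    'Identifier': 'stations',
    'Title': 'Measurement Stations',
    'Path': 'places/stations*.geojson',
    'CharacterEncoding': 'utf-8',
    'Join': {
        'Path': 'places/stations-metadata.csv',
        'Property': 'station_id',
        'CharacterEncoding': 'utf-8',
    },
    'PropertyMapping': {
        # '${base_url}' occurrences are replaced with the service base URL
        'infoUrl': '${base_url}/places/stations.html',
    },
}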