def _get_data_descriptor_from_metadata(self, data_id: str,
                                       metadata: dict) -> DatasetDescriptor:
    """Build a :class:`DatasetDescriptor` for *data_id* from raw metadata.

    Combines the dataset metadata passed in with the dataset info fetched
    from the ODP service, normalizes dimensions, derives the temporal and
    spatial coverage, and assembles variable and coordinate descriptors.

    :param data_id: Identifier of the dataset to describe.
    :param metadata: Raw dataset metadata; expected to (optionally) contain
        'dimensions', 'time_dimension_size', 'variable_infos', 'variables'
        and 'attributes' entries. The dict is copied, not mutated.
    :return: A fully populated dataset descriptor including an
        ``open_params_schema``.
    """
    ds_metadata = metadata.copy()
    dims = self._normalize_dims(ds_metadata.get('dimensions', {}))
    if 'time' not in dims:
        dims['time'] = ds_metadata.get('time_dimension_size')
    else:
        # A pre-existing 'time' dimension is multiplied by the number of
        # time steps reported by the metadata.
        dims['time'] *= ds_metadata.get('time_dimension_size')
    temporal_resolution = _get_temporal_resolution_from_id(data_id)
    dataset_info = self._cci_odp.get_dataset_info(data_id, ds_metadata)
    spatial_resolution = dataset_info['lat_res']
    if spatial_resolution <= 0:
        # Non-positive resolutions are treated as "unknown".
        spatial_resolution = None
    bbox = dataset_info['bbox']
    # Only use the date parts of the temporal coverage timestamps.
    temporal_coverage = (
        dataset_info['temporal_coverage_start'].split('T')[0],
        dataset_info['temporal_coverage_end'].split('T')[0])
    var_infos = ds_metadata.get('variable_infos', {})
    var_descriptors = self._get_variable_descriptors(
        dataset_info['var_names'], var_infos)
    coord_descriptors = self._get_variable_descriptors(
        dataset_info['coord_names'], var_infos, normalize_dims=False)
    if 'time' not in coord_descriptors.keys() \
            and 't' not in coord_descriptors.keys():
        # Ensure there is always a time coordinate descriptor.
        time_attrs = {
            "units": "seconds since 1970-01-01T00:00:00Z",
            "calendar": "proleptic_gregorian",
            "standard_name": "time"
        }
        coord_descriptors['time'] = VariableDescriptor('time',
                                                       dtype='int64',
                                                       dims=('time',),
                                                       attrs=time_attrs)
    # Remove keys that have been folded into dedicated descriptor fields.
    # Use pop() with a default so that missing keys do not raise KeyError
    # (consistent with the defaulting .get() calls above).
    ds_metadata.pop('variables', None)
    ds_metadata.pop('dimensions', None)
    ds_metadata.pop('variable_infos', None)
    attrs = ds_metadata.get('attributes', {}).get('NC_GLOBAL', {})
    ds_metadata.pop('attributes', None)
    # Any remaining metadata entries become dataset attributes.
    attrs.update(ds_metadata)
    self._remove_irrelevant_metadata_attributes(attrs)
    descriptor = DatasetDescriptor(data_id,
                                   type_specifier=self._type_specifier,
                                   dims=dims,
                                   coords=coord_descriptors,
                                   data_vars=var_descriptors,
                                   attrs=attrs,
                                   bbox=bbox,
                                   spatial_res=spatial_resolution,
                                   time_range=temporal_coverage,
                                   time_period=temporal_resolution)
    data_schema = self._get_open_data_params_schema(descriptor)
    descriptor.open_params_schema = data_schema
    return descriptor
def test_serialisation(self):
    """CubeInfoResult serializes to the expected dict and round-trips."""
    cube_info = CubeInfo(
        dataset_descriptor=DatasetDescriptor(data_id='bibo.zarr'),
        size_estimation={})
    result = CubeInfoResult(status='ok',
                            message='Success!',
                            result=cube_info)
    result_dict = result.to_dict()
    self.assertIsInstance(result_dict, dict)
    expected_dict = {
        'status': 'ok',
        'message': 'Success!',
        'result': {
            'dataset_descriptor': {
                'data_id': 'bibo.zarr',
                'data_type': 'dataset'
            },
            'size_estimation': {}
        },
    }
    self.assertEqual(expected_dict, result_dict)
    # Deserializing the dict must yield the original result type again.
    result2 = CubeInfoResult.from_dict(result_dict)
    self.assertIsInstance(result2, CubeInfoResult)
def test_describe_data(self):
    """describe_data() returns the same cube descriptor with and without
    an explicit type specifier."""
    expected_dict = DatasetDescriptor(
        data_id='cube_1',
        type_specifier=TYPE_SPECIFIER_CUBE,
    ).to_dict()

    # Without a type specifier.
    dd = self.store.describe_data('cube_1')
    self.assertIsInstance(dd, DatasetDescriptor)
    self.assertEqual(expected_dict, dd.to_dict())

    # With an explicit 'dataset[cube]' type specifier.
    dd = self.store.describe_data('cube_1', type_specifier='dataset[cube]')
    self.assertIsInstance(dd, DatasetDescriptor)
    self.assertEqual(expected_dict, dd.to_dict())
def test_serialisation(self):
    """CubeInfoWithCostsResult serializes as expected and round-trips."""
    cost_estimation = CostEstimation(required=10,
                                     available=20,
                                     limit=100)
    cube_info = CubeInfoWithCosts(
        dataset_descriptor=DatasetDescriptor(data_id='bibo.zarr'),
        size_estimation={},
        cost_estimation=cost_estimation)
    result = CubeInfoWithCostsResult(status='ok',
                                     message='Success!',
                                     result=cube_info)
    result_dict = result.to_dict()
    self.assertIsInstance(result_dict, dict)
    expected_dict = {
        'status': 'ok',
        'message': 'Success!',
        'result': {
            'dataset_descriptor': {
                'data_id': 'bibo.zarr',
                'data_type': 'dataset'
            },
            'size_estimation': {},
            'cost_estimation': {
                'available': 20,
                'limit': 100,
                'required': 10
            },
        },
    }
    self.assertEqual(expected_dict, result_dict)
    # Round-trip: from_dict() must restore the result and nested payload.
    result2 = CubeInfoWithCostsResult.from_dict(result_dict)
    self.assertIsInstance(result2, GenericCubeGeneratorResult)
    self.assertIsInstance(result2.result, CubeInfoWithCosts)
def get_schema(cls) -> JsonObjectSchema:
    """Return the JSON object schema describing this result type."""
    schema_properties = dict(
        dataset_descriptor=DatasetDescriptor.get_schema(),
        size_estimation=JsonObjectSchema(additional_properties=True),
    )
    return JsonObjectSchema(
        properties=schema_properties,
        required=['dataset_descriptor', 'size_estimation'],
        additional_properties=True,
        # Instances are created from parsed JSON via this class.
        factory=cls,
    )
def describe_data(self, data_id: str) -> DataDescriptor:
    """Return a dataset descriptor for *data_id*.

    The part of *data_id* before the first ':' selects the entry in the
    internal dataset dictionary that supplies CRS, bounding box, spatial
    resolution and temporal coverage.
    """
    base_id = data_id.split(':')[0]
    ds_info = self._dataset_dicts[base_id]
    return DatasetDescriptor(
        data_id=data_id,
        data_vars=self._create_variable_descriptors(data_id),
        crs=ds_info['crs'],
        bbox=tuple(ds_info['bbox']),
        spatial_res=ds_info['spatial_res'],
        time_range=tuple(ds_info['time_range']),
        time_period=ds_info['time_period'],
        open_params_schema=self.get_open_data_params_schema(data_id),
    )
def _describe_data(self, data_id: str) -> DatasetDescriptor:
    """Build a dataset descriptor from dataset and collection metadata.

    Band names are taken from the Sentinel Hub API when a connection is
    available, otherwise from the locally known band metadata. Spatial and
    temporal extents are derived from the collection metadata if present.

    :param data_id: Identifier of the dataset to describe.
    :return: A dataset descriptor with one variable per band.
    """
    dataset_metadata, collection_metadata = \
        self._get_dataset_and_collection_metadata(data_id)
    band_metadatas = dataset_metadata.get('bands', {})
    if self._sentinel_hub is not None:
        # If we are connected to the API, we return band names by API
        band_names = self._sentinel_hub.band_names(data_id)
    else:
        # Otherwise all we know about
        band_names = band_metadatas.keys()
    data_vars = []
    for band_name in band_names:
        band_metadata = band_metadatas.get(band_name,
                                           dict(sample_type='FLOAT32'))
        # Fix: attach this band's own metadata as the variable attributes
        # (previously the entire band_metadatas mapping was copied onto
        # every variable).
        data_vars.append(
            VariableDescriptor(name=band_name,
                               dtype=band_metadata.get('sample_type',
                                                       'FLOAT32'),
                               dims=('time', 'lat', 'lon'),
                               attrs=band_metadata.copy()))
    dataset_attrs = dataset_metadata.copy()
    bbox = None
    time_range = None
    if collection_metadata is not None:
        extent = collection_metadata.get('extent')
        if extent is not None:
            bbox = extent.get("spatial", {}).get('bbox')
            interval = extent.get("temporal", {}).get('interval')
            if isinstance(interval, list) and len(interval) == 2:
                min_datetime, max_datetime = interval
                # Get rid of time part
                time_range = (min_datetime.split('T')[0]
                              if min_datetime is not None else None,
                              max_datetime.split('T')[0]
                              if max_datetime is not None else None)
        if 'title' in collection_metadata:
            dataset_attrs['title'] = collection_metadata['title']
        if 'description' in collection_metadata:
            dataset_attrs['description'] = \
                collection_metadata['description']
    # Fix: pass the enriched dataset_attrs (previously the unmodified
    # dataset_metadata was passed, silently discarding the title and
    # description merged in above).
    return DatasetDescriptor(
        data_id=data_id,
        data_vars=data_vars,
        bbox=bbox,
        time_range=time_range,
        time_period=dataset_metadata.get('request_period'),
        attrs=dataset_attrs)
def generate(self) -> CubeInfo:
    """Estimate the size and request cost of generating the target cube.

    Resolves the effective cube configuration, derives the output raster
    size and tiling, and from those computes the expected number of
    requests and bytes.

    :return: A :class:`CubeInfo` with a dataset descriptor and a
        size-estimation dict.
    :raises CubeGeneratorError: (status 400) if the cube configuration
        cannot be resolved.
    """
    try:
        cube_config, resolved_crs, resolved_time_range = \
            self._compute_effective_cube_config()
    except (TypeError, ValueError) as e:
        # Surface configuration errors as client errors (HTTP 400).
        raise CubeGeneratorError(f'{e}', status_code=400) from e
    x_min, y_min, x_max, y_max = cube_config.bbox
    spatial_res = cube_config.spatial_res
    # Raster size from bbox and resolution; enforce a minimum of 2x2.
    width = round((x_max - x_min) / spatial_res)
    height = round((y_max - y_min) / spatial_res)
    width = 2 if width < 2 else width
    height = 2 if height < 2 else height
    num_tiles_x = 1
    num_tiles_y = 1
    tile_width = width
    tile_height = height
    tile_size = cube_config.tile_size
    if tile_size is None and cube_config.chunks is not None:
        # TODO: this is just an assumption, with new
        # Resampling module, use GridMapping
        # to identify the actual names for the
        # spatial tile dimensions.
        tile_size_x = cube_config.chunks.get('lon',
                                             cube_config.chunks.get('x'))
        tile_size_y = cube_config.chunks.get('lat',
                                             cube_config.chunks.get('y'))
        if tile_size_x and tile_size_y:
            tile_size = tile_size_x, tile_size_y
    if tile_size is not None:
        tile_width, tile_height = tile_size
        # Only tile if the image is noticeably larger than one tile;
        # the raster size is then rounded up to whole tiles.
        # TODO: this must be made common store logic
        if width > 1.5 * tile_width:
            num_tiles_x = _idiv(width, tile_width)
            width = num_tiles_x * tile_width
        # TODO: this must be made common store logic
        if height > 1.5 * tile_height:
            num_tiles_y = _idiv(height, tile_height)
            height = num_tiles_y * tile_height
    variable_names = cube_config.variable_names
    num_times = len(resolved_time_range)
    num_variables = len(variable_names)
    # One request per variable, time step and tile.
    num_requests = num_variables \
        * num_times \
        * num_tiles_x * num_tiles_y
    # TODO: get original data types from dataset descriptors
    num_bytes_per_pixel = 4
    num_bytes = num_variables \
        * num_times \
        * (height * width * num_bytes_per_pixel)
    # Geographic CRSs use lon/lat dimension names, projected ones x/y.
    x_name, y_name = ('lon', 'lat') \
        if resolved_crs.is_geographic else ('x', 'y')
    data_id = self._request.output_config.data_id or 'unnamed'
    # TODO: get original variable descriptors from input dataset descriptors
    data_vars = {
        name: VariableDescriptor(name,
                                 dtype='float32',
                                 dims=('time', y_name, x_name))
        for name in variable_names
    }
    dims = {'time': num_times, y_name: height, x_name: width}
    dataset_descriptor = DatasetDescriptor(
        data_id,
        crs=cube_config.crs,
        bbox=cube_config.bbox,
        spatial_res=cube_config.spatial_res,
        time_range=cube_config.time_range,
        time_period=cube_config.time_period,
        dims=dims,
        data_vars=data_vars)
    size_estimation = dict(image_size=[width, height],
                           tile_size=[tile_width, tile_height],
                           num_variables=num_variables,
                           num_tiles=[num_tiles_x, num_tiles_y],
                           num_requests=num_requests,
                           num_bytes=num_bytes)
    return CubeInfo(dataset_descriptor=dataset_descriptor,
                    size_estimation=size_estimation)
def describe_data(self, data_id: str) -> DatasetDescriptor:
    """Return a dataset descriptor for a soil-moisture data id.

    The id has the form ``<prefix>:<variable_spec>:<aggregation>`` where
    *variable_spec* is 'saturation' or 'volumetric' and *aggregation*
    selects the temporal aggregation period. Daily data carries extra
    per-observation variables; aggregated data carries an observation
    count instead.

    :param data_id: Dataset identifier to describe.
    :return: A dataset descriptor covering the global 0.25-degree grid.
    """
    _, variable_spec, aggregation = data_id.split(':')
    # Units and long-name for the main soil-moisture variable.
    sm_attrs = dict(saturation=('percent',
                                'Percent of Saturation Soil Moisture'),
                    volumetric=('m3 m-3',
                                'Volumetric Soil Moisture'))[variable_spec]
    descriptors_common = [
        VariableDescriptor(name='sensor',
                           dtype='int16',
                           dims=('time', 'lat', 'lon'),
                           attrs={'long_name': 'Sensor'}),
        VariableDescriptor(
            name='freqbandID',
            dtype='int16',
            dims=('time', 'lat', 'lon'),
            attrs={'long_name': 'Frequency Band Identification'}),
        VariableDescriptor(name='sm',
                           dtype='float32',
                           dims=('time', 'lat', 'lon'),
                           attrs={
                               'units': sm_attrs[0],
                               'long_name': sm_attrs[1]
                           }),
    ]
    descriptors_daily = [
        VariableDescriptor(
            # The product user guide claims that sm_uncertainty is
            # available for all three aggregation periods, but in practice
            # it only seems to be present in the daily data.
            name='sm_uncertainty',
            dtype='float32',
            dims=('time', 'lat', 'lon'),
            attrs={
                'units': sm_attrs[0],
                'long_name': sm_attrs[1] + ' Uncertainty'
            }),
        VariableDescriptor(name='t0',
                           dtype='float64',
                           dims=('time', 'lat', 'lon'),
                           attrs={
                               'units':
                                   'days since 1970-01-01 00:00:00 UTC',
                               'long_name': 'Observation Timestamp'
                           }),
        VariableDescriptor(name='dnflag',
                           dtype='int8',
                           dims=('time', 'lat', 'lon'),
                           attrs={'long_name': 'Day / Night Flag'}),
        VariableDescriptor(name='flag',
                           dtype='int8',
                           dims=('time', 'lat', 'lon'),
                           attrs={'long_name': 'Flag'}),
        VariableDescriptor(
            name='mode',
            dtype='int8',
            dims=('time', 'lat', 'lon'),
            # Note: the product user guide gives the long name as
            # 'Satellite Mode' with one space, but the long name in the
            # actual NetCDF files has two spaces.
            attrs={'long_name': 'Satellite  Mode'}),
    ]
    descriptors_aggregated = [
        VariableDescriptor(
            name='nobs',
            dtype='int16',
            dims=('time', 'lat', 'lon'),
            attrs={'long_name': 'Number of valid observation'}),
    ]
    descriptors = descriptors_common + \
        (descriptors_daily if aggregation == 'daily'
         else descriptors_aggregated)
    return DatasetDescriptor(
        data_id=data_id,
        data_vars={desc.name: desc for desc in descriptors},
        crs='WGS84',
        bbox=(-180, -90, 180, 90),
        spatial_res=0.25,
        time_range=('1978-11-01', None),
        time_period=self._aggregation_map[aggregation],
        open_params_schema=self.get_open_data_params_schema(data_id))