def test_convert(self):
    # A comma-separated string is split into its individual names/patterns.
    self.assertEqual(VarNamesLike.convert('aa,b*,cc'), ['aa', 'b*', 'cc'])

    # A list holding a non-string entry is rejected with a fixed message.
    with self.assertRaises(ValidationError) as raised:
        VarNamesLike.convert(['aa', 1, 'bb'])
    self.assertEqual(str(raised.exception), 'List of variables names expected.')

    # None is passed through as None.
    self.assertEqual(None, VarNamesLike.convert(None))
def select_var(ds: DatasetLike.TYPE, var: VarNamesLike.TYPE = None) -> xr.Dataset:
    """
    Keep only the requested data variables of a dataset.

    Only data variables that match none of the given names/patterns are
    dropped from the dataset.

    :param ds: The dataset or dataframe from which to perform selection.
    :param var: One or more variable names to keep. Accepted forms are a
           single name ('var_name'), a comma-separated string
           ('var_name1,var_name2,var_name3') or a list of names
           (['var_name1', 'var_name2']). Shell-style wildcards are supported,
           e.g. 'var_name*' keeps every variable whose name starts with
           'var_name'.
    :return: A filtered dataset. If *var* is empty, *ds* is returned as passed
             in, without any conversion.
    """
    # Nothing requested: hand the input back untouched.
    if not var:
        return ds

    ds = DatasetLike.convert(ds)
    patterns = VarNamesLike.convert(var)

    candidates = list(ds.data_vars.keys())

    # Collect every data variable matched by at least one pattern ...
    selected = set()
    for pattern in patterns:
        selected.update(fnmatch.filter(candidates, pattern))

    # ... and drop all the others, in their original order.
    return ds.drop([name for name in candidates if name not in selected])
def data_frame_subset(gdf: gpd.GeoDataFrame,
                      region_op: str = 'intersects',
                      region: PolygonLike.TYPE = None,
                      var_names: VarNamesLike.TYPE = None) -> gpd.GeoDataFrame:
    """
    Create a GeoDataFrame subset from given variables (data frame columns) and/or region.

    :param gdf: A GeoDataFrame.
    :param region_op: Name of the geometric operation to be performed if *region* is given.
           It is a string (default 'intersects') that is inserted as ``@<region_op>(...)``
           into the query expression passed to ``data_frame_query``.
    :param region: A region polygon used to filter rows.
    :param var_names: The variables (columns) to select. The 'geometry' column is always
           kept when a column selection is made.
    :return: A GeoDataFrame subset. If neither *var_names* nor *region* is given,
             *gdf* itself is returned.
    """
    region = PolygonLike.convert(region)
    var_names = VarNamesLike.convert(var_names)

    # No constraint at all: nothing to do.
    if not var_names and not region:
        return gdf

    if var_names:
        # The geometry column must be part of every column selection.
        if 'geometry' not in var_names:
            var_names = ['geometry'] + var_names
        gdf = gdf[var_names]

    if region and region_op:
        geom_str = PolygonLike.format(region)
        gdf = data_frame_query(gdf, f'@{region_op}("{geom_str}")')

    return gdf
def test_accepts(self):
    # Strings and lists made only of strings are accepted.
    for accepted in ('aa', 'aa,bb,cc', ['aa', 'bb', 'cc']):
        self.assertTrue(VarNamesLike.accepts(accepted))

    # Numbers and lists containing any non-string entry are not.
    for rejected in (1.0, [1, 2, 4], ['aa', 2, 'bb']):
        self.assertFalse(VarNamesLike.accepts(rejected))
def make_local(self,
               local_name: str,
               local_id: str = None,
               time_range: TimeRangeLike.TYPE = None,
               region: PolygonLike.TYPE = None,
               var_names: VarNamesLike.TYPE = None,
               monitor: Monitor = Monitor.NONE) -> Optional[DataSource]:
    """
    Create a constrained copy of this data source in the 'local' data store.

    A UUID is derived from this data source's id and the given constraints.
    If a local data source with the resulting name already exists it is
    reused, provided that (for an explicitly given *local_name*) its stored
    UUID matches; otherwise a ``ValueError`` is raised.

    :param local_name: Name of the local data source. If empty, a name of the
           form ``local.<id>.<uuid>`` is generated.
    :param local_id: Not used by this implementation.
    :param time_range: Optional time range constraint.
    :param region: Optional spatial constraint.
    :param var_names: Optional names of the variables to keep.
    :param monitor: A progress monitor.
    :return: The local data source, or None if none could be created or the
             created one turned out to be empty.
    :raise ValueError: If the 'local' data store cannot be initialised, or if
           *local_name* is already taken by a data source with another UUID.
    """
    time_range = TimeRangeLike.convert(time_range) if time_range else None
    region = PolygonLike.convert(region) if region else None
    var_names = VarNamesLike.convert(var_names) if var_names else None

    local_store = DATA_STORE_REGISTRY.get_data_store('local')
    if not local_store:
        # Try to register the local store once before giving up.
        add_to_data_store_registry()
        local_store = DATA_STORE_REGISTRY.get_data_store('local')
    if not local_store:
        raise ValueError('Cannot initialize `local` DataStore')

    _uuid = LocalDataStore.generate_uuid(ref_id=self.id, time_range=time_range,
                                         region=region, var_names=var_names)

    if not local_name:
        # Generated names embed the UUID, so an existing entry is by
        # construction the same constrained copy.
        local_name = "local.{}.{}".format(self.id, _uuid)
        existing_ds_list = local_store.query(ds_id=local_name)
        if len(existing_ds_list) == 1:
            return existing_ds_list[0]
    else:
        existing_ds_list = local_store.query(ds_id='local.%s' % local_name)
        if len(existing_ds_list) == 1:
            if existing_ds_list[0].meta_info.get('uuid', None) == _uuid:
                return existing_ds_list[0]
            else:
                raise ValueError('Datastore {} already contains dataset {}'.format(local_store.id, local_name))

    # Remember where the copy came from and tag it with its own UUID, so that
    # the UUID comparison above can recognise it on a later call.
    local_meta_info = self.meta_info.copy()
    local_meta_info['ref_uuid'] = local_meta_info.get('uuid', None)
    local_meta_info['uuid'] = _uuid

    # Fix: pass the prepared meta info; previously a fresh copy of
    # self.meta_info was passed and the UUID tagging above was lost.
    local_ds = local_store.create_data_source(local_name, region, local_name,
                                              time_range=time_range, var_names=var_names,
                                              meta_info=local_meta_info)
    if local_ds:
        if not local_ds.is_complete:
            self._make_local(local_ds, time_range, region, var_names, monitor=monitor)

        if local_ds.is_empty:
            local_store.remove_data_source(local_ds)
            return None

        local_store.register_ds(local_ds)
        return local_ds
    return None
def generate_uuid(cls,
                  ref_id: str,
                  time_range: Optional[TimeRange] = None,
                  region: Optional[shapely.geometry.Polygon] = None,
                  var_names: Optional[VarNames] = None) -> str:
    """
    Derive a name-based (version 3) UUID from a reference id and constraints.

    The formatted text of every given constraint is appended to *ref_id*, in
    the order time range, region, variable names, and the result is hashed
    within the module's UUID namespace.

    :param ref_id: Identifier the UUID is based on.
    :param time_range: Optional time range constraint.
    :param region: Optional region constraint.
    :param var_names: Optional variable names constraint.
    :return: The UUID as a string.
    """
    name_parts = [ref_id]
    if time_range:
        name_parts.append(TimeRangeLike.format(time_range))
    if region:
        name_parts.append(PolygonLike.format(region))
    if var_names:
        name_parts.append(VarNamesLike.format(var_names))
    return str(uuid.uuid3(_NAMESPACE, ''.join(name_parts)))
def generate_title(cls,
                   title: str,
                   time_range: Optional[TimeRange] = None,
                   region: Optional[shapely.geometry.Polygon] = None,
                   var_names: Optional[VarNames] = None) -> str:
    """
    Extend a title by a textual description of the given constraints.

    For every given constraint a bracketed suffix is appended, in the order
    time range, region, variable names.

    :param title: The base title.
    :param time_range: Optional time range constraint.
    :param region: Optional region constraint.
    :param var_names: Optional variable names constraint.
    :return: The title followed by the suffixes; *title* itself if no
             constraint is given.
    """
    suffixes = []
    if time_range:
        suffixes.append(" [TimeRange:{}]".format(TimeRangeLike.format(time_range)))
    if region:
        suffixes.append(" [Region:{}]".format(PolygonLike.format(region)))
    if var_names:
        suffixes.append(" [Variables:{}]".format(VarNamesLike.format(var_names)))

    if not suffixes:
        return title
    return title + ''.join(suffixes)
def long_term_average(ds: DatasetLike.TYPE,
                      var: VarNamesLike.TYPE = None,
                      monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Create a 'mean over years' dataset by averaging the values of the given
    input dataset over all years.

    The result is a climatological dataset with the same temporal resolution
    as the input: a daily input yields a daily climatology of 365 days, a
    monthly input a monthly climatology, and so on. Seasonal input datasets
    must have matching seasons over all years, denoted by the same date each
    year (e.g. the first date of each quarter); the output then is a seasonal
    climatology in which each season carries the same date as in the input.

    For further information on climatological datasets, see
    http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#climatological-statistics

    :param ds: A dataset to average
    :param var: If given, only these variables will be preserved in the resulting dataset
    :param monitor: A progress monitor
    :return: A climatological long term average dataset
    :raise ValidationError: If the time coordinate is not of type
           datetime64[ns] or the dataset has no 'time_coverage_resolution'
           attribute.
    """
    ds = DatasetLike.convert(ds)

    # The averaging relies on a datetime64[ns] time coordinate.
    if ds.time.dtype != 'datetime64[ns]':
        raise ValidationError('Long term average operation expects a dataset with the'
                              ' time coordinate of type datetime64[ns], but received'
                              ' {}. Running the normalize operation on this'
                              ' dataset may help'.format(ds.time.dtype))

    try:
        t_resolution = ds.attrs['time_coverage_resolution']
    except KeyError:
        raise ValidationError('Could not determine temporal resolution. Running'
                              ' the adjust_temporal_attrs operation beforehand may'
                              ' help.')

    var = VarNamesLike.convert(var)

    # Work on a shallow copy, optionally narrowed to the requested variables.
    retset = select_var(ds.copy(), var) if var else ds.copy()

    # Daily and monthly resolutions have dedicated implementations.
    if t_resolution == 'P1D':
        return _lta_daily(retset, monitor)
    if t_resolution == 'P1M':
        return _lta_monthly(retset, monitor)
    return _lta_general(retset, monitor)
def open_dataset(self,
                 time_range: TimeRangeLike.TYPE = None,
                 region: PolygonLike.TYPE = None,
                 var_names: VarNamesLike.TYPE = None,
                 protocol: str = None,
                 monitor: Monitor = Monitor.NONE) -> Any:
    """
    Open the files of this data source as one dataset.

    :param time_range: Optional time range. If given, only files whose
           recorded coverage lies within it are opened; files without a
           recorded coverage are skipped in that case.
    :param region: Optional region, handed on to ``open_xarray_dataset``.
    :param var_names: Optional names of the variables to open.
    :param protocol: Not used by this implementation.
    :param monitor: A progress monitor.
    :return: The result of ``open_xarray_dataset`` for the selected files.
    :raise: The error built by ``_empty_error`` if no file is selected, or
            the error built by ``_cannot_access_error`` if opening fails with
            an HTTP, URL, socket timeout or OS error.
    """
    time_range = TimeRangeLike.convert(time_range) if time_range else None
    var_names = VarNamesLike.convert(var_names) if var_names else None

    paths = []
    if time_range:
        for file_path, coverage in self._files.items():
            if not coverage:
                continue
            if isinstance(coverage, Tuple):
                # (start, end) coverage must lie completely inside the range.
                selected = coverage[0] >= time_range[0] and coverage[1] <= time_range[1]
            elif isinstance(coverage, datetime):
                # Single time stamp: start inclusive, end exclusive.
                selected = time_range[0] <= coverage < time_range[1]
            else:
                selected = False
            if selected:
                paths.extend(self._resolve_file_path(file_path))
    else:
        for file_path in self._files.keys():
            paths.extend(self._resolve_file_path(file_path))

    if not paths:
        raise self._empty_error(time_range)

    # Remove duplicates and establish a stable order.
    paths = sorted(set(paths))
    try:
        excluded_variables = self._meta_info.get('exclude_variables')
        drop_variables = ([variable.get('name') for variable in excluded_variables]
                          if excluded_variables else None)
        # TODO: combine var_names and drop_variables
        return open_xarray_dataset(paths,
                                   region=region,
                                   var_names=var_names,
                                   drop_variables=drop_variables,
                                   monitor=monitor)
    except HTTPError as e:
        raise self._cannot_access_error(time_range, region, var_names,
                                        verb="open", cause=e) from e
    except (URLError, socket.timeout) as e:
        raise self._cannot_access_error(time_range, region, var_names,
                                        verb="open", cause=e, error_cls=NetworkError) from e
    except OSError as e:
        raise self._cannot_access_error(time_range, region, var_names,
                                        verb="open", cause=e) from e
def __init__(self,
             ds_id: str,
             files: Union[Sequence[str], OrderedDict],
             data_store: 'LocalDataStore',
             temporal_coverage: TimeRangeLike.TYPE = None,
             spatial_coverage: PolygonLike.TYPE = None,
             variables: VarNamesLike.TYPE = None,
             meta_info: dict = None,
             status: DataSourceStatus = None):
    """
    Set up a local data source.

    :param ds_id: Identifier of the data source.
    :param files: Either a sequence of file paths or a mapping from file path
           to that file's coverage (a tuple or a single datetime).
    :param data_store: The data store this data source belongs to.
    :param temporal_coverage: Optional overall time range. If not given, it is
           derived from the coverage of the first and the last file, if any.
    :param spatial_coverage: Optional spatial extent.
    :param variables: Optional variable names.
    :param meta_info: Optional meta information; an empty ``OrderedDict`` is
           used if omitted.
    :param status: Optional status; ``DataSourceStatus.READY`` if omitted.
    """
    self._id = ds_id
    # A plain sequence of paths becomes a mapping without coverage values.
    self._files = OrderedDict.fromkeys(files) if isinstance(files, Sequence) else files
    self._data_store = data_store

    coverage = TimeRangeLike.convert(temporal_coverage) if temporal_coverage else None
    if not coverage:
        file_coverages = list(self._files.values())
        if file_coverages:
            first, last = file_coverages[0], file_coverages[-1]
            if isinstance(first, Tuple):
                # Start of the first file up to the end of the last file.
                coverage = TimeRangeLike.convert(tuple([first[0], last[1]]))
            elif isinstance(first, datetime):
                coverage = TimeRangeLike.convert((first, last))
    self._temporal_coverage = coverage

    self._spatial_coverage = PolygonLike.convert(spatial_coverage) if spatial_coverage else None
    self._variables = VarNamesLike.convert(variables) if variables else []
    self._meta_info = meta_info if meta_info else OrderedDict()

    # Provide placeholder variable descriptions if the meta info has none.
    if self._variables and not self._meta_info.get('variables', None):
        self._meta_info['variables'] = [
            {'name': var_name,
             'units': '',
             'long_name': '',
             'standard_name': ''
             } for var_name in self._variables]

    self._status = status if status else DataSourceStatus.READY
def reduce(ds: DatasetLike.TYPE,
           var: VarNamesLike.TYPE = None,
           dim: DimNamesLike.TYPE = None,
           method: str = 'mean',
           monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Reduce the given variables of the given dataset along the given dimensions.
    If no variables are given, all data variables of the dataset will be reduced.
    If no dimensions are given, each variable is reduced along all of its
    dimensions that are also coordinates of the dataset.

    :param ds: Dataset to reduce
    :param var: Variables in the dataset to reduce
    :param dim: Dataset dimensions along which to reduce
    :param method: reduction method, one of 'min', 'max', 'mean', 'median', 'sum';
           the NaN-ignoring numpy variant of the function is used
    :param monitor: A progress monitor
    :return: A copy of the dataset in which the selected variables are reduced
    :raise KeyError: If *method* is not one of the supported names
    """
    ufuncs = {'min': np.nanmin, 'max': np.nanmax, 'mean': np.nanmean,
              'median': np.nanmedian, 'sum': np.nansum}
    # Loop invariant; raises KeyError for an unsupported method name.
    reducer = ufuncs[method]

    ds = DatasetLike.convert(ds)

    if not var:
        var = list(ds.data_vars.keys())
    var_names = VarNamesLike.convert(var)

    if not dim:
        # NOTE(review): this uses the coordinate names, so a dimension without
        # a coordinate variable is not reduced by default - confirm intended.
        dim = list(ds.coords.keys())
    else:
        dim = DimNamesLike.convert(dim)

    retset = ds.copy()

    # Start the monitor once for the whole operation (one unit of work per
    # variable) instead of restarting it for every variable.
    with monitor.starting("Reduce dataset", total_work=len(var_names)):
        for var_name in var_names:
            # Only reduce along the dimensions this variable actually has.
            intersection = [value for value in dim if value in retset[var_name].dims]
            with monitor.child(1).observing("Reduce"):
                retset[var_name] = retset[var_name].reduce(reducer,
                                                           dim=intersection,
                                                           keep_attrs=True)
    return retset
def detect_outliers(ds: xr.Dataset,
                    var: VarNamesLike.TYPE,
                    threshold_low: float = 0.05,
                    threshold_high: float = 0.95,
                    quantiles: bool = True,
                    mask: bool = False,
                    monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Detect outliers in the given Dataset.

    When mask=True the input dataset should not contain nan values, otherwise
    all existing nan values will be marked as 'outliers' in the mask data array
    added to the output dataset.

    :param ds: The dataset or dataframe for which to do outlier detection
    :param var: Variable or variables in the dataset to which to do outlier
           detection. Note that when multiple variables are selected, absolute
           threshold values might not make much sense. Wild cards can be used
           to select multiple variables matching a pattern.
    :param threshold_low: Values less or equal to this will be removed/masked
    :param threshold_high: Values greater or equal to this will be removed/masked
    :param quantiles: If True, threshold values are treated as quantiles,
           otherwise as absolute values.
    :param mask: If True, an ancillary variable containing flag values for
           outliers will be added to the dataset. Otherwise, outliers will be
           replaced with nan directly in the data variables.
    :param monitor: A progress monitor.
    :return: The dataset with outliers masked or replaced with nan
    """
    ds = DatasetLike.convert(ds)

    # Resolve the (possibly wildcard) patterns to concrete data variable
    # names. Every variable is listed once, even if several patterns match it,
    # so that it is not filtered twice.
    var_patterns = VarNamesLike.convert(var)
    all_vars = list(ds.data_vars.keys())
    variables = []
    for pattern in var_patterns:
        for name in fnmatch.filter(all_vars, pattern):
            if name not in variables:
                variables.append(name)

    ret_ds = ds.copy()
    # Three units of work per variable: two for the thresholds, one for the
    # masking/replacement step.
    with monitor.starting("detect_outliers", total_work=len(variables) * 3):
        for var_name in variables:
            if quantiles:
                # Compute the limits of this variable from the requested
                # quantiles. They are kept in separate locals: overwriting
                # the threshold parameters would feed the limits of one
                # variable as quantiles into the next one.
                with monitor.child(1).observing("quantile low"):
                    low = ret_ds[var_name].quantile(threshold_low)
                with monitor.child(1).observing("quantile high"):
                    high = ret_ds[var_name].quantile(threshold_high)
            else:
                low, high = threshold_low, threshold_high
                monitor.progress(2)

            if not mask:
                # Replace outliers by nan, preserving the variable attributes.
                arr = ret_ds[var_name]
                attrs = arr.attrs
                ret_ds[var_name] = arr.where((arr > low) & (arr < high))
                ret_ds[var_name].attrs = attrs
            else:
                # Add a data variable containing the outlier mask for this
                # data variable.
                _mask_outliers(ret_ds, var_name, low, high)
            monitor.progress(1)

    return ret_ds
def test_format(self):
    # (value, expected text) pairs, checked in this order.
    cases = (
        (['aa', 'bb', 'cc'], "aa, bb, cc"),
        (['aa'], "aa"),
        ([], ""),
        (None, ""),
    )
    for value, expected in cases:
        self.assertEqual(VarNamesLike.format(value), expected)
def test_format(self):
    # Formatting an HTML value is expected to give its text as a plain str.
    formatted = VarNamesLike.format(HTML('abc'))
    self.assertIsInstance(formatted, str)
    self.assertEqual(formatted, 'abc')
def test_format(self):
    # Here the list is expected to be rendered in its Python list notation.
    names = ['aa', 'bb', 'cc']
    formatted = VarNamesLike.format(names)
    self.assertEqual(formatted, "['aa', 'bb', 'cc']")
def plot_line(ds: DatasetLike.TYPE,
              var_names: VarNamesLike.TYPE,
              fmt: str = None,
              label: DimName.TYPE = None,
              indexers: DictLike.TYPE = None,
              title: str = None,
              file: str = None) -> Figure:
    """
    Create a 1D/line plot of variable(s) given by dataset *ds* and variable name(s) *var_names*.

    The first variable is drawn on the primary y-axis; every further variable
    gets its own y-axis on the right side of the plot.

    :param ds: Dataset or Dataframe that contains the variable(s) named by *var_names*.
    :param var_names: The name of the variable(s) to plot
    :param fmt: optional semicolon-separated matplotlib formats,
           e.g.
           1 variable - "b.-"
           2 variables - "b.-;r+:"
           If fewer formats than variables are given, the formats are reused
           cyclically, starting again with the first one. If no format is
           given, a built-in colour cycle is used.
           For full reference on matplotlib plot() function, refer to
           https://matplotlib.org/api/_as_gen/matplotlib.pyplot.plot.html
    :param file: path to a file in which to save the plot
    :param label: dimension name to be selected as the x-axis of the plot
    :param indexers: Optional indexers into data array of *var_names*. The *indexers* is a dictionary
           or a comma-separated string of key-value pairs that maps the variable's dimension names
           to constant labels. e.g. "lat=12.4, time='2012-05-02'".
    :param title: an optional plot title; if omitted, a title is built from
           the variable names, *label* and *indexers*
    :return: a matplotlib figure object or None if in IPython mode
    :raise ValidationError: If no variable name is given or a variable still
           has more than one dimension after applying the indexers.
    """
    ds = DatasetLike.convert(ds)

    # An empty or missing *fmt* yields an empty list, which selects the
    # built-in colour cycle below (an empty string no longer leads to a
    # division by zero).
    fmt_list = fmt.split(";") if fmt else []
    fmt_count = len(fmt_list)

    if not var_names:
        raise ValidationError("Missing name for 'vars'")

    figure = plt.figure()
    ax = figure.add_subplot(111)
    # Leave room on the right for the additional y-axes.
    figure.subplots_adjust(right=0.65)

    var_names = VarNamesLike.convert(var_names)
    if not title:
        if label:
            title = ','.join(var_names) + ' over ' + label
        else:
            title = ','.join(var_names)
        if indexers:
            title = title + '\n' + ' at ' + json.dumps(indexers).strip('"')
    ax.set_title(title)

    indexers = DictLike.convert(indexers)

    ax_var = {}
    predefined_fmt = ['r', 'g', 'b', 'c', 'm', 'y', 'k']

    if label:
        ds = get_vars_data(ds, indexers, remaining_dims=[label])
    else:
        ds = get_vars_data(ds, indexers)

    for i, var_name in enumerate(var_names):
        var = ds[var_name]
        if len(var.dims) > 1:
            raise ValidationError(f'Unable to plot because variable {var_name} has more than one dimension: {var.dims}.'
                                  f' To specify value(s) of these dimension(s), please use the indexers.')

        var_label = var_name + ' (' + var.attrs['units'] + ')' if 'units' in var.attrs else var_name

        if fmt_list:
            selected_fmt = fmt_list[i % fmt_count]
        else:
            selected_fmt = predefined_fmt[i % len(predefined_fmt)]

        if label:
            x_axis = var[label]
        elif 'time' in var.coords:
            # Look up the coordinate explicitly: `'time' in var` tests the
            # array *values* in current xarray versions.
            x_axis = var.time
        else:
            x_axis = []

        # The first character of the format is used as the axis colour.
        axis_color = selected_fmt[0]

        # to differentiate the creation of y-axis of the first and the nth variable
        if i == 0:
            if len(x_axis) > 0:
                ax.plot(x_axis, var, selected_fmt)
            else:
                ax.plot(var, selected_fmt)
            ax.set_ylabel(var_label, wrap=True)
            ax.yaxis.label.set_color(axis_color)
            ax.tick_params(axis='y', colors=axis_color)
        else:
            twin = ax.twinx()
            ax_var[var_name] = twin
            if len(ax_var) > 1:
                # Shift every further right-hand axis outwards so that the
                # axes do not overlap.
                twin.spines["right"].set_position(("axes", 1 + ((i - 1) * 0.2)))
                twin.set_frame_on(True)
                twin.patch.set_visible(False)
            if len(x_axis) > 0:
                twin.plot(x_axis, var, selected_fmt)
            else:
                twin.plot(var, selected_fmt)
            twin.set_ylabel(var_label, wrap=True)
            twin.yaxis.label.set_color(axis_color)
            twin.tick_params(axis='y', colors=axis_color)

    ax.tick_params(axis='x', rotation=45)
    if label in ds and 'long_name' in ds[label].attrs:
        ax.set_xlabel(ds[label].attrs['long_name'])
    figure.tight_layout()

    if file:
        figure.savefig(file, dpi=600)

    return figure if not in_notebook() else None
def test_format(self):
    # Formatting an HTML value is expected to give its text as a plain str.
    formatted = VarNamesLike.format(HTML('abc'))
    self.assertIsInstance(formatted, str)
    self.assertEqual(formatted, 'abc')
def test_make_local_and_update(self):
    # Exercises make_local() of an ODP data source against three local
    # reference files; the remote file lookup and the store query are mocked.
    soilmoisture_data_sources = self.data_store.query(
        query_expr='esacci.SOILMOISTURE.day.L3S.SSMV.multi-sensor.multi-platform.COMBINED.02-1.r1')
    soilmoisture_data_source = soilmoisture_data_sources[0]

    reference_path = os.path.join(os.path.dirname(__file__),
                                  os.path.normpath('resources/datasources/local/files/'))

    def find_files_mock(_, time_range):
        # Replacement for EsaCciOdpDataSource._find_files: lists the reference
        # files whose dates lie within time_range (all of them if no range).

        def build_file_item(item_name: str, date_from: datetime.datetime, date_to: datetime.datetime, size: int):
            return [item_name, date_from, date_to, size,
                    {'OPENDAP': os.path.join(reference_path, item_name),
                     'HTTPServer': 'file:' + urllib.request.pathname2url(os.path.join(reference_path, item_name))}]

        reference_files = {
            'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781114000000-fv02.2.nc': {
                'date_from': datetime.datetime(1978, 11, 14, 0, 0),
                'date_to': datetime.datetime(1978, 11, 14, 23, 59),
                'size': 21511378
            },
            'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781115000000-fv02.2.nc': {
                'date_from': datetime.datetime(1978, 11, 15, 0, 0),
                'date_to': datetime.datetime(1978, 11, 15, 23, 59),
                'size': 21511378
            },
            'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781116000000-fv02.2.nc': {
                'date_from': datetime.datetime(1978, 11, 16, 0, 0),
                'date_to': datetime.datetime(1978, 11, 16, 23, 59),
                'size': 21511378
            }
        }

        reference_files_list = []

        for reference_file in reference_files.items():
            file_name = reference_file[0]
            file_date_from = reference_file[1].get('date_from')
            file_date_to = reference_file[1].get('date_to')
            file_size = reference_file[1].get('size')
            if time_range:
                if file_date_from >= time_range[0] and file_date_to <= time_range[1]:
                    reference_files_list.append(build_file_item(file_name,
                                                                file_date_from,
                                                                file_date_to,
                                                                file_size))
            else:
                reference_files_list.append(build_file_item(file_name,
                                                            file_date_from,
                                                            file_date_to,
                                                            file_size))
        return reference_files_list

    with unittest.mock.patch('cate.ds.esa_cci_odp.EsaCciOdpDataSource._find_files', find_files_mock):
        with unittest.mock.patch.object(EsaCciOdpDataStore, 'query', return_value=[]):

            # --- time range constraint only
            new_ds_title = 'local_ds_test'
            new_ds_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                       datetime.datetime(1978, 11, 16, 23, 59)))
            try:
                new_ds = soilmoisture_data_source.make_local(new_ds_title, time_range=new_ds_time_range)
            except Exception as error:
                # Report the reference directory content to ease diagnosis,
                # keeping the original error as the cause.
                raise ValueError(reference_path, os.listdir(reference_path)) from error
            self.assertIsNotNone(new_ds)

            self.assertEqual(new_ds.id, "local.%s" % new_ds_title)
            self.assertEqual(new_ds.temporal_coverage(), new_ds_time_range)

            # --- time range and a single variable
            new_ds_w_one_variable_title = 'local_ds_test_var'
            new_ds_w_one_variable_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                                      datetime.datetime(1978, 11, 16, 23, 59)))
            new_ds_w_one_variable_var_names = VarNamesLike.convert(['sm'])

            new_ds_w_one_variable = soilmoisture_data_source.make_local(
                new_ds_w_one_variable_title,
                time_range=new_ds_w_one_variable_time_range,
                var_names=new_ds_w_one_variable_var_names
            )
            self.assertIsNotNone(new_ds_w_one_variable)

            self.assertEqual(new_ds_w_one_variable.id, "local.%s" % new_ds_w_one_variable_title)
            ds = new_ds_w_one_variable.open_dataset()

            # Besides the selected variable, the coordinate variables are expected.
            new_ds_w_one_variable_var_names.extend(['lat', 'lon', 'time'])

            self.assertSetEqual(set(ds.variables),
                                set(new_ds_w_one_variable_var_names))

            # --- time range and region
            new_ds_w_region_title = 'from_local_to_local_region'
            new_ds_w_region_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                                datetime.datetime(1978, 11, 16, 23, 59)))
            new_ds_w_region_spatial_coverage = PolygonLike.convert("10,20,30,40")

            new_ds_w_region = soilmoisture_data_source.make_local(
                new_ds_w_region_title,
                time_range=new_ds_w_region_time_range,
                region=new_ds_w_region_spatial_coverage)  # type: LocalDataSource

            self.assertIsNotNone(new_ds_w_region)

            self.assertEqual(new_ds_w_region.id, "local.%s" % new_ds_w_region_title)

            self.assertEqual(new_ds_w_region.spatial_coverage(), new_ds_w_region_spatial_coverage)

            # --- time range, region and a single variable
            new_ds_w_region_title = 'from_local_to_local_region_one_var'
            new_ds_w_region_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                                datetime.datetime(1978, 11, 16, 23, 59)))
            new_ds_w_region_var_names = VarNamesLike.convert(['sm'])
            new_ds_w_region_spatial_coverage = PolygonLike.convert("10,20,30,40")

            new_ds_w_region = soilmoisture_data_source.make_local(
                new_ds_w_region_title,
                time_range=new_ds_w_region_time_range,
                var_names=new_ds_w_region_var_names,
                region=new_ds_w_region_spatial_coverage)  # type: LocalDataSource

            self.assertIsNotNone(new_ds_w_region)

            self.assertEqual(new_ds_w_region.id, "local.%s" % new_ds_w_region_title)

            self.assertEqual(new_ds_w_region.spatial_coverage(), new_ds_w_region_spatial_coverage)
            data_set = new_ds_w_region.open_dataset()

            new_ds_w_region_var_names.extend(['lat', 'lon', 'time'])

            self.assertSetEqual(set(data_set.variables), set(new_ds_w_region_var_names))

            # --- time range, region and two variables
            new_ds_w_region_title = 'from_local_to_local_region_two_var_sm_uncertainty'
            new_ds_w_region_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                                datetime.datetime(1978, 11, 16, 23, 59)))
            new_ds_w_region_var_names = VarNamesLike.convert(['sm', 'sm_uncertainty'])
            new_ds_w_region_spatial_coverage = PolygonLike.convert("10,20,30,40")

            new_ds_w_region = soilmoisture_data_source.make_local(
                new_ds_w_region_title,
                time_range=new_ds_w_region_time_range,
                var_names=new_ds_w_region_var_names,
                region=new_ds_w_region_spatial_coverage)  # type: LocalDataSource

            self.assertIsNotNone(new_ds_w_region)

            self.assertEqual(new_ds_w_region.id, "local.%s" % new_ds_w_region_title)

            self.assertEqual(new_ds_w_region.spatial_coverage(), new_ds_w_region_spatial_coverage)
            data_set = new_ds_w_region.open_dataset()

            new_ds_w_region_var_names.extend(['lat', 'lon', 'time'])

            self.assertSetEqual(set(data_set.variables), set(new_ds_w_region_var_names))

            # --- a time range without any reference file must fail
            empty_ds_timerange = (datetime.datetime(2017, 12, 1, 0, 0), datetime.datetime(2017, 12, 31, 23, 59))
            with self.assertRaises(DataAccessError) as cm:
                soilmoisture_data_source.make_local('empty_ds', time_range=empty_ds_timerange)
            self.assertEqual("Open Data Portal's data source '{}' does not seem to have any data sets in given "
                             "time range {}".format(soilmoisture_data_source.id,
                                                    TimeRangeLike.format(empty_ds_timerange)),
                             str(cm.exception))

            # --- the title is copied from the source unless one is given
            new_ds_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                       datetime.datetime(1978, 11, 14, 23, 59)))

            new_ds = soilmoisture_data_source.make_local("title_test_copy", time_range=new_ds_time_range)
            self.assertIsNotNone(new_ds)
            self.assertEqual(new_ds.meta_info['title'], soilmoisture_data_source.meta_info['title'])

            title = "Title Test!"
            new_ds = soilmoisture_data_source.make_local("title_test_set", title, time_range=new_ds_time_range)
            self.assertIsNotNone(new_ds)
            self.assertEqual(new_ds.meta_info['title'], title)
def detect_outliers(ds: xr.Dataset,
                    var: VarNamesLike.TYPE,
                    threshold_low: float = 0.05,
                    threshold_high: float = 0.95,
                    quantiles: bool = True,
                    mask: bool = False,
                    monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Detect outliers in the given Dataset.

    When mask=True the input dataset should not contain nan values, otherwise
    all existing nan values will be marked as 'outliers' in the mask data array
    added to the output dataset.

    :param ds: The dataset or dataframe for which to do outlier detection
    :param var: Variable or variables in the dataset to which to do outlier
           detection. Note that when multiple variables are selected, absolute
           threshold values might not make much sense. Wild cards can be used
           to select multiple variables matching a pattern.
    :param threshold_low: Values less or equal to this will be removed/masked
    :param threshold_high: Values greater or equal to this will be removed/masked
    :param quantiles: If True, threshold values are treated as quantiles,
           otherwise as absolute values.
    :param mask: If True, an ancillary variable containing flag values for
           outliers will be added to the dataset. Otherwise, outliers will be
           replaced with nan directly in the data variables.
    :param monitor: A progress monitor.
    :return: The dataset with outliers masked or replaced with nan
    """
    ds = DatasetLike.convert(ds)

    # Resolve the (possibly wildcard) patterns to concrete data variable
    # names. Every variable is listed once, even if several patterns match it,
    # so that it is not filtered twice.
    var_patterns = VarNamesLike.convert(var)
    all_vars = list(ds.data_vars.keys())
    variables = []
    for pattern in var_patterns:
        for name in fnmatch.filter(all_vars, pattern):
            if name not in variables:
                variables.append(name)

    ret_ds = ds.copy()
    # Three units of work per variable: two for the thresholds, one for the
    # masking/replacement step.
    with monitor.starting("detect_outliers", total_work=len(variables) * 3):
        for var_name in variables:
            if quantiles:
                # Compute the limits of this variable from the requested
                # quantiles. They are kept in separate locals: overwriting
                # the threshold parameters would feed the limits of one
                # variable as quantiles into the next one.
                with monitor.child(1).observing("quantile low"):
                    low = ret_ds[var_name].quantile(threshold_low)
                with monitor.child(1).observing("quantile high"):
                    high = ret_ds[var_name].quantile(threshold_high)
            else:
                low, high = threshold_low, threshold_high
                monitor.progress(2)

            if not mask:
                # Replace outliers by nan, preserving the variable attributes.
                arr = ret_ds[var_name]
                attrs = arr.attrs
                ret_ds[var_name] = arr.where((arr > low) & (arr < high))
                ret_ds[var_name].attrs = attrs
            else:
                # Add a data variable containing the outlier mask for this
                # data variable.
                _mask_outliers(ret_ds, var_name, low, high)
            monitor.progress(1)

    return ret_ds
def plot_line(ds: DatasetLike.TYPE,
              var_names: VarNamesLike.TYPE,
              fmt: str = None,
              label: DimName.TYPE = None,
              indexers: DictLike.TYPE = None,
              title: str = None,
              file: str = None) -> Figure:
    """
    Create a 1D/line plot of variable(s) given by dataset *ds* and variable name(s) *var_names*.

    The first variable is drawn on the primary y-axis; every further variable
    gets its own y-axis on the right side of the plot.

    :param ds: Dataset or Dataframe that contains the variable(s) named by *var_names*.
    :param var_names: The name of the variable(s) to plot
    :param fmt: optional semicolon-separated matplotlib formats,
           e.g.
           1 variable - "b.-"
           2 variables - "b.-;r+:"
           If fewer formats than variables are given, the formats are reused
           cyclically, starting again with the first one. If no format is
           given, a built-in colour cycle is used.
           For full reference on matplotlib plot() function, refer to
           https://matplotlib.org/api/_as_gen/matplotlib.pyplot.plot.html
    :param file: path to a file in which to save the plot
    :param label: dimension name to be selected as the x-axis of the plot
    :param indexers: Optional indexers into data array of *var_names*. The *indexers* is a dictionary
           or a comma-separated string of key-value pairs that maps the variable's dimension names
           to constant labels. e.g. "lat=12.4, time='2012-05-02'".
    :param title: an optional plot title; if omitted, a title is built from
           the variable names, *label* and *indexers*
    :return: a matplotlib figure object or None if in IPython mode
    :raise ValidationError: If no variable name is given or a variable still
           has more than one dimension after applying the indexers.
    """
    ds = DatasetLike.convert(ds)

    # An empty or missing *fmt* yields an empty list, which selects the
    # built-in colour cycle below (an empty string no longer leads to a
    # division by zero).
    fmt_list = fmt.split(";") if fmt else []
    fmt_count = len(fmt_list)

    if not var_names:
        raise ValidationError("Missing name for 'vars'")

    figure = plt.figure()
    ax = figure.add_subplot(111)
    # Leave room on the right for the additional y-axes.
    figure.subplots_adjust(right=0.65)

    var_names = VarNamesLike.convert(var_names)
    if not title:
        if label:
            title = ','.join(var_names) + ' over ' + label
        else:
            title = ','.join(var_names)
        if indexers:
            title = title + '\n' + ' at ' + json.dumps(indexers).strip('"')
    ax.set_title(title)

    indexers = DictLike.convert(indexers)

    ax_var = {}
    predefined_fmt = ['r', 'g', 'b', 'c', 'm', 'y', 'k']

    if label:
        ds = get_vars_data(ds, indexers, remaining_dims=[label])
    else:
        ds = get_vars_data(ds, indexers)

    for i, var_name in enumerate(var_names):
        var = ds[var_name]
        if len(var.dims) > 1:
            raise ValidationError(
                f'Unable to plot because variable {var_name} has more than one dimension: {var.dims}.'
                f' To specify value(s) of these dimension(s), please use the indexers.'
            )

        var_label = var_name + ' (' + var.attrs['units'] + ')' if 'units' in var.attrs else var_name

        if fmt_list:
            selected_fmt = fmt_list[i % fmt_count]
        else:
            selected_fmt = predefined_fmt[i % len(predefined_fmt)]

        if label:
            x_axis = var[label]
        elif 'time' in var.coords:
            # Look up the coordinate explicitly: `'time' in var` tests the
            # array *values* in current xarray versions.
            x_axis = var.time
        else:
            x_axis = []

        # The first character of the format is used as the axis colour.
        axis_color = selected_fmt[0]

        # to differentiate the creation of y-axis of the first and the nth variable
        if i == 0:
            if len(x_axis) > 0:
                ax.plot(x_axis, var, selected_fmt)
            else:
                ax.plot(var, selected_fmt)
            ax.set_ylabel(var_label, wrap=True)
            ax.yaxis.label.set_color(axis_color)
            ax.tick_params(axis='y', colors=axis_color)
        else:
            twin = ax.twinx()
            ax_var[var_name] = twin
            if len(ax_var) > 1:
                # Shift every further right-hand axis outwards so that the
                # axes do not overlap.
                twin.spines["right"].set_position(("axes", 1 + ((i - 1) * 0.2)))
                twin.set_frame_on(True)
                twin.patch.set_visible(False)
            if len(x_axis) > 0:
                twin.plot(x_axis, var, selected_fmt)
            else:
                twin.plot(var, selected_fmt)
            twin.set_ylabel(var_label, wrap=True)
            twin.yaxis.label.set_color(axis_color)
            twin.tick_params(axis='y', colors=axis_color)

    ax.tick_params(axis='x', rotation=45)
    if label in ds and 'long_name' in ds[label].attrs:
        ax.set_xlabel(ds[label].attrs['long_name'])
    figure.tight_layout()

    if file:
        figure.savefig(file, dpi=600)

    return figure if not in_notebook() else None
def test_format(self):
    # (value, expected text) pairs, checked in this order.
    cases = (
        (['aa', 'bb', 'cc'], "aa, bb, cc"),
        (['aa'], "aa"),
        ([], ""),
        (None, ""),
    )
    for value, expected in cases:
        self.assertEqual(VarNamesLike.format(value), expected)
def open_dataset(self,
                 time_range: TimeRangeLike.TYPE = None,
                 region: PolygonLike.TYPE = None,
                 var_names: VarNamesLike.TYPE = None,
                 protocol: str = None,
                 monitor: Monitor = Monitor.NONE) -> Any:
    """
    Open the files of this data source as one dataset.

    :param time_range: Optional time range. If given, only files whose
           recorded coverage lies within it are opened; files without a
           recorded coverage are skipped in that case.
    :param region: Optional region, handed on to ``open_xarray_dataset``.
    :param var_names: Optional names of the variables to open.
    :param protocol: Not used by this implementation.
    :param monitor: A progress monitor.
    :return: The result of ``open_xarray_dataset`` for the selected files.
    :raise: The error built by ``_empty_error`` if no file is selected, or
            the error built by ``_cannot_access_error`` if opening fails with
            an HTTP, URL, socket timeout or OS error.
    """
    time_range = TimeRangeLike.convert(time_range) if time_range else None
    var_names = VarNamesLike.convert(var_names) if var_names else None

    paths = []
    if time_range:
        for file_path, coverage in self._files.items():
            if not coverage:
                continue
            if isinstance(coverage, Tuple):
                # (start, end) coverage must lie completely inside the range.
                selected = coverage[0] >= time_range[0] and coverage[1] <= time_range[1]
            elif isinstance(coverage, datetime):
                # Single time stamp: start inclusive, end exclusive.
                selected = time_range[0] <= coverage < time_range[1]
            else:
                selected = False
            if selected:
                paths.extend(self._resolve_file_path(file_path))
    else:
        for file_path in self._files.keys():
            paths.extend(self._resolve_file_path(file_path))

    if not paths:
        raise self._empty_error(time_range)

    # Remove duplicates and establish a stable order.
    paths = sorted(set(paths))
    try:
        excluded_variables = self._meta_info.get('exclude_variables')
        drop_variables = ([variable.get('name') for variable in excluded_variables]
                          if excluded_variables else None)
        # TODO: combine var_names and drop_variables
        return open_xarray_dataset(paths,
                                   region=region,
                                   var_names=var_names,
                                   drop_variables=drop_variables,
                                   monitor=monitor)
    except HTTPError as e:
        raise self._cannot_access_error(time_range, region, var_names,
                                        verb="open", cause=e) from e
    except (URLError, socket.timeout) as e:
        raise self._cannot_access_error(time_range, region, var_names,
                                        verb="open", cause=e, error_cls=NetworkError) from e
    except OSError as e:
        raise self._cannot_access_error(time_range, region, var_names,
                                        verb="open", cause=e) from e
def _make_local(self,
                local_ds: 'LocalDataSource',
                time_range: TimeRangeLike.TYPE = None,
                region: PolygonLike.TYPE = None,
                var_names: VarNamesLike.TYPE = None,
                monitor: Monitor = Monitor.NONE):
    """
    Copy the files of this data source into the directory of *local_ds*.

    Files whose coverage is not a ``(start, end)`` tuple, or whose coverage is not
    completely inside *time_range*, are skipped. If neither *region* nor *var_names*
    is given, files are copied as they are; otherwise each file is opened, reduced to
    the requested variables and/or region, and written as a new NetCDF file.

    :param local_ds: The local data source that receives the files.
    :param time_range: Optional time range used to select files.
    :param region: Optional region used for spatial sub-setting.
    :param var_names: Optional names of the data variables to keep.
    :param monitor: A progress monitor.
    :return: The identifier of *local_ds*.
    """
    local_id = local_ds.id
    time_range = TimeRangeLike.convert(time_range) if time_range else None
    var_names = VarNamesLike.convert(var_names) if var_names else None  # type: Sequence

    compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL', NETCDF_COMPRESSION_LEVEL)
    compression_enabled = compression_level > 0
    encoding_update = {'zlib': True, 'complevel': compression_level} if compression_enabled else {}

    data_store_path = local_ds.data_store.data_store_path
    local_path = os.path.join(data_store_path, local_id)
    os.makedirs(local_path, exist_ok=True)

    monitor.start("Sync " + self.id, total_work=len(self._files))
    for remote_relative_filepath, coverage in self._files.items():
        child_monitor = monitor.child(work=1)

        if not isinstance(coverage, Tuple):
            continue
        time_coverage_start = coverage[0]
        time_coverage_end = coverage[1]
        if time_range and not (time_coverage_start >= time_range[0]
                               and time_coverage_end <= time_range[1]):
            continue

        file_name = os.path.basename(remote_relative_filepath)
        local_relative_filepath = os.path.join(local_id, file_name)
        local_absolute_filepath = os.path.join(data_store_path, local_relative_filepath)
        remote_absolute_filepath = os.path.join(self._data_store.data_store_path,
                                                remote_relative_filepath)

        if region or var_names:
            # The context manager closes the source file even if sub-setting or writing fails.
            with xr.open_dataset(remote_absolute_filepath) as source_dataset:
                remote_dataset = source_dataset
                try:
                    if var_names:
                        remote_dataset = remote_dataset.drop(
                            [var_name for var_name in remote_dataset.data_vars.keys()
                             if var_name not in var_names])

                    if region:
                        remote_dataset = normalize_impl(remote_dataset)
                        remote_dataset = adjust_spatial_attrs_impl(
                            subset_spatial_impl(remote_dataset, region),
                            allow_point=False)
                        # Bounding box of the subset, taken from the adjusted spatial attributes
                        local_ds.meta_info['bbox_maxx'] = remote_dataset.attrs['geospatial_lon_max']
                        local_ds.meta_info['bbox_minx'] = remote_dataset.attrs['geospatial_lon_min']
                        local_ds.meta_info['bbox_maxy'] = remote_dataset.attrs['geospatial_lat_max']
                        local_ds.meta_info['bbox_miny'] = remote_dataset.attrs['geospatial_lat_min']

                    if compression_enabled:
                        for sel_var_name in remote_dataset.variables.keys():
                            remote_dataset.variables.get(sel_var_name).encoding.update(encoding_update)

                    remote_dataset.to_netcdf(local_absolute_filepath)
                    child_monitor.progress(work=1, msg=str(time_coverage_start))
                finally:
                    # Keep only meta info of variables that are present in the (sub-setted)
                    # dataset and that are not dimensions. This is refreshed for every file.
                    variables_info = local_ds.meta_info.get('variables', [])
                    local_ds.meta_info['variables'] = [
                        var_info for var_info in variables_info
                        if var_info.get('name') in remote_dataset.variables.keys()
                        and var_info.get('name') not in remote_dataset.dims.keys()
                    ]
        else:
            shutil.copy(remote_absolute_filepath, local_absolute_filepath)

        # Register the file only after it has been written successfully, so that a
        # failed transfer does not leave a broken entry in the local data source.
        local_ds.add_dataset(local_relative_filepath, (time_coverage_start, time_coverage_end))
        child_monitor.done()

    monitor.done()
    return local_id
def _make_local(self,
                local_ds: 'LocalDataSource',
                time_range: TimeRangeLike.TYPE = None,
                region: PolygonLike.TYPE = None,
                var_names: VarNamesLike.TYPE = None,
                monitor: Monitor = Monitor.NONE):
    """
    Copy the files of this data source into the directory of *local_ds*.

    Files whose coverage is not a ``(start, end)`` tuple, or whose coverage is not
    completely inside *time_range*, are skipped. If neither *region* nor *var_names*
    is given, files are copied as they are; otherwise each file is opened, reduced to
    the requested variables and/or region, and written as a new NetCDF file.

    :param local_ds: The local data source that receives the files.
    :param time_range: Optional time range used to select files.
    :param region: Optional region used for spatial sub-setting.
    :param var_names: Optional names of the data variables to keep.
    :param monitor: A progress monitor.
    :return: The identifier of *local_ds*.
    """
    local_id = local_ds.id
    time_range = TimeRangeLike.convert(time_range) if time_range else None
    var_names = VarNamesLike.convert(var_names) if var_names else None  # type: Sequence

    compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL',
                                         NETCDF_COMPRESSION_LEVEL)
    compression_enabled = compression_level > 0
    encoding_update = {}
    if compression_enabled:
        encoding_update = {'zlib': True, 'complevel': compression_level}

    data_store_path = local_ds.data_store.data_store_path
    local_path = os.path.join(data_store_path, local_id)
    os.makedirs(local_path, exist_ok=True)

    monitor.start("Sync " + self.id, total_work=len(self._files))
    for remote_relative_filepath, coverage in self._files.items():
        child_monitor = monitor.child(work=1)

        # Only files with a (start, end) coverage inside the requested time range are transferred
        if not isinstance(coverage, Tuple):
            continue
        time_coverage_start = coverage[0]
        time_coverage_end = coverage[1]
        in_time_range = (not time_range
                         or (time_coverage_start >= time_range[0]
                             and time_coverage_end <= time_range[1]))
        if not in_time_range:
            continue

        file_name = os.path.basename(remote_relative_filepath)
        local_relative_filepath = os.path.join(local_id, file_name)
        local_absolute_filepath = os.path.join(data_store_path,
                                               local_relative_filepath)
        remote_absolute_filepath = os.path.join(self._data_store.data_store_path,
                                                remote_relative_filepath)

        if not (region or var_names):
            shutil.copy(remote_absolute_filepath, local_absolute_filepath)
        else:
            # The context manager closes the source file even if sub-setting or writing fails.
            with xr.open_dataset(remote_absolute_filepath) as source_dataset:
                remote_dataset = source_dataset
                try:
                    if var_names:
                        remote_dataset = remote_dataset.drop([
                            var_name
                            for var_name in remote_dataset.data_vars.keys()
                            if var_name not in var_names
                        ])

                    if region:
                        remote_dataset = normalize_impl(remote_dataset)
                        remote_dataset = adjust_spatial_attrs_impl(
                            subset_spatial_impl(remote_dataset, region),
                            allow_point=False)
                        # Bounding box of the subset, taken from the adjusted spatial attributes
                        attrs = remote_dataset.attrs
                        local_ds.meta_info['bbox_maxx'] = attrs['geospatial_lon_max']
                        local_ds.meta_info['bbox_minx'] = attrs['geospatial_lon_min']
                        local_ds.meta_info['bbox_maxy'] = attrs['geospatial_lat_max']
                        local_ds.meta_info['bbox_miny'] = attrs['geospatial_lat_min']

                    if compression_enabled:
                        for sel_var_name in remote_dataset.variables.keys():
                            remote_dataset.variables.get(
                                sel_var_name).encoding.update(encoding_update)

                    remote_dataset.to_netcdf(local_absolute_filepath)
                    child_monitor.progress(work=1, msg=str(time_coverage_start))
                finally:
                    # Keep only meta info of variables that are present in the (sub-setted)
                    # dataset and that are not dimensions. This is refreshed for every file.
                    variables_info = local_ds.meta_info.get('variables', [])
                    local_ds.meta_info['variables'] = [
                        var_info for var_info in variables_info
                        if var_info.get('name') in remote_dataset.variables.keys()
                        and var_info.get('name') not in remote_dataset.dims.keys()
                    ]

        # Register the file only after it has been written successfully, so that a
        # failed transfer does not leave a broken entry in the local data source.
        local_ds.add_dataset(local_relative_filepath,
                             (time_coverage_start, time_coverage_end))
        child_monitor.done()

    monitor.done()
    return local_id
def write_csv(obj: DataFrameLike.TYPE,
              file: FileLike.TYPE,
              columns: VarNamesLike.TYPE = None,
              na_rep: str = '',
              delimiter: str = ',',
              quotechar: str = None,
              more_args: DictLike.TYPE = None,
              monitor: Monitor = Monitor.NONE):
    """
    Write comma-separated values (CSV) to plain text file from a DataFrame or Dataset.

    :param obj: The object to write as CSV; must be a ``DataFrame`` or a ``Dataset``.
    :param file: The CSV file path.
    :param columns: The names of variables that should be converted to columns. If given,
           coordinate variables are included automatically.
    :param delimiter: Delimiter to use.
    :param na_rep: A string representation of a missing value (no-data value).
           Only used if *obj* is a ``DataFrame``.
    :param quotechar: The character used to denote the start and end of a quoted item.
           Quoted items can include the delimiter and it will be ignored.
           Only used if *obj* is a ``DataFrame``.
    :param more_args: Other optional keyword arguments, only used if *obj* is a ``DataFrame``.
           Please refer to Pandas documentation of ``pandas.to_csv()`` function.
    :param monitor: optional progress monitor
    :raise ValidationError: If *obj* is ``None`` or of an unsupported type, if the selected
           dataset variables do not share the same dimensions, or if no variable is selected.
    :raise ValueError: If a dimension of a dataset has no coordinate variable.
    """
    if obj is None:
        raise ValidationError('obj must not be None')

    columns = VarNamesLike.convert(columns)

    if isinstance(obj, pd.DataFrame):
        # Pandas treats any keyword given in kwargs as being set, even if it is just None,
        # therefore only options that actually have a value are passed on.
        # A copy is used so that the dictionary passed by the caller is not modified.
        kwargs = dict(DictLike.convert(more_args) or {})
        if columns:
            kwargs.update(columns=columns)
        if delimiter:
            kwargs.update(sep=delimiter)
        if na_rep:
            kwargs.update(na_rep=na_rep)
        if quotechar:
            kwargs.update(quotechar=quotechar)
        with monitor.starting('Writing to CSV', 1):
            obj.to_csv(file, index_label='index', **kwargs)
            monitor.progress(1)

    elif isinstance(obj, xr.Dataset):
        var_names = [var_name for var_name in obj.data_vars
                     if columns is None or var_name in columns]

        # All selected variables must share the same dimensions to fit into one table
        dim_names = None
        data_vars = []
        for var_name in var_names:
            data_var = obj.data_vars[var_name]
            if dim_names is None:
                dim_names = data_var.dims
            elif dim_names != data_var.dims:
                raise ValidationError('Not all variables have the same dimensions. '
                                      'Please select variables so that their dimensions are equal.')
            data_vars.append(data_var)
        if dim_names is None:
            raise ValidationError('None of the selected variables has a dimension.')

        # Find one coordinate variable per dimension
        coord_vars = []
        for dim_name in dim_names:
            if dim_name in obj.coords:
                coord_var = obj.coords[dim_name]
            else:
                # Fall back to the first 1-D coordinate defined on that dimension
                coord_var = None
                for candidate in obj.coords.values():
                    if len(candidate.dims) == 1 and candidate.dims[0] == dim_name:
                        coord_var = candidate
                        break
                if coord_var is None:
                    raise ValueError(f'No coordinate variable found for dimension "{dim_name}"')
            coord_vars.append(coord_var)

        coord_indexes = [range(len(coord_var)) for coord_var in coord_vars]
        num_rows = 1
        for coord_var in coord_vars:
            num_rows *= len(coord_var)

        stream = open(file, 'w') if isinstance(file, str) else file
        try:
            # Write header row
            header = ['index']
            header.extend(coord_var.name for coord_var in coord_vars)
            header.extend(data_var.name for data_var in data_vars)
            stream.write(delimiter.join(header))
            stream.write('\n')

            # Fetch the value arrays once instead of once per written cell
            coord_values = [coord_var.values for coord_var in coord_vars]
            data_values = [data_var.values for data_var in data_vars]

            with monitor.starting('Writing CSV', num_rows):
                for row, index in enumerate(itertools.product(*coord_indexes)):
                    # Write data row: row number, coordinate values, variable values
                    cells = [str(row)]
                    cells.extend(str(values[i]) for values, i in zip(coord_values, index))
                    cells.extend(str(values[index]) for values in data_values)
                    stream.write(delimiter.join(cells))
                    stream.write('\n')
                    monitor.progress(1)
        finally:
            # Only close streams that were opened here
            if isinstance(file, str):
                stream.close()

    else:
        raise ValidationError('obj must be a pandas.DataFrame or a xarray.Dataset')
def _make_local(self,
                local_ds: LocalDataSource,
                time_range: TimeRangeLike.TYPE = None,
                region: PolygonLike.TYPE = None,
                var_names: VarNamesLike.TYPE = None,
                monitor: Monitor = Monitor.NONE):
    """
    Transfer the files of this remote data source into the directory of *local_ds*.

    If *region* or *var_names* is given, every selected file is opened through the
    OPeNDAP URL, reduced to the requested variables and/or region and written as a
    NetCDF file. Otherwise, files that are missing locally or whose size differs from
    the recorded size are downloaded through the HTTP URL. Finally, the temporal
    coverage and the excluded variables are stored in the meta info of *local_ds*,
    and *local_ds* is saved.

    :param local_ds: The local data source that receives the files.
    :param time_range: Optional time range used to select files.
    :param region: Optional region used for spatial sub-setting.
    :param var_names: Optional names of the data variables to keep.
    :param monitor: A progress monitor.
    :raise DataAccessError: If no files are found for *time_range*, or if the transfer
           fails with an ``OSError`` or ``ValueError``.
    """
    local_id = local_ds.id
    time_range = TimeRangeLike.convert(time_range)
    var_names = VarNamesLike.convert(var_names)

    excluded_variables = get_exclude_variables_fix_known_issues(self.id)

    compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL', NETCDF_COMPRESSION_LEVEL)
    compression_enabled = compression_level > 0
    encoding_update = {'zlib': True, 'complevel': compression_level} if compression_enabled else {}

    do_update_of_verified_time_coverage_start_once = True
    verified_time_coverage_start = None
    verified_time_coverage_end = None

    protocol = _ODP_PROTOCOL_OPENDAP if (region or var_names) else _ODP_PROTOCOL_HTTP

    local_path = os.path.join(local_ds.data_store.data_store_path, local_id)
    os.makedirs(local_path, exist_ok=True)

    selected_file_list = self._find_files(time_range)
    if not selected_file_list:
        msg = 'CCI Open Data Portal data source "{}"\ndoes not seem to have any datasets'.format(self.id)
        if time_range is not None:
            msg += ' in given time range {}'.format(TimeRangeLike.format(time_range))
        raise DataAccessError(msg)

    try:
        if protocol == _ODP_PROTOCOL_OPENDAP:
            do_update_of_variables_meta_info_once = True
            do_update_of_region_meta_info_once = True

            drop_variables = [variable.get('name') for variable in excluded_variables]
            files = self._get_urls_list(selected_file_list, protocol)
            # 'starting' pairs the start of the monitor with a final done(), also on failure
            with monitor.starting('Sync ' + self.id, len(files)):
                for idx, dataset_uri in enumerate(files):
                    child_monitor = monitor.child(work=1)

                    file_name = os.path.basename(dataset_uri)
                    local_filepath = os.path.join(local_path, file_name)

                    time_coverage_start = selected_file_list[idx][1]
                    time_coverage_end = selected_file_list[idx][2]

                    child_monitor.start(label=file_name, total_work=1)

                    # The context manager closes the remote dataset even if writing fails
                    with xr.open_dataset(dataset_uri, drop_variables=drop_variables) as source_dataset:
                        remote_dataset = source_dataset
                        if var_names:
                            remote_dataset = remote_dataset.drop(
                                [var_name for var_name in remote_dataset.data_vars.keys()
                                 if var_name not in var_names])

                        if region:
                            remote_dataset = normalize_impl(remote_dataset)
                            remote_dataset = adjust_spatial_attrs_impl(
                                subset_spatial_impl(remote_dataset, region),
                                allow_point=False)
                            if do_update_of_region_meta_info_once:
                                local_ds.meta_info['bbox_minx'] = remote_dataset.attrs['geospatial_lon_min']
                                local_ds.meta_info['bbox_maxx'] = remote_dataset.attrs['geospatial_lon_max']
                                local_ds.meta_info['bbox_maxy'] = remote_dataset.attrs['geospatial_lat_max']
                                local_ds.meta_info['bbox_miny'] = remote_dataset.attrs['geospatial_lat_min']
                                do_update_of_region_meta_info_once = False

                        if compression_enabled:
                            for sel_var_name in remote_dataset.variables.keys():
                                remote_dataset.variables.get(sel_var_name).encoding.update(encoding_update)

                        remote_dataset.to_netcdf(local_filepath)
                        child_monitor.progress(work=1, msg=str(time_coverage_start))

                        if do_update_of_variables_meta_info_once:
                            # Keep only meta info of variables that are present in the
                            # written dataset and that are not dimensions
                            variables_info = local_ds.meta_info.get('variables', [])
                            local_ds.meta_info['variables'] = [
                                var_info for var_info in variables_info
                                if var_info.get('name') in remote_dataset.variables.keys()
                                and var_info.get('name') not in remote_dataset.dims.keys()
                            ]
                            do_update_of_variables_meta_info_once = False

                    local_ds.add_dataset(os.path.join(local_id, file_name),
                                         (time_coverage_start, time_coverage_end))

                    if do_update_of_verified_time_coverage_start_once:
                        verified_time_coverage_start = time_coverage_start
                        do_update_of_verified_time_coverage_start_once = False
                    verified_time_coverage_end = time_coverage_end
                    child_monitor.done()
        else:
            outdated_file_list = []
            for file_rec in selected_file_list:
                filename, _, _, file_size, url = file_rec
                dataset_file = os.path.join(local_path, filename)
                # todo (forman, 20160915): must perform better checks on dataset_file if it is...
                # ... outdated or incomplete or corrupted.
                # JSON also includes "checksum" and "checksum_type" fields.
                if not os.path.isfile(dataset_file) \
                        or (file_size and os.path.getsize(dataset_file) != file_size):
                    outdated_file_list.append(file_rec)

            if outdated_file_list:
                with monitor.starting('Sync ' + self.id, len(outdated_file_list)):
                    # A missing file size is tolerated by the check above, so it must
                    # not break the computation of the total download size either.
                    bytes_to_download = sum(file_rec[3] or 0 for file_rec in outdated_file_list)
                    dl_stat = _DownloadStatistics(bytes_to_download)

                    file_number = 1
                    for filename, coverage_from, coverage_to, file_size, url in outdated_file_list:
                        dataset_file = os.path.join(local_path, filename)
                        sub_monitor = monitor.child(work=1.0)

                        # noinspection PyUnusedLocal
                        def reporthook(block_number, read_size, total_file_size):
                            dl_stat.handle_chunk(read_size)
                            sub_monitor.progress(work=read_size, msg=str(dl_stat))

                        sub_monitor_msg = "file %d of %d" % (file_number, len(outdated_file_list))
                        with sub_monitor.starting(sub_monitor_msg, file_size):
                            urllib.request.urlretrieve(url[protocol], filename=dataset_file,
                                                       reporthook=reporthook)
                        file_number += 1
                        local_ds.add_dataset(os.path.join(local_id, filename),
                                             (coverage_from, coverage_to))

                        if do_update_of_verified_time_coverage_start_once:
                            verified_time_coverage_start = coverage_from
                            do_update_of_verified_time_coverage_start_once = False
                        verified_time_coverage_end = coverage_to
    except (OSError, ValueError) as e:
        raise DataAccessError("Copying remote data source failed: {}".format(e), source=self) from e

    local_ds.meta_info['temporal_coverage_start'] = TimeLike.format(verified_time_coverage_start)
    local_ds.meta_info['temporal_coverage_end'] = TimeLike.format(verified_time_coverage_end)
    local_ds.meta_info['exclude_variables'] = excluded_variables
    local_ds.save(True)
def test_make_local(self):
    """
    Make local copies of the 'local_w_temporal' data source: by time range only,
    by time range and variable, by time range, variable and region, and for a
    time range for which ``make_local`` is expected to return ``None``.
    """
    data_source = self._local_data_store.query('local_w_temporal')[0]

    # All successful copies use the same two days in November 1978
    range_start = datetime.datetime(1978, 11, 14, 0, 0)
    range_end = datetime.datetime(1978, 11, 15, 23, 59)
    expected_variables = {'sm', 'lat', 'lon', 'time'}

    with unittest.mock.patch.object(EsaCciOdpDataStore, 'query', return_value=[]):
        # Copy restricted by time range only
        new_ds_title = 'from_local_to_local'
        new_ds = data_source.make_local(new_ds_title,
                                        time_range=TimeRangeLike.convert((range_start, range_end)))
        self.assertIsNotNone(new_ds)
        self.assertEqual(new_ds.id, "local.%s" % new_ds_title)
        self.assertEqual(new_ds.temporal_coverage(),
                         TimeRangeLike.convert((range_start, range_end)))

        # Copy restricted by time range and variable
        new_ds_2_title = 'from_local_to_local_var'
        new_ds_w_one_variable = data_source.make_local(
            new_ds_2_title,
            time_range=TimeRangeLike.convert((range_start, range_end)),
            var_names=VarNamesLike.convert(['sm']))
        self.assertIsNotNone(new_ds_w_one_variable)
        self.assertEqual(new_ds_w_one_variable.id, "local.%s" % new_ds_2_title)
        data_set = new_ds_w_one_variable.open_dataset()
        self.assertSetEqual(set(data_set.variables), expected_variables)

        # Copy restricted by time range, variable and region
        new_ds_3_title = 'from_local_to_local_range'
        new_ds_w_region = data_source.make_local(
            new_ds_3_title,
            time_range=TimeRangeLike.convert((range_start, range_end)),
            var_names=VarNamesLike.convert(['sm']),
            region=PolygonLike.convert("10,10,20,20"))  # type: LocalDataSource
        self.assertIsNotNone(new_ds_w_region)
        self.assertEqual(new_ds_w_region.id, "local.%s" % new_ds_3_title)
        self.assertEqual(new_ds_w_region.spatial_coverage(), PolygonLike.convert("10,10,20,20"))
        data_set = new_ds_w_region.open_dataset()
        self.assertSetEqual(set(data_set.variables), expected_variables)

        # A time range in 2020 is expected to yield no local data source at all
        no_data = data_source.make_local('no_data',
                                         time_range=(datetime.datetime(2020, 11, 14, 0, 0),
                                                     datetime.datetime(2020, 11, 15, 23, 59)))
        self.assertIsNone(no_data)
def make_local(self,
               local_name: str,
               local_id: str = None,
               time_range: TimeRangeLike.TYPE = None,
               region: PolygonLike.TYPE = None,
               var_names: VarNamesLike.TYPE = None,
               monitor: Monitor = Monitor.NONE) -> Optional[DataSource]:
    """
    Create a data source in the 'local' data store from this data source,
    optionally constrained by time range, region and variable names.

    :param local_name: Identifier of the new local data source. If empty, an
           identifier is generated from this data source's id and a UUID.
    :param local_id: Used as title of the new local data source.
    :param time_range: Optional time range constraint.
    :param region: Optional region constraint.
    :param var_names: Optional variable names constraint.
    :param monitor: A progress monitor.
    :return: The local data source, an already existing matching one, or ``None``
             if no data source could be created or if it ended up empty.
    :raise ValueError: If the 'local' data store cannot be initialised, or if it
           already contains a data source of that name with a different UUID.
    """
    time_range = TimeRangeLike.convert(time_range) if time_range else None
    region = PolygonLike.convert(region) if region else None
    var_names = VarNamesLike.convert(var_names) if var_names else None

    # Note the naming: *local_name* is the identifier, *local_id* the title
    ds_id, title = local_name, local_id

    local_store = DATA_STORE_REGISTRY.get_data_store('local')
    if not local_store:
        add_to_data_store_registry()
        local_store = DATA_STORE_REGISTRY.get_data_store('local')
        if not local_store:
            raise ValueError('Cannot initialize `local` DataStore')

    uuid = LocalDataStore.generate_uuid(ref_id=self.id,
                                        time_range=time_range,
                                        region=region,
                                        var_names=var_names)

    if ds_id:
        candidates = local_store.query(ds_id='local.%s' % ds_id)
        if len(candidates) == 1:
            # Same name: reuse only if it was created with the same constraints
            if candidates[0].meta_info.get('uuid', None) != uuid:
                raise ValueError('Datastore {} already contains dataset {}'.format(local_store.id, ds_id))
            return candidates[0]
    else:
        ds_id = "local.{}.{}".format(self.id, uuid)
        candidates = local_store.query(ds_id=ds_id)
        if len(candidates) == 1:
            return candidates[0]

    local_meta_info = self.meta_info.copy()
    local_meta_info['ref_uuid'] = local_meta_info.get('uuid', None)
    local_meta_info['uuid'] = uuid

    local_ds = local_store.create_data_source(ds_id,
                                              title=title,
                                              time_range=time_range,
                                              region=region,
                                              var_names=var_names,
                                              meta_info=local_meta_info,
                                              lock_file=True)
    if not local_ds:
        return None

    if not local_ds.is_complete:
        try:
            self._make_local(local_ds, time_range, region, var_names, monitor=monitor)
        except Cancellation:
            # A cancelled transfer never leaves a data source behind
            local_store.remove_data_source(local_ds)
            raise
        except Exception:
            # After a failure, the data source is kept if it received any data
            if local_ds.is_empty:
                local_store.remove_data_source(local_ds)
            raise

    if local_ds.is_empty:
        local_store.remove_data_source(local_ds)
        return None

    local_store.register_ds(local_ds)
    return local_ds
def _make_local(self,
                local_ds: 'LocalDataSource',
                time_range: TimeRangeLike.TYPE = None,
                region: PolygonLike.TYPE = None,
                var_names: VarNamesLike.TYPE = None,
                monitor: Monitor = Monitor.NONE):
    """
    Copy the files of this data source into the directory of *local_ds*.

    Files whose coverage is not a ``(start, end)`` tuple, or whose coverage is not
    completely inside *time_range*, are skipped. If neither *region* nor *var_names*
    is given, files are copied as they are. Otherwise each file is read through a
    ``NetCDF4DataStore``, optionally cut to the bounding box of *region* by index
    along ``lat`` and ``lon``, and written variable by variable into a new file.

    :param local_ds: The local data source that receives the files.
    :param time_range: Optional time range used to select files.
    :param region: Optional region; its bounds are used for spatial sub-setting.
    :param var_names: Optional names of the variables to write. Coordinate variables
           are always written in addition.
    :param monitor: A progress monitor.
    :return: The identifier of *local_ds*.
    """
    local_id = local_ds.id
    time_range = TimeRangeLike.convert(time_range) if time_range else None
    region = PolygonLike.convert(region) if region else None
    var_names = VarNamesLike.convert(var_names) if var_names else None  # type: Sequence

    compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL', NETCDF_COMPRESSION_LEVEL)
    compression_enabled = compression_level > 0
    encoding_update = {'zlib': True, 'complevel': compression_level} if compression_enabled else {}

    data_store_path = local_ds.data_store.data_store_path
    local_path = os.path.join(data_store_path, local_id)
    os.makedirs(local_path, exist_ok=True)

    monitor.start("Sync " + self.id, total_work=len(self._files))
    for remote_relative_filepath, coverage in self._files.items():
        child_monitor = monitor.child(work=1)

        if not isinstance(coverage, Tuple):
            continue
        time_coverage_start = coverage[0]
        time_coverage_end = coverage[1]
        if time_range and not (time_coverage_start >= time_range[0]
                               and time_coverage_end <= time_range[1]):
            continue

        file_name = os.path.basename(remote_relative_filepath)
        local_relative_filepath = os.path.join(local_id, file_name)
        local_absolute_filepath = os.path.join(data_store_path, local_relative_filepath)
        remote_absolute_filepath = os.path.join(self._data_store.data_store_path,
                                                remote_relative_filepath)

        if region or var_names:
            remote_netcdf = None
            local_netcdf = None
            try:
                remote_netcdf = NetCDF4DataStore(remote_absolute_filepath)
                local_netcdf = NetCDF4DataStore(local_absolute_filepath, mode='w', persist=True)
                local_netcdf.set_attributes(remote_netcdf.get_attrs())

                remote_dataset = xr.Dataset.load_store(remote_netcdf)

                geo_lat_min = geo_lat_max = geo_lon_min = geo_lon_max = None
                process_region = False
                if region:
                    attrs = remote_dataset.attrs
                    geo_lat_min = self._get_harmonized_coordinate_value(attrs, 'geospatial_lat_min')
                    geo_lat_max = self._get_harmonized_coordinate_value(attrs, 'geospatial_lat_max')
                    geo_lon_min = self._get_harmonized_coordinate_value(attrs, 'geospatial_lon_min')
                    geo_lon_max = self._get_harmonized_coordinate_value(attrs, 'geospatial_lon_max')
                    # Each resolution is read from the attribute of its own axis
                    geo_lat_res = self._get_harmonized_coordinate_value(attrs, 'geospatial_lat_resolution')
                    geo_lon_res = self._get_harmonized_coordinate_value(attrs, 'geospatial_lon_resolution')

                    # Sub-setting by index is only possible if all spatial attributes are known
                    if not (isnan(geo_lat_min) or isnan(geo_lat_max)
                            or isnan(geo_lon_min) or isnan(geo_lon_max)
                            or isnan(geo_lat_res) or isnan(geo_lon_res)):
                        process_region = True

                        [lon_min, lat_min, lon_max, lat_max] = region.bounds

                        # Coordinates whose first value is greater than their last value
                        descending_data_order = set()
                        for coord_name in remote_dataset.coords.keys():
                            coord = remote_dataset.coords[coord_name]
                            if coord[0] > coord[-1]:
                                descending_data_order.add(coord_name)

                        # Turn the region bounds into offsets from the first coordinate value
                        if 'lat' not in descending_data_order:
                            lat_min = lat_min - geo_lat_min
                            lat_max = lat_max - geo_lat_min
                        else:
                            lat_min, lat_max = geo_lat_max - lat_max, geo_lat_max - lat_min

                        if 'lon' not in descending_data_order:
                            lon_min = lon_min - geo_lon_min
                            lon_max = lon_max - geo_lon_min
                        else:
                            lon_min, lon_max = geo_lon_max - lon_max, geo_lon_max - lon_min

                        # Turn the offsets into array indexes
                        lat_min = int(floor(lat_min / geo_lat_res))
                        lat_max = int(ceil(lat_max / geo_lat_res))
                        lon_min = int(floor(lon_min / geo_lon_res))
                        lon_max = int(ceil(lon_max / geo_lon_res))

                        remote_dataset = remote_dataset.isel(drop=False,
                                                             lat=slice(lat_min, lat_max),
                                                             lon=slice(lon_min, lon_max))

                        # Recompute the spatial extent of the subset from the indexes
                        if 'lat' not in descending_data_order:
                            geo_lat_min, geo_lat_max = (lat_min * geo_lat_res + geo_lat_min,
                                                        lat_max * geo_lat_res + geo_lat_min)
                        else:
                            geo_lat_min, geo_lat_max = (geo_lat_max - lat_max * geo_lat_res,
                                                        geo_lat_max - lat_min * geo_lat_res)

                        if 'lon' not in descending_data_order:
                            geo_lon_min, geo_lon_max = (lon_min * geo_lon_res + geo_lon_min,
                                                        lon_max * geo_lon_res + geo_lon_min)
                        else:
                            geo_lon_min, geo_lon_max = (geo_lon_max - lon_max * geo_lon_res,
                                                        geo_lon_max - lon_min * geo_lon_res)

                # Build the list of names to write for this file only. The requested
                # *var_names* are left untouched, so that neither the caller's sequence
                # is modified nor the selection of one file leaks into the next file.
                if var_names:
                    selected_names = list(var_names)
                else:
                    selected_names = list(remote_netcdf.variables.keys())
                selected_names.extend([coord_name for coord_name in remote_dataset.coords.keys()
                                       if coord_name not in selected_names])

                child_monitor.start(label=file_name, total_work=len(selected_names))
                for sel_var_name in selected_names:
                    # Store one variable at a time
                    var_dataset = remote_dataset.drop(
                        [var_name for var_name in remote_dataset.variables.keys()
                         if var_name != sel_var_name])
                    if compression_enabled:
                        var_dataset.variables.get(sel_var_name).encoding.update(encoding_update)
                    local_netcdf.store_dataset(var_dataset)
                    child_monitor.progress(work=1, msg=sel_var_name)

                if process_region:
                    local_netcdf.set_attribute('geospatial_lat_min', geo_lat_min)
                    local_netcdf.set_attribute('geospatial_lat_max', geo_lat_max)
                    local_netcdf.set_attribute('geospatial_lon_min', geo_lon_min)
                    local_netcdf.set_attribute('geospatial_lon_max', geo_lon_max)
            finally:
                if remote_netcdf is not None:
                    remote_netcdf.close()
                if local_netcdf is not None:
                    local_netcdf.close()
        else:
            shutil.copy(remote_absolute_filepath, local_absolute_filepath)

        # Register the file only after it has been written successfully, so that a
        # failed transfer does not leave a broken entry in the local data source.
        local_ds.add_dataset(local_relative_filepath, (time_coverage_start, time_coverage_end))
        child_monitor.done()

    monitor.done()
    return local_id
def long_term_average(ds: DatasetLike.TYPE,
                      var: VarNamesLike.TYPE = None,
                      monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Compute the long term average of a monthly dataset.

    The values of each calendar month are averaged over the whole time range of the
    dataset (all Januaries, all Februaries, and so on), which results in a dataset
    with twelve time slices.

    For further information on climatological datasets, see
    http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#climatological-statistics

    :param ds: A monthly dataset to average
    :param var: If given, only these variables will be preserved in the resulting dataset
    :param monitor: A progress monitor
    :return: A climatological long term average dataset
    :raise ValueError: If the time coordinate is not of type datetime64[ns], or if the
           dataset is not marked as monthly by its 'time_coverage_resolution' attribute.
    """
    ds = DatasetLike.convert(ds)

    # The time coordinate must be of type datetime64[ns]
    time_dtype = ds.time.dtype
    if 'datetime64[ns]' != time_dtype:
        raise ValueError('Long term average operation expects a dataset with the'
                         ' time coordinate of type datetime64[ns], but received'
                         ' {}. Running the normalize operation on this'
                         ' dataset may help'.format(time_dtype))

    # The dataset must be marked as monthly
    try:
        resolution = ds.attrs['time_coverage_resolution']
    except KeyError:
        raise ValueError('Could not determine temporal resolution. Running'
                         ' the adjust_temporal_attrs operation beforehand may'
                         ' help.')
    if resolution != 'P1M':
        raise ValueError('Long term average operation expects a monthly dataset'
                         ' running temporal aggregation on this dataset'
                         ' beforehand may help.')

    var = VarNamesLike.convert(var)

    # Work on a shallow copy, reduced to the requested variables
    retset = ds.copy()
    if var:
        retset = select_var(retset, var)

    first_time = pd.Timestamp(ds.time.values[0])
    last_time = pd.Timestamp(ds.time.values[-1])

    total_work = 100
    with monitor.starting('LTA', total_work=total_work):
        monitor.progress(work=0)
        # One step of work per calendar month
        mean_kwargs = {'monitor': monitor, 'step': total_work / 12}
        monthly_groups = retset.groupby('time.month', squeeze=False)
        retset = monthly_groups.apply(_mean, **mean_kwargs)

    # Make the return dataset CF compliant
    retset = retset.rename({'month': 'time'})
    retset['time'] = pd.date_range('{}-01-01'.format(first_time.year),
                                   freq='MS',
                                   periods=12)

    # Every time slice gets the full time range of the input as climatology bounds
    bounds_data = np.tile([first_time, last_time], (12, 1))
    retset['climatology_bounds'] = xr.DataArray(data=bounds_data,
                                                dims=['time', 'nv'],
                                                name='climatology_bounds')
    retset.time.attrs = ds.time.attrs
    retset.time.attrs['climatology'] = 'climatology_bounds'

    # Record the averaging in the cell methods of every data variable
    for var_name in retset.data_vars:
        if 'cell_methods' in retset[var_name].attrs:
            retset[var_name].attrs['cell_methods'] = \
                retset[var_name].attrs['cell_methods'] + ' time: mean over years'
        else:
            retset[var_name].attrs['cell_methods'] = 'time: mean over years'

    return retset
def data_frame_aggregate(df: DataFrameLike.TYPE,
                         var_names: VarNamesLike.TYPE = None,
                         aggregate_geometry: bool = False,
                         monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Aggregate columns into count, mean, median, sum, std, min, and max. Return a
    new (Geo)DataFrame with a single row containing all aggregated values. Specify whether the geometries of
    the GeoDataFrame are to be aggregated. All geometries are merged union-like.

    A GeoDataFrame is only returned if *df* is a GeoDataFrame and *aggregate_geometry*
    is set; in all other cases a plain DataFrame is returned.

    :param df: The (Geo)DataFrame to be analysed
    :param var_names: Variables to be aggregated ('None' uses all aggregatable columns)
    :param aggregate_geometry: Aggregate (union like) the geometry and add it to the resulting GeoDataFrame
    :param monitor: Monitor for progress bar
    :return: A DataFrame or GeoDataFrame with one row and one column per variable and
             aggregation, named ``<variable>_<aggregation>``
    :raise ValidationError: If a variable is not in the data frame or is not aggregatable,
           or if a GeoDataFrame has no 'geometry' column.
    """
    vns = VarNamesLike.convert(var_names)

    df_is_geo = isinstance(df, gpd.GeoDataFrame)
    aggregations = ["count", "mean", "median", "sum", "std", "min", "max"]

    # Check var names integrity (aggregatable, exists in data frame)
    types_accepted_for_agg = ['float64', 'int64', 'bool']
    agg_columns = list(df.select_dtypes(include=types_accepted_for_agg).columns)

    if df_is_geo:
        agg_columns.append('geometry')

    columns = list(df.columns)

    if vns is None:
        vns = agg_columns

    diff = list(set(vns) - set(columns))
    if len(diff) > 0:
        raise ValidationError('Variable ' + ','.join(diff) + ' not in data frame!')

    diff = list(set(vns) - set(agg_columns))
    if len(diff) > 0:
        raise ValidationError('Variable(s) ' + ','.join(diff) + ' not aggregatable!')

    # Only a GeoDataFrame is required to have a geometry column; a plain
    # DataFrame must be accepted without one.
    if df_is_geo:
        try:
            df['geometry']
        except KeyError as e:
            raise ValidationError('Variable geometry not in GEO data frame!') from e

    # Aggregate columns; select_dtypes drops a selected geometry column again
    df_buff = df[vns].select_dtypes(include=types_accepted_for_agg).agg(aggregations)

    # Flatten the (aggregation x variable) table into a single row
    res = {}
    for n in df_buff.columns:
        for a in aggregations:
            res[n + '_' + a] = [df_buff[n][a]]

    df_agg = pd.DataFrame(res)

    # Aggregate (union) geometry if GeoDataFrame
    if df_is_geo and aggregate_geometry:
        total_work = 100
        num_work_rows = 1 + len(df) // total_work
        with monitor.starting('Aggregating geometry: ', total_work):
            multi_polygon = shapely.geometry.MultiPolygon()
            for i, rec in enumerate(df.geometry):
                if monitor.is_cancelled():
                    break
                # Best effort: geometries that cannot be merged are left out of the union
                # noinspection PyBroadException
                try:
                    multi_polygon = multi_polygon.union(other=rec)
                except Exception:
                    pass

                if i % num_work_rows == 0:
                    monitor.progress(work=1)

        df_agg = gpd.GeoDataFrame(df_agg, geometry=[multi_polygon], crs=df.crs)

    return df_agg
def _make_local(self,
                local_ds: LocalDataSource,
                time_range: TimeRangeLike.TYPE = None,
                region: PolygonLike.TYPE = None,
                var_names: VarNamesLike.TYPE = None,
                monitor: Monitor = Monitor.NONE):
    """
    Copy the files of this data source selected by *time_range* into the
    directory of *local_ds* and register them there.

    If *region* or *var_names* is given, every file is read through OPeNDAP,
    optionally cut to the bounding box of *region*, and written variable by
    variable into a new local NetCDF file. Otherwise files that are missing
    locally or differ in size are downloaded as a whole via HTTP.

    :param local_ds: The local data source receiving the files
    :param time_range: Optional time range used to select the files
    :param region: Optional region; its bounding box is used for spatial sub-setting
    :param var_names: Optional names of the variables to copy (coordinate variables are always added)
    :param monitor: Monitor for progress reporting and cancellation
    :raise InterruptedError: If the monitor is cancelled during an HTTP download
    """
    # local_name = local_ds.name
    local_id = local_ds.name

    time_range = TimeRangeLike.convert(time_range) if time_range else None
    region = PolygonLike.convert(region) if region else None
    var_names = VarNamesLike.convert(var_names) if var_names else None  # type: Sequence

    compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL', NETCDF_COMPRESSION_LEVEL)
    compression_enabled = compression_level > 0

    encoding_update = dict()
    if compression_enabled:
        encoding_update.update({'zlib': True, 'complevel': compression_level})

    # Sub-setting needs random access to the remote data, plain copies do not
    if region or var_names:
        protocol = _ODP_PROTOCOL_OPENDAP
    else:
        protocol = _ODP_PROTOCOL_HTTP

    local_path = os.path.join(local_ds.data_store.data_store_path, local_id)
    os.makedirs(local_path, exist_ok=True)

    selected_file_list = self._find_files(time_range)

    if protocol == _ODP_PROTOCOL_OPENDAP:

        files = self._get_urls_list(selected_file_list, protocol)
        monitor.start('Sync ' + self.name, total_work=len(files))
        for idx, dataset_uri in enumerate(files):
            child_monitor = monitor.child(work=1)

            file_name = os.path.basename(dataset_uri)
            local_filepath = os.path.join(local_path, file_name)

            time_coverage_start = selected_file_list[idx][1]
            time_coverage_end = selected_file_list[idx][2]

            remote_netcdf = None
            local_netcdf = None
            try:
                remote_netcdf = NetCDF4DataStore(dataset_uri)

                local_netcdf = NetCDF4DataStore(local_filepath, mode='w', persist=True)
                local_netcdf.set_attributes(remote_netcdf.get_attrs())

                remote_dataset = xr.Dataset.load_store(remote_netcdf)

                process_region = False
                if region:
                    attrs = remote_dataset.attrs
                    geo_lat_min = self._get_harmonized_coordinate_value(attrs, 'geospatial_lat_min')
                    geo_lat_max = self._get_harmonized_coordinate_value(attrs, 'geospatial_lat_max')
                    geo_lon_min = self._get_harmonized_coordinate_value(attrs, 'geospatial_lon_min')
                    geo_lon_max = self._get_harmonized_coordinate_value(attrs, 'geospatial_lon_max')
                    # Each axis reads its own resolution attribute
                    geo_lat_res = self._get_harmonized_coordinate_value(attrs, 'geospatial_lat_resolution')
                    geo_lon_res = self._get_harmonized_coordinate_value(attrs, 'geospatial_lon_resolution')

                    # The region can only be applied if the file declares its full extent and resolution
                    if not (isnan(geo_lat_min) or isnan(geo_lat_max)
                            or isnan(geo_lon_min) or isnan(geo_lon_max)
                            or isnan(geo_lat_res) or isnan(geo_lon_res)):
                        process_region = True

                        # Bounds are ordered (minx, miny, maxx, maxy), i.e. longitude first
                        # (assumes region is a shapely geometry as produced by PolygonLike.convert).
                        lon_min, lat_min, lon_max, lat_max = region.bounds

                        # NOTE(review): the index arithmetic treats geo_*_min as the coordinate of
                        # index 0, i.e. it presumes lat/lon are stored in ascending order and that
                        # the region lies inside the file's extent - confirm for the data in use.
                        lat_min = floor((lat_min - geo_lat_min) / geo_lat_res)
                        lat_max = ceil((lat_max - geo_lat_min) / geo_lat_res)
                        lon_min = floor((lon_min - geo_lon_min) / geo_lon_res)
                        lon_max = ceil((lon_max - geo_lon_min) / geo_lon_res)

                        # TODO (kbernat): check why dataset.sel fails!
                        remote_dataset = remote_dataset.isel(drop=False,
                                                             lat=slice(lat_min, lat_max),
                                                             lon=slice(lon_min, lon_max))

                        # Convert the index window back to coordinates; the maxima must be
                        # computed before the minima are shifted.
                        geo_lat_max = lat_max * geo_lat_res + geo_lat_min
                        geo_lat_min += lat_min * geo_lat_res
                        geo_lon_max = lon_max * geo_lon_res + geo_lon_min
                        geo_lon_min += lon_min * geo_lon_res

                # Work on a per-file copy so the requested var_names list is never extended in place
                if var_names:
                    file_var_names = list(var_names)
                else:
                    file_var_names = list(remote_netcdf.variables.keys())
                # Coordinate variables are always copied along
                file_var_names.extend([coord_name for coord_name in remote_dataset.coords.keys()
                                       if coord_name not in file_var_names])

                child_monitor.start(label=file_name, total_work=len(file_var_names))
                all_remote_names = list(remote_dataset.variables.keys())
                for sel_var_name in file_var_names:
                    # Store one variable at a time by dropping every other variable
                    var_dataset = remote_dataset.drop(
                        [var_name for var_name in all_remote_names if var_name != sel_var_name])
                    if compression_enabled:
                        var_dataset.variables.get(sel_var_name).encoding.update(encoding_update)
                    local_netcdf.store_dataset(var_dataset)
                    child_monitor.progress(work=1, msg=sel_var_name)

                if process_region:
                    local_netcdf.set_attribute('geospatial_lat_min', geo_lat_min)
                    local_netcdf.set_attribute('geospatial_lat_max', geo_lat_max)
                    local_netcdf.set_attribute('geospatial_lon_min', geo_lon_min)
                    local_netcdf.set_attribute('geospatial_lon_max', geo_lon_max)

            finally:
                # Explicit None checks: a store object that happens to be falsy must still be closed
                if remote_netcdf is not None:
                    remote_netcdf.close()
                if local_netcdf is not None:
                    local_netcdf.close()

            # Register the file only after it has been copied completely
            local_ds.add_dataset(os.path.join(local_id, file_name),
                                 (time_coverage_start, time_coverage_end))
            child_monitor.done()
    else:
        outdated_file_list = []
        for file_rec in selected_file_list:
            filename, _, _, file_size, url = file_rec
            dataset_file = os.path.join(local_path, filename)
            # todo (forman, 20160915): must perform better checks on dataset_file if it is...
            # ... outdated or incomplete or corrupted.
            # JSON also includes "checksum" and "checksum_type" fields.
            if not os.path.isfile(dataset_file) \
                    or (file_size and os.path.getsize(dataset_file) != file_size):
                outdated_file_list.append(file_rec)

        if outdated_file_list:
            with monitor.starting('Sync ' + self.name, len(outdated_file_list)):
                bytes_to_download = sum(file_rec[3] for file_rec in outdated_file_list)
                dl_stat = _DownloadStatistics(bytes_to_download)

                records = enumerate(outdated_file_list, start=1)
                for file_number, (filename, coverage_from, coverage_to, file_size, url) in records:
                    if monitor.is_cancelled():
                        raise InterruptedError
                    dataset_file = os.path.join(local_path, filename)
                    sub_monitor = monitor.child(work=1.0)

                    # noinspection PyUnusedLocal
                    def reporthook(block_number, read_size, total_file_size):
                        dl_stat.handle_chunk(read_size)
                        if monitor.is_cancelled():
                            raise InterruptedError
                        sub_monitor.progress(work=read_size, msg=str(dl_stat))

                    sub_monitor_msg = "file %d of %d" % (file_number, len(outdated_file_list))
                    with sub_monitor.starting(sub_monitor_msg, file_size):
                        urllib.request.urlretrieve(url[protocol], filename=dataset_file,
                                                   reporthook=reporthook)
                    local_ds.add_dataset(os.path.join(local_id, filename),
                                         (coverage_from, coverage_to))
    local_ds.save()
    monitor.done()
def test_make_local_and_update(self):
    """
    Run ``make_local`` on the soil moisture data source with ``_find_files``
    replaced by a mock that serves three local reference files, and check the
    resulting local data sources for plain copies, variable subsets, region
    subsets, an empty time range and title handling.
    """
    soilmoisture_data_source = self.data_store.query(
        query_expr='esacci.SOILMOISTURE.day.L3S.SSMV.multi-sensor.multi-platform.COMBINED.02-1.r1')[0]

    reference_path = os.path.join(os.path.dirname(__file__),
                                  os.path.normpath('resources/datasources/local/files/'))

    def find_files_mock(_, time_range):
        # Stand-in for EsaCciOdpDataSource._find_files: one record per reference file
        reference_files = {
            'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781114000000-fv02.2.nc': {
                'date_from': datetime.datetime(1978, 11, 14, 0, 0),
                'date_to': datetime.datetime(1978, 11, 14, 23, 59),
                'size': 21511378
            },
            'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781115000000-fv02.2.nc': {
                'date_from': datetime.datetime(1978, 11, 15, 0, 0),
                'date_to': datetime.datetime(1978, 11, 15, 23, 59),
                'size': 21511378
            },
            'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781116000000-fv02.2.nc': {
                'date_from': datetime.datetime(1978, 11, 16, 0, 0),
                'date_to': datetime.datetime(1978, 11, 16, 23, 59),
                'size': 21511378
            }
        }

        def as_file_record(name, info):
            location = os.path.join(reference_path, name)
            return [name, info.get('date_from'), info.get('date_to'), info.get('size'),
                    {'OPENDAP': location,
                     'HTTPServer': 'file:' + urllib.request.pathname2url(location)}]

        # Without a time range every file matches; otherwise only files lying fully inside it
        return [as_file_record(name, info)
                for name, info in reference_files.items()
                if not time_range
                or (info.get('date_from') >= time_range[0] and info.get('date_to') <= time_range[1])]

    def three_day_range():
        # Fresh time range covering all three reference files
        return TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                      datetime.datetime(1978, 11, 16, 23, 59)))

    with unittest.mock.patch('cate.ds.esa_cci_odp.EsaCciOdpDataSource._find_files', find_files_mock), \
            unittest.mock.patch.object(EsaCciOdpDataStore, 'query', return_value=[]):

        # Plain copy of the full time range
        plain_title = 'local_ds_test'
        plain_range = three_day_range()
        try:
            plain_ds = soilmoisture_data_source.make_local(plain_title, time_range=plain_range)
        except Exception:
            raise ValueError(reference_path, os.listdir(reference_path))
        self.assertIsNotNone(plain_ds)
        self.assertEqual(plain_ds.id, "local.%s" % plain_title)
        self.assertEqual(plain_ds.temporal_coverage(), plain_range)

        # Copy restricted to a single variable
        one_var_title = 'local_ds_test_var'
        one_var_range = three_day_range()
        one_var_names = VarNamesLike.convert(['sm'])
        one_var_ds = soilmoisture_data_source.make_local(one_var_title,
                                                         time_range=one_var_range,
                                                         var_names=one_var_names)
        self.assertIsNotNone(one_var_ds)
        self.assertEqual(one_var_ds.id, "local.%s" % one_var_title)
        ds = one_var_ds.open_dataset()
        one_var_names.extend(['lat', 'lon', 'time'])
        self.assertSetEqual(set(ds.variables), set(one_var_names))

        # Copy restricted to a region only
        region_title = 'from_local_to_local_region'
        region_range = three_day_range()
        bbox = PolygonLike.convert("10,20,30,40")
        region_ds = soilmoisture_data_source.make_local(region_title,
                                                        time_range=region_range,
                                                        region=bbox)  # type: LocalDataSource
        self.assertIsNotNone(region_ds)
        self.assertEqual(region_ds.id, "local.%s" % region_title)
        self.assertEqual(region_ds.spatial_coverage(), bbox)

        # Copies restricted to a region and to one, then two variables
        for region_title, requested in (
                ('from_local_to_local_region_one_var', ['sm']),
                ('from_local_to_local_region_two_var_sm_uncertainty', ['sm', 'sm_uncertainty'])):
            region_range = three_day_range()
            wanted_vars = VarNamesLike.convert(requested)
            bbox = PolygonLike.convert("10,20,30,40")
            region_ds = soilmoisture_data_source.make_local(region_title,
                                                            time_range=region_range,
                                                            var_names=wanted_vars,
                                                            region=bbox)  # type: LocalDataSource
            self.assertIsNotNone(region_ds)
            self.assertEqual(region_ds.id, "local.%s" % region_title)
            self.assertEqual(region_ds.spatial_coverage(), bbox)
            data_set = region_ds.open_dataset()
            wanted_vars.extend(['lat', 'lon', 'time'])
            self.assertSetEqual(set(data_set.variables), set(wanted_vars))

        # A time range matching no reference file must be rejected
        empty_ds_timerange = (datetime.datetime(2017, 12, 1, 0, 0),
                              datetime.datetime(2017, 12, 31, 23, 59))
        with self.assertRaises(DataAccessError) as cm:
            soilmoisture_data_source.make_local('empty_ds', time_range=empty_ds_timerange)
        self.assertEqual(f'Data source "{soilmoisture_data_source.id}" does not'
                         f' seem to have any datasets in given'
                         f' time range {TimeRangeLike.format(empty_ds_timerange)}',
                         str(cm.exception))

        # Title is taken over from the source unless given explicitly
        single_day_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                  datetime.datetime(1978, 11, 14, 23, 59)))

        copied = soilmoisture_data_source.make_local("title_test_copy", time_range=single_day_range)
        self.assertIsNotNone(copied)
        self.assertEqual(copied.meta_info['title'], soilmoisture_data_source.meta_info['title'])

        title = "Title Test!"
        retitled = soilmoisture_data_source.make_local("title_test_set", title,
                                                       time_range=single_day_range)
        self.assertIsNotNone(retitled)
        self.assertEqual(retitled.meta_info['title'], title)
def data_frame_aggregate(df: DataFrameLike.TYPE,
                         var_names: VarNamesLike.TYPE = None,
                         aggregate_geometry: bool = False,
                         monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Aggregate columns into count, mean, median, sum, std, min, and max.

    Return a new data frame with a single row whose columns are named
    ``<variable>_<aggregation>``. If *df* is a GeoDataFrame and
    *aggregate_geometry* is set, all geometries are merged union-like and the
    result is a GeoDataFrame carrying the merged geometry; in every other case
    a plain DataFrame is returned.

    :param df: The (Geo)DataFrame to be analysed
    :param var_names: Variables to be aggregated ('None' uses all aggregatable columns)
    :param aggregate_geometry: Aggregate (union like) the geometry and add it to the resulting GeoDataFrame
    :param monitor: Monitor for progress bar
    :return: A single-row GeoDataFrame if the geometry was aggregated, otherwise a single-row DataFrame
    :raise ValidationError: If a requested variable is missing from *df* or is not aggregatable,
        or if *df* is a GeoDataFrame without a 'geometry' column
    """
    vns = VarNamesLike.convert(var_names)

    df_is_geo = isinstance(df, gpd.GeoDataFrame)
    aggregations = ["count", "mean", "median", "sum", "std", "min", "max"]

    # Check var names integrity (aggregatable, exists in data frame)
    types_accepted_for_agg = ['float64', 'int64', 'bool']
    agg_columns = list(df.select_dtypes(include=types_accepted_for_agg).columns)

    if df_is_geo:
        # 'geometry' is accepted as a variable name; it is filtered out again by
        # select_dtypes() below and only used for the optional union step.
        agg_columns.append('geometry')

    if vns is None:
        vns = agg_columns

    diff = list(set(vns) - set(df.columns))
    if diff:
        raise ValidationError('Variable ' + ','.join(diff) + ' not in data frame!')

    diff = list(set(vns) - set(agg_columns))
    if diff:
        raise ValidationError('Variable(s) ' + ','.join(diff) + ' not aggregatable!')

    # Only geo data frames are required to carry a 'geometry' column; probing it
    # unconditionally would reject every plain DataFrame.
    if df_is_geo:
        try:
            df['geometry']
        except KeyError as e:
            raise ValidationError('Variable geometry not in GEO data frame!') from e

    # Aggregate columns (vns is never None at this point)
    df_buff = df[vns].select_dtypes(include=types_accepted_for_agg).agg(aggregations)

    # Flatten the (aggregation x variable) table into one row of '<variable>_<aggregation>' columns
    res = {f'{n}_{a}': [df_buff[n][a]] for n in df_buff.columns for a in aggregations}
    df_agg = pd.DataFrame(res)

    # Aggregate (union) geometry if GeoDataFrame
    if df_is_geo and aggregate_geometry:
        total_work = 100
        # Report progress roughly once per percent of the rows
        num_work_rows = 1 + len(df) // total_work
        with monitor.starting('Aggregating geometry: ', total_work):
            multi_polygon = shapely.geometry.MultiPolygon()
            for i, rec in enumerate(df.geometry):
                if monitor.is_cancelled():
                    break
                # noinspection PyBroadException
                try:
                    multi_polygon = multi_polygon.union(other=rec)
                except Exception:
                    # Deliberate best effort: a geometry that cannot be merged is skipped
                    pass

                if i % num_work_rows == 0:
                    monitor.progress(work=1)

        df_agg = gpd.GeoDataFrame(df_agg, geometry=[multi_polygon], crs=df.crs)

    return df_agg