def test_mask_dataset_by_geometry(self):
    """Masking works with the default, boolean, and named geometry-WKT modes."""
    # Default call: no geometry WKT attribute is saved.
    masked = mask_dataset_by_geometry(self.cube, self.triangle)
    self._assert_clipped_dataset_has_basic_props(masked)
    # save_geometry_wkt=True stores the WKT under the default name.
    masked = mask_dataset_by_geometry(self.cube,
                                      self.triangle,
                                      save_geometry_wkt=True)
    self._assert_saved_geometry_wkt_is_fine(masked, 'geometry_wkt')
    # Passing a string selects a custom attribute name for the WKT.
    masked = mask_dataset_by_geometry(self.cube,
                                      self.triangle,
                                      save_geometry_wkt='intersect_geom')
    self._assert_saved_geometry_wkt_is_fine(masked, 'intersect_geom')
def test_mask_dataset_for_chunked_input(self):
    """Dask-chunked inputs retain their chunking after masking."""
    chunked = chunk_dataset(self.cube,
                            chunk_sizes=dict(time=1, lat=90, lon=90))
    masked = mask_dataset_by_geometry(chunked, self.triangle)
    self._assert_clipped_dataset_has_basic_props(masked)
    # Both data variables must carry the chunking of the clipped grid.
    expected_chunks = ((1, 1, 1, 1, 1), (4,), (7,))
    for var_name in ('temp', 'precip'):
        self.assertEqual(expected_chunks, masked[var_name].chunks)
def select_region(xr_data: Union[xr.Dataset, xr.DataArray],
                  bbox: Union[RegionBox, RegionShape]) -> xr.Dataset:
    """Spatially subset an xarray dataset by a bounding box or shape.

    :param xr_data: Input dataset; a bare ``DataArray`` is wrapped into a
        single-variable ``Dataset`` first.
    :param bbox: Either a ``RegionBox`` (lon/lat extent, clipped) or a
        ``RegionShape`` (geometries, masked).
    :return: The spatially subset dataset.
    :raise ValueError: If *bbox* is of neither supported type.
    """
    if isinstance(xr_data, xr.DataArray):
        # Wrap the array so the clip/mask helpers can operate on a Dataset.
        xr_data = xr.Dataset({f"{xr_data.name}": xr_data})
    if isinstance(bbox, RegionBox):
        # shapely.geometry.box() expects (minx, miny, maxx, maxy); the
        # previous call passed the max corner first, which produced a
        # clockwise-wound polygon for the same rectangle. Use the
        # canonical (min, max) order.
        geometry = shapely.geometry.box(bbox.lonmin, bbox.latmin,
                                        bbox.lonmax, bbox.latmax)
        return clip_dataset_by_geometry(xr_data, geometry)
    if isinstance(bbox, RegionShape):
        return mask_dataset_by_geometry(xr_data, bbox.geometries)
    raise ValueError(f"Unrecognized bbox type: {type(bbox)}")
def get_time_series(cube: xr.Dataset,
                    geometry: GeometryLike = None,
                    var_names: Sequence[str] = None,
                    start_date: Date = None,
                    end_date: Date = None,
                    include_count: bool = False,
                    include_stdev: bool = False,
                    use_groupby: bool = False,
                    cube_asserted: bool = False) -> Optional[xr.Dataset]:
    """
    Get a time series dataset from a data *cube*.

    *geometry* may be provided as a (shapely) geometry object, a valid
    GeoJSON object, a valid WKT string, a sequence of box coordinates
    (x1, y1, x2, y2), or point coordinates (x, y). If *geometry* covers
    an area, i.e. is not a point, the function aggregates the variables
    to compute a mean value and if desired, the number of valid
    observations and the standard deviation.

    *start_date* and *end_date* may be provided as a numpy.datetime64 or
    an ISO datetime string.

    Returns a time-series dataset whose data variables have a time
    dimension but no longer have spatial dimensions, hence the resulting
    dataset's variables will only have N-2 dimensions. A global
    attribute ``max_number_of_observations`` will be set to the maximum
    number of observations that could have been made in each time step.
    If the given *geometry* does not overlap the cube's boundaries, or
    if no output variables remain, the function returns ``None``.

    :param cube: The xcube dataset
    :param geometry: Optional geometry
    :param var_names: Optional sequence of names of variables to be
        included.
    :param start_date: Optional start date.
    :param end_date: Optional end date.
    :param include_count: Whether to include the number of valid
        observations for each time step. Ignored if geometry is a point.
    :param include_stdev: Whether to include standard deviation for each
        time step. Ignored if geometry is a point.
    :param use_groupby: Use group-by operation. May increase or decrease
        runtime performance and/or memory consumption.
    :param cube_asserted: If False, *cube* will be verified, otherwise
        it is expected to be a valid cube.
    :return: A new dataset with time-series for each variable.
    """
    if not cube_asserted:
        assert_cube(cube)

    geometry = convert_geometry(geometry)

    dataset = select_variables_subset(cube, var_names)
    if len(dataset.data_vars) == 0:
        # Nothing left to aggregate.
        return None

    if start_date is not None or end_date is not None:
        # noinspection PyTypeChecker
        dataset = dataset.sel(time=slice(start_date, end_date))

    if isinstance(geometry, shapely.geometry.Point):
        # Point geometry: pick the nearest cell, no aggregation needed.
        bounds = get_dataset_geometry(dataset)
        if not bounds.contains(geometry):
            return None
        dataset = dataset.sel(lon=geometry.x, lat=geometry.y,
                              method='Nearest')
        return dataset.assign_attrs(max_number_of_observations=1)

    if geometry is not None:
        dataset = mask_dataset_by_geometry(dataset,
                                           geometry,
                                           save_geometry_mask='__mask__')
        if dataset is None:
            # Geometry does not overlap the dataset's extent.
            return None
        mask = dataset['__mask__']
        max_number_of_observations = np.count_nonzero(mask)
        # Use drop_vars(): Dataset.drop() is deprecated and removed in
        # modern xarray (the sibling get_time_series variant already
        # uses drop_vars()).
        dataset = dataset.drop_vars(['__mask__'])
    else:
        max_number_of_observations = dataset.lat.size * dataset.lon.size

    ds_count = None
    ds_stdev = None
    if use_groupby:
        time_group = dataset.groupby('time')
        ds_mean = time_group.mean(skipna=True, dim=xr.ALL_DIMS)
        if include_count:
            ds_count = time_group.count(dim=xr.ALL_DIMS)
        if include_stdev:
            ds_stdev = time_group.std(skipna=True, dim=xr.ALL_DIMS)
    else:
        ds_mean = dataset.mean(dim=('lat', 'lon'), skipna=True)
        if include_count:
            ds_count = dataset.count(dim=('lat', 'lon'))
        if include_stdev:
            ds_stdev = dataset.std(dim=('lat', 'lon'), skipna=True)

    # Suffix the optional outputs so they can be merged with the means.
    if ds_count is not None:
        ds_count = ds_count.rename(
            name_dict=dict({v: f"{v}_count" for v in ds_count.data_vars}))
    if ds_stdev is not None:
        ds_stdev = ds_stdev.rename(
            name_dict=dict({v: f"{v}_stdev" for v in ds_stdev.data_vars}))

    if ds_count is not None and ds_stdev is not None:
        ts_dataset = xr.merge([ds_mean, ds_stdev, ds_count])
    elif ds_count is not None:
        ts_dataset = xr.merge([ds_mean, ds_count])
    elif ds_stdev is not None:
        ts_dataset = xr.merge([ds_mean, ds_stdev])
    else:
        ts_dataset = ds_mean

    ts_dataset = ts_dataset.assign_attrs(
        max_number_of_observations=max_number_of_observations)

    return ts_dataset
def test_mask_dataset_by_geometry_store_mask(self):
    """With save_geometry_mask set, the mask variable is stored under that name."""
    masked = mask_dataset_by_geometry(self.cube,
                                      self.triangle,
                                      save_geometry_mask='geom_mask')
    self._assert_clipped_dataset_has_basic_props(masked)
    self._assert_dataset_mask_is_fine(masked, 'geom_mask')
def test_mask_dataset_by_geometry_excluded_vars(self):
    """Variables listed in excluded_vars are left out of the masking step."""
    masked = mask_dataset_by_geometry(self.cube,
                                      self.triangle,
                                      excluded_vars='precip')
    self._assert_clipped_dataset_has_basic_props(masked)
def get_time_series(cube: xr.Dataset,
                    geometry: GeometryLike = None,
                    var_names: Sequence[str] = None,
                    start_date: Date = None,
                    end_date: Date = None,
                    agg_methods: Union[str, Sequence[str],
                                       AbstractSet[str]] = AGG_MEAN,
                    include_count: bool = False,
                    include_stdev: bool = False,
                    use_groupby: bool = False,
                    cube_asserted: bool = False) -> Optional[xr.Dataset]:
    """
    Get a time series dataset from a data *cube*.

    *geometry* may be provided as a (shapely) geometry object, a valid
    GeoJSON object, a valid WKT string, a sequence of box coordinates
    (x1, y1, x2, y2), or point coordinates (x, y). If *geometry* covers
    an area, i.e. is not a point, the function aggregates the variables
    to compute a mean value and if desired, the number of valid
    observations and the standard deviation.

    *start_date* and *end_date* may be provided as a numpy.datetime64 or
    an ISO datetime string.

    Returns a time-series dataset whose data variables have a time
    dimension but no longer have spatial dimensions, hence the resulting
    dataset's variables will only have N-2 dimensions. A global
    attribute ``max_number_of_observations`` will be set to the maximum
    number of observations that could have been made in each time step.
    If the given *geometry* does not overlap the cube's boundaries, or
    if no output variables remain, the function returns ``None``.

    :param cube: The xcube dataset
    :param geometry: Optional geometry
    :param var_names: Optional sequence of names of variables to be
        included.
    :param start_date: Optional start date.
    :param end_date: Optional end date.
    :param agg_methods: Aggregation methods. May be single string or
        sequence of strings. Possible values are 'mean', 'median',
        'min', 'max', 'std', 'count'. Defaults to 'mean'. Ignored if
        geometry is a point.
    :param include_count: Deprecated. Whether to include the number of
        valid observations for each time step. Ignored if geometry is a
        point.
    :param include_stdev: Deprecated. Whether to include standard
        deviation for each time step. Ignored if geometry is a point.
    :param use_groupby: Use group-by operation. May increase or decrease
        runtime performance and/or memory consumption.
    :param cube_asserted: If False, *cube* will be verified, otherwise
        it is expected to be a valid cube.
    :return: A new dataset with time-series for each variable.
    """
    if not cube_asserted:
        assert_cube(cube)
    geometry = convert_geometry(geometry)
    # Normalize to a mutable collection of method names so the
    # deprecated flags below can be folded in via .add().
    agg_methods = normalize_agg_methods(agg_methods)
    if include_count:
        warnings.warn("keyword argument 'include_count' has been deprecated, "
                      f"use 'agg_methods=[{AGG_COUNT!r}, ...]' instead")
        agg_methods.add(AGG_COUNT)
    if include_stdev:
        warnings.warn("keyword argument 'include_stdev' has been deprecated, "
                      f"use 'agg_methods=[{AGG_STD!r}, ...]' instead")
        agg_methods.add(AGG_STD)
    dataset = select_variables_subset(cube, var_names)
    if len(dataset.data_vars) == 0:
        # No variables selected — nothing to aggregate.
        return None
    if start_date is not None or end_date is not None:
        # noinspection PyTypeChecker
        dataset = dataset.sel(time=slice(start_date, end_date))
    if isinstance(geometry, shapely.geometry.Point):
        # Point geometry: select the nearest cell; no aggregation.
        bounds = get_dataset_geometry(dataset)
        if not bounds.contains(geometry):
            return None
        dataset = dataset.sel(lon=geometry.x, lat=geometry.y,
                              method='Nearest')
        return dataset.assign_attrs(max_number_of_observations=1)
    if geometry is not None:
        dataset = mask_dataset_by_geometry(dataset,
                                           geometry,
                                           save_geometry_mask='__mask__')
        if dataset is None:
            # Geometry does not overlap the dataset's extent.
            return None
        mask = dataset['__mask__']
        # Cells inside the geometry bound the per-step observation count.
        max_number_of_observations = np.count_nonzero(mask)
        dataset = dataset.drop_vars(['__mask__'])
    else:
        max_number_of_observations = dataset.lat.size * dataset.lon.size
    # Load the data eagerly when several aggregations reuse it, or when
    # any requested method is flagged MUST_LOAD in AGG_METHODS.
    must_load = len(agg_methods) > 1 or any(
        AGG_METHODS[agg_method] == MUST_LOAD for agg_method in agg_methods)
    if must_load:
        dataset.load()
    agg_datasets = []
    if use_groupby:
        time_group = dataset.groupby('time')
        for agg_method in agg_methods:
            # Dispatch to the aggregation method by name, e.g.
            # time_group.mean / .std / .count.
            method = getattr(time_group, agg_method)
            if agg_method == 'count':
                # count() takes no skipna argument.
                agg_dataset = method(dim=xr.ALL_DIMS)
            else:
                agg_dataset = method(dim=xr.ALL_DIMS, skipna=True)
            agg_datasets.append(agg_dataset)
    else:
        for agg_method in agg_methods:
            method = getattr(dataset, agg_method)
            if agg_method == 'count':
                # count() takes no skipna argument.
                agg_dataset = method(dim=('lat', 'lon'))
            else:
                agg_dataset = method(dim=('lat', 'lon'), skipna=True)
            agg_datasets.append(agg_dataset)
    # Suffix each variable with its aggregation method; zipping
    # agg_methods again yields the same order as the loop above because
    # the same collection object is iterated within one call.
    agg_datasets = [
        agg_dataset.rename(name_dict=dict(
            {v: f"{v}_{agg_method}" for v in agg_dataset.data_vars}))
        for agg_method, agg_dataset in zip(agg_methods, agg_datasets)
    ]
    ts_dataset = xr.merge(agg_datasets)
    ts_dataset = ts_dataset.assign_attrs(
        max_number_of_observations=max_number_of_observations)
    return ts_dataset