Example #1
def test_select_variables_subset_all(self):
    ds1 = create_highroc_dataset()
    # noinspection PyTypeChecker
    ds2 = select_variables_subset(ds1, None)
    self.assertIs(ds2, ds1)
    ds2 = select_variables_subset(ds1, ds1.data_vars.keys())
    self.assertIs(ds2, ds1)
Example #2
    def transform_cube(self,
                       cube: xr.Dataset,
                       gm: GridMapping,
                       cube_config: CubeConfig) -> TransformedCube:

        desired_var_names = cube_config.variable_names
        if desired_var_names:
            cube = select_variables_subset(cube,
                                           var_names=desired_var_names)
            cube_config = cube_config.drop_props('variable_names')

        desired_bbox = cube_config.bbox
        if desired_bbox is not None:
            # Find out whether it's possible to make a spatial subset
            # without resampling. First, the grid mapping must be regular.
            can_do_spatial_subset = False
            if gm.is_regular:
                can_do_spatial_subset = True
                # Current spatial resolution must be the
                # desired spatial resolution, otherwise spatial resampling
                # is required later, which will include the desired
                # subsetting.
                desired_res = cube_config.spatial_res
                if desired_res is not None \
                        and not (math.isclose(gm.x_res, desired_res)
                                 and math.isclose(gm.y_res, desired_res)):
                    can_do_spatial_subset = False
                if can_do_spatial_subset:
                    # Finally, the desired CRS must be equal to the current
                    # one, or they must both be geographic.
                    desired_crs = cube_config.crs
                    if desired_crs:
                        desired_crs = pyproj.CRS.from_string(desired_crs)
                        if desired_crs != gm.crs \
                                and not (desired_crs.is_geographic
                                         and gm.crs.is_geographic):
                            can_do_spatial_subset = False
            if can_do_spatial_subset:
                cube = select_spatial_subset(cube,
                                             xy_bbox=desired_bbox)
                # Now that we have a new cube subset, we must adjust
                # its grid mapping.
                gm = GridMapping.from_dataset(
                    cube,
                    crs=gm.crs,
                    xy_var_names=gm.xy_var_names,
                )
                # Consume spatial properties
                cube_config = cube_config.drop_props(['bbox',
                                                      'spatial_res',
                                                      'crs'])

        desired_time_range = cube_config.time_range
        if desired_time_range:
            cube = select_temporal_subset(cube,
                                          time_range=desired_time_range)
            cube_config = cube_config.drop_props('time_range')

        return cube, gm, cube_config
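
The CRS check above can be isolated into a small sketch: two CRSes count as compatible for subsetting if they are equal or both geographic. The identifiers below are illustrative inputs, not taken from the snippet.

import pyproj

# EPSG:4326 and OGC:CRS84 differ only in axis order, so they are not
# strictly equal, but both are geographic -> compatible for subsetting.
crs_a = pyproj.CRS.from_string("EPSG:4326")
crs_b = pyproj.CRS.from_string("OGC:CRS84")
compatible = crs_a == crs_b or (crs_a.is_geographic and crs_b.is_geographic)
print(compatible)  # True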
Example #3
    def select_variables_subset(self, var_names: Sequence[str] = None):
        """
        Select data variable from given *dataset* and create new dataset.

        :param var_names: The names of data variables to select.
        :return: A new dataset. It is empty, if *var_names* is empty. It is *dataset*, if *var_names* is None.
        """
        return select_variables_subset(self._dataset, var_names)
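
For orientation, here is a hedged sketch of the contract this wrapper documents, assuming select_variables_subset is importable from xcube.core.select (the module path may differ between xcube versions):

import xarray as xr

from xcube.core.select import select_variables_subset  # assumed path

ds = xr.Dataset({"chl": ("x", [1.0, 2.0]), "tsm": ("x", [3.0, 4.0])})
assert select_variables_subset(ds, None) is ds         # None -> same dataset
assert not select_variables_subset(ds, []).data_vars   # [] -> empty dataset
assert list(select_variables_subset(ds, ["chl"]).data_vars) == ["chl"]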
Example #4
def transform(ds: xr.Dataset) -> xr.Dataset:
    if variables:
        ds = select_variables_subset(ds, var_names=variables)
    if indexers:
        ds = ds.sel(**indexers)
    chunk_sizes = {dim: 1 for dim in ds.dims}
    chunk_sizes[spatial_dims[0]] = tile_width
    chunk_sizes[spatial_dims[1]] = tile_height
    return ds.chunk(chunk_sizes)
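
The chunking pattern above (singleton chunks along every dimension except the two spatial ones) can be reproduced with plain xarray. The dimension names and tile sizes below are made up for illustration; ds.chunk() requires dask to be installed.

import numpy as np
import xarray as xr

ds = xr.Dataset({"a": (("time", "lat", "lon"), np.zeros((4, 180, 360)))})
tile_width, tile_height = 90, 90
chunk_sizes = {dim: 1 for dim in ds.dims}  # one time step per chunk
chunk_sizes["lon"] = tile_width            # spatial dims hold whole tiles
chunk_sizes["lat"] = tile_height
print(ds.chunk(chunk_sizes).chunks)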
Example #5
def step3(input_slice):
    # Keep the declared output variables plus any extra variables
    # required by the input processor.
    extra_vars = input_processor.get_extra_vars(input_slice)
    selected_variables = {var_name for var_name, _ in output_variables}
    selected_variables.update(extra_vars or set())
    return select_variables_subset(input_slice, selected_variables)
Example #6
def test_select_variables_subset_some(self):
    ds1 = create_highroc_dataset()
    self.assertEqual(36, len(ds1.data_vars))
    ds2 = select_variables_subset(ds1,
                                  ['conc_chl', 'c2rcc_flags', 'rtoa_10'])
    self.assertEqual(3, len(ds2.data_vars))
Example #7
def test_select_variables_subset_none(self):
    ds1 = create_highroc_dataset()
    ds2 = select_variables_subset(ds1, [])
    self.assertEqual(0, len(ds2.data_vars))
    ds2 = select_variables_subset(ds1, ['bibo'])
    self.assertEqual(0, len(ds2.data_vars))
Example #8
def get_time_series(cube: xr.Dataset,
                    geometry: GeometryLike = None,
                    var_names: Sequence[str] = None,
                    start_date: Date = None,
                    end_date: Date = None,
                    include_count: bool = False,
                    include_stdev: bool = False,
                    use_groupby: bool = False,
                    cube_asserted: bool = False) -> Optional[xr.Dataset]:
    """
    Get a time series dataset from a data *cube*.

    *geometry* may be provided as a (shapely) geometry object, a valid GeoJSON object, a valid WKT string,
    a sequence of box coordinates (x1, y1, x2, y2), or point coordinates (x, y). If *geometry* covers an area,
    i.e. is not a point, the function aggregates the variables to compute a mean value and if desired,
    the number of valid observations and the standard deviation.

    *start_date* and *end_date* may be provided as a numpy.datetime64 or an ISO datetime string.

    Returns a time-series dataset whose data variables have a time dimension but no longer have spatial dimensions,
    hence the resulting dataset's variables will only have N-2 dimensions.
    A global attribute ``max_number_of_observations`` will be set to the maximum number of observations
    that could have been made in each time step.
    If the given *geometry* does not overlap the cube's boundaries, or if no output variables remain,
    the function returns ``None``.

    :param cube: The xcube dataset
    :param geometry: Optional geometry
    :param var_names: Optional sequence of names of variables to be included.
    :param start_date: Optional start date.
    :param end_date: Optional end date.
    :param include_count: Whether to include the number of valid observations for each time step.
           Ignored if geometry is a point.
    :param include_stdev: Whether to include standard deviation for each time step.
           Ignored if geometry is a point.
    :param use_groupby: Use group-by operation. May increase or decrease runtime performance and/or memory consumption.
    :param cube_asserted: If False, *cube* will be verified, otherwise it is expected to be a valid cube.
    :return: A new dataset with time-series for each variable.
    """

    if not cube_asserted:
        assert_cube(cube)

    geometry = convert_geometry(geometry)

    dataset = select_variables_subset(cube, var_names)
    if len(dataset.data_vars) == 0:
        return None

    if start_date is not None or end_date is not None:
        # noinspection PyTypeChecker
        dataset = dataset.sel(time=slice(start_date, end_date))

    if isinstance(geometry, shapely.geometry.Point):
        bounds = get_dataset_geometry(dataset)
        if not bounds.contains(geometry):
            return None
        dataset = dataset.sel(lon=geometry.x, lat=geometry.y, method='Nearest')
        return dataset.assign_attrs(max_number_of_observations=1)

    if geometry is not None:
        dataset = mask_dataset_by_geometry(dataset,
                                           geometry,
                                           save_geometry_mask='__mask__')
        if dataset is None:
            return None
        mask = dataset['__mask__']
        max_number_of_observations = np.count_nonzero(mask)
        dataset = dataset.drop_vars('__mask__')
    else:
        max_number_of_observations = dataset.lat.size * dataset.lon.size

    ds_count = None
    ds_stdev = None
    if use_groupby:
        time_group = dataset.groupby('time')
        ds_mean = time_group.mean(skipna=True, dim=xr.ALL_DIMS)
        if include_count:
            ds_count = time_group.count(dim=xr.ALL_DIMS)
        if include_stdev:
            ds_stdev = time_group.std(skipna=True, dim=xr.ALL_DIMS)
    else:
        ds_mean = dataset.mean(dim=('lat', 'lon'), skipna=True)
        if include_count:
            ds_count = dataset.count(dim=('lat', 'lon'))
        if include_stdev:
            ds_stdev = dataset.std(dim=('lat', 'lon'), skipna=True)

    if ds_count is not None:
        ds_count = ds_count.rename(
            {v: f"{v}_count" for v in ds_count.data_vars})

    if ds_stdev is not None:
        ds_stdev = ds_stdev.rename(
            {v: f"{v}_stdev" for v in ds_stdev.data_vars})

    if ds_count is not None and ds_stdev is not None:
        ts_dataset = xr.merge([ds_mean, ds_stdev, ds_count])
    elif ds_count is not None:
        ts_dataset = xr.merge([ds_mean, ds_count])
    elif ds_stdev is not None:
        ts_dataset = xr.merge([ds_mean, ds_stdev])
    else:
        ts_dataset = ds_mean

    ts_dataset = ts_dataset.assign_attrs(
        max_number_of_observations=max_number_of_observations)

    return ts_dataset
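
A hedged usage sketch for the function above. new_cube is xcube's synthetic demo-cube factory (assumed importable from xcube.core.new); the variable name and bounding box are illustrative only.

from xcube.core.new import new_cube  # assumed path

cube = new_cube(variables={"precipitation": 0.5})
ts = get_time_series(cube,
                     geometry=(10.0, 50.0, 11.0, 51.0),  # x1, y1, x2, y2
                     var_names=["precipitation"],
                     include_stdev=True)
print(list(ts.data_vars))  # mean series plus 'precipitation_stdev'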
Example #9
def resample_in_time(dataset: xr.Dataset,
                     frequency: str,
                     method: Union[str, Sequence[str]],
                     offset=None,
                     base: int = 0,
                     tolerance=None,
                     interp_kind=None,
                     time_chunk_size=None,
                     var_names: Sequence[str] = None,
                     metadata: Dict[str, Any] = None,
                     cube_asserted: bool = False) -> xr.Dataset:
    """
    Resample a dataset in the time dimension.

    The argument *method* may be one or a sequence of
    ``'all'``, ``'any'``,
    ``'argmax'``, ``'argmin'``, ``'count'``,
    ``'first'``, ``'last'``,
    ``'max'``, ``'min'``, ``'mean'``, ``'median'``,
    ``'percentile_<p>'``,
    ``'std'``, ``'sum'``, ``'var'``.

    In ``'percentile_<p>'``, ``'<p>'`` is a placeholder that
    must be replaced by an integer percentage value,
    e.g. ``'percentile_90'`` is the 90%-percentile.

    *Important note:* As of xarray 0.14 and dask 2.8, the
    methods ``'median'`` and ``'percentile_<p>'`` cannot be
    used if the variables in *dataset* comprise chunked dask arrays.
    In this case, use the ``compute()`` or ``load()`` method
    to convert dask arrays into numpy arrays.

    :param dataset: The xcube dataset.
    :param frequency: Temporal aggregation frequency.
        Use format "<count><offset>" where <offset> is one of
        'H', 'D', 'W', 'M', 'Q', 'Y'.
    :param method: Resampling method or sequence of
        resampling methods.
    :param offset: Offset used to adjust the resampled time labels.
        Uses same syntax as *frequency*.
    :param base: For frequencies that evenly subdivide 1 day,
        the "origin" of the aggregated intervals. For example,
        for '24H' frequency, base could range from 0 through 23.
    :param time_chunk_size: If not None, the chunk size to be
        used for the "time" dimension.
    :param var_names: Variable names to include.
    :param tolerance: Time tolerance for selective
        upsampling methods. Defaults to *frequency*.
    :param interp_kind: Kind of interpolation
        if *method* is 'interpolation'.
    :param metadata: Output metadata.
    :param cube_asserted: If False, *dataset* will be verified,
        otherwise it is expected to be a valid cube.
    :return: A new xcube dataset resampled in time.
    """
    if not cube_asserted:
        assert_cube(dataset)

    if frequency == 'all':
        time_gap = np.array(dataset.time[-1]) - np.array(dataset.time[0])
        days = int((np.timedelta64(time_gap, 'D') / np.timedelta64(1, 'D')) +
                   1)
        frequency = f'{days}D'

    if var_names:
        dataset = select_variables_subset(dataset, var_names)

    resampler = dataset.resample(skipna=True,
                                 closed='left',
                                 label='left',
                                 time=frequency,
                                 loffset=offset,
                                 base=base)

    if isinstance(method, str):
        methods = [method]
    else:
        methods = list(method)

    percentile_prefix = 'percentile_'

    resampled_cubes = []
    for method in methods:
        method_args = []
        method_postfix = method
        if method.startswith(percentile_prefix):
            p = int(method[len(percentile_prefix):])
            q = p / 100.0
            method_args = [q]
            method_postfix = f'p{p}'
            method = 'quantile'
        resampling_method = getattr(resampler, method)
        method_kwargs = get_method_kwargs(method, frequency, interp_kind,
                                          tolerance)
        resampled_cube = resampling_method(*method_args, **method_kwargs)
        resampled_cube = resampled_cube.rename({
            var_name: f'{var_name}_{method_postfix}'
            for var_name in resampled_cube.data_vars
        })
        resampled_cubes.append(resampled_cube)

    if len(resampled_cubes) == 1:
        resampled_cube = resampled_cubes[0]
    else:
        resampled_cube = xr.merge(resampled_cubes)

    # TODO: add time_bnds to resampled_ds
    time_coverage_start = '%s' % dataset.time[0]
    time_coverage_end = '%s' % dataset.time[-1]

    resampled_cube.attrs.update(metadata or {})
    # TODO: add other time_coverage_ attributes
    resampled_cube.attrs.update(time_coverage_start=time_coverage_start,
                                time_coverage_end=time_coverage_end)

    schema = CubeSchema.new(dataset)
    chunk_sizes = {
        schema.dims[i]: schema.chunks[i]
        for i in range(schema.ndim)
    }

    if isinstance(time_chunk_size, int) and time_chunk_size >= 0:
        chunk_sizes['time'] = time_chunk_size

    return resampled_cube.chunk(chunk_sizes)
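
A hedged usage sketch, again assuming the new_cube demo factory; the import path of resample_in_time itself has moved between xcube versions, so treat this as illustrative.

from xcube.core.new import new_cube  # assumed path

# Chunk the demo cube, since the code above derives chunk sizes
# from the cube schema.
cube = new_cube(variables={"chl": 0.8}).chunk({"time": 1})
monthly = resample_in_time(cube, frequency="1M", method=["mean", "max"])
print(list(monthly.data_vars))  # ['chl_mean', 'chl_max'] per the suffixing above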
Example #10
def get_time_series(cube: xr.Dataset,
                    geometry: GeometryLike = None,
                    var_names: Sequence[str] = None,
                    start_date: Date = None,
                    end_date: Date = None,
                    agg_methods: Union[str, Sequence[str],
                                       AbstractSet[str]] = AGG_MEAN,
                    include_count: bool = False,
                    include_stdev: bool = False,
                    use_groupby: bool = False,
                    cube_asserted: bool = False) -> Optional[xr.Dataset]:
    """
    Get a time series dataset from a data *cube*.

    *geometry* may be provided as a (shapely) geometry object, a valid GeoJSON object, a valid WKT string,
    a sequence of box coordinates (x1, y1, x2, y2), or point coordinates (x, y). If *geometry* covers an area,
    i.e. is not a point, the function aggregates the variables to compute a mean value and if desired,
    the number of valid observations and the standard deviation.

    *start_date* and *end_date* may be provided as a numpy.datetime64 or an ISO datetime string.

    Returns a time-series dataset whose data variables have a time dimension but no longer have spatial dimensions,
    hence the resulting dataset's variables will only have N-2 dimensions.
    A global attribute ``max_number_of_observations`` will be set to the maximum number of observations
    that could have been made in each time step.
    If the given *geometry* does not overlap the cube's boundaries, or if no output variables remain,
    the function returns ``None``.

    :param cube: The xcube dataset
    :param geometry: Optional geometry
    :param var_names: Optional sequence of names of variables to be included.
    :param start_date: Optional start date.
    :param end_date: Optional end date.
    :param agg_methods: Aggregation methods. May be single string or sequence of strings. Possible values are
           'mean', 'median', 'min', 'max', 'std', 'count'. Defaults to 'mean'.
           Ignored if geometry is a point.
    :param include_count: Deprecated. Whether to include the number of valid observations for each time step.
           Ignored if geometry is a point.
    :param include_stdev: Deprecated. Whether to include standard deviation for each time step.
           Ignored if geometry is a point.
    :param use_groupby: Use group-by operation. May increase or decrease runtime performance and/or memory consumption.
    :param cube_asserted: If False, *cube* will be verified, otherwise it is expected to be a valid cube.
    :return: A new dataset with time-series for each variable.
    """

    if not cube_asserted:
        assert_cube(cube)

    geometry = convert_geometry(geometry)

    agg_methods = normalize_agg_methods(agg_methods)
    if include_count:
        warnings.warn("keyword argument 'include_count' has been deprecated, "
                      f"use 'agg_methods=[{AGG_COUNT!r}, ...]' instead")
        agg_methods.add(AGG_COUNT)
    if include_stdev:
        warnings.warn("keyword argument 'include_stdev' has been deprecated, "
                      f"use 'agg_methods=[{AGG_STD!r}, ...]' instead")
        agg_methods.add(AGG_STD)

    dataset = select_variables_subset(cube, var_names)
    if len(dataset.data_vars) == 0:
        return None

    if start_date is not None or end_date is not None:
        # noinspection PyTypeChecker
        dataset = dataset.sel(time=slice(start_date, end_date))

    if isinstance(geometry, shapely.geometry.Point):
        bounds = get_dataset_geometry(dataset)
        if not bounds.contains(geometry):
            return None
        dataset = dataset.sel(lon=geometry.x, lat=geometry.y, method='Nearest')
        return dataset.assign_attrs(max_number_of_observations=1)

    if geometry is not None:
        dataset = mask_dataset_by_geometry(dataset,
                                           geometry,
                                           save_geometry_mask='__mask__')
        if dataset is None:
            return None
        mask = dataset['__mask__']
        max_number_of_observations = np.count_nonzero(mask)
        dataset = dataset.drop_vars(['__mask__'])
    else:
        max_number_of_observations = dataset.lat.size * dataset.lon.size

    must_load = len(agg_methods) > 1 or any(
        AGG_METHODS[agg_method] == MUST_LOAD for agg_method in agg_methods)
    if must_load:
        dataset.load()

    agg_datasets = []
    if use_groupby:
        time_group = dataset.groupby('time')
        for agg_method in agg_methods:
            method = getattr(time_group, agg_method)
            if agg_method == 'count':
                agg_dataset = method(dim=xr.ALL_DIMS)
            else:
                agg_dataset = method(dim=xr.ALL_DIMS, skipna=True)
            agg_datasets.append(agg_dataset)
    else:
        for agg_method in agg_methods:
            method = getattr(dataset, agg_method)
            if agg_method == 'count':
                agg_dataset = method(dim=('lat', 'lon'))
            else:
                agg_dataset = method(dim=('lat', 'lon'), skipna=True)
            agg_datasets.append(agg_dataset)

    agg_datasets = [
        agg_dataset.rename(
            {v: f"{v}_{agg_method}" for v in agg_dataset.data_vars})
        for agg_method, agg_dataset in zip(agg_methods, agg_datasets)
    ]

    ts_dataset = xr.merge(agg_datasets)
    ts_dataset = ts_dataset.assign_attrs(
        max_number_of_observations=max_number_of_observations)

    return ts_dataset
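
Finally, a hedged sketch of the newer agg_methods API shown above, with the same assumed new_cube factory:

from xcube.core.new import new_cube  # assumed path

cube = new_cube(variables={"chl": 0.8})
ts = get_time_series(cube,
                     geometry=(10.0, 50.0, 11.0, 51.0),
                     agg_methods=["mean", "std", "count"])
print(list(ts.data_vars))  # e.g. ['chl_mean', 'chl_std', 'chl_count']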