Example #1
def select_var(ds: DatasetLike.TYPE, var: VarNamesLike.TYPE = None) -> xr.Dataset:
    """
    Filter the dataset by leaving only the desired variables in it. The original dataset
    information, including original coordinates, is preserved.

    :param ds: The dataset or dataframe from which to perform selection.
    :param var: One or more variable names to select and preserve in the dataset. \
    All of these are valid: 'var_name', 'var_name1,var_name2,var_name3', ['var_name1', 'var_name2']. \
    One can also use wildcards when doing the selection. E.g., choosing 'var_name*' for selection \
    will select all variables that start with 'var_name'. This can be used to select variables \
    along with their auxiliary variables, to select all uncertainty variables, and so on.
    :return: A filtered dataset
    """
    if not var:
        return ds

    ds = DatasetLike.convert(ds)

    var_names = VarNamesLike.convert(var)
    dropped_var_names = list(ds.data_vars.keys())

    for pattern in var_names:
        keep = fnmatch.filter(dropped_var_names, pattern)
        for name in keep:
            dropped_var_names.remove(name)

    # drop() is deprecated for variables in recent xarray; drop_vars() is the replacement.
    return ds.drop_vars(dropped_var_names)
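
A minimal usage sketch for select_var. It assumes fnmatch, numpy and xarray are imported in the defining module, and that the cate-style converters (DatasetLike.convert, VarNamesLike.convert) pass a plain xarray.Dataset and a single pattern string through as the docstring implies:

import numpy as np
import xarray as xr

# Toy dataset with a variable, its uncertainty companion, and an unrelated variable.
ds = xr.Dataset({
    'sst': (('time',), np.random.rand(3)),
    'sst_uncertainty': (('time',), np.random.rand(3)),
    'cloud_cover': (('time',), np.random.rand(3)),
})

# The wildcard keeps 'sst' and 'sst_uncertainty' and drops 'cloud_cover'.
subset = select_var(ds, var='sst*')
print(list(subset.data_vars))  # ['sst', 'sst_uncertainty']
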
Example #2
File: select.py Project: whigg/cate
def select_var(ds: DatasetLike.TYPE, var: VarNamesLike.TYPE = None) -> xr.Dataset:
    """
    Filter the dataset by leaving only the desired variables in it. The original dataset
    information, including original coordinates, is preserved.

    :param ds: The dataset or dataframe from which to perform selection.
    :param var: One or more variable names to select and preserve in the dataset. \
    All of these are valid: 'var_name', 'var_name1,var_name2,var_name3', ['var_name1', 'var_name2']. \
    One can also use wildcards when doing the selection. E.g., choosing 'var_name*' for selection \
    will select all variables that start with 'var_name'. This can be used to select variables \
    along with their auxiliary variables, to select all uncertainty variables, and so on.
    :return: A filtered dataset
    """
    if not var:
        return ds

    ds = DatasetLike.convert(ds)

    var_names = VarNamesLike.convert(var)
    dropped_var_names = list(ds.data_vars.keys())

    for pattern in var_names:
        keep = fnmatch.filter(dropped_var_names, pattern)
        for name in keep:
            dropped_var_names.remove(name)

    return ds.drop_vars(dropped_var_names)
Example #3
def compute(ds: DatasetLike.TYPE,
            expr: str,
            var: VarName.TYPE,
            copy: bool = False,
            _ctx: dict = None,
            monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """

    :param ds: The primary dataset. If omitted, all variables need to be prefixed by their dataset resource names.
    :param expr: Math expression in which all *ds* variables may be used by name.
    :param var: The new variable's name.
    :param copy: Whether to copy all variables from *ds*.
    :param monitor: An optional progress monitor.
    :return: A new dataset with the new variable.
    """

    if _ctx is not None and 'value_cache' in _ctx:
        local_namespace = dict(_ctx['value_cache'])
    else:
        local_namespace = dict()

    if ds is not None:
        local_namespace.update(ds.data_vars)

    with monitor.observing("Computing variable"):
        data_array = safe_eval(expr, local_namespace=local_namespace)
        data_array.name = var

    if ds is not None and copy:
        new_ds = ds.copy()
        new_ds[var] = data_array
    else:
        new_ds = xr.Dataset(data_vars={var: data_array})
    return new_ds
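
A hedged call sketch for this expression-based compute. safe_eval and Monitor are cate internals assumed to exist here, so only the calling contract is shown:

import numpy as np
import xarray as xr

ds = xr.Dataset({'tas': (('time',), np.array([271.3, 274.9, 280.1]))})

# Derive a Celsius variable from the Kelvin temperatures; copy=True keeps
# the original variables of *ds* in the result.
result = compute(ds, expr='tas - 273.15', var='tas_celsius', copy=True)
print(list(result.data_vars))  # ['tas', 'tas_celsius']
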
Example #4
def long_term_average(ds: DatasetLike.TYPE,
                      var: VarNamesLike.TYPE = None,
                      monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Create a 'mean over years' dataset by averaging the values of the given input
    dataset over all years. The output is a climatological dataset with the same
    resolution as the input dataset. E.g. a daily input dataset will create a daily
    climatology consisting of 365 days, a monthly input dataset will create a monthly
    climatology, etc.

    Seasonal input datasets must have matching seasons over all years denoted by the
    same date each year. E.g., first date of each quarter. The output dataset will
    then be a seasonal climatology where each season is denoted with the same date
    as in the input dataset.

    For further information on climatological datasets, see
    http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#climatological-statistics

    :param ds: A dataset to average
    :param var: If given, only these variables will be preserved in the resulting dataset
    :param monitor: A progress monitor
    :return: A climatological long term average dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValidationError(
            'Long term average operation expects a dataset with the'
            ' time coordinate of type datetime64[ns], but received'
            ' {}. Running the normalize operation on this'
            ' dataset may help'.format(ds.time.dtype))

    try:
        t_resolution = ds.attrs['time_coverage_resolution']
    except KeyError:
        raise ValidationError(
            'Could not determine temporal resolution. Running'
            ' the adjust_temporal_attrs operation beforehand may'
            ' help.')

    var = VarNamesLike.convert(var)
    # Shallow copy
    retset = ds.copy()
    if var:
        retset = select_var(retset, var)

    if t_resolution == 'P1D':
        return _lta_daily(retset, monitor)
    elif t_resolution == 'P1M':
        return _lta_monthly(retset, monitor)
    else:
        return _lta_general(retset, monitor)
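
A usage sketch, assuming the _lta_* helpers exist as cate internals and that the input already carries the 'time_coverage_resolution' attribute that the operation validates (normally set by adjust_temporal_attrs):

import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range('2000-01-01', periods=24, freq='MS')
ds = xr.Dataset({'precip': (('time',), np.random.rand(24))},
                coords={'time': time})
ds.attrs['time_coverage_resolution'] = 'P1M'

# 'P1M' routes the call to the monthly climatology branch (_lta_monthly).
clim = long_term_average(ds, var='precip')
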
Example #5
def reduce(ds: DatasetLike.TYPE,
           var: VarNamesLike.TYPE = None,
           dim: DimNamesLike.TYPE = None,
           method: str = 'mean',
           monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Reduce the given variables of the given dataset along the given dimensions.
    If no variables are given, all variables of the dataset will be reduced. If
    no dimensions are given, all dimensions will be reduced. Note that only
    variables holding numeric values can be reduced.

    :param ds: Dataset to reduce
    :param var: Variables in the dataset to reduce
    :param dim: Dataset dimensions along which to reduce
    :param method: Reduction method, one of 'min', 'max', 'mean', 'median' or 'sum'
    :param monitor: A progress monitor
    :return: A dataset with the given variables reduced along the given dimensions
    """
    ufuncs = {
        'min': np.nanmin,
        'max': np.nanmax,
        'mean': np.nanmean,
        'median': np.nanmedian,
        'sum': np.nansum
    }

    ds = DatasetLike.convert(ds)

    if not var:
        var = list(ds.data_vars.keys())
    var_names = VarNamesLike.convert(var)

    if not dim:
        dim = list(ds.coords.keys())
    else:
        dim = DimNamesLike.convert(dim)

    retset = ds.copy()

    for var_name in var_names:
        intersection = [
            value for value in dim if value in retset[var_name].dims
        ]
        with monitor.starting("Reduce dataset", total_work=100):
            monitor.progress(5)
            with monitor.child(95).observing("Reduce"):
                retset[var_name] = retset[var_name].reduce(ufuncs[method],
                                                           dim=intersection,
                                                           keep_attrs=True)

    return retset
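
A sketch of reducing one variable along one dimension. The nan-aware numpy ufuncs mean missing values are skipped; Monitor.NONE is assumed to be a no-op monitor from the project:

import numpy as np
import xarray as xr

ds = xr.Dataset({'tas': (('time', 'lat'), np.random.rand(4, 3))},
                coords={'time': range(4), 'lat': [-30.0, 0.0, 30.0]})

# Collapse only the 'time' dimension of 'tas' with a NaN-safe median.
reduced = reduce(ds, var='tas', dim='time', method='median')
print(reduced['tas'].dims)  # ('lat',)
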
Example #6
def long_term_average(ds: DatasetLike.TYPE,
                      var: VarNamesLike.TYPE = None,
                      monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Create a 'mean over years' dataset by averaging the values of the given input
    dataset over all years. The output is a climatological dataset with the same
    resolution as the input dataset. E.g. a daily input dataset will create a daily
    climatology consisting of 365 days, a monthly input dataset will create a monthly
    climatology, etc.

    Seasonal input datasets must have matching seasons over all years denoted by the
    same date each year. E.g., first date of each quarter. The output dataset will
    then be a seasonal climatology where each season is denoted with the same date
    as in the input dataset.

    For further information on climatological datasets, see
    http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#climatological-statistics

    :param ds: A dataset to average
    :param var: If given, only these variables will be preserved in the resulting dataset
    :param monitor: A progress monitor
    :return: A climatological long term average dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValidationError('Long term average operation expects a dataset with the'
                              ' time coordinate of type datetime64[ns], but received'
                              ' {}. Running the normalize operation on this'
                              ' dataset may help'.format(ds.time.dtype))

    try:
        t_resolution = ds.attrs['time_coverage_resolution']
    except KeyError:
        raise ValidationError('Could not determine temporal resolution. Running'
                              ' the adjust_temporal_attrs operation beforehand may'
                              ' help.')

    var = VarNamesLike.convert(var)
    # Shallow copy
    retset = ds.copy()
    if var:
        retset = select_var(retset, var)

    if t_resolution == 'P1D':
        return _lta_daily(retset, monitor)
    elif t_resolution == 'P1M':
        return _lta_monthly(retset, monitor)
    else:
        return _lta_general(retset, monitor)
Example #7
def compute_dataset(ds: DatasetLike.TYPE,
                    script: str,
                    copy: bool = False,
                    _ctx: dict = None,
                    monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Compute a new dataset from the given Python *script*.
    The argument *script* must be valid Python code comprising at least one
    value assignment of the form {name} = {expr}. Multiple assignments can be done on multiple lines
    or on a single line separated by semicolons.

    {expr} may reference variables in the given context dataset *ds* or other resources and their variables
    from the current workflow.
    In the latter case, use the dot operator to select a variable from a dataset resource.

    Any new variable in *script* whose name does not begin with an underscore ('_') and
    that has an appropriate data type will be added to the new dataset.

    The following packages are available in the *script*:

    * ``geopandas``, ``gpd``: The ``geopandas`` top-level package (http://geopandas.org/)
    * ``math``: The standard Python ``math`` library (https://docs.python.org/3/library/math.html)
    * ``numpy``, ``np``: The ``numpy`` top-level package (https://docs.scipy.org/doc/numpy/reference/)
    * ``pandas``, ``pd``: The ``pandas`` top-level package (http://pandas.pydata.org/pandas-docs/stable/api.html)
    * ``scipy``, ``sp``: The ``scipy`` top-level package (https://docs.scipy.org/doc/scipy/reference/)
    * ``xarray``, ``xr``: The ``xarray`` top-level package (http://xarray.pydata.org/en/stable/api.html)

    :param ds: Optional context dataset. If provided, all variables of this dataset are
           directly accessible in the *script*.
           If omitted, all variables (series) of other dataset (data frame) resources need to be prefixed
           by their resource name.
    :param script: Valid Python code comprising at least one assignment of the form {name} = {expr}.
    :param copy: Whether to copy all variables from *ds*.
    :param monitor: An optional progress monitor.
    :return: A new dataset object.
    """
    data_vars = _exec_script(script, (xr.DataArray, np.ndarray, float, int),
                             _ctx, ds, monitor)

    if ds is not None and copy:
        new_ds = ds.copy()
        for name, data in data_vars.items():
            new_ds[name] = data
    else:
        new_ds = xr.Dataset(data_vars=data_vars)

    return new_ds
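
A call sketch; _exec_script is a project internal, so this only illustrates the script contract described above (one new variable per assignment, underscore-prefixed names discarded):

import numpy as np
import xarray as xr

ds = xr.Dataset({'u': (('y',), np.array([3.0, 4.0])),
                 'v': (('y',), np.array([4.0, 3.0]))})

# 'speed' becomes a variable of the result; a name such as '_tmp' would be
# dropped because it starts with an underscore.
new_ds = compute_dataset(ds, script='speed = np.hypot(u, v)', copy=True)
print(list(new_ds.data_vars))  # ['u', 'v', 'speed']
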
Example #8
def temporal_aggregation(ds: DatasetLike.TYPE,
                         method: str = 'mean',
                         monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform monthly aggregation of a daily dataset according to the given
    method.

    :param ds: Dataset to aggregate
    :param method: Aggregation method
    :param monitor: A progress monitor
    :return: Aggregated dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValueError(
            'Temporal aggregation operation expects a dataset with the'
            ' time coordinate of type datetime64[ns], but received'
            ' {}. Running the normalize operation on this'
            ' dataset may help'.format(ds.time.dtype))

    # Check if we have a daily dataset
    try:
        if ds.attrs['time_coverage_resolution'] != 'P1D':
            raise ValueError(
                'Temporal aggregation operation expects a daily dataset')
    except KeyError:
        raise ValueError('Could not determine temporal resolution. Running'
                         ' the adjust_temporal_attrs operation beforehand may'
                         ' help.')

    with monitor.observing("resample dataset"):
        # The how=/dim= keywords were removed from xarray's resample();
        # group by month starts ('MS') and apply the reduction method instead.
        retset = getattr(ds.resample(time='MS'), method)(keep_attrs=True)

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                    retset[var].attrs['cell_methods'] + \
                    ' time: {} within years'.format(method)
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: {} within years'.format(
                method)

    return adjust_temporal_attrs(retset)
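
A sketch of aggregating a daily dataset to monthly means. The 'P1D' attribute check means the input must advertise a daily resolution; adjust_temporal_attrs is a cate internal assumed to exist:

import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range('2000-01-01', periods=60, freq='D')
ds = xr.Dataset({'tas': (('time',), np.random.rand(60))},
                coords={'time': time})
ds.attrs['time_coverage_resolution'] = 'P1D'

# Monthly means, each month denoted by its first date ('MS').
monthly = temporal_aggregation(ds, method='mean')
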
Example #9
def compute_dataset(ds: DatasetLike.TYPE,
                    script: str,
                    copy: bool = False,
                    _ctx: dict = None,
                    monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Compute a new dataset from the given Python *script*.
    The argument *script* must be valid Python code comprising at least one
    value assignment of the form {name} = {expr}. Multiple assignments can be done on multiple lines
    or on a single line separated by semicolons.

    {expr} may reference variables in the given context dataset *ds* or other resources and their variables
    from the current workflow.
    In the latter case, use the dot operator to select a variable from a dataset resource.

    Any new variable in *script* whose name does not begin with an underscore ('_') and
    that has an appropriate data type will be added to the new dataset.

    The following packages are available in the *script*:

    * ``geopandas``, ``gpd``: The ``geopandas`` top-level package (http://geopandas.org/)
    * ``math``: The standard Python ``math`` library (https://docs.python.org/3/library/math.html)
    * ``numpy``, ``np``: The ``numpy`` top-level package (https://docs.scipy.org/doc/numpy/reference/)
    * ``pandas``, ``pd``: The ``pandas`` top-level package (http://pandas.pydata.org/pandas-docs/stable/api.html)
    * ``scipy``, ``sp``: The ``scipy`` top-level package (https://docs.scipy.org/doc/scipy/reference/)
    * ``xarray``, ``xr``: The ``xarray`` top-level package (http://xarray.pydata.org/en/stable/api.html)

    :param ds: Optional context dataset. If provided, all variables of this dataset are
           directly accessible in the *script*.
           If omitted, all variables (series) of other dataset (data frame) resources need to be prefixed
           by their resource name.
    :param script: Valid Python code comprising at least one assignment of the form {name} = {expr}.
    :param copy: Whether to copy all variables from *ds*.
    :param monitor: An optional progress monitor.
    :return: A new dataset object.
    """
    data_vars = _exec_script(script, (xr.DataArray, np.ndarray, float, int), _ctx, ds, monitor)

    if ds is not None and copy:
        new_ds = ds.copy()
        for name, data in data_vars.items():
            new_ds[name] = data
    else:
        new_ds = xr.Dataset(data_vars=data_vars)

    return new_ds
Example #10
def sel(ds: DatasetLike.TYPE,
        point: PointLike.TYPE = None,
        time: TimeLike.TYPE = None,
        indexers: DictLike.TYPE = None,
        method: str = 'nearest') -> xr.Dataset:
    """
    Return a new dataset with each array indexed by tick labels along the specified dimension(s).

    This is a wrapper for the ``xarray.Dataset.sel()`` method.

    For documentation refer to xarray documentation at
    http://xarray.pydata.org/en/stable/generated/xarray.Dataset.sel.html#xarray.Dataset.sel

    :param ds: The dataset from which to select.
    :param point: Optional geographic point given by longitude and latitude
    :param time: Optional time
    :param indexers: Keyword arguments with names matching dimensions and values given by scalars,
           slices or arrays of tick labels. For dimensions with multi-index, the indexer may also be
           a dict-like object with keys matching index level names.
    :param method: Method to use for inexact matches:
           * None: only exact matches
           * ``pad`` / ``ffill``: propagate last valid index value forward
           * ``backfill`` / ``bfill``: propagate next valid index value backward
           * ``nearest`` (default): use nearest valid index value
    :return: A new Dataset with the same contents as this dataset, except each variable and dimension
             is indexed by the appropriate indexers. In general, each variable's data will be a view of the
             variable's data in this dataset.
    """
    ds = DatasetLike.convert(ds)
    point = PointLike.convert(point)
    time = TimeLike.convert(time)
    indexers = DictLike.convert(indexers)
    indexers = dict(indexers or {})
    if point is not None:
        indexers.setdefault('lon', point.x)
        indexers.setdefault('lat', point.y)
    if time is not None:
        indexers.setdefault('time', time)
    # Filter out non-existent coordinates
    indexers = {
        name: value
        for name, value in indexers.items() if name in ds.coords
    }
    return ds.sel(method=method, **indexers)
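
A sketch showing how the convenience arguments merge into the indexers dict before the underlying xarray.Dataset.sel call; the cate *Like converters are assumed to accept plain dicts and ISO date strings:

import numpy as np
import pandas as pd
import xarray as xr

ds = xr.Dataset(
    {'tas': (('time', 'lat', 'lon'), np.random.rand(4, 3, 3))},
    coords={'time': pd.date_range('2000-01-01', periods=4),
            'lat': [-10.0, 0.0, 10.0], 'lon': [100.0, 110.0, 120.0]})

# method='nearest' tolerates inexact labels such as lon=108.7.
subset = sel(ds, indexers=dict(lon=108.7, lat=1.2), time='2000-01-02')
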
Example #11
def reduce(ds: DatasetLike.TYPE,
           var: VarNamesLike.TYPE = None,
           dim: DimNamesLike.TYPE = None,
           method: str = 'mean',
           monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Reduce the given variables of the given dataset along the given dimensions.
    If no variables are given, all variables of the dataset will be reduced. If
    no dimensions are given, all dimensions will be reduced. Note that only
    variables holding numeric values can be reduced.

    :param ds: Dataset to reduce
    :param var: Variables in the dataset to reduce
    :param dim: Dataset dimensions along which to reduce
    :param method: Reduction method, one of 'min', 'max', 'mean', 'median' or 'sum'
    :param monitor: A progress monitor
    :return: A dataset with the given variables reduced along the given dimensions
    """
    ufuncs = {'min': np.nanmin, 'max': np.nanmax, 'mean': np.nanmean,
              'median': np.nanmedian, 'sum': np.nansum}

    ds = DatasetLike.convert(ds)

    if not var:
        var = list(ds.data_vars.keys())
    var_names = VarNamesLike.convert(var)

    if not dim:
        dim = list(ds.coords.keys())
    else:
        dim = DimNamesLike.convert(dim)

    retset = ds.copy()

    for var_name in var_names:
        intersection = [value for value in dim if value in retset[var_name].dims]
        with monitor.starting("Reduce dataset", total_work=100):
            monitor.progress(5)
            with monitor.child(95).observing("Reduce"):
                retset[var_name] = retset[var_name].reduce(ufuncs[method],
                                                           dim=intersection,
                                                           keep_attrs=True)

    return retset
Example #12
def sel(ds: DatasetLike.TYPE,
        point: PointLike.TYPE = None,
        time: TimeLike.TYPE = None,
        indexers: DictLike.TYPE = None,
        method: str = 'nearest') -> xr.Dataset:
    """
    Return a new dataset with each array indexed by tick labels along the specified dimension(s).

    This is a wrapper for the ``xarray.Dataset.sel()`` method.

    For documentation refer to xarray documentation at
    http://xarray.pydata.org/en/stable/generated/xarray.Dataset.sel.html#xarray.Dataset.sel

    :param ds: The dataset from which to select.
    :param point: Optional geographic point given by longitude and latitude
    :param time: Optional time
    :param indexers: Keyword arguments with names matching dimensions and values given by scalars,
           slices or arrays of tick labels. For dimensions with multi-index, the indexer may also be
           a dict-like object with keys matching index level names.
    :param method: Method to use for inexact matches:
           * None: only exact matches
           * ``pad`` / ``ffill``: propagate last valid index value forward
           * ``backfill`` / ``bfill``: propagate next valid index value backward
           * ``nearest`` (default): use nearest valid index value
    :return: A new Dataset with the same contents as this dataset, except each variable and dimension
             is indexed by the appropriate indexers. In general, each variable's data will be a view of the
             variable's data in this dataset.
    """
    ds = DatasetLike.convert(ds)
    point = PointLike.convert(point)
    time = TimeLike.convert(time)
    indexers = DictLike.convert(indexers)
    indexers = dict(indexers or {})
    if point is not None:
        indexers.setdefault('lon', point.x)
        indexers.setdefault('lat', point.y)
    if time is not None:
        indexers.setdefault('time', time)
    # Filter out non-existent coordinates
    indexers = {name: value for name, value in indexers.items() if name in ds.coords}
    return ds.sel(method=method, **indexers)
Example #13
def temporal_aggregation(ds: DatasetLike.TYPE,
                         method: str = 'mean',
                         output_resolution: str = 'month',
                         custom_resolution: str = None,
                         monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform aggregation of dataset according to the given
    method and output resolution.

    Note that the operation does not perform weighting. Depending on the
    combination of input and output resolutions, as well as aggregation
    method, the resulting dataset might yield unexpected results.

    Resolution 'month' will result in a monthly dataset with each month
    denoted by its first date. Resolution 'season' will result in a dataset
    aggregated to DJF, MAM, JJA, SON seasons, each denoted by the first
    date of the season.

    The operation also works with custom resolution strings, see:
    http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
    If ``custom_resolution`` is provided, it will override ``output_resolution``.

    Some examples:
      'QS-JUN' produces an output dataset on a quarterly resolution where the
      year ends on the 1st of June and each quarter is denoted by its first date.
      '8MS' produces an output dataset on an eight-month resolution where each
      period is denoted by its first date. Note that such periods will not be
      consistent over years.
      '8D' produces a dataset on an eight-day resolution.

    :param ds: Dataset to aggregate
    :param method: Aggregation method
    :param output_resolution: Desired temporal resolution of the output dataset
    :param custom_resolution: Custom temporal resolution; overrides *output_resolution*
    :param monitor: A progress monitor
    :return: Aggregated dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValidationError(
            'Temporal aggregation operation expects a dataset with the'
            ' time coordinate of type datetime64[ns], but received'
            ' {}. Running the normalize operation on this'
            ' dataset may help'.format(ds.time.dtype))

    # Try to figure out the input frequency
    try:
        in_freq = ds.attrs['time_coverage_resolution']
    except KeyError:
        raise ValidationError(
            'Could not determine temporal resolution of input dataset.'
            ' Running the adjust_temporal_attrs operation beforehand may'
            ' help.')

    if custom_resolution:
        freq = custom_resolution
    else:
        frequencies = {'month': 'MS', 'season': 'QS-DEC'}
        freq = frequencies[output_resolution]

    _validate_freq(in_freq, freq)

    with monitor.observing("resample dataset"):
        try:
            retset = getattr(resampler, method)(ds.resample(time=freq,
                                                            keep_attrs=True))
        except AttributeError:
            raise ValidationError(
                f'Provided aggregation method {method} is not valid.')

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + \
                ' time: {} within years'.format(method)
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: {} within years'.format(
                method)

    return adjust_temporal_attrs(retset)
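
A sketch of the custom-resolution path; _validate_freq and the resampler module are project internals assumed here, and '8D' is a plain pandas offset alias:

import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range('2000-01-01', periods=365, freq='D')
ds = xr.Dataset({'tas': (('time',), np.random.rand(365))},
                coords={'time': time})
ds.attrs['time_coverage_resolution'] = 'P1D'

# custom_resolution overrides output_resolution: eight-day aggregation periods.
agg = temporal_aggregation(ds, method='mean', custom_resolution='8D')
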
Example #14
def long_term_average(ds: DatasetLike.TYPE,
                      var: VarNamesLike.TYPE = None,
                      monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform long term average of the given dataset by doing a mean of monthly
    values over the time range covered by the dataset. E.g. it averages all
    January values, all February values, etc., to create a dataset with twelve
    time slices each containing a mean of respective monthly values.

    For further information on climatological datasets, see
    http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#climatological-statistics

    :param ds: A monthly dataset to average
    :param var: If given, only these variables will be preserved in the resulting dataset
    :param monitor: A progress monitor
    :return: A climatological long term average dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValueError(
            'Long term average operation expects a dataset with the'
            ' time coordinate of type datetime64[ns], but received'
            ' {}. Running the normalize operation on this'
            ' dataset may help'.format(ds.time.dtype))

    # Check if we have a monthly dataset
    try:
        if ds.attrs['time_coverage_resolution'] != 'P1M':
            raise ValueError(
                'Long term average operation expects a monthly dataset.'
                ' Running temporal aggregation on this dataset'
                ' beforehand may help.')
    except KeyError:
        raise ValueError('Could not determine temporal resolution. Running'
                         ' the adjust_temporal_attrs operation beforehand may'
                         ' help.')

    var = VarNamesLike.convert(var)
    # Shallow copy
    retset = ds.copy()
    if var:
        retset = select_var(retset, var)

    time_min = pd.Timestamp(ds.time.values[0])
    time_max = pd.Timestamp(ds.time.values[-1])

    total_work = 100

    with monitor.starting('LTA', total_work=total_work):
        monitor.progress(work=0)
        step = total_work / 12
        kwargs = {'monitor': monitor, 'step': step}
        retset = retset.groupby('time.month',
                                squeeze=False).apply(_mean, **kwargs)

    # Make the return dataset CF compliant
    retset = retset.rename({'month': 'time'})
    retset['time'] = pd.date_range('{}-01-01'.format(time_min.year),
                                   freq='MS',
                                   periods=12)

    climatology_bounds = xr.DataArray(data=np.tile([time_min, time_max],
                                                   (12, 1)),
                                      dims=['time', 'nv'],
                                      name='climatology_bounds')
    retset['climatology_bounds'] = climatology_bounds
    retset.time.attrs = ds.time.attrs
    retset.time.attrs['climatology'] = 'climatology_bounds'

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                    retset[var].attrs['cell_methods'] + ' time: mean over years'
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: mean over years'

    return retset
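
A sketch for this monthly variant, assuming the _mean helper it applies per group is a project internal: two years of monthly data collapse to twelve time slices plus the CF climatology_bounds variable built above:

import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range('2001-01-01', periods=24, freq='MS')
ds = xr.Dataset({'precip': (('time',), np.random.rand(24))},
                coords={'time': time})
ds.attrs['time_coverage_resolution'] = 'P1M'

lta = long_term_average(ds)
print(lta.sizes)  # twelve 'time' steps plus the 'nv' bounds dimension
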
Example #15
def compute(ds: DatasetLike.TYPE,
            script: str,
            copy: bool = False,
            _ctx: dict = None,
            monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Compute a new dataset from the given Python *script*.
    The argument *script* must be valid Python code comprising at least one
    value assignment of the form <name> = <expr>. Multiple assignments can be done on multiple lines
    or on a single line separated by semicolons.

    <expr> may reference variables in the given context dataset *ds* or other resources and their variables
    from the current workflow.
    In the latter case, use the dot operator to select a variable from a dataset resource.

    Every new variable in *script* of type data array will be added to the new dataset.

    The following packages are available in the code:

    * ``np``: The ``numpy`` top-level package (https://docs.scipy.org/doc/numpy/reference/)
    * ``pd``: The ``pandas`` top-level package (http://pandas.pydata.org/pandas-docs/stable/api.html)
    * ``xr``: The ``xarray`` top-level package (http://xarray.pydata.org/en/stable/api.html)
    * ``xu``: The ``xarray.ufuncs`` package (http://xarray.pydata.org/en/stable/api.html#universal-functions)

    Note that, in contrast to the ``np`` package, the math functions defined in ``xu`` preserve variable attributes.

    :param ds: Optional context dataset. All variables of this dataset are directly accessible in the *script*.
               If omitted, all variables need to be prefixed by their dataset resource names.
    :param script: Valid Python code comprising at least one assignment of the form <name> = <expr>.
    :param copy: Whether to copy all variables from *ds*.
    :param monitor: An optional progress monitor.
    :return: A new dataset.
    """

    if _ctx is not None and 'value_cache' in _ctx:
        orig_namespace = dict(_ctx['value_cache'])
    else:
        orig_namespace = dict()

    if ds is not None:
        orig_namespace.update(ds.data_vars)

    orig_namespace['np'] = np
    orig_namespace['pd'] = pd
    orig_namespace['xr'] = xr
    orig_namespace['xu'] = xu

    local_namespace = dict(orig_namespace)

    with monitor.observing("Executing script"):
        safe_exec(script, local_namespace=local_namespace)

    data_vars = {}
    for name, array in local_namespace.items():
        if isinstance(array, xr.DataArray) or isinstance(array, xr.Variable):
            is_new_data_var = name not in orig_namespace
            if not is_new_data_var:
                is_new_data_var = array is not orig_namespace[name]
            if is_new_data_var:
                array.name = name
                data_vars[name] = array

    if ds is not None and copy:
        new_ds = ds.copy()
        for name, array in data_vars.items():
            new_ds[name] = array
    else:
        new_ds = xr.Dataset(data_vars=data_vars)

    return new_ds
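
A sketch of the script variant: every new xr.DataArray (or Variable) bound in the executed namespace is collected into the result. safe_exec is a project internal assumed here:

import numpy as np
import xarray as xr

ds = xr.Dataset({'tas': (('time',), np.array([271.3, 274.9, 280.1]))})

# Two assignments; both produce DataArrays and so both become variables
# of the returned dataset.
script = 'tas_c = tas - 273.15\ntas_f = tas_c * 9 / 5 + 32'
result = compute(ds, script=script, copy=True)
print(list(result.data_vars))  # ['tas', 'tas_c', 'tas_f']
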
Example #16
def temporal_aggregation(ds: DatasetLike.TYPE,
                         method: str = 'mean',
                         output_resolution: str = 'month',
                         custom_resolution: str = None,
                         monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform aggregation of dataset according to the given
    method and output resolution.

    Note that the operation does not perform weighting. Depending on the
    combination of input and output resolutions, as well as aggregation
    method, the resulting dataset might yield unexpected results.

    Resolution 'month' will result in a monthly dataset with each month
    denoted by its first date. Resolution 'season' will result in a dataset
    aggregated to DJF, MAM, JJA, SON seasons, each denoted by the first
    date of the season.

    The operation also works with custom resolution strings, see:
    http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
    If ``custom_resolution`` is provided, it will override ``output_resolution``.

    Some examples:
      'QS-JUN' produces an output dataset on a quarterly resolution where the
      year ends on the 1st of June and each quarter is denoted by its first date.
      '8MS' produces an output dataset on an eight-month resolution where each
      period is denoted by its first date. Note that such periods will not be
      consistent over years.
      '8D' produces a dataset on an eight-day resolution.

    :param ds: Dataset to aggregate
    :param method: Aggregation method
    :param output_resolution: Desired temporal resolution of the output dataset
    :param custom_resolution: Custom temporal resolution; overrides *output_resolution*
    :param monitor: A progress monitor
    :return: Aggregated dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValidationError('Temporal aggregation operation expects a dataset with the'
                              ' time coordinate of type datetime64[ns], but received'
                              ' {}. Running the normalize operation on this'
                              ' dataset may help'.format(ds.time.dtype))

    # Try to figure out the input frequency
    try:
        in_freq = ds.attrs['time_coverage_resolution']
    except KeyError:
        raise ValidationError('Could not determine temporal resolution of input dataset.'
                              ' Running the adjust_temporal_attrs operation beforehand may'
                              ' help.')

    if custom_resolution:
        freq = custom_resolution
    else:
        frequencies = {'month': 'MS', 'season': 'QS-DEC'}
        freq = frequencies[output_resolution]

    _validate_freq(in_freq, freq)

    with monitor.observing("resample dataset"):
        try:
            retset = getattr(resampler, method)(ds.resample(time=freq, keep_attrs=True))
        except AttributeError:
            raise ValidationError(f'Provided aggregation method {method} is not valid.')

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + \
                ' time: {} within years'.format(method)
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: {} within years'.format(method)

    return adjust_temporal_attrs(retset)