Code example #1
File: arithmetics.py  Project: TomBlock/cate
def compute(ds: DatasetLike.TYPE,
            expr: str,
            var: VarName.TYPE,
            copy: bool = False,
            _ctx: dict = None,
            monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """

    :param ds: The primary dataset. If omitted, all variables need to be prefixed by their dataset resource names.
    :param expr: Math expression in which all *ds* variables may be used by name.
    :param var: The new variable's name.
    :param copy: Whether to copy all variables from *ds*.
    :param monitor: An optional progress monitor.
    :return: A new dataset with the new variable.
    """

    if _ctx is not None and 'value_cache' in _ctx:
        local_namespace = dict(_ctx['value_cache'])
    else:
        local_namespace = dict()

    if ds is not None:
        local_namespace.update(ds.data_vars)

    with monitor.observing("Computing variable"):
        data_array = safe_eval(expr, local_namespace=local_namespace)
        data_array.name = var

    if ds is not None and copy:
        new_ds = ds.copy()
        new_ds[var] = data_array
    else:
        new_ds = xr.Dataset(data_vars={var: data_array})
    return new_ds
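
A minimal usage sketch of this expression-based variant, assuming the compute function above (and the cate helpers it relies on, such as safe_eval) is importable; the dataset and variable names below are invented for illustration:

import numpy as np
import xarray as xr

# Hypothetical input dataset with two variables on a small lat/lon grid.
ds = xr.Dataset(
    data_vars={
        "sst": (("lat", "lon"), 290.0 + np.random.rand(3, 4)),
        "ice": (("lat", "lon"), np.random.rand(3, 4)),
    },
    coords={"lat": [10.0, 20.0, 30.0], "lon": [0.0, 1.0, 2.0, 3.0]},
)

# Derive a new variable from an expression over the existing variables.
result = compute(ds, expr="sst - 273.15 * ice", var="sst_ice_mix", copy=True)
print(list(result.data_vars))  # ['sst', 'ice', 'sst_ice_mix']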
Code example #2
File: anomaly.py  Project: CCI-Tools/ect-core
def anomaly_internal(ds: xr.Dataset,
                     time_range: TimeRangeLike.TYPE = None,
                     region: PolygonLike.TYPE = None,
                     monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Calculate anomaly using as reference data the mean of an optional region
    and time slice from the given dataset. If no time slice/spatial region is
    given, the operation will calculate anomaly using the mean of the whole
    dataset as the reference.

    This is done for each data array in the dataset.
    :param ds: The dataset to calculate anomalies from
    :param time_range: Time range to use for reference data
    :param region: Spatial region to use for reference data
    :param monitor: a progress monitor.
    :return: The anomaly dataset
    """
    ref = ds.copy()
    if time_range:
        time_range = TimeRangeLike.convert(time_range)
        ref = subset_temporal(ref, time_range)
    if region:
        region = PolygonLike.convert(region)
        ref = subset_spatial(ref, region)
    with monitor.observing("Calculating anomaly"):
        ref = ref.mean(keep_attrs=True, skipna=True)
        diff = ds - ref
    return diff
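
A short usage sketch, assuming anomaly_internal and the cate helpers it calls (subset_temporal, subset_spatial, the *Like converters) are importable; the data below is synthetic:

import numpy as np
import pandas as pd
import xarray as xr

times = pd.date_range("2000-01-01", periods=24, freq="MS")
ds = xr.Dataset(
    {"t2m": (("time", "lat", "lon"), 280.0 + 10.0 * np.random.rand(24, 3, 4))},
    coords={"time": times, "lat": [0.0, 10.0, 20.0], "lon": [0.0, 1.0, 2.0, 3.0]},
)

# Anomaly of every data variable relative to the mean over the whole dataset.
anom = anomaly_internal(ds)

# Passing time_range / region (TimeRangeLike / PolygonLike values) instead
# restricts the reference mean to a sub-period or sub-region.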
Code example #3
File: correlation.py  Project: CCI-Tools/ect-core
def pearson_correlation_scalar(ds_x: DatasetLike.TYPE,
                               ds_y: DatasetLike.TYPE,
                               var_x: VarName.TYPE,
                               var_y: VarName.TYPE,
                               monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Do product moment `Pearson's correlation <http://www.statsoft.com/Textbook/Statistics-Glossary/P/button/p#Pearson%20Correlation>`_ analysis.

    Performs a simple correlation analysis on two data variables and returns
    a correlation coefficient and the corresponding p_value.

    Positive correlation implies that as x grows, so does y. Negative
    correlation implies that as x increases, y decreases.

    For more information on how to interpret the results, see
    `here <http://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/how-to/correlation/interpret-the-results/>`_,
    and `here <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html>`_.

    :param ds_x: The 'x' dataset
    :param ds_y: The 'y' dataset
    :param var_x: Dataset variable to use for correlation analysis in the 'variable' dataset
    :param var_y: Dataset variable to use for correlation analysis in the 'dependent' dataset
    :param monitor: a progress monitor.
    :return: Data frame {'corr_coef': correlation coefficient, 'p_value': probability value}
    """
    ds_x = DatasetLike.convert(ds_x)
    ds_y = DatasetLike.convert(ds_y)
    var_x = VarName.convert(var_x)
    var_y = VarName.convert(var_y)

    array_y = ds_y[var_y]
    array_x = ds_x[var_x]

    if (array_x.dims != array_y.dims):
        raise ValidationError('Both datasets should feature the same'
                              ' dimensionality. Currently provided ds_x[var_x] '
                              f'has {array_x.dims}, provided ds_y[var_y]'
                              f' has {array_y.dims}')

    for dim in array_x.dims:
        if len(array_x[dim]) != len(array_y[dim]):
            raise ValidationError('All dimensions of both provided data variables'
                                  f' must be the same length. Currently {dim} of ds_x[var_x]'
                                  f' has {len(array_x[dim])} values, while'
                                  f' {dim} of ds_y[var_y] has {len(array_y[dim])} values.'
                                  ' You may want to try to coregister the datasets beforehand.')

    n_vals = 1
    for dim in array_x.dims:
        n_vals = n_vals * len(array_x[dim])

    if n_vals < 3:
        raise ValidationError('There should be no less than 3 values in both data variables'
                              f' to perform the correlation. Currently there are {n_vals} values')

    with monitor.observing("Calculate Pearson correlation"):
        cc, pv = pearsonr(array_x.stack(z=array_x.dims), array_y.stack(z=array_y.dims))

    return pd.DataFrame({'corr_coef': [cc], 'p_value': [pv]})
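
A usage sketch for the scalar correlation, assuming the function above and scipy.stats.pearsonr are importable and that DatasetLike.convert / VarName.convert pass plain datasets and strings through; the data is made up:

import numpy as np
import xarray as xr

n = 48
t = np.arange(n)
ds_a = xr.Dataset({"sst": ("time", np.random.rand(n))}, coords={"time": t})
ds_b = xr.Dataset({"ice": ("time", 1.0 - ds_a["sst"].values + 0.05 * np.random.rand(n))},
                  coords={"time": t})

result = pearson_correlation_scalar(ds_a, ds_b, "sst", "ice")
print(result)  # one-row DataFrame with 'corr_coef' and 'p_value' columns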
Code example #4
File: arithmetics.py  Project: TomBlock/cate
def diff(ds: xr.Dataset,
         ds2: xr.Dataset,
         monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Calculate the difference of two datasets (ds - ds2). This is done by
    matching variable names in the two datasets against each other and taking
    the difference of matching variables.

    If lat/lon/time extents differ between the datasets, the default behavior
    is to take the intersection of the datasets and run subtraction on that.
    However, broadcasting is possible. E.g. ds(lat/lon/time) - ds(lat/lon) is
    valid. In this case the subtrahend will be stretched to the size of
    ds(lat/lon/time) so that it can be subtracted. This also works if the
    subtrahend is a single time slice of arbitrary temporal position. In this
    case, the time dimension will be squeezed out leaving a lat/lon dataset.

    :param ds: The minuend dataset
    :param ds2: The subtrahend dataset
    :param monitor: a progress monitor.
    :return: The difference dataset
    """
    try:
        # Times do not intersect
        if 0 == len(ds.time - ds2.time) and \
                len(ds.time) == len(ds2.time):  # Times are the same length
            # If the datasets don't intersect in time dimension, a naive difference
            # would return empty data variables. Hence, the time coordinate has to
            # be dropped beforehand
            ds = ds.drop('time')
            ds2 = ds2.drop('time')
            return ds - ds2
    except AttributeError:
        # It is likely that the one operand is a lat/lon array that can be
        # broadcast against the other operand
        pass

    try:
        if 1 == len(ds2.time):
            # The subtrahend is a single time-slice -> squeeze the 'time' dimension
            # so that it can be broadcast along the minuend
            ds2 = ds2.squeeze('time', drop=True)
    except AttributeError:
        # Doesn't have a time dimension already
        pass
    except TypeError as e:
        if 'unsized object' in str(e):
            # The 'time' variable is a scalar
            pass
        else:
            raise TypeError(str(e))

    with monitor.observing("Subtract datasets"):
        diff = ds - ds2

    return diff
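
A usage sketch of the broadcasting case, assuming the diff function above is in scope; the dataset is synthetic:

import numpy as np
import pandas as pd
import xarray as xr

times = pd.date_range("2010-01-01", periods=12, freq="MS")
ds_full = xr.Dataset(
    {"precip": (("time", "lat", "lon"), np.random.rand(12, 2, 2))},
    coords={"time": times, "lat": [0.0, 1.0], "lon": [0.0, 1.0]},
)

# Subtrahend without a time dimension, e.g. a lat/lon climatology:
ds_clim = ds_full.mean("time", keep_attrs=True)

# ds_clim is broadcast over 'time', yielding monthly departures from the mean.
departures = diff(ds_full, ds_clim)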
Code example #5
def pearson_correlation_scalar(
        ds_x: DatasetLike.TYPE,
        ds_y: DatasetLike.TYPE,
        var_x: VarName.TYPE,
        var_y: VarName.TYPE,
        monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Do product moment `Pearson's correlation <http://www.statsoft.com/Textbook/Statistics-Glossary/P/button/p#Pearson%20Correlation>`_ analysis.

    Performs a simple correlation analysis on two timeseries and returns
    a correlation coefficient and the corresponding p_value.

    Positive correlation implies that as x grows, so does y. Negative
    correlation implies that as x increases, y decreases.

    For more information on how to interpret the results, see
    `here <http://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/how-to/correlation/interpret-the-results/>`_,
    and `here <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html>`_.

    :param ds_x: The 'x' dataset
    :param ds_y: The 'y' dataset
    :param var_x: Dataset variable to use for correlation analysis in the 'variable' dataset
    :param var_y: Dataset variable to use for correlation analysis in the 'dependent' dataset
    :param monitor: a progress monitor.
    :return: {'corr_coef': correlation coefficient, 'p_value': probability value}
    """
    ds_x = DatasetLike.convert(ds_x)
    ds_y = DatasetLike.convert(ds_y)
    var_x = VarName.convert(var_x)
    var_y = VarName.convert(var_y)

    array_y = ds_y[var_y]
    array_x = ds_x[var_x]

    if ((len(array_x.dims) != len(array_y.dims)) and (len(array_x.dims) != 1)):
        raise ValidationError('To calculate simple correlation, both provided'
                              ' datasets should be simple 1d timeseries. To'
                              ' create a map of correlation coefficients, use'
                              ' pearson_correlation operation instead.')

    if len(array_x['time']) != len(array_y['time']):
        raise ValidationError(
            'The length of the time dimension differs between'
            ' the given datasets. Can not perform the calculation'
            ', please review operation documentation.')

    if len(array_x['time']) < 3:
        raise ValidationError(
            'The length of the time dimension should not be less'
            ' than three to run the calculation.')

    with monitor.observing("Calculate Pearson correlation"):
        cc, pv = pearsonr(array_x.values, array_y.values)

    return pd.DataFrame({'corr_coef': [cc], 'p_value': [pv]})
Code example #6
File: arithmetics.py  Project: CCI-Tools/ect-core
def diff(ds: xr.Dataset,
         ds2: xr.Dataset,
         monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Calculate the difference of two datasets (ds - ds2). This is done by
    matching variable names in the two datasets against each other and taking
    the difference of matching variables.

    If lat/lon/time extents differ between the datasets, the default behavior
    is to take the intersection of the datasets and run subtraction on that.
    However, broadcasting is possible. E.g. ds(lat/lon/time) - ds(lat/lon) is
    valid. In this case the subtrahend will be stretched to the size of
    ds(lat/lon/time) so that it can be subtracted. This also works if the
    subtrahend is a single time slice of arbitrary temporal position. In this
    case, the time dimension will be squeezed out leaving a lat/lon dataset.

    :param ds: The minuend dataset
    :param ds2: The subtrahend dataset
    :param monitor: a progress monitor.
    :return: The difference dataset
    """
    try:
        # Times do not intersect
        if 0 == len(ds.time - ds2.time) and \
                len(ds.time) == len(ds2.time):  # Times are the same length
            # If the datasets don't intersect in time dimension, a naive difference
            # would return empty data variables. Hence, the time coordinate has to
            # be dropped beforehand
            ds = ds.drop('time')
            ds2 = ds2.drop('time')
            return ds - ds2
    except AttributeError:
        # It is likely that the one operand is a lat/lon array that can be
        # broadcast against the other operand
        pass

    try:
        if 1 == len(ds2.time):
            # The subtrahend is a single time-slice -> squeeze the 'time' dimension
            # so that it can be broadcast along the minuend
            ds2 = ds2.squeeze('time', drop=True)
    except AttributeError:
        # Doesn't have a time dimension already
        pass
    except TypeError as e:
        if 'unsized object' in str(e):
            # The 'time' variable is a scalar
            pass
        else:
            raise TypeError(str(e))

    with monitor.observing("Subtract datasets"):
        diff = ds - ds2

    return diff
Code example #7
File: io.py  Project: whigg/cate
def save_dataset(ds: xr.Dataset, file: str, format: str = None, monitor: Monitor = Monitor.NONE):
    """
    Save a dataset to a NetCDF file.

    :param ds: The dataset
    :param file: File path
    :param format: NetCDF format flavour, one of 'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT', 'NETCDF3_CLASSIC'.
    :param monitor: a progress monitor.
    """
    with monitor.observing("save_dataset"):
        ds.to_netcdf(file, format=format)
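
A minimal usage sketch, assuming save_dataset above is in scope; the output path is illustrative:

import numpy as np
import xarray as xr

ds = xr.Dataset({"t2m": (("lat", "lon"), np.random.rand(2, 3))},
                coords={"lat": [0.0, 1.0], "lon": [0.0, 1.0, 2.0]})

# With format=None, xarray picks its default NetCDF flavour; pass one of the
# documented flavours to force a specific file format.
save_dataset(ds, "example_output.nc", format="NETCDF4")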
Code example #8
File: arithmetics.py  Project: whigg/cate
def _exec_script(script: str,
                 element_types: Tuple[type, ...],
                 operation_context: Mapping[str, Any] = None,
                 context_object: Mapping[str, Any] = None,
                 monitor: Monitor = Monitor.NONE) -> Dict[str, Any]:
    """
    Helper for compute_dataset() and compute_data_frame().
    """
    if not script:
        raise ValidationError(f'Python script must not be empty')

    # Include common libraries
    orig_namespace = dict(
        gpd=gpd,
        geopandas=geopandas,
        math=math,
        np=np,
        numpy=numpy,
        pd=pd,
        pandas=pandas,
        sp=sp,
        scipy=scipy,
        xr=xr,
        xarray=xarray,
    )

    if operation_context is not None and 'value_cache' in operation_context:
        orig_namespace.update(operation_context['value_cache'])

    if context_object is not None:
        orig_namespace.update(context_object)

    local_namespace = dict(orig_namespace)

    with monitor.observing("Executing script"):
        try:
            safe_exec(script, local_namespace=local_namespace)
        except BaseException as e:
            raise ValidationError(f'Error in Python script: {e}') from e

    elements = dict()
    for name, element in local_namespace.items():
        if not name.startswith('_'):
            if isinstance(element, element_types):
                if name not in orig_namespace or element is not orig_namespace[
                        name]:
                    elements[name] = element

    return elements
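
The core idea of this helper is to run the user script against a copy of a prepared namespace and then keep only the public names whose values are of the wanted types and are new or rebound. A stripped-down, standalone sketch of that pattern (using plain exec rather than cate's safe_exec; the names here are invented):

from typing import Any, Dict, Mapping, Tuple


def run_user_script(script: str,
                    keep_types: Tuple[type, ...],
                    extra_namespace: Mapping[str, Any] = None) -> Dict[str, Any]:
    # Namespace the script starts from (pre-loaded modules, cached values, ...).
    orig_namespace: Dict[str, Any] = dict(extra_namespace or {})
    local_namespace = dict(orig_namespace)

    # NOTE: plain exec with no sandboxing; the cate helper uses safe_exec instead.
    exec(script, {}, local_namespace)

    # Keep public names of the wanted types that were newly created or rebound.
    return {name: value for name, value in local_namespace.items()
            if not name.startswith('_')
            and isinstance(value, keep_types)
            and (name not in orig_namespace or value is not orig_namespace[name])}


print(run_user_script("a = [1, 2, 3]\nb = 'ignored'", (list,)))  # {'a': [1, 2, 3]}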
Code example #9
File: arithmetics.py  Project: CCI-Tools/ect-core
def _exec_script(script: str,
                 element_types: Tuple[type, ...],
                 operation_context: Mapping[str, Any] = None,
                 context_object: Mapping[str, Any] = None,
                 monitor: Monitor = Monitor.NONE) -> Dict[str, Any]:
    """
    Helper for compute_dataset() and compute_data_frame().
    """
    if not script:
        raise ValidationError(f'Python script must not be empty')

    # Include common libraries
    orig_namespace = dict(
        gpd=gpd,
        geopandas=geopandas,
        math=math,
        np=np,
        numpy=numpy,
        pd=pd,
        pandas=pandas,
        sp=sp,
        scipy=scipy,
        xr=xr,
        xarray=xarray,
    )

    if operation_context is not None and 'value_cache' in operation_context:
        orig_namespace.update(operation_context['value_cache'])

    if context_object is not None:
        orig_namespace.update(context_object)

    local_namespace = dict(orig_namespace)

    with monitor.observing("Executing script"):
        try:
            safe_exec(script, local_namespace=local_namespace)
        except BaseException as e:
            raise ValidationError(f'Error in Python script: {e}') from e

    elements = dict()
    for name, element in local_namespace.items():
        if not name.startswith('_'):
            if isinstance(element, element_types):
                if name not in orig_namespace or element is not orig_namespace[name]:
                    elements[name] = element

    return elements
Code example #10
File: aggregate.py  Project: stratosgear/cate
def temporal_aggregation(ds: DatasetLike.TYPE,
                         method: str = 'mean',
                         monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform monthly aggregation of a daily dataset according to the given
    method.

    :param ds: Dataset to aggregate
    :param method: Aggregation method
    :return: Aggregated dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValueError(
            'Temporal aggregation operation expects a dataset with the'
            ' time coordinate of type datetime64[ns], but received'
            ' {}. Running the normalize operation on this'
            ' dataset may help'.format(ds.time.dtype))

    # Check if we have a daily dataset
    try:
        if ds.attrs['time_coverage_resolution'] != 'P1D':
            raise ValueError(
                'Temporal aggregation operation expects a daily dataset')
    except KeyError:
        raise ValueError('Could not determine temporal resolution. Running'
                         ' the adjust_temporal_attrs operation beforehand may'
                         ' help.')

    with monitor.observing("resample dataset"):
        retset = ds.resample(freq='MS',
                             dim='time',
                             keep_attrs=True,
                             how=method)

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                    retset[var].attrs['cell_methods'] + \
                    ' time: {} within years'.format(method)
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: {} within years'.format(
                method)

    return adjust_temporal_attrs(retset)
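
Note that this variant uses the old xarray resample keyword API (freq=, dim=, how=), which recent xarray releases no longer support. A rough sketch of the equivalent call in current xarray (synthetic data; attribute handling differs between versions):

import numpy as np
import pandas as pd
import xarray as xr

times = pd.date_range("2001-01-01", periods=365, freq="D")
ds = xr.Dataset({"t2m": ("time", 270.0 + 20.0 * np.random.rand(365))},
                coords={"time": times})

# Old-style call used in the snippet above:
#   ds.resample(freq='MS', dim='time', keep_attrs=True, how='mean')
# Current xarray equivalent:
monthly_mean = ds.resample(time="MS").mean()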
Code example #11
def temporal_aggregation(ds: DatasetLike.TYPE,
                         method: str = 'mean',
                         output_resolution: str = 'month',
                         custom_resolution: str = None,
                         monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform aggregation of dataset according to the given
    method and output resolution.

    Note that the operation does not perform weighting. Depending on the
    combination of input and output resolutions, as well as aggregation
    method, the resulting dataset might yield unexpected results.

    Resolution 'month' will result in a monthly dataset with each month
    denoted by its first date. Resolution 'season' will result in a dataset
    aggregated to DJF, MAM, JJA, SON seasons, each denoted by the first
    date of the season.

    The operation also works with custom resolution strings, see:
    http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
    If ``custom_resolution`` is provided, it will override ``output_resolution``.

    Some examples:
      'QS-JUN' produces an output dataset on a quarterly resolution where the
      year ends on the 1st of June and each quarter is denoted by its first date
      '8MS' produces an output dataset on an eight-month resolution where each
      period is denoted by the first date. Note that such periods will not be
      consistent over years.
      '8D' produces a dataset on an eight day resolution

    :param ds: Dataset to aggregate
    :param method: Aggregation method
    :param output_resolution: Desired temporal resolution of the output dataset
    :param custom_resolution: Custom temporal resolution, overrides output_resolution
    :return: Aggregated dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValidationError(
            'Temporal aggregation operation expects a dataset with the'
            ' time coordinate of type datetime64[ns], but received'
            ' {}. Running the normalize operation on this'
            ' dataset may help'.format(ds.time.dtype))

    # Try to figure out the input frequency
    try:
        in_freq = ds.attrs['time_coverage_resolution']
    except KeyError:
        raise ValidationError(
            'Could not determine temporal resolution of input dataset.'
            ' Running the adjust_temporal_attrs operation beforehand may'
            ' help.')

    if custom_resolution:
        freq = custom_resolution
    else:
        frequencies = {'month': 'MS', 'season': 'QS-DEC'}
        freq = frequencies[output_resolution]

    _validate_freq(in_freq, freq)

    with monitor.observing("resample dataset"):
        try:
            retset = getattr(resampler, method)(ds.resample(time=freq,
                                                            keep_attrs=True))
        except AttributeError:
            raise ValidationError(
                f'Provided aggregation method {method} is not valid.')

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + \
                ' time: {} within years'.format(method)
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: {} within years'.format(
                method)

    return adjust_temporal_attrs(retset)
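
A usage sketch for this newer variant, assuming temporal_aggregation above and the cate helpers it calls (adjust_temporal_attrs, _validate_freq, the resampler lookup) are importable; the daily dataset is synthetic:

import numpy as np
import pandas as pd
import xarray as xr

times = pd.date_range("2002-01-01", periods=365, freq="D")
ds = xr.Dataset({"t2m": ("time", 270.0 + 20.0 * np.random.rand(365))},
                coords={"time": times},
                attrs={"time_coverage_resolution": "P1D"})

monthly = temporal_aggregation(ds)                               # default: monthly means
seasonal = temporal_aggregation(ds, output_resolution="season")  # DJF/MAM/JJA/SON
eight_day = temporal_aggregation(ds, custom_resolution="8D")     # pandas offset alias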
Code example #12
File: correlation.py  Project: whigg/cate
def pearson_correlation_scalar(
        ds_x: DatasetLike.TYPE,
        ds_y: DatasetLike.TYPE,
        var_x: VarName.TYPE,
        var_y: VarName.TYPE,
        monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Do product moment `Pearson's correlation <http://www.statsoft.com/Textbook/Statistics-Glossary/P/button/p#Pearson%20Correlation>`_ analysis.

    Performs a simple correlation analysis on two data variables and returns
    a correlation coefficient and the corresponding p_value.

    Positive correlation implies that as x grows, so does y. Negative
    correlation implies that as x increases, y decreases.

    For more information on how to interpret the results, see
    `here <http://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/how-to/correlation/interpret-the-results/>`_,
    and `here <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html>`_.

    :param ds_x: The 'x' dataset
    :param ds_y: The 'y' dataset
    :param var_x: Dataset variable to use for correlation analysis in the 'variable' dataset
    :param var_y: Dataset variable to use for correlation analysis in the 'dependent' dataset
    :param monitor: a progress monitor.
    :return: Data frame {'corr_coef': correlation coefficient, 'p_value': probability value}
    """
    ds_x = DatasetLike.convert(ds_x)
    ds_y = DatasetLike.convert(ds_y)
    var_x = VarName.convert(var_x)
    var_y = VarName.convert(var_y)

    array_y = ds_y[var_y]
    array_x = ds_x[var_x]

    if (array_x.dims != array_y.dims):
        raise ValidationError(
            'Both datasets should feature the same'
            ' dimensionality. Currently provided ds_x[var_x] '
            f'has {array_x.dims}, provided ds_y[var_y]'
            f' has {array_y.dims}')

    for dim in array_x.dims:
        if len(array_x[dim]) != len(array_y[dim]):
            raise ValidationError(
                'All dimensions of both provided data variables'
                f' must be the same length. Currently {dim} of ds_x[var_x]'
                f' has {len(array_x[dim])} values, while'
                f' {dim} of ds_y[var_y] has {len(array_y[dim])} values.'
                ' You may want to try to coregister the datasets beforehand.')

    n_vals = 1
    for dim in array_x.dims:
        n_vals = n_vals * len(array_x[dim])

    if n_vals < 3:
        raise ValidationError(
            'There should be no less than 3 values in both data variables'
            f' to perform the correlation. Currently there are {n_vals} values'
        )

    with monitor.observing("Calculate Pearson correlation"):
        cc, pv = pearsonr(array_x.stack(z=array_x.dims),
                          array_y.stack(z=array_y.dims))

    return pd.DataFrame({'corr_coef': [cc], 'p_value': [pv]})
Code example #13
File: arithmetics.py  Project: TonioF/cate
def compute(ds: DatasetLike.TYPE,
            script: str,
            copy: bool = False,
            _ctx: dict = None,
            monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Compute a new dataset from the given Python *script*.
    The argument *script* must be valid Python code or a single expression comprising at least one
    value assignment of the form <name> = <expr>. Multiple assignments can be done on multiple lines
    or on a single line separated by semicolons.

    <expr> may reference variables in the given context dataset *ds* or other resources and their variables
    from the current workflow.
    In the latter case, use the dot operator to select a variable from a dataset resource.

    Every new variable in *script* of type data array will be added to the new dataset.

    The following packages are available in the code:

    * ``np``: The ``numpy`` top-level package (https://docs.scipy.org/doc/numpy/reference/)
    * ``pd``: The ``pandas`` top-level package (http://pandas.pydata.org/pandas-docs/stable/api.html)
    * ``xr``: The ``xarray`` top-level package (http://xarray.pydata.org/en/stable/api.html)
    * ``xu``: The ``xarray.ufuncs`` package (http://xarray.pydata.org/en/stable/api.html#universal-functions)

    Note that, in contrast to the ``np`` package, all the math functions defined in ``xu`` preserve variable attributes.

    :param ds: Optional context dataset. All variables of this dataset are directly accessible in the *script*.
               If omitted, all variables need to be prefixed by their dataset resource names.
    :param script: Valid Python expression comprising at least one assignment of the form <name> = <expr>.
    :param copy: Whether to copy all variables from *ds*.
    :param monitor: An optional progress monitor.
    :return: A new dataset.
    """

    if _ctx is not None and 'value_cache' in _ctx:
        orig_namespace = dict(_ctx['value_cache'])
    else:
        orig_namespace = dict()

    if ds is not None:
        orig_namespace.update(ds.data_vars)

    orig_namespace['np'] = np
    orig_namespace['pd'] = pd
    orig_namespace['xr'] = xr
    orig_namespace['xu'] = xu

    local_namespace = dict(orig_namespace)

    with monitor.observing("Executing script"):
        safe_exec(script, local_namespace=local_namespace)

    data_vars = {}
    for name, array in local_namespace.items():
        if isinstance(array, xr.DataArray) or isinstance(array, xr.Variable):
            is_new_data_var = name not in orig_namespace
            if not is_new_data_var:
                is_new_data_var = array is not orig_namespace[name]
            if is_new_data_var:
                array.name = name
                data_vars[name] = array

    if ds is not None and copy:
        new_ds = ds.copy()
        for name, array in data_vars.items():
            new_ds[name] = array
    else:
        new_ds = xr.Dataset(data_vars=data_vars)

    return new_ds
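
A usage sketch for the script-based variant, assuming compute above is in scope together with cate's safe_exec and the injected aliases (np, pd, xr, xu); the dataset and assignments are invented:

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {"sst": (("lat", "lon"), 290.0 + np.random.rand(2, 3)),
     "ice": (("lat", "lon"), np.random.rand(2, 3))},
    coords={"lat": [0.0, 1.0], "lon": [0.0, 1.0, 2.0]},
)

script = "sst_c = sst - 273.15; warm_ice = xr.where(sst > 290.5, ice, 0.0)"
result = compute(ds, script)

# Each data array assigned in the script becomes a variable of the new dataset.
print(list(result.data_vars))  # the arrays assigned in the script: sst_c, warm_ice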
Code example #14
File: aggregate.py  Project: CCI-Tools/ect-core
def temporal_aggregation(ds: DatasetLike.TYPE,
                         method: str = 'mean',
                         output_resolution: str = 'month',
                         custom_resolution: str = None,
                         monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform aggregation of dataset according to the given
    method and output resolution.

    Note that the operation does not perform weighting. Depending on the
    combination of input and output resolutions, as well as aggregation
    method, the resulting dataset might yield unexpected results.

    Resolution 'month' will result in a monthly dataset with each month
    denoted by its first date. Resolution 'season' will result in a dataset
    aggregated to DJF, MAM, JJA, SON seasons, each denoted by the first
    date of the season.

    The operation also works with custom resolution strings, see:
    http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
    If ``custom_resolution`` is provided, it will override ``output_resolution``.

    Some examples:
      'QS-JUN' produces an output dataset on a quarterly resolution where the
      year ends on the 1st of June and each quarter is denoted by its first date
      '8MS' produces an output dataset on an eight-month resolution where each
      period is denoted by the first date. Note that such periods will not be
      consistent over years.
      '8D' produces a dataset on an eight day resolution

    :param ds: Dataset to aggregate
    :param method: Aggregation method
    :param output_resolution: Desired temporal resolution of the output dataset
    :param custom_resolution: Custom temporal resolution, overrides output_resolution
    :return: Aggregated dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValidationError('Temporal aggregation operation expects a dataset with the'
                              ' time coordinate of type datetime64[ns], but received'
                              ' {}. Running the normalize operation on this'
                              ' dataset may help'.format(ds.time.dtype))

    # Try to figure out the input frequency
    try:
        in_freq = ds.attrs['time_coverage_resolution']
    except KeyError:
        raise ValidationError('Could not determine temporal resolution of input dataset.'
                              ' Running the adjust_temporal_attrs operation beforehand may'
                              ' help.')

    if custom_resolution:
        freq = custom_resolution
    else:
        frequencies = {'month': 'MS', 'season': 'QS-DEC'}
        freq = frequencies[output_resolution]

    _validate_freq(in_freq, freq)

    with monitor.observing("resample dataset"):
        try:
            retset = getattr(resampler, method)(ds.resample(time=freq, keep_attrs=True))
        except AttributeError:
            raise ValidationError(f'Provided aggregation method {method} is not valid.')

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + \
                ' time: {} within years'.format(method)
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: {} within years'.format(method)

    return adjust_temporal_attrs(retset)