Example #1
def _validate_freq(in_res: str, out_res: str) -> None:
    """
    Validate the aggregation step

    See also: `ISO 8601 Durations <https://en.wikipedia.org/wiki/ISO_8601#Durations>`_
    """
    # Validate output frequency as a valid offset string
    try:
        dates = pd.date_range('2000-01-01', periods=5, freq=out_res)
    except ValueError:
        raise ValidationError('Invalid custom resolution: {}.'
                              ' Please check operation documentation.'.format(out_res))

    # Assuming simple ISO_8601 periods: PXXD/M
    try:
        count = int(in_res[1:-1])
    except ValueError:
        raise ValidationError('Could not interpret time coverage resolution of'
                              ' the given dataset: {}'.format(in_res))

    if in_res == 'P1M' and out_res == 'MS':
        raise ValidationError('Input dataset is already at the requested output resolution.'
                              ' Execution stopped.')

    in_delta = pd.Timedelta(count, unit=in_res[-1])
    out_delta = dates[1] - dates[0]

    if out_delta < in_delta:
        raise ValidationError('Requested output resolution is smaller than dataset resolution.'
                              ' This operation only performs aggregation to larger resolutions.')
    elif out_delta == in_delta:
        raise ValidationError('Input dataset is already at the requested output resolution.'
                              ' Execution stopped.')

    return
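
A minimal usage sketch, assuming the function above is pasted into a script together with pandas (as pd) and a stand-in for the project's own ValidationError exception:

import pandas as pd

class ValidationError(Exception):
    """Stand-in for the project's exception type, so the snippet runs on its own."""

# Daily input ('P1D') aggregated to month starts ('MS') is a valid, coarser resolution.
_validate_freq('P1D', 'MS')

# Requesting the resolution the dataset already has is rejected.
try:
    _validate_freq('P1M', 'MS')
except ValidationError as err:
    print(err)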
Example #2
def get_var_data(var, indexers: dict, remaining_dims=None):
    """Select an arbitrary piece of an xarray dataset by using indexers."""
    if indexers:
        if remaining_dims:
            for dim in remaining_dims:
                if dim not in var.dims:
                    raise ValidationError(
                        f'The specified dataset does not have a dimension called \'{dim}\'.'
                    )
                if dim in indexers:
                    raise ValidationError(
                        f'Dimension \'{dim}\' is also specified as indexers. Please ensure that a '
                        f'dimension is used exclusively either as indexers or as the selected '
                        f'dimension.')
        for dim in indexers:
            if dim not in var.dims:
                raise ValidationError(
                    f'The specified dataset does not have a dimension called \'{dim}\'.'
                )
        var = var.sel(method='nearest', **indexers)

    if remaining_dims:
        isel_indexers = {
            dim_name: 0
            for dim_name in var.dims if dim_name not in remaining_dims
        }
        var = var.isel(**isel_indexers)

    return var
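
A small, self-contained usage sketch on synthetic data, assuming the function above is available in the same script alongside numpy and xarray:

import numpy as np
import xarray as xr

# Tiny 3-D variable: time x lat x lon.
var = xr.DataArray(np.arange(24.0).reshape(2, 3, 4),
                   coords={'time': ['2000-01-01', '2000-02-01'],
                           'lat': [0.0, 1.0, 2.0],
                           'lon': [10.0, 11.0, 12.0, 13.0]},
                   dims=('time', 'lat', 'lon'))

# Nearest-neighbour selection on 'lat'; 'lon' is kept, and the leftover 'time'
# dimension is collapsed to its first index.
piece = get_var_data(var, {'lat': 0.9}, remaining_dims=('lon',))
print(piece.dims)   # ('lon',)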
Example #3
def _find_intersection(
        first: np.ndarray, second: np.ndarray,
        global_bounds: Tuple[float, float]) -> Tuple[float, float]:
    """
    Find 1D intersection of given arrays such that the resulting intersection
    bounds fall on 'pixel' boundaries for both given arrays.

    :param first: First 1D array
    :param second: Second 1D array
    :param global_bounds: (min, max) maximum interval for a valid intersection
    :return: (min, max) intersection bounds
    """
    first_px_size = abs(first[1] - first[0])
    second_px_size = abs(second[1] - second[0])

    minimum = max(first[0] - first_px_size / 2, second[0] - second_px_size / 2)
    maximum = min(first[-1] + first_px_size / 2,
                  second[-1] + second_px_size / 2)

    delta = maximum - minimum
    if delta < max(first_px_size, second_px_size):
        raise ValidationError('Could not find a valid intersection to perform'
                              ' coregistration on')

    # Make sure min/max fall on pixel boundaries for both grids
    # Because there exists a number N denoting how many smaller pixels fall
    # into one larger pixel (for pixel registered datasets with the same
    # origin) => the boundary has to be adjusted by steps equal
    # to smaller pixels.
    finer = min(first_px_size, second_px_size)
    safety = 100
    i = 0
    while (0 != (minimum - global_bounds[0]) % first_px_size and 0 !=
           (minimum - global_bounds[0]) % second_px_size):
        if i == safety:
            raise ValidationError(
                'Could not find a valid intersection to perform'
                ' coregistration on')
        minimum = minimum + finer
        i = i + 1

    i = 0
    while (0 != (global_bounds[1] - maximum) % first_px_size and 0 !=
           (global_bounds[1] - maximum) % second_px_size):
        if i == safety:
            raise ValidationError(
                'Could not find a valid intersection to perform'
                ' coregistration on')
        maximum = maximum - finer
        i = i + 1

    # This is possible in some cases when mis-aligned grid arrays are presented
    if maximum <= minimum:
        raise ValidationError('Could not find a valid intersection to perform'
                              ' coregistration on')

    return (minimum, maximum)
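
A quick sketch with two synthetic, pixel-registered longitude grids that share the same origin (assumes the function above plus numpy are in scope):

import numpy as np

coarse = np.arange(-179.5, 180.0, 1.0)    # centres of 1-degree pixels
fine = np.arange(-179.75, 180.0, 0.5)     # centres of 0.5-degree pixels

lo, hi = _find_intersection(coarse, fine, global_bounds=(-180.0, 180.0))
print(lo, hi)   # bounds snapped to pixel edges shared by both grids: -180.0 180.0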
Example #4
def anomaly_external(ds: xr.Dataset,
                     file: str,
                     transform: str = None,
                     monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Calculate anomaly with external reference data, for example, a climatology.
    The given reference dataset is expected to consist of 12 time slices, one
    for each month.

    The returned dataset will contain the variable names found in both the
    reference and the given dataset. Names found in the given dataset, but not in
    the reference, will be dropped from the resulting dataset. The calculated
    anomaly will be against the corresponding month of the reference data,
    e.g. January against January.

    In case spatial extents differ between the reference and the given dataset,
    the anomaly will be calculated on the intersection.

    :param ds: The dataset to calculate anomalies from
    :param file: Path to reference data file
    :param transform: Apply the given transformation before calculating the anomaly.
                      For supported operations see help on 'ds_arithmetics' operation.
    :param monitor: a progress monitor.
    :return: The anomaly dataset
    """
    # Check if the time coordinate is of dtype datetime
    try:
        if ds.time.dtype != 'datetime64[ns]':
            raise ValidationError(
                'The dataset provided for anomaly calculation'
                ' is required to have a time coordinate of'
                ' dtype datetime64[ns]. Running the normalize'
                ' operation on this dataset might help.')
    except AttributeError:
        raise ValidationError('The dataset provided for anomaly calculation'
                              ' is required to have a time coordinate.')

    clim = xr.open_dataset(file)
    ret = ds.copy()
    if transform:
        ret = ds_arithmetics(ds, transform)
    # Group by months, subtract the appropriate slice from the reference
    # Note that this requires that 'time' coordinate labels are of type
    # datetime64[ns]
    total_work = 100
    step = 100 / 12

    with monitor.starting('Anomaly', total_work=total_work):
        monitor.progress(work=0)
        kwargs = {'ref': clim, 'monitor': monitor, 'step': step}
        ret = ret.groupby(ds['time.month']).apply(_group_anomaly, **kwargs)

    # Running groupby results in a redundant 'month' variable being added to
    # the dataset
    ret = ret.drop('month')
    return ret
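
The core of this operation is xarray's group-by-month arithmetic; a stripped-down, standalone sketch of that pattern on synthetic data (no reference file, transform, monitor or intersection handling):

import numpy as np
import pandas as pd
import xarray as xr

times = pd.date_range('2000-01-01', periods=24, freq='MS')
ds = xr.Dataset({'sst': ('time', np.random.rand(24))}, coords={'time': times})

# A 12-slice reference ("climatology"), one value per calendar month.
clim = ds.groupby('time.month').mean('time')

# Subtract the matching month from every time step: January against January, etc.
anomaly = ds.groupby('time.month') - clim
anomaly = anomaly.drop_vars('month')   # groupby adds a redundant 'month' coordinate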
Example #5
def pearson_correlation_scalar(
        ds_x: DatasetLike.TYPE,
        ds_y: DatasetLike.TYPE,
        var_x: VarName.TYPE,
        var_y: VarName.TYPE,
        monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Do product moment `Pearson's correlation <http://www.statsoft.com/Textbook/Statistics-Glossary/P/button/p#Pearson%20Correlation>`_ analysis.

    Performs a simple correlation analysis on two timeseries and returns
    a correlation coefficient and the corresponding p_value.

    Positive correlation implies that as x grows, so does y. Negative
    correlation implies that as x increases, y decreases.

    For more information how to interpret the results, see
    `here <http://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/how-to/correlation/interpret-the-results/>`_,
    and `here <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html>`_.

    :param ds_x: The 'x' dataset
    :param ds_y: The 'y' dataset
    :param var_x: Dataset variable to use for correlation analysis in the 'variable' dataset
    :param var_y: Dataset variable to use for correlation analysis in the 'dependent' dataset
    :param monitor: a progress monitor.
    :return: {'corr_coef': correlation coefficient, 'p_value': probability value}
    """
    ds_x = DatasetLike.convert(ds_x)
    ds_y = DatasetLike.convert(ds_y)
    var_x = VarName.convert(var_x)
    var_y = VarName.convert(var_y)

    array_y = ds_y[var_y]
    array_x = ds_x[var_x]

    if ((len(array_x.dims) != len(array_y.dims)) and (len(array_x.dims) != 1)):
        raise ValidationError('To calculate simple correlation, both provided'
                              ' datasets should be simple 1d timeseries. To'
                              ' create a map of correlation coefficients, use'
                              ' pearson_correlation operation instead.')

    if len(array_x['time']) != len(array_y['time']):
        raise ValidationError(
            'The length of the time dimension differs between'
            ' the given datasets. Can not perform the calculation'
            ', please review operation documentation.')

    if len(array_x['time']) < 3:
        raise ValidationError(
            'The length of the time dimension should not be less'
            ' than three to run the calculation.')

    with monitor.observing("Calculate Pearson correlation"):
        cc, pv = pearsonr(array_x.values, array_y.values)

    return pd.DataFrame({'corr_coef': [cc], 'p_value': [pv]})
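
The operation ultimately delegates to scipy.stats.pearsonr on two 1-D arrays; a minimal standalone sketch of that step and of the shape of the returned data frame:

import numpy as np
import pandas as pd
from scipy.stats import pearsonr

x = np.array([0.1, 0.5, 0.9, 1.3, 1.8])
y = 2.0 * x + np.array([0.0, 0.05, -0.02, 0.03, 0.01])   # nearly perfectly correlated

cc, pv = pearsonr(x, y)
print(pd.DataFrame({'corr_coef': [cc], 'p_value': [pv]}))   # corr_coef close to 1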
Example #6
def long_term_average(ds: DatasetLike.TYPE,
                      var: VarNamesLike.TYPE = None,
                      monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Create a 'mean over years' dataset by averaging the values of the given input
    dataset over all years. The output is a climatological dataset with the same
    resolution as the input dataset. E.g. a daily input dataset will create a daily
    climatology consisting of 365 days, a monthly input dataset will create a monthly
    climatology, etc.

    Seasonal input datasets must have matching seasons over all years denoted by the
    same date each year. E.g., first date of each quarter. The output dataset will
    then be a seasonal climatology where each season is denoted with the same date
    as in the input dataset.

    For further information on climatological datasets, see
    http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#climatological-statistics

    :param ds: A dataset to average
    :param var: If given, only these variables will be preserved in the resulting dataset
    :param monitor: A progress monitor
    :return: A climatological long term average dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValidationError(
            'Long term average operation expects a dataset with the'
            ' time coordinate of type datetime64[ns], but received'
            ' {}. Running the normalize operation on this'
            ' dataset may help'.format(ds.time.dtype))

    try:
        t_resolution = ds.attrs['time_coverage_resolution']
    except KeyError:
        raise ValidationError(
            'Could not determine temporal resolution. Running'
            ' the adjust_temporal_attrs operation beforehand may'
            ' help.')

    var = VarNamesLike.convert(var)
    # Shallow

    if var:
        ds = select_var(ds, var)

    if t_resolution == 'P1D':
        return _lta_daily(ds)
    elif t_resolution == 'P1M':
        return _lta_monthly(ds, monitor)
    else:
        return _lta_general(ds, monitor)
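
The _lta_daily/_lta_monthly/_lta_general helpers are not shown in this listing; for monthly input the essence is a group-by-month mean, roughly like this standalone sketch:

import numpy as np
import pandas as pd
import xarray as xr

times = pd.date_range('2000-01-01', periods=36, freq='MS')
ds = xr.Dataset({'tas': ('time', np.random.rand(36))}, coords={'time': times})

# "Mean over years" for monthly data: average all Januaries, all Februaries, ...
lta = ds.groupby('time.month').mean('time')
print(lta.sizes)   # {'month': 12}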
Example #7
def _exec_script(script: str,
                 element_types: Tuple[type, ...],
                 operation_context: Mapping[str, Any] = None,
                 context_object: Mapping[str, Any] = None,
                 monitor: Monitor = Monitor.NONE) -> Dict[str, Any]:
    """
    Helper for compute_dataset() and compute_data_frame().
    """
    if not script:
        raise ValidationError('Python script must not be empty')

    # Include common libraries
    orig_namespace = dict(
        gpd=gpd,
        geopandas=geopandas,
        math=math,
        np=np,
        numpy=numpy,
        pd=pd,
        pandas=pandas,
        sp=sp,
        scipy=scipy,
        xr=xr,
        xarray=xarray,
    )

    if operation_context is not None and 'value_cache' in operation_context:
        orig_namespace.update(operation_context['value_cache'])

    if context_object is not None:
        orig_namespace.update(context_object)

    local_namespace = dict(orig_namespace)

    with monitor.observing("Executing script"):
        try:
            safe_exec(script, local_namespace=local_namespace)
        except BaseException as e:
            raise ValidationError(f'Error in Python script: {e}') from e

    elements = dict()
    for name, element in local_namespace.items():
        if not name.startswith('_'):
            if isinstance(element, element_types):
                if name not in orig_namespace or element is not orig_namespace[
                        name]:
                    elements[name] = element

    return elements
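
The interesting part is how results are collected: the script runs against a copy of a seed namespace, and anything public, of an accepted type, and either new or rebound is returned. A rough standalone sketch of that idea, with plain exec() standing in for the project's sandboxed safe_exec():

import xarray as xr

script = (
    "import numpy as np\n"
    "ds = xr.Dataset({'a': ('x', np.arange(3))})\n"
    "_tmp = 42\n"
)

orig_namespace = dict(xr=xr)
local_namespace = dict(orig_namespace)
exec(script, local_namespace)   # safe_exec in the real helper

# Keep public names of the accepted types that the script created or rebound.
elements = {name: obj for name, obj in local_namespace.items()
            if not name.startswith('_')
            and isinstance(obj, xr.Dataset)
            and (name not in orig_namespace or obj is not orig_namespace[name])}
print(list(elements))   # ['ds']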
Example #8
File: io.py Project: whigg/cate
def write_geo_data_frame(gdf: gpd.GeoDataFrame,
                         file: str, crs: str = None,
                         more_args: DictLike.TYPE = None):
    """
    Write a geo data frame to files with formats such as ESRI Shapefile or GeoJSON.

    :param gdf: A geo data frame.
    :param file: Is either the absolute or relative path to the file to be opened.
    :param more_args: Other optional keyword arguments.
           Please refer to Python documentation of ``fiona.open()`` function.
    """
    kwargs = DictLike.convert(more_args) or {}
    if "driver" in kwargs:
        driver = kwargs.pop("driver")
    else:
        root, ext = os.path.splitext(file)
        ext_low = ext.lower()
        if ext_low == "":
            driver = "ESRI Shapefile"
            file += ".shp"
        elif ext_low == ".shp":
            driver = "ESRI Shapefile"
        elif ext_low == ".json" or ext_low == ".geojson":
            driver = "GeoJSON"
        elif ext_low == ".gpx":
            driver = "GPX"
        elif ext_low == ".gpkg":
            driver = "GPKG"
        else:
            raise ValidationError(f'Cannot detect supported format from file extension "{ext}"')
    gdf.to_file(file, driver=driver, **kwargs)
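
A small sketch of the underlying write for a GeoJSON path: the extension check above would resolve ".geojson" to the "GeoJSON" driver, so the call boils down to geopandas' own to_file():

import geopandas as gpd
from shapely.geometry import Point

gdf = gpd.GeoDataFrame({'name': ['a', 'b']},
                       geometry=[Point(0.0, 0.0), Point(1.0, 1.0)],
                       crs='EPSG:4326')

gdf.to_file('points.geojson', driver='GeoJSON')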
Example #9
def no_op(num_steps: int = 20,
          step_duration: float = 0.5,
          fail_before: bool = False,
          fail_after: bool = False,
          monitor: Monitor = Monitor.NONE) -> bool:
    """
    An operation that does nothing except spend a configurable amount of time.
    It may be useful for testing purposes.

    :param num_steps: Number of steps to iterate.
    :param step_duration: How much time to spend in each step in seconds.
    :param fail_before: If the operation should fail before spending time doing nothing (raise a ValidationError).
    :param fail_after: If the operation should fail after spending time doing nothing (raise a ValueError).
    :param monitor: A progress monitor.
    :return: Always True
    """
    import time
    monitor.start('Computing nothing', num_steps)
    if fail_before:
        raise ValidationError('Intentionally failed before doing anything.')
    for i in range(num_steps):
        time.sleep(step_duration)
        monitor.progress(1.0,
                         'Step %s of %s doing nothing' % (i + 1, num_steps))
    if fail_after:
        raise ValueError('Intentionally failed after doing nothing.')
    monitor.done()
    return True
Example #10
def get_vars_data(ds, indexers: dict, remaining_dims=None):
    """Select an arbitrary piece of an xarray dataset by using indexers."""
    # to avoid the original dataset being affected (especially useful in unit tests)
    ds = ds.copy()

    if indexers:
        invalid_indexers = list(indexers)
        for var_name in ds:
            if ds[var_name].name in ds[var_name].dims:
                continue
            var_indexers = {}
            if remaining_dims:
                for dim in remaining_dims:
                    if dim not in ds[var_name].dims:
                        raise ValidationError(
                            f'The specified dataset does not have a dimension called \'{dim}\'.'
                        )
                    if dim in indexers:
                        raise ValidationError(
                            f'Dimension \'{dim}\' is also specified as indexers. Please ensure that a '
                            f'dimension is used exclusively either as indexers or as the selected '
                            f'dimension.')
            for dim in ds[var_name].dims:
                if dim in indexers:
                    var_indexers[dim] = indexers[dim]
            # Iterate over a copy because matching entries are removed from the list.
            for dim in list(invalid_indexers):
                if dim in ds[var_name].dims:
                    invalid_indexers.remove(dim)
            ds[var_name] = ds[var_name].sel(method='nearest', **var_indexers)

            if remaining_dims:
                isel_indexers = {
                    dim_name: 0
                    for dim_name in ds[var_name].dims
                    if dim_name not in remaining_dims
                }
                ds[var_name] = ds[var_name].isel(**isel_indexers)

        if len(invalid_indexers) > 0:
            raise ValidationError(
                f'There are dimensions specified in indexers that do not match the dimensions of '
                f'any variable: {invalid_indexers}')

    return ds
Example #11
def ds_arithmetics(ds: DatasetLike.TYPE,
                   op: str,
                   monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Do arithmetic operations on the given dataset by providing a list of
    arithmetic operations and the corresponding constant. The operations will
    be applied to the dataset in the order in which they appear in the list.
    For example:
    'log,+5,-2,/3,*2'

    Currently supported arithmetic operations:
    log,log10,log2,log1p,exp,+,-,/,*

    where:
        log - natural logarithm
        log10 - base 10 logarithm
        log2 - base 2 logarithm
        log1p - log(1+x)
        exp - the exponential

    The operations will be applied element-wise to all arrays of the dataset.

    :param ds: The dataset to which to apply arithmetic operations
    :param op: A comma separated list of arithmetic operations to apply
    :param monitor: a progress monitor.
    :return: The dataset with given arithmetic operations applied
    """
    ds = DatasetLike.convert(ds)
    retset = ds
    with monitor.starting('Calculate result', total_work=len(op.split(','))):
        for item in op.split(','):
            with monitor.child(1).observing("Calculate"):
                item = item.strip()
                if item[0] == '+':
                    retset = retset + float(item[1:])
                elif item[0] == '-':
                    retset = retset - float(item[1:])
                elif item[0] == '*':
                    retset = retset * float(item[1:])
                elif item[0] == '/':
                    retset = retset / float(item[1:])
                elif item == 'log':
                    retset = xu.log(retset)
                elif item == 'log10':
                    retset = xu.log10(retset)
                elif item == 'log2':
                    retset = xu.log2(retset)
                elif item == 'log1p':
                    retset = xu.log1p(retset)
                elif item == 'exp':
                    retset = xu.exp(retset)
                else:
                    raise ValidationError('Arithmetic operation {} not'
                                          ' implemented.'.format(item))

    return retset
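
To make the op string concrete, a standalone sketch of what 'log,+5,*2' does step by step, here on a single DataArray with numpy ufuncs standing in for the xu helpers used above:

import numpy as np
import xarray as xr

da = xr.DataArray(np.array([1.0, 2.0, 4.0]), dims='x')

result = np.log(da)     # 'log'  -> natural logarithm, element-wise
result = result + 5.0   # '+5'
result = result * 2.0   # '*2'
print(result.values)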
Example #12
    def open_dataset(self,
                     time_range: TimeRangeLike.TYPE = None,
                     region: PolygonLike.TYPE = None,
                     var_names: VarNamesLike.TYPE = None,
                     protocol: str = None,
                     monitor: Monitor = Monitor.NONE) -> Any:
        time_range = TimeRangeLike.convert(time_range) if time_range else None
        var_names = VarNamesLike.convert(var_names) if var_names else None

        selected_file_list = self._find_files(time_range)
        if not selected_file_list:
            msg = 'CCI Open Data Portal data source "{}"\ndoes not seem to have any datasets'.format(
                self.id)
            if time_range is not None:
                msg += ' in given time range {}'.format(
                    TimeRangeLike.format(time_range))
            raise ValidationError(msg)

        files = self._get_urls_list(selected_file_list, _ODP_PROTOCOL_OPENDAP)
        try:
            ds = open_xarray_dataset(files, monitor=monitor)
            if region:
                ds = normalize_impl(ds)
                ds = subset_spatial_impl(ds, region)
            if var_names:
                ds = ds.drop([
                    var_name for var_name in ds.data_vars.keys()
                    if var_name not in var_names
                ])
            return ds

        except OSError as e:
            if time_range:
                raise ValidationError(
                    "Cannot open remote dataset for time range {}:\n"
                    "{}".format(TimeRangeLike.format(time_range), e),
                    source=self) from e
            else:
                raise DataAccessError("Cannot open remote dataset:\n"
                                      "{}".format(e),
                                      source=self) from e
Example #13
    def convert(cls, value, default=None) -> ExamplePoint:
        try:
            if isinstance(value, ExamplePoint):
                return value
            if isinstance(value, str):
                pair = value.split(',')
                return ExamplePoint(float(pair[0]), float(pair[1]))
            return ExamplePoint(value[0], value[1])
        except Exception:
            raise ValidationError('Cannot convert value <%s> to %s.' %
                                  (repr(value), cls.name()))
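
A compact, self-contained variant of the same pattern, using a hypothetical ExamplePoint class purely for illustration (the snippet above omits the class body and the classmethod decorator):

class ExamplePoint:
    """Hypothetical 2-D point used only to illustrate the convert() cascade."""

    def __init__(self, x: float, y: float):
        self.x, self.y = float(x), float(y)

    @classmethod
    def convert(cls, value):
        # Pass through, parse an "x,y" string, or index any 2-element sequence.
        if isinstance(value, cls):
            return value
        if isinstance(value, str):
            pair = value.split(',')
            return cls(float(pair[0]), float(pair[1]))
        return cls(value[0], value[1])

p = ExamplePoint.convert('1.5, -2.0')
print(p.x, p.y)   # 1.5 -2.0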
Example #14
File: plot.py Project: TonioF/cate
def plot_contour(ds: xr.Dataset,
                 var: VarName.TYPE,
                 time: TimeLike.TYPE = None,
                 indexers: DictLike.TYPE = None,
                 title: str = None,
                 filled: bool = True,
                 properties: DictLike.TYPE = None,
                 file: str = None) -> Figure:
    """
    Create a contour plot of a variable given by dataset *ds* and variable name *var*.

    :param ds: the dataset containing the variable to plot
    :param var: the variable's name
    :param time: time slice index to plot, can be a string "YYYY-MM-DD" or an integer number
    :param indexers: Optional indexers into data array of *var*. The *indexers* is a dictionary
           or a comma-separated string of key-value pairs that maps the variable's dimension names
           to constant labels. e.g. "layer=4".
    :param title: an optional title
    :param filled: whether the regions between two contours shall be filled
    :param properties: optional plot properties for Python matplotlib,
           e.g. "bins=512, range=(-1.5, +1.5), label='Sea Surface Temperature'"
           For full reference refer to
           https://matplotlib.org/api/lines_api.html and
           https://matplotlib.org/devdocs/api/_as_gen/matplotlib.patches.Patch.html#matplotlib.patches.Patch
    :param file: path to a file in which to save the plot
    :return: a matplotlib figure object or None if in IPython mode
    """
    var_name = VarName.convert(var)
    if not var_name:
        raise ValidationError("Missing name for 'var'")
    var = ds[var_name]

    time = TimeLike.convert(time)
    indexers = DictLike.convert(indexers) or {}
    properties = DictLike.convert(properties) or {}

    figure = plt.figure(figsize=(8, 4))
    ax = figure.add_subplot(111)

    var_data = get_var_data(var, indexers, time=time)
    if filled:
        var_data.plot.contourf(ax=ax, **properties)
    else:
        var_data.plot.contour(ax=ax, **properties)

    if title:
        ax.set_title(title)

    figure.tight_layout()

    if file:
        figure.savefig(file)

    return figure if not in_notebook() else None
Example #15
def plot_hist(ds: xr.Dataset,
              var: VarName.TYPE,
              indexers: DictLike.TYPE = None,
              title: str = None,
              properties: DictLike.TYPE = None,
              file: str = None) -> Figure:
    """
    Plot a variable, optionally save the figure in a file.

    The plot can either be shown using pyplot functionality, or saved,
    if a path is given. The following file formats for saving the plot
    are supported: eps, jpeg, jpg, pdf, pgf, png, ps, raw, rgba, svg,
    svgz, tif, tiff

    :param ds: Dataset that contains the variable named by *var*.
    :param var: The name of the variable to plot
    :param indexers: Optional indexers into data array of *var*. The *indexers* is a dictionary
           or a comma-separated string of key-value pairs that maps the variable's dimension names
           to constant labels. e.g. "lon=12.6, layer=3, time='2012-05-02'".
    :param title: an optional title
    :param properties: optional histogram plot properties for Python matplotlib,
           e.g. "bins=512, range=(-1.5, +1.5), label='Sea Surface Temperature'"
           For full reference refer to
           https://matplotlib.org/devdocs/api/_as_gen/matplotlib.pyplot.hist.html and
           https://matplotlib.org/devdocs/api/_as_gen/matplotlib.patches.Patch.html#matplotlib.patches.Patch
    :param file: path to a file in which to save the plot
    :return: a matplotlib figure object or None if in IPython mode
    """
    var_name = VarName.convert(var)
    if not var_name:
        raise ValidationError("Missing name for 'var'")

    var = ds[var_name]

    indexers = DictLike.convert(indexers)
    properties = DictLike.convert(properties) or {}

    figure = plt.figure(figsize=(8, 4))
    ax = figure.add_subplot(111)
    figure.tight_layout()

    var_data = get_var_data(var, indexers)
    var_data.plot.hist(ax=ax, **properties)

    if title:
        ax.set_title(title)

    figure.tight_layout()

    if file:
        figure.savefig(file)

    return figure if not in_notebook() else None
Example #16
def merge(ds_1: DatasetLike.TYPE,
          ds_2: DatasetLike.TYPE,
          ds_3: DatasetLike.TYPE = None,
          ds_4: DatasetLike.TYPE = None,
          join: str = 'outer',
          compat: str = 'no_conflicts') -> xr.Dataset:
    """
    Merge up to four datasets to produce a new dataset with combined variables from each input dataset.

    This is a wrapper for the ``xarray.merge()`` function.

    For documentation refer to xarray documentation at
    http://xarray.pydata.org/en/stable/generated/xarray.Dataset.merge.html#xarray.Dataset.merge

    The *compat* argument indicates how to compare variables of the same name for potential conflicts:

    * "broadcast_equals": all values must be equal when variables are broadcast
      against each other to ensure common dimensions.
    * "equals": all values and dimensions must be the same.
    * "identical": all values, dimensions and attributes must be the same.
    * "no_conflicts": only values which are not null in both datasets must be equal.
      The returned dataset then contains the combination of all non-null values.

    :param ds_1: The first input dataset.
    :param ds_2: The second input dataset.
    :param ds_3: An optional 3rd input dataset.
    :param ds_4: An optional 4th input dataset.
    :param join: How to combine objects with different indexes.
    :param compat: How to compare variables of the same name for potential conflicts.
    :return: A new dataset with combined variables from each input dataset.
    """

    ds_1 = DatasetLike.convert(ds_1)
    ds_2 = DatasetLike.convert(ds_2)
    ds_3 = DatasetLike.convert(ds_3)
    ds_4 = DatasetLike.convert(ds_4)

    datasets = []
    for ds in (ds_1, ds_2, ds_3, ds_4):
        if ds is not None:
            included = False
            for ds2 in datasets:
                if ds is ds2:
                    included = True
            if not included:
                datasets.append(ds)

    if len(datasets) == 0:
        raise ValidationError('At least two different datasets must be given')
    elif len(datasets) == 1:
        return datasets[0]
    else:
        return xr.merge(datasets, compat=compat, join=join)
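
A minimal sketch of the underlying xarray.merge() call for two small datasets with distinct variables on a shared coordinate:

import numpy as np
import xarray as xr

ds_a = xr.Dataset({'tas': ('time', np.array([1.0, 2.0]))}, coords={'time': [0, 1]})
ds_b = xr.Dataset({'pr': ('time', np.array([0.1, 0.2]))}, coords={'time': [0, 1]})

merged = xr.merge([ds_a, ds_b], compat='no_conflicts', join='outer')
print(list(merged.data_vars))   # ['tas', 'pr']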
Example #17
def _get_min_max(data, monitor=None):
    """
    Get min and max of a dataset, while accounting for all-NaN
    datasets and observing it with the monitor.
    """
    with monitor.child(1).observing("find minimum"):
        data_min = data.min()
    if np.isnan(data_min):
        # Handle all-NaN dataset
        raise ValidationError('Can not create an animation of a dataset containing only NaN values.')
    else:
        with monitor.child(1).observing("find maximum"):
            data_max = data.max()

    return (data_min.values, data_max.values)
Example #18
def fix_lon_360(ds: xr.Dataset) -> xr.Dataset:
    """
    Fix the longitude of the given dataset ``ds`` so that it ranges from -180 to +180 degrees.

    :param ds: The dataset whose longitudes are given in the range 0 to 360.
    :return: The fixed dataset.
    """
    if 'lon' not in ds.coords:
        raise ValidationError('missing coordinate variable "lon"')
    if 'lon' not in ds.sizes:
        raise ValidationError('missing dimension "lon"')
    if len(ds.lon.shape) != 1:
        raise ValidationError('coordinate variable "lon" must be 1-dimensional')
    if len(ds.lon) < 2:
        raise ValidationError('coordinate variable "lon" must have more than one element')

    new_ds = ds.copy()
    lon_size = ds.sizes['lon']
    lon_size_05 = lon_size // 2

    for var_name in new_ds.variables:
        if var_name != 'lon':
            var = new_ds.variables[var_name]
            if len(var.dims) >= 1 and var.dims[-1] == 'lon':
                temp = var.values[..., : lon_size_05]
                var.values[..., : lon_size_05] = var.values[..., lon_size_05:]
                var.values[..., lon_size_05:] = temp

    delta_lon = new_ds['lon'][1] - new_ds['lon'][0]

    new_ds['lon'] = xr.DataArray(np.linspace(-180. + 0.5 * delta_lon, +180. - 0.5 * delta_lon, lon_size),
                                 dims=ds['lon'].dims,
                                 attrs=ds['lon'].attrs)

    new_ds['lon'].attrs['units'] = 'degrees east'
    return new_ds
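
For comparison, a compact standalone sketch of the same 0-360 to -180-180 conversion using plain xarray coordinate arithmetic; note it relabels and sorts the existing pixel centres rather than regenerating them with linspace as above:

import numpy as np
import xarray as xr

ds = xr.Dataset({'t': ('lon', np.arange(8.0))},
                coords={'lon': np.arange(0.0, 360.0, 45.0)})

fixed = ds.assign_coords(lon=(((ds.lon + 180.0) % 360.0) - 180.0)).sortby('lon')
print(fixed.lon.values)   # [-180. -135.  -90.  -45.    0.   45.   90.  135.]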
Example #19
def plot(ds: DatasetLike.TYPE,
         var: VarName.TYPE,
         indexers: DictLike.TYPE = None,
         title: str = None,
         properties: DictLike.TYPE = None,
         file: str = None) -> Figure:
    """
    Create a 1D/line or 2D/image plot of a variable given by dataset *ds* and variable name *var*.

    :param ds: Dataset or Dataframe that contains the variable named by *var*.
    :param var: The name of the variable to plot
    :param indexers: Optional indexers into data array of *var*. The *indexers* is a dictionary
           or a comma-separated string of key-value pairs that maps the variable's dimension names
           to constant labels. e.g. "lat=12.4, time='2012-05-02'".
    :param title: an optional plot title
    :param properties: optional plot properties for Python matplotlib,
           e.g. "bins=512, range=(-1.5, +1.5), label='Sea Surface Temperature'"
           For full reference refer to
           https://matplotlib.org/api/lines_api.html and
           https://matplotlib.org/devdocs/api/_as_gen/matplotlib.patches.Patch.html#matplotlib.patches.Patch
    :param file: path to a file in which to save the plot
    :return: a matplotlib figure object or None if in IPython mode
    """
    ds = DatasetLike.convert(ds)

    var_name = VarName.convert(var)
    if not var_name:
        raise ValidationError("Missing name for 'var'")
    var = ds[var_name]

    indexers = DictLike.convert(indexers)
    properties = DictLike.convert(properties) or {}

    figure = plt.figure()
    ax = figure.add_subplot(111)

    var_data = get_var_data(var, indexers)
    var_data.plot(ax=ax, **properties)

    if title:
        ax.set_title(title)

    figure.tight_layout()

    if file:
        figure.savefig(file)

    return figure if not in_notebook() else None
Example #20
File: index.py Project: whigg/cate
def enso(ds: xr.Dataset,
         var: VarName.TYPE,
         file: str,
         region: str = 'n34',
         custom_region: PolygonLike.TYPE = None,
         threshold: float = None,
         monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Calculate ENSO index, which is defined as a five month running mean of
    anomalies of monthly means of SST data in the given region.

    :param ds: A monthly SST dataset
    :param file: Path to the reference data file, e.g. a climatology. A suitable
           reference dataset can be generated using the long_term_average operation.
    :param var: Dataset variable to use for index calculation
    :param region: Region for index calculation, the default is Nino3.4
    :param custom_region: If 'custom' is chosen as the 'region', this parameter
           has to be provided to set the desired region.
    :param threshold: If given, a boolean El Nino/La Nina timeseries will be
           calculated and added to the output, according to the given threshold.
           An anomaly larger than the positive value of the threshold indicates
           El Nino; an anomaly smaller than the negative of the threshold
           indicates La Nina.
    :param monitor: a progress monitor.
    :return: A data frame that contains the index timeseries.
    """
    regions = {
        'N1+2': '-90, -10, -80, 0',
        'N3': '-150, -5, -90, 5',
        'N3.4': '-170, -5, -120, 5',
        'N4': '160, -5, -150, 5',
        'custom': custom_region
    }
    converted_region = PolygonLike.convert(regions[region])
    if not converted_region:
        raise ValidationError(
            'No region has been provided to ENSO index calculation')

    name = 'ENSO ' + region + ' Index'
    if 'custom' == region:
        name = 'ENSO Index over ' + PolygonLike.format(converted_region)

    return _generic_index_calculation(ds, var, converted_region, 5, file, name,
                                      threshold, monitor)
Example #21
def handle_plot_polygon(region: PolygonLike.TYPE = None):
    """
    Return extents of the given PolygonLike.

    :param region: PolygonLike to introspect
    :return: extents
    """
    if region is None:
        return None

    extents, explicit_coords = get_extents(region)

    lon_min, lat_min, lon_max, lat_max = extents

    if not check_bounding_box(lat_min, lat_max, lon_min, lon_max):
        raise ValidationError(
            'Provided plot extents do not form a valid bounding box '
            'within [-180.0,+180.0,-90.0,+90.0]')
    return extents
Example #22
File: io.py Project: pwambach/cate
def read_zarr(path: str,
              file_system: str = 'Local',
              drop_variables: VarNamesLike.TYPE = None,
              decode_cf: bool = True,
              decode_times: bool = True,
              normalize: bool = True) -> xr.Dataset:
    """
    Read a dataset from a Zarr directory, Zarr ZIP archive, or remote Zarr object storage.

    For the Zarr format, refer to http://zarr.readthedocs.io/en/stable/.

    :param path: Zarr directory path, Zarr ZIP archive path, or object storage path or bucket name.
    :param file_system: File system identifier, "Local" is your locally mounted file system,
           for Amazon S3 use "S3", for general Object Storage use "OBS".
    :param drop_variables: List of variables to be dropped.
    :param decode_cf: Whether to decode CF attributes and coordinate variables.
    :param decode_times: Whether to decode time information (convert time coordinates to ``datetime`` objects).
    :param normalize: Whether to normalize the dataset's geo- and time-coding upon opening. See operation ``normalize``.
    """
    drop_variables = VarNamesLike.convert(drop_variables)

    if file_system == 'Local':
        ds = xr.open_zarr(path,
                          drop_variables=drop_variables,
                          decode_cf=decode_cf,
                          decode_times=decode_times)
    elif file_system == 'S3' or file_system == 'OBS':
        import s3fs
        store = s3fs.S3Map(path, s3=(s3fs.S3FileSystem(anon=True)))
        ds = xr.open_zarr(store,
                          drop_variables=drop_variables,
                          decode_cf=decode_cf,
                          decode_times=decode_times)
    else:
        raise ValidationError(f'Unknown file_system {file_system!r}')

    if normalize:
        return adjust_temporal_attrs(normalize_op(ds))
    return ds
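
A local round-trip sketch of what the 'Local' branch boils down to: writing a small Zarr store with xarray and reopening it (the wrapper above additionally handles S3/OBS stores, variable dropping and normalization):

import numpy as np
import xarray as xr

ds = xr.Dataset({'tas': ('time', np.arange(3.0))}, coords={'time': [0, 1, 2]})
ds.to_zarr('example.zarr', mode='w')

reopened = xr.open_zarr('example.zarr')
print(list(reopened.data_vars))   # ['tas']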
Example #23
def plot_data_frame(df: pd.DataFrame,
                    plot_type: str = 'line',
                    file: str = None,
                    **kwargs) -> Figure:
    """
    Plot a data frame.

    This is a wrapper around the ``pandas.DataFrame.plot()`` function.
    For further documentation please see
    http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.plot.html

    :param df: A pandas dataframe to plot
    :param plot_type: Plot type
    :param file: path to a file in which to save the plot
    :param kwargs: Keyword arguments to pass to the underlying
                   pandas.DataFrame.plot function
    :return: a matplotlib figure object or None if in IPython mode
    """
    if not isinstance(df, pd.DataFrame):
        raise ValidationError('"df" must be of type "pandas.DataFrame"')

    ax = df.plot(kind=plot_type, figsize=(8, 4), **kwargs)
    figure = ax.get_figure()
    if file:
        figure.savefig(file)

    return figure if not in_notebook() else None
Example #24
File: io.py Project: whigg/cate
def write_csv(obj: DataFrameLike.TYPE,
              file: FileLike.TYPE,
              columns: VarNamesLike.TYPE = None,
              na_rep: str = '',
              delimiter: str = ',',
              quotechar: str = None,
              more_args: DictLike.TYPE = None,
              monitor: Monitor = Monitor.NONE):
    """
    Write comma-separated values (CSV) to plain text file from a DataFrame or Dataset.

    :param obj: The object to write as CSV; must be a ``DataFrame`` or a ``Dataset``.
    :param file: The CSV file path.
    :param columns: The names of variables that should be converted to columns. If given,
           coordinate variables are included automatically.
    :param delimiter: Delimiter to use.
    :param na_rep: A string representation of a missing value (no-data value).
    :param quotechar: The character used to denote the start and end of a quoted item.
           Quoted items can include the delimiter and it will be ignored.
    :param more_args: Other optional keyword arguments.
           Please refer to Pandas documentation of ``pandas.to_csv()`` function.
    :param monitor: optional progress monitor
    """
    if obj is None:
        raise ValidationError('obj must not be None')

    columns = VarNamesLike.convert(columns)

    if isinstance(obj, pd.DataFrame):
        # The following code is needed, because Pandas treats any kw given in kwargs as being set, even if just None.
        kwargs = DictLike.convert(more_args)
        if kwargs is None:
            kwargs = {}
        if columns:
            kwargs.update(columns=columns)
        if delimiter:
            kwargs.update(sep=delimiter)
        if na_rep:
            kwargs.update(na_rep=na_rep)
        if quotechar:
            kwargs.update(quotechar=quotechar)
        with monitor.starting('Writing to CSV', 1):
            obj.to_csv(file, index_label='index', **kwargs)
            monitor.progress(1)
    elif isinstance(obj, xr.Dataset):
        var_names = [var_name for var_name in obj.data_vars if columns is None or var_name in columns]
        dim_names = None
        data_vars = []
        for var_name in var_names:
            data_var = obj.data_vars[var_name]
            if dim_names is None:
                dim_names = data_var.dims
            elif dim_names != data_var.dims:
                raise ValidationError('Not all variables have the same dimensions. '
                                      'Please select variables so that their dimensions are equal.')
            data_vars.append(data_var)
        if dim_names is None:
            raise ValidationError('None of the selected variables has a dimension.')

        coord_vars = []
        for dim_name in dim_names:
            if dim_name in obj.coords:
                coord_var = obj.coords[dim_name]
            else:
                coord_var = None
                for data_var in obj.coords.values():
                    if len(data_var.dims) == 1 and data_var.dims[0] == dim_name:
                        coord_var = data_var
                        break
                if coord_var is None:
                    raise ValueError(f'No coordinate variable found for dimension "{dim_name}"')
            coord_vars.append(coord_var)
        coord_indexes = [range(len(coord_var)) for coord_var in coord_vars]
        num_coords = len(coord_vars)

        num_rows = 1
        for coord_var in coord_vars:
            num_rows *= len(coord_var)

        stream = open(file, 'w') if isinstance(file, str) else file
        try:
            # Write header row
            stream.write('index')
            for i in range(num_coords):
                stream.write(delimiter)
                stream.write(coord_vars[i].name)
            for data_var in data_vars:
                stream.write(delimiter)
                stream.write(data_var.name)
            stream.write('\n')

            with monitor.starting('Writing CSV', num_rows):
                row = 0
                for index in itertools.product(*coord_indexes):
                    # Write data row
                    stream.write(str(row))
                    for i in range(num_coords):
                        coord_value = coord_vars[i].values[index[i]]
                        stream.write(delimiter)
                        stream.write(str(coord_value))
                    for data_var in data_vars:
                        var_value = data_var.values[index]
                        stream.write(delimiter)
                        stream.write(str(var_value))
                    stream.write('\n')
                    monitor.progress(1)
                    row += 1
        finally:
            if isinstance(file, str):
                stream.close()

    else:
        raise ValidationError('obj must be a pandas.DataFrame or a xarray.Dataset')
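
For orientation, the Dataset branch above hand-rolls a coordinate-product table that the stock xarray/pandas route also produces; a minimal sketch of that equivalent, without the progress monitor and column selection of write_csv:

import numpy as np
import xarray as xr

ds = xr.Dataset({'tas': (('time', 'lat'), np.arange(6.0).reshape(3, 2))},
                coords={'time': [0, 1, 2], 'lat': [10.0, 20.0]})

# One row per (time, lat) combination, coordinates as index columns.
ds.to_dataframe().to_csv('example.csv')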
Example #25
def animate_map(ds: xr.Dataset,
                var: VarName.TYPE = None,
                animate_dim: str = 'time',
                interval: int = 200,
                true_range: bool = False,
                indexers: DictLike.TYPE = None,
                region: PolygonLike.TYPE = None,
                projection: str = 'PlateCarree',
                central_lon: float = 0.0,
                title: str = None,
                contour_plot: bool = False,
                cmap_params: DictLike.TYPE = None,
                plot_properties: DictLike.TYPE = None,
                file: str = None,
                monitor: Monitor = Monitor.NONE) -> HTML:
    """
    Create a geographic map animation for the variable given by dataset *ds* and variable name *var*.

    Creates an animation of the given variable from the given dataset on a map with coastal lines.
    In case no variable name is given, the first encountered variable in the
    dataset is animated.
    It is also possible to set extents of the animation. If no extents
    are given, a global animation is created.

    The following file formats for saving the animation are supported: html

    :param ds: the dataset containing the variable to animate
    :param var: the variable's name
    :param animate_dim: Dimension to animate, if none given defaults to time.
    :param interval: Delay between frames in milliseconds. Defaults to 200.
    :param true_range: If True, calculates colormap and colorbar configuration parameters from
           the whole dataset. Can potentially take a lot of time. Defaults to False, in which
           case the colormap is calculated from the first frame.
    :param indexers: Optional indexers into data array of *var*. The *indexers* is a dictionary
           or a comma-separated string of key-value pairs that maps the variable's dimension names
           to constant labels. e.g. "layer=4".
    :param region: Region to animate
    :param projection: name of a global projection, see http://scitools.org.uk/cartopy/docs/v0.15/crs/projections.html
    :param central_lon: central longitude of the projection in degrees
    :param title: an optional title
    :param contour_plot: If True, plot a filled contour plot of the data, otherwise plot a pixelated colormesh
    :param cmap_params: optional additional colormap configuration parameters,
           e.g. "vmax=300, cmap='magma'"
           For full reference refer to
           http://xarray.pydata.org/en/stable/generated/xarray.plot.contourf.html
    :param plot_properties: optional plot properties for Python matplotlib,
           e.g. "bins=512, range=(-1.5, +1.5)"
           For full reference refer to
           https://matplotlib.org/api/lines_api.html and
           https://matplotlib.org/api/_as_gen/matplotlib.axes.Axes.contourf.html
    :param file: path to a file in which to save the animation
    :param monitor: A progress monitor.
    :return: An animation in HTML format
    """
    if not isinstance(ds, xr.Dataset):
        raise NotImplementedError('Only gridded datasets are currently supported')

    var_name = None
    if not var:
        for key in ds.data_vars.keys():
            var_name = key
            break
    else:
        var_name = VarName.convert(var)

    try:
        var = ds[var_name]
    except KeyError:
        raise ValidationError('Provided variable name "{}" does not exist in the given dataset'.format(var_name))

    indexers = DictLike.convert(indexers) or {}
    properties = DictLike.convert(plot_properties) or {}
    cmap_params = DictLike.convert(cmap_params) or {}

    extents = None
    bounds = handle_plot_polygon(region)
    if bounds:
        lon_min, lat_min, lon_max, lat_max = bounds
        extents = [lon_min, lon_max, lat_min, lat_max]

    if len(ds.lat) < 2 or len(ds.lon) < 2:
        # Matplotlib can not plot datasets with less than these dimensions with
        # contourf and pcolormesh methods
        raise ValidationError('The minimum dataset spatial dimensions to create a map'
                              ' plot are (2,2)')

    # See http://scitools.org.uk/cartopy/docs/v0.15/crs/projections.html#
    if projection == 'PlateCarree':
        proj = ccrs.PlateCarree(central_longitude=central_lon)
    elif projection == 'LambertCylindrical':
        proj = ccrs.LambertCylindrical(central_longitude=central_lon)
    elif projection == 'Mercator':
        proj = ccrs.Mercator(central_longitude=central_lon)
    elif projection == 'Miller':
        proj = ccrs.Miller(central_longitude=central_lon)
    elif projection == 'Mollweide':
        proj = ccrs.Mollweide(central_longitude=central_lon)
    elif projection == 'Orthographic':
        proj = ccrs.Orthographic(central_longitude=central_lon)
    elif projection == 'Robinson':
        proj = ccrs.Robinson(central_longitude=central_lon)
    elif projection == 'Sinusoidal':
        proj = ccrs.Sinusoidal(central_longitude=central_lon)
    elif projection == 'NorthPolarStereo':
        proj = ccrs.NorthPolarStereo(central_longitude=central_lon)
    elif projection == 'SouthPolarStereo':
        proj = ccrs.SouthPolarStereo(central_longitude=central_lon)
    else:
        raise ValidationError('illegal projection: "%s"' % projection)

    figure = plt.figure(figsize=(8, 4))
    ax = plt.axes(projection=proj)
    if extents:
        ax.set_extent(extents, ccrs.PlateCarree())
    else:
        ax.set_global()

    ax.coastlines()

    if not animate_dim:
        animate_dim = 'time'

    indexers[animate_dim] = var[animate_dim][0]

    var_data = get_var_data(var, indexers, remaining_dims=('lon', 'lat'))

    with monitor.starting("animate", len(var[animate_dim]) + 3):
        if true_range:
            data_min, data_max = _get_min_max(var, monitor=monitor)
        else:
            data_min, data_max = _get_min_max(var_data, monitor=monitor)

        cmap_params = determine_cmap_params(data_min, data_max, **cmap_params)
        plot_kwargs = {**properties, **cmap_params}

        # Plot the first frame to set-up the axes with the colorbar properly
        # transform keyword is for the coordinate our data is in, which in case of a
        # 'normal' lat/lon dataset is PlateCarree.
        if contour_plot:
            var_data.plot.contourf(ax=ax, transform=ccrs.PlateCarree(), subplot_kws={'projection': proj},
                                   add_colorbar=True, **plot_kwargs)
        else:
            var_data.plot.pcolormesh(ax=ax, transform=ccrs.PlateCarree(), subplot_kws={'projection': proj},
                                     add_colorbar=True, **plot_kwargs)
        if title:
            ax.set_title(title)
        figure.tight_layout()
        monitor.progress(1)

        def run(value):
            ax.clear()
            if extents:
                ax.set_extent(extents, ccrs.PlateCarree())
            else:
                ax.set_global()
            ax.coastlines()
            indexers[animate_dim] = value
            var_data = get_var_data(var, indexers, remaining_dims=('lon', 'lat'))
            var_data.plot.contourf(ax=ax, transform=ccrs.PlateCarree(), subplot_kws={'projection': proj},
                                   add_colorbar=False, **plot_kwargs)
            if title:
                ax.set_title(title)
            monitor.progress(1)
            return ax
        anim = animation.FuncAnimation(figure, run, [i for i in var[animate_dim]],
                                       interval=interval, blit=False, repeat=False)
        anim_html = anim.to_jshtml()

        # Prevent the animation from running after it's finished
        del anim

        # Delete the rogue temp-file
        try:
            os.remove('None0000000.png')
        except FileNotFoundError:
            pass

        if file:
            with open(file, 'w') as outfile:
                outfile.write(anim_html)
                monitor.progress(1)

    return HTML(anim_html)
Example #26
def temporal_aggregation(ds: DatasetLike.TYPE,
                         method: str = 'mean',
                         output_resolution: str = 'month',
                         custom_resolution: str = None,
                         monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform aggregation of dataset according to the given
    method and output resolution.

    Note that the operation does not perform weighting. Depending on the
    combination of input and output resolutions, as well as aggregation
    method, the resulting dataset might yield unexpected results.

    Resolution 'month' will result in a monthly dataset with each month
    denoted by its first date. Resolution 'season' will result in a dataset
    aggregated to DJF, MAM, JJA, SON seasons, each denoted by the first
    date of the season.

    The operation also works with custom resolution strings, see:
    http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
    If ``custom_resolution`` is provided, it will override ``output_resolution``.

    Some examples:
      'QS-JUN' produces an output dataset on a quarterly resolution where the
      year ends in 1st of June and each quarter is denoted by its first date
      '8MS' produces an output dataset on an eight-month resolution where each
      period is denoted by the first date. Note that such periods will not be
      consistent over years.
      '8D' produces a dataset on an eight day resolution

    :param ds: Dataset to aggregate
    :param method: Aggregation method
    :param output_resolution: Desired temporal resolution of the output dataset
    :param custom_resolution: Custom temporal resolution, overrides output_resolution
    :param monitor: a progress monitor.
    :return: Aggregated dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValidationError(
            'Temporal aggregation operation expects a dataset with the'
            ' time coordinate of type datetime64[ns], but received'
            ' {}. Running the normalize operation on this'
            ' dataset may help'.format(ds.time.dtype))

    # Try to figure out the input frequency
    try:
        in_freq = ds.attrs['time_coverage_resolution']
    except KeyError:
        raise ValidationError(
            'Could not determine temporal resolution of input dataset.'
            ' Running the adjust_temporal_attrs operation beforehand may'
            ' help.')

    if custom_resolution:
        freq = custom_resolution
    else:
        frequencies = {'month': 'MS', 'season': 'QS-DEC'}
        freq = frequencies[output_resolution]

    _validate_freq(in_freq, freq)

    with monitor.observing("resample dataset"):
        try:
            retset = getattr(resampler, method)(ds.resample(time=freq,
                                                            keep_attrs=True))
        except AttributeError:
            raise ValidationError(
                f'Provided aggregation method {method} is not valid.')

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + \
                ' time: {} within years'.format(method)
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: {} within years'.format(
                method)

    return adjust_temporal_attrs(retset)
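
A standalone sketch of the resampling step for the 'month' output resolution, which maps to the 'MS' offset alias (here with the 'mean' method applied directly, without the monitor and attribute handling above):

import numpy as np
import pandas as pd
import xarray as xr

times = pd.date_range('2000-01-01', periods=365, freq='D')
ds = xr.Dataset({'tas': ('time', np.random.rand(365))}, coords={'time': times})

monthly = ds.resample(time='MS').mean()
print(monthly.sizes)   # {'time': 12}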
Example #27
def _lta_general(ds: xr.Dataset, monitor: Monitor):
    """
    Try to carry out a long term average in a general case, notably
    in the case of having seasonal datasets

    :param ds: Dataset to aggregate
    :param monitor: Progress monitor
    :return: Aggregated dataset
    """
    time_min = pd.Timestamp(ds.time.values[0], tzinfo=timezone.utc)
    time_max = pd.Timestamp(ds.time.values[-1], tzinfo=timezone.utc)
    total_work = 100
    retset = ds

    # The dataset should feature time periods consistent over years
    # and denoted with the same dates each year
    if not _is_seasonal(ds.time):
        raise ValidationError(
            "A long term average dataset can not be created for"
            " a dataset with inconsistent seasons.")

    # Get 'representative year'
    c = 0
    for group in ds.time.groupby('time.year'):
        c = c + 1
        if c == 1:
            rep_year = group[1].time
            continue
        if c == 2 and len(group[1].time) > len(rep_year):
            rep_year = group[1].time
            break

    with monitor.starting('LTA', total_work=total_work):
        monitor.progress(work=0)
        step = total_work / len(rep_year.time)
        kwargs = {'monitor': monitor, 'step': step}
        retset = retset.groupby('time.month',
                                squeeze=False).apply(_groupby_day, **kwargs)

    # Make the return dataset CF compliant
    retset = retset.stack(time=('month', 'day'))

    # Turn month, day coordinates to time
    retset = retset.reset_index('time')
    retset = retset.drop(['month', 'day'])
    retset['time'] = rep_year.time

    climatology_bounds = xr.DataArray(data=np.tile([time_min, time_max],
                                                   (len(rep_year), 1)),
                                      dims=['time', 'nv'],
                                      name='climatology_bounds')
    retset['climatology_bounds'] = climatology_bounds
    retset.time.attrs = ds.time.attrs
    retset.time.attrs['climatology'] = 'climatology_bounds'

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + ' time: mean over years'
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: mean over years'

    return retset
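
The month/day-to-time flattening above (stack, reset_index, drop) can be hard to picture. The following toy sketch is illustrative only, uses made-up coordinates, and shows the same pattern on a small DataArray with the current drop_vars API.

import numpy as np
import xarray as xr

# Illustrative sketch of the month/day -> time flattening used above.
da = xr.DataArray(np.zeros((2, 3)),
                  dims=('month', 'day'),
                  coords={'month': [1, 2], 'day': [1, 2, 3]})
flat = da.stack(time=('month', 'day'))   # MultiIndex 'time' of length 6
flat = flat.reset_index('time')          # demote the MultiIndex to plain coords
flat = flat.drop_vars(['month', 'day'])  # keep only the bare 'time' dimension
print(flat.sizes)                        # Frozen({'time': 6})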
Example No. 28
def coregister(ds_master: xr.Dataset,
               ds_replica: xr.Dataset,
               method_us: str = 'linear',
               method_ds: str = 'mean',
               monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform coregistration of two datasets by resampling the replica dataset onto the
    grid of the master. If upsampling has to be performed, this is achieved using
    interpolation; if downsampling has to be performed, the pixels of the replica dataset
    are aggregated to form a coarser grid.

    The returned dataset will contain the lat/lon intersection of the provided
    master and replica datasets, resampled onto the master grid.

    This operation works on datasets whose spatial dimensions are defined on
    pixel-registered, equidistant lat/lon grids, i.e., data points
    define the middle of a pixel and pixels have the same size across the dataset.

    This operation will resample all variables in a dataset, as the lat/lon grid is
    defined per dataset. It works only if all variables in the dataset have lat
    and lon as dimensions.

    For an overview of downsampling/upsampling methods used in this operation, please
    see https://github.com/CAB-LAB/gridtools

    Whether upsampling or downsampling has to be performed is determined automatically
    based on the relationship of the grids of the provided datasets.

    :param ds_master: The dataset whose grid is used for resampling
    :param ds_replica: The dataset that will be resampled
    :param method_us: Interpolation method to use for upsampling.
    :param method_ds: Interpolation method to use for downsampling.
    :param monitor: a progress monitor.
    :return: The replica dataset resampled on the grid of the master
    """
    try:
        grids = (('replica', ds_replica['lat'].values, -90),
                 ('replica', ds_replica['lon'].values, -180),
                 ('master', ds_master['lat'].values, -90),
                 ('master', ds_master['lon'].values, -180))
    except KeyError:
        raise ValidationError(
            'Coregistration requires that both datasets are'
            ' spatial datasets with lon and lat dimensions. The'
            ' dimensionality of the provided master dataset is: {},'
            ' the dimensionality of the provided replica dataset is:'
            ' {}. Running the normalize operation might help in'
            ' case spatial dimensions have different'
            ' names'.format(ds_master.dims, ds_replica.dims))

    # Don't do anything if datasets already have the same spatial definition
    if _grids_equal(ds_master, ds_replica):
        return ds_replica

    # Check if all arrays of the replica dataset have the required dimensionality
    for key in ds_replica.data_vars:
        if not _is_valid_array(ds_replica[key]):
            raise ValidationError(
                '{} data array of replica dataset is not valid for'
                ' coregistration. The data array is expected to'
                ' have lat and lon dimensions. The data array has'
                ' the following dimensions: {}. Consider running'
                ' select_var operation to exclude this'
                ' data array'.format(key, ds_replica[key].dims))

    # Check if the grids of the provided datasets are equidistant and pixel
    # registered
    for array in grids:
        if not _within_bounds(array[1], array[2]):
            raise ValidationError(
                'The {} dataset grid does not fall into required'
                ' boundaries. Required boundaries are ({}, {}),'
                ' dataset boundaries are ({}, {}). Running the'
                ' normalize operation'
                ' may help.'.format(array[0], array[2], abs(array[2]),
                                    array[1][0], array[1][-1]))
        if not _is_equidistant(array[1]):
            raise ValidationError('The {} dataset grid is not'
                                  ' equidistant, can not perform'
                                  ' coregistration'.format(array[0]))

        if not _is_pixel_registered(array[1], array[2]):
            raise ValidationError('The {} dataset grid is not'
                                  ' pixel-registered, can not perform'
                                  ' coregistration'.format(array[0]))

    # Co-register
    methods_us = {'nearest': 10, 'linear': 11}
    methods_ds = {
        'first': 50,
        'last': 51,
        'mean': 54,
        'mode': 56,
        'var': 57,
        'std': 58
    }

    return _resample_dataset(ds_master, ds_replica, methods_us[method_us],
                             methods_ds[method_ds], monitor)
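
A minimal sketch, assuming a hypothetical 1-degree global grid, of what the 'equidistant' and 'pixel-registered' checks above expect: cell centres spaced evenly and offset half a pixel from the grid boundary.

import numpy as np

# Illustrative 1-degree global grid (an assumption, not taken from the operation).
res = 1.0
lat = np.arange(-90 + res / 2, 90, res)      # centres -89.5 ... 89.5
lon = np.arange(-180 + res / 2, 180, res)    # centres -179.5 ... 179.5

assert np.allclose(np.diff(lat), res)        # equidistant spacing
assert np.isclose(lat[0] - (-90), res / 2)   # pixel-registered against -90
assert np.isclose(lon[0] - (-180), res / 2)  # pixel-registered against -180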
Example No. 29
def pearson_correlation_scalar(
        ds_x: DatasetLike.TYPE,
        ds_y: DatasetLike.TYPE,
        var_x: VarName.TYPE,
        var_y: VarName.TYPE,
        monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Do product moment `Pearson's correlation <http://www.statsoft.com/Textbook/Statistics-Glossary/P/button/p#Pearson%20Correlation>`_ analysis.

    Performs a simple correlation analysis on two data variables and returns
    a correlation coefficient and the corresponding p_value.

    Positive correlation implies that as x grows, so does y. Negative
    correlation implies that as x increases, y decreases.

    For more information how to interpret the results, see
    `here <http://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/how-to/correlation/interpret-the-results/>`_,
    and `here <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html>`_.

    :param ds_x: The 'x' dataset
    :param ds_y: The 'y' dataset
    :param var_x: Dataset variable to use for correlation analysis in the 'x' dataset
    :param var_y: Dataset variable to use for correlation analysis in the 'y' dataset
    :param monitor: a progress monitor.
    :return: Data frame {'corr_coef': correlation coefficient, 'p_value': probability value}
    """
    ds_x = DatasetLike.convert(ds_x)
    ds_y = DatasetLike.convert(ds_y)
    var_x = VarName.convert(var_x)
    var_y = VarName.convert(var_y)

    array_y = ds_y[var_y]
    array_x = ds_x[var_x]

    if (array_x.dims != array_y.dims):
        raise ValidationError(
            'Both datasets should feature the same'
            ' dimensionality. Currently provided ds_x[var_x] '
            f'has {array_x.dims}, provided ds_y[var_y]'
            f' has {array_y.dims}')

    for dim in array_x.dims:
        if len(array_x[dim]) != len(array_y[dim]):
            raise ValidationError(
                'All dimensions of both provided data variables'
                f' must be the same length. Currently {dim} of ds_x[var_x]'
                f' has {len(array_x[dim])} values, while'
                f' {dim} of ds_y[var_y] has {len(array_y[dim])} values.'
                ' You may want to try to coregister the datasets beforehand.')

    n_vals = 1
    for dim in array_x.dims:
        n_vals = n_vals * len(array_x[dim])

    if n_vals < 3:
        raise ValidationError(
            'There should be no less than 3 values in both data variables'
            f' to perform the correlation. Currently there are {n_vals} values'
        )

    with monitor.observing("Calculate Pearson correlation"):
        cc, pv = pearsonr(array_x.stack(z=array_x.dims),
                          array_y.stack(z=array_y.dims))

    return pd.DataFrame({'corr_coef': [cc], 'p_value': [pv]})
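
A hedged usage sketch of the stacking step above: all dimensions are flattened into a single 'z' axis before the values reach scipy.stats.pearsonr. The toy arrays below are illustrative only.

import numpy as np
import xarray as xr
from scipy.stats import pearsonr

# Toy, perfectly correlated 2D variables (illustrative only).
x = xr.DataArray(np.random.rand(4, 5), dims=('lat', 'lon'))
y = x * 2.0 + 0.1

# Flatten all dimensions into one 'z' axis, as the operation above does.
cc, pv = pearsonr(x.stack(z=x.dims), y.stack(z=y.dims))
print(round(float(cc), 3))  # ~1.0 for linearly related data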
Example No. 30
def pearson_correlation(ds_x: DatasetLike.TYPE,
                        ds_y: DatasetLike.TYPE,
                        var_x: VarName.TYPE,
                        var_y: VarName.TYPE,
                        monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Do product moment `Pearson's correlation <http://www.statsoft.com/Textbook/Statistics-Glossary/P/button/p#Pearson%20Correlation>`_ analysis.

    Perform Pearson correlation on two datasets and produce a lon/lat map of
    correlation coefficients and the corresponding p_values.

    In case two 3D lon/lat/time datasets are provided, a pixel-by-pixel
    correlation of the underlying timeseries will be performed, producing a
    lat/lon map of correlation coefficients and p_values. It is also possible
    to provide a combination of a 3D lon/lat/time dataset and a 1D timeseries.

    The lat/lon definition of both datasets has to be the same. The length of
    the time dimension has to be equal, but the time coordinates need not have
    the same definition, e.g., it is possible to correlate different times of
    the same area.

    There are 'x' and 'y' datasets. Positive correlations imply that as x
    grows, so does y. Negative correlations imply that as x increases, y
    decreases.

    For more information how to interpret the results, see
    `here <http://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/how-to/correlation/interpret-the-results/>`_,
    and `here <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html>`_.

    :param ds_x: The 'x' dataset
    :param ds_y: The 'y' dataset
    :param var_x: Dataset variable to use for correlation analysis in the 'x' dataset
    :param var_y: Dataset variable to use for correlation analysis in the 'y' dataset
    :param monitor: a progress monitor.
    :return: a dataset containing a map of correlation coefficients and p_values
    """
    ds_x = DatasetLike.convert(ds_x)
    ds_y = DatasetLike.convert(ds_y)
    var_x = VarName.convert(var_x)
    var_y = VarName.convert(var_y)

    array_y = ds_y[var_y]
    array_x = ds_x[var_x]

    # Further validate inputs
    if array_x.dims == array_y.dims:
        if len(array_x.dims) != 3 or len(array_y.dims) != 3:
            raise ValidationError(
                'A correlation coefficient map can only be produced'
                ' if both provided datasets are 3D datasets with'
                ' lon/lat/time dimensionality, or if a combination'
                ' of a 3D lon/lat/time dataset and a 1D timeseries'
                ' is provided.')

        if array_x.values.shape != array_y.values.shape:
            raise ValidationError(
                f'The provided variables {var_x} and {var_y} do not have the'
                ' same shape; Pearson correlation cannot be'
                ' performed. Please review the operation'
                ' documentation.')

        if (not ds_x['lat'].equals(ds_y['lat'])
                or not ds_x['lon'].equals(ds_y['lon'])):
            raise ValidationError(
                'When performing a pixel by pixel correlation the'
                ' datasets have to have the same lat/lon'
                ' definition. Consider running coregistration'
                ' first')

    elif (((len(array_x.dims) == 3) and (len(array_y.dims) != 1))
          or ((len(array_x.dims) == 1) and (len(array_y.dims) != 3))
          or ((len(array_x.dims) != 3) and (len(array_y.dims) == 1))
          or ((len(array_x.dims) != 1) and (len(array_y.dims) == 3))):
        raise ValidationError(
            'A correlation coefficient map can only be produced'
            ' if both provided datasets are 3D datasets with'
            ' lon/lat/time dimensionality, or if a combination'
            ' of a 3D lon/lat/time dataset and a 1D timeseries'
            ' is provided.')

    if len(array_x['time']) != len(array_y['time']):
        raise ValidationError(
            'The length of the time dimension differs between'
            ' the given datasets. Cannot perform the calculation;'
            ' please review the operation documentation.')

    if len(array_x['time']) < 3:
        raise ValidationError(
            'The length of the time dimension should not be less'
            ' than three to run the calculation.')

    # Do pixel by pixel correlation
    retset = _pearsonr(array_x, array_y, monitor)
    retset.attrs['Cate_Description'] = f'Correlation between {var_y} and {var_x}'

    return adjust_spatial_attrs(retset)
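
The per-pixel correlation itself is delegated to the _pearsonr helper, which is defined elsewhere in this module. As a rough, hedged sketch of what such a map computation looks like with plain xarray (not the module's actual implementation), two toy time/lat/lon cubes can be correlated along 'time':

import numpy as np
import xarray as xr

# Illustrative cubes only; real inputs come from ds_x[var_x] and ds_y[var_y].
a = xr.DataArray(np.random.rand(10, 3, 4), dims=('time', 'lat', 'lon'))
b = xr.DataArray(np.random.rand(10, 3, 4), dims=('time', 'lat', 'lon'))

# Correlate the timeseries under each lat/lon pixel, yielding a 2D map.
corr_map = xr.corr(a, b, dim='time')
print(corr_map.sizes)  # Frozen({'lat': 3, 'lon': 4})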