def _validate_freq(in_res: str, out_res: str) -> None: """ Validate the aggregation step See also: `ISO 8601 Durations <https://en.wikipedia.org/wiki/ISO_8601#Durations>`_ """ # Validate output frequency as a valid offset string try: dates = pd.date_range('2000-01-01', periods=5, freq=out_res) except ValueError: raise ValidationError('Invalid custom resolution: {}.' ' Please check operation documentation.'.format(out_res)) # Assuming simple ISO_8601 periods: PXXD/M try: count = int(in_res[1:-1]) except ValueError: raise ValidationError('Could not interpret time coverage resolution of' ' the given dataset: {}'.format(in_res)) if in_res == 'P1M' and out_res == 'MS': raise ValidationError('Input dataset is already at the requested output resolution.' ' Execution stopped.') in_delta = pd.Timedelta(count, unit=in_res[-1]) out_delta = dates[1] - dates[0] if out_delta < in_delta: raise ValidationError('Requested output resolution is smaller than dataset resolution.' ' This operation only performs aggregation to larger resolutions.') elif out_delta == in_delta: raise ValidationError('Input dataset is already at the requested output resolution.' 'Execution stopped.') return
def get_var_data(var, indexers: dict, remaining_dims=None, time=None):
    """
    Select an arbitrary piece of an xarray dataset by using indexers.

    :param var: The variable to select from; must provide ``dims``, ``sel()``
           and ``isel()``
    :param indexers: Mapping from dimension names to labels. Labels are
           selected with ``method='nearest'``. May be empty or None.
    :param remaining_dims: Optional names of dimensions to keep. Every other
           dimension still present after the label selection is reduced to
           its first element.
    :param time: Optional time label. If given, it is used as the indexer for
           the ``time`` dimension, overriding any ``time`` entry in *indexers*.
    :return: The selected data
    :raise ValidationError: if a dimension named in *indexers* or
           *remaining_dims* does not exist in *var*, or if a dimension is
           named in both
    """
    if time is not None:
        # Work on a copy so that the caller's mapping is never modified
        indexers = dict(indexers) if indexers else {}
        indexers['time'] = time

    if indexers:
        if remaining_dims:
            for dim in remaining_dims:
                if dim not in var.dims:
                    raise ValidationError(
                        f'The specified dataset does not have a dimension called \'{dim}\'.')
                if dim in indexers:
                    raise ValidationError(
                        f'Dimension \'{dim}\' is also specified as indexers. Please ensure that a '
                        f'dimension is used exclusively either as indexers or as the selected '
                        f'dimension.')

        for dim in indexers:
            if dim not in var.dims:
                raise ValidationError(
                    f'The specified dataset does not have a dimension called \'{dim}\'.')
        var = var.sel(method='nearest', **indexers)

    if remaining_dims:
        # Collapse every dimension that was not requested to its first element
        isel_indexers = {dim_name: 0 for dim_name in var.dims
                         if dim_name not in remaining_dims}
        var = var.isel(**isel_indexers)

    return var
def _find_intersection( first: np.ndarray, second: np.ndarray, global_bounds: Tuple[float, float]) -> Tuple[float, float]: """ Find 1D intersection of given arrays such that the resulting intersection bounds fall on 'pixel' boundaries for both given arrays. :param first: First 1D array :param second: Second 1D array :param global_bounds: (min, max) maximum interval for a valid intersection :return: (min, max) intersection bounds """ first_px_size = abs(first[1] - first[0]) second_px_size = abs(second[1] - second[0]) minimum = max(first[0] - first_px_size / 2, second[0] - second_px_size / 2) maximum = min(first[-1] + first_px_size / 2, second[-1] + second_px_size / 2) delta = maximum - minimum if delta < max(first_px_size, second_px_size): raise ValidationError('Could not find a valid intersection to perform' ' coregistration on') # Make sure min/max fall on pixel boundaries for both grids # Because there exists a number N denoting how many smaller pixels fall # into one larger pixel (for pixel registered datasets with the same # origin) => the boundary has to be adjusted by steps equal # to smaller pixels. finer = min(first_px_size, second_px_size) safety = 100 i = 0 while (0 != (minimum - global_bounds[0]) % first_px_size and 0 != (minimum - global_bounds[0]) % second_px_size): if i == safety: raise ValidationError( 'Could not find a valid intersection to perform' ' coregistration on') minimum = minimum + finer i = i + 1 i = 0 while (0 != (global_bounds[1] - maximum) % first_px_size and 0 != (global_bounds[1] - maximum) % second_px_size): if i == safety: raise ValidationError( 'Could not find a valid intersection to perform' ' coregistration on') maximum = maximum - finer i = i + 1 # This is possible in some cases when mis-aligned grid arrays are presented if maximum <= minimum: raise ValidationError('Could not find a valid intersection to perform' ' coregistration on') return (minimum, maximum)
def anomaly_external(ds: xr.Dataset,
                     file: str,
                     transform: str = None,
                     monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Calculate anomaly with external reference data, for example, a climatology.
    The given reference dataset is expected to consist of 12 time slices, one
    for each month.

    The returned dataset will contain the variable names found in both - the
    reference and the given dataset. Names found in the given dataset, but not
    in the reference, will be dropped from the resulting dataset. The calculated
    anomaly will be against the corresponding month of the reference data.
    E.g. January against January, etc.

    In case spatial extents differ between the reference and the given dataset,
    the anomaly will be calculated on the intersection.

    :param ds: The dataset to calculate anomalies from
    :param file: Path to reference data file
    :param transform: Apply the given transformation before calculating the
           anomaly. For supported operations see help on 'ds_arithmetics'
           operation.
    :param monitor: a progress monitor.
    :return: The anomaly dataset
    :raise ValidationError: if the dataset has no time coordinate, or if its
           time coordinate is not of dtype datetime64[ns]
    """
    # Check if the time coordinate is of dtype datetime
    try:
        time_dtype = ds.time.dtype
    except AttributeError as error:
        raise ValidationError('The dataset provided for anomaly calculation'
                              ' is required to have a time coordinate.') from error
    if time_dtype != 'datetime64[ns]':
        raise ValidationError('The dataset provided for anomaly calculation'
                              ' is required to have a time coordinate of'
                              ' dtype datetime64[ns]. Running the normalize'
                              ' operation on this dataset might help.')

    clim = xr.open_dataset(file)

    # Only copy the input when no transformation produces a new dataset anyway
    ret = ds_arithmetics(ds, transform) if transform else ds.copy()

    # Group by months, subtract the appropriate slice from the reference.
    # Note that this requires that 'time' coordinate labels are of type
    # datetime64[ns]
    total_work = 100
    step = 100 / 12  # one share of the total work per calendar month

    with monitor.starting('Anomaly', total_work=total_work):
        monitor.progress(work=0)
        kwargs = {'ref': clim, 'monitor': monitor, 'step': step}
        ret = ret.groupby(ds['time.month']).apply(_group_anomaly, **kwargs)

    # Running groupby results in a redundant 'month' variable being added to
    # the dataset
    ret = ret.drop('month')

    return ret
def pearson_correlation_scalar(ds_x: DatasetLike.TYPE,
                               ds_y: DatasetLike.TYPE,
                               var_x: VarName.TYPE,
                               var_y: VarName.TYPE,
                               monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Do product moment `Pearson's correlation <http://www.statsoft.com/Textbook/Statistics-Glossary/P/button/p#Pearson%20Correlation>`_ analysis.

    Performs a simple correlation analysis on two timeseries and returns
    a correlation coefficient and the corresponding p_value.

    Positive correlation implies that as x grows, so does y. Negative
    correlation implies that as x increases, y decreases.

    For more information how to interpret the results, see
    `here <http://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/how-to/correlation/interpret-the-results/>`_,
    and `here <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html>`_.

    :param ds_x: The 'x' dataset
    :param ds_y: The 'y' dataset
    :param var_x: Dataset variable to use for correlation analysis in the 'variable' dataset
    :param var_y: Dataset variable to use for correlation analysis in the 'dependent' dataset
    :param monitor: a progress monitor.
    :return: {'corr_coef': correlation coefficient, 'p_value': probability value}
    :raise ValidationError: if either variable is not one-dimensional, if the
           time dimensions differ in length, or if there are fewer than three
           time steps
    """
    ds_x = DatasetLike.convert(ds_x)
    ds_y = DatasetLike.convert(ds_y)
    var_x = VarName.convert(var_x)
    var_y = VarName.convert(var_y)

    array_y = ds_y[var_y]
    array_x = ds_x[var_x]

    # Both inputs must be plain 1-d timeseries; anything else is rejected
    # here rather than handed on to pearsonr.
    if len(array_x.dims) != 1 or len(array_y.dims) != 1:
        raise ValidationError('To calculate simple correlation, both provided'
                              ' datasets should be simple 1d timeseries. To'
                              ' create a map of correlation coefficients, use'
                              ' pearson_correlation operation instead.')

    if len(array_x['time']) != len(array_y['time']):
        raise ValidationError('The length of the time dimension differs between'
                              ' the given datasets. Can not perform the calculation'
                              ', please review operation documentation.')

    if len(array_x['time']) < 3:
        raise ValidationError('The length of the time dimension should not be less'
                              ' than three to run the calculation.')

    with monitor.observing("Calculate Pearson correlation"):
        cc, pv = pearsonr(array_x.values, array_y.values)

    return pd.DataFrame({'corr_coef': [cc], 'p_value': [pv]})
def long_term_average(ds: DatasetLike.TYPE,
                      var: VarNamesLike.TYPE = None,
                      monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Create a 'mean over years' dataset by averaging the values of the given
    input dataset over all years. The output is a climatological dataset with
    the same resolution as the input dataset. E.g. a daily input dataset will
    create a daily climatology consisting of 365 days, a monthly input dataset
    will create a monthly climatology, etc.

    Seasonal input datasets must have matching seasons over all years denoted
    by the same date each year. E.g., first date of each quarter. The output
    dataset will then be a seasonal climatology where each season is denoted
    with the same date as in the input dataset.

    For further information on climatological datasets, see
    http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#climatological-statistics

    :param ds: A dataset to average
    :param var: If given, only these variables will be preserved in the resulting dataset
    :param monitor: A progress monitor
    :return: A climatological long term average dataset
    :raise ValidationError: if the time coordinate is not of dtype
           datetime64[ns], or if the dataset has no
           'time_coverage_resolution' attribute
    """
    ds = DatasetLike.convert(ds)

    # Check if time dtype is what we want
    time_dtype_ok = ds.time.dtype == 'datetime64[ns]'
    if not time_dtype_ok:
        message = ('Long term average operation expects a dataset with the'
                   ' time coordinate of type datetime64[ns], but received'
                   ' {}. Running the normalize operation on this'
                   ' dataset may help')
        raise ValidationError(message.format(ds.time.dtype))

    try:
        resolution = ds.attrs['time_coverage_resolution']
    except KeyError:
        raise ValidationError('Could not determine temporal resolution. Running'
                              ' the adjust_temporal_attrs operation beforehand may'
                              ' help.')

    var = VarNamesLike.convert(var)
    if var:
        # Shallow: restrict the dataset to the requested variables
        ds = select_var(ds, var)

    # Daily and monthly data have dedicated implementations; every other
    # resolution goes through the general one.
    if resolution == 'P1D':
        return _lta_daily(ds)
    if resolution == 'P1M':
        return _lta_monthly(ds, monitor)
    return _lta_general(ds, monitor)
def _exec_script(script: str,
                 element_types: Tuple[type, ...],
                 operation_context: Mapping[str, Any] = None,
                 context_object: Mapping[str, Any] = None,
                 monitor: Monitor = Monitor.NONE) -> Dict[str, Any]:
    """
    Helper for compute_dataset() and compute_data_frame().

    Runs *script* through ``safe_exec`` with a namespace of common libraries,
    plus the operation context's ``value_cache`` and the entries of
    *context_object* if given, and collects what the script produced.

    :param script: The Python script to execute; must not be empty
    :param element_types: Only names bound to instances of these types are returned
    :param operation_context: Optional mapping; its 'value_cache' entry, if
           present, is added to the script's namespace
    :param context_object: Optional mapping added to the script's namespace
    :param monitor: a progress monitor.
    :return: Mapping of names to values of the requested types that are new
             in, or were rebound by, the script. Names starting with an
             underscore are excluded.
    :raise ValidationError: if the script is empty or raises while executing
    """
    if not script:
        raise ValidationError('Python script must not be empty')

    # Include common libraries
    initial_namespace = {
        'gpd': gpd,
        'geopandas': geopandas,
        'math': math,
        'np': np,
        'numpy': numpy,
        'pd': pd,
        'pandas': pandas,
        'sp': sp,
        'scipy': scipy,
        'xr': xr,
        'xarray': xarray,
    }

    has_value_cache = operation_context is not None and 'value_cache' in operation_context
    if has_value_cache:
        initial_namespace.update(operation_context['value_cache'])

    if context_object is not None:
        initial_namespace.update(context_object)

    # The script works on a copy so that additions and rebindings can be
    # detected against the initial namespace afterwards.
    script_namespace = dict(initial_namespace)

    with monitor.observing("Executing script"):
        try:
            safe_exec(script, local_namespace=script_namespace)
        except BaseException as e:
            raise ValidationError(f'Error in Python script: {e}') from e

    return {
        name: element
        for name, element in script_namespace.items()
        if not name.startswith('_')
        and isinstance(element, element_types)
        and (name not in initial_namespace or element is not initial_namespace[name])
    }
def write_geo_data_frame(gdf: gpd.GeoDataFrame,
                         file: str,
                         crs: str = None,
                         more_args: DictLike.TYPE = None):
    """
    Write a geo data frame to files with formats such as ESRI Shapefile or GeoJSON.

    Unless a "driver" is given in *more_args*, the output format is derived
    from the extension of *file*. A path without extension is written as
    ESRI Shapefile and gets ".shp" appended.

    :param gdf: A geo data frame.
    :param file: Is either the absolute or relative path to the file to be opened.
    :param crs: Not used by this function.
    :param more_args: Other optional keyword arguments.
           Please refer to Python documentation of ``fiona.open()`` function.
    :raise ValidationError: if no format can be detected from the file extension
    """
    shapefile_driver = "ESRI Shapefile"
    drivers_by_extension = {
        ".shp": shapefile_driver,
        ".json": "GeoJSON",
        ".geojson": "GeoJSON",
        ".gpx": "GPX",
        ".gpkg": "GPKG",
    }

    kwargs = DictLike.convert(more_args) or {}

    if "driver" in kwargs:
        # An explicit driver always wins over the file extension
        driver = kwargs.pop("driver")
    else:
        ext = os.path.splitext(file)[1]
        ext_low = ext.lower()
        if ext_low == "":
            driver = shapefile_driver
            file += ".shp"
        else:
            driver = drivers_by_extension.get(ext_low)
            if driver is None:
                raise ValidationError(f'Cannot detect supported format from file extension "{ext}"')

    gdf.to_file(file, driver=driver, **kwargs)
def no_op(num_steps: int = 20,
          step_duration: float = 0.5,
          fail_before: bool = False,
          fail_after: bool = False,
          monitor: Monitor = Monitor.NONE) -> bool:
    """
    An operation that basically does nothing but spending configurable time.
    It may be useful for testing purposes.

    :param num_steps: Number of steps to iterate.
    :param step_duration: How much time to spend in each step in seconds.
    :param fail_before: If the operation should fail before spending time doing nothing
           (raise a ValidationError).
    :param fail_after: If the operation should fail after spending time doing nothing
           (raise a ValueError).
    :param monitor: A progress monitor.
    :return: Always True
    """
    import time

    monitor.start('Computing nothing', num_steps)

    if fail_before:
        raise ValidationError('Intentionally failed before doing anything.')

    # Report one unit of work after each nap; step numbers are 1-based
    for step_number in range(1, num_steps + 1):
        time.sleep(step_duration)
        monitor.progress(1.0, 'Step %s of %s doing nothing' % (step_number, num_steps))

    if fail_after:
        raise ValueError('Intentionally failed after doing nothing.')

    monitor.done()
    return True
def get_vars_data(ds, indexers: dict, remaining_dims=None):
    """
    Select an arbitrary piece of an xarray dataset by using indexers.

    Each variable that is not named after one of its own dimensions is
    selected with those entries of *indexers* that match its dimensions,
    using ``method='nearest'``. The given dataset is not modified; a copy
    is returned.

    :param ds: The dataset to select from
    :param indexers: Mapping from dimension names to labels. If empty or
           None, an unchanged copy of *ds* is returned.
    :param remaining_dims: Optional names of dimensions to keep. Every other
           dimension still present after the label selection is reduced to
           its first element.
    :return: A copy of the dataset with the selection applied
    :raise ValidationError: if a dimension in *remaining_dims* is missing from
           a variable or is also named in *indexers*, or if an indexer does
           not match a dimension of any processed variable
    """
    # to avoid the original dataset being affected (especially useful in unit tests)
    ds = ds.copy()

    if not indexers:
        return ds

    # Indexers that have not matched a dimension of any variable so far
    unmatched_indexers = list(indexers)

    for var_name in ds:
        var_dims = ds[var_name].dims
        if ds[var_name].name in var_dims:
            # Variable is named after one of its dimensions; leave it alone
            continue

        if remaining_dims:
            for dim in remaining_dims:
                if dim not in var_dims:
                    raise ValidationError(
                        f'The specified dataset does not have a dimension called \'{dim}\'.')
                if dim in indexers:
                    raise ValidationError(
                        f'Dimension \'{dim}\' is also specified as indexers. Please ensure that a '
                        f'dimension is used exclusively either as indexers or as the selected '
                        f'dimension.')

        var_indexers = {dim: indexers[dim] for dim in var_dims if dim in indexers}

        # Build a new list instead of removing entries from the list being
        # iterated, which would skip the entry following each removed one.
        unmatched_indexers = [dim for dim in unmatched_indexers if dim not in var_dims]

        ds[var_name] = ds[var_name].sel(method='nearest', **var_indexers)
        if remaining_dims:
            isel_indexers = {dim_name: 0 for dim_name in ds[var_name].dims
                             if dim_name not in remaining_dims}
            ds[var_name] = ds[var_name].isel(**isel_indexers)

    if unmatched_indexers:
        raise ValidationError(
            f'There are dimensions specified in indexers but do not match dimensions in '
            f'any variables: {unmatched_indexers}')

    return ds
def ds_arithmetics(ds: DatasetLike.TYPE,
                   op: str,
                   monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Do arithmetic operations on the given dataset by providing a list of
    arithmetic operations and the corresponding constant. The operations will
    be applied to the dataset in the order in which they appear in the list.
    For example:
    'log,+5,-2,/3,*2'

    Currently supported arithmetic operations:
    log,log10,log2,log1p,exp,+,-,/,*

    where:
        log - natural logarithm
        log10 - base 10 logarithm
        log2 - base 2 logarithm
        log1p - log(1+x)
        exp - the exponential

    The operations will be applied element-wise to all arrays of the dataset.

    :param ds: The dataset to which to apply arithmetic operations
    :param op: A comma separated list of arithmetic operations to apply
    :param monitor: a progress monitor.
    :return: The dataset with given arithmetic operations applied
    :raise ValidationError: if an item of *op* is empty or names an
           operation that is not supported
    """
    ds = DatasetLike.convert(ds)

    binary_ops = {
        '+': lambda data, constant: data + constant,
        '-': lambda data, constant: data - constant,
        '*': lambda data, constant: data * constant,
        '/': lambda data, constant: data / constant,
    }
    unary_ops = {
        'log': xu.log,
        'log10': xu.log10,
        'log2': xu.log2,
        'log1p': xu.log1p,
        'exp': xu.exp,
    }

    retset = ds
    items = [item.strip() for item in op.split(',')]
    with monitor.starting('Calculate result', total_work=len(items)):
        for item in items:
            with monitor.child(1).observing("Calculate"):
                if not item:
                    # E.g. a trailing comma or an empty operation string
                    raise ValidationError('Empty arithmetic operation in'
                                          ' \'{}\'.'.format(op))
                if item[0] in binary_ops:
                    # The operator character is followed by its constant
                    retset = binary_ops[item[0]](retset, float(item[1:]))
                elif item in unary_ops:
                    retset = unary_ops[item](retset)
                else:
                    raise ValidationError('Arithmetic operation {} not'
                                          ' implemented.'.format(item))

    return retset
def open_dataset(self,
                 time_range: TimeRangeLike.TYPE = None,
                 region: PolygonLike.TYPE = None,
                 var_names: VarNamesLike.TYPE = None,
                 protocol: str = None,
                 monitor: Monitor = Monitor.NONE) -> Any:
    """
    Open the remote files of this data source as a single dataset.

    :param time_range: Optional time range used to select the files to open
    :param region: Optional region. If given, the dataset is normalized and
           spatially subset to it.
    :param var_names: Optional names of the data variables to keep; all
           other data variables are dropped
    :param protocol: Not used by this method; file URLs are always resolved
           with ``_ODP_PROTOCOL_OPENDAP``
    :param monitor: a progress monitor.
    :return: The opened dataset
    :raise ValidationError: if no files are found, or if the files for a
           given time range can not be opened
    :raise DataAccessError: if the files can not be opened and no time range
           was given
    """
    time_range = TimeRangeLike.convert(time_range) if time_range else None
    var_names = VarNamesLike.convert(var_names) if var_names else None

    selected_file_list = self._find_files(time_range)
    if not selected_file_list:
        msg = 'CCI Open Data Portal data source "{}"\ndoes not seem to have any datasets'.format(self.id)
        if time_range is not None:
            msg += ' in given time range {}'.format(TimeRangeLike.format(time_range))
        raise ValidationError(msg)

    files = self._get_urls_list(selected_file_list, _ODP_PROTOCOL_OPENDAP)
    try:
        ds = open_xarray_dataset(files, monitor=monitor)
        if region:
            ds = normalize_impl(ds)
            ds = subset_spatial_impl(ds, region)
        if var_names:
            ds = ds.drop([var_name for var_name in ds.data_vars.keys()
                          if var_name not in var_names])
        return ds
    except OSError as e:
        if time_range:
            raise ValidationError("Cannot open remote dataset for time range {}:\n"
                                  "{}".format(TimeRangeLike.format(time_range), e),
                                  source=self) from e
        # No time range here, so the message carries only the underlying error
        raise DataAccessError("Cannot open remote dataset:\n"
                              "{}".format(e),
                              source=self) from e
def convert(cls, value, default=None) -> ExamplePoint:
    """
    Convert *value* into an ``ExamplePoint``.

    An ``ExamplePoint`` is returned as is. A string is split at commas and
    its first two parts are converted to floats. Any other value is indexed
    at positions 0 and 1.

    :param value: The value to convert
    :param default: Not used by this method
    :return: The resulting ``ExamplePoint``
    :raise ValidationError: if the conversion fails for any reason
    """
    if isinstance(value, ExamplePoint):
        return value

    try:
        if isinstance(value, str):
            parts = value.split(',')
            x, y = float(parts[0]), float(parts[1])
        else:
            x, y = value[0], value[1]
        return ExamplePoint(x, y)
    except Exception:
        raise ValidationError('Cannot convert value <%s> to %s.' % (repr(value), cls.name()))
def plot_contour(ds: xr.Dataset,
                 var: VarName.TYPE,
                 time: TimeLike.TYPE = None,
                 indexers: DictLike.TYPE = None,
                 title: str = None,
                 filled: bool = True,
                 properties: DictLike.TYPE = None,
                 file: str = None) -> Figure:
    """
    Create a contour plot of a variable given by dataset *ds* and variable name *var*.

    :param ds: the dataset containing the variable to plot
    :param var: the variable's name
    :param time: time slice index to plot, can be a string "YYYY-MM-DD" or an integer number
    :param indexers: Optional indexers into data array of *var*. The *indexers* is a dictionary
           or a comma-separated string of key-value pairs that maps the variable's dimension names
           to constant labels. e.g. "layer=4".
    :param title: an optional title
    :param filled: whether the regions between two contours shall be filled
    :param properties: optional plot properties for Python matplotlib,
           e.g. "bins=512, range=(-1.5, +1.5), label='Sea Surface Temperature'"
           For full reference refer to
           https://matplotlib.org/api/lines_api.html and
           https://matplotlib.org/devdocs/api/_as_gen/matplotlib.patches.Patch.html#matplotlib.patches.Patch
    :param file: path to a file in which to save the plot
    :return: a matplotlib figure object or None if in IPython mode
    :raise ValidationError: if no variable name is given
    """
    var_name = VarName.convert(var)
    if not var_name:
        raise ValidationError("Missing name for 'var'")
    var = ds[var_name]

    time = TimeLike.convert(time)
    indexers = DictLike.convert(indexers) or {}
    properties = DictLike.convert(properties) or {}

    if time is not None:
        # Select the time slice through the regular indexers, on a copy so
        # that the converted mapping is left untouched.
        indexers = dict(indexers)
        indexers['time'] = time

    figure = plt.figure(figsize=(8, 4))
    ax = figure.add_subplot(111)

    var_data = get_var_data(var, indexers)
    if filled:
        var_data.plot.contourf(ax=ax, **properties)
    else:
        var_data.plot.contour(ax=ax, **properties)

    if title:
        ax.set_title(title)

    figure.tight_layout()

    if file:
        figure.savefig(file)

    return figure if not in_notebook() else None
def plot_hist(ds: xr.Dataset,
              var: VarName.TYPE,
              indexers: DictLike.TYPE = None,
              title: str = None,
              properties: DictLike.TYPE = None,
              file: str = None) -> Figure:
    """
    Plot a variable, optionally save the figure in a file.

    The plot can either be shown using pyplot functionality, or saved,
    if a path is given. The following file formats for saving the plot
    are supported: eps, jpeg, jpg, pdf, pgf, png, ps, raw, rgba, svg,
    svgz, tif, tiff

    :param ds: Dataset that contains the variable named by *var*.
    :param var: The name of the variable to plot
    :param indexers: Optional indexers into data array of *var*. The *indexers* is a dictionary
           or a comma-separated string of key-value pairs that maps the variable's dimension names
           to constant labels. e.g. "lon=12.6, layer=3, time='2012-05-02'".
    :param title: an optional title
    :param properties: optional histogram plot properties for Python matplotlib,
           e.g. "bins=512, range=(-1.5, +1.5), label='Sea Surface Temperature'"
           For full reference refer to
           https://matplotlib.org/devdocs/api/_as_gen/matplotlib.pyplot.hist.html and
           https://matplotlib.org/devdocs/api/_as_gen/matplotlib.patches.Patch.html#matplotlib.patches.Patch
    :param file: path to a file in which to save the plot
    :return: a matplotlib figure object or None if in IPython mode
    :raise ValidationError: if no variable name is given
    """
    var_name = VarName.convert(var)
    if not var_name:
        raise ValidationError("Missing name for 'var'")
    # Index with the converted name, not with the raw argument
    var = ds[var_name]

    indexers = DictLike.convert(indexers)
    properties = DictLike.convert(properties) or {}

    figure = plt.figure(figsize=(8, 4))
    ax = figure.add_subplot(111)

    var_data = get_var_data(var, indexers)
    var_data.plot.hist(ax=ax, **properties)

    if title:
        ax.set_title(title)

    # Lay out once, after everything has been drawn
    figure.tight_layout()

    if file:
        figure.savefig(file)

    return figure if not in_notebook() else None
def merge(ds_1: DatasetLike.TYPE,
          ds_2: DatasetLike.TYPE,
          ds_3: DatasetLike.TYPE = None,
          ds_4: DatasetLike.TYPE = None,
          join: str = 'outer',
          compat: str = 'no_conflicts') -> xr.Dataset:
    """
    Merge up to four datasets to produce a new dataset with combined variables from each input dataset.

    This is a wrapper for the ``xarray.merge()`` function.

    For documentation refer to xarray documentation at
    http://xarray.pydata.org/en/stable/generated/xarray.Dataset.merge.html#xarray.Dataset.merge

    The *compat* argument indicates how to compare variables of the same name for potential conflicts:

    * "broadcast_equals": all values must be equal when variables are broadcast
      against each other to ensure common dimensions.
    * "equals": all values and dimensions must be the same.
    * "identical": all values, dimensions and attributes must be the same.
    * "no_conflicts": only values which are not null in both datasets must be equal.
      The returned dataset then contains the combination of all non-null values.

    Datasets that are None are ignored, and a dataset object passed more than
    once is used only once. If a single dataset remains, it is returned as is.

    :param ds_1: The first input dataset.
    :param ds_2: The second input dataset.
    :param ds_3: An optional 3rd input dataset.
    :param ds_4: An optional 4th input dataset.
    :param join: How to combine objects with different indexes.
    :param compat: How to compare variables of the same name for potential conflicts.
    :return: A new dataset with combined variables from each input dataset.
    :raise ValidationError: if no dataset remains to be merged
    """
    converted = [DatasetLike.convert(candidate) for candidate in (ds_1, ds_2, ds_3, ds_4)]

    # Keep each distinct dataset object once, in the order given
    datasets = []
    for candidate in converted:
        if candidate is None:
            continue
        if not any(candidate is kept for kept in datasets):
            datasets.append(candidate)

    if not datasets:
        raise ValidationError('At least two different datasets must be given')
    if len(datasets) == 1:
        return datasets[0]
    return xr.merge(datasets, compat=compat, join=join)
def _get_min_max(data, monitor=None): """ Get min and max of a dataset, while accounting for all-NaN datasets and observing it with the monitor. """ with monitor.child(1).observing("find minimum"): data_min = data.min() if np.isnan(data_min): # Handle all-NaN dataset raise ValidationError('Can not create an animation of a dataset containing only NaN values.') else: with monitor.child(1).observing("find maximum"): data_max = data.max() return (data_min.values, data_max.values)
def fix_lon_360(ds: xr.Dataset) -> xr.Dataset:
    """
    Fix the longitude of the given dataset ``ds`` so that it ranges from -180 to +180 degrees.

    For every variable whose last dimension is "lon", the two halves of the
    data along that dimension are swapped. The "lon" coordinate is replaced
    by evenly spaced values whose spacing is taken from the first two
    longitudes. The given dataset is left unchanged.

    :param ds: The dataset whose longitudes are given in the range 0 to 360.
    :return: The fixed dataset.
    :raise ValidationError: if "lon" is missing as a coordinate or dimension,
           is not 1-dimensional, or has fewer than two elements
    """
    if 'lon' not in ds.coords:
        raise ValidationError('missing coordinate variable "lon"')
    if 'lon' not in ds.sizes:
        raise ValidationError('missing dimension "lon"')
    if len(ds.lon.shape) != 1:
        raise ValidationError('coordinate variable "lon" must be 1-dimensional')
    if len(ds.lon) < 2:
        raise ValidationError('coordinate variable "lon" must have more than one element')

    new_ds = ds.copy()
    lon_size = ds.sizes['lon']
    lon_size_05 = lon_size // 2

    for var_name in new_ds.variables:
        if var_name == 'lon':
            continue
        var = new_ds.variables[var_name]
        if len(var.dims) >= 1 and var.dims[-1] == 'lon':
            old_values = var.values
            # Write the swapped halves into a fresh array. A slice of
            # old_values is only a view, so swapping in place would overwrite
            # the first half before it has been saved, and would also write
            # through to arrays shared with the input dataset.
            swapped = old_values.copy()
            swapped[..., :lon_size_05] = old_values[..., lon_size_05:]
            swapped[..., lon_size_05:] = old_values[..., :lon_size_05]
            var.values = swapped

    delta_lon = new_ds['lon'][1] - new_ds['lon'][0]
    # New longitudes sit half a step inside of -180 and +180
    new_ds['lon'] = xr.DataArray(np.linspace(-180. + 0.5 * delta_lon,
                                             +180. - 0.5 * delta_lon,
                                             lon_size),
                                 dims=ds['lon'].dims,
                                 attrs=ds['lon'].attrs)
    new_ds['lon'].attrs['units'] = 'degrees east'

    return new_ds
def plot(ds: DatasetLike.TYPE,
         var: VarName.TYPE,
         indexers: DictLike.TYPE = None,
         title: str = None,
         properties: DictLike.TYPE = None,
         file: str = None) -> Figure:
    """
    Create a 1D/line or 2D/image plot of a variable given by dataset *ds* and variable name *var*.

    :param ds: Dataset or Dataframe that contains the variable named by *var*.
    :param var: The name of the variable to plot
    :param indexers: Optional indexers into data array of *var*. The *indexers* is a dictionary
           or a comma-separated string of key-value pairs that maps the variable's dimension names
           to constant labels. e.g. "lat=12.4, time='2012-05-02'".
    :param title: an optional plot title
    :param properties: optional plot properties for Python matplotlib,
           e.g. "bins=512, range=(-1.5, +1.5), label='Sea Surface Temperature'"
           For full reference refer to
           https://matplotlib.org/api/lines_api.html and
           https://matplotlib.org/devdocs/api/_as_gen/matplotlib.patches.Patch.html#matplotlib.patches.Patch
    :param file: path to a file in which to save the plot
    :return: a matplotlib figure object or None if in IPython mode
    :raise ValidationError: if no variable name is given
    """
    ds = DatasetLike.convert(ds)

    var_name = VarName.convert(var)
    if not var_name:
        raise ValidationError("Missing name for 'var'")
    data_array = ds[var_name]

    indexers = DictLike.convert(indexers)
    plot_properties = DictLike.convert(properties) or {}

    figure = plt.figure()
    axes = figure.add_subplot(111)

    # Reduce the variable to the requested slice before plotting it
    get_var_data(data_array, indexers).plot(ax=axes, **plot_properties)

    if title:
        axes.set_title(title)

    figure.tight_layout()

    if file:
        figure.savefig(file)

    if in_notebook():
        return None
    return figure
def enso(ds: xr.Dataset,
         var: VarName.TYPE,
         file: str,
         region: str = 'n34',
         custom_region: PolygonLike.TYPE = None,
         threshold: float = None,
         monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Calculate ENSO index, which is defined as a five month running mean of
    anomalies of monthly means of SST data in the given region.

    :param ds: A monthly SST dataset
    :param file: Path to the reference data file e.g. a climatology. A suitable reference dataset
           can be generated using the long_term_average operation
    :param var: Dataset variable to use for index calculation
    :param region: Region for index calculation, one of 'N1+2', 'N3', 'N3.4',
           'N4' or 'custom'. Names are matched ignoring case, '.' and '+',
           so the default 'n34' selects Nino3.4.
    :param custom_region: If 'custom' is chosen as the 'region', this parameter has to be provided
           to set the desired region.
    :param threshold: If given, boolean El Nino/La Nina timeseries will be calculated and
           added to the output dataset, according to the given threshold. Where anomaly
           larger than then positive value of the threshold indicates El Nino and anomaly
           smaller than the negative of the given threshold indicates La Nina.
    :param monitor: a progress monitor.
    :return: A dataset that contains the index timeseries.
    :raise ValidationError: if *region* is not a known region name, or if no
           region results from the selection
    """
    regions = {'N1+2': '-90, -10, -80, 0',
               'N3': '-150, -5, -90, 5',
               'N3.4': '-170, -5, -120, 5',
               'N4': '160, -5, -150, 5',
               'custom': custom_region}

    def _region_key(name):
        # Comparison key that ignores case, '.' and '+'
        return str(name).strip().lower().replace('.', '').replace('+', '')

    if region in regions:
        region_name = region
    else:
        # Accept spellings such as the default 'n34' for 'N3.4'
        canonical_names = {_region_key(name): name for name in regions}
        region_name = canonical_names.get(_region_key(region))
        if region_name is None:
            raise ValidationError('Unknown region \'{}\' for ENSO index calculation,'
                                  ' must be one of: {}'.format(region, ', '.join(regions)))

    converted_region = PolygonLike.convert(regions[region_name])
    if not converted_region:
        raise ValidationError('No region has been provided to ENSO index calculation')

    name = 'ENSO ' + region_name + ' Index'
    if 'custom' == region_name:
        name = 'ENSO Index over ' + PolygonLike.format(converted_region)

    # 5 is the window of the running mean, in months
    return _generic_index_calculation(ds, var, converted_region, 5, file, name, threshold, monitor)
def handle_plot_polygon(region: PolygonLike.TYPE = None):
    """
    Return extents of the given PolygonLike.

    :param region: PolygonLike to introspect
    :return: extents, or None if no region is given
    :raise ValidationError: if the extents do not form a valid bounding box
    """
    if region is None:
        return None

    # Only the extents are needed; the second element of the result is ignored
    extents, _ = get_extents(region)
    lon_min, lat_min, lon_max, lat_max = extents

    is_valid = check_bounding_box(lat_min, lat_max, lon_min, lon_max)
    if is_valid:
        return extents

    raise ValidationError('Provided plot extents do not form a valid bounding box '
                          'within [-180.0,+180.0,-90.0,+90.0]')
def read_zarr(path: str,
              file_system: str = 'Local',
              drop_variables: VarNamesLike.TYPE = None,
              decode_cf: bool = True,
              decode_times: bool = True,
              normalize: bool = True) -> xr.Dataset:
    """
    Read a dataset from a Zarr directory, Zarr ZIP archive, or remote Zarr object storage.

    For the Zarr format, refer to http://zarr.readthedocs.io/en/stable/.

    :param path: Zarr directory path, Zarr ZIP archive path, or object storage path or bucket name.
    :param file_system: File system identifier, "Local" is your locally mounted file system,
           for Amazon S3 use "S3", for general Object Storage use "OBS".
    :param drop_variables: List of variables to be dropped.
    :param decode_cf: Whether to decode CF attributes and coordinate variables.
    :param decode_times: Whether to decode time information (convert time coordinates to ``datetime`` objects).
    :param normalize: Whether to normalize the dataset's geo- and time-coding upon opening. See operation ``normalize``.
    :return: The dataset read
    :raise ValidationError: if *file_system* is not one of the supported identifiers
    """
    drop_variables = VarNamesLike.convert(drop_variables)

    if file_system == 'Local':
        store = path
    elif file_system in ('S3', 'OBS'):
        # Imported here so that s3fs is only needed for object storage access
        import s3fs
        store = s3fs.S3Map(path, s3=s3fs.S3FileSystem(anon=True))
    else:
        raise ValidationError(f'Unknown file_system {file_system!r}')

    ds = xr.open_zarr(store,
                      drop_variables=drop_variables,
                      decode_cf=decode_cf,
                      decode_times=decode_times)

    if not normalize:
        return ds
    return adjust_temporal_attrs(normalize_op(ds))
def plot_data_frame(df: pd.DataFrame,
                    plot_type: str = 'line',
                    file: str = None,
                    **kwargs) -> Figure:
    """
    Plot a data frame.

    This is a wrapper of pandas.DataFrame.plot() function.

    For further documentation please see
    http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.plot.html

    :param df: A pandas dataframe to plot
    :param plot_type: Plot type
    :param file: path to a file in which to save the plot
    :param kwargs: Keyword arguments to pass to the underlying
                   pandas.DataFrame.plot function
    :return: a matplotlib figure object or None if in IPython mode
    :raise ValidationError: if *df* is not a pandas data frame
    """
    if not isinstance(df, pd.DataFrame):
        raise ValidationError('"df" must be of type "pandas.DataFrame"')

    # pandas hands back the axes it drew on; the figure is reached through them
    axes = df.plot(kind=plot_type, figsize=(8, 4), **kwargs)
    figure = axes.get_figure()

    if file:
        figure.savefig(file)

    if in_notebook():
        return None
    return figure
def write_csv(obj: DataFrameLike.TYPE,
              file: FileLike.TYPE,
              columns: VarNamesLike.TYPE = None,
              na_rep: str = '',
              delimiter: str = ',',
              quotechar: str = None,
              more_args: DictLike.TYPE = None,
              monitor: Monitor = Monitor.NONE):
    """
    Write comma-separated values (CSV) to plain text file from a DataFrame or Dataset.

    A ``DataFrame`` is written through ``pandas.DataFrame.to_csv()``.
    A ``Dataset`` is written row by row: one row per combination of coordinate
    indexes, consisting of a running row index, the coordinate values and the
    values of the selected variables. Note that *na_rep*, *quotechar* and
    *more_args* are only used for ``DataFrame`` inputs.

    :param obj: The object to write as CSV; must be a ``DataFrame`` or a ``Dataset``.
    :param file: The CSV file path. For a ``Dataset``, a non-string value is used as an
           already open, writable text stream and is left open.
    :param columns: The names of variables that should be converted to columns. If given,
           coordinate variables are included automatically.
    :param delimiter: Delimiter to use.
    :param na_rep: A string representation of a missing value (no-data value).
    :param quotechar: The character used to denote the start and end of a quoted item.
           Quoted items can include the delimiter and it will be ignored.
    :param more_args: Other optional keyword arguments.
           Please refer to Pandas documentation of ``pandas.to_csv()`` function.
    :param monitor: optional progress monitor
    :raise ValidationError: If *obj* is ``None`` or of an unsupported type, if the selected
           variables do not share the same dimensions, or if no variable is selected.
    :raise ValueError: If a dimension of the selected variables has no coordinate variable.
    """
    if obj is None:
        raise ValidationError('obj must not be None')

    columns = VarNamesLike.convert(columns)

    if isinstance(obj, pd.DataFrame):
        # Pandas treats any keyword given in kwargs as being set, even if it is just None,
        # therefore only options that carry a value are forwarded.
        # The converted dictionary is copied so the caller's object is never modified.
        kwargs = dict(DictLike.convert(more_args) or {})
        if columns:
            kwargs.update(columns=columns)
        if delimiter:
            kwargs.update(sep=delimiter)
        if na_rep:
            kwargs.update(na_rep=na_rep)
        if quotechar:
            kwargs.update(quotechar=quotechar)
        with monitor.starting('Writing to CSV', 1):
            obj.to_csv(file, index_label='index', **kwargs)
            monitor.progress(1)
        return

    if not isinstance(obj, xr.Dataset):
        raise ValidationError('obj must be a pandas.DataFrame or a xarray.Dataset')

    # Select the variables to write; all of them must share the same dimensions.
    data_vars = [obj.data_vars[var_name] for var_name in obj.data_vars
                 if columns is None or var_name in columns]
    dim_names = None
    for data_var in data_vars:
        if dim_names is None:
            dim_names = data_var.dims
        elif dim_names != data_var.dims:
            raise ValidationError('Not all variables have the same dimensions. '
                                  'Please select variables so that their dimensions are equal.')
    if dim_names is None:
        raise ValidationError('None of the selected variables has a dimension.')

    # Find one coordinate variable per dimension: prefer the coordinate named like the
    # dimension, otherwise take the first 1-D coordinate defined on that dimension.
    coord_vars = []
    for dim_name in dim_names:
        if dim_name in obj.coords:
            coord_var = obj.coords[dim_name]
        else:
            coord_var = next((candidate for candidate in obj.coords.values()
                              if len(candidate.dims) == 1 and candidate.dims[0] == dim_name),
                             None)
            if coord_var is None:
                raise ValueError(f'No coordinate variable found for dimension "{dim_name}"')
        coord_vars.append(coord_var)

    coord_indexes = [range(len(coord_var)) for coord_var in coord_vars]
    num_rows = 1
    for coord_var in coord_vars:
        num_rows *= len(coord_var)

    stream = open(file, 'w') if isinstance(file, str) else file
    try:
        # Write header row
        header = ['index']
        header.extend(coord_var.name for coord_var in coord_vars)
        header.extend(data_var.name for data_var in data_vars)
        stream.write(delimiter.join(header))
        stream.write('\n')

        # Fetch the underlying arrays once instead of once per row and variable.
        coord_values = [coord_var.values for coord_var in coord_vars]
        var_values = [data_var.values for data_var in data_vars]

        with monitor.starting('Writing CSV', num_rows):
            for row, index in enumerate(itertools.product(*coord_indexes)):
                # Write data row
                fields = [str(row)]
                fields.extend(str(values[i]) for values, i in zip(coord_values, index))
                fields.extend(str(values[index]) for values in var_values)
                stream.write(delimiter.join(fields))
                stream.write('\n')
                monitor.progress(1)
    finally:
        # Only close streams that were opened here.
        if isinstance(file, str):
            stream.close()
def animate_map(ds: xr.Dataset,
                var: VarName.TYPE = None,
                animate_dim: str = 'time',
                interval: int = 200,
                true_range: bool = False,
                indexers: DictLike.TYPE = None,
                region: PolygonLike.TYPE = None,
                projection: str = 'PlateCarree',
                central_lon: float = 0.0,
                title: str = None,
                contour_plot: bool = False,
                cmap_params: DictLike.TYPE = None,
                plot_properties: DictLike.TYPE = None,
                file: str = None,
                monitor: Monitor = Monitor.NONE) -> HTML:
    """
    Create a geographic map animation for the variable given by dataset *ds* and variable name *var*.

    Creates an animation of the given variable from the given dataset on a map with coastal lines.
    In case no variable name is given, the first encountered variable in the dataset is animated.
    It is also possible to set extents of the animation. If no extents
    are given, a global animation is created.

    The following file formats for saving the animation are supported: html

    :param ds: the dataset containing the variable to animate
    :param var: the variable's name
    :param animate_dim: Dimension to animate, if none given defaults to time.
    :param interval: Delay between frames in milliseconds. Defaults to 200.
    :param true_range: If True, calculates colormap and colorbar configuration parameters from the
           whole dataset. Can potentially take a lot of time. Defaults to False, in which case the
           colormap is calculated from the first frame.
    :param indexers: Optional indexers into data array of *var*. The *indexers* is a dictionary
           or a comma-separated string of key-value pairs that maps the variable's dimension names
           to constant labels. e.g. "layer=4".
    :param region: Region to animate
    :param projection: name of a global projection, see http://scitools.org.uk/cartopy/docs/v0.15/crs/projections.html
    :param central_lon: central longitude of the projection in degrees
    :param title: an optional title
    :param contour_plot: If true plot a filled contour plot of data, otherwise plots a pixelated colormesh
    :param cmap_params: optional additional colormap configuration parameters,
           e.g. "vmax=300, cmap='magma'"
           For full reference refer to
           http://xarray.pydata.org/en/stable/generated/xarray.plot.contourf.html
    :param plot_properties: optional plot properties for Python matplotlib,
           e.g. "bins=512, range=(-1.5, +1.5)"
           For full reference refer to
           https://matplotlib.org/api/lines_api.html and
           https://matplotlib.org/api/_as_gen/matplotlib.axes.Axes.contourf.html
    :param file: path to a file in which to save the animation
    :param monitor: A progress monitor.
    :return: An animation in HTML format
    :raise NotImplementedError: If *ds* is not an ``xarray.Dataset``.
    :raise ValidationError: If the variable does not exist, the dataset has fewer than two
           ``lat`` or ``lon`` values, or *projection* is not supported.
    """
    if not isinstance(ds, xr.Dataset):
        raise NotImplementedError('Only gridded datasets are currently supported')

    if not var:
        # Fall back to the first data variable (None if the dataset has none).
        var_name = next(iter(ds.data_vars.keys()), None)
    else:
        var_name = VarName.convert(var)

    try:
        var = ds[var_name]
    except KeyError:
        raise ValidationError('Provided variable name "{}" does not exist in the given dataset'.format(var_name))

    # Copy the indexers, because the animated dimension is written into them for
    # every frame and the caller's dictionary must not be modified.
    indexers = dict(DictLike.convert(indexers) or {})
    properties = DictLike.convert(plot_properties) or {}
    cmap_params = DictLike.convert(cmap_params) or {}

    extents = None
    bounds = handle_plot_polygon(region)
    if bounds:
        lon_min, lat_min, lon_max, lat_max = bounds
        extents = [lon_min, lon_max, lat_min, lat_max]

    if len(ds.lat) < 2 or len(ds.lon) < 2:
        # Matplotlib can not plot datasets with less than these dimensions with
        # contourf and pcolormesh methods
        raise ValidationError('The minimum dataset spatial dimensions to create a map'
                              ' plot are (2,2)')

    # See http://scitools.org.uk/cartopy/docs/v0.15/crs/projections.html#
    # Each supported name is also the name of the cartopy projection class.
    supported_projections = ('PlateCarree', 'LambertCylindrical', 'Mercator', 'Miller',
                             'Mollweide', 'Orthographic', 'Robinson', 'Sinusoidal',
                             'NorthPolarStereo', 'SouthPolarStereo')
    if projection not in supported_projections:
        raise ValidationError('illegal projection: "%s"' % projection)
    proj = getattr(ccrs, projection)(central_longitude=central_lon)

    figure = plt.figure(figsize=(8, 4))
    ax = plt.axes(projection=proj)

    def setup_axes():
        # (Re-)apply the map extent and the coast lines, needed after every ax.clear().
        if extents:
            ax.set_extent(extents, ccrs.PlateCarree())
        else:
            ax.set_global()
        ax.coastlines()

    setup_axes()

    if not animate_dim:
        animate_dim = 'time'

    indexers[animate_dim] = var[animate_dim][0]
    var_data = get_var_data(var, indexers, remaining_dims=('lon', 'lat'))

    with monitor.starting("animate", len(var[animate_dim]) + 3):
        if true_range:
            data_min, data_max = _get_min_max(var, monitor=monitor)
        else:
            data_min, data_max = _get_min_max(var_data, monitor=monitor)

        cmap_params = determine_cmap_params(data_min, data_max, **cmap_params)
        plot_kwargs = {**properties, **cmap_params}

        def draw_frame(frame_data, add_colorbar):
            # Use the same plot type for every frame. Previously all frames but the
            # first were drawn as filled contours regardless of contour_plot.
            # transform keyword is for the coordinate our data is in, which in case of a
            # 'normal' lat/lon dataset is PlateCarree.
            plot = frame_data.plot.contourf if contour_plot else frame_data.plot.pcolormesh
            plot(ax=ax, transform=ccrs.PlateCarree(), subplot_kws={'projection': proj},
                 add_colorbar=add_colorbar, **plot_kwargs)
            if title:
                ax.set_title(title)

        # Plot the first frame to set-up the axes with the colorbar properly
        draw_frame(var_data, add_colorbar=True)
        figure.tight_layout()
        monitor.progress(1)

        def run(value):
            ax.clear()
            setup_axes()
            indexers[animate_dim] = value
            frame_data = get_var_data(var, indexers, remaining_dims=('lon', 'lat'))
            # The colorbar was already created together with the first frame.
            draw_frame(frame_data, add_colorbar=False)
            monitor.progress(1)
            return ax

        anim = animation.FuncAnimation(figure, run, list(var[animate_dim]),
                                       interval=interval, blit=False, repeat=False)
        anim_html = anim.to_jshtml()

        # Prevent the animation for running after it's finished
        del anim

        # Delete the rogue temp-file
        try:
            os.remove('None0000000.png')
        except FileNotFoundError:
            pass

        if file:
            with open(file, 'w') as outfile:
                outfile.write(anim_html)
        monitor.progress(1)

    return HTML(anim_html)
def temporal_aggregation(ds: DatasetLike.TYPE,
                         method: str = 'mean',
                         output_resolution: str = 'month',
                         custom_resolution: str = None,
                         monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform aggregation of dataset according to the given
    method and output resolution.

    Note that the operation does not perform weighting. Depending on the
    combination of input and output resolutions, as well as aggregation
    method, the resulting dataset might yield unexpected results.

    Resolution 'month' will result in a monthly dataset with each month
    denoted by its first date. Resolution 'season' will result in a dataset
    aggregated to DJF, MAM, JJA, SON seasons, each denoted by the first
    date of the season.

    The operation also works with custom resolution strings, see:
    http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
    If ``custom_resolution`` is provided, it will override ``output_resolution``.

    Some examples:
      'QS-JUN' produces an output dataset on a quarterly resolution where the
      year ends in 1st of June and each quarter is denoted by its first date
      '8MS' produces an output dataset on an eight-month resolution where each
      period is denoted by the first date. Note that such periods will not be
      consistent over years.
      '8D' produces a dataset on an eight day resolution

    :param ds: Dataset to aggregate
    :param method: Aggregation method
    :param output_resolution: Desired temporal resolution of the output dataset,
           either 'month' or 'season'
    :param custom_resolution: Custom temporal resolution, overrides output_resolution
    :param monitor: A progress monitor
    :return: Aggregated dataset
    :raise ValidationError: If the time coordinate is not of type datetime64[ns], the input
           resolution can not be determined, the requested resolution or the aggregation
           method is not valid.
    """
    ds = DatasetLike.convert(ds)

    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValidationError(
            'Temporal aggregation operation expects a dataset with the'
            ' time coordinate of type datetime64[ns], but received'
            ' {}. Running the normalize operation on this'
            ' dataset may help'.format(ds.time.dtype))

    # Try to figure out the input frequency
    try:
        in_freq = ds.attrs['time_coverage_resolution']
    except KeyError:
        raise ValidationError(
            'Could not determine temporal resolution of input dataset.'
            ' Running the adjust_temporal_attrs operation beforehand may'
            ' help.')

    if custom_resolution:
        freq = custom_resolution
    else:
        frequencies = {'month': 'MS', 'season': 'QS-DEC'}
        freq = frequencies.get(output_resolution)
        if freq is None:
            # Report unsupported values like any other invalid input instead of
            # leaking a bare KeyError.
            raise ValidationError(
                'Invalid output resolution: {}. Supported values are: {}.'
                ' Use custom_resolution for any other'
                ' resolution.'.format(output_resolution, ', '.join(frequencies)))

    _validate_freq(in_freq, freq)

    with monitor.observing("resample dataset"):
        # Only the method lookup is guarded, so that an AttributeError raised while
        # resampling is not misreported as an invalid aggregation method.
        try:
            aggregate = getattr(resampler, method)
        except AttributeError:
            raise ValidationError(
                f'Provided aggregation method {method} is not valid.')
        retset = aggregate(ds.resample(time=freq, keep_attrs=True))

    # Record the aggregation in the 'cell_methods' attribute of every variable,
    # appending to an existing value if there is one.
    cell_method = 'time: {} within years'.format(method)
    for var in retset.data_vars:
        if 'cell_methods' in retset[var].attrs:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + ' ' + cell_method
        else:
            retset[var].attrs['cell_methods'] = cell_method

    return adjust_temporal_attrs(retset)
def _lta_general(ds: xr.Dataset, monitor: Monitor):
    """
    Compute a long term average for the general case, notably for
    seasonal datasets.

    The time axis of the result is taken from a 'representative year': the time
    steps of the first year in the dataset, or those of the second year if it
    has more time steps than the first.

    :param ds: Dataset to aggregate
    :param monitor: Progress monitor
    :return: Aggregated dataset
    :raise ValidationError: If ``_is_seasonal`` rejects the time axis of *ds*.
    """
    first_time = ds.time.values[0]
    last_time = ds.time.values[-1]
    time_min = pd.Timestamp(first_time, tzinfo=timezone.utc)
    time_max = pd.Timestamp(last_time, tzinfo=timezone.utc)

    total_work = 100
    retset = ds

    # The dataset should feature time periods consistent over years
    # and denoted with the same dates each year
    if not _is_seasonal(ds.time):
        raise ValidationError(
            "A long term average dataset can not be created for"
            " a dataset with inconsistent seasons.")

    # Get 'representative year'
    for position, (_, year_group) in enumerate(ds.time.groupby('time.year'), start=1):
        if position == 1:
            rep_year = year_group.time
        elif position == 2 and len(year_group.time) > len(rep_year):
            rep_year = year_group.time
            break

    with monitor.starting('LTA', total_work=total_work):
        monitor.progress(work=0)
        # Work units per time step of the representative year
        step = total_work / len(rep_year.time)
        grouped_by_month = retset.groupby('time.month', squeeze=False)
        retset = grouped_by_month.apply(_groupby_day, monitor=monitor, step=step)

    # Make the return dataset CF compliant:
    # turn the month, day coordinates into a single time coordinate
    retset = retset.stack(time=('month', 'day')).reset_index('time').drop(['month', 'day'])
    retset['time'] = rep_year.time

    # Every time step gets the same bounds: the full time range of the input
    bounds = np.tile([time_min, time_max], (len(rep_year), 1))
    retset['climatology_bounds'] = xr.DataArray(data=bounds,
                                                dims=['time', 'nv'],
                                                name='climatology_bounds')
    retset.time.attrs = ds.time.attrs
    retset.time.attrs['climatology'] = 'climatology_bounds'

    for var in retset.data_vars:
        if 'cell_methods' in retset[var].attrs:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + ' time: mean over years'
        else:
            retset[var].attrs['cell_methods'] = 'time: mean over years'

    return retset
def coregister(ds_master: xr.Dataset,
               ds_replica: xr.Dataset,
               method_us: str = 'linear',
               method_ds: str = 'mean',
               monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform coregistration of two datasets by resampling the replica dataset unto the
    grid of the master. If upsampling has to be performed, this is achieved using
    interpolation, if downsampling has to be performed, the pixels of the replica dataset
    are aggregated to form a coarser grid.

    The returned dataset will contain the lat/lon intersection of provided
    master and replica datasets, resampled unto the master grid frequency.

    This operation works on datasets whose spatial dimensions are defined on
    pixel-registered and equidistant in lat/lon coordinates grids. E.g., data points
    define the middle of a pixel and pixels have the same size across the dataset.

    This operation will resample all variables in a dataset, as the lat/lon grid is
    defined per dataset. It works only if all variables in the dataset have lat
    and lon as dimensions.

    For an overview of downsampling/upsampling methods used in this operation, please
    see https://github.com/CAB-LAB/gridtools

    Whether upsampling or downsampling has to be performed is determined automatically
    based on the relationship of the grids of the provided datasets.

    :param ds_master: The dataset whose grid is used for resampling
    :param ds_replica: The dataset that will be resampled
    :param method_us: Interpolation method to use for upsampling,
           one of 'nearest', 'linear'.
    :param method_ds: Interpolation method to use for downsampling,
           one of 'first', 'last', 'mean', 'mode', 'var', 'std'.
    :param monitor: a progress monitor.
    :return: The replica dataset resampled on the grid of the master. If both datasets
             already have the same grid, *ds_replica* is returned unchanged.
    :raise ValidationError: If a dataset lacks lat/lon, a replica data array or a grid is not
           valid for coregistration, or an unknown resampling method is given.
    """
    # (dataset label, coordinate values, lower bound of the valid coordinate range)
    try:
        grids = (('replica', ds_replica['lat'].values, -90),
                 ('replica', ds_replica['lon'].values, -180),
                 ('master', ds_master['lat'].values, -90),
                 ('master', ds_master['lon'].values, -180))
    except KeyError:
        raise ValidationError(
            'Coregistration requires that both datasets are'
            ' spatial datasets with lon and lat dimensions. The'
            ' dimensionality of the provided master dataset is: {},'
            ' the dimensionality of the provided replica dataset is:'
            ' {}. Running the normalize operation might help in'
            ' case spatial dimensions have different'
            ' names'.format(ds_master.dims, ds_replica.dims))

    # Don't do anything if datasets already have the same spatial definition
    if _grids_equal(ds_master, ds_replica):
        return ds_replica

    # Check if all arrays of the replica dataset have the required dimensionality
    for key in ds_replica.data_vars:
        if not _is_valid_array(ds_replica[key]):
            raise ValidationError(
                '{} data array of replica dataset is not valid for'
                ' coregistration. The data array is expected to'
                ' have lat and lon dimensions. The data array has'
                ' the following dimensions: {}. Consider running'
                ' select_var operation to exclude this'
                ' data array'.format(key, ds_replica[key].dims))

    # Check if the grids of the provided datasets are equidistant and pixel
    # registered
    for label, values, lower_bound in grids:
        if not _within_bounds(values, lower_bound):
            raise ValidationError(
                'The {} dataset grid does not fall into required'
                ' boundaries. Required boundaries are ({}, {}),'
                ' dataset boundaries are ({}, {}). Running the'
                ' normalize operation'
                ' may help.'.format(label, lower_bound, abs(lower_bound),
                                    values[0], values[-1]))
        if not _is_equidistant(values):
            raise ValidationError('The {} dataset grid is not'
                                  ' equidistant, can not perform'
                                  ' coregistration'.format(label))
        if not _is_pixel_registered(values, lower_bound):
            raise ValidationError('The {} dataset grid is not'
                                  ' pixel-registered, can not perform'
                                  ' coregistration'.format(label))

    # Co-register
    # Method names mapped to the numeric codes passed on to _resample_dataset
    methods_us = {'nearest': 10, 'linear': 11}
    methods_ds = {
        'first': 50,
        'last': 51,
        'mean': 54,
        'mode': 56,
        'var': 57,
        'std': 58
    }
    # Report unknown method names as invalid input instead of leaking a bare KeyError.
    if method_us not in methods_us:
        raise ValidationError(
            'Unknown upsampling method {!r}. Supported methods are:'
            ' {}.'.format(method_us, ', '.join(methods_us)))
    if method_ds not in methods_ds:
        raise ValidationError(
            'Unknown downsampling method {!r}. Supported methods are:'
            ' {}.'.format(method_ds, ', '.join(methods_ds)))

    return _resample_dataset(ds_master, ds_replica,
                             methods_us[method_us], methods_ds[method_ds],
                             monitor)
def pearson_correlation_scalar(
        ds_x: DatasetLike.TYPE,
        ds_y: DatasetLike.TYPE,
        var_x: VarName.TYPE,
        var_y: VarName.TYPE,
        monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Do product moment `Pearson's correlation <http://www.statsoft.com/Textbook/Statistics-Glossary/P/button/p#Pearson%20Correlation>`_ analysis.

    Correlates all values of two data variables with each other and returns a
    single correlation coefficient and the corresponding p_value.

    Positive correlation implies that as x grows, so does y. Negative
    correlation implies that as x increases, y decreases.

    For more information how to interpret the results, see
    `here <http://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/how-to/correlation/interpret-the-results/>`_,
    and `here <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html>`_.

    :param ds_x: The 'x' dataset
    :param ds_y: The 'y' dataset
    :param var_x: Dataset variable to use for correlation analysis in the 'variable' dataset
    :param var_y: Dataset variable to use for correlation analysis in the 'dependent' dataset
    :param monitor: a progress monitor.
    :return: Data frame {'corr_coef': correlation coefficient, 'p_value': probability value}
    :raise ValidationError: If the variables differ in their dimensions or dimension
           lengths, or if they contain fewer than 3 values.
    """
    ds_x = DatasetLike.convert(ds_x)
    ds_y = DatasetLike.convert(ds_y)
    var_x = VarName.convert(var_x)
    var_y = VarName.convert(var_y)

    data_y = ds_y[var_y]
    data_x = ds_x[var_x]

    if data_x.dims != data_y.dims:
        raise ValidationError(
            'Both datasets should feature the same'
            ' dimensionality. Currently provided ds_x[var_x] '
            f'has {data_x.dims}, provided ds_y[var_y]'
            f' has {data_y.dims}')

    for dim in data_x.dims:
        len_x = len(data_x[dim])
        len_y = len(data_y[dim])
        if len_x != len_y:
            raise ValidationError(
                'All dimensions of both provided data variables'
                f' must be the same length. Currently {dim} of ds_x[var_x]'
                f' has {len_x} values, while'
                f' {dim} of ds_y[var_y] has {len_y} values.'
                ' You may want to try to coregister the datasets beforehand.')

    # Total number of values: product of all dimension lengths
    n_vals = 1
    for dim in data_x.dims:
        n_vals *= len(data_x[dim])

    if n_vals < 3:
        raise ValidationError(
            'There should be no less than 3 values in both data variables'
            f' to perform the correlation. Currently there are {n_vals} values')

    with monitor.observing("Calculate Pearson correlation"):
        # Flatten both variables into 1-D series before correlating them
        flat_x = data_x.stack(z=data_x.dims)
        flat_y = data_y.stack(z=data_y.dims)
        cc, pv = pearsonr(flat_x, flat_y)

    return pd.DataFrame({'corr_coef': [cc], 'p_value': [pv]})
def pearson_correlation(ds_x: DatasetLike.TYPE,
                        ds_y: DatasetLike.TYPE,
                        var_x: VarName.TYPE,
                        var_y: VarName.TYPE,
                        monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Do product moment `Pearson's correlation <http://www.statsoft.com/Textbook/Statistics-Glossary/P/button/p#Pearson%20Correlation>`_ analysis.

    Perform Pearson correlation on two datasets and produce a lon/lat map of
    correlation coefficients and the corresponding p_values.

    In case two 3D lon/lat/time datasets are provided, pixel by pixel correlation
    of the underlying timeseries is performed. It is also possible to provide a
    combination of a 3D lon/lat/time dataset and a 1D timeseries.

    The lat/lon definition of both datasets has to be the same. The length of
    the time dimension should be equal, but not necessarily have the same
    definition. E.g., it is possible to correlate different times of the same
    area.

    There are 'x' and 'y' datasets. Positive correlations imply that as x
    grows, so does y. Negative correlations imply that as x increases, y
    decreases.

    For more information how to interpret the results, see
    `here <http://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/how-to/correlation/interpret-the-results/>`_,
    and `here <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html>`_.

    :param ds_x: The 'x' dataset
    :param ds_y: The 'y' dataset
    :param var_x: Dataset variable to use for correlation analysis in the 'variable' dataset
    :param var_y: Dataset variable to use for correlation analysis in the 'dependent' dataset
    :param monitor: a progress monitor.
    :return: a dataset containing a map of correlation coefficients and p_values
    :raise ValidationError: If the variables are not a 3D/3D or 3D/1D combination, differ in
           shape or lat/lon definition, differ in the length of their time dimension, or
           have fewer than three time steps.
    """
    ds_x = DatasetLike.convert(ds_x)
    ds_y = DatasetLike.convert(ds_y)
    var_x = VarName.convert(var_x)
    var_y = VarName.convert(var_y)

    array_y = ds_y[var_y]
    array_x = ds_x[var_x]

    rank_x = len(array_x.dims)
    rank_y = len(array_y.dims)
    invalid_rank_message = (
        'A correlation coefficient map can only be produced'
        ' if both provided datasets are 3D datasets with'
        ' lon/lat/time dimensionality, or if a combination'
        ' of a 3D lon/lat/time dataset and a 1D timeseries'
        ' is provided.')

    # Further validate inputs
    if array_x.dims == array_y.dims:
        # Equal dimensions imply equal rank, so checking one array is sufficient.
        if rank_x != 3:
            raise ValidationError(invalid_rank_message)

        if array_x.shape != array_y.shape:
            raise ValidationError(
                f'The provided variables {var_x} and {var_y} do not have the'
                ' same shape, Pearson correlation can not be'
                ' performed. Please review operation'
                ' documentation')

        if (not ds_x['lat'].equals(ds_y['lat'])
                or not ds_x['lon'].equals(ds_y['lon'])):
            raise ValidationError(
                'When performing a pixel by pixel correlation the'
                ' datasets have to have the same lat/lon'
                ' definition. Consider running coregistration'
                ' first')
    elif sorted((rank_x, rank_y)) != [1, 3]:
        # With differing dimensions, only a 3D dataset combined with a 1D timeseries
        # is supported. The previous check let combinations through in which neither
        # array was 1D or 3D (e.g. 2D with 4D).
        raise ValidationError(invalid_rank_message)

    if len(array_x['time']) != len(array_y['time']):
        raise ValidationError(
            'The length of the time dimension differs between'
            ' the given datasets. Can not perform the calculation'
            ', please review operation documentation.')

    if len(array_x['time']) < 3:
        raise ValidationError(
            'The length of the time dimension should not be less'
            ' than three to run the calculation.')

    # Do pixel by pixel correlation
    retset = _pearsonr(array_x, array_y, monitor)
    retset.attrs['Cate_Description'] = f'Correlation between {var_y} {var_x}'

    return adjust_spatial_attrs(retset)