def no_op(num_steps: int = 20,
          step_duration: float = 0.5,
          fail_before: bool = False,
          fail_after: bool = False,
          error_type: str = 'Value',
          monitor: Monitor = Monitor.NONE) -> bool:
    """
    An operation that does nothing except spend a configurable amount of time.
    It may be useful for testing purposes.

    :param num_steps: Number of steps to iterate.
    :param step_duration: How much time to spend in each step, in seconds.
    :param fail_before: Whether the operation should fail before spending time
           doing nothing (raise an error of the given *error_type*).
    :param fail_after: Whether the operation should fail after spending time
           doing nothing (raise an error of the given *error_type*).
    :param error_type: The type of error to raise.
    :param monitor: A progress monitor.
    :return: Always True
    """
    import time
    with monitor.starting('Computing nothing', num_steps):
        if fail_before:
            error_class = _ERROR_TYPES[error_type]
            raise error_class(f'This is a test: intentionally failed'
                              f' with a {error_type} error'
                              f' before doing anything {num_steps} times.')
        for i in range(num_steps):
            time.sleep(step_duration)
            monitor.progress(1.0, 'Step %s of %s doing nothing'
                             % (i + 1, num_steps))
        if fail_after:
            error_class = _ERROR_TYPES[error_type]
            raise error_class(f'This is a test: intentionally failed'
                              f' with a {error_type} error'
                              f' after doing nothing {num_steps} times.')
    return True
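# A minimal usage sketch (assumes no_op and Monitor above are in scope, and
# that _ERROR_TYPES maps 'Value' to ValueError, as the docstring suggests).
def _example_no_op():
    assert no_op(num_steps=3, step_duration=0.0) is True
    try:
        no_op(num_steps=1, step_duration=0.0, fail_after=True,
              error_type='Value')
    except ValueError as e:
        print('Got the expected failure:', e)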
def ds_arithmetics(ds: DatasetLike.TYPE,
                   op: str,
                   monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Do arithmetic operations on the given dataset by providing a list of
    arithmetic operations and the corresponding constant. The operations will
    be applied to the dataset in the order in which they appear in the list.
    For example: 'log,+5,-2,/3,*2'

    Currently supported arithmetic operations: log,log10,log2,log1p,exp,+,-,/,*

    where:
        log - natural logarithm
        log10 - base 10 logarithm
        log2 - base 2 logarithm
        log1p - log(1+x)
        exp - the exponential

    The operations will be applied element-wise to all arrays of the dataset.

    :param ds: The dataset to which to apply arithmetic operations
    :param op: A comma-separated list of arithmetic operations to apply
    :param monitor: a progress monitor.
    :return: The dataset with the given arithmetic operations applied
    """
    ds = DatasetLike.convert(ds)
    retset = ds
    with monitor.starting('Calculate result', total_work=len(op.split(','))):
        for item in op.split(','):
            with monitor.child(1).observing("Calculate"):
                item = item.strip()
                if item[0] == '+':
                    retset = retset + float(item[1:])
                elif item[0] == '-':
                    retset = retset - float(item[1:])
                elif item[0] == '*':
                    retset = retset * float(item[1:])
                elif item[0] == '/':
                    retset = retset / float(item[1:])
                elif item == 'log':
                    retset = np.log(retset)
                elif item == 'log10':
                    retset = np.log10(retset)
                elif item == 'log2':
                    retset = np.log2(retset)
                elif item == 'log1p':
                    retset = np.log1p(retset)
                elif item == 'exp':
                    retset = np.exp(retset)
                else:
                    raise ValidationError('Arithmetic operation {} not'
                                          ' implemented.'.format(item))
    return retset
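# A minimal usage sketch (assumes ds_arithmetics above is in scope).
# '+2,log' first adds 2 to every element, then takes the natural logarithm.
def _example_ds_arithmetics():
    import numpy as np
    import xarray as xr
    ds = xr.Dataset({'t': ('time', np.array([1.0, 2.0, 4.0]))})
    result = ds_arithmetics(ds, '+2,log')
    print(result['t'].values)  # log([3, 4, 6])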
def _resample_dataset(ds_master: xr.Dataset,
                      ds_replica: xr.Dataset,
                      method_us: int,
                      method_ds: int,
                      monitor: Monitor) -> xr.Dataset:
    """
    Resample the replica onto the grid of the master. This does spatial
    resampling of the whole dataset, i.e., of all variables in the replica
    dataset. This method works only if both datasets have (time, lat, lon)
    dimensions.

    Note that dataset attributes are not propagated due to the currently
    undecided CDM attributes' set.

    :param ds_master: xr.Dataset whose lat/lon coordinates are used as the
           resampling grid
    :param ds_replica: xr.Dataset that will be resampled onto the master's grid
    :param method_us: Interpolation method for upsampling, see resampling.py
    :param method_ds: Interpolation method for downsampling, see resampling.py
    :param monitor: a progress monitor.
    :return: xr.Dataset The resampled replica dataset
    """
    # Find lat/lon bounds of the intersection of master and replica grids. The
    # bounds should fall on pixel boundaries for both spatial dimensions for
    # both datasets.
    lat_min, lat_max = _find_intersection(ds_master['lat'].values,
                                          ds_replica['lat'].values,
                                          global_bounds=(-90, 90))
    lon_min, lon_max = _find_intersection(ds_master['lon'].values,
                                          ds_replica['lon'].values,
                                          global_bounds=(-180, 180))

    # Subset the replica dataset and the master grid. We're not using the
    # subset operation here, because the subset operation may produce datasets
    # that cross the anti-meridian by design. However, such a disjoint dataset
    # cannot be resampled using our current resampling methods.
    lat_slice = slice(lat_min, lat_max)
    lon_slice = slice(lon_min, lon_max)

    lon = ds_master['lon'].sel(lon=lon_slice)
    lat = ds_master['lat'].sel(lat=lat_slice)
    ds_replica = ds_replica.sel(lon=lon_slice, lat=lat_slice)

    # Don't do anything if the datasets already have the same spatial definition
    if _grids_equal(ds_master, ds_replica):
        return ds_replica

    with monitor.starting("coregister dataset", len(ds_replica.data_vars)):
        kwargs = {'lon': lon, 'lat': lat,
                  'method_us': method_us, 'method_ds': method_ds,
                  'parent_monitor': monitor}
        retset = ds_replica.apply(_resample_array, keep_attrs=True, **kwargs)

    return adjust_spatial_attrs(retset)
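# An illustrative sketch of the grid-intersection idea used above. This is a
# hypothetical inline computation, not the actual _find_intersection
# implementation: the overlapping extent of two 1-D coordinate axes, clipped
# to global bounds.
def _example_grid_intersection():
    import numpy as np
    master_lat = np.arange(-60.0, 60.0, 1.0)
    replica_lat = np.arange(-90.0, 30.0, 0.5)
    lat_min = max(master_lat.min(), replica_lat.min(), -90.0)
    lat_max = min(master_lat.max(), replica_lat.max(), 90.0)
    print(lat_min, lat_max)  # -60.0 29.5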
def _lta_daily(ds: xr.Dataset, monitor: Monitor):
    """
    Carry out a long term average of a daily dataset

    :param ds: Dataset to aggregate
    :param monitor: Progress monitor
    :return: Aggregated dataset
    """
    time_min = pd.Timestamp(ds.time.values[0], tzinfo=timezone.utc)
    time_max = pd.Timestamp(ds.time.values[-1], tzinfo=timezone.utc)
    total_work = 100
    retset = ds

    with monitor.starting('LTA', total_work=total_work):
        monitor.progress(work=0)
        step = total_work / 366
        kwargs = {'monitor': monitor, 'step': step}
        retset = retset.groupby('time.month', squeeze=False).apply(_groupby_day,
                                                                   **kwargs)

    # Make the return dataset CF compliant
    retset = retset.stack(time=('month', 'day'))

    # Get rid of redundant dates
    drop = [(2, 29), (2, 30), (2, 31), (4, 31), (6, 31),
            (9, 31), (11, 31)]
    retset = retset.drop(drop, dim='time')

    # Turn month, day coordinates to time
    retset = retset.reset_index('time')
    retset = retset.drop(['month', 'day'])
    time_coord = pd.date_range(start='{}-01-01'.format(time_min.year),
                               end='{}-12-31'.format(time_min.year),
                               freq='D')
    if len(time_coord) == 366:
        time_coord = time_coord.drop(
            np.datetime64('{}-02-29'.format(time_min.year)))
    retset['time'] = time_coord

    climatology_bounds = xr.DataArray(data=np.tile([time_min, time_max],
                                                   (365, 1)),
                                      dims=['time', 'nv'],
                                      name='climatology_bounds')
    retset['climatology_bounds'] = climatology_bounds
    retset.time.attrs = ds.time.attrs
    retset.time.attrs['climatology'] = 'climatology_bounds'

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + ' time: mean over years'
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: mean over years'

    return retset
def update_indices(self, update_file_lists: bool = False,
                   monitor: Monitor = Monitor.NONE):
    with monitor.starting('Updating indices', 100):
        self._init_data_sources()
        monitor.progress(work=10 if update_file_lists else 100)
        if update_file_lists:
            child_monitor = monitor.child(work=90)
            with child_monitor.starting('Updating file lists',
                                        len(self._data_sources)):
                for data_source in self._data_sources:
                    data_source.update_file_list()
                    child_monitor.progress(work=1)
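# An illustrative, hypothetical stand-in for the progress-monitor contract the
# functions in this module rely on (the real Monitor API may differ in
# details): starting() opens a named task, progress() reports completed work,
# and child() hands a slice of the budget to a sub-task.
class _PrintMonitor:
    def __init__(self, depth=0):
        self._depth = depth
        self._worked = 0.0

    def starting(self, label, total_work):
        print('  ' * self._depth + 'started: %s (%s units)' % (label, total_work))
        return self  # usable as a context manager, see __enter__/__exit__

    observing = starting  # same contract, different label semantics

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        print('  ' * self._depth + 'done')
        return False

    def progress(self, work=None, msg=None):
        if work:
            self._worked += work
        if msg:
            print('  ' * self._depth + str(msg))

    def child(self, work):
        return _PrintMonitor(self._depth + 1)

    def is_cancelled(self):
        return False

# Example: no_op(num_steps=2, step_duration=0.0, monitor=_PrintMonitor())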
def reduce(ds: DatasetLike.TYPE,
           var: VarNamesLike.TYPE = None,
           dim: DimNamesLike.TYPE = None,
           method: str = 'mean',
           monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Reduce the given variables of the given dataset along the given
    dimensions. If no variables are given, all variables of the dataset
    will be reduced. If no dimensions are given, all dimensions will be
    reduced.

    :param ds: Dataset to reduce
    :param var: Variables in the dataset to reduce
    :param dim: Dataset dimensions along which to reduce
    :param method: reduction method
    :param monitor: A progress monitor
    """
    ufuncs = {'min': np.nanmin, 'max': np.nanmax, 'mean': np.nanmean,
              'median': np.nanmedian, 'sum': np.nansum}

    ds = DatasetLike.convert(ds)

    if not var:
        var = list(ds.data_vars.keys())
    var_names = VarNamesLike.convert(var)

    if not dim:
        dim = list(ds.coords.keys())
    else:
        dim = DimNamesLike.convert(dim)

    retset = ds.copy()

    for var_name in var_names:
        # Reduce only along dimensions this variable actually has
        intersection = [value for value in dim
                        if value in retset[var_name].dims]
        with monitor.starting("Reduce dataset", total_work=100):
            monitor.progress(5)
            with monitor.child(95).observing("Reduce"):
                retset[var_name] = retset[var_name].reduce(ufuncs[method],
                                                           dim=intersection,
                                                           keep_attrs=True)

    return retset
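# A minimal usage sketch (assumes reduce above is in scope, and that the
# *Like converters accept plain strings and lists): the per-point maximum
# over time of a small synthetic dataset.
def _example_reduce():
    import numpy as np
    import pandas as pd
    import xarray as xr
    ds = xr.Dataset({'sst': (('time', 'lat'), np.arange(6.0).reshape(3, 2))},
                    coords={'time': pd.date_range('2000-01-01', periods=3),
                            'lat': [0.0, 1.0]})
    out = reduce(ds, var='sst', dim='time', method='max')
    print(out['sst'].values)  # [4. 5.]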
def _fetch_solr_json(base_url, query_args, offset=0, limit=3500, timeout=10,
                     monitor: Monitor = Monitor.NONE):
    """
    Return JSON value read from paginated Solr web-service.
    """
    combined_json_dict = None
    num_found = -1
    # we don't know ahead of time how many requests are necessary
    with monitor.starting("Loading", 10):
        while True:
            monitor.progress(work=1)
            if monitor.is_cancelled():
                raise InterruptedError
            paging_query_args = dict(query_args or {})
            # noinspection PyArgumentList
            paging_query_args.update(offset=offset, limit=limit,
                                     format='application/solr+json')
            url = base_url + '?' + urllib.parse.urlencode(paging_query_args)
            try:
                with urllib.request.urlopen(url, timeout=timeout) as response:
                    json_text = response.read()
                    json_dict = json.loads(json_text.decode('utf-8'))
                    if num_found == -1:
                        num_found = json_dict.get('response', {}) \
                            .get('numFound', 0)
                    if not combined_json_dict:
                        combined_json_dict = json_dict
                        if num_found < limit:
                            break
                    else:
                        docs = json_dict.get('response', {}).get('docs', [])
                        combined_json_dict.get('response', {}) \
                            .get('docs', []).extend(docs)
                        if num_found < offset + limit:
                            break
            except (urllib.error.HTTPError, urllib.error.URLError) as e:
                raise DataAccessError(
                    "Downloading CCI Open Data Portal index failed: {}\n{}"
                    .format(e, base_url)) from e
            except socket.timeout:
                raise DataAccessError(
                    "Downloading CCI Open Data Portal index failed:"
                    " connection timeout\n{}".format(base_url))
            offset += limit
    return combined_json_dict
def add_local_data_source(self, data_source_id: str, file_path_pattern: str,
                          monitor: Monitor):
    """
    Adds a local data source made up of the specified files.

    :param data_source_id: The identifier of the local data source.
    :param file_path_pattern: The file path pattern, which may contain
           wildcards.
    :param monitor: a progress monitor.
    :return: JSON-serializable list of 'local' data sources, sorted by name.
    """
    data_store = DATA_STORE_REGISTRY.get_data_store('local')
    if data_store is None:
        raise ValueError('Unknown data store: "%s"' % 'local')
    with monitor.starting('Adding local data source', 100):
        # TODO use monitor while extracting metadata
        data_store.add_pattern(data_source_id=data_source_id,
                               files=file_path_pattern)
        return self.get_data_sources('local', monitor=monitor.child(100))
def _generic_index_calculation(ds: xr.Dataset,
                               var: VarName.TYPE,
                               region: PolygonLike.TYPE,
                               window: int,
                               file: str,
                               name: str,
                               threshold: float = None,
                               monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    A generic index calculation. Here an index is defined as an anomaly,
    relative to the given reference, of a moving average with the given
    window size, computed over the given region of the given variable of
    the given dataset.

    :param ds: Dataset from which to calculate the index
    :param var: Variable from which to calculate the index
    :param region: Spatial subset from which to calculate the index
    :param window: Window size for the moving average
    :param file: Path to the reference file
    :param threshold: Absolute threshold that indicates an ENSO event
    :param name: Name of the index
    :param monitor: a progress monitor.
    :return: A pandas DataFrame that contains the index timeseries
    """
    var = VarName.convert(var)
    region = PolygonLike.convert(region)

    with monitor.starting("Calculate the index", total_work=2):
        ds = select_var(ds, var)
        ds_subset = subset_spatial(ds, region)
        anom = anomaly_external(ds_subset, file, monitor=monitor.child(1))
        with monitor.child(1).observing("Calculate mean"):
            ts = anom.mean(dim=['lat', 'lon'])

    df = pd.DataFrame(data=ts[var].values, columns=[name],
                      index=ts.time.values)
    retval = df.rolling(window=window, center=True).mean().dropna()

    if threshold is None:
        return retval

    retval['El Nino'] = pd.Series((retval[name] > threshold),
                                  index=retval.index)
    retval['La Nina'] = pd.Series((retval[name] < -threshold),
                                  index=retval.index)

    return retval
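# A minimal sketch of the rolling-mean/threshold step above on synthetic data
# (no reference file needed): a centered moving average with flag columns for
# exceedances.
def _example_index_flags():
    import numpy as np
    import pandas as pd
    idx = pd.date_range('2000-01-01', periods=12, freq='MS')
    df = pd.DataFrame({'ENSO': np.sin(np.linspace(0, 3, 12))}, index=idx)
    smoothed = df.rolling(window=3, center=True).mean().dropna()
    smoothed['El Nino'] = smoothed['ENSO'] > 0.5
    smoothed['La Nina'] = smoothed['ENSO'] < -0.5
    print(smoothed.head())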
def tseries_mean(ds: xr.Dataset,
                 var: VarNamesLike.TYPE,
                 std_suffix: str = '_std',
                 calculate_std: bool = True,
                 monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Extract spatial mean timeseries of the provided variables. The returned
    dataset contains, in addition to all the information in the given
    dataset, the timeseries data for the provided variables: each variable
    is replaced by its spatial mean under the same name, and the
    accompanying standard deviations are stored under the variable name
    with *std_suffix* appended.

    If a data variable with more dimensions than time/lat/lon is provided,
    the data will be reduced by taking the mean of all data values at a
    single time position resulting in one dimensional timeseries data
    variable.

    :param ds: The dataset from which to perform timeseries extraction.
    :param var: Variables for which to perform timeseries extraction
    :param calculate_std: Whether to calculate std in addition to mean
    :param std_suffix: Std suffix to use for resulting datasets, if std is
           calculated.
    :param monitor: a progress monitor.
    :return: Dataset with timeseries variables
    """
    if not var:
        var = '*'

    retset = select_var(ds, var)
    names = retset.data_vars.keys()

    with monitor.starting("Calculate mean", total_work=len(names)):
        for name in names:
            dims = list(ds[name].dims)
            dims.remove('time')
            with monitor.child(1).observing("Calculate mean"):
                retset[name] = retset[name].mean(dim=dims, keep_attrs=True)
            retset[name].attrs['Cate_Description'] = \
                'Mean aggregated over {} at each point in time.'.format(dims)
            if calculate_std:
                std_name = name + std_suffix
                retset[std_name] = ds[name].std(dim=dims)
                retset[std_name].attrs['Cate_Description'] = \
                    'Accompanying std values for variable \'{}\''.format(name)

    return retset
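# A minimal usage sketch (assumes tseries_mean and select_var above are in
# scope): spatially average a small synthetic variable.
def _example_tseries_mean():
    import numpy as np
    import pandas as pd
    import xarray as xr
    ds = xr.Dataset(
        {'sst': (('time', 'lat', 'lon'), np.ones((3, 2, 2)))},
        coords={'time': pd.date_range('2000-01-01', periods=3),
                'lat': [0.0, 1.0], 'lon': [10.0, 11.0]})
    out = tseries_mean(ds, var='sst')
    print(out['sst'].values)      # [1. 1. 1.]
    print(out['sst_std'].values)  # [0. 0. 0.]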
def _lta_monthly(ds: xr.Dataset, monitor: Monitor):
    """
    Carry out a long term average on a monthly dataset

    :param ds: Dataset to aggregate
    :param monitor: Progress monitor
    :return: Aggregated dataset
    """
    time_min = pd.Timestamp(ds.time.values[0], tzinfo=timezone.utc)
    time_max = pd.Timestamp(ds.time.values[-1], tzinfo=timezone.utc)
    total_work = 100
    retset = ds

    with monitor.starting('LTA', total_work=total_work):
        monitor.progress(work=0)
        step = total_work / 12
        kwargs = {'monitor': monitor, 'step': step}
        retset = retset.groupby('time.month', squeeze=False).apply(_mean,
                                                                   **kwargs)

    # Make the return dataset CF compliant
    retset = retset.rename({'month': 'time'})
    retset['time'] = pd.date_range('{}-01-01'.format(time_min.year),
                                   freq='MS', periods=12)

    climatology_bounds = xr.DataArray(data=np.tile([time_min, time_max],
                                                   (12, 1)),
                                      dims=['time', 'nv'],
                                      name='climatology_bounds')
    retset['climatology_bounds'] = climatology_bounds
    retset.time.attrs = ds.time.attrs
    retset.time.attrs['climatology'] = 'climatology_bounds'

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + ' time: mean over years'
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: mean over years'

    return retset
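# A minimal sketch of the monthly long-term-average idea on synthetic data:
# group a multi-year monthly series by calendar month and average.
def _example_monthly_lta():
    import numpy as np
    import pandas as pd
    import xarray as xr
    time = pd.date_range('2000-01-01', periods=24, freq='MS')
    ds = xr.Dataset({'t2m': ('time', np.arange(24.0))}, coords={'time': time})
    clim = ds.groupby('time.month').mean('time')
    print(clim['t2m'].values)  # [6. 7. ... 17.]: each month averaged over both years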
def _make_local(self,
                local_ds: LocalDataSource,
                time_range: TimeRangeLike.TYPE = None,
                region: PolygonLike.TYPE = None,
                var_names: VarNamesLike.TYPE = None,
                monitor: Monitor = Monitor.NONE):
    # local_name = local_ds.name
    local_id = local_ds.name

    time_range = TimeRangeLike.convert(time_range) if time_range else None
    region = PolygonLike.convert(region) if region else None
    var_names = VarNamesLike.convert(var_names) if var_names else None  # type: Sequence

    compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL',
                                         NETCDF_COMPRESSION_LEVEL)
    compression_enabled = True if compression_level > 0 else False

    encoding_update = dict()
    if compression_enabled:
        encoding_update.update({'zlib': True, 'complevel': compression_level})

    if region or var_names:
        protocol = _ODP_PROTOCOL_OPENDAP
    else:
        protocol = _ODP_PROTOCOL_HTTP

    local_path = os.path.join(local_ds.data_store.data_store_path, local_id)
    if not os.path.exists(local_path):
        os.makedirs(local_path)

    selected_file_list = self._find_files(time_range)

    if protocol == _ODP_PROTOCOL_OPENDAP:
        files = self._get_urls_list(selected_file_list, protocol)
        monitor.start('Sync ' + self.name, total_work=len(files))
        for idx, dataset_uri in enumerate(files):
            child_monitor = monitor.child(work=1)
            file_name = os.path.basename(dataset_uri)
            local_filepath = os.path.join(local_path, file_name)
            time_coverage_start = selected_file_list[idx][1]
            time_coverage_end = selected_file_list[idx][2]
            remote_netcdf = None
            local_netcdf = None
            try:
                remote_netcdf = NetCDF4DataStore(dataset_uri)
                local_netcdf = NetCDF4DataStore(local_filepath, mode='w',
                                                persist=True)
                local_netcdf.set_attributes(remote_netcdf.get_attrs())
                remote_dataset = xr.Dataset.load_store(remote_netcdf)
                process_region = False
                if region:
                    geo_lat_min = self._get_harmonized_coordinate_value(
                        remote_dataset.attrs, 'geospatial_lat_min')
                    geo_lat_max = self._get_harmonized_coordinate_value(
                        remote_dataset.attrs, 'geospatial_lat_max')
                    geo_lon_min = self._get_harmonized_coordinate_value(
                        remote_dataset.attrs, 'geospatial_lon_min')
                    geo_lon_max = self._get_harmonized_coordinate_value(
                        remote_dataset.attrs, 'geospatial_lon_max')
                    geo_lat_res = self._get_harmonized_coordinate_value(
                        remote_dataset.attrs, 'geospatial_lat_resolution')
                    geo_lon_res = self._get_harmonized_coordinate_value(
                        remote_dataset.attrs, 'geospatial_lon_resolution')
                    if not (isnan(geo_lat_min) or isnan(geo_lat_max)
                            or isnan(geo_lon_min) or isnan(geo_lon_max)
                            or isnan(geo_lat_res) or isnan(geo_lon_res)):
                        process_region = True

                        [lat_min, lon_min, lat_max, lon_max] = region.bounds

                        lat_min = floor((lat_min - geo_lat_min) / geo_lat_res)
                        lat_max = ceil((lat_max - geo_lat_min) / geo_lat_res)
                        lon_min = floor((lon_min - geo_lon_min) / geo_lon_res)
                        lon_max = ceil((lon_max - geo_lon_min) / geo_lon_res)

                        # TODO (kbernat): check why dataset.sel fails!
                        remote_dataset = remote_dataset.isel(
                            drop=False,
                            lat=slice(lat_min, lat_max),
                            lon=slice(lon_min, lon_max))

                        geo_lat_max = lat_max * geo_lat_res + geo_lat_min
                        geo_lat_min += lat_min * geo_lat_res
                        geo_lon_max = lon_max * geo_lon_res + geo_lon_min
                        geo_lon_min += lon_min * geo_lon_res

                if not var_names:
                    var_names = [var_name for var_name
                                 in remote_netcdf.variables.keys()]
                var_names.extend([coord_name for coord_name
                                  in remote_dataset.coords.keys()
                                  if coord_name not in var_names])
                child_monitor.start(label=file_name, total_work=len(var_names))
                for sel_var_name in var_names:
                    var_dataset = remote_dataset.drop(
                        [var_name for var_name
                         in remote_dataset.variables.keys()
                         if var_name != sel_var_name])
                    if compression_enabled:
                        var_dataset.variables.get(sel_var_name) \
                            .encoding.update(encoding_update)
                    local_netcdf.store_dataset(var_dataset)
                    child_monitor.progress(work=1, msg=sel_var_name)
                if process_region:
                    local_netcdf.set_attribute('geospatial_lat_min',
                                               geo_lat_min)
                    local_netcdf.set_attribute('geospatial_lat_max',
                                               geo_lat_max)
                    local_netcdf.set_attribute('geospatial_lon_min',
                                               geo_lon_min)
                    local_netcdf.set_attribute('geospatial_lon_max',
                                               geo_lon_max)
            finally:
                if remote_netcdf:
                    remote_netcdf.close()
                if local_netcdf:
                    local_netcdf.close()

            local_ds.add_dataset(os.path.join(local_id, file_name),
                                 (time_coverage_start, time_coverage_end))
            child_monitor.done()
    else:
        outdated_file_list = []
        for file_rec in selected_file_list:
            filename, _, _, file_size, url = file_rec
            dataset_file = os.path.join(local_path, filename)
            # todo (forman, 20160915): must perform better checks on
            #  dataset_file if it is outdated or incomplete or corrupted.
            #  JSON also includes "checksum" and "checksum_type" fields.
            if not os.path.isfile(dataset_file) or \
                    (file_size and os.path.getsize(dataset_file) != file_size):
                outdated_file_list.append(file_rec)
        if outdated_file_list:
            with monitor.starting('Sync ' + self.name,
                                  len(outdated_file_list)):
                bytes_to_download = sum([file_rec[3] for file_rec
                                         in outdated_file_list])
                dl_stat = _DownloadStatistics(bytes_to_download)
                file_number = 1
                for filename, coverage_from, coverage_to, file_size, url \
                        in outdated_file_list:
                    if monitor.is_cancelled():
                        raise InterruptedError
                    dataset_file = os.path.join(local_path, filename)
                    sub_monitor = monitor.child(work=1.0)

                    # noinspection PyUnusedLocal
                    def reporthook(block_number, read_size, total_file_size):
                        dl_stat.handle_chunk(read_size)
                        if monitor.is_cancelled():
                            raise InterruptedError
                        sub_monitor.progress(work=read_size, msg=str(dl_stat))

                    sub_monitor_msg = "file %d of %d" % (
                        file_number, len(outdated_file_list))
                    with sub_monitor.starting(sub_monitor_msg, file_size):
                        urllib.request.urlretrieve(url[protocol],
                                                   filename=dataset_file,
                                                   reporthook=reporthook)
                    file_number += 1
                    local_ds.add_dataset(os.path.join(local_id, filename),
                                         (coverage_from, coverage_to))

    local_ds.save()
    monitor.done()
def temporal_agg(source: str,
                 start_date: str = None,
                 end_date: str = None,
                 var: VarNamesLike.TYPE = None,
                 level: str = 'mon',
                 method: str = 'mean',
                 save_data: bool = False,
                 monitor: Monitor = Monitor.NONE) -> (xr.Dataset, str):
    """
    Perform temporal aggregation of the given data source to the given level
    using the given method for the given time range. Only full time periods
    of the given time range will be aggregated.

    Depending on the given time range, data size, as well as internet
    connection quality, this operation can potentially take a very long time
    to finish.

    Careful consideration is needed in choosing the var parameter to create
    meaningful outputs. This is unique for each data source.

    The aggregation result is saved into the local data store for later reuse.

    :param source: Data source to aggregate
    :param start_date: Start date of aggregation. If not given, the data
           source start date is used instead
    :param end_date: End date of aggregation. If not given, the data source
           end date is used instead
    :param var: If given, only these dataset variables will be preserved in
           the result
    :param level: Aggregation level
    :param method: Aggregation method
    :param save_data: Whether to save data downloaded during this operation.
           This can potentially be a lot of data.
    :param monitor: A progress monitor to use
    :return: The local data source identifier for the aggregated data
    """
    # Raise not implemented, while not finished
    raise ValueError("Operation is not implemented.")

    var = VarNamesLike.convert(var)

    # Select the appropriate data source
    data_store_list = DATA_STORE_REGISTRY.get_data_stores()
    data_sources = query_data_sources(data_store_list, name=source)
    if len(data_sources) == 0:
        raise ValueError("No data_source found for the given query "
                         "term {}".format(source))
    elif len(data_sources) > 1:
        raise ValueError("{} data_sources found for the given query "
                         "term {}".format(data_sources, source))
    data_source = data_sources[0]
    source_info = data_source.cache_info

    # We have to do this to have temporal coverage info in meta_info
    data_source._init_file_list()

    # Check if the data source temporal resolution is known
    known_res = ('day', '8-days', 'mon', 'yr')

    fq = data_source.meta_info['time_frequency']
    if (not fq) or (fq not in known_res):
        raise ValueError("The given data source features unknown time "
                         "resolution: {}".format(fq))

    # Check if the operation supports the desired aggregation step
    valid_steps = list()
    valid_steps.append(('day', 'mon'))
    if (fq, level) not in valid_steps:
        raise ValueError("Currently the operation does not support aggregation"
                         " from {} to {}".format(fq, level))

    # Determine start and end dates
    if not start_date:
        start_date = data_source.meta_info['temporal_coverage_start']
    start_date = to_datetime(start_date)

    # If start_date is not the start of the month, move it to the 1st of the
    # next month
    if start_date.day != 1:
        try:
            start_date = datetime(start_date.year, start_date.month + 1, 1)
        except ValueError:
            # We have tried to set the month to 13
            start_date = datetime(start_date.year + 1, 1, 1)

    if not end_date:
        end_date = data_source.meta_info['temporal_coverage_end']
    end_date = to_datetime(end_date)

    # If end_date is not the end of the month, move it to the last day of the
    # previous month
    if not _is_end_of_month(end_date):
        try:
            end_date = datetime(end_date.year, end_date.month - 1, 27)
        except ValueError:
            # We have tried to set the month to 0
            end_date = datetime(end_date.year - 1, 12, 31)

    end_date = _end_of_month(end_date.year, end_date.month)

    # Determine the count of processing periods
    n_periods = (end_date.year - start_date.year + 1) * 12 \
        + end_date.month - start_date.month - 11
    # 2000-4-1, 2000-6-30 -> 12 + 2 - 11 = 3

    if n_periods < 1:
        raise ValueError("The given time range does not contain any full "
                         "calendar months to do aggregation with.")

    # Set up the monitor
    total_work = 100
    with monitor.starting('Aggregate', total_work=total_work):
        monitor.progress(work=0)
        step = total_work * 0.9 / n_periods

        # Process the data source period by period
        tmin = start_date
        while tmin < end_date:
            tmax = _end_of_month(tmin.year, tmin.month)

            # Determine if the data for the given period are already
            # downloaded. If at least one file of the given time range is
            # present, we don't delete the data for this period, we do the
            # syncing anyway.
            was_already_downloaded = False
            dt_range = to_datetime_range(tmin, tmax)
            for date in source_info:
                if dt_range[0] <= date <= dt_range[1]:
                    was_already_downloaded = True
                    # One is enough
                    break

            worked = monitor._worked
            data_source.sync(dt_range, monitor=monitor.child(work=step * 0.9))
            if worked == monitor._worked:
                monitor.progress(work=step * 0.9)

            ds = data_source.open_dataset(dt_range)

            # Filter the dataset
            ds = select_var(ds, var)

            # Do the aggregation

            # Save the dataset for this period into the local data store

            # Close and delete the files if needed
            ds.close()
            # Delete data for the current period, if it should be deleted and
            # it was not already downloaded.
            if (not save_data) and (not was_already_downloaded):
                data_source.delete_local(dt_range)

            monitor.progress(work=step * 0.1)

            # tmin for the next iteration
            try:
                tmin = datetime(tmin.year, tmin.month + 1, 1)
            except ValueError:
                # Couldn't add a month -> end of year
                tmin = datetime(tmin.year + 1, 1, 1)

        monitor.progress(work=step * 0.1)

    # Return the local data source id
    return None
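# An illustrative sketch of the month arithmetic used above (end_of_month is
# a hypothetical helper; the real _end_of_month may differ): the last day of
# a month and the count of full months between two dates.
def _example_month_arithmetic():
    import calendar
    from datetime import datetime

    def end_of_month(year, month):
        return datetime(year, month, calendar.monthrange(year, month)[1])

    start, end = datetime(2000, 4, 1), end_of_month(2000, 6)
    n_periods = (end.year - start.year + 1) * 12 + end.month - start.month - 11
    print(end, n_periods)  # 2000-06-30 00:00:00 3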
def long_term_average(source: str,
                      year_min: int,
                      year_max: int,
                      file: str,
                      var: VarNamesLike.TYPE = None,
                      save: bool = False,
                      monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform the long term monthly average of the given monthly or daily data
    source for the given range of years.

    Depending on the given year range, data size, as well as internet
    connection quality, this operation can potentially take a very long time
    to finish.

    Careful consideration is needed in choosing the var parameter to create
    meaningful outputs. This is unique for each data source.

    :param source: The data source from which to extract the monthly average
    :param year_min: The earliest year of the desired time range
    :param year_max: The most recent year of the desired time range
    :param file: filepath where to save the long term average dataset
    :param var: If given, only these variable names will be preserved in the
           output.
    :param save: If True, saves the data downloaded during this operation.
           This can potentially be a very large amount of data.
    :param monitor: A progress monitor to use
    :return: The Long Term Average dataset.
    """
    var = VarNamesLike.convert(var)

    n_years = year_max - year_min + 1
    res = 0
    total_work = 100

    # Select the appropriate data source
    data_store_list = DATA_STORE_REGISTRY.get_data_stores()
    data_sources = query_data_sources(data_store_list, name=source)
    if len(data_sources) == 0:
        raise ValueError("No data_source found for the given query "
                         "term {}".format(source))
    elif len(data_sources) > 1:
        raise ValueError("{} data_sources found for the given query "
                         "term {}".format(data_sources, source))

    data_source = data_sources[0]
    source_info = data_source.cache_info

    # Check if we have a monthly data source
    fq = data_source.meta_info['time_frequency']
    if fq != 'mon':
        raise ValueError("Only monthly datasets are supported for the"
                         " time being.")

    with monitor.starting('LTA', total_work=total_work):
        # Set up the monitor
        monitor.progress(work=0)
        step = total_work * 0.9 / n_years

        # Process the data source year by year
        year = year_min
        while year != year_max + 1:

            tmin = "{}-01-01".format(year)
            tmax = "{}-12-31".format(year)

            # Determine if the data for the given year are already downloaded.
            # If at least one file of the given time range is present, we
            # don't delete the data for this year, we do the syncing anyway.
            was_already_downloaded = False
            dt_range = to_datetime_range(tmin, tmax)
            for date in source_info:
                if dt_range[0] <= date <= dt_range[1]:
                    was_already_downloaded = True
                    # One is enough
                    break

            worked = monitor._worked
            data_source.sync(dt_range, monitor=monitor.child(work=step * 0.9))
            if worked == monitor._worked:
                monitor.progress(work=step * 0.9)

            ds = data_source.open_dataset(dt_range)

            # Filter the dataset
            ds = select_var(ds, var)

            try:
                if res == 0:
                    res = ds / n_years
                else:
                    # Xarray doesn't do automatic alignment for in-place
                    # operations, hence we have to do it manually
                    res = res + ds.reindex_like(res) / n_years
            except TypeError:
                raise TypeError('One or more data arrays feature a dtype that'
                                ' can not be divided. Consider using the var'
                                ' parameter to filter the dataset.')

            ds.close()
            # Delete data for the current year, if it should be deleted and
            # it was not already downloaded.
            if (not save) and (not was_already_downloaded):
                data_source.delete_local(dt_range)

            monitor.progress(work=step * 0.1)

            year = year + 1

        monitor.progress(msg='Saving the LTA dataset')
        save_dataset(res, file)
        monitor.progress(total_work * 0.1)

    return res
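# A minimal sketch of the incremental averaging used above: summing
# ds / n_years once per year yields the multi-year mean without holding all
# years in memory at once.
def _example_incremental_mean():
    import numpy as np
    years = [np.array([1.0, 2.0]), np.array([3.0, 4.0]), np.array([5.0, 6.0])]
    n = len(years)
    res = 0
    for ds in years:
        res = res + ds / n
    print(res)  # [3. 4.] == element-wise mean over the three "years"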
def plot_hovmoeller(ds: xr.Dataset,
                    var: VarName.TYPE = None,
                    x_axis: DimName.TYPE = None,
                    y_axis: DimName.TYPE = None,
                    method: str = 'mean',
                    contour: bool = True,
                    title: str = None,
                    file: str = None,
                    monitor: Monitor = Monitor.NONE,
                    **kwargs) -> Figure:
    """
    Create a Hovmoeller plot of the given dataset. Dimensions other than the
    ones defined as x and y axis will be aggregated using the given method to
    produce the plot.

    :param ds: Dataset to plot
    :param var: Name of the variable to plot
    :param x_axis: Dimension to show on the x axis
    :param y_axis: Dimension to show on the y axis
    :param method: Aggregation method
    :param contour: Whether to produce a contour plot
    :param title: Plot title
    :param file: path to a file in which to save the plot
    :param monitor: A progress monitor
    :param kwargs: Keyword arguments to pass to the underlying xarray
           plotting function
    """
    var_name = None
    if not var:
        for key in ds.data_vars.keys():
            var_name = key
            break
    else:
        var_name = VarName.convert(var)
    var = ds[var_name]

    if not x_axis:
        x_axis = var.dims[0]
    else:
        x_axis = DimName.convert(x_axis)

    if not y_axis:
        try:
            y_axis = var.dims[1]
        except IndexError:
            raise ValidationError('Given dataset variable should have at'
                                  ' least two dimensions.')
    else:
        y_axis = DimName.convert(y_axis)

    if x_axis == y_axis:
        raise ValidationError('Dimensions should differ between plot axes.')

    dims = list(var.dims)
    try:
        dims.remove(x_axis)
        dims.remove(y_axis)
    except ValueError:
        raise ValidationError('Given dataset variable: {} does not feature'
                              ' requested dimensions: {}, {}.'
                              .format(var_name, x_axis, y_axis))

    ufuncs = {'min': np.nanmin, 'max': np.nanmax, 'mean': np.nanmean,
              'median': np.nanmedian, 'sum': np.nansum}

    with monitor.starting("Plot Hovmoeller", total_work=100):
        monitor.progress(5)
        with monitor.child(90).observing("Aggregate"):
            var = var.reduce(ufuncs[method], dim=dims)
        monitor.progress(5)

    figure = plt.figure()
    ax = figure.add_subplot(111)
    if x_axis == 'time':
        figure.autofmt_xdate()

    if contour:
        var.plot.contourf(ax=ax, x=x_axis, y=y_axis, **kwargs)
    else:
        var.plot.pcolormesh(ax=ax, x=x_axis, y=y_axis, **kwargs)

    if title:
        ax.set_title(title)

    figure.tight_layout()

    if file:
        figure.savefig(file)

    return figure if not in_notebook() else None
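# A minimal usage sketch (assumes plot_hovmoeller above is in scope, and that
# VarName/DimName converters accept plain strings): time on the x axis,
# latitude on the y axis, longitudes averaged out.
def _example_plot_hovmoeller():
    import numpy as np
    import pandas as pd
    import xarray as xr
    ds = xr.Dataset(
        {'t2m': (('time', 'lat', 'lon'), np.random.rand(12, 10, 20))},
        coords={'time': pd.date_range('2000-01-01', periods=12, freq='MS'),
                'lat': np.linspace(-45, 45, 10),
                'lon': np.linspace(0, 355, 20)})
    plot_hovmoeller(ds, var='t2m', x_axis='time', y_axis='lat',
                    file='hovmoeller.png')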
def write_csv(obj: DataFrameLike.TYPE,
              file: FileLike.TYPE,
              columns: VarNamesLike.TYPE = None,
              na_rep: str = '',
              delimiter: str = ',',
              quotechar: str = None,
              more_args: DictLike.TYPE = None,
              monitor: Monitor = Monitor.NONE):
    """
    Write comma-separated values (CSV) to plain text file from a DataFrame or
    Dataset.

    :param obj: The object to write as CSV; must be a ``DataFrame`` or a
           ``Dataset``.
    :param file: The CSV file path.
    :param columns: The names of variables that should be converted to
           columns. If given, coordinate variables are included automatically.
    :param delimiter: Delimiter to use.
    :param na_rep: A string representation of a missing value (no-data value).
    :param quotechar: The character used to denote the start and end of a
           quoted item. Quoted items can include the delimiter and it will be
           ignored.
    :param more_args: Other optional keyword arguments.
           Please refer to Pandas documentation of ``pandas.to_csv()``
           function.
    :param monitor: optional progress monitor
    """
    if obj is None:
        raise ValidationError('obj must not be None')

    columns = VarNamesLike.convert(columns)

    if isinstance(obj, pd.DataFrame):
        # The following code is needed, because Pandas treats any kw given in
        # kwargs as being set, even if just None.
        kwargs = DictLike.convert(more_args)
        if kwargs is None:
            kwargs = {}
        if columns:
            kwargs.update(columns=columns)
        if delimiter:
            kwargs.update(sep=delimiter)
        if na_rep:
            kwargs.update(na_rep=na_rep)
        if quotechar:
            kwargs.update(quotechar=quotechar)
        with monitor.starting('Writing to CSV', 1):
            obj.to_csv(file, index_label='index', **kwargs)
            monitor.progress(1)
    elif isinstance(obj, xr.Dataset):
        var_names = [var_name for var_name in obj.data_vars
                     if columns is None or var_name in columns]
        dim_names = None
        data_vars = []
        for var_name in var_names:
            data_var = obj.data_vars[var_name]
            if dim_names is None:
                dim_names = data_var.dims
            elif dim_names != data_var.dims:
                raise ValidationError('Not all variables have the same'
                                      ' dimensions. Please select variables'
                                      ' so that their dimensions are equal.')
            data_vars.append(data_var)
        if dim_names is None:
            raise ValidationError('None of the selected variables has'
                                  ' a dimension.')

        coord_vars = []
        for dim_name in dim_names:
            if dim_name in obj.coords:
                coord_var = obj.coords[dim_name]
            else:
                coord_var = None
                for data_var in obj.coords.values():
                    if len(data_var.dims) == 1 \
                            and data_var.dims[0] == dim_name:
                        coord_var = data_var
                        break
                if coord_var is None:
                    raise ValueError(f'No coordinate variable found for'
                                     f' dimension "{dim_name}"')
            coord_vars.append(coord_var)

        coord_indexes = [range(len(coord_var)) for coord_var in coord_vars]
        num_coords = len(coord_vars)

        num_rows = 1
        for coord_var in coord_vars:
            num_rows *= len(coord_var)

        stream = open(file, 'w') if isinstance(file, str) else file
        try:
            # Write header row
            stream.write('index')
            for i in range(num_coords):
                stream.write(delimiter)
                stream.write(coord_vars[i].name)
            for data_var in data_vars:
                stream.write(delimiter)
                stream.write(data_var.name)
            stream.write('\n')

            with monitor.starting('Writing CSV', num_rows):
                row = 0
                for index in itertools.product(*coord_indexes):
                    # Write data row
                    stream.write(str(row))
                    for i in range(num_coords):
                        coord_value = coord_vars[i].values[index[i]]
                        stream.write(delimiter)
                        stream.write(str(coord_value))
                    for data_var in data_vars:
                        var_value = data_var.values[index]
                        stream.write(delimiter)
                        stream.write(str(var_value))
                    stream.write('\n')
                    monitor.progress(1)
                    row += 1
        finally:
            if isinstance(file, str):
                stream.close()
    else:
        raise ValidationError('obj must be a pandas.DataFrame or'
                              ' a xarray.Dataset')
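# A minimal usage sketch of the Dataset branch above (assumes write_csv is in
# scope, and that VarNamesLike.convert passes None through): one CSV row per
# coordinate combination.
def _example_write_csv():
    import io
    import numpy as np
    import pandas as pd
    import xarray as xr
    ds = xr.Dataset({'sst': (('time', 'lat'), np.arange(4.0).reshape(2, 2))},
                    coords={'time': pd.date_range('2000-01-01', periods=2),
                            'lat': [0.0, 1.0]})
    buf = io.StringIO()
    write_csv(ds, buf)
    print(buf.getvalue())  # header row, then 4 data rows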
def detect_outliers(ds: xr.Dataset,
                    var: VarNamesLike.TYPE,
                    threshold_low: float = 0.05,
                    threshold_high: float = 0.95,
                    quantiles: bool = True,
                    mask: bool = False,
                    monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Detect outliers in the given Dataset.

    When mask=True the input dataset should not contain nan values, otherwise
    all existing nan values will be marked as 'outliers' in the mask data
    array added to the output dataset.

    :param ds: The dataset or dataframe for which to do outlier detection
    :param var: Variable or variables in the dataset on which to do outlier
           detection. Note that when multiple variables are selected, absolute
           threshold values might not make much sense. Wild cards can be used
           to select multiple variables matching a pattern.
    :param threshold_low: Values less than or equal to this will be
           removed/masked
    :param threshold_high: Values greater than or equal to this will be
           removed/masked
    :param quantiles: If True, threshold values are treated as quantiles,
           otherwise as absolute values.
    :param mask: If True, an ancillary variable containing flag values for
           outliers will be added to the dataset. Otherwise, outliers will be
           replaced with nan directly in the data variables.
    :param monitor: A progress monitor.
    :return: The dataset with outliers masked or replaced with nan
    """
    ds = DatasetLike.convert(ds)

    # Create a list of variable names on which to perform outlier detection
    # based on the input comma-separated list that can contain wildcards
    var_patterns = VarNamesLike.convert(var)
    all_vars = list(ds.data_vars.keys())
    variables = list()
    for pattern in var_patterns:
        matches = fnmatch.filter(all_vars, pattern)
        variables = variables + matches

    # Detect outliers in each array of the dataset for which outlier
    # detection was requested
    ret_ds = ds.copy()
    with monitor.starting("detect_outliers", total_work=len(variables) * 3):
        for var_name in variables:
            if quantiles:
                # Get threshold values; use local names so the quantiles of
                # one variable don't overwrite the thresholds used for the
                # next variable
                with monitor.child(1).observing("quantile low"):
                    thresh_low = ret_ds[var_name].quantile(threshold_low)
                with monitor.child(1).observing("quantile high"):
                    thresh_high = ret_ds[var_name].quantile(threshold_high)
            else:
                thresh_low = threshold_low
                thresh_high = threshold_high
                monitor.progress(2)

            # If not mask, put nans in the data arrays for min/max outliers
            if not mask:
                arr = ret_ds[var_name]
                attrs = arr.attrs
                ret_ds[var_name] = arr.where((arr > thresh_low)
                                             & (arr < thresh_high))
                ret_ds[var_name].attrs = attrs
            else:
                # Create and add a data variable containing the mask for this
                # data variable
                _mask_outliers(ret_ds, var_name, thresh_low, thresh_high)

            monitor.progress(1)

    return ret_ds
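# A minimal usage sketch (assumes detect_outliers above is in scope, and that
# VarNamesLike accepts a plain variable name): replace the lowest and highest
# 10% of values with NaN.
def _example_detect_outliers():
    import numpy as np
    import xarray as xr
    ds = xr.Dataset({'sst': ('time', np.arange(10.0))})
    out = detect_outliers(ds, var='sst', threshold_low=0.1,
                          threshold_high=0.9, quantiles=True)
    print(out['sst'].values)  # 0 and 9 replaced with nan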
def _lta_general(ds: xr.Dataset, monitor: Monitor):
    """
    Try to carry out a long term average in a general case, notably in the
    case of having seasonal datasets

    :param ds: Dataset to aggregate
    :param monitor: Progress monitor
    :return: Aggregated dataset
    """
    time_min = pd.Timestamp(ds.time.values[0], tzinfo=timezone.utc)
    time_max = pd.Timestamp(ds.time.values[-1], tzinfo=timezone.utc)
    total_work = 100
    retset = ds

    # The dataset should feature time periods consistent over years
    # and denoted with the same dates each year
    if not _is_seasonal(ds.time):
        raise ValidationError("A long term average dataset can not be created"
                              " for a dataset with inconsistent seasons.")

    # Get the 'representative year'
    c = 0
    for group in ds.time.groupby('time.year'):
        c = c + 1
        if c == 1:
            rep_year = group[1].time
            continue
        if c == 2 and len(group[1].time) > len(rep_year):
            rep_year = group[1].time
            break

    with monitor.starting('LTA', total_work=total_work):
        monitor.progress(work=0)
        step = total_work / len(rep_year.time)
        kwargs = {'monitor': monitor, 'step': step}
        retset = retset.groupby('time.month', squeeze=False).apply(_groupby_day,
                                                                   **kwargs)

    # Make the return dataset CF compliant
    retset = retset.stack(time=('month', 'day'))

    # Turn month, day coordinates to time
    retset = retset.reset_index('time')
    retset = retset.drop(['month', 'day'])
    retset['time'] = rep_year.time

    climatology_bounds = xr.DataArray(data=np.tile([time_min, time_max],
                                                   (len(rep_year), 1)),
                                      dims=['time', 'nv'],
                                      name='climatology_bounds')
    retset['climatology_bounds'] = climatology_bounds
    retset.time.attrs = ds.time.attrs
    retset.time.attrs['climatology'] = 'climatology_bounds'

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + ' time: mean over years'
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: mean over years'

    return retset
def anomaly_external(ds: xr.Dataset,
                     file: str,
                     transform: str = None,
                     monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Calculate anomaly with external reference data, for example, a
    climatology. The given reference dataset is expected to consist of 12
    time slices, one for each month.

    The returned dataset will contain the variable names found in both the
    reference and the given dataset. Names found in the given dataset, but
    not in the reference, will be dropped from the resulting dataset. The
    calculated anomaly will be against the corresponding month of the
    reference data. E.g. January against January, etc.

    In case spatial extents differ between the reference and the given
    dataset, the anomaly will be calculated on the intersection.

    :param ds: The dataset to calculate anomalies from
    :param file: Path to reference data file
    :param transform: Apply the given transformation before calculating the
           anomaly. For supported operations see help on 'ds_arithmetics'
           operation.
    :param monitor: a progress monitor.
    :return: The anomaly dataset
    """
    # Check if the time coordinate is of dtype datetime
    try:
        if ds.time.dtype != 'datetime64[ns]':
            raise ValidationError('The dataset provided for anomaly'
                                  ' calculation is required to have a time'
                                  ' coordinate of dtype datetime64[ns].'
                                  ' Running the normalize operation on this'
                                  ' dataset might help.')
    except AttributeError:
        raise ValidationError('The dataset provided for anomaly calculation'
                              ' is required to have a time coordinate.')

    try:
        if ds.attrs['time_coverage_resolution'] != 'P1M':
            raise ValidationError('anomaly_external expects a monthly dataset'
                                  ' got: {} instead.'
                                  .format(ds.attrs['time_coverage_resolution']))
    except KeyError:
        try:
            ds = adjust_temporal_attrs(ds)
            if ds.attrs['time_coverage_resolution'] != 'P1M':
                raise ValidationError('anomaly_external expects a monthly'
                                      ' dataset got: {} instead.'
                                      .format(ds.attrs['time_coverage_resolution']))
        except KeyError:
            raise ValidationError('Could not determine the temporal'
                                  ' resolution of the given input dataset.')

    clim = xr.open_dataset(file)
    try:
        if len(clim.time) != 12:
            raise ValidationError('The reference dataset is expected to be a'
                                  ' monthly climatology. The provided dataset'
                                  ' has a time dimension with length: {}'
                                  .format(len(clim.time)))
    except AttributeError:
        raise ValidationError('The reference dataset is required to'
                              ' have a time coordinate.')

    ret = ds.copy()
    if transform:
        ret = ds_arithmetics(ds, transform)
    # Group by months, subtract the appropriate slice from the reference
    # Note that this requires that 'time' coordinate labels are of type
    # datetime64[ns]
    total_work = 100
    step = 100 / 12

    with monitor.starting('Anomaly', total_work=total_work):
        monitor.progress(work=0)
        kwargs = {'ref': clim, 'monitor': monitor, 'step': step}
        ret = ret.groupby(ds['time.month']).apply(_group_anomaly, **kwargs)

    # Running groupby results in a redundant 'month' variable being added to
    # the dataset
    ret = ret.drop('month')
    ret.attrs = ds.attrs
    # The dataset may be cropped
    return adjust_spatial_attrs(ret)
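# A minimal sketch of the month-wise anomaly idea above, using xarray's
# groupby arithmetic directly (no reference file needed): subtracting a
# per-month climatology from a monthly series.
def _example_monthly_anomaly():
    import numpy as np
    import pandas as pd
    import xarray as xr
    time = pd.date_range('2000-01-01', periods=24, freq='MS')
    ds = xr.Dataset({'t2m': ('time', np.arange(24.0))}, coords={'time': time})
    clim = ds.groupby('time.month').mean('time')
    anom = ds.groupby('time.month') - clim
    print(anom['t2m'].values)  # first year all -6, second year all +6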
def _lta_general(ds: xr.Dataset, monitor: Monitor):
    """
    Try to carry out a long term average in a general case, notably
    in the case of having seasonal datasets

    :param ds: Dataset to aggregate
    :param monitor: Progress monitor
    :return: Aggregated dataset
    """
    time_min = pd.Timestamp(ds.time.values[0], tzinfo=timezone.utc)
    time_max = pd.Timestamp(ds.time.values[-1], tzinfo=timezone.utc)
    total_work = 100
    retset = ds

    # The dataset should feature time periods consistent over years
    # and denoted with the same dates each year
    if not _is_seasonal(ds.time):
        raise ValidationError("A long term average dataset cannot be created for"
                              " a dataset with inconsistent seasons.")

    # Get a 'representative year': if the first year of the record is
    # incomplete, fall back to the second one
    rep_year = None
    c = 0
    for group in ds.time.groupby('time.year'):
        c = c + 1
        if c == 1:
            rep_year = group[1].time
            continue
        if c == 2 and len(group[1].time) > len(rep_year):
            rep_year = group[1].time
            break

    with monitor.starting('LTA', total_work=total_work):
        monitor.progress(work=0)
        step = total_work / len(rep_year.time)
        kwargs = {'monitor': monitor, 'step': step}
        retset = retset.groupby('time.month', squeeze=False).apply(_groupby_day,
                                                                   **kwargs)

    # Make the return dataset CF compliant
    retset = retset.stack(time=('month', 'day'))

    # Turn month, day coordinates to time
    retset = retset.reset_index('time')
    retset = retset.drop(['month', 'day'])
    retset['time'] = rep_year.time

    climatology_bounds = xr.DataArray(data=np.tile([time_min, time_max],
                                                   (len(rep_year), 1)),
                                      dims=['time', 'nv'],
                                      name='climatology_bounds')
    retset['climatology_bounds'] = climatology_bounds
    retset.time.attrs = ds.time.attrs
    retset.time.attrs['climatology'] = 'climatology_bounds'

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + ' time: mean over years'
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: mean over years'

    return retset
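# Input sketch (illustrative only): _lta_general() assumes a dataset whose
# time steps recur on the same dates each year, e.g. a seasonal record with
# time labels 2000-01-15, 2000-04-15, 2000-07-15, 2000-10-15, 2001-01-15, ...
# The helper _is_seasonal() (defined elsewhere in this module) is expected to
# verify exactly this property before the aggregation is attempted.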
def data_frame_find_closest(gdf: gpd.GeoDataFrame,
                            location: GeometryLike.TYPE,
                            max_results: int = 1,
                            max_dist: float = 180,
                            dist_col_name: str = 'distance',
                            monitor: Monitor = Monitor.NONE) -> gpd.GeoDataFrame:
    """
    Find the *max_results* records closest to given *location* in the given
    GeoDataFrame *gdf*. Return a new GeoDataFrame containing the closest records.

    If *dist_col_name* is given, store the actual distances in this column.

    Distances are great-circle distances measured in degrees from a
    representative center of the given *location* geometry to the representative
    centres of each geometry in the *gdf*.

    :param gdf: The GeoDataFrame.
    :param location: A location given as arbitrary geometry.
    :param max_results: Maximum number of results.
    :param max_dist: Ignore records whose distance is greater than this value
           in degrees.
    :param dist_col_name: Optional name of a new column that will store the
           actual distances.
    :param monitor: A progress monitor.
    :return: A new GeoDataFrame containing the closest records.
    """
    location = GeometryLike.convert(location)
    location_point = location.representative_point()

    target_crs = dict(init='epsg:4326')
    try:
        source_crs = gdf.crs or target_crs
    except AttributeError:
        source_crs = target_crs
    reprojection_func = _get_reprojection_func(source_crs, target_crs)

    try:
        geometries = gdf.geometry
    except AttributeError as e:
        raise ValidationError('Missing default geometry column in data frame.') from e

    num_rows = len(geometries)
    indexes = list()

    # PERF: Note, this operation may be optimized by computing the great-circle
    # distances using numpy array math!

    total_work = 100
    num_work_rows = 1 + num_rows // total_work
    with monitor.starting('Finding closest records', total_work):
        for i in range(num_rows):
            geometry = geometries.iloc[i]
            if geometry is not None:
                # noinspection PyBroadException
                try:
                    representative_point = geometry.representative_point()
                except BaseException:
                    # For some geometries shapely.representative_point() raises
                    # AttributeError or ValueError. E.g. features that span the
                    # poles will raise ValueError. The quick and dirty solution
                    # here is to catch such exceptions and ignore them.
                    representative_point = None
                if representative_point is not None:
                    representative_point = _transform_coordinates(representative_point,
                                                                  reprojection_func)
                    if representative_point is not None:
                        # noinspection PyTypeChecker
                        dist = great_circle_distance(location_point,
                                                     representative_point)
                        if dist <= max_dist:
                            indexes.append((i, dist))
            if i % num_work_rows == 0:
                monitor.progress(work=1)

    indexes = sorted(indexes, key=lambda item: item[1])
    if not indexes:
        # No record within max_dist: return an empty result rather than
        # letting the zip() below fail on an empty sequence
        return gdf.iloc[0:0]
    num_results = min(max_results, len(indexes))
    indexes, distances = zip(*indexes[0:num_results])

    new_gdf = gdf.iloc[list(indexes)]
    if not isinstance(new_gdf, gpd.GeoDataFrame):
        new_gdf = gpd.GeoDataFrame(new_gdf, crs=source_crs)

    if dist_col_name:
        new_gdf[dist_col_name] = np.array(distances)

    return new_gdf
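# Usage sketch (illustrative only): `cities_gdf` is a hypothetical
# GeoDataFrame. Assuming GeometryLike accepts WKT, the five records closest
# to 10E/53N and no more than 5 degrees away could be retrieved with
#
#     closest = data_frame_find_closest(cities_gdf, 'POINT (10 53)',
#                                       max_results=5, max_dist=5.0)
#
# The returned frame carries the great-circle distances in its
# 'distance' column.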
def _pearsonr(x: xr.DataArray, y: xr.DataArray, monitor: Monitor) -> xr.Dataset:
    """
    Calculate Pearson correlation coefficients and p-values for testing
    non-correlation of lon/lat/time xarray datasets for each lon/lat point.

    Heavily influenced by scipy.stats.pearsonr

    The Pearson correlation coefficient measures the linear relationship
    between two datasets. Strictly speaking, Pearson's correlation requires
    that each dataset be normally distributed, and not necessarily zero-mean.
    Like other correlation coefficients, this one varies between -1 and +1
    with 0 implying no correlation. Correlations of -1 or +1 imply an exact
    linear relationship. Positive correlations imply that as x increases, so
    does y. Negative correlations imply that as x increases, y decreases.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Pearson correlation at least as extreme as
    the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    :param x: lon/lat/time xr.DataArray
    :param y: xr.DataArray of the same spatiotemporal extents and resolution as x.
    :param monitor: Monitor to use for monitoring the calculation
    :return: A dataset containing the correlation coefficients and p_values on
             the lon/lat grid of x and y.

    References
    ----------
    http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation
    """
    with monitor.starting("Calculate Pearson correlation", total_work=6):
        n = len(x['time'])

        xm, ym = x - x.mean(dim='time'), y - y.mean(dim='time')
        xm['time'] = [i for i in range(0, len(xm.time))]
        ym['time'] = [i for i in range(0, len(ym.time))]
        xm_ym = xm * ym
        r_num = xm_ym.sum(dim='time')
        xm_squared = np.square(xm)
        ym_squared = np.square(ym)
        r_den = np.sqrt(xm_squared.sum(dim='time') *
                        ym_squared.sum(dim='time'))
        r_den = r_den.where(r_den != 0)
        r = r_num / r_den

        # Presumably, if abs(r) > 1, then it is only some small artifact of
        # floating point arithmetic.
        # At this point r should be a lon/lat dataArray, so it should be safe
        # to load it in memory explicitly. This may take time as it will
        # kick-start deferred processing.
        # Comparing with NaN produces warnings that can be safely ignored
        default_warning_settings = np.seterr(invalid='ignore')
        with monitor.child(1).observing("task 1"):
            negative_r = r.values < -1.0
        with monitor.child(1).observing("task 2"):
            r.values[negative_r] = -1.0
        with monitor.child(1).observing("task 3"):
            positive_r = r.values > 1.0
        with monitor.child(1).observing("task 4"):
            r.values[positive_r] = 1.0
        np.seterr(**default_warning_settings)
        r.attrs = {'description': 'Correlation coefficients between'
                   ' {} and {}.'.format(x.name, y.name)}

        df = n - 2
        t_squared = np.square(r) * (df / ((1.0 - r.where(r != 1)) *
                                          (1.0 + r.where(r != -1))))
        prob = df / (df + t_squared)
        with monitor.child(1).observing("task 5"):
            prob_values_in = prob.values
        with monitor.child(1).observing("task 6"):
            prob.values = betainc(0.5 * df, 0.5, prob_values_in)
        prob.attrs = {'description': 'Rough indicator of probability of an'
                      ' uncorrelated system producing datasets that have a Pearson'
                      ' correlation at least as extreme as the one computed from'
                      ' these datasets. Not entirely reliable, but reasonable for'
                      ' datasets larger than 500 or so.'}

    retset = xr.Dataset({'corr_coef': r, 'p_value': prob})
    return retset
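# Math note (for reference): the p-value computation above mirrors
# scipy.stats.pearsonr. With df = n - 2 degrees of freedom and
# t^2 = r^2 * df / (1 - r^2), the two-sided p-value is
# p = I_x(df/2, 1/2) with x = df / (df + t^2), where I_x is the regularized
# incomplete beta function evaluated by scipy.special.betainc.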
def animate_map(ds: xr.Dataset,
                var: VarName.TYPE = None,
                animate_dim: str = 'time',
                interval: int = 200,
                true_range: bool = False,
                indexers: DictLike.TYPE = None,
                region: PolygonLike.TYPE = None,
                projection: str = 'PlateCarree',
                central_lon: float = 0.0,
                title: str = None,
                contour_plot: bool = False,
                cmap_params: DictLike.TYPE = None,
                plot_properties: DictLike.TYPE = None,
                file: str = None,
                monitor: Monitor = Monitor.NONE) -> HTML:
    """
    Create a geographic map animation for the variable given by dataset *ds*
    and variable name *var*.

    Creates an animation of the given variable from the given dataset on a map
    with coastal lines. In case no variable name is given, the first
    encountered variable in the dataset is animated.
    It is also possible to set extents of the animation. If no extents
    are given, a global animation is created.

    The following file formats for saving the animation are supported: html

    :param ds: the dataset containing the variable to animate
    :param var: the variable's name
    :param animate_dim: Dimension to animate, if none given defaults to time.
    :param interval: Delay between frames in milliseconds. Defaults to 200.
    :param true_range: If True, calculates colormap and colorbar configuration
           parameters from the whole dataset. Can potentially take a lot of
           time. Defaults to False, in which case the colormap is calculated
           from the first frame.
    :param indexers: Optional indexers into data array of *var*. The *indexers*
           is a dictionary or a comma-separated string of key-value pairs
           that maps the variable's dimension names to constant labels.
           e.g. "layer=4".
    :param region: Region to animate
    :param projection: name of a global projection, see
           http://scitools.org.uk/cartopy/docs/v0.15/crs/projections.html
    :param central_lon: central longitude of the projection in degrees
    :param title: an optional title
    :param contour_plot: If true plot a filled contour plot of data, otherwise
           plots a pixelated colormesh
    :param cmap_params: optional additional colormap configuration parameters,
           e.g. "vmax=300, cmap='magma'"
           For full reference refer to
           http://xarray.pydata.org/en/stable/generated/xarray.plot.contourf.html
    :param plot_properties: optional plot properties for Python matplotlib,
           e.g. "bins=512, range=(-1.5, +1.5)"
           For full reference refer to
           https://matplotlib.org/api/lines_api.html and
           https://matplotlib.org/api/_as_gen/matplotlib.axes.Axes.contourf.html
    :param file: path to a file in which to save the animation
    :param monitor: A progress monitor.
    :return: An animation in HTML format
    """
    if not isinstance(ds, xr.Dataset):
        raise NotImplementedError('Only gridded datasets are currently supported')

    var_name = None
    if not var:
        for key in ds.data_vars.keys():
            var_name = key
            break
    else:
        var_name = VarName.convert(var)

    try:
        var = ds[var_name]
    except KeyError:
        raise ValidationError('Provided variable name "{}" does not exist'
                              ' in the given dataset'.format(var_name))

    indexers = DictLike.convert(indexers) or {}
    properties = DictLike.convert(plot_properties) or {}
    cmap_params = DictLike.convert(cmap_params) or {}

    extents = None
    bounds = handle_plot_polygon(region)
    if bounds:
        lon_min, lat_min, lon_max, lat_max = bounds
        extents = [lon_min, lon_max, lat_min, lat_max]

    if len(ds.lat) < 2 or len(ds.lon) < 2:
        # Matplotlib cannot plot datasets with less than these dimensions with
        # contourf and pcolormesh methods
        raise ValidationError('The minimum dataset spatial dimensions to create'
                              ' a map plot are (2,2)')

    # See http://scitools.org.uk/cartopy/docs/v0.15/crs/projections.html#
    if projection == 'PlateCarree':
        proj = ccrs.PlateCarree(central_longitude=central_lon)
    elif projection == 'LambertCylindrical':
        proj = ccrs.LambertCylindrical(central_longitude=central_lon)
    elif projection == 'Mercator':
        proj = ccrs.Mercator(central_longitude=central_lon)
    elif projection == 'Miller':
        proj = ccrs.Miller(central_longitude=central_lon)
    elif projection == 'Mollweide':
        proj = ccrs.Mollweide(central_longitude=central_lon)
    elif projection == 'Orthographic':
        proj = ccrs.Orthographic(central_longitude=central_lon)
    elif projection == 'Robinson':
        proj = ccrs.Robinson(central_longitude=central_lon)
    elif projection == 'Sinusoidal':
        proj = ccrs.Sinusoidal(central_longitude=central_lon)
    elif projection == 'NorthPolarStereo':
        proj = ccrs.NorthPolarStereo(central_longitude=central_lon)
    elif projection == 'SouthPolarStereo':
        proj = ccrs.SouthPolarStereo(central_longitude=central_lon)
    else:
        raise ValidationError('illegal projection: "%s"' % projection)

    figure = plt.figure(figsize=(8, 4))
    ax = plt.axes(projection=proj)
    if extents:
        ax.set_extent(extents, ccrs.PlateCarree())
    else:
        ax.set_global()

    ax.coastlines()

    if not animate_dim:
        animate_dim = 'time'

    indexers[animate_dim] = var[animate_dim][0]

    var_data = get_var_data(var, indexers, remaining_dims=('lon', 'lat'))

    with monitor.starting("animate", len(var[animate_dim]) + 3):
        if true_range:
            data_min, data_max = _get_min_max(var, monitor=monitor)
        else:
            data_min, data_max = _get_min_max(var_data, monitor=monitor)

        cmap_params = determine_cmap_params(data_min, data_max, **cmap_params)
        plot_kwargs = {**properties, **cmap_params}

        # Plot the first frame to set up the axes with the colorbar properly.
        # The transform keyword is for the coordinate system our data is in,
        # which in case of a 'normal' lat/lon dataset is PlateCarree.
        if contour_plot:
            var_data.plot.contourf(ax=ax, transform=ccrs.PlateCarree(),
                                   subplot_kws={'projection': proj},
                                   add_colorbar=True, **plot_kwargs)
        else:
            var_data.plot.pcolormesh(ax=ax, transform=ccrs.PlateCarree(),
                                     subplot_kws={'projection': proj},
                                     add_colorbar=True, **plot_kwargs)

        if title:
            ax.set_title(title)

        figure.tight_layout()
        monitor.progress(1)

        def run(value):
            ax.clear()
            if extents:
                ax.set_extent(extents, ccrs.PlateCarree())
            else:
                ax.set_global()
            ax.coastlines()
            indexers[animate_dim] = value
            var_data = get_var_data(var, indexers, remaining_dims=('lon', 'lat'))
            # Respect the requested plot type for every frame, not just the
            # first one
            if contour_plot:
                var_data.plot.contourf(ax=ax, transform=ccrs.PlateCarree(),
                                       subplot_kws={'projection': proj},
                                       add_colorbar=False, **plot_kwargs)
            else:
                var_data.plot.pcolormesh(ax=ax, transform=ccrs.PlateCarree(),
                                         subplot_kws={'projection': proj},
                                         add_colorbar=False, **plot_kwargs)
            if title:
                ax.set_title(title)
            monitor.progress(1)
            return ax

        anim = animation.FuncAnimation(figure, run, [i for i in var[animate_dim]],
                                       interval=interval, blit=False, repeat=False)
        anim_html = anim.to_jshtml()

        # Prevent the animation from running after it's finished
        del anim

        # Delete the rogue temp-file
        try:
            os.remove('None0000000.png')
        except FileNotFoundError:
            pass

        if file:
            with open(file, 'w') as outfile:
                outfile.write(anim_html)
                monitor.progress(1)

    return HTML(anim_html)
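# Usage sketch (illustrative only): `sst_ds` is a hypothetical lat/lon/time
# dataset and the region string assumes PolygonLike accepts
# "lon_min, lat_min, lon_max, lat_max". An animation saved to an HTML file
# might then look like
#
#     html = animate_map(sst_ds, var='analysed_sst',
#                        region='-20, 30, 60, 80',
#                        projection='NorthPolarStereo',
#                        file='sst_animation.html')
#
# In a Jupyter notebook the returned HTML object renders the animation inline.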
def _make_local(self,
                local_ds: LocalDataSource,
                time_range: TimeRangeLike.TYPE = None,
                region: PolygonLike.TYPE = None,
                var_names: VarNamesLike.TYPE = None,
                monitor: Monitor = Monitor.NONE):
    """
    Make a local copy of this CCI Open Data Portal data source in the given
    local data store, either by subsetting via OPeNDAP (when a region or
    variable selection is requested) or by downloading the source files
    via HTTP.
    """
    local_id = local_ds.id
    time_range = TimeRangeLike.convert(time_range)
    region = PolygonLike.convert(region)
    var_names = VarNamesLike.convert(var_names)

    time_range, region, var_names = self._apply_make_local_fixes(
        time_range, region, var_names)

    compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL',
                                         NETCDF_COMPRESSION_LEVEL)
    compression_enabled = True if compression_level > 0 else False

    do_update_of_verified_time_coverage_start_once = True
    verified_time_coverage_start = None
    verified_time_coverage_end = None

    encoding_update = dict()
    if compression_enabled:
        encoding_update.update({'zlib': True, 'complevel': compression_level})

    if region or var_names:
        protocol = _ODP_PROTOCOL_OPENDAP
    else:
        protocol = _ODP_PROTOCOL_HTTP

    local_path = os.path.join(local_ds.data_store.data_store_path, local_id)
    if not os.path.exists(local_path):
        os.makedirs(local_path)

    selected_file_list = self._find_files(time_range)
    if not selected_file_list:
        msg = 'CCI Open Data Portal data source "{}"\ndoes not seem to have any datasets'.format(self.id)
        if time_range is not None:
            msg += ' in given time range {}'.format(TimeRangeLike.format(time_range))
        raise DataAccessError(msg)

    try:
        if protocol == _ODP_PROTOCOL_OPENDAP:
            do_update_of_variables_meta_info_once = True
            do_update_of_region_meta_info_once = True

            files = self._get_urls_list(selected_file_list, protocol)
            monitor.start('Sync ' + self.id, total_work=len(files))
            for idx, dataset_uri in enumerate(files):
                child_monitor = monitor.child(work=1)

                file_name = os.path.basename(dataset_uri)
                local_filepath = os.path.join(local_path, file_name)

                time_coverage_start = selected_file_list[idx][1]
                time_coverage_end = selected_file_list[idx][2]

                try:
                    child_monitor.start(label=file_name, total_work=1)

                    remote_dataset = xr.open_dataset(dataset_uri)
                    if var_names:
                        # Drop remote variables that were not requested
                        remote_dataset = remote_dataset.drop(
                            [var_name for var_name in remote_dataset.data_vars.keys()
                             if var_name not in var_names])

                    if region:
                        remote_dataset = normalize_impl(remote_dataset)
                        remote_dataset = subset_spatial_impl(remote_dataset, region)
                        geo_lon_min, geo_lat_min, geo_lon_max, geo_lat_max = region.bounds

                        remote_dataset.attrs['geospatial_lat_min'] = geo_lat_min
                        remote_dataset.attrs['geospatial_lat_max'] = geo_lat_max
                        remote_dataset.attrs['geospatial_lon_min'] = geo_lon_min
                        remote_dataset.attrs['geospatial_lon_max'] = geo_lon_max
                        if do_update_of_region_meta_info_once:
                            local_ds.meta_info['bbox_maxx'] = geo_lon_max
                            local_ds.meta_info['bbox_minx'] = geo_lon_min
                            local_ds.meta_info['bbox_maxy'] = geo_lat_max
                            local_ds.meta_info['bbox_miny'] = geo_lat_min
                            do_update_of_region_meta_info_once = False

                    if compression_enabled:
                        for sel_var_name in remote_dataset.variables.keys():
                            remote_dataset.variables.get(sel_var_name) \
                                .encoding.update(encoding_update)

                    remote_dataset.to_netcdf(local_filepath)

                    child_monitor.progress(work=1, msg=str(time_coverage_start))
                finally:
                    if do_update_of_variables_meta_info_once:
                        variables_info = local_ds.meta_info.get('variables', [])
                        local_ds.meta_info['variables'] = \
                            [var_info for var_info in variables_info
                             if var_info.get('name') in remote_dataset.variables.keys()
                             and var_info.get('name') not in remote_dataset.dims.keys()]
                        do_update_of_variables_meta_info_once = False

                    local_ds.add_dataset(os.path.join(local_id, file_name),
                                         (time_coverage_start, time_coverage_end))

                    if do_update_of_verified_time_coverage_start_once:
                        verified_time_coverage_start = time_coverage_start
                        do_update_of_verified_time_coverage_start_once = False
                    verified_time_coverage_end = time_coverage_end
                    child_monitor.done()
        else:
            outdated_file_list = []
            for file_rec in selected_file_list:
                filename, _, _, file_size, url = file_rec
                dataset_file = os.path.join(local_path, filename)
                # todo (forman, 20160915): must perform better checks on dataset_file if it is...
                # ... outdated or incomplete or corrupted.
                # JSON also includes "checksum" and "checksum_type" fields.
                if not os.path.isfile(dataset_file) or \
                        (file_size and os.path.getsize(dataset_file) != file_size):
                    outdated_file_list.append(file_rec)

            if outdated_file_list:
                with monitor.starting('Sync ' + self.id, len(outdated_file_list)):
                    bytes_to_download = sum([file_rec[3] for file_rec in outdated_file_list])
                    dl_stat = _DownloadStatistics(bytes_to_download)

                    file_number = 1

                    for filename, coverage_from, coverage_to, file_size, url in outdated_file_list:
                        dataset_file = os.path.join(local_path, filename)
                        sub_monitor = monitor.child(work=1.0)

                        # noinspection PyUnusedLocal
                        def reporthook(block_number, read_size, total_file_size):
                            dl_stat.handle_chunk(read_size)
                            sub_monitor.progress(work=read_size, msg=str(dl_stat))

                        sub_monitor_msg = "file %d of %d" % (file_number,
                                                             len(outdated_file_list))
                        with sub_monitor.starting(sub_monitor_msg, file_size):
                            urllib.request.urlretrieve(url[protocol],
                                                       filename=dataset_file,
                                                       reporthook=reporthook)
                        file_number += 1
                        local_ds.add_dataset(os.path.join(local_id, filename),
                                             (coverage_from, coverage_to))

                        if do_update_of_verified_time_coverage_start_once:
                            verified_time_coverage_start = coverage_from
                            do_update_of_verified_time_coverage_start_once = False
                        verified_time_coverage_end = coverage_to
    except OSError as e:
        raise DataAccessError("Copying remote data source failed: {}".format(e),
                              source=self) from e

    local_ds.meta_info['temporal_coverage_start'] = TimeLike.format(
        verified_time_coverage_start)
    local_ds.meta_info['temporal_coverage_end'] = TimeLike.format(
        verified_time_coverage_end)
    local_ds.save(True)
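# Configuration note (sketch of the behaviour above): the NetCDF compression
# applied when writing local copies is driven by the
# 'NETCDF_COMPRESSION_LEVEL' configuration value. For example, a level of 9
# results in every variable being written with the
# encoding {'zlib': True, 'complevel': 9}, while a level of 0 disables
# compression entirely.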
def data_frame_aggregate(df: DataFrameLike.TYPE,
                         var_names: VarNamesLike.TYPE = None,
                         aggregate_geometry: bool = False,
                         monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Aggregate columns into count, mean, median, sum, std, min, and max. Return
    a new (Geo)DataFrame with a single row containing all aggregated values.
    Specify whether the geometries of the GeoDataFrame are to be aggregated.
    All geometries are merged union-like.

    The return data type will always be the same as the input data type.

    :param df: The (Geo)DataFrame to be analysed
    :param var_names: Variables to be aggregated ('None' uses all aggregatable
           columns)
    :param aggregate_geometry: Aggregate (union like) the geometry and add it
           to the resulting GeoDataFrame
    :param monitor: Monitor for progress bar
    :return: returns either DataFrame or GeoDataFrame. Keeps input data type
    """
    vns = VarNamesLike.convert(var_names)

    df_is_geo = isinstance(df, gpd.GeoDataFrame)
    aggregations = ["count", "mean", "median", "sum", "std", "min", "max"]

    # Check var names integrity (aggregatable, exists in data frame)
    types_accepted_for_agg = ['float64', 'int64', 'bool']
    agg_columns = list(df.select_dtypes(include=types_accepted_for_agg).columns)

    if df_is_geo:
        agg_columns.append('geometry')

    columns = list(df.columns)

    if vns is None:
        vns = agg_columns

    diff = list(set(vns) - set(columns))
    if len(diff) > 0:
        raise ValidationError('Variable(s) ' + ','.join(diff) + ' not in data frame!')

    diff = list(set(vns) - set(agg_columns))
    if len(diff) > 0:
        raise ValidationError('Variable(s) ' + ','.join(diff) + ' not aggregatable!')

    # Only a GeoDataFrame is required to carry its default 'geometry' column;
    # plain DataFrames must not fail this check
    if df_is_geo:
        try:
            df['geometry']
        except KeyError as e:
            raise ValidationError('Variable geometry not in GEO data frame!') from e

    # Aggregate columns; vns is guaranteed to be set at this point
    df_buff = df[vns].select_dtypes(include=types_accepted_for_agg).agg(aggregations)

    res = {}
    for n in df_buff.columns:
        for a in aggregations:
            val = df_buff[n][a]
            h = n + '_' + a
            res[h] = [val]

    df_agg = pd.DataFrame(res)

    # Aggregate (union) geometry if GeoDataFrame
    if df_is_geo and aggregate_geometry:
        total_work = 100
        num_work_rows = 1 + len(df) // total_work
        with monitor.starting('Aggregating geometry: ', total_work):
            multi_polygon = shapely.geometry.MultiPolygon()
            i = 0
            for rec in df.geometry:
                if monitor.is_cancelled():
                    break
                # noinspection PyBroadException
                try:
                    multi_polygon = multi_polygon.union(other=rec)
                except Exception:
                    pass
                if i % num_work_rows == 0:
                    monitor.progress(work=1)
                i += 1

        df_agg = gpd.GeoDataFrame(df_agg, geometry=[multi_polygon], crs=df.crs)

    return df_agg
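# Usage sketch (illustrative only): `countries_gdf` is a hypothetical
# GeoDataFrame with numeric 'population' and 'area' columns. Aggregating
# these columns and merging all geometries into a single one could be done
# with
#
#     agg = data_frame_aggregate(countries_gdf,
#                                var_names=['population', 'area'],
#                                aggregate_geometry=True)
#
# The result is a one-row GeoDataFrame with columns such as
# 'population_mean' or 'area_max' and a unioned geometry.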
def long_term_average(ds: DatasetLike.TYPE,
                      var: VarNamesLike.TYPE = None,
                      monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform long term average of the given dataset by doing a mean of monthly
    values over the time range covered by the dataset. E.g. it averages all
    January values, all February values, etc, to create a dataset with twelve
    time slices each containing a mean of respective monthly values.

    For further information on climatological datasets, see
    http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#climatological-statistics

    :param ds: A monthly dataset to average
    :param var: If given, only these variables will be preserved in the
           resulting dataset
    :param monitor: A progress monitor
    :return: A climatological long term average dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValidationError('Long term average operation expects a dataset'
                              ' with the time coordinate of type'
                              ' datetime64[ns], but received {}. Running the'
                              ' normalize operation on this dataset may'
                              ' help'.format(ds.time.dtype))

    # Check if we have a monthly dataset
    try:
        if ds.attrs['time_coverage_resolution'] != 'P1M':
            raise ValidationError('Long term average operation expects a'
                                  ' monthly dataset; running temporal'
                                  ' aggregation on this dataset beforehand'
                                  ' may help.')
    except KeyError:
        raise ValidationError('Could not determine temporal resolution. Running'
                              ' the adjust_temporal_attrs operation beforehand'
                              ' may help.')

    var = VarNamesLike.convert(var)
    # Shallow copy
    retset = ds.copy()
    if var:
        retset = select_var(retset, var)

    time_min = pd.Timestamp(ds.time.values[0])
    time_max = pd.Timestamp(ds.time.values[-1])

    total_work = 100

    with monitor.starting('LTA', total_work=total_work):
        monitor.progress(work=0)
        step = total_work / 12
        kwargs = {'monitor': monitor, 'step': step}
        retset = retset.groupby('time.month', squeeze=False).apply(_mean, **kwargs)

    # Make the return dataset CF compliant
    retset = retset.rename({'month': 'time'})
    retset['time'] = pd.date_range('{}-01-01'.format(time_min.year),
                                   freq='MS', periods=12)

    climatology_bounds = xr.DataArray(data=np.tile([time_min, time_max],
                                                   (12, 1)),
                                      dims=['time', 'nv'],
                                      name='climatology_bounds')
    retset['climatology_bounds'] = climatology_bounds
    retset.time.attrs = ds.time.attrs
    retset.time.attrs['climatology'] = 'climatology_bounds'

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + ' time: mean over years'
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: mean over years'

    return retset
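# Usage sketch (illustrative only): `t2m_ds` is a hypothetical monthly
# dataset with a 't2m' variable. Its climatological long term average could
# be computed with
#
#     lta = long_term_average(t2m_ds, var='t2m')
#
# yielding twelve time slices (one per calendar month) plus a
# 'climatology_bounds' variable that records the averaged period, following
# the CF climatological-statistics conventions referenced in the docstring.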