def test_filter_ungriddeddata(aeronetsunv3lev2_subset, filter_name, num_sites):
    """Filtering ungridded obs data yields the expected number of stations."""
    region_filter = Filter(filter_name)
    filtered = region_filter.apply(aeronetsunv3lev2_subset)
    assert len(filtered.unique_station_names) == num_sites
def test_filter_griddeddata(data_tm5, filter_name, mean):
    """Filtering gridded model data yields the expected domain mean."""
    # work on a copy so the shared fixture is not mutated for other tests
    region_filter = Filter(filter_name)
    filtered = region_filter.apply(data_tm5.copy())
    np.testing.assert_allclose(np.nanmean(filtered.cube.data), mean)
def colocate_gridded_ungridded(gridded_data, ungridded_data, ts_type=None,
                               start=None, stop=None, filter_name=None,
                               regrid_res_deg=None, remove_outliers=True,
                               vert_scheme=None, harmonise_units=True,
                               regrid_scheme='areaweighted',
                               var_ref=None, var_outlier_ranges=None,
                               var_ref_outlier_ranges=None,
                               update_baseyear_gridded=None,
                               ignore_station_names=None,
                               apply_time_resampling_constraints=None,
                               min_num_obs=None,
                               colocate_time=False,
                               var_keep_outliers=True,
                               var_ref_keep_outliers=False,
                               use_climatology_ref=False,
                               resample_how=None,
                               **kwargs):
    """Colocate gridded with ungridded data (low level method)

    For high-level colocation see :class:`pyaerocom.colocation_auto.Colocator`
    and :class:`pyaerocom.colocation_auto.ColocationSetup`

    Note
    ----
    Uses the variable that is contained in input :class:`GriddedData` object
    (since these objects only contain a single variable). If this variable
    is not contained in observation data (or contained but using a different
    variable name) you may specify the obs variable to be used via input arg
    `var_ref`

    Parameters
    ----------
    gridded_data : GriddedData
        gridded data object (e.g. model results).
    ungridded_data : UngriddedData
        ungridded data object (e.g. observations).
    ts_type : str
        desired temporal resolution of colocated data (must be valid AeroCom
        ts_type str such as daily, monthly, yearly.).
    start : :obj:`str` or :obj:`datetime64` or similar, optional
        start time for colocation, if None, the start time of the input
        :class:`GriddedData` object is used.
    stop : :obj:`str` or :obj:`datetime64` or similar, optional
        stop time for colocation, if None, the stop time of the input
        :class:`GriddedData` object is used
    filter_name : str
        string specifying filter used (cf. :class:`pyaerocom.filter.Filter`
        for details). If None, then it is set to 'WORLD-wMOUNTAINS', which
        corresponds to no filtering (world with mountains).
        Use WORLD-noMOUNTAINS to exclude mountain sites.
    regrid_res_deg : int or dict, optional
        regrid resolution in degrees. If specified, the input gridded data
        object will be regridded in lon / lat dimension to the input
        resolution (if input is integer, both lat and lon are regridded to
        that resolution, if input is dict, use keys `lat_res_deg` and
        `lon_res_deg` to specify regrid resolutions, respectively).
    remove_outliers : bool
        if True, outliers are removed from model and obs data before
        colocation, else not. Outlier ranges can be specified via input args
        `var_outlier_ranges` and `var_ref_outlier_ranges`.
    vert_scheme : str
        string specifying scheme used to reduce the dimensionality in case
        input grid data contains vertical dimension. Example schemes are
        `mean, surface, altitude`, for details see
        :func:`GriddedData.to_time_series`.
    harmonise_units : bool
        if True, units are attempted to be harmonised (note: raises
        Exception if True and units cannot be harmonised).
    var_ref : :obj:`str`, optional
        variable against which data in :attr:`gridded_data` is supposed to be
        compared. If None, then the same variable is used
        (i.e. `gridded_data.var_name`).
    var_outlier_ranges : dict, optional
        dictionary specifying outlier ranges for dataset to be analysed
        (e.g. dict(od550aer = [-0.05, 10], ang4487aer=[0,4])). If None, then
        the pyaerocom default outlier ranges are used for the input variable.
        Defaults to None.
    var_ref_outlier_ranges : dict, optional
        like `var_outlier_ranges` but for reference dataset.
    update_baseyear_gridded : int, optional
        optional input that can be set in order to re-define the time
        dimension in the gridded data object to be analysed. E.g., if the
        data object is a climatology (one year of data) that has set the base
        year of the time dimension to a value other than the specified input
        start / stop time this may be used to update the time in order to
        make colocation possible.
    ignore_station_names : str or list, optional
        station name or pattern or list of station names or patterns that
        should be ignored
    apply_time_resampling_constraints : bool, optional
        if True, then time resampling constraints are applied as provided via
        :attr:`min_num_obs` or if that one is unspecified, as defined in
        :attr:`pyaerocom.const.OBS_MIN_NUM_RESAMPLE`. If None, than
        :attr:`pyaerocom.const.OBS_APPLY_TIME_RESAMPLE_CONSTRAINTS` is used
        (which defaults to True !!).
    min_num_obs : int or dict, optional
        minimum number of observations for resampling of time
    colocate_time : bool
        if True and if original time resolution of data is higher than
        desired time resolution (`ts_type`), then both datasets are colocated
        in time *before* resampling to lower resolution.
    var_keep_outliers : bool
        if True, then no outliers will be removed from dataset to be
        analysed, even if `remove_outliers` is True. That is because for
        model evaluation often only outliers are supposed to be removed in
        the observations but not in the model.
    var_ref_keep_outliers : bool
        if True, then no outliers will be removed from the reference dataset,
        even if `remove_outliers` is True.
    use_climatology_ref : bool
        if True, climatological timeseries are used from observations
    resample_how : str or dict
        string specifying how data should be aggregated when resampling in
        time. Default is "mean". Can also be a nested dictionary, e.g.
        resample_how={'daily': {'hourly' : 'max'}} would use the maximum
        value to aggregate from hourly to daily, rather than the mean.
    **kwargs
        additional keyword args (passed to
        :func:`UngriddedData.to_station_data_all`)

    Returns
    -------
    ColocatedData
        instance of colocated data

    Raises
    ------
    VarNotAvailableError
        if grid data variable is not available in ungridded data object
    AttributeError
        if instance of input :class:`UngriddedData` object contains more than
        one dataset
    TimeMatchError
        if gridded data time range does not overlap with input time range
    ColocationError
        if none of the data points in input :class:`UngriddedData` matches
        the input colocation constraints
    """
    if var_outlier_ranges is None:
        var_outlier_ranges = {}
    if var_ref_outlier_ranges is None:
        var_ref_outlier_ranges = {}

    if filter_name is None:
        filter_name = const.DEFAULT_REG_FILTER
    # make sure the time dimension of the gridded data is ordered as expected
    # (reorder in place if not)
    try:
        gridded_data.check_dimcoords_tseries()
    except DimensionOrderError:
        gridded_data.reorder_dimensions_tseries()

    var = gridded_data.var_name
    aerocom_var = gridded_data.var_name_aerocom

    _check_var_registered(var, aerocom_var, gridded_data)

    # if no obs variable is specified, fall back to the (AeroCom) name of the
    # model variable
    if var_ref is None:
        if aerocom_var is not None:
            var_ref = aerocom_var
        else:
            var_ref = var

    if remove_outliers:
        low, high, low_ref, high_ref = None, None, None, None
        if var in var_outlier_ranges:
            low, high = var_outlier_ranges[var]
        if var_ref in var_ref_outlier_ranges:
            low_ref, high_ref = var_ref_outlier_ranges[var_ref]

    if not var_ref in ungridded_data.contains_vars:
        raise VarNotAvailableError('Variable {} is not available in ungridded '
                                   'data (which contains {})'.format(
                                       var_ref,
                                       ungridded_data.contains_vars))
    elif len(ungridded_data.contains_datasets) > 1:
        raise AttributeError('Colocation can only be performed with '
                             'ungridded data objects that only contain a '
                             'single dataset. Use method `extract_dataset` of '
                             'UngriddedData object to extract single datasets')

    dataset_ref = ungridded_data.contains_datasets[0]

    if update_baseyear_gridded is not None:
        # update time dimension in gridded data
        gridded_data.base_year = update_baseyear_gridded

    # determine the colocation frequency: use the coarser of model frequency
    # and requested ts_type; downsample the model now unless time colocation
    # is requested (then resampling happens later, after colocation)
    grid_ts_type_src = gridded_data.ts_type
    grid_ts_type = TsType(gridded_data.ts_type)
    if isinstance(ts_type, str):
        ts_type = TsType(ts_type)
    if ts_type is None or grid_ts_type < ts_type:
        ts_type = grid_ts_type
    elif grid_ts_type > ts_type and not colocate_time:
        gridded_data = gridded_data.resample_time(
            str(ts_type),
            apply_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            how=resample_how)
        grid_ts_type = ts_type

    # get start / stop of gridded data as pandas.Timestamp
    grid_start = to_pandas_timestamp(gridded_data.start)
    grid_stop = to_pandas_timestamp(gridded_data.stop)

    if start is None:
        start = grid_start
    else:
        start = to_pandas_timestamp(start)
    if stop is None:
        stop = grid_stop
    else:
        stop = to_pandas_timestamp(stop)

    # clip requested period to the period covered by the gridded data
    if start < grid_start:
        start = grid_start
    if stop > grid_stop:
        stop = grid_stop

    # check overlap
    if stop < grid_start or start > grid_stop:
        raise TimeMatchError('Input time range {}-{} does not '
                             'overlap with data range: {}-{}'.format(
                                 start, stop, grid_start, grid_stop))

    # create instance of Filter class (may, in the future, also include all
    # filter options, e.g. start, stop, variables, only land, only oceans, and
    # may also be linked with other data object, e.g. if data is only supposed
    # to be used if other data object exceeds a certain threshold... but for
    # now, only region and altitude range)
    regfilter = Filter(name=filter_name)

    # apply filter to data
    ungridded_data = regfilter.apply(ungridded_data)

    #crop time
    gridded_data = regfilter.apply(gridded_data)
    if start > grid_start or stop < grid_stop:
        gridded_data = gridded_data.crop(time_range=(start, stop))

    if regrid_res_deg is not None:
        gridded_data = _regrid_gridded(gridded_data, regrid_scheme,
                                       regrid_res_deg)

    if remove_outliers and not var_ref_keep_outliers:
        #called twice if used via Colocator, this should go out here
        ungridded_data.remove_outliers(var_ref, inplace=True,
                                       low=low_ref,
                                       high=high_ref)

    # with climatology as reference, the colocation frequency is fixed to
    # monthly and the obs period is the climatological interval
    if use_climatology_ref:
        col_freq = 'monthly'
        obs_start = const.CLIM_START
        obs_stop = const.CLIM_STOP
    else:
        col_freq = str(grid_ts_type)  #TS_TYPE_TO_PANDAS_FREQ[grid_ts_type]
        obs_start = start
        obs_stop = stop

    # restrict obs stations to the lat/lon bounding box of the gridded data
    latitude = gridded_data.latitude.points
    longitude = gridded_data.longitude.points
    lat_range = [np.min(latitude), np.max(latitude)]
    lon_range = [np.min(longitude), np.max(longitude)]
    ungridded_data = ungridded_data.filter_by_meta(latitude=lat_range,
                                                   longitude=lon_range)

    # get timeseries from all stations in provided time resolution
    # (time resampling is done below in main loop)
    all_stats = ungridded_data.to_station_data_all(
        vars_to_convert=var_ref,
        start=obs_start,
        stop=obs_stop,
        by_station_name=True,
        ignore_index=ignore_station_names,
        **kwargs)

    obs_stat_data = all_stats['stats']
    ungridded_lons = all_stats['longitude']
    ungridded_lats = all_stats['latitude']

    if len(obs_stat_data) == 0:
        raise VarNotAvailableError('Variable {} is not available in specified '
                                   'time interval ({}-{})'.format(
                                       var_ref, start, stop))
    # make sure the gridded data is in the right dimension
    if gridded_data.ndim > 3:
        if vert_scheme is None:
            vert_scheme = 'mean'
        if not vert_scheme in gridded_data.SUPPORTED_VERT_SCHEMES:
            raise ValueError(
                'Vertical scheme {} is not supported'.format(vert_scheme))

    # extract model timeseries at the obs station locations
    grid_stat_data = gridded_data.to_time_series(longitude=ungridded_lons,
                                                 latitude=ungridded_lats,
                                                 vert_scheme=vert_scheme)

    pd_freq = TsType(col_freq).to_pandas_freq()
    time_idx = make_datetime_index(start, stop, pd_freq)

    # output array: dim 0 is [obs, model], dim 1 time, dim 2 station
    coldata = np.empty((2, len(time_idx), len(obs_stat_data)))

    lons = []
    lats = []
    alts = []
    station_names = []

    ungridded_unit = None
    ts_type_src_ref = None
    if not harmonise_units:
        gridded_unit = str(gridded_data.units)
    else:
        gridded_unit = None

    # loop over all stations and append to colocated data object
    for i, obs_stat in enumerate(obs_stat_data):
        # ToDo: consider removing to keep ts_type_src_ref (this was probably
        # introduced for EBAS were the original data frequency is not constant
        # but can vary from site to site)
        if ts_type_src_ref is None:
            ts_type_src_ref = obs_stat['ts_type_src']
        elif obs_stat['ts_type_src'] != ts_type_src_ref:
            spl = ts_type_src_ref.split(';')
            if not obs_stat['ts_type_src'] in spl:
                spl.append(obs_stat['ts_type_src'])
            ts_type_src_ref = ';'.join(spl)

        if ungridded_unit is None:
            try:
                ungridded_unit = obs_stat['var_info'][var_ref]['units']
            except KeyError as e:  #variable information or unit is not defined
                logger.exception(repr(e))
        try:
            unit = obs_stat['var_info'][var_ref]['units']
        except Exception:
            unit = None
        # all obs stations must report the variable in the same unit
        if not unit == ungridded_unit:
            raise ValueError(
                'Cannot perform colocation. Ungridded data '
                'object contains different units ({})'.format(var_ref))
        # get observations (Note: the index of the observation time series
        # is already in the specified frequency format, and thus, does not
        # need to be updated, for details (or if errors occur), cf.
        # UngriddedData.to_station_data, where the conversion happens)

        # get model station data
        grid_stat = grid_stat_data[i]
        if harmonise_units:
            grid_unit = grid_stat.get_unit(var)
            obs_unit = obs_stat.get_unit(var_ref)
            if not grid_unit == obs_unit:
                grid_stat.convert_unit(var, obs_unit)
            if gridded_unit is None:
                gridded_unit = obs_unit

        if remove_outliers and not var_keep_outliers:
            # don't check if harmonise_units is active, because the
            # remove_outliers method checks units based on AeroCom default
            # variables, and a variable mapping might be active, i.e.
            # sometimes models use abs550aer for absorption coefficients
            # with units [m-1] and not for AAOD (which is the AeroCom default
            # and unitless. Hence, unit check in remove_outliers works only
            # if the variable name (and unit) corresonds to AeroCom default)
            #chk_unit = not harmonise_units
            grid_stat.remove_outliers(var, low=low, high=high,
                                      check_unit=True)

        # resample / align obs and model timeseries onto the colocation
        # frequency; returns a DataFrame with 'ref' (obs) and 'data' (model)
        _df = _colocate_site_data_helper(
            stat_data=grid_stat,
            stat_data_ref=obs_stat,
            var=var, var_ref=var_ref,
            ts_type=col_freq,
            resample_how=resample_how,
            apply_time_resampling_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            use_climatology_ref=use_climatology_ref)

        # this try/except block was introduced on 23/2/2021 as temporary fix from
        # v0.10.0 -> v0.10.1 as a result of multi-weekly obsdata (EBAS) that
        # can end up resulting in incorrect number of timestamps after resampling
        # (the error was discovered using EBASMC, concpm10, 2019 and colocation
        # frequency monthly)
        try:
            # assign the unified timeseries data to the colocated data array
            coldata[0, :, i] = _df['ref'].values
            coldata[1, :, i] = _df['data'].values
        except ValueError as e:
            const.print_log.warning(
                f'Failed to colocate time for station {obs_stat.station_name}. '
                f'This station will be skipped (error: {e})')

        lons.append(obs_stat.longitude)
        lats.append(obs_stat.latitude)
        alts.append(obs_stat.altitude)
        station_names.append(obs_stat.station_name)

    # best-effort retrieval of the obs data revision for the metadata record
    try:
        revision = ungridded_data.data_revision[dataset_ref]
    except Exception:
        try:
            revision = ungridded_data._get_data_revision_helper(dataset_ref)
        except MetaDataError:
            revision = 'MULTIPLE'
        except Exception:
            revision = 'n/a'

    files = [os.path.basename(x) for x in gridded_data.from_files]

    meta = {
        'data_source': [dataset_ref, gridded_data.name],
        'var_name': [var_ref, var],
        'ts_type': col_freq,  # will be updated below if resampling
        'filter_name': filter_name,
        'ts_type_src': [ts_type_src_ref, grid_ts_type_src],
        'start_str': to_datestring_YYYYMMDD(start),
        'stop_str': to_datestring_YYYYMMDD(stop),
        'var_units': [ungridded_unit, gridded_unit],
        'vert_scheme': vert_scheme,
        'data_level': 3,
        'revision_ref': revision,
        'from_files': files,
        'from_files_ref': None,
        'stations_ignored': ignore_station_names,
        'colocate_time': colocate_time,
        'obs_is_clim': use_climatology_ref,
        'pyaerocom': pya_ver,
        'apply_constraints': apply_time_resampling_constraints,
        'min_num_obs': min_num_obs,
        'outliers_removed': remove_outliers
    }

    meta.update(regfilter.to_dict())

    # create coordinates of DataArray
    coords = {
        'data_source': meta['data_source'],
        'var_name': ('data_source', meta['var_name']),
        'var_units': ('data_source', meta['var_units']),
        'ts_type_src': ('data_source', meta['ts_type_src']),
        'time': time_idx,
        'station_name': station_names,
        'latitude': ('station_name', lats),
        'longitude': ('station_name', lons),
        'altitude': ('station_name', alts)
    }

    dims = ['data_source', 'time', 'station_name']
    data = ColocatedData(data=coldata, coords=coords, dims=dims,
                         name=var, attrs=meta)

    # add correct units for lat / lon dimensions
    data.latitude.attrs['standard_name'] = gridded_data.latitude.standard_name
    data.latitude.attrs['units'] = str(gridded_data.latitude.units)

    data.longitude.attrs['standard_name'] = gridded_data.longitude.standard_name
    data.longitude.attrs['units'] = str(gridded_data.longitude.units)

    # resample to the requested output frequency if it differs from the
    # colocation frequency (e.g. when climatology forced monthly colocation)
    if col_freq != str(ts_type):
        data = data.resample_time(
            to_ts_type=ts_type,
            colocate_time=colocate_time,
            apply_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            how=resample_how,
            **kwargs)
    return data