def _init_log(self):
    """Open the result log file and write the analysis configuration to it.

    The file is created below ``<out_basedir>/log_files_analysis/<YYYYMMDD>``
    (both directories are obtained via ``chk_make_subdir``), where the date
    folder is today's date. The file name is built from :attr:`obs_id` and
    the start / stop strings of the analysis period.

    The opened file handle is stored in ``self._log`` and is left open;
    this method does not close it.
    """
    base_dir = chk_make_subdir(self.out_basedir, 'log_files_analysis')
    day_dir = chk_make_subdir(base_dir, datetime.today().strftime('%Y%m%d'))

    start_str = ('ModelStart' if self.start is None
                 else to_datestring_YYYYMMDD(self.start))

    if self.stop is not None:
        stop_str = to_datestring_YYYYMMDD(self.stop)
    elif isinstance(self.start, int):
        # integer start denotes a year -> use the following year as stop
        stop_str = to_datestring_YYYYMMDD(self.start + 1)
    else:
        stop_str = 'None'

    file_name = 'result_log_{}_{}_{}.csv'.format(self.obs_id,
                                                 start_str,
                                                 stop_str)
    handle = open(os.path.join(day_dir, file_name), 'w+')
    self._log = handle

    handle.write('Analysis configuration\n')
    for name, value in self._setup.items():
        if name == 'model_id':
            # model ID is not written to the configuration header
            continue
        if name != 'ts_type_setup':
            handle.write('{}: {}\n'.format(name, value))
            continue

        # ts_type setup is a nested mapping and gets its own sub-section
        handle.write('TS_TYPES (<read>: <analyse>)\n')
        for read_type, ana_type in value.items():
            if read_type != 'read_alt':
                handle.write(' {}:{}\n'.format(read_type, ana_type))

        alternatives = value['read_alt']
        if alternatives:
            handle.write(' Alternative TS_TYPES (read)\n')
            for read_type, alt_type in alternatives.items():
                handle.write(' {}:{}\n'.format(read_type, alt_type))
def _coldata_save_name(self, model_data, ts_type_ana, start=None, stop=None):
    """Build the file name under which a colocated data object is stored

    Parameters
    ----------
    model_data
        model data object; its attributes ``var_name`` and ``ts_type`` are
        used, and ``start`` / ``stop`` serve as fallback for the time range
    ts_type_ana : str
        temporal resolution of the analysis
    start : optional
        start time, converted via ``to_pandas_timestamp``. If None,
        ``model_data.start`` is used as is.
    stop : optional
        stop time, converted via ``to_pandas_timestamp``. If None,
        ``model_data.stop`` is used as is.

    Returns
    -------
    str
        name returned by ``ColocatedData._aerocom_savename`` with the
        extension ``.nc`` appended
    """
    start = (model_data.start if start is None
             else to_pandas_timestamp(start))
    stop = (model_data.stop if stop is None
            else to_pandas_timestamp(stop))

    first_day = to_datestring_YYYYMMDD(start)
    last_day = to_datestring_YYYYMMDD(stop)

    source_ts_type = model_data.ts_type
    base_name = ColocatedData._aerocom_savename(model_data.var_name,
                                                self.obs_id,
                                                self.model_id,
                                                source_ts_type,
                                                first_day,
                                                last_day,
                                                ts_type_ana,
                                                self.filter_name)
    return base_name + '.nc'
def _coldata_savename(self, model_data, start=None, stop=None, ts_type=None,
                      var_name=None):
    """Build the file name under which a colocated data object is stored

    Parameters
    ----------
    model_data
        model data object providing the fallback values for `start`, `stop`,
        `ts_type`, `var_name` and the model ID (``data_id``)
    start : optional
        start time, converted via ``to_pandas_timestamp``. If None,
        ``model_data.start`` is used as is.
    stop : optional
        stop time, converted via ``to_pandas_timestamp``. If None,
        ``model_data.stop`` is used as is.
    ts_type : str, optional
        temporal resolution; if None, ``model_data.ts_type`` is used
    var_name : str, optional
        variable name; if None, ``model_data.var_name`` is used

    Returns
    -------
    str
        name returned by ``ColocatedData._aerocom_savename`` with the
        extension ``.nc`` appended
    """
    start = (model_data.start if start is None
             else to_pandas_timestamp(start))
    stop = (model_data.stop if stop is None
            else to_pandas_timestamp(stop))

    if ts_type is None:
        ts_type = model_data.ts_type
    if var_name is None:
        var_name = model_data.var_name

    first_day = to_datestring_YYYYMMDD(start)
    last_day = to_datestring_YYYYMMDD(stop)

    # explicitly assigned names (if they are strings) take precedence over IDs
    obs_label = (self.obs_name if isinstance(self.obs_name, str)
                 else self.obs_id)
    model_label = (self.model_name if isinstance(self.model_name, str)
                   else model_data.data_id)

    base_name = ColocatedData._aerocom_savename(var_name=var_name,
                                                obs_id=obs_label,
                                                model_id=model_label,
                                                start_str=first_day,
                                                stop_str=last_day,
                                                ts_type=ts_type,
                                                filter_name=self.filter_name)
    return base_name + '.nc'
def colocate_gridded_ungridded(gridded_data, ungridded_data, ts_type=None,
                               start=None, stop=None, filter_name=None,
                               regrid_res_deg=None, remove_outliers=True,
                               vert_scheme=None, harmonise_units=True,
                               var_ref=None, var_outlier_ranges=None,
                               var_ref_outlier_ranges=None,
                               update_baseyear_gridded=None,
                               ignore_station_names=None,
                               apply_time_resampling_constraints=None,
                               min_num_obs=None, colocate_time=False,
                               var_keep_outliers=True,
                               var_ref_keep_outliers=False, **kwargs):
    """Colocate gridded with ungridded data

    Note
    ----
    Uses the variable that is contained in input :class:`GriddedData` object
    (since these objects only contain a single variable)

    Parameters
    ----------
    gridded_data : GriddedData
        gridded data (e.g. model results)
    ungridded_data : UngriddedData
        ungridded data (e.g. observations)
    ts_type : str
        desired temporal resolution of colocated data (must be valid AeroCom
        ts_type str such as daily, monthly, yearly.). If None, the ts_type
        of the input gridded data is used. If `colocate_time` is False
        (default), the gridded data is resampled to `ts_type` and the
        ungridded data is converted to `ts_type` *before* colocation. If
        `colocate_time` is True, the colocation is done in the resolution
        of the gridded data and the result is resampled to `ts_type`
        afterwards.
    start : :obj:`str` or :obj:`datetime64` or similar, optional
        start time for colocation, if None, the start time of the input
        :class:`GriddedData` object is used
    stop : :obj:`str` or :obj:`datetime64` or similar, optional
        stop time for colocation, if None, the stop time of the input
        :class:`GriddedData` object is used
    filter_name : str
        string specifying filter used (cf. :class:`pyaerocom.filter.Filter`
        for details). If None, then it is set to 'WORLD-wMOUNTAINS', which
        corresponds to no filtering (world with mountains).
        Use WORLD-noMOUNTAINS to exclude mountain sites.
    regrid_res_deg : :obj:`int`, optional
        regrid resolution in degrees. If specified, the input gridded data
        object will be regridded in lon / lat dimension to the input
        resolution. (BETA feature)
    remove_outliers : bool
        if True, outliers are removed before colocation from those datasets
        for which the corresponding `*_keep_outliers` flag is False.
    vert_scheme : str
        string specifying scheme used to reduce the dimensionality in case
        input grid data contains vertical dimension. Example schemes are
        `mean, surface, altitude`, for details see
        :func:`GriddedData.to_time_series`. If None and the gridded data has
        more than 3 dimensions, 'mean' is used.
    harmonise_units : bool
        if True, units are attempted to be harmonised (note: raises Exception
        if True and units cannot be harmonised).
    var_ref : :obj:`str`, optional
        variable against which data in :attr:`gridded_data` is supposed to be
        compared. If None, then `gridded_data.var_name_aerocom` is used.
    var_outlier_ranges : dict, optional
        dictionary specifying outlier ranges for dataset to be analysed
        (e.g. dict(od550aer = [-0.05, 10], ang4487aer=[0,4])). If None, then
        the pyaerocom default outlier ranges are used for the input variable.
        Defaults to None.
    var_ref_outlier_ranges : dict, optional
        like `var_outlier_ranges` but for reference dataset.
    update_baseyear_gridded : int, optional
        optional input that can be set in order to re-define the time
        dimension in the gridded data object to be analysed. E.g., if the
        data object is a climatology (one year of data) that has set the base
        year of the time dimension to a value other than the specified input
        start / stop time this may be used to update the time in order to make
        colocation possible.
    ignore_station_names : str or list, optional
        station name or pattern or list of station names or patterns that
        should be ignored
    apply_time_resampling_constraints : bool, optional
        if True, then time resampling constraints are applied as provided via
        :attr:`min_num_obs` or if that one is unspecified, as defined in
        :attr:`pyaerocom.const.OBS_MIN_NUM_RESAMPLE`. If None, than
        :attr:`pyaerocom.const.OBS_APPLY_TIME_RESAMPLE_CONSTRAINTS` is used
        (which defaults to True !!).
    min_num_obs : int or dict, optional
        minimum number of observations for resampling of time
    colocate_time : bool
        if True and if original time resolution of data is higher than desired
        time resolution (`ts_type`), then both datasets are colocated in time
        *before* resampling to lower resolution.
    var_keep_outliers : bool
        if True, then no outliers will be removed from dataset to be analysed,
        even if `remove_outliers` is True. That is because for model
        evaluation often only outliers are supposed to be removed in the
        observations but not in the model.
    var_ref_keep_outliers : bool
        if True, then no outliers will be removed from the reference dataset,
        even if `remove_outliers` is True.
    **kwargs
        additional keyword args (passed to
        :func:`UngriddedData.to_station_data_all` and, if the colocated data
        is resampled in time at the end, also to
        :func:`ColocatedData.resample_time`)

    Returns
    -------
    ColocatedData
        instance of colocated data

    Raises
    ------
    VarNotAvailableError
        if grid data variable is not available in ungridded data object, or
        if no station data is found in the specified time interval
    AttributeError
        if instance of input :class:`UngriddedData` object contains more than
        one dataset
    TimeMatchError
        if gridded data time range does not overlap with input time range
    ValueError
        if `vert_scheme` is not supported by the gridded data object, or if
        the stations in the ungridded data have different units
    """
    if var_outlier_ranges is None:
        var_outlier_ranges = {}
    if var_ref_outlier_ranges is None:
        var_ref_outlier_ranges = {}

    if filter_name is None:
        filter_name = 'WORLD-wMOUNTAINS'

    var = gridded_data.var_name
    aerocom_var = gridded_data.var_name_aerocom

    if var_ref is None:
        var_ref = aerocom_var

    # None means: let the remove_outliers methods use their defaults
    low, high, low_ref, high_ref = None, None, None, None
    if remove_outliers:
        if var in var_outlier_ranges:
            low, high = var_outlier_ranges[var]
        if var_ref in var_ref_outlier_ranges:
            low_ref, high_ref = var_ref_outlier_ranges[var_ref]

    if var_ref not in ungridded_data.contains_vars:
        raise VarNotAvailableError('Variable {} is not available in ungridded '
                                   'data (which contains {})'.format(
                                       var_ref,
                                       ungridded_data.contains_vars))
    elif len(ungridded_data.contains_datasets) > 1:
        raise AttributeError('Colocation can only be performed with '
                             'ungridded data objects that only contain a '
                             'single dataset. Use method `extract_dataset` of '
                             'UngriddedData object to extract single datasets')

    dataset_ref = ungridded_data.contains_datasets[0]

    if update_baseyear_gridded is not None:
        # update time dimension in gridded data
        gridded_data.base_year = update_baseyear_gridded

    # get start / stop of gridded data as pandas.Timestamp
    grid_start = to_pandas_timestamp(gridded_data.start)
    grid_stop = to_pandas_timestamp(gridded_data.stop)

    grid_ts_type = gridded_data.ts_type

    if ts_type is None:
        ts_type = grid_ts_type
    if start is None:
        start = grid_start
    else:
        start = to_pandas_timestamp(start)
    if stop is None:
        stop = grid_stop
    else:
        stop = to_pandas_timestamp(stop)

    # restrict requested period to the period covered by the gridded data
    if start < grid_start:
        start = grid_start
    if stop > grid_stop:
        stop = grid_stop

    # check overlap
    if stop < grid_start or start > grid_stop:
        raise TimeMatchError('Input time range {}-{} does not '
                             'overlap with data range: {}-{}'.format(
                                 start, stop, grid_start, grid_stop))

    # create instance of Filter class (may, in the future, also include all
    # filter options, e.g. start, stop, variables, only land, only oceans, and
    # may also be linked with other data object, e.g. if data is only supposed
    # to be used if other data object exceeds a certain threshold... but for
    # now, only region and altitude range)
    regfilter = Filter(name=filter_name)

    # apply filter to data
    ungridded_data = regfilter(ungridded_data)

    # crop time
    gridded_data = gridded_data.crop(time_range=(start, stop))

    if regrid_res_deg is not None:
        lons = gridded_data.longitude.points
        lats = gridded_data.latitude.points

        lons_new = np.arange(lons.min(), lons.max(), regrid_res_deg)
        lats_new = np.arange(lats.min(), lats.max(), regrid_res_deg)

        gridded_data = gridded_data.interpolate(latitude=lats_new,
                                                longitude=lons_new)

    # None keeps ungridded data in original resolution
    ungridded_freq = None
    if not colocate_time:
        gridded_data = gridded_data.resample_time(to_ts_type=ts_type)
        # converts ungridded data directly to desired resolution
        ungridded_freq = ts_type

    # ts_type that is used for colocation
    col_ts_type = gridded_data.ts_type

    # pandas frequency string that corresponds to col_ts_type
    col_freq = TS_TYPE_TO_PANDAS_FREQ[col_ts_type]

    if remove_outliers and not var_ref_keep_outliers:
        ungridded_data.remove_outliers(var_ref, inplace=True,
                                       low=low_ref,
                                       high=high_ref)

    all_stats = ungridded_data.to_station_data_all(
        vars_to_convert=var_ref,
        start=start,
        stop=stop,
        freq=ungridded_freq,
        by_station_name=True,
        ignore_index=ignore_station_names,
        apply_constraints=apply_time_resampling_constraints,
        min_num_obs=min_num_obs,
        **kwargs)

    obs_stat_data = all_stats['stats']
    ungridded_lons = all_stats['longitude']
    ungridded_lats = all_stats['latitude']

    # this check has to come before the first station is accessed below,
    # otherwise an empty result would fail with an IndexError
    if len(obs_stat_data) == 0:
        raise VarNotAvailableError('Variable {} is not available in specified '
                                   'time interval ({}-{})'.format(
                                       var_ref, start, stop))

    # resampling constraints may have been altered in case input was None,
    # thus overwrite
    vi = obs_stat_data[0]['var_info'][var_ref]
    if 'apply_constraints' in vi:
        apply_time_resampling_constraints = vi['apply_constraints']
        min_num_obs = vi['min_num_obs']

    # make sure the gridded data is in the right dimension
    try:
        gridded_data.check_dimcoords_tseries()
    except DimensionOrderError:
        gridded_data.reorder_dimensions_tseries()

    if gridded_data.ndim > 3:
        if vert_scheme is None:
            vert_scheme = 'mean'
        if vert_scheme not in gridded_data.SUPPORTED_VERT_SCHEMES:
            raise ValueError(
                'Vertical scheme {} is not supported'.format(vert_scheme))

    grid_stat_data = gridded_data.to_time_series(longitude=ungridded_lons,
                                                 latitude=ungridded_lats,
                                                 vert_scheme=vert_scheme)

    # Generate time index of ColocatedData object (pd.date_range is used
    # since the DatetimeIndex constructor does not accept start / end / freq
    # in recent pandas versions)
    time_idx = pd.date_range(start=start, end=stop, freq=col_freq)

    coldata = np.empty((2, len(time_idx), len(obs_stat_data)))

    lons = []
    lats = []
    alts = []
    station_names = []

    ungridded_unit = None
    ts_type_src_ref = None
    if not harmonise_units:
        gridded_unit = str(gridded_data.units)
    else:
        # will be set to the unit of the observations in the loop below
        gridded_unit = None

    # loop over all stations and append to colocated data object
    for i, obs_stat in enumerate(obs_stat_data):
        # collect all source ts_types of the obs data, separated by ;
        if ts_type_src_ref is None:
            ts_type_src_ref = obs_stat['ts_type_src']
        elif obs_stat['ts_type_src'] != ts_type_src_ref:
            spl = ts_type_src_ref.split(';')
            if obs_stat['ts_type_src'] not in spl:
                spl.append(obs_stat['ts_type_src'])
            ts_type_src_ref = ';'.join(spl)

        if ungridded_unit is None:
            try:
                ungridded_unit = obs_stat['var_info'][var_ref]['units']
            except KeyError as e:
                # variable information or unit is not defined
                logger.exception(repr(e))
        try:
            unit = obs_stat['var_info'][var_ref]['units']
        except Exception:
            unit = None
        if not unit == ungridded_unit:
            raise ValueError(
                'Cannot perform colocation. Ungridded data '
                'object contains different units ({})'.format(var_ref))

        # get observations (Note: the index of the observation time series
        # is already in the specified frequency format, and thus, does not
        # need to be updated, for details (or if errors occur), cf.
        # UngriddedData.to_station_data, where the conversion happens)

        # get model data corresponding to station
        grid_stat = grid_stat_data[i]
        if harmonise_units:
            grid_unit = grid_stat.get_unit(var)
            obs_unit = obs_stat.get_unit(var_ref)
            if not grid_unit == obs_unit:
                grid_stat.convert_unit(var, obs_unit)
            if gridded_unit is None:
                gridded_unit = obs_unit

        if remove_outliers and not var_keep_outliers:
            # don't check if harmonise_units is active, because the
            # remove_outliers method checks units based on AeroCom default
            # variables, and a variable mapping might be active, i.e.
            # sometimes models use abs550aer for absorption coefficients
            # with units [m-1] and not for AAOD (which is the AeroCom default
            # and unitless. Hence, unit check in remove_outliers works only
            # if the variable name (and unit) corresonds to AeroCom default)
            grid_stat.remove_outliers(var, low=low, high=high,
                                      check_unit=True)

        # get grid and obs timeseries data (that may be sampled in arbitrary
        # time resolution, particularly the obs data)
        grid_ts = grid_stat[var]
        obs_ts = obs_stat[var_ref]

        # resample to the colocation frequency
        obs_ts1 = obs_ts.resample(col_freq).mean()
        grid_ts1 = grid_ts.resample(col_freq).mean()

        # fill up missing time stamps
        _df = pd.concat([obs_ts1, grid_ts1], axis=1, keys=['o', 'm'])

        # assign the unified timeseries data to the colocated data array
        coldata[0, :, i] = _df['o'].values
        coldata[1, :, i] = _df['m'].values

        lons.append(obs_stat.longitude)
        lats.append(obs_stat.latitude)
        alts.append(obs_stat.altitude)
        station_names.append(obs_stat.station_name)

    # data revision of reference dataset (best effort)
    try:
        revision = ungridded_data.data_revision[dataset_ref]
    except Exception:
        try:
            revision = ungridded_data._get_data_revision_helper(dataset_ref)
        except MetaDataError:
            revision = 'MULTIPLE'
        except Exception:
            revision = 'n/a'

    files = [os.path.basename(x) for x in gridded_data.from_files]

    meta = {
        'data_source': [dataset_ref, gridded_data.name],
        'var_name': [var_ref, var],
        'ts_type': col_ts_type,
        'filter_name': filter_name,
        'ts_type_src': [ts_type_src_ref, grid_ts_type],
        'start_str': to_datestring_YYYYMMDD(start),
        'stop_str': to_datestring_YYYYMMDD(stop),
        'var_units': [ungridded_unit, gridded_unit],
        'vert_scheme': vert_scheme,
        'data_level': 3,
        'revision_ref': revision,
        'from_files': files,
        'from_files_ref': None,
        'stations_ignored': ignore_station_names,
        'colocate_time': colocate_time,
        'apply_constraints': apply_time_resampling_constraints,
        'min_num_obs': min_num_obs,
        'outliers_removed': remove_outliers,
    }

    meta.update(regfilter.to_dict())

    # create coordinates of DataArray
    coords = {
        'data_source': meta['data_source'],
        'var_name': ('data_source', meta['var_name']),
        'var_units': ('data_source', meta['var_units']),
        'ts_type_src': ('data_source', meta['ts_type_src']),
        'time': time_idx,
        'station_name': station_names,
        'latitude': ('station_name', lats),
        'longitude': ('station_name', lons),
        'altitude': ('station_name', alts),
    }

    dims = ['data_source', 'time', 'station_name']
    data = ColocatedData(data=coldata, coords=coords, dims=dims, name=var,
                         attrs=meta)

    # colocation was done in the resolution of the gridded data, thus the
    # result still needs to be resampled to the desired resolution
    if colocate_time and grid_ts_type != ts_type:
        data = data.resample_time(
            to_ts_type=ts_type,
            colocate_time=True,
            apply_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            **kwargs)
    return data
def colocate_gridded_gridded(gridded_data, gridded_data_ref, ts_type=None,
                             start=None, stop=None, filter_name=None,
                             regrid_res_deg=None, remove_outliers=True,
                             vert_scheme=None, harmonise_units=True,
                             regrid_scheme='areaweighted',
                             var_outlier_ranges=None,
                             var_ref_outlier_ranges=None,
                             update_baseyear_gridded=None,
                             apply_time_resampling_constraints=None,
                             min_num_obs=None, colocate_time=False,
                             var_keep_outliers=True,
                             var_ref_keep_outliers=False, **kwargs):
    """Colocate 2 gridded data objects

    Todo
    ----
    - think about vertical dimension (vert_scheme input not used at the moment)

    Parameters
    ----------
    gridded_data : GriddedData
        gridded data (e.g. model results)
    gridded_data_ref : GriddedData
        reference dataset that is used to evaluate
        :attr:`gridded_data` (e.g. gridded observation data)
    ts_type : str
        desired temporal resolution of colocated data (must be valid AeroCom
        ts_type str such as daily, monthly, yearly..). If None, 'monthly'
        is used.
    start : :obj:`str` or :obj:`datetime64` or similar, optional
        start time for colocation, if None, the start time of the input
        :class:`GriddedData` object is used
    stop : :obj:`str` or :obj:`datetime64` or similar, optional
        stop time for colocation, if None, the stop time of the input
        :class:`GriddedData` object is used
    filter_name : str
        string specifying filter used (cf. :class:`pyaerocom.filter.Filter`
        for details). If None, then it is set to 'WORLD-wMOUNTAINS', which
        corresponds to no filtering (world with mountains).
        Use WORLD-noMOUNTAINS to exclude mountain sites.
    regrid_res_deg : :obj:`int`, optional
        regrid resolution in degrees. If specified, the reference data
        object is interpolated in lon / lat dimension to the input
        resolution (and `gridded_data` is then regridded onto the grid of
        the reference data). (BETA feature)
    remove_outliers : bool
        if True, outliers are removed from those datasets for which the
        corresponding `*_keep_outliers` flag is False.
    vert_scheme : str
        not supported yet for gridded / gridded colocation, must be None.
    harmonise_units : bool
        if True, units are attempted to be harmonised (note: raises Exception
        if True and units cannot be harmonised).
    regrid_scheme : str
        iris scheme used for regridding (defaults to area weighted regridding)
    var_outlier_ranges : :obj:`dict`, optional
        dictionary specifying outlier ranges for dataset to be analysed
        (e.g. dict(od550aer = [-0.05, 10], ang4487aer=[0,4])). If None, then
        the pyaerocom default outlier ranges are used for the input variable.
        Defaults to None.
    var_ref_outlier_ranges : dict, optional
        like `var_outlier_ranges` but for reference dataset.
    update_baseyear_gridded : int, optional
        optional input that can be set in order to redefine the time dimension
        in the gridded data object to be analysed. E.g., if the data object
        is a climatology (one year of data) that has set the base year of the
        time dimension to a value other than the specified input start / stop
        time this may be used to update the time in order to make colocation
        possible.
    apply_time_resampling_constraints : bool, optional
        if True, then time resampling constraints are applied as provided via
        :attr:`min_num_obs` or if that one is unspecified, as defined in
        :attr:`pyaerocom.const.OBS_MIN_NUM_RESAMPLE`. If None, than
        :attr:`pyaerocom.const.OBS_APPLY_TIME_RESAMPLE_CONSTRAINTS` is used
        (which defaults to True !!).
    min_num_obs : int or dict, optional
        minimum number of observations for resampling of time
    colocate_time : bool
        if True and if original time resolution of data is higher than desired
        time resolution (`ts_type`), then both datasets are colocated in time
        *before* resampling to lower resolution.
    var_keep_outliers : bool
        if True, then no outliers will be removed from dataset to be analysed,
        even if `remove_outliers` is True. That is because for model
        evaluation often only outliers are supposed to be removed in the
        observations but not in the model.
    var_ref_keep_outliers : bool
        if True, then no outliers will be removed from the reference dataset,
        even if `remove_outliers` is True.
    **kwargs
        additional keyword args (only used if the colocated data is resampled
        in time at the end, in which case they are passed to
        :func:`ColocatedData.resample_time`; included such that factory
        class can handle different methods with different inputs)

    Returns
    -------
    ColocatedData
        instance of colocated data

    Raises
    ------
    NotImplementedError
        if `vert_scheme` is not None
    DataUnitError
        if units need to be harmonised and the unit conversion of the
        reference data fails
    TimeMatchError
        if gridded data time range does not overlap with input time range
    ColocationError
        if the shapes of both data objects differ after regridding and
        filtering
    """
    if vert_scheme is not None:
        raise NotImplementedError('Input vert_scheme cannot yet be handled '
                                  'for gridded / gridded colocation...')
    if ts_type is None:
        ts_type = 'monthly'

    if var_outlier_ranges is None:
        var_outlier_ranges = {}
    if var_ref_outlier_ranges is None:
        var_ref_outlier_ranges = {}
    if filter_name is None:
        filter_name = 'WORLD-wMOUNTAINS'

    if gridded_data.var_info.has_unit:
        if harmonise_units and not gridded_data.units == gridded_data_ref.units:
            try:
                gridded_data_ref.convert_unit(gridded_data.units)
            except Exception:
                # first placeholder refers to the reference data, second one
                # to the data that is analysed
                raise DataUnitError('Failed to merge data unit of reference '
                                    'gridded data object ({}) to data unit '
                                    'of gridded data object ({})'.format(
                                        gridded_data_ref.units,
                                        gridded_data.units))

    var, var_ref = gridded_data.var_name, gridded_data_ref.var_name

    # None means: let the remove_outliers methods use their defaults
    low, high, low_ref, high_ref = None, None, None, None
    if remove_outliers:
        if var in var_outlier_ranges:
            low, high = var_outlier_ranges[var]
        if var_ref in var_ref_outlier_ranges:
            low_ref, high_ref = var_ref_outlier_ranges[var_ref]

    if update_baseyear_gridded is not None:
        # update time dimension in gridded data
        gridded_data.base_year = update_baseyear_gridded

    # get start / stop of gridded data as pandas.Timestamp
    grid_start = to_pandas_timestamp(gridded_data.start)
    grid_stop = to_pandas_timestamp(gridded_data.stop)

    grid_start_ref = to_pandas_timestamp(gridded_data_ref.start)
    grid_stop_ref = to_pandas_timestamp(gridded_data_ref.stop)

    grid_ts_type = gridded_data.ts_type

    if start is None:
        start = grid_start
    else:
        start = to_pandas_timestamp(start)
    if stop is None:
        stop = grid_stop
    else:
        stop = to_pandas_timestamp(stop)

    # restrict requested period to the period covered by the reference data
    if grid_start_ref > start:
        start = grid_start_ref
    if grid_stop_ref < stop:
        stop = grid_stop_ref

    # check overlap
    if stop < grid_start or start > grid_stop:
        raise TimeMatchError('Input time range {}-{} does not '
                             'overlap with data range: {}-{}'.format(
                                 start, stop, grid_start, grid_stop))

    gridded_data = gridded_data.crop(time_range=(start, stop))
    gridded_data_ref = gridded_data_ref.crop(time_range=(start, stop))

    if regrid_res_deg is not None:
        lons = gridded_data_ref.longitude.points
        lats = gridded_data_ref.latitude.points

        lons_new = np.arange(lons.min(), lons.max(), regrid_res_deg)
        lats_new = np.arange(lats.min(), lats.max(), regrid_res_deg)

        gridded_data_ref = gridded_data_ref.interpolate(latitude=lats_new,
                                                        longitude=lons_new)

    # get both objects in same time resolution
    if not colocate_time:
        gridded_data = gridded_data.resample_time(ts_type)
        gridded_data_ref = gridded_data_ref.resample_time(ts_type)

    # guess bounds (for area weighted regridding, which is the default)
    gridded_data._check_lonlat_bounds()
    gridded_data_ref._check_lonlat_bounds()

    # perform regridding
    gridded_data = gridded_data.regrid(gridded_data_ref,
                                       scheme=regrid_scheme)

    # perform region extraction (if applicable)
    regfilter = Filter(name=filter_name)
    gridded_data = regfilter(gridded_data)
    gridded_data_ref = regfilter(gridded_data_ref)

    if not gridded_data.shape == gridded_data_ref.shape:
        raise ColocationError('Shape mismatch between two colocated data '
                              'arrays, please debug')

    files_ref = [os.path.basename(x) for x in gridded_data_ref.from_files]
    files = [os.path.basename(x) for x in gridded_data.from_files]

    meta = {
        'data_source': [gridded_data_ref.data_id, gridded_data.data_id],
        'var_name': [var_ref, var],
        'ts_type': ts_type,
        'filter_name': filter_name,
        'ts_type_src': [gridded_data_ref.ts_type, grid_ts_type],
        'start_str': to_datestring_YYYYMMDD(start),
        'stop_str': to_datestring_YYYYMMDD(stop),
        'var_units': [str(gridded_data_ref.units),
                      str(gridded_data.units)],
        'vert_scheme': vert_scheme,
        'data_level': 3,
        'revision_ref': gridded_data_ref.data_revision,
        'from_files': files,
        'from_files_ref': files_ref,
        'colocate_time': colocate_time,
        'apply_constraints': apply_time_resampling_constraints,
        'min_num_obs': min_num_obs,
    }

    meta.update(regfilter.to_dict())

    if remove_outliers:
        if not var_keep_outliers:
            gridded_data.remove_outliers(low, high)
        if not var_ref_keep_outliers:
            gridded_data_ref.remove_outliers(low_ref, high_ref)

    # masked values are converted to NaN in the output array
    data = gridded_data.grid.data
    if isinstance(data, np.ma.core.MaskedArray):
        data = data.filled(np.nan)
    data_ref = gridded_data_ref.grid.data
    if isinstance(data_ref, np.ma.core.MaskedArray):
        data_ref = data_ref.filled(np.nan)

    # first index: reference data, second index: data (cf. data_source)
    arr = np.asarray((data_ref, data))
    time = gridded_data.time_stamps().astype('datetime64[ns]')
    lats = gridded_data.latitude.points
    lons = gridded_data.longitude.points

    # create coordinates of DataArray
    coords = {
        'data_source': meta['data_source'],
        'var_name': ('data_source', meta['var_name']),
        'var_units': ('data_source', meta['var_units']),
        'ts_type_src': ('data_source', meta['ts_type_src']),
        'time': time,
        'latitude': lats,
        'longitude': lons,
    }

    dims = ['data_source', 'time', 'latitude', 'longitude']

    data = ColocatedData(data=arr, coords=coords, dims=dims,
                         name=gridded_data.var_name, attrs=meta)

    # colocation was done in the original resolution, thus the result still
    # needs to be resampled to the desired resolution
    if colocate_time and grid_ts_type != ts_type:
        data = data.resample_time(
            to_ts_type=ts_type,
            colocate_time=True,
            apply_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            **kwargs)
    return data
def test_to_datestring_YYYYMMDD(input, expected):
    """Check that ``helpers.to_datestring_YYYYMMDD(input)`` equals `expected`"""
    result = helpers.to_datestring_YYYYMMDD(input)
    assert result == expected
def colocate_gridded_gridded(gridded_data, gridded_data_ref, ts_type=None,
                             start=None, stop=None, filter_name=None,
                             regrid_scheme='areaweighted', vert_scheme=None,
                             **kwargs):
    """Colocate 2 gridded data objects

    Both objects are cropped to the colocation period, brought to the same
    temporal resolution and `gridded_data` is regridded onto the grid of
    `gridded_data_ref`. Finally, the regional filter is applied to both.

    Todo
    ----
    - think about vertical dimension (vert_scheme input not used at the moment)

    Parameters
    ----------
    gridded_data : GriddedData
        gridded data (e.g. model results)
    gridded_data_ref : GriddedData
        reference dataset that is used to evaluate :attr:`gridded_data`.
        If the units of both objects differ (and the variable of
        `gridded_data` has a unit), the reference data is converted to the
        unit of `gridded_data`.
    ts_type : str
        desired temporal resolution of colocated data. If None, 'yearly'
        is used.
    start : optional
        start time for colocation (converted via ``to_pandas_timestamp``),
        if None, the start time of `gridded_data` is used
    stop : optional
        stop time for colocation (converted via ``to_pandas_timestamp``),
        if None, the stop time of `gridded_data` is used
    filter_name : str
        name of filter that is applied to both data objects. If None, then
        it is set to 'WORLD-wMOUNTAINS'.
    regrid_scheme : str
        scheme used for regridding (passed to ``gridded_data.regrid``)
    vert_scheme : str
        not used at the moment
    **kwargs
        additional keyword args (not used here)

    Returns
    -------
    ColocatedData
        instance of colocated data. Along the `data_source` dimension, the
        first index corresponds to the reference data and the second index
        to `gridded_data`.

    Raises
    ------
    DataUnitError
        if the unit conversion of the reference data fails
    TimeMatchError
        if gridded data time range does not overlap with input time range
    ColocationError
        if the shapes of both data objects differ after regridding and
        filtering
    """
    if ts_type is None:
        ts_type = 'yearly'
    if filter_name is None:
        filter_name = 'WORLD-wMOUNTAINS'

    if gridded_data.var_info.has_unit:
        if not gridded_data.unit == gridded_data_ref.unit:
            try:
                gridded_data_ref.convert_unit(gridded_data.unit)
            except Exception:
                # first placeholder refers to the reference data, second one
                # to the data that is analysed
                raise DataUnitError('Failed to merge data unit of reference '
                                    'gridded data object ({}) to data unit '
                                    'of gridded data object ({})'.format(
                                        gridded_data_ref.unit,
                                        gridded_data.unit))

    # get start / stop of gridded data as pandas.Timestamp
    grid_start = to_pandas_timestamp(gridded_data.start)
    grid_stop = to_pandas_timestamp(gridded_data.stop)

    grid_ts_type = gridded_data.ts_type

    if start is None:
        start = grid_start
    else:
        start = to_pandas_timestamp(start)
    if stop is None:
        stop = grid_stop
    else:
        stop = to_pandas_timestamp(stop)

    # check overlap
    if stop < grid_start or start > grid_stop:
        raise TimeMatchError('Input time range {}-{} does not '
                             'overlap with data range: {}-{}'.format(
                                 start, stop, grid_start, grid_stop))

    gridded_data = gridded_data.crop(time_range=(start, stop))
    gridded_data_ref = gridded_data_ref.crop(time_range=(start, stop))

    # get both objects in same time resolution
    gridded_data = gridded_data.downscale_time(ts_type)
    gridded_data_ref = gridded_data_ref.downscale_time(ts_type)

    # guess bounds (for area weighted regridding, which is the default)
    gridded_data._check_lonlat_bounds()
    gridded_data_ref._check_lonlat_bounds()

    # perform regridding
    gridded_data = gridded_data.regrid(gridded_data_ref,
                                       scheme=regrid_scheme)

    # perform region extraction (if applicable)
    regfilter = Filter(name=filter_name)
    gridded_data = regfilter(gridded_data)
    gridded_data_ref = regfilter(gridded_data_ref)

    if not gridded_data.shape == gridded_data_ref.shape:
        raise ColocationError('Shape mismatch between two colocated data '
                              'arrays, please debug')

    meta = {
        'data_source': [gridded_data_ref.name, gridded_data.name],
        # same order as data_source (reference first), since var_name is
        # used as coordinate along the data_source dimension below
        'var_name': [gridded_data_ref.var_name, gridded_data.var_name],
        'ts_type': ts_type,
        'filter_name': filter_name,
        'ts_type_src': grid_ts_type,
        'ts_type_src_ref': gridded_data_ref.ts_type,
        'start_str': to_datestring_YYYYMMDD(start),
        'stop_str': to_datestring_YYYYMMDD(stop),
        'unit': str(gridded_data.unit),
        'data_level': 'colocated',
        'revision_ref': gridded_data_ref.data_revision,
    }

    meta.update(regfilter.to_dict())

    # masked values are converted to NaN in the output array (for both
    # objects, since np.asarray would silently drop the mask otherwise)
    data = gridded_data.grid.data
    if isinstance(data, np.ma.core.MaskedArray):
        data = data.filled(np.nan)
    data_ref = gridded_data_ref.grid.data
    if isinstance(data_ref, np.ma.core.MaskedArray):
        data_ref = data_ref.filled(np.nan)

    arr = np.asarray((data_ref, data))
    time = gridded_data.time_stamps().astype('datetime64[ns]')

    # create coordinates of DataArray
    coords = {
        'data_source': meta['data_source'],
        'var_name': ('data_source', meta['var_name']),
        'time': time,
        'longitude': gridded_data.longitude.points,
        'latitude': gridded_data.latitude.points,
    }
    dims = ['data_source', 'time', 'latitude', 'longitude']

    return ColocatedData(data=arr, coords=coords, dims=dims,
                         name=gridded_data.var_name, attrs=meta)
def colocate_gridded_ungridded(gridded_data, ungridded_data, ts_type='daily',
                               start=None, stop=None,
                               filter_name='WORLD-wMOUNTAINS', var_ref=None,
                               vert_scheme=None, **kwargs):
    """Colocate gridded with ungridded data of 2D data

    2D means, that the vertical direction is only sampled at one altitude or
    the variable is of integrated nature (or averaged) so that the
    dimensionality of the grid data is (or can be -> cf. input parameter
    `vert_scheme`) reduced to dimensionality time, lat, lon.

    Note
    ----
    Uses the variable that is contained in input :class:`GriddedData` object
    (since these objects only contain a single variable)

    Parameters
    ----------
    gridded_data : GriddedData
        gridded data (e.g. model results)
    ungridded_data : UngriddedData
        ungridded data (e.g. observations)
    ts_type : str
        desired temporal resolution of colocated data (must be valid AeroCom
        ts_type str such as daily, monthly, yearly..)
    start : :obj:`str` or :obj:`datetime64` or similar, optional
        start time for colocation, if None, the start time of the input
        :class:`GriddedData` object is used
    stop : :obj:`str` or :obj:`datetime64` or similar, optional
        stop time for colocation, if None, the stop time of the input
        :class:`GriddedData` object is used
    filter_name : str
        string specifying filter used (cf. :class:`pyaerocom.filter.Filter`
        for details). Default is 'WORLD-wMOUNTAINS', which corresponds to no
        filtering (world with mountains). Use WORLD-noMOUNTAINS to exclude
        stations at altitudes exceeding 1000 m.
    var_ref : :obj:`str`, optional
        variable against which data in :attr:`gridded_data` is supposed to be
        compared. If None, then the same variable is used
        (i.e. `gridded_data.var_name`).
    vert_scheme : str
        string specifying scheme used to reduce the dimensionality in case
        input grid data contains vertical dimension. Example schemes are
        `mean, surface, altitude`, for details see
        :func:`GriddedData.to_time_series`.
    **kwargs
        additional keyword args (not used here, but included such that factory
        class can handle different methods with different inputs)

    Returns
    -------
    ColocatedData
        instance of colocated data

    Raises
    ------
    VarNotAvailableError
        if grid data variable is not available in ungridded data object
    AttributeError
        if instance of input :class:`UngriddedData` object contains more than
        one dataset
    DataUnitError
        if the unit of the gridded data cannot be converted into the unit of
        the ungridded data
    TimeMatchError
        if gridded data time range does not overlap with input time range
    ColocationError
        if none of the data points in input :class:`UngriddedData` matches
        the input colocation constraints
    """
    var = gridded_data.var_info.var_name
    if var_ref is None:
        var_ref = var

    # validate the ungridded input first, so that the documented exceptions
    # are raised before the unit of var_ref is looked up below
    if not var_ref in ungridded_data.contains_vars:
        raise VarNotAvailableError('Variable {} is not available in ungridded '
                                   'data (which contains {})'.format(
                                       var_ref, ungridded_data.contains_vars))
    elif len(ungridded_data.contains_datasets) > 1:
        raise AttributeError('Colocation can only be performed with '
                             'ungridded data objects that only contain a '
                             'single dataset. Use method `extract_dataset` of '
                             'UngriddedData object to extract single datasets')

    if gridded_data.var_info.has_unit:
        if not gridded_data.unit == ungridded_data.unit[var_ref]:
            try:
                gridded_data.convert_unit(ungridded_data.unit[var_ref])
            except Exception as err:
                raise DataUnitError('Failed to merge data unit of '
                                    'gridded data object ({}) to data unit '
                                    'of ungridded data object ({})'.format(
                                        gridded_data.unit,
                                        ungridded_data.unit[var_ref])) from err

    dataset_ref = ungridded_data.contains_datasets[0]

    # get start / stop of gridded data as pandas.Timestamp
    grid_start = to_pandas_timestamp(gridded_data.start)
    grid_stop = to_pandas_timestamp(gridded_data.stop)
    grid_ts_type = gridded_data.ts_type

    if start is None:
        start = grid_start
    else:
        start = to_pandas_timestamp(start)
    if stop is None:
        stop = grid_stop
    else:
        stop = to_pandas_timestamp(stop)

    # check overlap
    if stop < grid_start or start > grid_stop:
        raise TimeMatchError('Input time range {}-{} does not '
                             'overlap with data range: {}-{}'.format(
                                 start, stop, grid_start, grid_stop))

    # create instance of Filter class (may, in the future, also include all
    # filter options, e.g. start, stop, variables, only land, only oceans, and
    # may also be linked with other data object, e.g. if data is only supposed
    # to be used if other data object exceeds a certain threshold... but for
    # now, only region and altitude range)
    regfilter = Filter(name=filter_name)

    # apply filter to data
    ungridded_data = regfilter(ungridded_data)
    ungridded_lons = ungridded_data.longitude
    ungridded_lats = ungridded_data.latitude

    # crop time
    gridded_data = gridded_data.crop(time_range=(start, stop))

    # downscale time (if applicable)
    grid_data = gridded_data.downscale_time(to_ts_type=ts_type)

    # convert gridded data to time series at the observation coordinates
    grid_stat_data = grid_data.to_time_series(longitude=ungridded_lons,
                                              latitude=ungridded_lats,
                                              vert_scheme=vert_scheme)

    # pandas / numpy frequency strings for TS type
    freq_pd = TS_TYPE_TO_PANDAS_FREQ[ts_type]
    freq_np = TS_TYPE_TO_NUMPY_FREQ[ts_type]

    # truncate start to the resolution of the output frequency
    start = pd.Timestamp(start.to_datetime64().astype(
        'datetime64[{}]'.format(freq_np)))

    obs_stat_data = ungridded_data.to_station_data_all(
        vars_to_convert=var_ref,
        start=start,
        stop=stop,
        freq=freq_pd,
        interp_nans=False)

    obs_vals = []
    grid_vals = []
    lons = []
    lats = []
    alts = []
    station_names = []

    # TIME INDEX ARRAY FOR COLLOCATED DATA OBJECT
    # (pd.date_range replaces the DatetimeIndex(start=, end=, freq=)
    # constructor, which is no longer available in recent pandas versions)
    TIME_IDX = pd.date_range(start=start, end=stop, freq=freq_pd)

    ts_type_src_ref = None
    for i, obs_data in enumerate(obs_stat_data):
        if obs_data is not None:
            if ts_type_src_ref is None:
                ts_type_src_ref = obs_data['ts_type_src']
            elif not obs_data['ts_type_src'] == ts_type_src_ref:
                raise ValueError(
                    'Cannot perform colocation. Ungridded data '
                    'object contains different source frequencies')

            # get observations (Note: the index of the observation time series
            # is already in the specified frequency format, and thus, does not
            # need to be updated, for details (or if errors occur), cf.
            # UngriddedData.to_station_data, where the conversion happens)
            obs_tseries = obs_data[var_ref]

            # get model data corresponding to station
            grid_tseries = grid_stat_data[i][var]
            if grid_tseries.isnull().any():
                raise Exception('DEVELOPER: PLEASE DEBUG AND FIND SOLUTION')
            elif not len(grid_tseries) == len(TIME_IDX):
                raise Exception('DEVELOPER: PLEASE DEBUG AND FIND SOLUTION')

            # make sure, time index is defined in the right way (i.e.
            # according to TIME_INDEX, e.g. if ts_type='monthly', it should
            # not be the mid or end of month)
            grid_tseries = pd.Series(grid_tseries.values, index=TIME_IDX)

            # the following command takes care of filling up with NaNs where
            # data is missing
            df = pd.DataFrame({'ungridded': obs_tseries,
                               'gridded': grid_tseries},
                              index=TIME_IDX)

            obs_vals.append(df['ungridded'].values)
            grid_vals.append(df['gridded'].values)

            lons.append(obs_data.longitude)
            lats.append(obs_data.latitude)
            alts.append(obs_data.altitude)
            station_names.append(obs_data.station_name)

    if len(obs_vals) == 0:
        raise ColocationError('No observations could be found that match '
                              'the colocation constraints')

    try:
        revision = ungridded_data.data_revision[dataset_ref]
    except Exception:
        # revision is optional metadata, fall back to placeholder
        revision = 'n/a'

    # NOTE: list-valued entries are ordered [reference (obs), gridded],
    # matching the order of the data_source dimension of the output array
    meta = {
        'data_source': [dataset_ref, gridded_data.name],
        'var_name': [var_ref, var],
        'ts_type': ts_type,
        'filter_name': filter_name,
        'ts_type_src': grid_ts_type,
        'ts_type_src_ref': ts_type_src_ref,
        'start_str': to_datestring_YYYYMMDD(start),
        'stop_str': to_datestring_YYYYMMDD(stop),
        'unit': str(gridded_data.unit),
        'data_level': 'colocated',
        'revision_ref': revision
    }
    meta.update(regfilter.to_dict())

    grid_vals = np.asarray(grid_vals)
    obs_vals = np.asarray(obs_vals)

    # stack to (2, station, time) and swap to (2, time, station)
    arr = np.array((obs_vals, grid_vals))
    arr = np.swapaxes(arr, 1, 2)

    # create coordinates of DataArray
    coords = {
        'data_source': meta['data_source'],
        'var_name': ('data_source', meta['var_name']),
        'time': TIME_IDX,
        'station_name': station_names,
        'latitude': ('station_name', lats),
        'longitude': ('station_name', lons),
        'altitude': ('station_name', alts)
    }
    dims = ['data_source', 'time', 'station_name']

    data = ColocatedData(data=arr, coords=coords, dims=dims, name=var,
                         attrs=meta)
    return data
def colocate_gridded_ungridded(gridded_data, ungridded_data, ts_type=None,
                               start=None, stop=None, filter_name=None,
                               regrid_res_deg=None, remove_outliers=True,
                               vert_scheme=None, harmonise_units=True,
                               regrid_scheme='areaweighted', var_ref=None,
                               var_outlier_ranges=None,
                               var_ref_outlier_ranges=None,
                               update_baseyear_gridded=None,
                               ignore_station_names=None,
                               apply_time_resampling_constraints=None,
                               min_num_obs=None, colocate_time=False,
                               var_keep_outliers=True,
                               var_ref_keep_outliers=False,
                               use_climatology_ref=False,
                               resample_how=None, **kwargs):
    """Colocate gridded with ungridded data (low level method)

    For high-level colocation see
    :class:`pyaerocom.colocation_auto.Colocator` and
    :class:`pyaerocom.colocation_auto.ColocationSetup`

    Note
    ----
    Uses the variable that is contained in input :class:`GriddedData` object
    (since these objects only contain a single variable). If this variable
    is not contained in observation data (or contained but using a different
    variable name) you may specify the obs variable to be used via input arg
    `var_ref`

    Parameters
    ----------
    gridded_data : GriddedData
        gridded data object (e.g. model results).
    ungridded_data : UngriddedData
        ungridded data object (e.g. observations).
    ts_type : str
        desired temporal resolution of colocated data (must be valid AeroCom
        ts_type str such as daily, monthly, yearly.).
    start : :obj:`str` or :obj:`datetime64` or similar, optional
        start time for colocation, if None, the start time of the input
        :class:`GriddedData` object is used.
    stop : :obj:`str` or :obj:`datetime64` or similar, optional
        stop time for colocation, if None, the stop time of the input
        :class:`GriddedData` object is used
    filter_name : str
        string specifying filter used (cf. :class:`pyaerocom.filter.Filter`
        for details). If None, then it is set to 'WORLD-wMOUNTAINS', which
        corresponds to no filtering (world with mountains).
        Use WORLD-noMOUNTAINS to exclude mountain sites.
    regrid_res_deg : int or dict, optional
        regrid resolution in degrees. If specified, the input gridded data
        object will be regridded in lon / lat dimension to the input
        resolution (if input is integer, both lat and lon are regridded to
        that resolution, if input is dict, use keys `lat_res_deg` and
        `lon_res_deg` to specify regrid resolutions, respectively).
    remove_outliers : bool
        if True, outliers are removed from model and obs data before
        colocation, else not. Outlier ranges can be specified via input args
        `var_outlier_ranges` and `var_ref_outlier_ranges`.
    vert_scheme : str
        string specifying scheme used to reduce the dimensionality in case
        input grid data contains vertical dimension. Example schemes are
        `mean, surface, altitude`, for details see
        :func:`GriddedData.to_time_series`.
    harmonise_units : bool
        if True, units are attempted to be harmonised (note: raises Exception
        if True and units cannot be harmonised).
    regrid_scheme : str
        regridding scheme used if `regrid_res_deg` is specified.
    var_ref : :obj:`str`, optional
        variable against which data in :attr:`gridded_data` is supposed to be
        compared. If None, then the same variable is used
        (i.e. `gridded_data.var_name`).
    var_outlier_ranges : dict, optional
        dictionary specifying outlier ranges for dataset to be analysed
        (e.g. dict(od550aer = [-0.05, 10], ang4487aer=[0,4])). If None, then
        the pyaerocom default outlier ranges are used for the input variable.
        Defaults to None.
    var_ref_outlier_ranges : dict, optional
        like `var_outlier_ranges` but for reference dataset.
    update_baseyear_gridded : int, optional
        optional input that can be set in order to re-define the time
        dimension in the gridded data object to be analysed. E.g., if the
        data object is a climatology (one year of data) that has set the base
        year of the time dimension to a value other than the specified input
        start / stop time this may be used to update the time in order to
        make colocation possible.
    ignore_station_names : str or list, optional
        station name or pattern or list of station names or patterns that
        should be ignored
    apply_time_resampling_constraints : bool, optional
        if True, then time resampling constraints are applied as provided via
        :attr:`min_num_obs` or if that one is unspecified, as defined in
        :attr:`pyaerocom.const.OBS_MIN_NUM_RESAMPLE`. If None, than
        :attr:`pyaerocom.const.OBS_APPLY_TIME_RESAMPLE_CONSTRAINTS` is used
        (which defaults to True !!).
    min_num_obs : int or dict, optional
        minimum number of observations for resampling of time
    colocate_time : bool
        if True and if original time resolution of data is higher than
        desired time resolution (`ts_type`), then both datasets are colocated
        in time *before* resampling to lower resolution.
    var_keep_outliers : bool
        if True, then no outliers will be removed from dataset to be
        analysed, even if `remove_outliers` is True. That is because for
        model evaluation often only outliers are supposed to be removed in
        the observations but not in the model.
    var_ref_keep_outliers : bool
        if True, then no outliers will be removed from the reference dataset,
        even if `remove_outliers` is True.
    use_climatology_ref : bool
        if True, climatological timeseries are used from observations
    resample_how : str or dict
        string specifying how data should be aggregated when resampling in
        time. Default is "mean". Can also be a nested dictionary, e.g.
        resample_how={'daily': {'hourly' : 'max'}} would use the maximum
        value to aggregate from hourly to daily, rather than the mean.
    **kwargs
        additional keyword args (passed to
        :func:`UngriddedData.to_station_data_all`)

    Returns
    -------
    ColocatedData
        instance of colocated data

    Raises
    ------
    VarNotAvailableError
        if grid data variable is not available in ungridded data object
    AttributeError
        if instance of input :class:`UngriddedData` object contains more than
        one dataset
    TimeMatchError
        if gridded data time range does not overlap with input time range
    ColocationError
        if none of the data points in input :class:`UngriddedData` matches
        the input colocation constraints
    """
    if var_outlier_ranges is None:
        var_outlier_ranges = {}
    if var_ref_outlier_ranges is None:
        var_ref_outlier_ranges = {}

    if filter_name is None:
        filter_name = const.DEFAULT_REG_FILTER

    try:
        gridded_data.check_dimcoords_tseries()
    except DimensionOrderError:
        gridded_data.reorder_dimensions_tseries()

    var = gridded_data.var_name
    aerocom_var = gridded_data.var_name_aerocom

    _check_var_registered(var, aerocom_var, gridded_data)

    if var_ref is None:
        if aerocom_var is not None:
            var_ref = aerocom_var
        else:
            var_ref = var

    if remove_outliers:
        low, high, low_ref, high_ref = None, None, None, None
        if var in var_outlier_ranges:
            low, high = var_outlier_ranges[var]
        if var_ref in var_ref_outlier_ranges:
            low_ref, high_ref = var_ref_outlier_ranges[var_ref]

    if not var_ref in ungridded_data.contains_vars:
        raise VarNotAvailableError('Variable {} is not available in ungridded '
                                   'data (which contains {})'.format(
                                       var_ref, ungridded_data.contains_vars))
    elif len(ungridded_data.contains_datasets) > 1:
        raise AttributeError('Colocation can only be performed with '
                             'ungridded data objects that only contain a '
                             'single dataset. Use method `extract_dataset` of '
                             'UngriddedData object to extract single datasets')

    dataset_ref = ungridded_data.contains_datasets[0]

    if update_baseyear_gridded is not None:
        # update time dimension in gridded data
        gridded_data.base_year = update_baseyear_gridded

    grid_ts_type_src = gridded_data.ts_type
    grid_ts_type = TsType(gridded_data.ts_type)
    if isinstance(ts_type, str):
        ts_type = TsType(ts_type)
    if ts_type is None or grid_ts_type < ts_type:
        ts_type = grid_ts_type
    elif grid_ts_type > ts_type and not colocate_time:
        gridded_data = gridded_data.resample_time(
            str(ts_type),
            apply_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            how=resample_how)
        grid_ts_type = ts_type

    # get start / stop of gridded data as pandas.Timestamp
    grid_start = to_pandas_timestamp(gridded_data.start)
    grid_stop = to_pandas_timestamp(gridded_data.stop)

    if start is None:
        start = grid_start
    else:
        start = to_pandas_timestamp(start)
    if stop is None:
        stop = grid_stop
    else:
        stop = to_pandas_timestamp(stop)

    # clamp requested period to period covered by gridded data
    if start < grid_start:
        start = grid_start
    if stop > grid_stop:
        stop = grid_stop

    # check overlap
    if stop < grid_start or start > grid_stop:
        raise TimeMatchError('Input time range {}-{} does not '
                             'overlap with data range: {}-{}'.format(
                                 start, stop, grid_start, grid_stop))

    # create instance of Filter class (may, in the future, also include all
    # filter options, e.g. start, stop, variables, only land, only oceans, and
    # may also be linked with other data object, e.g. if data is only supposed
    # to be used if other data object exceeds a certain threshold... but for
    # now, only region and altitude range)
    regfilter = Filter(name=filter_name)

    # apply filter to data
    ungridded_data = regfilter.apply(ungridded_data)
    gridded_data = regfilter.apply(gridded_data)

    # crop time
    if start > grid_start or stop < grid_stop:
        gridded_data = gridded_data.crop(time_range=(start, stop))

    if regrid_res_deg is not None:
        gridded_data = _regrid_gridded(gridded_data, regrid_scheme,
                                       regrid_res_deg)

    if remove_outliers and not var_ref_keep_outliers:
        #called twice if used via Colocator, this should go out here
        ungridded_data.remove_outliers(var_ref, inplace=True,
                                       low=low_ref,
                                       high=high_ref)

    if use_climatology_ref:
        col_freq = 'monthly'
        obs_start = const.CLIM_START
        obs_stop = const.CLIM_STOP
    else:
        col_freq = str(grid_ts_type)
        obs_start = start
        obs_stop = stop

    # restrict observations to the area covered by the gridded data
    latitude = gridded_data.latitude.points
    longitude = gridded_data.longitude.points
    lat_range = [np.min(latitude), np.max(latitude)]
    lon_range = [np.min(longitude), np.max(longitude)]
    ungridded_data = ungridded_data.filter_by_meta(latitude=lat_range,
                                                   longitude=lon_range)

    # get timeseries from all stations in provided time resolution
    # (time resampling is done below in main loop)
    all_stats = ungridded_data.to_station_data_all(
        vars_to_convert=var_ref,
        start=obs_start,
        stop=obs_stop,
        by_station_name=True,
        ignore_index=ignore_station_names,
        **kwargs)

    obs_stat_data = all_stats['stats']
    ungridded_lons = all_stats['longitude']
    ungridded_lats = all_stats['latitude']

    if len(obs_stat_data) == 0:
        raise VarNotAvailableError('Variable {} is not available in specified '
                                   'time interval ({}-{})'.format(
                                       var_ref, start, stop))

    # make sure the gridded data is in the right dimension
    if gridded_data.ndim > 3:
        if vert_scheme is None:
            vert_scheme = 'mean'
        if not vert_scheme in gridded_data.SUPPORTED_VERT_SCHEMES:
            raise ValueError(
                'Vertical scheme {} is not supported'.format(vert_scheme))

    grid_stat_data = gridded_data.to_time_series(longitude=ungridded_lons,
                                                 latitude=ungridded_lats,
                                                 vert_scheme=vert_scheme)

    pd_freq = TsType(col_freq).to_pandas_freq()
    time_idx = make_datetime_index(start, stop, pd_freq)

    # uninitialised array, every station column is written in the loop below
    coldata = np.empty((2, len(time_idx), len(obs_stat_data)))

    lons = []
    lats = []
    alts = []
    station_names = []

    ungridded_unit = None
    ts_type_src_ref = None
    if not harmonise_units:
        gridded_unit = str(gridded_data.units)
    else:
        gridded_unit = None

    # loop over all stations and append to colocated data object
    for i, obs_stat in enumerate(obs_stat_data):
        # ToDo: consider removing to keep ts_type_src_ref (this was probably
        # introduced for EBAS were the original data frequency is not constant
        # but can vary from site to site)
        if ts_type_src_ref is None:
            ts_type_src_ref = obs_stat['ts_type_src']
        elif obs_stat['ts_type_src'] != ts_type_src_ref:
            spl = ts_type_src_ref.split(';')
            if not obs_stat['ts_type_src'] in spl:
                spl.append(obs_stat['ts_type_src'])
            ts_type_src_ref = ';'.join(spl)

        if ungridded_unit is None:
            try:
                ungridded_unit = obs_stat['var_info'][var_ref]['units']
            except KeyError as e:
                # variable information or unit is not defined
                logger.exception(repr(e))
        try:
            unit = obs_stat['var_info'][var_ref]['units']
        except Exception:
            unit = None
        if not unit == ungridded_unit:
            raise ValueError(
                'Cannot perform colocation. Ungridded data '
                'object contains different units ({})'.format(var_ref))

        # get observations (Note: the index of the observation time series
        # is already in the specified frequency format, and thus, does not
        # need to be updated, for details (or if errors occur), cf.
        # UngriddedData.to_station_data, where the conversion happens)

        # get model station data
        grid_stat = grid_stat_data[i]
        if harmonise_units:
            grid_unit = grid_stat.get_unit(var)
            obs_unit = obs_stat.get_unit(var_ref)
            if not grid_unit == obs_unit:
                grid_stat.convert_unit(var, obs_unit)
            if gridded_unit is None:
                gridded_unit = obs_unit

        if remove_outliers and not var_keep_outliers:
            # don't check if harmonise_units is active, because the
            # remove_outliers method checks units based on AeroCom default
            # variables, and a variable mapping might be active, i.e.
            # sometimes models use abs550aer for absorption coefficients
            # with units [m-1] and not for AAOD (which is the AeroCom default
            # and unitless. Hence, unit check in remove_outliers works only
            # if the variable name (and unit) corresonds to AeroCom default)
            grid_stat.remove_outliers(var, low=low, high=high,
                                      check_unit=True)

        _df = _colocate_site_data_helper(
            stat_data=grid_stat,
            stat_data_ref=obs_stat,
            var=var,
            var_ref=var_ref,
            ts_type=col_freq,
            resample_how=resample_how,
            apply_time_resampling_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            use_climatology_ref=use_climatology_ref)

        # this try/except block was introduced on 23/2/2021 as temporary fix from
        # v0.10.0 -> v0.10.1 as a result of multi-weekly obsdata (EBAS) that
        # can end up resulting in incorrect number of timestamps after resampling
        # (the error was discovered using EBASMC, concpm10, 2019 and colocation
        # frequency monthly)
        try:
            # assign the unified timeseries data to the colocated data array
            coldata[0, :, i] = _df['ref'].values
            coldata[1, :, i] = _df['data'].values
        except ValueError as e:
            # coldata was allocated via np.empty, thus, the column of this
            # station needs to be invalidated explicitly, otherwise it would
            # contain arbitrary (uninitialised) values
            coldata[:, :, i] = np.nan
            const.print_log.warning(
                f'Failed to colocate time for station {obs_stat.station_name}. '
                f'This station will be skipped (error: {e})')

        # station coordinates are registered for every station (also failed
        # ones) since the station dimension of coldata has fixed length
        lons.append(obs_stat.longitude)
        lats.append(obs_stat.latitude)
        alts.append(obs_stat.altitude)
        station_names.append(obs_stat.station_name)

    try:
        revision = ungridded_data.data_revision[dataset_ref]
    except Exception:
        try:
            revision = ungridded_data._get_data_revision_helper(dataset_ref)
        except MetaDataError:
            revision = 'MULTIPLE'
        except Exception:
            revision = 'n/a'

    files = [os.path.basename(x) for x in gridded_data.from_files]

    # list-valued entries are ordered [reference (obs), gridded]
    meta = {
        'data_source': [dataset_ref, gridded_data.name],
        'var_name': [var_ref, var],
        'ts_type': col_freq,  # will be updated below if resampling
        'filter_name': filter_name,
        'ts_type_src': [ts_type_src_ref, grid_ts_type_src],
        'start_str': to_datestring_YYYYMMDD(start),
        'stop_str': to_datestring_YYYYMMDD(stop),
        'var_units': [ungridded_unit, gridded_unit],
        'vert_scheme': vert_scheme,
        'data_level': 3,
        'revision_ref': revision,
        'from_files': files,
        'from_files_ref': None,
        'stations_ignored': ignore_station_names,
        'colocate_time': colocate_time,
        'obs_is_clim': use_climatology_ref,
        'pyaerocom': pya_ver,
        'apply_constraints': apply_time_resampling_constraints,
        'min_num_obs': min_num_obs,
        'outliers_removed': remove_outliers
    }
    meta.update(regfilter.to_dict())

    # create coordinates of DataArray
    coords = {
        'data_source': meta['data_source'],
        'var_name': ('data_source', meta['var_name']),
        'var_units': ('data_source', meta['var_units']),
        'ts_type_src': ('data_source', meta['ts_type_src']),
        'time': time_idx,
        'station_name': station_names,
        'latitude': ('station_name', lats),
        'longitude': ('station_name', lons),
        'altitude': ('station_name', alts)
    }
    dims = ['data_source', 'time', 'station_name']

    data = ColocatedData(data=coldata, coords=coords, dims=dims, name=var,
                         attrs=meta)

    # add correct units for lat / lon dimensions
    data.latitude.attrs['standard_name'] = gridded_data.latitude.standard_name
    data.latitude.attrs['units'] = str(gridded_data.latitude.units)

    data.longitude.attrs['standard_name'] = gridded_data.longitude.standard_name
    data.longitude.attrs['units'] = str(gridded_data.longitude.units)

    # resample to output frequency if colocation was done in a different one
    if col_freq != str(ts_type):
        data = data.resample_time(
            to_ts_type=ts_type,
            colocate_time=colocate_time,
            apply_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            how=resample_how,
            **kwargs)
    return data
def start(self, value):
    """Set start time

    The input is converted using :func:`to_datestring_YYYYMMDD` and the
    result is stored in attribute `_start`.
    """
    start_str = to_datestring_YYYYMMDD(value)
    self._start = start_str
def stop(self, value):
    """Set stop time

    The input is converted using :func:`to_datestring_YYYYMMDD` and the
    result is stored in attribute `_stop`.
    """
    stop_str = to_datestring_YYYYMMDD(value)
    self._stop = stop_str