Example 1
def test_filter_ungriddeddata(aeronetsunv3lev2_subset, filter_name,
                              num_sites):

    obs_data = aeronetsunv3lev2_subset

    f = Filter(filter_name)
    num = len(f.apply(obs_data).unique_station_names)
    assert num == num_sites
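
The arguments filter_name and num_sites (and the aeronetsunv3lev2_subset fixture) are injected by pytest; the parametrization itself is not part of this snippet. A minimal sketch of what it could look like, assuming the fixture is defined in conftest.py; the filter names follow the naming documented for pyaerocom.filter.Filter, while the expected site counts are placeholders rather than values from the real test suite:

import pytest

# hypothetical parametrization for the test above; the site counts are
# placeholders, not the numbers used in the actual test suite
@pytest.mark.parametrize('filter_name,num_sites', [
    ('WORLD-wMOUNTAINS', 22),   # no filtering (world incl. mountain sites)
    ('WORLD-noMOUNTAINS', 19),  # mountain sites excluded
])
def test_filter_ungriddeddata(aeronetsunv3lev2_subset, filter_name, num_sites):
    ...  # body as in the example above
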
Example 2
def test_filter_griddeddata(data_tm5, filter_name, mean):

    # use copy so that this fixture can be used elsewhere without being
    # changed by this method globally
    model = data_tm5.copy()

    f = Filter(filter_name) # europe only land

    subset = f.apply(model)
    np.testing.assert_allclose(np.nanmean(subset.cube.data), mean)
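
The same Filter API works on gridded model fields outside of a test context. A minimal sketch, assuming a pyaerocom GriddedData object named model has already been loaded; the filter name EUROPE-noMOUNTAINS is an illustrative region filter and not taken from the test parametrization:

import numpy as np
from pyaerocom.filter import Filter

def regional_mean(model, filter_name='EUROPE-noMOUNTAINS'):
    # work on a copy so that the caller's GriddedData object is not modified
    subset = Filter(filter_name).apply(model.copy())
    # collapse the cropped field to a single number, ignoring missing values
    return np.nanmean(subset.cube.data)
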
Example 3
def colocate_gridded_ungridded(gridded_data,
                               ungridded_data,
                               ts_type=None,
                               start=None,
                               stop=None,
                               filter_name=None,
                               regrid_res_deg=None,
                               remove_outliers=True,
                               vert_scheme=None,
                               harmonise_units=True,
                               regrid_scheme='areaweighted',
                               var_ref=None,
                               var_outlier_ranges=None,
                               var_ref_outlier_ranges=None,
                               update_baseyear_gridded=None,
                               ignore_station_names=None,
                               apply_time_resampling_constraints=None,
                               min_num_obs=None,
                               colocate_time=False,
                               var_keep_outliers=True,
                               var_ref_keep_outliers=False,
                               use_climatology_ref=False,
                               resample_how=None,
                               **kwargs):
    """Colocate gridded with ungridded data (low level method)

    For high-level colocation see :class:`pyaerocom.colocation_auto.Colocator`
    and :class:`pyaerocom.colocation_auto.ColocationSetup`

    Note
    ----
    Uses the variable that is contained in the input :class:`GriddedData`
    object (since these objects only contain a single variable). If this
    variable is not contained in the observation data (or is contained but
    under a different variable name), you may specify the obs variable to be
    used via the input arg `var_ref`.

    Parameters
    ----------
    gridded_data : GriddedData
        gridded data object (e.g. model results).
    ungridded_data : UngriddedData
        ungridded data object (e.g. observations).
    ts_type : str
        desired temporal resolution of colocated data (must be a valid AeroCom
        ts_type str such as daily, monthly, yearly).
    start : :obj:`str` or :obj:`datetime64` or similar, optional
        start time for colocation, if None, the start time of the input
        :class:`GriddedData` object is used.
    stop : :obj:`str` or :obj:`datetime64` or similar, optional
        stop time for colocation, if None, the stop time of the input
        :class:`GriddedData` object is used
    filter_name : str
        string specifying filter used (cf. :class:`pyaerocom.filter.Filter` for
        details). If None, then it is set to 'WORLD-wMOUNTAINS', which
        corresponds to no filtering (world with mountains).
        Use WORLD-noMOUNTAINS to exclude mountain sites.
    regrid_res_deg : int or dict, optional
        regrid resolution in degrees. If specified, the input gridded data
        object will be regridded in lon / lat dimension to the input
        resolution (if input is integer, both lat and lon are regridded to that
        resolution, if input is dict, use keys `lat_res_deg` and `lon_res_deg`
        to specify regrid resolutions, respectively).
    remove_outliers : bool
        if True, outliers are removed from model and obs data before colocation,
        else not. Outlier ranges can be specified via input args
        `var_outlier_ranges` and `var_ref_outlier_ranges`.
    vert_scheme : str
        string specifying scheme used to reduce the dimensionality in case
        input grid data contains vertical dimension. Example schemes are
        `mean, surface, altitude`, for details see
        :func:`GriddedData.to_time_series`.
    harmonise_units : bool
        if True, units are attempted to be harmonised (note: raises Exception
        if True and units cannot be harmonised).
    var_ref : :obj:`str`, optional
        variable against which data in :attr:`gridded_data` is supposed to be
        compared. If None, then the same variable is used
        (i.e. `gridded_data.var_name`).
    var_outlier_ranges : dict, optional
        dictionary specifying outlier ranges for dataset to be analysed
        (e.g. dict(od550aer = [-0.05, 10], ang4487aer=[0,4])). If None, then
        the pyaerocom default outlier ranges are used for the input variable.
        Defaults to None.
    var_ref_outlier_ranges : dict, optional
        like `var_outlier_ranges` but for reference dataset.
    update_baseyear_gridded : int, optional
        optional input that can be set in order to re-define the time dimension
        in the gridded data object to be analysed. E.g., if the data object
        is a climatology (one year of data) whose time dimension has a base
        year other than the specified input start / stop time, this may be
        used to update the time in order to make colocation possible.
    ignore_station_names : str or list, optional
        station name or pattern or list of station names or patterns that should
        be ignored
    apply_time_resampling_constraints : bool, optional
        if True, then time resampling constraints are applied as provided via
        :attr:`min_num_obs` or if that one is unspecified, as defined in
        :attr:`pyaerocom.const.OBS_MIN_NUM_RESAMPLE`. If None, then
        :attr:`pyaerocom.const.OBS_APPLY_TIME_RESAMPLE_CONSTRAINTS` is used
        (which defaults to True!).
    min_num_obs : int or dict, optional
        minimum number of observations for resampling of time
    colocate_time : bool
        if True and if original time resolution of data is higher than desired
        time resolution (`ts_type`), then both datasets are colocated in time
        *before* resampling to lower resolution.
    var_keep_outliers : bool
        if True, then no outliers will be removed from dataset to be analysed,
        even if `remove_outliers` is True. That is because for model evaluation
        often only outliers are supposed to be removed in the observations but
        not in the model.
    var_ref_keep_outliers : bool
        if True, then no outliers will be removed from the reference dataset,
        even if `remove_outliers` is True.
    use_climatology_ref : bool
        if True, climatological timeseries are used from observations
    resample_how : str or dict
        string specifying how data should be aggregated when resampling in time.
        Default is "mean". Can also be a nested dictionary, e.g.
        resample_how={'daily': {'hourly' : 'max'}} would use the maximum value
        to aggregate from hourly to daily, rather than the mean.
    **kwargs
        additional keyword args (passed to
        :func:`UngriddedData.to_station_data_all`)

    Returns
    -------
    ColocatedData
        instance of colocated data

    Raises
    ------
    VarNotAvailableError
        if the grid data variable is not available in the ungridded data object
    AttributeError
        if the input :class:`UngriddedData` object contains more than one
        dataset
    TimeMatchError
        if the gridded data time range does not overlap with the input time
        range
    ColocationError
        if none of the data points in the input :class:`UngriddedData` matches
        the input colocation constraints
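
    Example
    -------
    A minimal usage sketch (assuming a model field is available as a
    :class:`GriddedData` object ``model`` and observations as an
    :class:`UngriddedData` object ``obs``; the variable and filter names
    below are illustrative)::

        coldata = colocate_gridded_ungridded(model, obs,
                                              ts_type='monthly',
                                              filter_name='WORLD-noMOUNTAINS',
                                              var_ref='od550aer')
        print(coldata)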
    """
    if var_outlier_ranges is None:
        var_outlier_ranges = {}
    if var_ref_outlier_ranges is None:
        var_ref_outlier_ranges = {}

    if filter_name is None:
        filter_name = const.DEFAULT_REG_FILTER

    try:
        gridded_data.check_dimcoords_tseries()
    except DimensionOrderError:
        gridded_data.reorder_dimensions_tseries()

    var = gridded_data.var_name
    aerocom_var = gridded_data.var_name_aerocom

    _check_var_registered(var, aerocom_var, gridded_data)

    if var_ref is None:
        if aerocom_var is not None:
            var_ref = aerocom_var
        else:
            var_ref = var

    if remove_outliers:
        low, high, low_ref, high_ref = None, None, None, None
        if var in var_outlier_ranges:
            low, high = var_outlier_ranges[var]
        if var_ref in var_ref_outlier_ranges:
            low_ref, high_ref = var_ref_outlier_ranges[var_ref]

    if var_ref not in ungridded_data.contains_vars:
        raise VarNotAvailableError('Variable {} is not available in ungridded '
                                   'data (which contains {})'.format(
                                       var_ref, ungridded_data.contains_vars))
    elif len(ungridded_data.contains_datasets) > 1:
        raise AttributeError('Colocation can only be performed with '
                             'ungridded data objects that only contain a '
                             'single dataset. Use method `extract_dataset` of '
                             'UngriddedData object to extract single datasets')

    dataset_ref = ungridded_data.contains_datasets[0]

    if update_baseyear_gridded is not None:
        # update time dimension in gridded data
        gridded_data.base_year = update_baseyear_gridded

    grid_ts_type_src = gridded_data.ts_type
    grid_ts_type = TsType(gridded_data.ts_type)
    if isinstance(ts_type, str):
        ts_type = TsType(ts_type)
    if ts_type is None or grid_ts_type < ts_type:
        ts_type = grid_ts_type
    elif grid_ts_type > ts_type and not colocate_time:
        gridded_data = gridded_data.resample_time(
            str(ts_type),
            apply_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            how=resample_how)
        grid_ts_type = ts_type

    # get start / stop of gridded data as pandas.Timestamp
    grid_start = to_pandas_timestamp(gridded_data.start)
    grid_stop = to_pandas_timestamp(gridded_data.stop)

    if start is None:
        start = grid_start
    else:
        start = to_pandas_timestamp(start)
    if stop is None:
        stop = grid_stop
    else:
        stop = to_pandas_timestamp(stop)

    if start < grid_start:
        start = grid_start
    if stop > grid_stop:
        stop = grid_stop
    # check overlap
    if stop < grid_start or start > grid_stop:
        raise TimeMatchError('Input time range {}-{} does not '
                             'overlap with data range: {}-{}'.format(
                                 start, stop, grid_start, grid_stop))
    # create instance of Filter class (may, in the future, also include all
    # filter options, e.g. start, stop, variables, only land, only oceans, and
    # may also be linked with other data object, e.g. if data is only supposed
    # to be used if other data object exceeds a certain threshold... but for
    # now, only region and altitude range)
    regfilter = Filter(name=filter_name)

    # apply filter to data
    ungridded_data = regfilter.apply(ungridded_data)

    #crop time
    gridded_data = regfilter.apply(gridded_data)
    if start > grid_start or stop < grid_stop:
        gridded_data = gridded_data.crop(time_range=(start, stop))

    if regrid_res_deg is not None:
        gridded_data = _regrid_gridded(gridded_data, regrid_scheme,
                                       regrid_res_deg)

    if remove_outliers and not var_ref_keep_outliers:  # called twice if used via Colocator; this should be moved out of here
        ungridded_data.remove_outliers(var_ref,
                                       inplace=True,
                                       low=low_ref,
                                       high=high_ref)

    if use_climatology_ref:
        col_freq = 'monthly'
        obs_start = const.CLIM_START
        obs_stop = const.CLIM_STOP
    else:
        col_freq = str(grid_ts_type)  #TS_TYPE_TO_PANDAS_FREQ[grid_ts_type]
        obs_start = start
        obs_stop = stop

    latitude = gridded_data.latitude.points
    longitude = gridded_data.longitude.points
    lat_range = [np.min(latitude), np.max(latitude)]
    lon_range = [np.min(longitude), np.max(longitude)]
    ungridded_data = ungridded_data.filter_by_meta(latitude=lat_range,
                                                   longitude=lon_range)

    # get timeseries from all stations in provided time resolution
    # (time resampling is done below in main loop)
    all_stats = ungridded_data.to_station_data_all(
        vars_to_convert=var_ref,
        start=obs_start,
        stop=obs_stop,
        by_station_name=True,
        ignore_index=ignore_station_names,
        **kwargs)

    obs_stat_data = all_stats['stats']
    ungridded_lons = all_stats['longitude']
    ungridded_lats = all_stats['latitude']

    if len(obs_stat_data) == 0:
        raise VarNotAvailableError('Variable {} is not available in specified '
                                   'time interval ({}-{})'.format(
                                       var_ref, start, stop))
    # make sure the gridded data is in the right dimension
    if gridded_data.ndim > 3:
        if vert_scheme is None:
            vert_scheme = 'mean'
        if vert_scheme not in gridded_data.SUPPORTED_VERT_SCHEMES:
            raise ValueError(
                'Vertical scheme {} is not supported'.format(vert_scheme))

    grid_stat_data = gridded_data.to_time_series(longitude=ungridded_lons,
                                                 latitude=ungridded_lats,
                                                 vert_scheme=vert_scheme)

    pd_freq = TsType(col_freq).to_pandas_freq()
    time_idx = make_datetime_index(start, stop, pd_freq)

    coldata = np.empty((2, len(time_idx), len(obs_stat_data)))

    lons = []
    lats = []
    alts = []
    station_names = []

    ungridded_unit = None
    ts_type_src_ref = None
    if not harmonise_units:
        gridded_unit = str(gridded_data.units)
    else:
        gridded_unit = None

    # loop over all stations and append to colocated data object
    for i, obs_stat in enumerate(obs_stat_data):
        # ToDo: consider removing to keep ts_type_src_ref (this was probably
        # introduced for EBAS where the original data frequency is not constant
        # but can vary from site to site)
        if ts_type_src_ref is None:
            ts_type_src_ref = obs_stat['ts_type_src']
        elif obs_stat['ts_type_src'] != ts_type_src_ref:
            spl = ts_type_src_ref.split(';')
            if obs_stat['ts_type_src'] not in spl:
                spl.append(obs_stat['ts_type_src'])
            ts_type_src_ref = ';'.join(spl)

        if ungridded_unit is None:
            try:
                ungridded_unit = obs_stat['var_info'][var_ref]['units']
            except KeyError as e:  #variable information or unit is not defined
                logger.exception(repr(e))
        try:
            unit = obs_stat['var_info'][var_ref]['units']
        except Exception:
            unit = None
        if unit != ungridded_unit:
            raise ValueError(
                'Cannot perform colocation. Ungridded data object contains '
                'different units for variable {} ({} vs. {})'.format(
                    var_ref, unit, ungridded_unit))
        # get observations (Note: the index of the observation time series
        # is already in the specified frequency format, and thus, does not
        # need to be updated, for details (or if errors occur), cf.
        # UngriddedData.to_station_data, where the conversion happens)

        # get model station data
        grid_stat = grid_stat_data[i]
        if harmonise_units:
            grid_unit = grid_stat.get_unit(var)
            obs_unit = obs_stat.get_unit(var_ref)
            if grid_unit != obs_unit:
                grid_stat.convert_unit(var, obs_unit)
            if gridded_unit is None:
                gridded_unit = obs_unit

        if remove_outliers and not var_keep_outliers:
            # don't check if harmonise_units is active, because the
            # remove_outliers method checks units based on AeroCom default
            # variables, and a variable mapping might be active, i.e.
            # sometimes models use abs550aer for absorption coefficients
            # with units [m-1] and not for AAOD (which is the AeroCom default
            # and unitless). Hence, the unit check in remove_outliers works only
            # if the variable name (and unit) corresponds to the AeroCom default.
            #chk_unit = not harmonise_units
            grid_stat.remove_outliers(var, low=low, high=high, check_unit=True)

        _df = _colocate_site_data_helper(
            stat_data=grid_stat,
            stat_data_ref=obs_stat,
            var=var,
            var_ref=var_ref,
            ts_type=col_freq,
            resample_how=resample_how,
            apply_time_resampling_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            use_climatology_ref=use_climatology_ref)

        # this try/except block was introduced on 23/2/2021 as a temporary fix
        # from v0.10.0 -> v0.10.1, as a result of multi-weekly obsdata (EBAS)
        # that can result in an incorrect number of timestamps after resampling
        # (the error was discovered using EBASMC, concpm10, 2019 and colocation
        # frequency monthly)
        try:
            # assign the unified timeseries data to the colocated data array
            coldata[0, :, i] = _df['ref'].values
            coldata[1, :, i] = _df['data'].values
        except ValueError as e:
            const.print_log.warning(
                f'Failed to colocate time for station {obs_stat.station_name}. '
                f'This station will be skipped (error: {e})')

        lons.append(obs_stat.longitude)
        lats.append(obs_stat.latitude)
        alts.append(obs_stat.altitude)
        station_names.append(obs_stat.station_name)

    try:
        revision = ungridded_data.data_revision[dataset_ref]
    except Exception:
        try:
            revision = ungridded_data._get_data_revision_helper(dataset_ref)
        except MetaDataError:
            revision = 'MULTIPLE'
        except Exception:
            revision = 'n/a'

    files = [os.path.basename(x) for x in gridded_data.from_files]

    meta = {
        'data_source': [dataset_ref, gridded_data.name],
        'var_name': [var_ref, var],
        'ts_type': col_freq,  # will be updated below if resampling
        'filter_name': filter_name,
        'ts_type_src': [ts_type_src_ref, grid_ts_type_src],
        'start_str': to_datestring_YYYYMMDD(start),
        'stop_str': to_datestring_YYYYMMDD(stop),
        'var_units': [ungridded_unit, gridded_unit],
        'vert_scheme': vert_scheme,
        'data_level': 3,
        'revision_ref': revision,
        'from_files': files,
        'from_files_ref': None,
        'stations_ignored': ignore_station_names,
        'colocate_time': colocate_time,
        'obs_is_clim': use_climatology_ref,
        'pyaerocom': pya_ver,
        'apply_constraints': apply_time_resampling_constraints,
        'min_num_obs': min_num_obs,
        'outliers_removed': remove_outliers
    }

    meta.update(regfilter.to_dict())

    # create coordinates of DataArray
    coords = {
        'data_source': meta['data_source'],
        'var_name': ('data_source', meta['var_name']),
        'var_units': ('data_source', meta['var_units']),
        'ts_type_src': ('data_source', meta['ts_type_src']),
        'time': time_idx,
        'station_name': station_names,
        'latitude': ('station_name', lats),
        'longitude': ('station_name', lons),
        'altitude': ('station_name', alts)
    }

    dims = ['data_source', 'time', 'station_name']
    data = ColocatedData(data=coldata,
                         coords=coords,
                         dims=dims,
                         name=var,
                         attrs=meta)

    # add correct units for lat / lon dimensions
    data.latitude.attrs['standard_name'] = gridded_data.latitude.standard_name
    data.latitude.attrs['units'] = str(gridded_data.latitude.units)

    data.longitude.attrs[
        'standard_name'] = gridded_data.longitude.standard_name
    data.longitude.attrs['units'] = str(gridded_data.longitude.units)

    if col_freq != str(ts_type):
        data = data.resample_time(
            to_ts_type=ts_type,
            colocate_time=colocate_time,
            apply_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            how=resample_how,
            **kwargs)
    return data