Ejemplo n.º 1
0
def get_lowest_resolution(ts_type, *ts_types):
    """Return the lowest temporal resolution among the given ts_type codes.

    Every input is wrapped in a :class:`pyaerocom.tstype.TsType` and the
    instances are compared with ``<``. If several inputs compare equal, the
    one that was passed first is kept.

    Parameters
    ----------
    ts_type : str
        first ts_type code
    *ts_types
        any number of additional ts_type codes

    Returns
    -------
    str
        ``val`` attribute of the TsType instance that compares lowest

    Raises
    ------
    ValueError
        if one of the input ts_type codes is not supported (according to the
        original documentation; the error originates from ``TsType``, which
        is not visible here)
    """
    # function-scope import, kept as in the original implementation
    from pyaerocom.tstype import TsType

    # lazily build the TsType objects in input order; min() keeps the first
    # of several equal candidates, just like a manual "if new < lowest" loop
    candidates = (TsType(code) for code in (ts_type, *ts_types))
    return min(candidates).val
Ejemplo n.º 2
0
def _init_data_default_frequencies(coldata, colocation_settings):
    """Provide colocated data in daily, monthly and yearly resolution.

    Parameters
    ----------
    coldata
        colocated data object; must expose ``ts_type`` and ``copy()``
    colocation_settings
        settings forwarded unchanged to ``_resample_time_coldata``

    Returns
    -------
    tuple
        2-element tuple ``(data_arrs, jsdate)``. Both are dictionaries keyed
        by ``'daily'``, ``'monthly'`` and ``'yearly'``. Entries for
        frequencies that are finer than the (possibly daily-resampled) input
        stay ``None``.

    Raises
    ------
    TemporalResolutionError
        if the resolution of ``coldata`` compares lower than monthly
    """
    to_ts_types = ['daily', 'monthly', 'yearly']
    data_arrs = {freq: None for freq in to_ts_types}
    jsdate = {freq: None for freq in to_ts_types}

    tt = TsType(coldata.ts_type)

    if tt < TsType('monthly'):
        raise TemporalResolutionError(
            'Temporal resolution ({}) is too low for '
            'web processing, need monthly or higher'.format(tt))

    if tt > TsType('daily'):
        # anything finer than daily is first brought down to daily
        coldata = _resample_time_coldata(coldata, 'daily', colocation_settings)
        tt = TsType('daily')

    for freq in to_ts_types:
        target = TsType(freq)
        if tt < target:
            # data is coarser than the target frequency -> leave entry None
            continue
        if tt == target:
            data_arrs[freq] = coldata.copy()
            jsdate[freq] = _get_jsdate(coldata)
        else:
            resampled = _resample_time_coldata(coldata, freq,
                                               colocation_settings)
            data_arrs[freq] = resampled
            jsdate[freq] = _get_jsdate(resampled)

    return (data_arrs, jsdate)
Ejemplo n.º 3
0
def test_next_lower():
    """Check ``TsType.next_lower`` at the coarse end and for '3minutely'.

    The 'yearly' lookup is expected to raise ``IndexError``. Unlike a bare
    try/except, the ``else`` branch makes the test fail when no exception is
    raised at all.
    """
    try:
        TsType('yearly').next_lower
    except Exception as e:
        assert type(e) is IndexError
    else:
        raise AssertionError(
            "TsType('yearly').next_lower did not raise IndexError")

    assert str(TsType('3minutely').next_lower) == 'hourly'
Ejemplo n.º 4
0
def test_next_higher():
    """Check ``TsType.next_higher`` at the fine end and for several codes.

    The 'minutely' lookup is expected to raise ``IndexError``. Unlike a bare
    try/except, the ``else`` branch makes the test fail when no exception is
    raised at all.
    """
    try:
        TsType('minutely').next_higher
    except Exception as e:
        assert type(e) is IndexError
    else:
        raise AssertionError(
            "TsType('minutely').next_higher did not raise IndexError")

    assert str(TsType('3minutely').next_higher) == 'minutely'
    assert str(TsType('hourly').next_higher) == 'minutely'
    assert str(TsType('monthly').next_higher) == 'weekly'
Ejemplo n.º 5
0
def test_basic_operators_pandas():
    """Exercise the comparison operators of TsType built from 'MS', 'AS', 'D'."""
    monthly, yearly, daily = (TsType(code) for code in ('MS', 'AS', 'D'))

    # monthly vs. daily
    assert monthly < daily
    assert monthly <= daily
    assert monthly != daily

    # yearly vs. daily
    assert yearly < daily
    assert not (yearly == daily)

    # monthly vs. yearly
    assert monthly > yearly
    assert monthly >= yearly
Ejemplo n.º 6
0
def test_basic_operators():
    """Exercise the comparison operators of TsType built from plain codes."""
    monthly, yearly, daily = (
        TsType(code) for code in ('monthly', 'yearly', 'daily'))

    # monthly vs. daily
    assert monthly < daily
    assert monthly <= daily
    assert monthly != daily

    # yearly vs. daily
    assert yearly < daily
    assert not (yearly == daily)

    # monthly vs. yearly
    assert monthly > yearly
    assert monthly >= yearly
Ejemplo n.º 7
0
def get_tot_number_of_seconds(ts_type, dtime=None):
    """Return the number of seconds associated with a ts_type code.

    Parameters
    ----------
    ts_type : str
        temporal resolution code; used to build a ``TsType`` and as key into
        ``TS_TYPE_SECS``
    dtime : optional
        time information, required when ``TsType(ts_type)`` compares greater
        than or equal to ``TsType('monthly')``

    Returns
    -------
    ``TS_TYPE_SECS[ts_type]`` if ``TsType(ts_type)`` compares lower than
    monthly. Otherwise ``None``: the computation from ``dtime`` is not
    implemented in this version.

    Raises
    ------
    AttributeError
        if ``dtime`` is required but was not provided
    """
    # function-scope import, kept as in the original implementation
    from pyaerocom.tstype import TsType

    ts_tpe = TsType(ts_type)

    # NOTE: the reference code used to be misspelled ('montly')
    if ts_tpe >= TsType('monthly'):
        if dtime is None:
            raise AttributeError(
                'For frequncies larger than or eq. monthly you ' +
                'need to provide dtime in order to compute the number of second.  '
            )
        # find seconds from dtime (placeholder: nothing is computed yet)
        return None

    return TS_TYPE_SECS[ts_type]
Ejemplo n.º 8
0
def correct_time_coord(cube, ts_type, year):
    """Replace the time dimension coordinate of an iris Cube.

    A new time coordinate is generated that starts on 1 January of `year`
    and has one entry per element along the time axis of the cube, spaced
    according to `ts_type`.

    Parameters
    ----------
    cube : Cube
        cube containing data
    ts_type : TsType or str
        temporal resolution of data (e.g. "hourly", "daily"); strings are
        converted to TsType. This information is e.g. encoded in the
        filename of a NetCDF file and may be accessed using
        :class:`pyaerocom.io.FileConventionRead`
    year : int
        integer specifying start year, e.g. 2017

    Returns
    -------
    Cube
        the same instance of the input cube with corrected time dimension
        axis

    Raises
    ------
    NetcdfError
        if the index of the time dimension cannot be identified
    """
    if isinstance(ts_type, str):
        ts_type = TsType(ts_type)

    # look for an existing dimension coordinate named 'time' and remember the
    # lengths of all dimension coordinates on the way
    time_axis = None
    known_lens = []
    for axis, coord in enumerate(cube.dim_coords):
        known_lens.append(len(coord.points))
        if coord.name() == 'time':
            time_axis = axis

    # fallback: if the cube has more dimensions than dimension coordinates,
    # take the (last) axis whose length matches none of the known coordinates
    if time_axis is None and cube.ndim != len(cube.dim_coords):
        for axis, axis_len in enumerate(cube.shape):
            if axis_len not in known_lens:
                time_axis = axis

    if time_axis is None:
        raise NetcdfError('Failed to identify data index of time dimension in '
                          'cube {}'.format(repr(cube)))

    unit_name = ts_type.cf_base_unit
    dt64_dtype = ts_type.datetime64_str
    tunit_str = '%s since %s-01-01 00:00:00' % (unit_name, year)
    n_steps = cube.shape[time_axis]

    tunit = cf_units.Unit(tunit_str, calendar=cf_units.CALENDAR_STANDARD)
    td64_dtype = ts_type.timedelta64_str
    first_stamp = datetime64("%s-01-01 00:00:00" % year).astype(dt64_dtype)
    stamps = first_stamp + arange(0, n_steps, 1).astype(td64_dtype)
    # see this thread https://github.com/matplotlib/matplotlib/issues/2259/
    stamps_dt = stamps.astype("datetime64[s]").astype(datetime)
    time_nums = list(map(tunit.date2num, stamps_dt))
    new_coord = iris.coords.DimCoord(time_nums, standard_name='time',
                                     units=tunit)

    # best effort: an existing 'time' coordinate is dropped, any failure to
    # do so is ignored
    try:
        cube.remove_coord('time')
    except Exception:
        pass
    cube.add_dim_coord(new_coord, time_axis)
    return cube
Ejemplo n.º 9
0
 def check_validity(self, file):
     """Validate ts_type and year extracted from a file name.

     Parameters
     ----------
     file : str
         file path; passed to ``self.get_info_from_file``, and only its
         basename shows up in error messages

     Raises
     ------
     FileConventionError
         if ``TsType.valid`` rejects the extracted ts_type, or if the
         extracted year is outside ``[const.MIN_YEAR, const.MAX_YEAR]``
     """
     info = self.get_info_from_file(file)
     year = info["year"]
     ts_type = info['ts_type']

     if not TsType.valid(ts_type):
         raise FileConventionError("Invalid ts_type %s in filename %s" %
                                   (ts_type, basename(file)))

     if const.MIN_YEAR <= year <= const.MAX_YEAR:
         return

     raise FileConventionError("Invalid year %d in filename %s" %
                               (year, basename(file)))
Ejemplo n.º 10
0
def get_tot_number_of_seconds(ts_type, dtime=None):
    """Return the number of seconds associated with a ts_type code.

    Parameters
    ----------
    ts_type : str
        temporal resolution code; used to build a ``TsType`` and as key into
        ``TS_TYPE_SECS``
    dtime : optional
        time information, required when ``TsType(ts_type)`` compares greater
        than or equal to ``TsType('monthly')``. The code reads
        ``dtime.dt.daysinmonth`` (presumably a pandas / xarray datetime
        accessor -- confirm against callers).

    Returns
    -------
    ``TS_TYPE_SECS[ts_type]`` if ``TsType(ts_type)`` compares lower than
    monthly; for ``ts_type == 'monthly'`` the days per month from ``dtime``
    multiplied by 86400.

    Raises
    ------
    AttributeError
        if ``dtime`` is required but was not provided
    NotImplementedError
        if ``dtime`` is required and ``ts_type`` is not ``'monthly'``.
        Previously this case failed with an UnboundLocalError because the
        result variable was never assigned.
    """
    # function-scope import, kept as in the original implementation
    from pyaerocom.tstype import TsType

    ts_tpe = TsType(ts_type)

    if ts_tpe >= TsType('monthly'):
        if dtime is None:
            raise AttributeError(
                'For frequncies larger than or eq. monthly you' +
                ' need to provide dtime in order to compute the number of second.'
            )
        # find seconds from dtime
        # TODO generalize this
        if ts_type == 'monthly':
            return dtime.dt.daysinmonth * 24 * 60 * 60
        raise NotImplementedError(
            'Computing the number of seconds from dtime is only implemented '
            'for monthly data, got {}'.format(ts_type))

    return TS_TYPE_SECS[ts_type]
Ejemplo n.º 11
0
def _check_correct_time_dim(cube, file, file_convention=None):
    """Check the time coordinate of a cube and try to repair it if invalid.

    Parameters
    ----------
    cube
        data cube to be checked (passed to ``check_time_coord`` and, if
        needed, to ``correct_time_coord``)
    file : str
        path of the file the cube was read from; ts_type and year are
        extracted from it via the file convention
    file_convention : FileConventionRead, optional
        file naming convention; if None, it is inferred from `file`

    Returns
    -------
    the input cube, or the result of ``correct_time_coord`` if a correction
    was attempted and succeeded

    Raises
    ------
    FileConventionError
        if no valid file convention is available, or if ts_type or year
        extracted from the file name are invalid
    UnresolvableTimeDefinitionError
        re-raised if ``check_time_coord`` raises it
    """
    if file_convention is None:
        try:
            file_convention = FileConventionRead(from_file=file)
        except Exception:
            # a failed lookup is reported by the isinstance check below
            pass

    if not isinstance(file_convention, FileConventionRead):
        raise FileConventionError(
            'Unknown file convention: {}'.format(file_convention))

    finfo = file_convention.get_info_from_file(file)

    # keep the raw value around so that the error message can always be
    # built, even when the TsType constructor is what fails
    ts_type_raw = None
    try:
        ts_type_raw = finfo['ts_type']
        ts_type = TsType(ts_type_raw)
    except Exception as e:
        raise FileConventionError(
            'Invalid ts_type in file: {}'.format(ts_type_raw)) from e
    year = finfo['year']

    if not const.MIN_YEAR <= year <= const.MAX_YEAR:
        raise FileConventionError('Invalid year in file: {}'.format(year))
    try:
        check_time_coord(cube, ts_type, year)
    except UnresolvableTimeDefinitionError as e:
        raise UnresolvableTimeDefinitionError(repr(e)) from e
    except Exception:
        msg = ("Invalid time dimension coordinate in file {}. ".format(
            os.path.basename(file)))
        logger.warning(msg)
        if const.GRID_IO.CORRECT_TIME_FILENAME:
            logger.warning("Attempting to correct time coordinate "
                           "using information in file name")
            try:
                cube = correct_time_coord(cube,
                                          ts_type=finfo["ts_type"],
                                          year=finfo["year"])
            except Exception as e:
                # still best effort, but no longer silent: the cube is
                # returned uncorrected
                logger.warning("Failed to correct time coordinate: "
                               "{}".format(repr(e)))
        if const.WRITE_FILEIO_ERR_LOG:
            add_file_to_log(file, 'Invalid time dimension')
    return cube
Ejemplo n.º 12
0
def test_to_timedelta64(ts_type, ref_time_str, np_dt_str, output_str):
    """Adding ``TsType(ts_type).to_timedelta64()`` to a reference time must
    yield a timestamp whose string representation equals `output_str`."""
    reference = np.datetime64(ref_time_str, np_dt_str)
    shifted = reference + TsType(ts_type).to_timedelta64()
    assert str(shifted) == output_str
Ejemplo n.º 13
0
    def _run_gridded_ungridded(self, var_name=None):
        """Analysis method for gridded vs. ungridded data

        Reads the observations once, then loops over all matching
        model / obs variable pairs, reads the model data for each pair and
        colocates it with the observations.

        Parameters
        ----------
        var_name : optional
            forwarded to ``self._find_var_matches`` and not used otherwise
            in this method

        Returns
        -------
        dict
            colocated data objects, keyed by model variable name. Pairs that
            were skipped or that failed (with ``self.raise_exceptions``
            disabled) are missing.

        Raises
        ------
        DataCoverageError
            if none of ``self.obs_vars`` is provided by the obs reader
        Exception
            if reading model data or the colocation fails and
            ``self.raise_exceptions`` is set
        """
        model_reader = ReadGridded(self.model_id)

        obs_reader = ReadUngridded(self.obs_id)

        obs_vars_supported = obs_reader.get_reader(
            self.obs_id).PROVIDES_VARIABLES

        # restrict requested obs variables to those the reader provides
        obs_vars = list(np.intersect1d(self.obs_vars, obs_vars_supported))

        if len(obs_vars) == 0:
            raise DataCoverageError(
                'No observation variable matches found for '
                '{}'.format(self.obs_id))

        var_matches = self._find_var_matches(obs_vars, model_reader, var_name)

        if self.read_opts_ungridded is not None:
            ropts = self.read_opts_ungridded
        else:
            ropts = {}
        # observations are read once, for all variables
        obs_data = obs_reader.read(datasets_to_read=self.obs_id,
                                   vars_to_retrieve=obs_vars,
                                   **ropts)
        if 'obs_filters' in self:
            remaining_filters = self._eval_obs_filters()
            obs_data = obs_data.apply_filters(**remaining_filters)

        if self.remove_outliers:
            self._update_var_outlier_ranges(var_matches)

        #all_ts_types = const.GRID_IO.TS_TYPES

        data_objs = {}
        for model_var, obs_var in var_matches.items():

            # start from the requested resolution for every variable pair
            ts_type = self.ts_type
            start, stop = start_stop(self.start, self.stop)
            print_log.info('Running {} / {} ({}, {})'.format(
                self.model_id, self.obs_id, model_var, obs_var))
            try:
                model_data = self._read_gridded(reader=model_reader,
                                                var_name=model_var,
                                                start=start,
                                                stop=stop,
                                                is_model=True)
            except Exception as e:

                msg = (
                    'Failed to load gridded data: {} / {}. Reason {}'.format(
                        self.model_id, model_var, repr(e)))
                const.print_log.warning(msg)
                self._write_log(msg + '\n')

                # either abort the whole run or move on to the next pair
                if self.raise_exceptions:
                    self._close_log()
                    raise Exception(msg)
                else:
                    continue
            ts_type_src = model_data.ts_type
            # =============================================================================
            #             if not model_data.ts_type in all_ts_types:
            #                 raise TemporalResolutionError('Invalid temporal resolution {} '
            #                                               'in model {}'.format(model_data.ts_type,
            #                                                                    self.model_id))
            # =============================================================================
            # ignore_station_names may either apply to all variables or be a
            # dict with entries per obs variable
            ignore_stats = None
            if self.ignore_station_names is not None:
                ignore_stats = self.ignore_station_names
                if isinstance(ignore_stats, dict):
                    if obs_var in ignore_stats:
                        ignore_stats = ignore_stats[obs_var]
                    else:
                        ignore_stats = None

            #ts_type_src = model_data.ts_type
            # if the model data compares lower than the requested ts_type,
            # the model resolution is used instead
            if TsType(ts_type_src) < TsType(
                    ts_type):  # < all_ts_types.index(ts_type_src):
                print_log.info('Updating ts_type from {} to {} (highest '
                               'available in model {})'.format(
                                   ts_type, ts_type_src, self.model_id))
                ts_type = ts_type_src

            if self.save_coldata:
                savename = self._coldata_savename(model_data,
                                                  start,
                                                  stop,
                                                  ts_type,
                                                  var_name=model_var)

                file_exists = self._check_coldata_exists(
                    model_data.data_id, savename)

                out_dir = chk_make_subdir(self.basedir_coldata, self.model_id)
                if file_exists:
                    if not self.reanalyse_existing:
                        # note: log entry, info message and file_status
                        # update only happen if self._log is set; the pair
                        # is skipped in any case
                        if self._log:
                            self._write_log('SKIP: {}\n'.format(savename))
                            print_log.info('Skip {} (file already '
                                           'exists)'.format(savename))
                            self.file_status[savename] = 'skipped'
                        continue
                    else:
                        print_log.info(
                            'Deleting and recomputing existing '
                            'colocated data file {}'.format(savename))
                        print_log.info('REMOVE: {}\n'.format(savename))
                        os.remove(os.path.join(out_dir, savename))

            try:
                # base year update for the gridded data is only requested
                # in climatology mode
                by = None
                if self.model_use_climatology:
                    by = start.year
                coldata = colocate_gridded_ungridded(
                    gridded_data=model_data,
                    ungridded_data=obs_data,
                    ts_type=ts_type,
                    start=start,
                    stop=stop,
                    var_ref=obs_var,
                    filter_name=self.filter_name,
                    regrid_res_deg=self.regrid_res_deg,
                    remove_outliers=self.remove_outliers,
                    vert_scheme=self.vert_scheme,
                    harmonise_units=self.harmonise_units,
                    var_outlier_ranges=self.var_outlier_ranges,
                    var_ref_outlier_ranges=self.var_ref_outlier_ranges,
                    update_baseyear_gridded=by,
                    ignore_station_names=ignore_stats,
                    apply_time_resampling_constraints=self.
                    apply_time_resampling_constraints,
                    min_num_obs=self.min_num_obs,
                    colocate_time=self.colocate_time,
                    var_keep_outliers=self.model_keep_outliers,
                    var_ref_keep_outliers=self.obs_keep_outliers)

                # savename / out_dir were set in the save_coldata block above
                if self.save_coldata:
                    self._save_coldata(coldata, savename, out_dir, model_var,
                                       model_data, obs_var)
                data_objs[model_var] = coldata
            except Exception as e:
                msg = ('Colocation between model {} / {} and obs {} / {} '
                       'failed: Reason {}'.format(self.model_id, model_var,
                                                  self.obs_id, obs_var,
                                                  repr(e)))
                const.print_log.warning(msg)
                self._write_log(msg + '\n')
                if self.raise_exceptions:
                    self._close_log()
                    raise Exception(msg)

        return data_objs
Ejemplo n.º 14
0
    def resample(self, to_ts_type, input_data=None, from_ts_type=None,
                 how=None, apply_constraints=None,
                 min_num_obs=None, **kwargs):
        """Resample input data

        Parameters
        ----------
        to_ts_type : str or pyaerocom.tstype.TsType
            output resolution
        input_data : pandas.Series or xarray.DataArray
            data to be resampled; if None, the data already assigned to
            :attr:`input_data` is used
        from_ts_type : str or pyaerocom.tstype.TsType, optional
            current resolution of the data. Required for hierarchical
            resampling; if None, resampling is done without constraints.
        how : str
            string specifying how the data is to be aggregated, default is
            mean
        apply_constraints : bool, optional
            if True, hierarchical resampling is applied using input
            `min_num_obs` (if provided) or else, using
            :attr:`SAMPLING_CONSTRAINTS`. If None, :attr:`APPLY_CONSTRAINTS`
            is used.
        min_num_obs : dict or int, optional
            integer or nested dictionary specifying minimum number of
            observations required to resample from higher to lower frequency.
            For instance, if `input_data` is hourly and `to_ts_type` is
            monthly, you may specify something like::

                min_num_obs =
                    {'monthly'  :   {'daily'  : 7},
                     'daily'    :   {'hourly' : 6}}

            to require at least 6 hours per day and 7 days per month.

        **kwargs
           additional input arguments passed to resampling method (only in
           the modes that do not apply constraints)

        Returns
        -------
        pandas.Series or xarray.DataArray
            resampled data object

        Raises
        ------
        ValueError
            if no data is available, if `how` is not a string in
            unconstrained mode, or if `from_ts_type` is neither str nor
            TsType
        TemporalResolutionError
            if `to_ts_type` compares higher than `from_ts_type`
        """
        if how is None:
            how = 'mean'

        if not isinstance(to_ts_type, TsType):
            to_ts_type = TsType(to_ts_type)

        if input_data is not None:
            self.input_data = input_data
        if self.input_data is None:
            raise ValueError('Please provide data (Series or DataArray)')

        if apply_constraints is None:
            apply_constraints = self.APPLY_CONSTRAINTS

        # updated at the very end if constraints are actually applied
        self.last_setup = dict(apply_constraints=False,
                               min_num_obs=None,
                               how=how)

        if not apply_constraints or from_ts_type is None:
            freq = to_ts_type.to_pandas_freq()
            if not isinstance(how, str):
                raise ValueError('Temporal resampling without constraints can '
                                 'only use string type argument how (e.g. '
                                 'how=mean). Got {}'.format(how))
            return self.fun(self.input_data, freq=freq,
                            how=how, **kwargs)

        if isinstance(from_ts_type, str):
            from_ts_type = TsType(from_ts_type)

        if not isinstance(from_ts_type, TsType):
            # the offending object is formatted directly: it is not a TsType
            # here, so it cannot be expected to have a ``val`` attribute
            raise ValueError('Invalid input for from_ts_type: {}. Need valid '
                             'str or TsType. Input arg from_ts_type is '
                             'required if resampling using hierarchical '
                             'constraints (arg apply_constraints) is activated'
                             .format(from_ts_type))

        if to_ts_type > from_ts_type:
            raise TemporalResolutionError('Cannot resample time-series from {} '
                                          'to {}'
                                          .format(from_ts_type, to_ts_type))
        elif to_ts_type == from_ts_type:
            const.logger.info('Input time frequency {} equals current frequency '
                              'of data. Resampling will be applied anyways '
                              'which will introduce NaN values at missing '
                              'time stamps'.format(to_ts_type.val))

            freq = to_ts_type.to_pandas_freq()
            # NOTE(review): this branch always aggregates with 'mean' and
            # ignores input `how` -- confirm that this is intended
            return self.fun(self.input_data, freq=freq, how='mean',
                            **kwargs)

        if min_num_obs is None:
            min_num_obs = self.SAMPLING_CONSTRAINTS

        # step-wise resampling through the intermediate frequencies
        _idx = self._gen_idx(from_ts_type, to_ts_type, min_num_obs, how)
        data = self.input_data
        for step_ts_type, mno, rshow in _idx:
            const.logger.info('TO: {} ({}, {})'.format(step_ts_type, mno,
                                                       rshow))
            freq = TsType(step_ts_type).to_pandas_freq()
            data = self.fun(data, freq=freq, how=rshow,
                            min_num_obs=mno)
        self.last_setup = dict(apply_constraints=True,
                               min_num_obs=min_num_obs,
                               how=how)
        return data
Ejemplo n.º 15
0
 def __init__(self, input_data=None):
     """Set up the instance and, optionally, assign data to be resampled.

     Parameters
     ----------
     input_data : optional
         data object assigned to :attr:`input_data`
     """
     self.last_setup = None
     self._input_data = None
     self.input_data = input_data

     # base ts_types are those codes whose multiplication factor is 1
     base_ts_types = []
     for code in const.GRID_IO.TS_TYPES:
         if TsType(code).mulfac == 1:
             base_ts_types.append(code)
     self.valid_base_ts_types = base_ts_types
Ejemplo n.º 16
0
    def resample(self,
                 to_ts_type,
                 input_data=None,
                 from_ts_type=None,
                 how='mean',
                 apply_constraints=None,
                 sampling_constraints=None,
                 **kwargs):
        """Resample input data to a different temporal resolution

        Parameters
        ----------
        to_ts_type : str or pyaerocom.tstype.TsType
            output resolution, must be one of :attr:`FREQS_SUPPORTED`
        input_data : pandas.Series or xarray.DataArray
            data to be resampled; if None, the data already assigned to
            :attr:`input_data` is used
        from_ts_type : str or pyaerocom.tstype.TsType, optional
            current resolution of the data, required if constraints are
            applied
        how : str
            string specifying how the data is to be aggregated, default is
            mean
        apply_constraints : bool, optional
            if True, hierarchical resampling is applied using input
            `sampling_constraints` (if provided) or else, using
            :attr:`SAMPLING_CONSTRAINTS`. If None, :attr:`APPLY_CONSTRAINTS`
            is used.
        sampling_constraints : dict
            nested dictionary specifying sampling constraints to be applied
            to data. For instance, if `input_data` is hourly and
            `to_ts_type` is monthly, you may specify something like::

                sampling_constraints =
                    {'monthly'  :   {'daily'  : 7},
                     'daily'    :   {'hourly' : 6}}

            to require at least 6 hours per day and 7 days per month.
        **kwargs
           additional input arguments passed to resampling method (only
           when no constraints are applied)

        Returns
        -------
        pandas.Series or xarray.DataArray
            resampled data object

        Raises
        ------
        NotImplementedError
            if the output resolution is not supported
        ValueError
            if no data is available or if `from_ts_type` is neither str nor
            TsType while constraints are applied
        """
        if isinstance(to_ts_type, TsType):
            target = to_ts_type
        else:
            target = TsType(to_ts_type)

        supported = self.FREQS_SUPPORTED
        if target.val not in supported:
            raise NotImplementedError(
                'Cannot resample to input frequency {}. Choose from: {}'
                .format(target, supported.keys()))

        if input_data is not None:
            self.input_data = input_data
        data = self.input_data
        if data is None:
            raise ValueError('Please provide data (Series or DataArray)')

        use_constraints = apply_constraints
        if use_constraints is None:
            use_constraints = self.APPLY_CONSTRAINTS

        # plain resampling in a single step
        if not use_constraints:
            return self.fun(data, freq=target.val, how=how, **kwargs)

        source = from_ts_type
        if isinstance(source, str):
            source = TsType(source)
        if not isinstance(source, TsType):
            raise ValueError(
                'Invalid input for from_ts_type: {}. Need valid str or '
                'TsType. Input arg from_ts_type is required if resampling '
                'using hierarchical constraints (arg apply_constraints) is '
                'activated'.format(source))

        constraints = sampling_constraints
        if constraints is None:
            constraints = self.SAMPLING_CONSTRAINTS

        # hierarchical resampling: one call per intermediate frequency
        steps = self._gen_idx(source, target, constraints)
        for step_freq, min_num_obs in steps:
            data = self.fun(data,
                            freq=step_freq,
                            how=how,
                            min_num_obs=min_num_obs)
        return data
Ejemplo n.º 17
0
def colocate_gridded_ungridded(gridded_data,
                               ungridded_data,
                               ts_type=None,
                               start=None,
                               stop=None,
                               filter_name=None,
                               regrid_res_deg=None,
                               remove_outliers=True,
                               vert_scheme=None,
                               harmonise_units=True,
                               regrid_scheme='areaweighted',
                               var_ref=None,
                               var_outlier_ranges=None,
                               var_ref_outlier_ranges=None,
                               update_baseyear_gridded=None,
                               ignore_station_names=None,
                               apply_time_resampling_constraints=None,
                               min_num_obs=None,
                               colocate_time=False,
                               var_keep_outliers=True,
                               var_ref_keep_outliers=False,
                               use_climatology_ref=False,
                               resample_how=None,
                               **kwargs):
    """Colocate gridded with ungridded data (low level method)

    For high-level colocation see :class:`pyaerocom.colocation_auto.Colocator`
    and :class:`pyaerocom.colocation_auto.ColocationSetup`

    Note
    ----
    Uses the variable that is contained in input :class:`GriddedData` object
    (since these objects only contain a single variable). If this variable
    is not contained in observation data (or contained but using a different
    variable name) you may specify the obs variable to be used via input arg
    `var_ref`

    Stations for which the colocated time series cannot be written into the
    output array (mismatch in number of timestamps) are kept in the output,
    but all their values are NaN.

    Parameters
    ----------
    gridded_data : GriddedData
        gridded data object (e.g. model results).
    ungridded_data : UngriddedData
        ungridded data object (e.g. observations).
    ts_type : str
        desired temporal resolution of colocated data (must be valid AeroCom
        ts_type str such as daily, monthly, yearly.).
    start : :obj:`str` or :obj:`datetime64` or similar, optional
        start time for colocation, if None, the start time of the input
        :class:`GriddedData` object is used.
    stop : :obj:`str` or :obj:`datetime64` or similar, optional
        stop time for colocation, if None, the stop time of the input
        :class:`GriddedData` object is used
    filter_name : str
        string specifying filter used (cf. :class:`pyaerocom.filter.Filter` for
        details). If None, then it is set to `const.DEFAULT_REG_FILTER`.
    regrid_res_deg : int or dict, optional
        regrid resolution in degrees. If specified, the input gridded data
        object will be regridded in lon / lat dimension to the input
        resolution (if input is integer, both lat and lon are regridded to that
        resolution, if input is dict, use keys `lat_res_deg` and `lon_res_deg`
        to specify regrid resolutions, respectively).
    remove_outliers : bool
        if True, outliers are removed from model and obs data before colocation,
        else not. Outlier ranges can be specified via input args
        `var_outlier_ranges` and `var_ref_outlier_ranges`.
    vert_scheme : str
        string specifying scheme used to reduce the dimensionality in case
        input grid data contains vertical dimension. Example schemes are
        `mean, surface, altitude`, for details see
        :func:`GriddedData.to_time_series`.
    harmonise_units : bool
        if True, units are attempted to be harmonised (note: raises Exception
        if True and units cannot be harmonised).
    regrid_scheme : str
        regridding scheme that is used if `regrid_res_deg` is specified.
    var_ref : :obj:`str`, optional
        variable against which data in :attr:`gridded_data` is supposed to be
        compared. If None, then the same variable is used
        (i.e. `gridded_data.var_name_aerocom` if available, else
        `gridded_data.var_name`).
    var_outlier_ranges : dict, optional
        dictionary specifying outlier ranges for dataset to be analysed
        (e.g. dict(od550aer = [-0.05, 10], ang4487aer=[0,4])). If None, then
        the pyaerocom default outlier ranges are used for the input variable.
        Defaults to None.
    var_ref_outlier_ranges : dict, optional
        like `var_outlier_ranges` but for reference dataset.
    update_baseyear_gridded : int, optional
        optional input that can be set in order to re-define the time dimension
        in the gridded data object to be analysed. E.g., if the data object
        is a climatology (one year of data) that has set the base year of the
        time dimension to a value other than the specified input start / stop
        time this may be used to update the time in order to make colocation
        possible.
    ignore_station_names : str or list, optional
        station name or pattern or list of station names or patterns that should
        be ignored
    apply_time_resampling_constraints : bool, optional
        if True, then time resampling constraints are applied as provided via
        :attr:`min_num_obs` or if that one is unspecified, as defined in
        :attr:`pyaerocom.const.OBS_MIN_NUM_RESAMPLE`. If None, then
        :attr:`pyaerocom.const.OBS_APPLY_TIME_RESAMPLE_CONSTRAINTS` is used
        (which defaults to True !!).
    min_num_obs : int or dict, optional
        minimum number of observations for resampling of time
    colocate_time : bool
        if True and if original time resolution of data is higher than desired
        time resolution (`ts_type`), then both datasets are colocated in time
        *before* resampling to lower resolution.
    var_keep_outliers : bool
        if True, then no outliers will be removed from dataset to be analysed,
        even if `remove_outliers` is True. That is because for model evaluation
        often only outliers are supposed to be removed in the observations but
        not in the model.
    var_ref_keep_outliers : bool
        if True, then no outliers will be removed from the reference dataset,
        even if `remove_outliers` is True.
    use_climatology_ref : bool
        if True, climatological timeseries are used from observations
    resample_how : str or dict
        string specifying how data should be aggregated when resampling in time.
        Default is "mean". Can also be a nested dictionary, e.g.
        resample_how={'daily': {'hourly' : 'max'}} would use the maximum value
        to aggregate from hourly to daily, rather than the mean.
    **kwargs
        additional keyword args (passed to
        :func:`UngriddedData.to_station_data_all`)

    Returns
    -------
    ColocatedData
        instance of colocated data

    Raises
    ------
    VarNotAvailableError
        if grid data variable is not available in ungridded data object, or
        if no observations are available in the considered time interval
    AttributeError
        if instance of input :class:`UngriddedData` object contains more than
        one dataset
    TimeMatchError
        if gridded data time range does not overlap with input time range
    ValueError
        if the vertical scheme is not supported or if the stations of the
        ungridded data object come with different units
    ColocationError
        if none of the data points in input :class:`UngriddedData` matches
        the input colocation constraints
    """
    if var_outlier_ranges is None:
        var_outlier_ranges = {}
    if var_ref_outlier_ranges is None:
        var_ref_outlier_ranges = {}

    if filter_name is None:
        filter_name = const.DEFAULT_REG_FILTER

    try:
        gridded_data.check_dimcoords_tseries()
    except DimensionOrderError:
        gridded_data.reorder_dimensions_tseries()

    var = gridded_data.var_name
    aerocom_var = gridded_data.var_name_aerocom

    _check_var_registered(var, aerocom_var, gridded_data)

    if var_ref is None:
        if aerocom_var is not None:
            var_ref = aerocom_var
        else:
            var_ref = var

    if remove_outliers:
        # None means: fall back to the defaults of the remove_outliers methods
        low, high, low_ref, high_ref = None, None, None, None
        if var in var_outlier_ranges:
            low, high = var_outlier_ranges[var]
        if var_ref in var_ref_outlier_ranges:
            low_ref, high_ref = var_ref_outlier_ranges[var_ref]

    if var_ref not in ungridded_data.contains_vars:
        raise VarNotAvailableError('Variable {} is not available in ungridded '
                                   'data (which contains {})'.format(
                                       var_ref, ungridded_data.contains_vars))
    elif len(ungridded_data.contains_datasets) > 1:
        raise AttributeError('Colocation can only be performed with '
                             'ungridded data objects that only contain a '
                             'single dataset. Use method `extract_dataset` of '
                             'UngriddedData object to extract single datasets')

    dataset_ref = ungridded_data.contains_datasets[0]

    if update_baseyear_gridded is not None:
        # update time dimension in gridded data
        gridded_data.base_year = update_baseyear_gridded

    # remember the original frequency of the gridded data for the metadata
    grid_ts_type_src = gridded_data.ts_type
    grid_ts_type = TsType(gridded_data.ts_type)
    if isinstance(ts_type, str):
        ts_type = TsType(ts_type)
    if ts_type is None or grid_ts_type < ts_type:
        # output resolution cannot be higher than that of the gridded data
        ts_type = grid_ts_type
    elif grid_ts_type > ts_type and not colocate_time:
        # no colocation in time requested -> downsample gridded data up front
        gridded_data = gridded_data.resample_time(
            str(ts_type),
            apply_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            how=resample_how)
        grid_ts_type = ts_type

    # get start / stop of gridded data as pandas.Timestamp
    grid_start = to_pandas_timestamp(gridded_data.start)
    grid_stop = to_pandas_timestamp(gridded_data.stop)

    if start is None:
        start = grid_start
    else:
        start = to_pandas_timestamp(start)
    if stop is None:
        stop = grid_stop
    else:
        stop = to_pandas_timestamp(stop)

    # clip requested interval to the interval covered by the gridded data
    if start < grid_start:
        start = grid_start
    if stop > grid_stop:
        stop = grid_stop
    # check overlap
    if stop < grid_start or start > grid_stop:
        raise TimeMatchError('Input time range {}-{} does not '
                             'overlap with data range: {}-{}'.format(
                                 start, stop, grid_start, grid_stop))
    # create instance of Filter class (may, in the future, also include all
    # filter options, e.g. start, stop, variables, only land, only oceans, and
    # may also be linked with other data object, e.g. if data is only supposed
    # to be used if other data object exceeds a certain threshold... but for
    # now, only region and altitude range)
    regfilter = Filter(name=filter_name)

    # apply filter to data
    ungridded_data = regfilter.apply(ungridded_data)
    gridded_data = regfilter.apply(gridded_data)

    # crop time
    if start > grid_start or stop < grid_stop:
        gridded_data = gridded_data.crop(time_range=(start, stop))

    if regrid_res_deg is not None:
        gridded_data = _regrid_gridded(gridded_data, regrid_scheme,
                                       regrid_res_deg)

    if remove_outliers and not var_ref_keep_outliers:  #called twice if used via Colocator, this should go out here
        ungridded_data.remove_outliers(var_ref,
                                       inplace=True,
                                       low=low_ref,
                                       high=high_ref)

    if use_climatology_ref:
        col_freq = 'monthly'
        obs_start = const.CLIM_START
        obs_stop = const.CLIM_STOP
    else:
        col_freq = str(grid_ts_type)
        obs_start = start
        obs_stop = stop

    # restrict observations to the lat / lon range covered by the grid
    latitude = gridded_data.latitude.points
    longitude = gridded_data.longitude.points
    lat_range = [np.min(latitude), np.max(latitude)]
    lon_range = [np.min(longitude), np.max(longitude)]
    ungridded_data = ungridded_data.filter_by_meta(latitude=lat_range,
                                                   longitude=lon_range)

    # get timeseries from all stations in provided time resolution
    # (time resampling is done below in main loop)
    all_stats = ungridded_data.to_station_data_all(
        vars_to_convert=var_ref,
        start=obs_start,
        stop=obs_stop,
        by_station_name=True,
        ignore_index=ignore_station_names,
        **kwargs)

    obs_stat_data = all_stats['stats']
    ungridded_lons = all_stats['longitude']
    ungridded_lats = all_stats['latitude']

    if len(obs_stat_data) == 0:
        raise VarNotAvailableError('Variable {} is not available in specified '
                                   'time interval ({}-{})'.format(
                                       var_ref, start, stop))
    # make sure the gridded data is in the right dimension
    if gridded_data.ndim > 3:
        if vert_scheme is None:
            vert_scheme = 'mean'
        if vert_scheme not in gridded_data.SUPPORTED_VERT_SCHEMES:
            raise ValueError(
                'Vertical scheme {} is not supported'.format(vert_scheme))

    grid_stat_data = gridded_data.to_time_series(longitude=ungridded_lons,
                                                 latitude=ungridded_lats,
                                                 vert_scheme=vert_scheme)

    pd_freq = TsType(col_freq).to_pandas_freq()
    time_idx = make_datetime_index(start, stop, pd_freq)

    # Initialise with NaN (not np.empty): stations that fail below must not
    # expose uninitialised memory in the output array
    coldata = np.full((2, len(time_idx), len(obs_stat_data)), np.nan)

    lons = []
    lats = []
    alts = []
    station_names = []

    ungridded_unit = None
    ts_type_src_ref = None
    if not harmonise_units:
        gridded_unit = str(gridded_data.units)
    else:
        gridded_unit = None

    # loop over all stations and append to colocated data object
    for i, obs_stat in enumerate(obs_stat_data):
        # ToDo: consider removing to keep ts_type_src_ref (this was probably
        # introduced for EBAS were the original data frequency is not constant
        # but can vary from site to site)
        if ts_type_src_ref is None:
            ts_type_src_ref = obs_stat['ts_type_src']
        elif obs_stat['ts_type_src'] != ts_type_src_ref:
            # collect all distinct source frequencies, separated by ";"
            spl = ts_type_src_ref.split(';')
            if obs_stat['ts_type_src'] not in spl:
                spl.append(obs_stat['ts_type_src'])
            ts_type_src_ref = ';'.join(spl)

        if ungridded_unit is None:
            try:
                ungridded_unit = obs_stat['var_info'][var_ref]['units']
            except KeyError as e:  #variable information or unit is not defined
                logger.exception(repr(e))
        try:
            unit = obs_stat['var_info'][var_ref]['units']
        except Exception:
            unit = None
        if not unit == ungridded_unit:
            raise ValueError(
                'Cannot perform colocation. Ungridded data '
                'object contains different units ({})'.format(var_ref))
        # get observations (Note: the index of the observation time series
        # is already in the specified frequency format, and thus, does not
        # need to be updated, for details (or if errors occur), cf.
        # UngriddedData.to_station_data, where the conversion happens)

        # get model station data
        grid_stat = grid_stat_data[i]
        if harmonise_units:
            grid_unit = grid_stat.get_unit(var)
            obs_unit = obs_stat.get_unit(var_ref)
            if not grid_unit == obs_unit:
                grid_stat.convert_unit(var, obs_unit)
            if gridded_unit is None:
                gridded_unit = obs_unit

        if remove_outliers and not var_keep_outliers:
            # don't check if harmonise_units is active, because the
            # remove_outliers method checks units based on AeroCom default
            # variables, and a variable mapping might be active, i.e.
            # sometimes models use abs550aer for absorption coefficients
            # with units [m-1] and not for AAOD (which is the AeroCom default
            # and unitless. Hence, unit check in remove_outliers works only
            # if the variable name (and unit) corresponds to AeroCom default)
            grid_stat.remove_outliers(var, low=low, high=high, check_unit=True)

        _df = _colocate_site_data_helper(
            stat_data=grid_stat,
            stat_data_ref=obs_stat,
            var=var,
            var_ref=var_ref,
            ts_type=col_freq,
            resample_how=resample_how,
            apply_time_resampling_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            use_climatology_ref=use_climatology_ref)

        # this try/except block was introduced on 23/2/2021 as temporary fix from
        # v0.10.0 -> v0.10.1 as a result of multi-weekly obsdata (EBAS) that
        # can end up resulting in incorrect number of timestamps after resampling
        # (the error was discovered using EBASMC, concpm10, 2019 and colocation
        # frequency monthly)
        try:
            # assign the unified timeseries data to the colocated data array
            coldata[0, :, i] = _df['ref'].values
            coldata[1, :, i] = _df['data'].values
        except ValueError as e:
            # make sure nothing partially written remains for this station;
            # it stays in the output (coordinates below) with NaN values only
            coldata[:, :, i] = np.nan
            const.print_log.warning(
                f'Failed to colocate time for station {obs_stat.station_name}. '
                f'This station will be skipped (error: {e})')

        lons.append(obs_stat.longitude)
        lats.append(obs_stat.latitude)
        alts.append(obs_stat.altitude)
        station_names.append(obs_stat.station_name)

    try:
        revision = ungridded_data.data_revision[dataset_ref]
    except Exception:
        try:
            revision = ungridded_data._get_data_revision_helper(dataset_ref)
        except MetaDataError:
            revision = 'MULTIPLE'
        except Exception:
            revision = 'n/a'

    files = [os.path.basename(x) for x in gridded_data.from_files]

    meta = {
        'data_source': [dataset_ref, gridded_data.name],
        'var_name': [var_ref, var],
        'ts_type': col_freq,  # will be updated below if resampling
        'filter_name': filter_name,
        'ts_type_src': [ts_type_src_ref, grid_ts_type_src],
        'start_str': to_datestring_YYYYMMDD(start),
        'stop_str': to_datestring_YYYYMMDD(stop),
        'var_units': [ungridded_unit, gridded_unit],
        'vert_scheme': vert_scheme,
        'data_level': 3,
        'revision_ref': revision,
        'from_files': files,
        'from_files_ref': None,
        'stations_ignored': ignore_station_names,
        'colocate_time': colocate_time,
        'obs_is_clim': use_climatology_ref,
        'pyaerocom': pya_ver,
        'apply_constraints': apply_time_resampling_constraints,
        'min_num_obs': min_num_obs,
        'outliers_removed': remove_outliers
    }

    meta.update(regfilter.to_dict())

    # create coordinates of DataArray
    coords = {
        'data_source': meta['data_source'],
        'var_name': ('data_source', meta['var_name']),
        'var_units': ('data_source', meta['var_units']),
        'ts_type_src': ('data_source', meta['ts_type_src']),
        'time': time_idx,
        'station_name': station_names,
        'latitude': ('station_name', lats),
        'longitude': ('station_name', lons),
        'altitude': ('station_name', alts)
    }

    dims = ['data_source', 'time', 'station_name']
    data = ColocatedData(data=coldata,
                         coords=coords,
                         dims=dims,
                         name=var,
                         attrs=meta)

    # add correct units for lat / lon dimensions
    data.latitude.attrs['standard_name'] = gridded_data.latitude.standard_name
    data.latitude.attrs['units'] = str(gridded_data.latitude.units)

    data.longitude.attrs[
        'standard_name'] = gridded_data.longitude.standard_name
    data.longitude.attrs['units'] = str(gridded_data.longitude.units)

    # colocation frequency differs from requested output frequency (e.g.
    # colocate_time is active or climatology is used) -> resample result
    if col_freq != str(ts_type):
        data = data.resample_time(
            to_ts_type=ts_type,
            colocate_time=colocate_time,
            apply_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            how=resample_how,
            **kwargs)
    return data
Ejemplo n.º 18
0
def resample_time_dataarray(arr, freq, how='mean', min_num_obs=None):
    """Resample the time dimension of a :class:`xarray.DataArray`

    Note
    ----
    The dataarray must have a dimension coordinate named "time"

    Parameters
    ----------
    arr : DataArray
        data array to be resampled
    freq : str
        new temporal resolution (can be pandas freq. string, or pyaerocom
        ts_type)
    how : str
        name of the aggregation method of the xarray resampler that is used
        to aggregate the data in each period (e.g. mean or median). None is
        treated like "mean".
    min_num_obs : :obj:`int`, optional
        minimum number of observations required per period (when downsampling).
        E.g. if input is in daily resolution and freq is monthly and
        min_num_obs is 10, then all months that have less than 10 days of data
        are set to nan.

    Returns
    -------
    DataArray
        resampled data array object

    Raises
    ------
    IOError
        if data input `arr` is not an instance of :class:`DataArray`
    DataDimensionError
        if time dimension is not available in dataset
    ValueError
        if `min_num_obs` is provided and no xarray grouper is registered for
        the output frequency, or if `how` is not a valid aggregator
    """

    if not isinstance(arr, xray.DataArray):
        raise IOError('Invalid input for arr: need DataArray, got {}'.format(
            type(arr)))
    elif 'time' not in arr.dims:
        raise DataDimensionError('Cannot resample time: input DataArray has '
                                 'no time dimension')

    from pyaerocom.tstype import TsType
    from pyaerocom.time_config import XARR_TIME_GROUPERS

    if how is None:
        how = 'mean'

    to = TsType(freq)
    pd_freq = to.to_pandas()
    invalid = None
    if min_num_obs is not None:
        if pd_freq not in XARR_TIME_GROUPERS:
            raise ValueError(
                'Cannot infer xarray grouper for ts_type {}'.format(to.val))
        # Mask with the shape of the resampled data array. The count is done
        # on the same resampling bins that are used for the aggregation below
        # (a groupby on e.g. "time.month" would merge equal months of
        # different years and not match the resampled array for multi-year
        # data).
        invalid = arr.resample(time=pd_freq).count(dim='time') < min_num_obs

    _, loffset = _get_pandas_freq_and_loffset(freq)
    resampler = arr.resample(time=pd_freq, loffset=loffset)
    try:
        aggregate = getattr(resampler, how)
    except (AttributeError, TypeError):
        raise ValueError(
            'Invalid input for how: {}. Need name of an aggregation method '
            'of the xarray resampler (e.g. mean, median)'.format(how))
    arr = aggregate(dim='time')
    if invalid is not None:
        arr.data[invalid.data] = np.nan
    return arr
Ejemplo n.º 19
0
def check_time_coordOLD(cube, ts_type, year):
    """Method that checks the time coordinate of an iris Cube

    This method checks if the time dimension of a cube is accessible and
    according to the standard (i.e. fully usable). It only checks, and does not
    correct. For the latter, please see :func:`correct_time_coord`.

    The check compares the timestamps at a few test indices with the nominal
    timestamps expected for the input resolution, starting at 1 January of
    the input year. Any error that occurs during the check is logged as a
    warning and results in a return value of False.

    Parameters
    ----------
    cube : Cube
        cube containing data
    ts_type : str
        temporal resolution of data (e.g. "hourly", "daily"). This information
        is e.g. encoded in the filename of a NetCDF file and may be
        accessed using :class:`pyaerocom.io.FileConventionRead`
    year : int
        integer specifying year of observation, e.g. 2017

    Returns
    -------
    bool
        True, if time dimension is ok, False if not
    """

    ok = True
    ts_type = TsType(ts_type)
    # 7, since last accessible index in a 3hourly dataset of one day is 7
    test_idx = [0, 1, 2, 7]
    try:
        try:
            t = cube.coord("time")
        except Exception:
            raise AttributeError("Cube does not contain time dimension")
        if not isinstance(t, iris.coords.DimCoord):
            raise AttributeError("Time is not a DimCoord instance")
        try:
            cftime_to_datetime64(0, cfunit=t.units)
        except Exception:
            raise ValueError("Could not convert time unit string")

        # numpy dtype strings for time steps and timestamps in the input
        # resolution
        tres_np = ts_type.timedelta64_str
        conv = ts_type.datetime64_str

        # nominal timestamps at the test indices, counted from start of year
        base = datetime64("{}-01-01 00:00:00".format(year)).astype(conv)
        test_datenums = asarray(test_idx)
        ts_nominal = base + test_datenums.astype(tres_np)
        dts_nominal = ts_nominal[1:] - ts_nominal[:-1]
        # timestamps (and time steps) actually found in the cube
        ts_values = cftime_to_datetime64(t[test_idx].points,
                                         cfunit=t.units).astype(conv)
        dts_values = ts_values[1:] - ts_values[:-1]
        if not all(ts_values == ts_nominal):
            raise ValueError(
                "Time match error, nominal dates for test array"
                "%s (unit=%s): %s\nReceived values after "
                "conversion: %s" %
                (test_datenums, t.units.origin, ts_nominal, ts_values))
        elif not all(dts_values == dts_nominal):
            raise ValueError(
                "Time match error, time steps for test array"
                "%s (unit=%s): %s\nReceived values after "
                "conversion: %s" %
                (test_datenums, t.units.origin, dts_nominal, dts_values))
    except Exception as e:
        # this is a pure check: report the problem, but do not raise
        logger.warning("Invalid time dimension.\n"
                       "Error message: {}".format(repr(e)))
        ok = False
    return ok
Ejemplo n.º 20
0
def test_to_pandas_freq():
    """Check conversion of ts_type codes to pandas frequency strings"""
    expected = (('3hourly', '3H'), ('daily', 'D'))
    for ts_type, pd_freq in expected:
        assert TsType(ts_type).to_pandas_freq() == pd_freq
Ejemplo n.º 21
0
 def resample(self, to_ts_type, input_data=None, from_ts_type=None,
              how='mean', apply_constraints=False,
              min_num_obs=None, **kwargs):
     """Resample input data

     The setup that was eventually used (whether constraints were applied
     and with which `min_num_obs`) is stored in :attr:`last_setup`.

     Parameters
     ----------
     to_ts_type : str or pyaerocom.tstype.TsType
         output resolution
     input_data : pandas.Series or xarray.DataArray, optional
         data to be resampled. If None, the data that is already assigned
         to :attr:`input_data` is used.
     from_ts_type : str or pyaerocom.tstype.TsType, optional
         current resolution of the data. Required for resampling with
         constraints; if None, a warning is logged and the data is resampled
         without any constraints.
     how : str
         string specifying how the data is to be aggregated, default is mean
     apply_constraints : bool, optional
         if True, hierarchical resampling is applied using input
         `min_num_obs` (if provided) or else, using the constraints
         specified in :attr:`SAMPLING_CONSTRAINTS`. If None,
         :attr:`APPLY_CONSTRAINTS` is used.
     min_num_obs : dict or int, optional
         integer or nested dictionary specifying minimum number of
         observations required to resample from higher to lower frequency.
         For instance, if `input_data` is hourly and `to_ts_type` is
         monthly, you may specify something like::

             min_num_obs =
                 {'monthly'  :   {'daily'  : 7},
                  'daily'    :   {'hourly' : 6}}

         to require at least 6 hours per day and 7 days per month.
     **kwargs
         additional input arguments passed to resampling method (only used
         if resampling is done without constraints)

     Returns
     -------
     pandas.Series or xarray.DataArray
         resampled data object

     Raises
     ------
     NotImplementedError
         if the output frequency is not supported
     ValueError
         if no data is available, or if constraints are to be applied and
         `from_ts_type` is neither a str nor a TsType
     TemporalResolutionError
         if constraints are to be applied and the output resolution is
         higher than the current resolution
     """
     if not isinstance(to_ts_type, TsType):
         to_ts_type = TsType(to_ts_type)

     if to_ts_type.val not in self.FREQS_SUPPORTED:
         raise NotImplementedError('Cannot resample to input frequency '
                                   '{}. Choose from: {}'
                                   .format(to_ts_type,
                                           self.FREQS_SUPPORTED.keys()))

     if input_data is not None:
         self.input_data = input_data
     if self.input_data is None:
         raise ValueError('Please provide data (Series or DataArray)')

     if apply_constraints is None:
         apply_constraints = self.APPLY_CONSTRAINTS

     def _resample_unconstrained():
         # single-step resampling to the output frequency, no constraints
         self.last_setup = dict(apply_constraints=False,
                                min_num_obs=None)
         return self.fun(self.input_data, freq=to_ts_type.val,
                         how=how, **kwargs)

     if not apply_constraints:
         return _resample_unconstrained()
     elif from_ts_type is None:
         const.print_log.warning('Cannot apply time resampling constraints, '
                                 'since input from_ts_type is None. Applying '
                                 'resampling to {} without any constraints'
                                 .format(to_ts_type))
         return _resample_unconstrained()

     if isinstance(from_ts_type, str):
         from_ts_type = TsType(from_ts_type)

     if not isinstance(from_ts_type, TsType):
         raise ValueError('Invalid input for from_ts_type: {}. Need valid '
                          'str or TsType. Input arg from_ts_type is '
                          'required if resampling using hierarchical '
                          'constraints (arg apply_constraints) is activated'
                          .format(from_ts_type))

     if to_ts_type > from_ts_type:
         raise TemporalResolutionError('Cannot resample time-series from {} '
                                       'to {}'
                                       .format(from_ts_type, to_ts_type))
     elif to_ts_type == from_ts_type:
         const.logger.info('Input time frequency equals current frequency '
                           'of data, ignoring any resampling constraints')
         return _resample_unconstrained()

     if min_num_obs is None:
         min_num_obs = self.SAMPLING_CONSTRAINTS

     # hierarchical resampling: one step for each (frequency, minimum number
     # of observations) pair provided by _gen_idx
     data = self.input_data
     for freq, mno in self._gen_idx(from_ts_type, to_ts_type, min_num_obs):
         data = self.fun(data, freq=freq, how=how,
                         min_num_obs=mno)
     self.last_setup = dict(apply_constraints=True,
                            min_num_obs=min_num_obs)
     return data
Ejemplo n.º 22
0
def colocate_gridded_gridded(gridded_data,
                             gridded_data_ref,
                             ts_type=None,
                             start=None,
                             stop=None,
                             filter_name=None,
                             regrid_res_deg=None,
                             remove_outliers=True,
                             vert_scheme=None,
                             harmonise_units=True,
                             regrid_scheme='areaweighted',
                             var_outlier_ranges=None,
                             var_ref_outlier_ranges=None,
                             update_baseyear_gridded=None,
                             apply_time_resampling_constraints=None,
                             min_num_obs=None,
                             colocate_time=False,
                             var_keep_outliers=True,
                             var_ref_keep_outliers=False,
                             resample_how=None,
                             **kwargs):
    """Colocate 2 gridded data objects

    Note
    ----
    Some preprocessing steps act directly on the input objects: outlier
    removal is called with ``inplace=True``, ``base_year`` is assigned on
    `gridded_data` and unit conversion is invoked on `gridded_data_ref`
    without using a return value.

    Todo
    ----
    - think about vertical dimension (vert_scheme input not used at the moment)

    Parameters
    ----------
    gridded_data : GriddedData
        gridded data (e.g. model results)
    gridded_data_ref : GriddedData
        reference dataset that is used to evaluate
        :attr:`gridded_data` (e.g. gridded observation data)
    ts_type : str
        desired temporal resolution of colocated data (must be valid AeroCom
        ts_type str such as daily, monthly, yearly..). If None, or if the
        data is in lower resolution than this value, the resolution of the
        (harmonised) gridded data is used instead.
    start : :obj:`str` or :obj:`datetime64` or similar, optional
        start time for colocation, if None, the start time of the input
        :class:`GriddedData` object is used
    stop : :obj:`str` or :obj:`datetime64` or similar, optional
        stop time for colocation, if None, the stop time of the input
        :class:`GriddedData` object is used
    filter_name : str
        string specifying filter used (cf. :class:`pyaerocom.filter.Filter` for
        details). If None, then it is set to 'WORLD-wMOUNTAINS', which
        corresponds to no filtering (world with mountains).
        Use WORLD-noMOUNTAINS to exclude mountain sites.
    regrid_res_deg : int or dict, optional
        regrid resolution in degrees. If specified, the input gridded data
        objects will be regridded in lon / lat dimension to the input
        resolution (if input is integer, both lat and lon are regridded to that
        resolution, if input is dict, use keys `lat_res_deg` and `lon_res_deg`
        to specify regrid resolutions, respectively).
    remove_outliers : bool
        if True, outliers are removed from model and obs data before colocation,
        else not.
    vert_scheme : str
        string specifying scheme used to reduce the dimensionality in case
        input grid data contains vertical dimension. Example schemes are
        `mean, surface, altitude`, for details see
        :func:`GriddedData.to_time_series`. Currently anything other than
        None raises NotImplementedError.
    harmonise_units : bool
        if True, units are attempted to be harmonised (note: raises Exception
        if True and units cannot be harmonised).
    regrid_scheme : str
        iris scheme used for regridding (defaults to area weighted regridding)
    var_outlier_ranges : :obj:`dict`, optional
        dictionary specifying outlier ranges for dataset to be analysed
        (e.g. dict(od550aer = [-0.05, 10], ang4487aer=[0,4])). If None, then
        the pyaerocom default outlier ranges are used for the input variable.
        Defaults to None.
    var_ref_outlier_ranges : dict, optional
        like `var_outlier_ranges` but for reference dataset.
    update_baseyear_gridded : int, optional
        optional input that can be set in order to redefine the time dimension
        in the gridded data object to be analysed. E.g., if the data object
        is a climatology (one year of data) that has set the base year of the
        time dimension to a value other than the specified input start / stop
        time this may be used to update the time in order to make colocation
        possible.
    apply_time_resampling_constraints : bool, optional
        if True, then time resampling constraints are applied as provided via
        :attr:`min_num_obs` or if that one is unspecified, as defined in
        :attr:`pyaerocom.const.OBS_MIN_NUM_RESAMPLE`. If None, then
        :attr:`pyaerocom.const.OBS_APPLY_TIME_RESAMPLE_CONSTRAINTS` is used
        (which defaults to True !!).
    min_num_obs : int or dict, optional
        minimum number of observations for resampling of time
    colocate_time : bool
        if True and if original time resolution of data is higher than desired
        time resolution (`ts_type`), then both datasets are colocated in time
        *before* resampling to lower resolution.
    var_keep_outliers : bool
        if True, then no outliers will be removed from dataset to be analysed,
        even if `remove_outliers` is True. That is because for model evaluation
        often only outliers are supposed to be removed in the observations but
        not in the model.
    var_ref_keep_outliers : bool
        if True, then no outliers will be removed from the reference dataset,
        even if `remove_outliers` is True.
    resample_how : str or dict
        string specifying how data should be aggregated when resampling in time.
        Default is "mean". Can also be a nested dictionary, e.g.
        resample_how={'daily': {'hourly' : 'max'}} would use the maximum value
        to aggregate from hourly to daily, rather than the mean.
    **kwargs
        additional keyword args (included such that factory class can handle
        different methods with different inputs; they are only forwarded to
        the final time resampling of the colocated data, if that is needed)

    Returns
    -------
    ColocatedData
        instance of colocated data

    Raises
    ------
    NotImplementedError
        if `vert_scheme` is not None
    DataUnitError
        if `harmonise_units` is True and the unit of the reference data
        cannot be converted into the unit of `gridded_data`
    TimeMatchError
        if the resulting time range does not overlap with the time range of
        `gridded_data`
    ColocationError
        if both data objects have different shapes after regridding,
        cropping and filtering
    """

    if vert_scheme is not None:
        raise NotImplementedError(
            f'This type of colocation is not implemented '
            f'for gridded / gridded colocation... ({vert_scheme})')

    if var_outlier_ranges is None:
        var_outlier_ranges = {}
    if var_ref_outlier_ranges is None:
        var_ref_outlier_ranges = {}

    if filter_name is None:
        filter_name = const.DEFAULT_REG_FILTER

    if harmonise_units and gridded_data.var_info.has_unit:
        if not gridded_data.units == gridded_data_ref.units:
            try:
                gridded_data_ref.convert_unit(gridded_data.units)
            except Exception as exc:
                # report the reference unit first and the target unit second,
                # in the order the message text announces them
                raise DataUnitError(
                    f'Failed to merge data unit of reference '
                    f'gridded data object ({gridded_data_ref.units}) to '
                    f'data unit of gridded data object '
                    f'({gridded_data.units})') from exc

    var, var_ref = gridded_data.var_name, gridded_data_ref.var_name
    aerocom_var = gridded_data.var_name_aerocom
    _check_var_registered(var, aerocom_var, gridded_data)

    if remove_outliers:
        # None means: no explicit range was provided for that variable
        low, high, low_ref, high_ref = None, None, None, None
        if var in var_outlier_ranges:
            low, high = var_outlier_ranges[var]
        if var_ref in var_ref_outlier_ranges:
            low_ref, high_ref = var_ref_outlier_ranges[var_ref]

        if not var_keep_outliers:
            gridded_data.remove_outliers(low, high, inplace=True)
        if not var_ref_keep_outliers:
            gridded_data_ref.remove_outliers(low_ref, high_ref, inplace=True)

    if update_baseyear_gridded is not None:
        # update time dimension in gridded data
        gridded_data.base_year = update_baseyear_gridded

    if regrid_res_deg is not None:
        gridded_data_ref = _regrid_gridded(gridded_data_ref, regrid_scheme,
                                           regrid_res_deg)
    # perform regridding: the object with the smaller lon_res value is
    # regridded onto the grid of the other one
    if gridded_data.lon_res < gridded_data_ref.lon_res:  #obs has lower resolution
        gridded_data = gridded_data.regrid(gridded_data_ref,
                                           scheme=regrid_scheme)
    else:
        gridded_data_ref = gridded_data_ref.regrid(gridded_data,
                                                   scheme=regrid_scheme)
    # get start / stop of gridded data as pandas.Timestamp
    grid_start = to_pandas_timestamp(gridded_data.start)
    grid_stop = to_pandas_timestamp(gridded_data.stop)

    grid_start_ref = to_pandas_timestamp(gridded_data_ref.start)
    grid_stop_ref = to_pandas_timestamp(gridded_data_ref.stop)

    # time resolution of dataset to be analysed; the *_src names keep the
    # original resolutions for the metadata of the output object
    grid_ts_type = grid_ts_type_src = gridded_data.ts_type
    ref_ts_type = ref_ts_type_src = gridded_data_ref.ts_type
    if ref_ts_type != grid_ts_type:
        # ref data is in higher resolution
        if TsType(ref_ts_type) > TsType(grid_ts_type):

            gridded_data_ref = gridded_data_ref.resample_time(
                grid_ts_type,
                apply_constraints=apply_time_resampling_constraints,
                min_num_obs=min_num_obs,
                how=resample_how)

        else:
            gridded_data = gridded_data.resample_time(
                ref_ts_type,
                apply_constraints=apply_time_resampling_constraints,
                min_num_obs=min_num_obs,
                how=resample_how)
            grid_ts_type = ref_ts_type
    # now both are in same temporal resolution

    # input ts_type is not specified or model is in lower resolution
    # than input ts_type -> use model frequency to colocate
    if ts_type is None or TsType(grid_ts_type) < TsType(ts_type):
        ts_type = grid_ts_type

    if start is None:
        start = grid_start
    else:
        start = to_pandas_timestamp(start)
    if stop is None:
        stop = grid_stop
    else:
        stop = to_pandas_timestamp(stop)

    # restrict requested time range to what the reference data covers
    if grid_start_ref > start:
        start = grid_start_ref
    if grid_stop_ref < stop:
        stop = grid_stop_ref
    # check overlap
    if stop < grid_start or start > grid_stop:
        raise TimeMatchError('Input time range {}-{} does not '
                             'overlap with data range: {}-{}'.format(
                                 start, stop, grid_start, grid_stop))
    gridded_data = gridded_data.crop(time_range=(start, stop))
    gridded_data_ref = gridded_data_ref.crop(time_range=(start, stop))

    # perform region extraction (if applicable)
    regfilter = Filter(name=filter_name)
    gridded_data = regfilter(gridded_data)
    gridded_data_ref = regfilter(gridded_data_ref)

    if not gridded_data.shape == gridded_data_ref.shape:
        raise ColocationError('Shape mismatch between two colocated data '
                              'arrays, please debug')
    files_ref = [os.path.basename(x) for x in gridded_data_ref.from_files]
    files = [os.path.basename(x) for x in gridded_data.from_files]

    # list-valued entries are ordered [reference, data to be analysed]
    meta = {
        'data_source': [gridded_data_ref.data_id, gridded_data.data_id],
        'var_name': [var_ref, var],
        'ts_type': grid_ts_type,
        'filter_name': filter_name,
        'ts_type_src': [ref_ts_type_src, grid_ts_type_src],
        'start_str': to_datestring_YYYYMMDD(start),
        'stop_str': to_datestring_YYYYMMDD(stop),
        'var_units': [str(gridded_data_ref.units),
                      str(gridded_data.units)],
        'vert_scheme': vert_scheme,
        'data_level': 3,
        'revision_ref': gridded_data_ref.data_revision,
        'from_files': files,
        'from_files_ref': files_ref,
        'colocate_time': colocate_time,
        'obs_is_clim': False,
        'pyaerocom': pya_ver,
        'apply_constraints': apply_time_resampling_constraints,
        'min_num_obs': min_num_obs
    }

    meta.update(regfilter.to_dict())

    # masked values are converted to NaN before stacking both arrays
    data = gridded_data.grid.data
    if isinstance(data, np.ma.core.MaskedArray):
        data = data.filled(np.nan)
    data_ref = gridded_data_ref.grid.data
    if isinstance(data_ref, np.ma.core.MaskedArray):
        data_ref = data_ref.filled(np.nan)
    arr = np.asarray((data_ref, data))
    time = gridded_data.time_stamps().astype('datetime64[ns]')
    lats = gridded_data.latitude.points
    lons = gridded_data.longitude.points

    # create coordinates of DataArray
    coords = {
        'data_source': meta['data_source'],
        'var_name': ('data_source', meta['var_name']),
        'var_units': ('data_source', meta['var_units']),
        'ts_type_src': ('data_source', meta['ts_type_src']),
        'time': time,
        'latitude': lats,
        'longitude': lons
    }

    dims = ['data_source', 'time', 'latitude', 'longitude']

    data = ColocatedData(data=arr,
                         coords=coords,
                         dims=dims,
                         name=gridded_data.var_name,
                         attrs=meta)

    # add correct units for lat / lon dimensions
    data.latitude.attrs['standard_name'] = gridded_data.latitude.standard_name
    data.latitude.attrs['units'] = str(gridded_data.latitude.units)

    data.longitude.attrs[
        'standard_name'] = gridded_data.longitude.standard_name
    data.longitude.attrs['units'] = str(gridded_data.longitude.units)

    # final resampling to requested output resolution (if it differs)
    if grid_ts_type != ts_type:
        data = data.resample_time(
            to_ts_type=ts_type,
            colocate_time=colocate_time,
            apply_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            how=resample_how,
            **kwargs)
    return data
Ejemplo n.º 23
0
def test_cf_base_unit():
    """Check the CF base unit reported by TsType for selected frequencies."""
    expected_units = {
        'daily': 'days',
        'monthly': 'days',
        'hourly': 'hours',
    }
    for ts_type, base_unit in expected_units.items():
        assert TsType(ts_type).cf_base_unit == base_unit
Ejemplo n.º 24
0
def check_time_coord(cube, ts_type, year):
    """Method that checks the time coordinate of an iris Cube

    This method checks if the time dimension of a cube is accessible and
    according to the standard (i.e. fully usable). It only checks, and does not
    correct. For the latter, please see :func:`correct_time_coord`.

    The function does not return a status flag: it returns None if all
    checks pass and raises an exception for the first check that fails.

    Parameters
    ----------
    cube : Cube
        cube containing data
    ts_type : str or TsType
        pyaerocom ts_type (a string is converted to :class:`TsType`)
    year :
        year of data

    Returns
    -------
    None

    Raises
    ------
    AttributeError
        if the cube has no coordinate named "time" or if that coordinate is
        not an instance of :class:`iris.coords.DimCoord`
    ValueError
        if the unit of the time coordinate cannot be used for conversion to
        datetime64, or if the first / last timestamp of the data does not
        lie within the first / last expected period of the input year
    UnresolvableTimeDefinitionError
        if the number of timestamps in the data does not match the number
        expected for the input year and frequency (a mismatch is only
        tolerated for leap years if :func:`_check_leap_year` accepts it)
    """
    if isinstance(ts_type, str):
        ts_type = TsType(ts_type)
    try:
        t = cube.coord("time")
    except Exception as exc:
        raise AttributeError("Cube does not contain time dimension") from exc
    if not isinstance(t, iris.coords.DimCoord):
        raise AttributeError("Time is not a DimCoord instance")
    try:
        # dummy conversion, only done to verify that the unit is usable
        cftime_to_datetime64(0, cfunit=t.units)
    except Exception as exc:
        raise ValueError("Could not convert time unit string") from exc

    freq = ts_type.to_pandas_freq()

    # expected timestamps for the input year and frequency
    tidx = make_datetimeindex_from_year(freq, year)

    num_per = len(tidx)
    num = len(t.points)

    if num != num_per:
        # a deviating number of timestamps is only acceptable in leap years
        # and only if the dedicated leap year check passes
        leap_year_ok = (tidx[0].is_leap_year
                        and _check_leap_year(num, num_per, ts_type))
        if not leap_year_ok:
            raise UnresolvableTimeDefinitionError(
                'Expected {} timestamps but '
                'data has {}'.format(num_per, num))

    # ToDo: check why MS is not working for period conversion
    if freq == 'MS':
        freq = 'M'
    # convert first and last timestamps of index array into periods
    # (e.g. January and December for monthly data)
    per0 = tidx[0].to_period(freq)
    per1 = tidx[-1].to_period(freq)

    # first and last timestamp in data
    t0, t1 = cftime_to_datetime64([t.points[0], t.points[-1]], cfunit=t.units)

    if not per0.start_time <= t0 <= per0.end_time:
        raise ValueError('First timestamp of data {} does not lie in first '
                         'period: {}'.format(t0, per0))
    if not per1.start_time <= t1 <= per1.end_time:
        raise ValueError('Last timestamp of data {} does not lie in last '
                         'period: {}'.format(t1, per1))
Ejemplo n.º 25
0
def test_to_numpy_freq():
    """Check conversion of ts_type codes into numpy frequency strings."""
    cases = (
        ('3hourly', '3h'),
        ('daily', '1D'),
    )
    for ts_type, numpy_freq in cases:
        assert TsType(ts_type).to_numpy_freq() == numpy_freq
Ejemplo n.º 26
0
    def _run_gridded_ungridded(self, var_name=None):
        """Analysis method for gridded vs. ungridded data

        Instantiates a gridded reader for the model and an ungridded reader
        for the observations, determines the model / obs variable pairs and
        then, for each pair, reads the model data, reads the obs data and
        calls :func:`colocate_gridded_ungridded`. Depending on the settings
        stored in this object, results are saved to disk and existing result
        files are skipped or recomputed.

        Parameters
        ----------
        var_name : str, optional
            forwarded to :func:`_find_var_matches` (presumably restricts the
            analysis to that variable - confirm in that method).

        Returns
        -------
        dict
            colocated data objects, keys are the model variable names of all
            pairs for which colocation succeeded.

        Raises
        ------
        DataCoverageError
            if the obs reader reports no supported variable among
            :attr:`obs_vars`
        Exception
            if :attr:`raise_exceptions` is True and either reading of the
            model data or the colocation fails for a variable pair (otherwise
            the failure is logged and the loop proceeds)
        """
        print_log.info('PREPARING colocation of {} vs. {}'.format(
            self.model_id, self.obs_id))

        model_reader = self.instantiate_gridded_reader(what='model')
        obs_reader = ReadUngridded(self.obs_id, data_dir=self.obs_data_dir)

        obs_vars = obs_reader.get_vars_supported(self.obs_id, self.obs_vars)

        if len(obs_vars) == 0:
            raise DataCoverageError(
                'No observation variable matches found for '
                '{}'.format(self.obs_id))

        # mapping model variable -> obs variable (see loop below)
        var_matches = self._find_var_matches(obs_vars, model_reader, var_name)

        print_log.info(
            'The following variable combinations will be colocated\n'
            'MODEL-VAR\tOBS-VAR')
        for key, val in var_matches.items():
            print_log.info('{}\t{}'.format(key, val))

        # get list of unique observation variables
        obs_vars = np.unique(list(var_matches.values())).tolist()

        if self.remove_outliers:
            self._update_var_outlier_ranges(var_matches)

        # optional reading options that are passed to the obs reader
        if self.read_opts_ungridded is not None:
            ropts = self.read_opts_ungridded
        else:
            ropts = {}

        data_objs = {}
        if self.start is None:
            self._infer_start_stop(model_reader)

        start, stop = start_stop(self.start, self.stop)

        for model_var, obs_var in var_matches.items():

            # ToDo: consider removing outliers already here.
            #if 'obs_filters' in self:
            # colocation frequency is re-initialised for each variable pair,
            # since it may be updated below based on the model data
            ts_type = self.ts_type
            print_log.info('Running {} / {} ({}, {})'.format(
                self.model_id, self.obs_id, model_var, obs_var))

            try:
                model_data = self._read_gridded(reader=model_reader,
                                                var_name=model_var,
                                                start=start,
                                                stop=stop,
                                                is_model=True)
            except Exception as e:

                msg = (
                    'Failed to load gridded data: {} / {}. Reason {}'.format(
                        self.model_id, model_var, repr(e)))
                const.print_log.warning(msg)
                self._write_log(msg + '\n')

                if self.raise_exceptions:
                    self._close_log()
                    raise Exception(msg)
                else:
                    continue
            ts_type_src = model_data.ts_type
            rshow = self._eval_resample_how(model_var, obs_var)
            if ts_type is None:
                # if colocation frequency is not specified
                ts_type = ts_type_src

            # station names to be ignored: either one setting for all
            # variables or a dict with entries per obs variable
            ignore_stats = None
            if self.ignore_station_names is not None:
                ignore_stats = self.ignore_station_names
                if isinstance(ignore_stats, dict):
                    if obs_var in ignore_stats:
                        ignore_stats = ignore_stats[obs_var]
                    else:
                        ignore_stats = None

            #ts_type_src = model_data.ts_type
            # model is in lower resolution than requested -> use model
            # resolution for colocation
            if TsType(ts_type_src) < TsType(
                    ts_type):  # < all_ts_types.index(ts_type_src):
                print_log.info('Updating ts_type from {} to {} (highest '
                               'available in model {})'.format(
                                   ts_type, ts_type_src, self.model_id))
                ts_type = ts_type_src

            # NOTE(review): every path that reaches the check of this flag
            # below has set it to True (the only other path hits `continue`),
            # so the flag currently never prevents reading of the obs data.
            really_do_reanalysis = True
            if self.save_coldata:
                really_do_reanalysis = False
                savename = self._coldata_savename(model_data,
                                                  start,
                                                  stop,
                                                  ts_type,
                                                  var_name=model_var)

                file_exists = self._check_coldata_exists(
                    model_data.data_id, savename)

                out_dir = chk_make_subdir(self.basedir_coldata, self.model_id)
                if file_exists:
                    if not self.reanalyse_existing:
                        # logging and status update only happen if logging
                        # is active, the variable pair is skipped either way
                        if self._log:
                            self._write_log('SKIP: {}\n'.format(savename))
                            print_log.info('Skip {} (file already '
                                           'exists)'.format(savename))
                            self.file_status[savename] = 'skipped'
                        continue
                    else:
                        really_do_reanalysis = True
                        print_log.info(
                            'Deleting and recomputing existing '
                            'colocated data file {}'.format(savename))
                        print_log.info('REMOVE: {}\n'.format(savename))
                        os.remove(os.path.join(out_dir, savename))
                else:
                    really_do_reanalysis = True

            if really_do_reanalysis:
                #Reading obs data only if the co-located data file does
                #not already exist.
                #This part of the method has been changed by @hansbrenna to work better with
                #large observational data sets. Only one variable is loaded into
                # the UngriddedData object at a time. Currently the variable is
                #re-read a lot of times, which is a weakness.
                obs_data = obs_reader.read(vars_to_retrieve=obs_var,
                                           only_cached=self._obs_cache_only,
                                           **ropts)

                # ToDo: consider removing outliers already here.
                if 'obs_filters' in self:
                    remaining_filters = self._eval_obs_filters()
                    obs_data = obs_data.apply_filters(**remaining_filters)

            try:
                try:
                    by = self.update_baseyear_gridded
                    # NOTE(review): this rebinds the `stop` variable that is
                    # defined before the loop, i.e. it also affects reading
                    # and save names in all following iterations - confirm
                    # that this is intended.
                    stop = None
                except AttributeError:
                    by = None
                if self.model_use_climatology:
                    # climatology: base year is set to the start year
                    by = start.year
                coldata = colocate_gridded_ungridded(
                    gridded_data=model_data,
                    ungridded_data=obs_data,
                    ts_type=ts_type,
                    start=start,
                    stop=stop,
                    var_ref=obs_var,
                    filter_name=self.filter_name,
                    regrid_res_deg=self.regrid_res_deg,
                    remove_outliers=self.remove_outliers,
                    vert_scheme=self.vert_scheme,
                    harmonise_units=self.harmonise_units,
                    var_outlier_ranges=self.var_outlier_ranges,
                    var_ref_outlier_ranges=self.var_ref_outlier_ranges,
                    update_baseyear_gridded=by,
                    ignore_station_names=ignore_stats,
                    apply_time_resampling_constraints=self.
                    apply_time_resampling_constraints,
                    min_num_obs=self.min_num_obs,
                    colocate_time=self.colocate_time,
                    var_keep_outliers=self.model_keep_outliers,
                    var_ref_keep_outliers=self.obs_keep_outliers,
                    use_climatology_ref=self.obs_use_climatology,
                    resample_how=rshow)

                if self.model_to_stp:
                    coldata = correct_model_stp_coldata(coldata)
                if self.save_coldata:
                    self._save_coldata(coldata, savename, out_dir, model_var,
                                       model_data, obs_var)
                data_objs[model_var] = coldata
            except Exception:
                # any failure during colocation / saving is logged including
                # the traceback and only re-raised if requested
                msg = ('Colocation between model {} / {} and obs {} / {} '
                       'failed.\nTraceback:\n{}'.format(
                           self.model_id, model_var, self.obs_id, obs_var,
                           traceback.format_exc()))
                const.print_log.warning(msg)
                self._write_log(msg + '\n')
                if self.raise_exceptions:
                    self._close_log()
                    raise Exception(msg)

        return data_objs