Example #1
def _apply_filters(data, meta, filters):
    keep = np.array([True] * len(data))

    # filter by columns and list of values
    for col, values in filters.items():
        if col in meta.columns:
            matches = pattern_match(meta[col], values)
            cat_idx = meta[matches].index
            keep_col = data[META_IDX].set_index(META_IDX).index.isin(cat_idx)

        elif col in ['model', 'scenario', 'region', 'unit']:
            keep_col = pattern_match(data[col], values)

        elif col == 'variable':
            level = filters.get('level')
            keep_col = pattern_match(data[col], values, level)

        elif col == 'year':
            keep_col = years_match(data[col], values)

        elif col == 'level':
            if 'variable' not in filters.keys():
                keep_col = pattern_match(data['variable'], '*', level=values)
            else:
                continue
        else:
            raise ValueError('filter by column ' + col + ' not supported')
        keep &= keep_col

    return keep
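
A minimal usage sketch for the function above; the column layout mirrors the long-format `data` and `meta` tables it expects, but the example values are made up, and `pattern_match`, `years_match` and `META_IDX` are assumed to be in scope:

import numpy as np
import pandas as pd

# hypothetical long-format data table (columns as expected by `_apply_filters`)
data = pd.DataFrame({
    'model': ['model_a', 'model_a', 'model_b'],
    'scenario': ['scen_a', 'scen_a', 'scen_b'],
    'region': ['World', 'World', 'World'],
    'variable': ['Primary Energy', 'Primary Energy|Coal', 'Primary Energy'],
    'unit': ['EJ/yr', 'EJ/yr', 'EJ/yr'],
    'year': [2020, 2020, 2030],
    'value': [500.0, 150.0, 480.0],
})
# meta table indexed by ['model', 'scenario'] (no extra columns in this sketch)
meta = pd.DataFrame(index=pd.MultiIndex.from_tuples(
    [('model_a', 'scen_a'), ('model_b', 'scen_b')],
    names=['model', 'scenario']))

# boolean mask: top-level `Primary Energy` rows in the year 2020
keep = _apply_filters(data, meta, {'variable': 'Primary Energy', 'year': 2020})
print(data[keep])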
Example #2
def filter_by_meta(data, df, join_meta=False, **kwargs):
    """Filter by and join meta columns from an IamDataFrame to a pd.DataFrame

    Parameters
    ----------
    data: pd.DataFrame instance
        DataFrame to which meta columns are to be joined,
        index or columns must include `['model', 'scenario']`
    df: IamDataFrame instance
        IamDataFrame from which meta columns are filtered and joined (optional)
    join_meta: bool, default False
        join selected columns from `df.meta` on `data`
    kwargs:
        meta columns to be filtered/joined, where `col=...` applies filters
        by the given arguments (using `utils.pattern_match()`) and `col=None`
        joins the column without filtering (setting col to `np.nan`
        if `(model, scenario) not in df.meta.index`)
    """
    if not set(META_IDX).issubset(data.index.names + list(data.columns)):
        raise ValueError('missing required index dimensions or columns!')

    meta = pd.DataFrame(df.meta[list(set(kwargs) - set(META_IDX))].copy())

    # filter meta by columns
    keep = np.array([True] * len(meta))
    apply_filter = False
    for col, values in kwargs.items():
        if col in META_IDX and values is not None:
            _col = meta.index.get_level_values(0 if col == 'model' else 1)
            keep &= pattern_match(_col, values, has_nan=False)
            apply_filter = True
        elif values is not None:
            keep &= pattern_match(meta[col], values)
        apply_filter |= values is not None
    meta = meta[keep]

    # set the data index to META_IDX and apply filtered meta index
    data = data.copy()
    idx = list(data.index.names) if not data.index.names == [None] else None
    data = data.reset_index().set_index(META_IDX)
    meta = meta.loc[meta.index.intersection(data.index)]
    meta.index.names = META_IDX
    if apply_filter:
        data = data.loc[meta.index]
    data.index.names = META_IDX

    # join meta (optional), reset index to format as input arg
    data = data.join(meta) if join_meta else data
    data = data.reset_index().set_index(idx or 'index')
    if idx is None:
        data.index.name = None

    return data
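
A brief usage sketch; the meta column `category` and the value 'Below 2C' are illustrative, and `df` is assumed to be an IamDataFrame whose `meta` table carries that column:

import pandas as pd

# hypothetical external table with model/scenario columns
data = pd.DataFrame({
    'model': ['model_a', 'model_b'],
    'scenario': ['scen_a', 'scen_b'],
    'value': [1.0, 2.0],
})

# keep only rows whose (model, scenario) is categorized as 'Below 2C' in
# `df.meta`, and join that meta column onto the result
filtered = filter_by_meta(data, df, join_meta=True, category='Below 2C')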
Example #3
def test_pattern_match_dollar():
    data = pd.Series(['foo$bar', 'foo'])
    values = ['foo$bar']

    obs = utils.pattern_match(data, values)
    exp = [True, False]

    assert (obs == exp).all()
Example #4
def test_pattern_match_brackets():
    data = pd.Series(['foo (bar)', 'foo bar'])
    values = ['foo (bar)']

    obs = utils.pattern_match(data, values)
    exp = [True, False]

    assert (obs == exp).all()
Example #5
def test_pattern_match_dot():
    data = pd.Series(['foo', 'fo.'])
    values = ['fo.']

    obs = utils.pattern_match(data, values)
    exp = [False, True]

    assert (obs == exp).all()
Example #6
def test_pattern_match_none():
    data = pd.Series(['foo', 'bar'])
    values = ['baz']

    obs = utils.pattern_match(data, values)
    exp = [False, False]

    assert (obs == exp).all()
Example #7
def test_pattern_match_plus():
    data = pd.Series(['foo', 'foo+', '+bar', 'b+az'])
    values = ['*+*']

    obs = utils.pattern_match(data, values)
    exp = [False, True, True, True]

    assert (obs == exp).all()
Example #8
def test_pattern_match_ast_regex():
    data = pd.Series(['foo', 'foo2', 'bar'])
    values = ['foo*']

    obs = utils.pattern_match(data, values)
    exp = [True, True, False]

    assert (obs == exp).all()
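
Taken together, the tests above pin down the pseudo-regexp behaviour: characters such as `$`, `.`, `+` and brackets are matched literally, while `*` acts as a wildcard. A stand-alone sketch of that translation, for illustration only (this is not the actual `utils.pattern_match` implementation):

import re
import pandas as pd

def simple_pattern_match(data, pattern):
    """Escape regexp special characters, then treat `*` as a wildcard."""
    regex = re.escape(pattern).replace(r'\*', '.*')
    return data.str.match(f'^{regex}$', na=False)

data = pd.Series(['foo', 'foo2', 'foo$bar'])
print(simple_pattern_match(data, 'foo*').tolist())     # [True, True, True]
print(simple_pattern_match(data, 'foo$bar').tolist())  # [False, False, True]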
Example #9
def _apply_filters(data, meta, filters):
    """Applies filters to the data and meta tables of an IamDataFrame.

    Parameters
    ----------
    data: pd.DataFrame
        data table of an IamDataFrame
    meta: pd.DataFrame
        meta table of an IamDataFrame
    filters: dict
        dictionary of filters ({col: values}); uses a pseudo-regexp syntax by
        default, but accepts `regexp: True` to use direct regexp
    """
    regexp = filters.pop('regexp', False)
    keep = np.array([True] * len(data))

    # filter by columns and list of values
    for col, values in filters.items():
        if col in meta.columns:
            matches = pattern_match(meta[col], values, regexp=regexp)
            cat_idx = meta[matches].index
            keep_col = data[META_IDX].set_index(META_IDX).index.isin(cat_idx)

        elif col in ['model', 'scenario', 'region', 'unit']:
            keep_col = pattern_match(data[col], values, regexp=regexp)

        elif col == 'variable':
            level = filters['level'] if 'level' in filters else None
            keep_col = pattern_match(data[col], values, level, regexp)

        elif col == 'year':
            keep_col = years_match(data[col], values)

        elif col == 'level':
            if 'variable' not in filters:
                keep_col = pattern_match(data['variable'], '*',
                                         level=values, regexp=regexp)
            else:
                continue
        else:
            raise ValueError('filter by column ' + col + ' not supported')
        keep &= keep_col

    return keep
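
When `regexp: True` is included in the filters dict, the patterns are passed through as plain regular expressions instead of the pseudo-regexp syntax. A short sketch, reusing the hypothetical `data` and `meta` tables from the earlier usage example:

# keep rows whose `variable` matches a plain regular expression
filters = {'variable': r'Primary Energy(\|.*)?', 'regexp': True}
keep = _apply_filters(data, meta, filters)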
Example #10
def test_pattern_regexp():
    data = pd.Series(['foo', 'foa', 'foo$'])
    values = ['fo.$']

    obs = utils.pattern_match(data, values, regexp=True)
    exp = [True, True, False]

    assert (obs == exp).all()
Example #11
def _match(data, patterns):
    # if no patterns are given, return an empty list (interpreted as "everything")
    if not patterns:
        return []
    # otherwise, collect the unique values that match any of the patterns
    matches = np.array([False] * len(data))
    for p in patterns:
        matches |= pattern_match(data, p)
    return data[matches].unique()
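
A short usage sketch for this helper; the series content is illustrative and `pattern_match` is assumed to be in scope:

import numpy as np
import pandas as pd

regions = pd.Series(['World', 'Europe', 'Asia', 'Europe'])

print(_match(regions, []))                # [] -> interpreted as "everything"
print(_match(regions, ['Eur*', 'Asia']))  # ['Europe' 'Asia']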
Example #12
    def check_aggregate(self, variable, components=None, units=None,
                        exclude_on_fail=False, multiplier=1, **kwargs):
        """Check whether the timeseries data match the aggregation
        of components or sub-categories

        Parameters
        ----------
        variable: str
            variable to be checked for matching aggregation of sub-categories
        components: list of str, default None
            list of variables, defaults to all sub-categories of `variable`
        units: str or list of str, default None
            filter variable and components for given unit(s)
        exclude_on_fail: boolean, default False
            flag scenarios failing validation as `exclude: True`
        multiplier: number, default 1
            factor when comparing variable and sum of components
        kwargs: passed to `np.isclose()`
        """
        # default components to all variables one level below `variable`
        if components is None:
            var_list = pd.Series(self.data.variable.unique())
            components = var_list[pattern_match(var_list,
                                                '{}|*'.format(variable), 0)]

        if not len(components):
            msg = 'cannot check aggregate for {} because it has no components'
            logger().info(msg.format(variable))

            return

        # filter and groupby data, use `pd.Series.align` for matching index
        df_variable, df_components = (
            _aggregate_by_variables(self.data, variable, units)
            .align(_aggregate_by_variables(self.data, components, units))
        )

        # use `np.isclose` for checking match
        diff = df_variable[~np.isclose(df_variable, multiplier * df_components,
                                       **kwargs)]

        if len(diff):
            msg = '{} - {} of {} data points are not aggregates of components'
            logger().info(msg.format(variable, len(diff), len(df_variable)))

            if exclude_on_fail:
                self._exclude_on_fail(diff.index.droplevel([2, 3]))

            diff = pd.concat([diff], keys=[variable], names=['variable'])

            return diff.unstack().rename_axis(None, axis=1)
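
A usage sketch, assuming `df` is an IamDataFrame that reports `Primary Energy` alongside its sub-categories:

# returns None if `Primary Energy` equals the sum of its `Primary Energy|*`
# components; otherwise returns a DataFrame of the mismatching data points
diff = df.check_aggregate('Primary Energy')

# tolerances are passed through to `np.isclose()`
diff = df.check_aggregate('Primary Energy', rtol=1e-3)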
Example #13
def filter_by_meta(data, df, join_meta=False, **kwargs):
    """Filter by and join meta columns from an IamDataFrame to a pd.DataFrame

    Parameters
    ----------
    data: pd.DataFrame instance
        DataFrame to which meta columns are to be joined,
        index or columns must include `['model', 'scenario']`
    df: IamDataFrame instance
        IamDataFrame from which meta columns are filtered and joined (optional)
    join_meta: bool, default False
        join selected columns from `df.meta` on `data`
    kwargs:
        meta columns to be joined, where `col=...` applies filters
        by the given arguments (using `utils.pattern_match()`) and `col=None`
        joins the column without filtering
    """
    if not set(META_IDX).issubset(data.index.names + list(data.columns)):
        raise ValueError('missing required index dimensions or columns!')

    meta = df.meta[list(kwargs)].copy()

    # filter meta by columns
    keep = np.array([True] * len(meta))
    for col, values in kwargs.items():
        if values is not None:
            keep_col = pattern_match(meta[col], values)
            keep &= keep_col
    meta = meta[keep]

    # set the data index to META_IDX and apply filtered meta index
    data = data.copy()
    idx = list(data.index.names) if not data.index.names == [None] else None
    data = data.reset_index().set_index(META_IDX).loc[meta.index]

    # join meta (optional), reset index to format as input arg
    data = data.join(meta) if join_meta else data
    data = data.reset_index().set_index(idx or 'index')
    if idx is None:
        data.index.name = None

    return data
Example #14
def test_pattern_match_ast2_regex():
    data = pd.Series(['foo|bar', 'foo', 'bar'])
    values = ['*o*b*']

    obs = utils.pattern_match(data, values)
    assert (obs == [True, False, False]).all()
Example #15
def test_pattern_match_nan():
    data = pd.Series(['foo', np.nan])
    values = ['baz']

    obs = utils.pattern_match(data, values, has_nan=True)
    assert (obs == [False, False]).all()
Example #16
def test_pattern_match_one():
    data = pd.Series(['foo', 'bar'])
    values = ['foo']

    obs = utils.pattern_match(data, values)
    assert (obs == [True, False]).all()
Example #17
def test_pattern_match_dollar():
    data = pd.Series(["foo$bar", "foo"])
    values = ["foo$bar"]

    obs = utils.pattern_match(data, values)
    assert (obs == [True, False]).all()
Example #18
def test_pattern_regexp():
    data = pd.Series(["foo", "foa", "foo$"])
    values = ["fo.$"]

    obs = utils.pattern_match(data, values, regexp=True)
    assert (obs == [True, True, False]).all()
Example #19
def test_pattern_match_dot():
    data = pd.Series(["foo", "fo."])
    values = ["fo."]

    obs = utils.pattern_match(data, values)
    assert (obs == [False, True]).all()
Example #20
def test_pattern_match_brackets():
    data = pd.Series(["foo (bar)", "foo bar"])
    values = ["foo (bar)"]

    obs = utils.pattern_match(data, values)
    assert (obs == [True, False]).all()
Example #21
def test_pattern_match_plus():
    data = pd.Series(["foo", "foo+", "+bar", "b+az"])
    values = ["*+*"]

    obs = utils.pattern_match(data, values)
    assert (obs == [False, True, True, True]).all()
Example #22
def test_pattern_match_ast2_regex():
    data = pd.Series(["foo|bar", "foo", "bar"])
    values = ["*o*b*"]

    obs = utils.pattern_match(data, values)
    assert (obs == [True, False, False]).all()
Example #23
def test_pattern_match_none():
    data = pd.Series(["foo", "bar"])
    values = ["baz"]

    obs = utils.pattern_match(data, values)
    assert (obs == [False, False]).all()
Example #24
def read_unfccc(
    party_code,
    gases=None,
    tier=None,
    mapping=None,
    model="UNFCCC",
    scenario="Data Inventory",
):
    """Read data from the UNFCCC Data Inventory

    This function is a wrapper around
    :meth:`unfccc_di_api.UNFCCCApiReader.query`.

    The data returned from the UNFCCC Data Inventory is transformed
    into a structure similar to the format used in IPCC reports and
    IAM model comparison projects. For compatibility with the
    `iam-units <https://github.com/IAMconsortium/units>`_ package
    and the :meth:`convert_unit <IamDataFrame.convert_unit>` method,
    emissions species are formatted to standard text ('CO2')
    instead of subscripts ('CO₂') and the unit 'CO₂ equivalent'
    used by UNFCCC is replaced by 'CO2e'.

    Parameters
    ----------
    party_code : str
        ISO3-style code for UNFCCC party (country)
    gases : str or list of str, optional
        Emission species to be queried from the data inventory; can be given
        in subscript format ('CO₂') or as simple text ('CO2')
    tier : int or list of int
        Pre-specified groupings of UNFCCC data into a variable naming format
        used in IPCC reports and IAM model comparison projects
    mapping : dict, optional
        Mapping to cast UNFCCC-data columns into IAMC-style variables, e.g.

        .. code-block:: python

            {
                'Emissions|{gas}|Energy': ('1.  Energy', '*', '*', '*'),
            }

        where the tuple corresponds to filters for the columns
        `['category', 'classification', 'measure', 'gas']`
        and `{<col>}` tags in the key are replaced by the column value.
    model : str, optional
        Name to be used as model identifier
    scenario : str, optional
        Name to be used as scenario identifier

    Returns
    -------
    :class:`IamDataFrame`
    """
    if not HAS_UNFCCC:  # pragma: no cover
        raise ImportError("Required package `unfccc-di-api` not found!")

    # check that only one of `tier` or `mapping` is provided
    if (tier is None and mapping is None) or (tier is not None
                                              and mapping is not None):
        raise ValueError("Please specify either `tier` or `mapping`!")

    global _READER
    if _READER is None:
        _READER = unfccc_di_api.UNFCCCApiReader()

    # retrieve data, drop non-numeric data and base year
    data = _READER.query(party_code=party_code, gases=to_list(gases))
    data = data[~np.isnan(data.numberValue)]
    data = data[data.year != "Base year"]

    # create the mapping from the data if `tier` is given
    if tier is not None:
        _category = data.category.unique()
        mapping = {}

        for t in to_list(tier):
            # treatment of tier 1
            if t == 1:
                pattern = re.compile(".\\.  ")  # pattern of top-level category
                for i in [i for i in _category if pattern.match(i)]:
                    key = "Emissions|{gas}|" + i[4:]
                    mapping[key] = (
                        i,
                        "Total for category",
                        "Net emissions/removals",
                        "*",
                    )
            else:
                raise ValueError(f"Unknown value for `tier`: {t}")

    # add new `variable` column, iterate over mapping to determine variables
    data["variable"] = None
    for variable, value in mapping.items():
        matches = np.array([True] * len(data))
        for i, col in enumerate(NAME_COLS):
            matches &= pattern_match(data[col], value[i])

        data.loc[matches,
                 "variable"] = data.loc[matches].apply(_compile_variable,
                                                       variable=variable,
                                                       axis=1)

    # drop unspecified rows and columns, rename value column
    cols = ["party", "variable", "unit", "year", "gas", "numberValue"]
    data = data.loc[[isstr(i) for i in data.variable], cols]
    data.rename(columns={"numberValue": "value"}, inplace=True)

    # append `gas` to unit, drop `gas` column
    data.loc[:, "unit"] = data.apply(_compile_unit, axis=1)
    data.drop(columns="gas", inplace=True)

    return IamDataFrame(data, model=model, scenario=scenario, region="party")
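
A usage sketch; the party code 'DEU' is illustrative, the mapping repeats the docstring example, and the call requires the optional `unfccc-di-api` dependency:

# query tier-1 CO2 data for Germany from the UNFCCC Data Inventory
df = read_unfccc(party_code='DEU', tier=1, gases=['CO2'])

# alternatively, provide an explicit mapping instead of a tier
mapping = {
    'Emissions|{gas}|Energy': ('1.  Energy', '*', '*', '*'),
}
df = read_unfccc(party_code='DEU', mapping=mapping)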
Example #25
    def check_aggregate_regions(self, variable, region='World',
                                components=None, units=None,
                                exclude_on_fail=False, **kwargs):
        """Check whether the region timeseries data match the aggregation
        of components

        Parameters
        ----------
        variable: str
            variable to be checked for matching aggregation of components data
        region: str
            region to be checked for matching aggregation of components data
        components: list of str, default None
            list of regions, defaults to all regions other than `region`
        units: str or list of str, default None
            filter variable and components for given unit(s)
        exclude_on_fail: boolean, default False
            flag scenarios failing validation as `exclude: True`
        kwargs: passed to `np.isclose()`
        """
        var_df = self.filter(variable=variable, level=0)

        if components is None:
            components = list(set(var_df.data.region) - set([region]))

        if not len(components):
            msg = (
                'cannot check regional aggregate for `{}` because it has no '
                'regional components'
            )
            logger().info(msg.format(variable))

            return None

        # filter and groupby data, use `pd.Series.align` for matching index
        df_region, df_components = (
            _aggregate_by_regions(var_df.data, region, units)
            .align(_aggregate_by_regions(var_df.data, components, units))
        )

        df_components.index = df_components.index.droplevel(
            "variable"
        )

        # Add in variables that are included in region totals but which
        # aren't included in the regional components.
        # For example, if we are looking at World and Emissions|BC, we need
        # to add aviation and shipping to the sum of Emissions|BC for each
        # of World's regional components to do a valid check.
        different_region = components[0]
        var_list = pd.Series(self.data.variable.unique())
        var_components = var_list[pattern_match(var_list,
                                                '{}|*'.format(variable), 0)]
        for var_to_add in var_components:
            var_rows = self.data.variable == var_to_add
            region_rows = self.data.region == different_region
            var_has_regional_info = (var_rows & region_rows).any()
            if not var_has_regional_info:
                df_var_to_add = self.filter(
                    region=region, variable=var_to_add
                ).data.groupby(REGION_IDX).sum()['value']
                df_var_to_add.index = df_var_to_add.index.droplevel("variable")

                if len(df_var_to_add):
                    df_components = df_components.add(df_var_to_add,
                                                      fill_value=0)

        df_components = pd.concat([df_components], keys=[variable],
                                  names=['variable'])

        # use `np.isclose` for checking match
        diff = df_region[~np.isclose(df_region, df_components, **kwargs)]

        if len(diff):
            msg = (
                '{} - {} of {} data points are not aggregates of regional '
                'components'
            )
            logger().info(msg.format(variable, len(diff), len(df_region)))

            if exclude_on_fail:
                self._exclude_on_fail(diff.index.droplevel([2, 3]))

            diff = pd.concat([diff], keys=[region], names=['region'])

            return diff.unstack().rename_axis(None, axis=1)
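
A usage sketch, assuming `df` is an IamDataFrame with both `World` and regional data for the checked variable:

# returns None if the `World` values of `Emissions|CO2` match the sum of the
# regional components; otherwise returns a DataFrame of mismatching data points
diff = df.check_aggregate_regions('Emissions|CO2', region='World')

# tolerances are passed through to `np.isclose()`
diff = df.check_aggregate_regions('Emissions|CO2', rtol=1e-3)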