def _aggregate_recursive(df, variable, recursive): """Recursive aggregation along the variable tree""" # downselect to components of `variable`, initialize list for aggregated (new) data # keep variable at highest level if it exists _df = df.filter(variable=[variable, f"{variable}|*"]) data_list = [] # iterate over variables (bottom-up) and aggregate all components up to `variable` for d in reversed(range(find_depth(variable), max(find_depth(_df.variable)))): components = compress(_df.variable, find_depth(_df.variable, level=d + 1)) var_list = set([reduce_hierarchy(v, -1) for v in components]) # a temporary dataframe allows to distinguish between full data and new data _data_agg = _aggregate(_df, variable=var_list) # check if data for intermediate variables already exists with adjust_log_level("pyam.core"): _data_self = _df.filter(variable=var_list)._data _overlap = _data_agg.index.intersection(_data_self.index) _new = _data_agg.index.difference(_data_self.index) # assert that aggregated values are consistent with existing data (optional) if recursive != "skip-validate" and not _overlap.empty: conflict = _compare(_data_self, _data_agg[_overlap], "self", "aggregate") if not conflict.empty: msg = "Aggregated values are inconsistent with existing data:" raise ValueError(f"{msg}\n{conflict}") # append aggregated values that are not already in data _df.append(_data_agg[_new], inplace=True) data_list.append(_data_agg[_new]) return pd.concat(data_list)
def _aggregate_region(df, variable, region, subregions=None, components=False,
                      method='sum', weight=None):
    """Internal implementation for aggregating data over subregions

    Aggregates `variable` across `subregions` into `region`, optionally
    weighted by another variable, and optionally adding variables that exist
    only at the `region` level ("components") on top of the aggregate.
    Returns the aggregated series, or None if no subregion has data.
    """
    # `components` cannot be combined with a list of variables or with weights
    if components is not False:
        if not isstr(variable):
            msg = 'aggregating by list of variables with components ' \
                  'is not supported'
            raise ValueError(msg)
        if weight is not None:
            msg = 'using weights and components in one operation not supported'
            raise ValueError(msg)

    # default subregions to all regions other than `region`
    subregions = subregions or df._all_other_regions(region, variable)

    if not len(subregions):
        msg = 'cannot aggregate variable `{}` to `{}` because it does not'\
            ' exist in any subregion'
        logger.info(msg.format(variable, region))
        return

    # aggregate over all subregions, weighted if a `weight` variable is given
    subregion_df = df.filter(region=subregions)
    var_rows = subregion_df._apply_filters(variable=variable)
    if weight is not None:
        w_rows = subregion_df._apply_filters(variable=weight)
        _data = _agg_weight(subregion_df.data[var_rows],
                            subregion_df.data[w_rows], method)
    else:
        _data = _group_and_agg(subregion_df.data[var_rows], 'region',
                               method=method)

    # if not `components=False`, add components at the `region` level
    if components is not False:
        with adjust_log_level(logger):
            region_df = df.filter(region=region)

            # if `True`, auto-detect `components` at the `region` level,
            # defaults to variables below `variable` only present in `region`
            if components is True:
                kwargs = dict(level=None)
                r_comps = region_df._variable_components(variable, **kwargs)
                sr_comps = subregion_df._variable_components(variable, **kwargs)
                components = set(r_comps).difference(sr_comps)

            if len(components):
                # rename all components to `variable` and aggregate
                comp_rows = region_df._apply_filters(variable=components)
                comp_df = region_df.data[comp_rows].copy()
                comp_df['variable'] = variable
                _data = _data.add(_group_and_agg(comp_df, 'region'),
                                  fill_value=0)

    return _data
def _aggregate_region(
    df,
    variable,
    region,
    subregions=None,
    components=False,
    method="sum",
    weight=None,
    drop_negative_weights=True,
):
    """Internal implementation for aggregating data over subregions

    Aggregates `variable` across `subregions` into `region`, optionally
    weighted by another variable, and optionally adding variables that exist
    only at the `region` level ("components") on top of the aggregate.
    Returns the aggregated series, or None if no subregion has data.
    """
    # `components` cannot be combined with a list of variables or with weights
    if components is not False:
        if not isstr(variable):
            raise ValueError(
                "Aggregating by list of variables with components is not supported!"
            )
        if weight is not None:
            raise ValueError(
                "Using weights and components in one operation not supported!"
            )

    # default subregions to all regions other than `region`
    subregions = subregions or df._all_other_regions(region, variable)
    if not len(subregions):
        logger.info(
            f"Cannot aggregate variable '{variable}' to '{region}' "
            "because it does not exist in any subregion!"
        )
        return

    # aggregate over all subregions, weighted if a `weight` variable is given
    subregion_df = df.filter(region=subregions)
    variable_rows = subregion_df._apply_filters(variable=variable)

    if weight is not None:
        weight_rows = subregion_df._apply_filters(variable=weight)
        _data = _agg_weight(
            subregion_df._data[variable_rows],
            subregion_df._data[weight_rows],
            method,
            drop_negative_weights,
        )
    else:
        # `drop_negative_weights` is only meaningful together with `weight`
        if drop_negative_weights is False:
            raise ValueError(
                "Dropping negative weights can only be used with `weights`!"
            )
        _data = _group_and_agg(
            subregion_df._data[variable_rows], "region", method=method
        )

    # if not `components=False`, add components at the `region` level
    if components:
        with adjust_log_level(logger):
            region_df = df.filter(region=region)

            # if `True`, auto-detect `components` at the `region` level,
            # defaults to variables below `variable` only present in `region`
            if components is True:
                kwargs = dict(level=None)
                region_comps = region_df._variable_components(variable, **kwargs)
                subregion_comps = subregion_df._variable_components(
                    variable, **kwargs
                )
                components = set(region_comps).difference(subregion_comps)

            if len(components):
                # relabel all components as `variable` and add to the aggregate
                comp_rows = region_df._apply_filters(variable=components)
                comp_data = region_df._data[comp_rows]
                rename = {c: variable for c in components}
                comp_data.index = replace_index_values(
                    comp_data.index, "variable", rename
                )
                _data = _data.add(
                    _group_and_agg(comp_data, "region"), fill_value=0
                )

    return _data