def _aggregate_by_variables(df, variables, units=None):
    variables = [variables] if isstr(variables) else variables
    df = df[df.variable.isin(variables)]

    if units is not None:
        units = [units] if isstr(units) else units
        df = df[df.unit.isin(units)]

    return df.groupby(YEAR_IDX).sum()['value']
def _aggregate_by_regions(df, regions, units=None):
    regions = [regions] if isstr(regions) else regions
    df = df[df.region.isin(regions)]

    if units is not None:
        units = [units] if isstr(units) else units
        df = df[df.unit.isin(units)]

    return df.groupby(REGION_IDX).sum()['value']
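# --- Hedged usage sketch for the two aggregation helpers above ---
# Assumptions (not defined in this snippet): `isstr` is pyam.utils.isstr and
# YEAR_IDX is a list of grouping columns; both stand-ins below are
# illustrative only, not the library's actual definitions.
import pandas as pd

isstr = lambda s: isinstance(s, str)  # stand-in for pyam.utils.isstr
YEAR_IDX = ['model', 'scenario', 'year']  # assumed grouping columns

demo = pd.DataFrame({
    'model': ['m', 'm'], 'scenario': ['s', 's'], 'year': [2020, 2020],
    'variable': ['Primary Energy|Coal', 'Primary Energy|Gas'],
    'unit': ['EJ/yr', 'EJ/yr'], 'value': [4.0, 6.0],
})
# sums both variables within each (model, scenario, year) group -> 10.0
print(_aggregate_by_variables(demo, ['Primary Energy|Coal',
                                     'Primary Energy|Gas'], units='EJ/yr'))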
def pivot_table(self, index, columns, values='value', aggfunc='count',
                fill_value=None, style=None):
    """Returns a pivot table

    Parameters
    ----------
    index : str or list of str
        rows for the pivot table
    columns : str or list of str
        columns for the pivot table
    values : str, default 'value'
        dataframe column to aggregate or count
    aggfunc : str or function, default 'count'
        function used for aggregation, accepts 'count', 'mean', and 'sum'
    fill_value : scalar, default None
        value to replace missing values with
    style : str, default None
        output style for pivot table formatting,
        accepts 'highlight_not_max' and 'heatmap'
    """
    index = [index] if isstr(index) else index
    columns = [columns] if isstr(columns) else columns

    df = self.data

    # allow 'aggfunc' to be passed as string for easier user interface
    if isstr(aggfunc):
        if aggfunc == 'count':
            df = self.data.groupby(index + columns, as_index=False).count()
            fill_value = 0
        elif aggfunc == 'mean':
            df = self.data.groupby(index + columns, as_index=False).mean()\
                .round(2)
            aggfunc = np.sum
            fill_value = 0 if style == 'heatmap' else ""
        elif aggfunc == 'sum':
            aggfunc = np.sum
            fill_value = 0 if style == 'heatmap' else ""

    df = df.pivot_table(values=values, index=index, columns=columns,
                        aggfunc=aggfunc, fill_value=fill_value)
    return df
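# Hedged usage sketch for `pivot_table` (a method of an IamDataFrame-like
# class, so it is shown here as a call on an existing instance `df`):
#
#     df.pivot_table(index='region', columns='year',
#                    values='value', aggfunc='sum')
#
# With aggfunc='count' the cells report the number of records per
# region/year combination; 'mean' rounds the grouped values to two digits
# before pivoting.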
def _load_yaml(self, obj):
    check_rel_paths = False
    if hasattr(obj, 'read'):  # it's a file
        obj = obj.read()

    if isstr(obj) and not os.path.exists(obj):
        raise IOError('File {} does not exist'.format(obj))

    if isstr(obj) and os.path.exists(obj):
        check_rel_paths = True
        fname = obj
        with open(fname) as f:
            obj = f.read()

    if not isinstance(obj, dict):
        obj = yaml.load(obj)

    return obj
def cross_threshold(x, threshold=0, direction=['from above', 'from below']):
    """Returns a list of the years in which a timeseries (indexed over years)
    crosses a given threshold

    Parameters
    ----------
    x : pandas.Series
        a timeseries indexed over years
    threshold : float, default 0
        the threshold that the timeseries is checked against
    direction : str, optional, default `['from above', 'from below']`
        whether to return all years where the threshold is crossed
        or only where the threshold is crossed in a specific direction
    """
    prev_yr, prev_val = None, None
    years = []
    direction = [direction] if isstr(direction) else list(direction)
    if not set(direction).issubset(set(['from above', 'from below'])):
        raise ValueError('invalid direction `{}`'.format(direction))

    for yr, val in zip(x.index, x.values):
        if np.isnan(val):  # ignore nans in the timeseries
            continue
        if prev_val is None:
            prev_yr, prev_val = yr, val
            continue
        if not np.sign(prev_val - threshold) == np.sign(val - threshold):
            if ('from above' in direction and prev_val > val) \
                    or ('from below' in direction and prev_val < val):
                change = (val - prev_val) / (yr - prev_yr)
                # add one because int() rounds down
                cross_yr = prev_yr + int((threshold - prev_val) / change) + 1
                years.append(cross_yr)
        prev_yr, prev_val = yr, val
    return years
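# Worked example for `cross_threshold` (assumes the module-level imports of
# the snippet, i.e. numpy as np, pandas as pd, and pyam.utils.isstr):
#
#     ts = pd.Series([10, 4, -2, -6], index=[2020, 2030, 2040, 2050])
#     cross_threshold(ts)  # -> [2037]
#
# Between 2030 (value 4) and 2040 (value -2) the slope is -0.6/yr, so the
# zero-crossing lies at 2030 + 4/0.6 ≈ 2036.7; int() truncates and the
# function adds one, returning 2037.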
def mpl_args_to_meta_cols(df, **kwargs):
    """Return the kwargs values (not keys) matching a `df.meta` column name"""
    cols = set()
    for arg, value in kwargs.items():
        if isstr(value) and value in df.meta.columns:
            cols.add(value)
    return list(cols)
def _aggregate_region(df, variable, region, subregions=None, components=False,
                      method='sum', weight=None):
    """Internal implementation for aggregating data over subregions"""
    if not isstr(variable) and components is not False:
        msg = 'aggregating by list of variables with components ' \
              'is not supported'
        raise ValueError(msg)

    if weight is not None and components is not False:
        msg = 'using weights and components in one operation not supported'
        raise ValueError(msg)

    # default subregions to all regions other than `region`
    subregions = subregions or df._all_other_regions(region, variable)

    if not len(subregions):
        msg = 'cannot aggregate variable `{}` to `{}` because it does not'\
              ' exist in any subregion'
        logger.info(msg.format(variable, region))
        return

    # compute aggregate over all subregions
    subregion_df = df.filter(region=subregions)
    rows = subregion_df._apply_filters(variable=variable)
    if weight is None:
        col = 'region'
        _data = _group_and_agg(subregion_df.data[rows], col, method=method)
    else:
        weight_rows = subregion_df._apply_filters(variable=weight)
        _data = _agg_weight(subregion_df.data[rows],
                            subregion_df.data[weight_rows], method)

    # if not `components=False`, add components at the `region` level
    if components is not False:
        with adjust_log_level(logger):
            region_df = df.filter(region=region)

        # if `True`, auto-detect `components` at the `region` level,
        # defaults to variables below `variable` only present in `region`
        if components is True:
            level = dict(level=None)
            r_comps = region_df._variable_components(variable, **level)
            sr_comps = subregion_df._variable_components(variable, **level)
            components = set(r_comps).difference(sr_comps)

        if len(components):
            # rename all components to `variable` and aggregate
            rows = region_df._apply_filters(variable=components)
            _df = region_df.data[rows].copy()
            _df['variable'] = variable
            _data = _data.add(_group_and_agg(_df, 'region'), fill_value=0)

    return _data
def _get_method_func(method):
    """Translate a string to a known method"""
    if not isstr(method):
        return method

    if method in KNOWN_FUNCS:
        return KNOWN_FUNCS[method]

    # raise error if `method` is a string but not in dict of known methods
    raise ValueError('method `{}` is not a known aggregator'.format(method))
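# Illustration of the lookup used by `_get_method_func`. The exact contents
# of KNOWN_FUNCS in the original module are an assumption here; the mapping
# below is a plausible stand-in.
import numpy as np

isstr = lambda s: isinstance(s, str)  # stand-in for pyam.utils.isstr
KNOWN_FUNCS = {'min': np.min, 'max': np.max, 'avg': np.mean, 'sum': np.sum}

assert _get_method_func('max') is np.max         # string is translated
assert _get_method_func(np.median) is np.median  # callables pass through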
def _get_method_func(method):
    """Translate a string to a known method"""
    if not isstr(method):
        return method

    if method in KNOWN_FUNCS:
        return KNOWN_FUNCS[method]

    # raise error if `method` is a string but not in dict of known methods
    raise ValueError(f"'{method}' is not a known method!")
def test_interpolate(test_pd_df):
    _df = test_pd_df.copy()
    _df["foo"] = ["bar", "baz", 2]  # add extra_col (check for #351)
    df = IamDataFrame(_df)
    obs = df.interpolate(2007, inplace=False).filter(year=2007)._data.values
    npt.assert_allclose(obs, [3, 1.5, 4])

    # redo the interpolation and check that no duplicates are added
    df.interpolate(2007, inplace=False)
    assert not df._data.index.duplicated().any()

    # assert that extra_col does not have nan's (check for #351)
    assert all([True if isstr(i) else ~np.isnan(i) for i in df.foo])
def test_interpolate(test_pd_df):
    _df = test_pd_df.copy()
    _df['foo'] = ['bar', 'baz', 2]  # add extra_col (check for #351)
    df = IamDataFrame(_df)
    df.interpolate(2007)
    obs = df.filter(year=2007).data['value'].reset_index(drop=True)
    exp = pd.Series([3, 1.5, 4], name='value')
    pd.testing.assert_series_equal(obs, exp)

    # redo the interpolation and check that no duplicates are added
    df.interpolate(2007)
    assert not df.filter().data.duplicated().any()

    # assert that extra_col does not have nan's (check for #351)
    assert all([True if isstr(i) else ~np.isnan(i) for i in df.data.foo])
def _get_token(creds, base_url):
    """Parse credentials and get token from IIASA authentication service"""
    plaintextcreds = True

    # try reading default config or parse file
    if creds is None:
        creds = _get_config()
        plaintextcreds = False
    elif isinstance(creds, Path) or isstr(creds):
        _creds = _get_config(creds)
        if _creds is None:
            logger.error(f"Could not read credentials from `{creds}`")
        creds = _creds
        plaintextcreds = False

    # if (still) no creds, get anonymous auth and return
    if creds is None:
        url = "/".join([base_url, "anonym"])
        r = requests.get(url)
        _check_response(r, "Could not get anonymous token")
        return r.json(), None

    # parse creds, write warning
    if isinstance(creds, Mapping):
        user, pw = creds["username"], creds["password"]
    else:
        user, pw = creds
    if plaintextcreds:
        logger.warning("You provided credentials in plain text. DO NOT save "
                       "these in a repository or otherwise post them online")
        deprecation_warning(
            "Please use `pyam.iiasa.set_config(<user>, <pwd>)`"
            " to store your credentials in a file!",
            "Providing credentials in plain text",
        )

    # get user token
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json"
    }
    data = {"username": user, "password": pw}
    url = "/".join([base_url, "login"])
    r = requests.post(url, headers=headers, data=json.dumps(data))
    _check_response(r, "Login failed for user: {}".format(user))
    return r.json(), user
def _aggregate(df, variable, components=None, method=np.sum):
    """Internal implementation of the `aggregate` function"""
    if components is not None:
        # ensure that components is a proper list (not a dictionary)
        if not islistable(components) or isinstance(components, dict):
            raise ValueError(
                f"Value for `components` must be a list, found: {components}"
            )

        # list of variables require default components (no manual list)
        if islistable(variable):
            raise NotImplementedError(
                "Aggregating by list of variables does not support `components`."
            )

    mapping = {}
    msg = "Cannot aggregate variable '{}' because it has no components!"

    # if single variable
    if isstr(variable):
        # default components to all variables one level below `variable`
        components = components or df._variable_components(variable)

        if not len(components):
            logger.info(msg.format(variable))
            return

        for c in components:
            mapping[c] = variable

    # else, use all variables one level below `variable` as components
    else:
        for v in variable if islistable(variable) else [variable]:
            _components = df._variable_components(v)
            if not len(_components):
                logger.info(msg.format(v))
                continue

            for c in _components:
                mapping[c] = v

    # rename all components to `variable` and aggregate
    _df = df._data[df._apply_filters(variable=mapping.keys())]
    _df.index = replace_index_values(_df, "variable", mapping)

    return _group_and_agg(_df, [], method)
def cross_threshold(x, threshold=0, direction=['from above', 'from below'],
                    return_type=int):
    """Returns a list of the years in which a timeseries crosses a threshold

    Parameters
    ----------
    x : :class:`pandas.Series`
        A timeseries indexed over years (as integers)
    threshold : float, optional
        The threshold that the timeseries is checked against
    direction : str, optional
        Whether to return all years where the threshold is crossed
        or only where the threshold is crossed in a specific direction
    return_type : type, optional
        Whether to cast the returned values to integer (years)
    """
    direction = [direction] if isstr(direction) else list(direction)
    if not set(direction).issubset(set(['from above', 'from below'])):
        raise ValueError('invalid direction `{}`'.format(direction))

    # get the values and time-domain index
    x = x.dropna()
    values, index = x.values - threshold, x.index.to_numpy()
    positive, negative = (values >= 0), (values < 0)

    # determine all indices before crossing the threshold
    pre = [False] * (len(x) - 1)
    if 'from above' in direction:
        pre |= positive[:-1] & negative[1:]
    if 'from below' in direction:
        pre |= positive[1:] & negative[:-1]
    pre = np.argwhere(pre)

    # determine all indices after crossing the threshold
    post = pre + 1

    # compute the index value where the threshold is crossed
    change = (values[post] - values[pre]) / (index[post] - index[pre])
    years = index[pre] - values[pre] / change

    # if a year (as int) is returned, add one because int() rounds down
    if return_type == int:
        return [y + 1 for y in map(int, years)]
    return years
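# The vectorized variant above finds crossings without a Python loop.
# Example (same series as in the earlier loop-based version, assuming
# numpy/pandas imports):
#
#     ts = pd.Series([10, 4, -2, -6], index=[2020, 2030, 2040, 2050])
#     cross_threshold(ts)                     # -> [2037]
#     cross_threshold(ts, return_type=float)  # exact value, ~2036.67
#
# `positive[:-1] & negative[1:]` marks each index whose *next* value lies on
# the other side of the threshold; linear interpolation between the two
# points then gives the crossing year.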
def _get_token(creds, base_url):
    """Parse credentials and get token from IIASA authentication service"""

    # try reading default config or parse file
    if creds is None:
        creds = _get_config()
    elif isinstance(creds, Path) or isstr(creds):
        _creds = _get_config(creds)
        if _creds is None:
            logger.error(f"Could not read credentials from `{creds}`")
        creds = _creds
    else:
        msg = (
            "Passing credentials as clear-text is not allowed. "
            "Please use `pyam.iiasa.set_config(<user>, <password>)` instead!")
        raise DeprecationWarning(msg)

    # if (still) no creds, get anonymous auth and return
    if creds is None:
        url = "/".join([base_url, "anonym"])
        r = requests.get(url)
        _check_response(r, "Could not get anonymous token")
        return r.json(), None

    # parse creds, write warning
    if isinstance(creds, Mapping):
        user, pw = creds["username"], creds["password"]
    else:
        user, pw = creds

    # get user token
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json"
    }
    data = {"username": user, "password": pw}
    url = "/".join([base_url, "login"])
    r = requests.post(url, headers=headers, data=json.dumps(data))
    _check_response(r, "Login failed for user: {}".format(user))
    return r.json(), user
def _aggregate(df, variable, components=None, method=np.sum):
    """Internal implementation of the `aggregate` function"""
    # list of variables require default components (no manual list)
    if islistable(variable) and components is not None:
        raise ValueError('aggregating by list of variables cannot use '
                         'custom components')

    mapping = {}
    msg = 'cannot aggregate variable `{}` because it has no components'

    # if single variable
    if isstr(variable):
        # default components to all variables one level below `variable`
        components = components or df._variable_components(variable)

        if not len(components):
            logger.info(msg.format(variable))
            return

        for c in components:
            mapping[c] = variable

    # else, use all variables one level below `variable` as components
    else:
        for v in variable if islistable(variable) else [variable]:
            _components = df._variable_components(v)
            if not len(_components):
                logger.info(msg.format(v))
                continue

            for c in _components:
                mapping[c] = v

    # rename all components to `variable` and aggregate
    _df = df.data[df._apply_filters(variable=mapping.keys())].copy()
    _df['variable'].replace(mapping, inplace=True)
    return _group_and_agg(_df, [], method)
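# The core step in `_aggregate`: rename every component to its parent
# variable, then group and sum. A self-contained pandas illustration
# (column names mirror the IAMC long format used throughout):
import pandas as pd

data = pd.DataFrame({
    'variable': ['Primary Energy|Coal', 'Primary Energy|Gas'],
    'value': [4.0, 6.0],
})
mapping = {'Primary Energy|Coal': 'Primary Energy',
           'Primary Energy|Gas': 'Primary Energy'}
data['variable'] = data['variable'].replace(mapping)
print(data.groupby('variable')['value'].sum())  # Primary Energy    10.0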
def _aggregate_region(
    df,
    variable,
    region,
    subregions=None,
    components=False,
    method="sum",
    weight=None,
    drop_negative_weights=True,
):
    """Internal implementation for aggregating data over subregions"""
    if not isstr(variable) and components is not False:
        raise ValueError(
            "Aggregating by list of variables with components is not supported!"
        )

    if weight is not None and components is not False:
        raise ValueError("Using weights and components in one operation not supported!")

    # default subregions to all regions other than `region`
    subregions = subregions or df._all_other_regions(region, variable)

    if not len(subregions):
        logger.info(
            f"Cannot aggregate variable '{variable}' to '{region}' "
            "because it does not exist in any subregion!"
        )
        return

    # compute aggregate over all subregions
    subregion_df = df.filter(region=subregions)
    rows = subregion_df._apply_filters(variable=variable)
    if weight is None:
        if drop_negative_weights is False:
            raise ValueError(
                "Dropping negative weights can only be used with `weights`!"
            )
        _data = _group_and_agg(subregion_df._data[rows], "region", method=method)
    else:
        weight_rows = subregion_df._apply_filters(variable=weight)
        _data = _agg_weight(
            subregion_df._data[rows],
            subregion_df._data[weight_rows],
            method,
            drop_negative_weights,
        )

    # if not `components=False`, add components at the `region` level
    if components:
        with adjust_log_level(logger):
            region_df = df.filter(region=region)

        # if `True`, auto-detect `components` at the `region` level,
        # defaults to variables below `variable` only present in `region`
        if components is True:
            level = dict(level=None)
            r_comps = region_df._variable_components(variable, **level)
            sr_comps = subregion_df._variable_components(variable, **level)
            components = set(r_comps).difference(sr_comps)

        if len(components):
            # rename all components to `variable` and aggregate
            rows = region_df._apply_filters(variable=components)
            _df = region_df._data[rows]
            mapping = {c: variable for c in components}
            _df.index = replace_index_values(_df.index, "variable", mapping)
            _data = _data.add(_group_and_agg(_df, "region"), fill_value=0)

    return _data
def read_unfccc(
    party_code,
    gases=None,
    tier=None,
    mapping=None,
    model="UNFCCC",
    scenario="Data Inventory",
):
    """Read data from the UNFCCC Data Inventory

    This function is a wrapper for
    :meth:`unfccc_di_api.UNFCCCApiReader.query`.

    The data returned from the UNFCCC Data Inventory is transformed into a
    structure similar to the format used in IPCC reports and IAM model
    comparison projects. For compatibility with the `iam-units
    <https://github.com/IAMconsortium/units>`_ package and the
    :meth:`convert_unit <IamDataFrame.convert_unit>`, emissions species are
    formatted to standard text ('CO2') instead of subscripts ('CO₂') and the
    unit 'CO₂ equivalent' used by UNFCCC is replaced by 'CO2e'.

    Parameters
    ----------
    party_code : str
        ISO3-style code for UNFCCC party (country)
    gases : str or list of str, optional
        Emission species to be queried from the data inventory,
        can be stated as subscript-format ('CO₂') or simple text ('CO2')
    tier : int or list of int
        Pre-specified groupings of UNFCCC data to a variable naming format
        used in IPCC reports and IAM model comparison projects
    mapping : dict, optional
        Mapping to cast UNFCCC-data columns into IAMC-style variables, e.g.

        .. code-block:: python

            {
                'Emissions|{gas}|Energy': ('1. Energy', '*', '*', '*'),
            }

        where the tuple corresponds to filters for the columns
        `['category', 'classification', 'measure', 'gas']`
        and `{<col>}` tags in the key are replaced by the column value.
    model : str, optional
        Name to be used as model identifier
    scenario : str, optional
        Name to be used as scenario identifier

    Returns
    -------
    :class:`IamDataFrame`
    """
    if not HAS_UNFCCC:  # pragma: no cover
        raise ImportError("Required package `unfccc-di-api` not found!")

    # check that only one of `tier` or `mapping` is provided
    if (tier is None and mapping is None) or (
        tier is not None and mapping is not None
    ):
        raise ValueError("Please specify either `tier` or `mapping`!")

    global _READER
    if _READER is None:
        _READER = unfccc_di_api.UNFCCCApiReader()

    # retrieve data, drop non-numeric data and base year
    data = _READER.query(party_code=party_code, gases=to_list(gases))
    data = data[~np.isnan(data.numberValue)]
    data = data[data.year != "Base year"]

    # create the mapping from the data if `tier` is given
    if tier is not None:
        _category = data.category.unique()
        mapping = {}

        for t in to_list(tier):
            # treatment of tier 1
            if t == 1:
                pattern = re.compile(".\\. ")  # pattern of top-level category
                for i in [i for i in _category if pattern.match(i)]:
                    key = "Emissions|{gas}|" + i[4:]
                    mapping[key] = (
                        i,
                        "Total for category",
                        "Net emissions/removals",
                        "*",
                    )
            else:
                raise ValueError(f"Unknown value for `tier`: {t}")

    # add new `variable` column, iterate over mapping to determine variables
    data["variable"] = None
    for variable, value in mapping.items():
        matches = np.array([True] * len(data))
        for i, col in enumerate(NAME_COLS):
            matches &= pattern_match(data[col], value[i])

        data.loc[matches, "variable"] = data.loc[matches].apply(
            _compile_variable, variable=variable, axis=1
        )

    # drop unspecified rows and columns, rename value column
    cols = ["party", "variable", "unit", "year", "gas", "numberValue"]
    data = data.loc[[isstr(i) for i in data.variable], cols]
    data.rename(columns={"numberValue": "value"}, inplace=True)

    # append `gas` to unit, drop `gas` column
    data.loc[:, "unit"] = data.apply(_compile_unit, axis=1)
    data.drop(columns="gas", inplace=True)

    return IamDataFrame(data, model=model, scenario=scenario, region="party")
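# Hedged usage sketch for `read_unfccc` (requires the optional
# `unfccc-di-api` dependency and network access; the party code and variable
# below are illustrative):
#
#     df = read_unfccc('DEU', tier=1)
#     df.filter(variable='Emissions|CH4|Energy').timeseries()
#
# Alternatively, pass an explicit `mapping` instead of `tier` to control how
# UNFCCC categories are cast to IAMC-style variables.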
def _get_kwarg(k):
    x = kwargs.pop(k, [])
    return [x] if isstr(x) else x
def __setitem__(self, key, value):
    _key_check = [key] if isstr(key) else key
    if set(_key_check).issubset(self.meta.columns):
        return self.meta.__setitem__(key, value)
    else:
        return self.data.__setitem__(key, value)
def __init__(self, df, groupby=None, filters=None, rows=False,
             percentiles=[0.25, 0.5, 0.75]):
    self.df = df
    self.idx_depth = None

    # assign `groupby` settings and check that specifications are valid
    self.col = None
    self.groupby = None
    if isstr(groupby):
        self.col = groupby
        self.groupby = {groupby: None}
    elif isinstance(groupby, dict) and len(groupby) == 1:
        self.col = list(groupby.keys())[0]
        self.groupby = groupby
        self.idx_depth = 2
    elif groupby is not None:
        raise ValueError('arg `{}` not valid `groupby`'.format(groupby))
    if self.col is not None and self.col not in df.meta.columns:
        raise ValueError('column `{}` not in `df.meta`'.format(self.col))

    # if neither groupby nor filters is given, use filters to describe all
    # and assume that rows are used
    if groupby is None and filters is None:
        self.filters = [('', {})]
        rows = True
    else:
        self.filters = filters if filters is not None else []

    # set lists to sort index and subindex
    self._idx = [] if self.col is None else [self.col]
    self._sub_idx = self.groupby[self.col] or self.df[self.col].unique() \
        if self.col is not None else []
    self._headers, self._subheaders = ([], [])

    # assign `filters` settings and check that specifications are valid
    for (idx, _filter) in self.filters:
        # check that index in tuple is valid
        if isstr(idx):
            self._add_to_index(idx)
        else:
            if not (isinstance(idx, tuple) and len(idx) == 2
                    and isstr(idx[0]) or not isstr(idx[1])):
                raise ValueError('`{}` is not a valid index'.format(idx))
            self._add_to_index(idx[0], idx[1])
        # check that filters in tuple are valid
        if not isinstance(_filter, dict):
            raise ValueError('`{}` is not a valid filter'.format(_filter))
        elif not (set(_filter) - set(META_IDX)).issubset(df.meta):
            raise ValueError('column `{}` not in `df.meta`'.format(
                set(_filter) - set(META_IDX) - set(df.meta)))

    self.stats = None
    self.rows = [] if rows else None

    # percentiles for passing to `pandas.describe()`
    self.percentiles = list(percentiles)
    self._describe_cols = (['count', 'mean', 'std', 'min']
                           + ['{:.0%}'.format(i) for i in self.percentiles]
                           + ['max'])
def line_plot(df, x='year', y='value', ax=None, legend=None, title=True,
              color=None, marker=None, linestyle=None, cmap=None,
              fill_between=None, final_ranges=None,
              rm_legend_label=[], **kwargs):
    """Plot data as lines with or without markers.

    Parameters
    ----------
    df : pd.DataFrame
        Data to plot as a long-form data frame
    x : string, optional
        The column to use for x-axis values
        default: year
    y : string, optional
        The column to use for y-axis values
        default: value
    ax : matplotlib.Axes, optional
    legend : bool or dictionary, optional
        Add a legend. If a dictionary is provided, it will be used as keyword
        arguments in creating the legend.
        default: None (displays legend only if less than 13 entries)
    title : bool or string, optional
        Display a default or custom title.
    color : string, optional
        A valid matplotlib color or column name. If a column name, common
        values will be provided the same color.
        default: None
    marker : string, optional
        A valid matplotlib marker or column name. If a column name, common
        values will be provided the same marker.
        default: None
    linestyle : string, optional
        A valid matplotlib linestyle or column name. If a column name, common
        values will be provided the same linestyle.
        default: None
    cmap : string, optional
        A colormap to use.
        default: None
    fill_between : boolean or dict, optional
        Fill lines between minima/maxima of the 'color' argument. This can
        only be used if also providing a 'color' argument. If this is True,
        then default arguments will be provided to `ax.fill_between()`. If
        this is a dictionary, those arguments will be provided instead of
        defaults.
        default: None
    final_ranges : boolean or dict, optional
        Add vertical line between minima/maxima of the 'color' argument in
        the last period plotted. This can only be used if also providing a
        'color' argument. If this is True, then default arguments will be
        provided to `ax.axvline()`. If this is a dictionary, those arguments
        will be provided instead of defaults.
        default: None
    rm_legend_label : string, list, optional
        Remove the color, marker, or linestyle label in the legend.
        default: []
    kwargs : Additional arguments to pass to the pd.DataFrame.plot() function
    """
    if ax is None:
        fig, ax = plt.subplots()

    # assign styling properties
    props = assign_style_props(df, color=color, marker=marker,
                               linestyle=linestyle, cmap=cmap)

    if fill_between and 'color' not in props:
        raise ValueError('Must use `color` kwarg if using `fill_between`')
    if final_ranges and 'color' not in props:
        raise ValueError('Must use `color` kwarg if using `final_ranges`')

    # reshape data for use in line_plot
    df = reshape_line_plot(df, x, y)  # long form to one column per line

    # determine index of column name in reshaped dataframe
    prop_idx = {}
    for kind, var in [('color', color), ('marker', marker),
                      ('linestyle', linestyle)]:
        if var is not None and var in df.columns.names:
            prop_idx[kind] = df.columns.names.index(var)

    # plot data, keeping track of which legend labels to apply
    no_label = [rm_legend_label] if isstr(rm_legend_label) else rm_legend_label

    for col, data in df.iteritems():
        pargs = {}
        labels = []
        # build plotting args and line legend labels
        for key, kind, var in [('c', 'color', color),
                               ('marker', 'marker', marker),
                               ('linestyle', 'linestyle', linestyle)]:
            if kind in props:
                label = col[prop_idx[kind]]
                pargs[key] = props[kind][label]
                if kind not in no_label:
                    labels.append(repr(label).lstrip("u'").strip("'"))
            else:
                pargs[key] = var
        kwargs.update(pargs)
        data = data.dropna()
        data.plot(ax=ax, **kwargs)
        if labels:
            ax.lines[-1].set_label(' '.join(labels))

    if fill_between:
        _kwargs = {'alpha': 0.25} if fill_between in [True, None] \
            else fill_between
        data = df.T
        columns = data.columns
        # get outer boundary mins and maxes
        allmins = data.groupby(color).min()
        intermins = (
            data.dropna(axis=1).groupby(color).min()  # nonan data
            .reindex(columns=columns)  # refill with nans
            .T.interpolate(method='index').T  # interpolate
        )
        mins = pd.concat([allmins, intermins]).min(level=0)
        allmaxs = data.groupby(color).max()
        intermaxs = (
            data.dropna(axis=1).groupby(color).max()  # nonan data
            .reindex(columns=columns)  # refill with nans
            .T.interpolate(method='index').T  # interpolate
        )
        maxs = pd.concat([allmaxs, intermaxs]).max(level=0)
        # do the fill
        for idx in mins.index:
            ymin = mins.loc[idx]
            ymax = maxs.loc[idx]
            ax.fill_between(ymin.index, ymin, ymax,
                            facecolor=props['color'][idx], **_kwargs)

    # add bars to the end of the plot showing range
    if final_ranges:
        # have to explicitly draw it to get the tick labels (these change
        # once you add the vlines)
        plt.gcf().canvas.draw()
        _kwargs = {'linewidth': 2} if final_ranges in [True, None] \
            else final_ranges
        first = df.index[0]
        final = df.index[-1]
        mins = df.T.groupby(color).min()[final]
        maxs = df.T.groupby(color).max()[final]
        ymin, ymax = ax.get_ylim()
        ydiff = ymax - ymin
        xmin, xmax = ax.get_xlim()
        xdiff = xmax - xmin
        xticks = ax.get_xticks()
        xlabels = ax.get_xticklabels()
        # 1.5% increase seems to be ok per extra line
        extra_space = 0.015
        for i, idx in enumerate(mins.index):
            xpos = final + xdiff * extra_space * (i + 1)
            _ymin = (mins[idx] - ymin) / ydiff
            _ymax = (maxs[idx] - ymin) / ydiff
            ax.axvline(xpos, ymin=_ymin, ymax=_ymax,
                       color=props['color'][idx], **_kwargs)
        # for equal spacing between xmin and first datapoint and xmax and
        # last line
        ax.set_xlim(xmin, xpos + first - xmin)
        ax.set_xticks(xticks)
        ax.set_xticklabels(xlabels)

    # build unique legend handles and labels
    handles, labels = ax.get_legend_handles_labels()
    handles, labels = np.array(handles), np.array(labels)
    _, idx = np.unique(labels, return_index=True)
    handles, labels = handles[idx], labels[idx]
    if legend is not False:
        _add_legend(ax, handles, labels, legend)

    # add default labels if possible
    ax.set_xlabel(x.title())
    units = df.columns.get_level_values('unit').unique()
    units_for_ylabel = len(units) == 1 and x == 'year' and y == 'value'
    ylabel = units[0] if units_for_ylabel else y.title()
    ax.set_ylabel(ylabel)

    # build a default title if possible
    if title:
        default_title = []
        for var in ['model', 'scenario', 'region', 'variable']:
            if var in df.columns.names:
                values = df.columns.get_level_values(var).unique()
                if len(values) == 1:
                    default_title.append('{}: {}'.format(var, values[0]))
        title = ' '.join(default_title) if title is True else title
        ax.set_title(title)

    return ax, handles, labels
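# Hedged plotting sketch (assumes a long-form dataframe `data` with the
# columns expected by `reshape_line_plot`, e.g. the `data` attribute of an
# IamDataFrame):
#
#     ax, handles, labels = line_plot(data, color='scenario',
#                                     fill_between=True, final_ranges=True)
#     plt.show()
#
# `fill_between` shades the min/max envelope per color group and
# `final_ranges` draws vertical range bars after the last period.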
def _get_kwarg(k):
    # TODO refactor API to return all models if model-list is empty
    x = kwargs.pop(k, "*" if k == "model" else [])
    return [x] if isstr(x) else x
def _group_and_agg(df, by, method=np.sum):
    """Groupby & aggregate `df` by column(s), return indexed `pd.Series`"""
    by = [by] if isstr(by) else by
    cols = [c for c in list(df.columns) if c not in ['value'] + by]
    # pick aggregator func (default: sum)
    return df.groupby(cols)['value'].agg(_get_method_func(method))
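# Illustration of `_group_and_agg`: every column except 'value' and the
# `by` column(s) becomes the group index, so the `by` dimension collapses.
import numpy as np
import pandas as pd

isstr = lambda s: isinstance(s, str)  # stand-in for pyam.utils.isstr

demo = pd.DataFrame({
    'region': ['A', 'B'], 'variable': ['Primary Energy'] * 2,
    'year': [2020, 2020], 'value': [1.0, 2.0],
})
# collapses 'region': ('Primary Energy', 2020) -> 3.0
print(_group_and_agg(demo, 'region'))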
def _new_meta_column(self, name, value):
    """Add a metadata column, set to `uncategorized` if str else np.nan"""
    if name is None:
        raise ValueError('cannot add a meta column {}'.format(name))
    if name not in self.meta:
        self.meta[name] = 'uncategorized' if isstr(value) else np.nan
def line_plot(df, x='year', y='value', ax=None, legend=None, title=True,
              color=None, marker=None, linestyle=None, cmap=None,
              rm_legend_label=[], **kwargs):
    """Plot data as lines with or without markers.

    Parameters
    ----------
    df : pd.DataFrame
        Data to plot as a long-form data frame
    x : string, optional
        The column to use for x-axis values
        default: year
    y : string, optional
        The column to use for y-axis values
        default: value
    ax : matplotlib.Axes, optional
    legend : bool or dictionary, optional
        Add a legend. If a dictionary is provided, it will be used as keyword
        arguments in creating the legend.
        default: None (displays legend only if less than 13 entries)
    title : bool or string, optional
        Display a default or custom title.
    color : string, optional
        A valid matplotlib color or column name. If a column name, common
        values will be provided the same color.
        default: None
    marker : string, optional
        A valid matplotlib marker or column name. If a column name, common
        values will be provided the same marker.
        default: None
    linestyle : string, optional
        A valid matplotlib linestyle or column name. If a column name, common
        values will be provided the same linestyle.
        default: None
    cmap : string, optional
        A colormap to use.
        default: None
    rm_legend_label : string, list, optional
        Remove the color, marker, or linestyle label in the legend.
        default: []
    kwargs : Additional arguments to pass to the pd.DataFrame.plot() function
    """
    if ax is None:
        fig, ax = plt.subplots()

    df = reshape_line_plot(df, x, y)  # long form to one column per line

    # determine color, marker, and linestyle for each line
    defaults = default_props(reset=True, num_colors=len(df.columns),
                             colormap=cmap)
    props = {}
    prop_idx = {}
    rc = run_control()
    for kind, var in [('color', color), ('marker', marker),
                      ('linestyle', linestyle)]:
        rc_has_kind = kind in rc
        if var in df.columns.names:
            rc_has_var = rc_has_kind and var in rc[kind]
            props_for_kind = {}
            for val in df.columns.get_level_values(var).unique():
                if rc_has_var and val in rc[kind][var]:
                    props_for_kind[val] = rc[kind][var][val]
                    # cycle any way to keep defaults the same
                    next(defaults[kind])
                else:
                    props_for_kind[val] = next(defaults[kind])
            props[kind] = props_for_kind
            prop_idx[kind] = df.columns.names.index(var)

    # plot data, keeping track of which legend labels to apply
    no_label = [rm_legend_label] if isstr(rm_legend_label) else rm_legend_label

    for col, data in df.iteritems():
        pargs = {}
        labels = []
        # build plotting args and line legend labels
        for key, kind, var in [('c', 'color', color),
                               ('marker', 'marker', marker),
                               ('linestyle', 'linestyle', linestyle)]:
            if kind in props:
                label = col[prop_idx[kind]]
                pargs[key] = props[kind][label]
                if kind not in no_label:
                    labels.append(repr(label).lstrip("u'").strip("'"))
            else:
                pargs[key] = var
        kwargs.update(pargs)
        data = data.dropna()
        data.plot(ax=ax, **kwargs)
        if labels:
            ax.lines[-1].set_label(' '.join(labels))

    # build unique legend handles and labels
    handles, labels = ax.get_legend_handles_labels()
    handles, labels = np.array(handles), np.array(labels)
    _, idx = np.unique(labels, return_index=True)
    handles, labels = handles[idx], labels[idx]
    if legend is not False:
        _add_legend(ax, handles, labels, legend)

    # add default labels if possible
    ax.set_xlabel(x.title())
    units = df.columns.get_level_values('unit').unique()
    units_for_ylabel = len(units) == 1 and x == 'year' and y == 'value'
    ylabel = units[0] if units_for_ylabel else y.title()
    ax.set_ylabel(ylabel)

    # build a default title if possible
    _title = []
    for var in ['model', 'scenario', 'region', 'variable']:
        if var in df.columns.names:
            values = df.columns.get_level_values(var).unique()
            if len(values) == 1:
                _title.append('{}: {}'.format(var, values[0]))
    if title and _title:
        ax.set_title(' '.join(_title))

    return ax, handles, labels