def test_cumulative_out_of_range():
    # set logger level to exclude warnings in unit test output
    logger().setLevel('ERROR')
    # note that the series is not ordered and the index is defined as float
    y = pd.Series(data=[np.nan, 1, 3, 1],
                  index=[2002., 2005., 2007., 2013.])
    assert cumulative(y, 2008, 2015) is np.nan
    logger().setLevel('NOTSET')
def validate(self, criteria={}, exclude_on_fail=False):
    """Validate scenarios using criteria on timeseries values

    Parameters
    ----------
    criteria: dict
        dictionary with variable keys and check values
        ('up' and 'lo' for respective bounds, 'year' for years)
    exclude_on_fail: bool, default False
        flag scenarios failing validation as `exclude: True`
    """
    df = _apply_criteria(self.data, criteria, in_range=False)

    if exclude_on_fail:
        idx = _meta_idx(df)
        self.meta.loc[idx, 'exclude'] = True

    if not df.empty:
        msg = '{} of {} data points do not satisfy the criteria'
        logger().info(msg.format(len(df), len(self.data)))

        if exclude_on_fail and len(idx) > 0:
            logger().info('{} non-valid scenario{} will be excluded'
                          .format(len(idx), '' if len(idx) == 1 else 's'))
    return df
def _add_legend(ax, handles, labels, legend): if legend is None and len(labels) >= MAX_LEGEND_LABELS: logger().info( '>={} labels, not applying legend'.format(MAX_LEGEND_LABELS)) else: legend = {} if legend in [True, None] else legend ax.legend(handles, labels, **legend)
def connect(self, name):
    # TODO: deprecate in next release
    if name == 'iamc15':
        warnings.warn(
            'The name `iamc15` is deprecated and will be removed in the ' +
            'next release. Please use `IXSE_SR15`.')
        name = 'IXSE_SR15'

    valid = self.valid_connections
    if len(valid) == 0:
        raise RuntimeError(
            'No valid connections found for the provided credentials.')

    if name not in valid:
        msg = """
        {} not recognized as a valid connection name.
        Choose from one of the supported connections for
        your user: {}.
        """
        raise ValueError(msg.format(name, valid))

    url = '/'.join([_BASE_URL, 'applications', name, 'config'])
    headers = {'Authorization': 'Bearer {}'.format(self._token)}
    r = requests.get(url, headers=headers)
    _check_response(r, 'Could not get application information')
    response = r.json()
    idxs = {x['path']: i for i, x in enumerate(response)}

    self._base_url = response[idxs['baseUrl']]['value']
    # TODO: request the full citation to be added to this metadata instead
    # of linking to the about page
    about = '/'.join([response[idxs['uiUrl']]['value'], '#', 'about'])
    logger().info(_CITE_MSG.format(name, about))

    self._connected = name
def format_data(df):
    """Convert an imported dataframe and check all required columns"""
    # format columns to lower-case and check that all required columns exist
    df.rename(columns={c: str(c).lower() for c in df.columns}, inplace=True)
    if not set(IAMC_IDX).issubset(set(df.columns)):
        missing = list(set(IAMC_IDX) - set(df.columns))
        raise ValueError("missing required columns `{}`!".format(missing))

    if 'notes' in df.columns:
        logger().info('Ignoring notes column in dataframe')
        df.drop(columns='notes', inplace=True)
        # drop rows where the model column contains the database copyright notice
        df = df[~df.model.str.contains('database', case=False)]

    # check whether data in IAMC style or year/value layout
    if 'value' not in df.columns:
        numcols = sorted(set(df.columns) - set(IAMC_IDX))
        df = pd.melt(df, id_vars=IAMC_IDX, var_name='year',
                     value_vars=numcols, value_name='value')
    df['year'] = pd.to_numeric(df['year'])

    # drop NaNs
    df.dropna(inplace=True)

    # sort data
    df.sort_values(SORT_IDX, inplace=True)

    return df
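# A standalone pandas sketch of the wide-to-long conversion performed in
# `format_data()` above, assuming IAMC_IDX is ['model', 'scenario', 'region',
# 'variable', 'unit'] as used throughout this module.
import pandas as pd

idx_cols = ['model', 'scenario', 'region', 'variable', 'unit']  # == IAMC_IDX
wide = pd.DataFrame(
    [['model_a', 'scen_a', 'World', 'Primary Energy', 'EJ/y', 1.0, 6.0]],
    columns=idx_cols + [2005, 2010])
long = pd.melt(wide, id_vars=idx_cols, var_name='year', value_name='value')
long['year'] = pd.to_numeric(long['year'])
# `long` now has one row per (model, scenario, region, variable, unit, year)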
def format_data(df): """Convert an imported dataframe and check all required columns""" # all lower case df.rename(columns={c: str(c).lower() for c in df.columns}, inplace=True) if 'notes' in df.columns: # this came from the database logger().info('Ignoring notes column in dataframe') df.drop(columns='notes', inplace=True) col = df.columns[0] # first column has database copyright notice df = df[~df[col].str.contains('database', case=False)]
def check_aggregate(self, variable, components=None, units=None, exclude_on_fail=False, multiplier=1, **kwargs): """Check whether the timeseries data match the aggregation of components or sub-categories Parameters ---------- variable: str variable to be checked for matching aggregation of sub-categories components: list of str, default None list of variables, defaults to all sub-categories of `variable` units: str or list of str, default None filter variable and components for given unit(s) exclude_on_fail: boolean, default False flag scenarios failing validation as `exclude: True` multiplier: number, default 1 factor when comparing variable and sum of components kwargs: passed to `np.isclose()` """ # default components to all variables one level below `variable` if components is None: components = self.filter(variable='{}|*'.format(variable), level=0).variables() if not len(components): msg = '{} - cannot check aggregate because it has no components' logger().info(msg.format(variable)) return # filter and groupby data, use `pd.Series.align` for matching index df_variable, df_components = (_aggregate_by_variables( self.data, variable, units).align(_aggregate_by_variables(self.data, components, units))) # use `np.isclose` for checking match diff = df_variable[~np.isclose(df_variable, multiplier * df_components, **kwargs)] if len(diff): msg = '{} - {} of {} data points are not aggregates of components' logger().info(msg.format(variable, len(diff), len(df_variable))) if exclude_on_fail: self._exclude_on_fail(diff.index.droplevel([2, 3])) diff = pd.concat([diff], keys=[variable], names=['variable']) return diff.unstack().rename_axis(None, axis=1)
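# A hedged usage sketch for `check_aggregate()`: build a minimal IamDataFrame
# (assuming the standard pyam constructor for long-format data) in which the
# components sum exactly to the total, so the check passes and, per the code
# above, the method returns None. The `df` built here is reused by the usage
# sketches further below.
import pandas as pd
import pyam

data = pd.DataFrame([
    ['model_a', 'scen_a', 'World', 'Emissions|CO2', 'Mt CO2/yr', 2010, 10.0],
    ['model_a', 'scen_a', 'World', 'Emissions|CO2|Energy', 'Mt CO2/yr', 2010, 6.0],
    ['model_a', 'scen_a', 'World', 'Emissions|CO2|AFOLU', 'Mt CO2/yr', 2010, 4.0],
], columns=['model', 'scenario', 'region', 'variable', 'unit', 'year', 'value'])

df = pyam.IamDataFrame(data)
assert df.check_aggregate('Emissions|CO2') is None  # 6.0 + 4.0 == 10.0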
def cumulative(x, first_year, last_year):
    """Returns the cumulative sum of a timeseries (indexed over years),
    implements linear interpolation between years, ignores nan's in the range.
    The function includes the last-year value of the series, and
    returns nan (with a warning) if `first_year` or `last_year` is outside
    of the timeseries range

    Parameters
    ----------
    x: pandas.Series
        a timeseries to be summed over time
    first_year: int
        first year of the sum
    last_year: int
        last year of the sum (inclusive)
    """
    # if the timeseries does not cover the range `[first_year, last_year]`,
    # return nan to avoid erroneous aggregation
    if min(x.index) > first_year:
        logger().warning('the timeseries `{}` does not start by {}'.format(
            x.name or x, first_year))
        return np.nan
    if max(x.index) < last_year:
        logger().warning('the timeseries `{}` does not extend until {}'
                         .format(x.name or x, last_year))
        return np.nan

    # cast timeseries index to `int` if necessary
    if not x.index.dtype == 'int64':
        cast_years_to_int(x, index=True)

    x[first_year] = fill_series(x, first_year)
    x[last_year] = fill_series(x, last_year)

    years = [i for i in x.index
             if i >= first_year and i <= last_year and ~np.isnan(x[i])]
    years.sort()

    # loop over years
    if not np.isnan(x[first_year]) and not np.isnan(x[last_year]):
        value = 0
        for (i, yr) in enumerate(years[:-1]):
            next_yr = years[i + 1]
            # the summation is shifted to include the first year fully in sum,
            # otherwise, would return a weighted average of `yr` and `next_yr`
            value += ((next_yr - yr - 1) * x[next_yr] +
                      (next_yr - yr + 1) * x[yr]) / 2

        # the loop above does not include the last element in range
        # (`last_year`), therefore added explicitly
        value += x[last_year]

        return value
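# Worked example of the shifted trapezoidal sum above, run against the
# function just defined (assuming its helpers `fill_series` and `logger`
# are available in this module): for values 1.0 in 2005 and 3.0 in 2007,
# the loop adds ((2007 - 2005 - 1) * 3.0 + (2007 - 2005 + 1) * 1.0) / 2 = 3.0
# and the explicit last-year term adds x[2007] = 3.0, for a total of 6.0,
# i.e. the sum of the interpolated values 1, 2, 3 over 2005-2007.
import pandas as pd

y = pd.Series([1.0, 3.0], index=[2005, 2007])
assert cumulative(y, first_year=2005, last_year=2007) == 6.0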
def _add_legend(ax, handles, labels, legend): if legend is None and len(labels) >= MAX_LEGEND_LABELS: logger().info('>={} labels, not applying legend'.format( MAX_LEGEND_LABELS)) else: legend = {} if legend in [True, None] else legend loc = legend.pop('loc', 'best') outside = loc.split(' ')[1] if loc.startswith('outside ') else False _legend = OUTSIDE_LEGEND[outside] if outside else dict(loc=loc) _legend.update(legend) ax.legend(handles, labels, **_legend)
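# A hedged sketch of the 'outside' legend handling above. The exact contents
# of OUTSIDE_LEGEND are not shown in this excerpt; a plausible definition,
# following the usual matplotlib idiom for placing a legend outside the axes,
# is assumed here for illustration only.
import matplotlib.pyplot as plt

OUTSIDE_LEGEND = {  # assumed values, for illustration only
    'right': dict(loc='center left', bbox_to_anchor=(1.0, 0.5)),
    'bottom': dict(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=3),
}

fig, ax = plt.subplots()
handles = ax.plot([0, 1], [0, 1], label='scen_a')
# a call with `legend=dict(loc='outside right')` would resolve to:
ax.legend(handles, ['scen_a'], **OUTSIDE_LEGEND['right'])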
def read_file(fname, *args, **kwargs): """Read data from a file saved in the standard IAMC format or a table with year/value columns """ if not isstr(fname): raise ValueError('reading multiple files not supported, ' 'please use `pyam.IamDataFrame.append()`') logger().info('Reading `{}`'.format(fname)) format_kwargs = {} # extract kwargs that are intended for `format_data` for c in [i for i in IAMC_IDX + ['year', 'time', 'value'] if i in kwargs]: format_kwargs[c] = kwargs.pop(c) return format_data(read_pandas(fname, *args, **kwargs), **format_kwargs)
def categorize(self, name, value, criteria, color=None, marker=None, linestyle=None): """Assign scenarios to a category according to specific criteria or display the category assignment Parameters ---------- name: str category column name value: str category identifier criteria: dict dictionary with variables mapped to applicable checks ('up' and 'lo' for respective bounds, 'year' for years - optional) color: str assign a color to this category for plotting marker: str assign a marker to this category for plotting linestyle: str assign a linestyle to this category for plotting """ # add plotting run control for kind, arg in [('color', color), ('marker', marker), ('linestyle', linestyle)]: if arg: run_control().update({kind: {name: {value: arg}}}) # find all data that matches categorization rows = _apply_criteria(self.data, criteria, in_range=True, return_test='all') idx = _meta_idx(rows) if len(idx) == 0: logger().info("No scenarios satisfy the criteria") return # EXIT FUNCTION # update metadata dataframe self._new_meta_column(name, value) self.meta.loc[idx, name] = value msg = '{} scenario{} categorized as `{}: {}`' logger().info( msg.format(len(idx), '' if len(idx) == 1 else 's', name, value))
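# Usage sketch for `categorize()`, reusing `df` from the earlier sketch:
# assign scenarios whose `Temperature` (an illustrative variable name) stays
# between 1.0 and 1.5 in 2100 to the category `warming: low`, and register
# a plotting color for that category.
df.categorize('warming', 'low',
              criteria={'Temperature': {'up': 1.5, 'lo': 1.0, 'year': 2100}},
              color='cornflowerblue')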
def read_files(fnames, *args, **kwargs): """Read data from a snapshot file saved in the standard IAMC format or a table with year/value columns """ if isstr(fnames): fnames = [fnames] fnames = itertools.chain(*[glob.glob(f) for f in fnames]) dfs = [] for fname in fnames: logger().info('Reading `{}`'.format(fname)) df = read_pandas(fname, *args, **kwargs) dfs.append(format_data(df)) return pd.concat(dfs)
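# Usage sketch for `read_files()`: the glob pattern is illustrative; each
# matching file is read, passed through `format_data()`, and concatenated.
all_data = read_files('data/*.csv')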
def load_metadata(self, path, *args, **kwargs):
    """Load metadata exported from a `pyam.IamDataFrame` instance

    Parameters
    ----------
    path: string
        xlsx or csv file with metadata exported from a `pyam.IamDataFrame`
        instance
    """
    if not os.path.exists(path):
        raise ValueError("no metadata file '" + path + "' found!")

    if path.endswith('csv'):
        df = pd.read_csv(path, *args, **kwargs)
    else:
        xl = pd.ExcelFile(path)
        if len(xl.sheet_names) > 1 and 'sheet_name' not in kwargs:
            kwargs['sheet_name'] = 'meta'
        df = pd.read_excel(path, *args, **kwargs)

    req_cols = ['model', 'scenario', 'exclude']
    if not set(req_cols).issubset(set(df.columns)):
        e = 'File `{}` does not have required columns ({})!'
        raise ValueError(e.format(path, req_cols))

    # set index, filter to relevant scenarios from imported metadata file
    df.set_index(META_IDX, inplace=True)
    idx = self.meta.index.intersection(df.index)

    n_invalid = len(df) - len(idx)
    if n_invalid > 0:
        msg = 'Ignoring {} scenario{} from imported metadata'
        logger().info(msg.format(n_invalid, 's' if n_invalid > 1 else ''))

    if idx.empty:
        raise ValueError('No valid scenarios in imported metadata file!')

    df = df.loc[idx]

    # merge in imported metadata
    msg = 'Importing metadata for {} scenario{} (for total of {})'
    logger().info(msg.format(len(df), 's' if len(df) > 1 else '',
                             len(self.meta)))

    for col in df.columns:
        self._new_meta_column(col)
        self.meta[col] = df[col].combine_first(self.meta[col])

    # set column `exclude` to bool
    self.meta.exclude = self.meta.exclude.astype('bool')
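# Usage sketch for `load_metadata()`, reusing `df` from the earlier sketch;
# `meta.xlsx` is an illustrative file name. If the workbook has several
# sheets and no `sheet_name` is given, the sheet named `meta` is read, and
# only scenarios already present in `df` are merged in.
df.load_metadata('meta.xlsx')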
def __init__(self, name): """ Parameters ---------- name : str A valid database name. For available options, see valid_connection_names(). """ valid = valid_connection_names() if name not in valid: raise ValueError('{} is not a valid name. Choose one of {}'.format( name, valid)) logger().info( 'You are connected to the {} {}. Please cite as:\n\n{}'.format( name, 'scenario explorer', _CITATIONS[name])) self.base_url = _URL_TEMPLATE.format(name)
def require_variable(self, variable, unit=None, year=None,
                     exclude_on_fail=False):
    """Check whether all scenarios have a required variable

    Parameters
    ----------
    variable: str
        required variable
    unit: str, default None
        name of unit (optional)
    year: int or list, default None
        years (optional)
    exclude_on_fail: bool, default False
        flag scenarios missing the required variable as `exclude: True`
    """
    criteria = {'variable': variable}
    if unit:
        criteria.update({'unit': unit})
    if year:
        criteria.update({'year': year})

    keep = _apply_filters(self.data, self.meta, criteria)
    idx = self.meta.index.difference(_meta_idx(self.data[keep]))

    n = len(idx)
    if n == 0:
        logger().info('All scenarios have the required variable `{}`'
                      .format(variable))
        return

    msg = '{} scenario does not include required variable `{}`' if n == 1 \
        else '{} scenarios do not include required variable `{}`'

    if exclude_on_fail:
        self.meta.loc[idx, 'exclude'] = True
        msg += ', marked as `exclude: True` in metadata'

    logger().info(msg.format(n, variable))
    return pd.DataFrame(index=idx).reset_index()
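# Usage sketch for `require_variable()`, reusing `df` from the earlier
# sketch: list scenarios that do not report `Primary Energy` (an
# illustrative variable name) in 2010, flagging them as `exclude: True`.
missing = df.require_variable('Primary Energy', year=2010,
                              exclude_on_fail=True)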
def filter(self, filters=None, keep=True, inplace=False, **kwargs):
    """Return a filtered IamDataFrame (i.e., a subset of current data)

    Parameters
    ----------
    keep: bool, default True
        keep all scenarios satisfying the filters (if True) or the inverse
    inplace: bool, default False
        if True, do operation inplace and return None
    filters by kwargs or dict (deprecated):
        The following columns are available for filtering:
         - metadata columns: filter by category assignment in metadata
         - 'model', 'scenario', 'region', 'variable', 'unit':
           string or list of strings, where ``*`` can be used as a wildcard
         - 'level': the maximum "depth" of IAM variables (number of '|')
           (excluding the strings given in the 'variable' argument)
         - 'year': takes an integer, a list of integers or a range
           note that the last year of a range is not included,
           so ``range(2010, 2015)`` is interpreted as ``[2010, ..., 2014]``
         - 'regexp=True' overrides pseudo-regexp syntax in `pattern_match()`
    """
    if filters is not None:
        warnings.warn(
            '`filters` keyword argument in `filter()` is deprecated '
            'and will be removed in the next release')
        kwargs.update(filters)

    _keep = _apply_filters(self.data, self.meta, kwargs)
    _keep = _keep if keep else ~_keep
    ret = copy.deepcopy(self) if not inplace else self
    ret.data = ret.data[_keep]

    idx = pd.MultiIndex.from_tuples(
        pd.unique(list(zip(ret.data['model'], ret.data['scenario']))),
        names=('model', 'scenario')
    )
    if len(idx) == 0:
        logger().warning('Filtered IamDataFrame is empty!')
    ret.meta = ret.meta.loc[idx]
    if not inplace:
        return ret
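# Usage sketches for `filter()`, following the docstring above: `*` is a
# wildcard for string columns, and the last year of a `range` is excluded.
# `df` is reused from the earlier sketch.
world_co2 = df.filter(variable='Emissions|CO2|*', region='World',
                      year=range(2010, 2051))  # years 2010 through 2050
non_co2 = df.filter(variable='Emissions|CO2*', keep=False)  # the inverse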
def validate(self, criteria={}, exclude_on_fail=False):
    """Validate scenarios using criteria on timeseries values

    Parameters
    ----------
    criteria: dict
        dictionary with variable keys and check values
        ('up' and 'lo' for respective bounds, 'year' for years)
    exclude_on_fail: bool, default False
        flag scenarios failing validation as `exclude: True`
    """
    df = _apply_criteria(self.data, criteria, in_range=False)

    if not df.empty:
        msg = '{} of {} data points do not satisfy the criteria'
        logger().info(msg.format(len(df), len(self.data)))

        if exclude_on_fail and len(df) > 0:
            self._exclude_on_fail(df)
    return df
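# Usage sketch for `validate()`, reusing `df` from the earlier sketch:
# return all data points where `Primary Energy` (illustrative variable and
# bound) exceeds 900 in 2050, marking offending scenarios `exclude: True`.
failed = df.validate(criteria={'Primary Energy': {'up': 900, 'year': 2050}},
                     exclude_on_fail=True)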
def _exclude_on_fail(self, df): """Assign a selection of scenarios as `exclude: True` in meta""" idx = df if isinstance(df, pd.MultiIndex) else _meta_idx(df) self.meta.loc[idx, 'exclude'] = True logger().info('{} non-valid scenario{} will be excluded'.format( len(idx), '' if len(idx) == 1 else 's'))
def test_context_adjust_log_level():
    # 20 is logging.INFO, 40 is logging.ERROR
    assert logger().getEffectiveLevel() == 20
    with adjust_log_level():
        assert logger().getEffectiveLevel() == 40
    assert logger().getEffectiveLevel() == 20
def map_regions(self, map_col, agg=None, copy_col=None, fname=None,
                region_col=None, remove_duplicates=False, inplace=False):
    """Map the regions in this dataframe to a different regional
    classification using a region-mapping file

    Parameters
    ----------
    map_col: string
        The column used to map new regions to. Common examples include
        iso and 5_region.
    agg: string, optional
        Perform a data aggregation. Options include: sum.
    copy_col: string, optional
        Copy the existing region data into a new column for later use.
    fname: string, optional
        Use a non-default region mapping file
    region_col: string, optional
        Use a non-default column name for regions to map from.
    remove_duplicates: bool, optional, default: False
        If there are duplicates in the mapping from one regional level to
        another, then remove these duplicates by counting the most common
        mapped value.
        This option is most useful when mapping from high resolution
        (e.g., model regions) to low resolution (e.g., 5_region).
    inplace : bool, default False
        if True, do operation inplace and return None
    """
    models = self.meta.index.get_level_values('model').unique()
    fname = fname or run_control()['region_mapping']['default']
    mapping = read_pandas(fname).rename(str.lower, axis='columns')
    map_col = map_col.lower()

    ret = copy.deepcopy(self) if not inplace else self
    _df = ret.data
    columns_ordered = _df.columns

    # merge data
    dfs = []
    for model in models:
        df = _df[_df['model'] == model]
        _col = region_col or '{}.REGION'.format(model)
        _map = mapping.rename(columns={_col.lower(): 'region'})
        _map = _map[['region', map_col]].dropna().drop_duplicates()
        _map = _map[_map['region'].isin(_df['region'])]
        if remove_duplicates and _map['region'].duplicated().any():
            # find duplicates
            where_dup = _map['region'].duplicated(keep=False)
            dups = _map[where_dup]
            logger().warning("""
            Duplicate entries found for the following regions.
            Mapping will occur only for the most common instance.
            {}""".format(dups['region'].unique()))
            # get non duplicates
            _map = _map[~where_dup]
            # order duplicates by the count frequency
            dups = (dups
                    .groupby(['region', map_col])
                    .size()
                    .reset_index(name='count')
                    .sort_values(by='count', ascending=False)
                    .drop('count', axis=1))
            # take the top occurrence
            dups = dups[~dups['region'].duplicated(keep='first')]
            # combine them back
            _map = pd.concat([_map, dups])

        if copy_col is not None:
            df[copy_col] = df['region']

        df = (df
              .merge(_map, on='region')
              .drop('region', axis=1)
              .rename(columns={map_col: 'region'})
              )
        dfs.append(df)
    df = pd.concat(dfs)

    # perform aggregations
    if agg == 'sum':
        df = df.groupby(LONG_IDX).sum().reset_index()

    ret.data = (df
                .reindex(columns=columns_ordered)
                .sort_values(SORT_IDX)
                .reset_index(drop=True)
                )
    if not inplace:
        return ret
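# A standalone pandas sketch of the `remove_duplicates` resolution used in
# `map_regions()` above: when one source region maps to several target
# values, only the most common mapping survives.
import pandas as pd

_map = pd.DataFrame({'region': ['A', 'A', 'A', 'B'],
                     'iso': ['x', 'x', 'y', 'z']})
where_dup = _map['region'].duplicated(keep=False)
dups = _map[where_dup]
dups = (dups
        .groupby(['region', 'iso'])
        .size()
        .reset_index(name='count')
        .sort_values(by='count', ascending=False)
        .drop('count', axis=1))
dups = dups[~dups['region'].duplicated(keep='first')]
resolved = pd.concat([_map[~where_dup], dups])
# `resolved` maps 'A' -> 'x' (two occurrences beat one) and keeps 'B' -> 'z'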
def check_aggregate_regions(self, variable, region='World', components=None, units=None, exclude_on_fail=False, **kwargs): """Check whether the region timeseries data match the aggregation of components Parameters ---------- variable: str variable to be checked for matching aggregation of components data region: str region to be checked for matching aggregation of components data components: list of str, default None list of regions, defaults to all regions except region units: str or list of str, default None filter variable and components for given unit(s) exclude_on_fail: boolean, default False flag scenarios failing validation as `exclude: True` kwargs: passed to `np.isclose()` """ var_df = self.filter(variable=variable, level=0) if components is None: components = list(set(var_df.data.region) - set([region])) if not len(components): msg = ( 'cannot check regional aggregate for `{}` because it has no ' 'regional components' ) logger().info(msg.format(variable)) return None # filter and groupby data, use `pd.Series.align` for matching index df_region, df_components = ( _aggregate_by_regions(var_df.data, region, units) .align(_aggregate_by_regions(var_df.data, components, units)) ) df_components.index = df_components.index.droplevel( "variable" ) # Add in variables that are included in region totals but which # aren't included in the regional components. # For example, if we are looking at World and Emissions|BC, we need # to add aviation and shipping to the sum of Emissions|BC for each # of World's regional components to do a valid check. different_region = components[0] var_list = pd.Series(self.data.variable.unique()) var_components = var_list[pattern_match(var_list, '{}|*'.format(variable), 0)] for var_to_add in var_components: var_rows = self.data.variable == var_to_add region_rows = self.data.region == different_region var_has_regional_info = (var_rows & region_rows).any() if not var_has_regional_info: df_var_to_add = self.filter( region=region, variable=var_to_add ).data.groupby(REGION_IDX).sum()['value'] df_var_to_add.index = df_var_to_add.index.droplevel("variable") if len(df_var_to_add): df_components = df_components.add(df_var_to_add, fill_value=0) df_components = pd.concat([df_components], keys=[variable], names=['variable']) # use `np.isclose` for checking match diff = df_region[~np.isclose(df_region, df_components, **kwargs)] if len(diff): msg = ( '{} - {} of {} data points are not aggregates of regional ' 'components' ) logger().info(msg.format(variable, len(diff), len(df_region))) if exclude_on_fail: self._exclude_on_fail(diff.index.droplevel([2, 3])) diff = pd.concat([diff], keys=[region], names=['region']) return diff.unstack().rename_axis(None, axis=1)
def format_data(df, **kwargs): """Convert a `pd.Dataframe` or `pd.Series` to the required format""" if isinstance(df, pd.Series): df = df.to_frame() # Check for R-style year columns, converting where necessary def convert_r_columns(c): try: first = c[0] second = c[1:] if first == 'X': try: # bingo! was X2015 R-style, return the integer return int(second) except: # nope, not an int, fall down to final return statement pass except: # not a string/iterable/etc, fall down to final return statement pass return c df.columns = df.columns.map(convert_r_columns) # if `value` is given but not `variable`, # melt value columns and use column name as `variable` if 'value' in kwargs and 'variable' not in kwargs: value = kwargs.pop('value') value = value if islistable(value) else [value] _df = df.set_index(list(set(df.columns) - set(value))) dfs = [] for v in value: if v not in df.columns: raise ValueError('column `{}` does not exist!'.format(v)) vdf = _df[v].to_frame().rename(columns={v: 'value'}) vdf['variable'] = v dfs.append(vdf.reset_index()) df = pd.concat(dfs).reset_index(drop=True) # otherwise, rename columns or concat to IAMC-style or do a fill-by-value for col, value in kwargs.items(): if col in df: raise ValueError('conflict of kwarg with column `{}` in dataframe!' .format(col)) if isstr(value) and value in df: df.rename(columns={value: col}, inplace=True) elif islistable(value) and all([c in df.columns for c in value]): df[col] = df.apply(lambda x: concat_with_pipe(x, value), axis=1) df.drop(value, axis=1, inplace=True) elif isstr(value): df[col] = value else: raise ValueError('invalid argument for casting `{}: {}`' .format(col, value)) # all lower case str_cols = [c for c in df.columns if isstr(c)] df.rename(columns={c: str(c).lower() for c in str_cols}, inplace=True) if 'notes' in df.columns: # this came from the database logger().info('Ignoring notes column in dataframe') df.drop(columns='notes', inplace=True) col = df.columns[0] # first column has database copyright notice df = df[~df[col].str.contains('database', case=False)]
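# Sketch of the `value`-kwarg path in `format_data()` above: a table whose
# data column is named `capacity` (an illustrative name) is melted so that
# the column name becomes the `variable` entry and the data moves into a
# `value` column, yielding one IAMC-style row with variable == 'capacity'.
cap = format_data(
    pd.DataFrame({'model': ['model_a'], 'scenario': ['scen_a'],
                  'region': ['World'], 'unit': ['GW'], 'year': [2010],
                  'capacity': [0.5]}),
    value='capacity',
)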