def test_empty_xlsx(self):
    """Build a CERO from ``test_empty_xlsx.yaml`` and validate it in strict and default mode."""
    conf_path = TestToCERO._dd + r"test_empty_xlsx.yaml"
    cero = ToCERO(conf=conf_path).create_cero()

    # With ``empty_ok=False`` validation is expected to raise ``CERO.EmptyCERO``.
    with self.assertRaises(CERO.EmptyCERO):
        self.assertTrue(CERO.is_cero(cero, empty_ok=False))

    # With default arguments the same object is expected to validate.
    self.assertTrue(CERO.is_cero(cero))
def test_fillna(self):
    """Check ``libfuncs.fillna`` with an explicit value, with ``method="bfill"`` and with defaults."""
    rows = {"A": [1, 2, 3], "B": [3, 4, 5], "C": [6, 7, 8]}
    years = pd.to_datetime([2017, 2018, 2019], format="%Y")

    df = pd.DataFrame.from_dict(rows, orient="index")
    df.columns = pd.DatetimeIndex(years)
    df.sort_index(inplace=True)
    df.iloc[1, 1] = pd.np.nan
    df = df.astype(pd.np.float32)
    self.assertTrue(CERO.is_cero(df))

    # (keyword arguments for fillna, value expected in the previously-missing cell)
    scenarios = [
        ({"value": 0.0}, 0.0),
        ({"method": "bfill"}, 5.0),
        ({}, 3.0),
    ]
    first = True
    for fill_kwargs, expected in scenarios:
        if not first:
            # Re-create the gap that the previous scenario filled.
            df.iloc[1, 1] = pd.np.nan
        first = False
        libfuncs.fillna(df, **fill_kwargs)
        self.assertTrue(df.iloc[1, 1] == expected)

    self.assertTrue(CERO.is_cero(df))
def test_rename_2(self):
    """Build a CERO from ``test_rename_2.yaml`` and compare the leading index values."""
    conf_path = TestToCERO._dd + r"test_rename_2.yaml"
    cero = ToCERO(conf=conf_path).create_cero()
    CERO.is_cero(cero)

    # Rename operation always moves series to the end
    expected_idx = ["PROFESSIONALS", ("1", "MANAGERS")]
    actual_idx = cero.index.tolist()
    self.assertTrue(
        all(want == got for want, got in zip(expected_idx, actual_idx)))
def run(self) -> None:
    """
    Execute a scenario run.

    Steps, in order:

    1. Create a CERO from every item of ``self["input_conf"]`` and combine them into ``self.cero``
       (``self.cero`` stays an empty CERO if there are no inputs).
    2. Export ``self.cero`` to ``<name>_<run_no>_step_00.xlsx``.
    3. Run each model of ``self["models"]`` against ``self.cero``; a model returning ``None`` is skipped,
       otherwise its output is (optionally) exported and then merged into ``self.cero``.
    4. Execute the procedures of every item of ``self["output_conf"]`` on the final ``self.cero``.

    :raises TypeError: If a model returns an object that is neither ``None`` nor a CERO.
    """
    self.cero = CERO.create_empty()

    input_ceros = [conf.create_cero() for conf in self["input_conf"]]
    if input_ceros:
        self.cero = CERO.combine_ceros(input_ceros)
        print("Successfully loaded scenario inputs as CERO.")

    initial_file = self.get_name() + "_%03d_step_%02d.xlsx" % (self["run_no"], 0)
    FromCERO.dataframe_out(self.cero, initial_file, "xlsx")

    for step, model in enumerate(self["models"], start=1):
        model_cero = model.run(self.cero)
        print("Completed run of model (%s) at %s." %
              (model["name"], dt.datetime.now().strftime('%Y-%m-%d %H:%M')))

        # If output_conf is not defined for a model, then None is returned...
        if model_cero is None:
            continue

        if not CERO.is_cero(model_cero):
            raise TypeError(
                "Object returned from model run is *not* of CERO format.")

        # By default, export model outputs automatically to xlsx files.
        # A model-level setting takes precedence over the scenario-level one.
        export_model = model.get("export_mod_xlsx",
                                 self.get("export_mod_xlsx", True))
        if export_model:
            model_out_file = self.get_name() + "_%03d_%s.xlsx" % (
                self["run_no"], model["name"])
            print("Exporting output of %s to %s." % (model["name"],
                                                     model_out_file))
            model_cero.to_excel(model_out_file)

        self.cero = CERO.combine_ceros([self.cero, model_cero])

        # If true (default), export the intermediate steps to xlsx files
        if self.get("export_int_xlsx", True):
            step_file = self.get_name() + "_%03d_step_%02d.xlsx" % (
                self["run_no"], step)
            print("Exporting updated CERO to %s." % (step_file))
            self.cero.to_excel(step_file)

    for out_conf in self["output_conf"]:
        out_conf.exec_procedures(self.cero)

    print("Completed generation of scenario outputs.")
def groupby(df: "CERO", *args, key: "Union[int, list[int]]" = None, match: str = None, agg: str = None, **kwargs):
    """
    Aggregate rows of ``df`` that share the same identifier fields.

    For every identifier to match, the rows whose fields at positions ``key`` equal that identifier are
    aggregated with ``agg``. The aggregate is written into the first matching row (other rows are left in
    place) and that row is renamed to the identifier reduced to the ``key`` fields.

    :param df: The CERO to operate on. It is modified in place and also returned.
    :param key: Field position, or list of field positions, of the index identifiers to group on.
    :param match: Identifier to match. If ``None``, every unique combination of ``key`` fields found in
        ``df.index`` is processed.
    :param agg: Aggregation passed to ``DataFrameGroupBy.agg``.
    :param kwargs: Extra keyword arguments for ``df.groupby`` - these override the defaults
        ``axis=0, sort=False, group_keys=False``.
    :return: ``df``.
    :raises TypeError: If ``key`` is missing or is not an int/list of ints.
    """
    key_msg = "'key' must be provided to 'groupby' function as either an int or list of ints."
    if key is None:
        raise TypeError(key_msg)
    if not isinstance(key, list):
        key = [key]
    if any(not isinstance(k, int) for k in key):
        raise TypeError(key_msg)

    groupby_kwargs = {"axis": 0, "sort": False, "group_keys": False}
    groupby_kwargs.update(kwargs)

    match = _Identifier.tupleize_name(match)
    if match is None:
        targets = _Identifier.unique_id_fields(df.index.values, key=key)
    else:
        targets = [match]
    # String identifiers are converted with ``tuple()`` so that they can be indexed by position below.
    targets = [tuple(t) if isinstance(t, str) else t for t in targets]

    def make_selector(target):
        """Return a predicate that is True for identifiers whose ``key`` fields equal ``target``."""
        def selector(ident):
            return all([ident[k] == target[pos] for pos, k in enumerate(key)])
        return selector

    renames = {}
    for target in targets:
        is_member = make_selector(target)

        # Group rows into matching (True) / non-matching (False) and aggregate
        aggregated = df.groupby(by=is_member, **groupby_kwargs).agg(agg)

        # Put aggregated calculation in first row that meets the condition
        first_row = next(ident for ident in df.index.values if is_member(ident))
        df.iloc[df.index.get_loc(first_row)] = aggregated.loc[True]

        # Schedule rename of that row
        renames[first_row] = _Identifier.keep_only_fields(key, first_row)[0]

    CERO.rename_index_values(df, renames, inplace=True)
    return df
def exec_ops(self, cero):
    """
    Execute the operations of this procedure and emit the selected output rows.

    :param cero: The cero (``pandas.DataFrame``) object upon which to execute the operations. No modifications
        will be applied to the original cero (i.e. all modifications are applied to a copy of ``cero``).
    :return: ``None`` if ``"outputs"`` is explicitly ``None`` or if the output was written to
        ``self["file"]``; otherwise a single-item ``dict`` mapping ``self["name"]`` to the output data frame.
    """
    self._set_inputs(cero)

    for operation in self["operations"]:
        result = self._exec_op(operation)
        if result is None:
            continue
        self.inputs = CERO.combine_ceros([self.inputs, result], overwrite=True)

    # The result of this procedures operations is to be explicitly ignored, may be useful when objective is
    # simply to plot data
    if "outputs" in self and self["outputs"] is None:
        return

    # Get all rows if none specified (key missing, empty list, or True)
    requested = self.get("outputs", [])
    if requested == [] or requested == True:  # noqa: E712 - equality (not identity) is intended
        self["outputs"] = self.inputs.index.tolist()

    row_positions = [self.inputs.index.get_loc(o) for o in self["outputs"]]
    out_df = self.inputs.iloc[row_positions]
    assert issubclass(type(out_df), pd.DataFrame)

    if "file" not in self:
        # procedure output name is that provided
        return {self["name"]: out_df}

    # If file is specified, all 'outputs' from this procedure go to its own file
    output_type = os.path.splitext(self["file"])[1][1:]
    FromCERO.dataframe_out(out_df, self["file"], output_type, self.get("output_kwargs"))
def test_plotoutput(self):
    """Run the ``test_plotoutput.yaml`` procedures and check that the plot file can be loaded.

    The test is skipped when ``seaborn`` cannot be imported.
    """
    try:
        import seaborn  # noqa: F401 - imported only to probe availability
    except ImportError as err:
        # Report the module that actually failed to import, and why.
        raise unittest.SkipTest(
            "seaborn could not be imported (%s), and therefore ConCERO's plotting capabilities cannot be used." % err)

    nf = "AssociateProfessionals.png"

    # CERO path
    png = DataTools.get_test_data(TestPlotOutput._dd + "test_plotoutput.png")
    cero = CERO.read_xlsx(TestPlotOutput._dd + "test_plotoutput.xlsx")
    fc = FromCERO(TestPlotOutput._dd + "test_plotoutput.yaml")
    fc.exec_procedures(cero)
    plt = DataTools.get_test_data(nf)

    # ``plt`` and ``png`` are deliberately not compared: figures are very hard to compare accurately -
    # defaults seem to differ depending on operating system.

    # Tidy up
    os.remove(os.path.relpath(nf))
def test_apply_func(self):
    """Check that ``libfuncs.apply_func`` with ``numpy_func="square"`` squares every value in place."""

    def build_frame(rows):
        # Assemble a float32 frame with yearly datetime columns, sorted by index.
        frame = pd.DataFrame.from_dict(rows, orient="index", dtype=pd.np.float32)
        frame.columns = pd.DatetimeIndex(
            pd.to_datetime([2017, 2018, 2019], format="%Y"))
        frame.sort_index(inplace=True)
        return frame

    df = build_frame({"A": [1, 2, 3], "B": [3, 4, 5], "C": [6, 7, 8]})
    self.assertTrue(CERO.is_cero(df))

    libfuncs.apply_func(df, numpy_func="square")

    expected = build_frame({
        "A": [1, 4, 9],
        "B": [9, 16, 25],
        "C": [36, 49, 64],
    })
    self.assertTrue(df.equals(expected))
def test__import_vd(self):
    """Check ``ToCERO._FileObj._import_vd`` on ``test__import_vd.VD``, without and with ``default_year``."""
    nan = pd.np.nan
    f32 = pd.np.float32
    var_act_id = ("VAR_Act", "-", "FT_COMELC", "ACT", "2015", "PD", "-")

    def import_vd(**extra):
        # Import the .VD file with the given extra file options and normalise column names and dtype.
        file_conf = {
            "file": TestToCERO_FileObj._dd + "test__import_vd.VD",
            "date_col": 3,
            "val_col": 8,
        }
        file_conf.update(extra)
        imported = ToCERO._FileObj(file_conf)._import_vd()
        imported.columns.set_names([None], inplace=True)
        return imported.astype(f32)

    # --- Without a default year ---
    df = import_vd()

    expected = pd.DataFrame.from_dict(
        {
            var_act_id: [
                0.740833333333336, 0.740833333333336, 0.8005115537522,
                0.829127920241238
            ]
        },
        orient="index",
        dtype=f32)
    expected.columns = pd.Index([2015, 2016, 2020, 2025])
    expected.sort_index(inplace=True)
    self.assertTrue(expected.equals(df))

    # --- With ``default_year`` set to 2018 ---
    df = import_vd(default_year=2018)
    df.sort_index(inplace=True)

    expected_values = [
        [0.740833333333336, 0.740833333333336, nan, 0.8005115537522, 0.829127920241238],
        [nan, nan, 1.39891653080538, nan, nan],
        [nan, nan, 19.6047685777802, nan, nan],
        [nan, nan, 31516.8951973493, nan, nan],
    ]
    expected = pd.DataFrame(data=expected_values,
                            columns=[2015, 2016, 2018, 2020, 2025],
                            dtype=f32)
    expected.index = CERO.create_cero_index([
        var_act_id,
        ("Cost_Salv", "-", "EN_WinONS-26", "ADE", "2040", "-", "-"),
        ("Cost_NPV", "-", "EE_StmTurb009", "CQ", "-", "-", "ACT"),
        ("Reg_irec", "-", "-", "WA", "-", "-", "-"),
    ])
    expected.sort_index(inplace=True)
    self.assertTrue(expected.equals(df))
def _set_inputs(self, cero: pd.DataFrame):
    """Copy the rows of ``cero`` named by ``self["inputs"]`` into ``self.inputs`` and apply ``self["map"]``.

    The rows are deep-copied, which ensures that ``operations`` do not alter ``cero``. If ``self["inputs"]``
    is an empty list, every row of ``cero`` is used (and ``self["inputs"]`` is set accordingly).

    Each item of ``self.get("map", [])`` names an ``"orig"`` and a ``"new"`` set (looked up in
    ``self["sets"]``) and optionally an ``"idx"``. Identifiers of ``self.inputs`` are renamed by replacing a
    member of the ``orig`` set with the member of the ``new`` set at the same position - either in field
    ``idx`` of a non-string identifier, or as the whole identifier.

    :param cero: The CERO from which the inputs are copied.
    :raises KeyError: If one or more of ``self["inputs"]`` does not exist in ``cero.index``.
    """
    if self["inputs"] == []:
        # Input is entire CERO unless otherwise specified
        self["inputs"] = cero.index.tolist()

    try:
        # Reduce data frame to necessary data and copy
        row_positions = [cero.index.get_loc(loc) for loc in self["inputs"]]
        self.inputs = copy.deepcopy(cero.iloc[row_positions])
    except KeyError:
        # The list of offending inputs is only needed for the error message, so it is built here
        # rather than on every call.
        invalid_inputs = [i for i in self["inputs"] if i not in cero.index]
        msg = ("Inputs %s do not exist. The most likely reason is that the configuration file is " +
               "incorrectly specified, or lacks specification. If debugging level has been set to " +
               "'DEBUG', then the input list is in the log file - note that this list may be " +
               "extraordinarily long. Common causes of this problem include: \n" +
               " 1. Set definition in configuration file includes elements that do not exist in the CERO.\n" +
               " 2. Mis-spellings of identifiers in the configuration file (which includes names of sets for 'inputs' or 'arrays').\n" +
               " 3. Incorrect ordering of sets in the identifier."
               ) % invalid_inputs
        FromCERO._logger.error(msg)
        raise KeyError(msg)

    assert (isinstance(self.inputs, pd.DataFrame))

    map_dict = {}
    for map_op in self.get("map", []):
        idx = map_op.get("idx")
        orig_s = self["sets"][map_op["orig"]]
        ns = self["sets"][map_op["new"]]

        for val in self.inputs.index.values:
            new_val = val

            # The string check comes first: indexing a string identifier with ``idx`` is meaningless
            # and raises IndexError when the string is shorter than ``idx + 1`` characters.
            if idx is not None and (not isinstance(val, str)) and (val[idx] in orig_s):
                new_val = val[:idx] + (ns[orig_s.index(val[idx])],) + val[idx + 1:]
            elif val in orig_s:
                new_val = ns[orig_s.index(val)]

            map_dict[val] = new_val

        # Apply this mapping before the next one is built, so that subsequent mappings operate on the
        # renamed identifiers.
        CERO.rename_index_values(self.inputs, map_dict, inplace=True)
def exec_procedures(self, cero):
    """ Execute all the procedures of the FromCERO object.

    The output of every procedure that does not write to its own file is collected in
    ``self.output_procedures`` and, if there is any, combined and exported to ``self["file"]``. If no
    procedures are defined, ``cero`` itself (after renaming with ``self["map"]``) is the output.

    :param pandas.DataFrame cero: A CERO to serve as input for the procedures. The argument is not mutated/modified.
    :raises ValueError: If the extension of ``self["file"]`` is not supported for general export.
    :raises Exception: Any exception raised by a procedure is re-raised, as the same class where possible,
        with the procedure name appended to the message.
    """
    CERO.is_cero(cero, raise_exception=True, empty_ok=False)

    # Rename on a copy, so that the caller's CERO is left untouched.
    mapping = self.get("map", {})
    if mapping:
        cero = CERO.rename_index_values(cero, mapping, inplace=False)

    self.output_procedures = OrderedDict()

    for procedure in self["procedures"]:
        try:
            # if ret is not None, should be dict with key: procedure["name"], value: resultant CERO
            ret = procedure.exec_ops(cero)
        except Exception as e:
            msg = e.__str__() + " Error in procedure '%s'." % (procedure["name"])
            try:
                annotated = e.__class__(msg)
            except Exception:
                # Not every exception class can be constructed from a single message.
                annotated = None
            if annotated is None:
                # Re-raise the original error rather than masking it with a constructor failure.
                raise
            raise annotated from e

        if ret is None:
            ret = {}
        self.output_procedures.update(ret)

    if not self["procedures"]:
        # If empty list
        self.output_procedures["default_output"] = cero

    if any(not procedure.get("file") for procedure in self["procedures"]):
        msg = "It has been detected that not all procedures direct output to file. Therefore some output will go to \'%s\'." % self["file"]
        print(msg)
        FromCERO._logger.info(msg)

    if self.output_procedures != {}:
        file_ext = os.path.splitext(self["file"])[1][1:]
        if file_ext in FromCERO.sup_output_types:
            out_df = CERO.combine_ceros(list(self.output_procedures.values()))
            FromCERO.dataframe_out(out_df, self["file"], output_type=file_ext)
        elif file_ext in FromCERO._Procedure.sup_output_types:
            raise ValueError("This data type is not supported for general export, because it probably has a more than 2 dimensions - export using 'procedures' instead.")
        else:
            raise ValueError("Unsupported data type detected for general export.")
def test_complex_xlsx(self):
    """Build a CERO from ``test_complex_xlsx_import.yaml`` and compare it with the stored result."""
    conf_path = TestToCERO._dd + r'test_complex_xlsx_import.yaml'
    cero = ToCERO(conf=conf_path).create_cero()

    expected = DataTools.get_test_data(TestToCERO._dd + "test_complex_xlsx_result.pickle")

    self.assertTrue(CERO.is_cero(cero))
    self.assertTrue(cero.equals(expected))
def test_csv_complex(self):
    """Read ``test_csv_complex.csv`` with ``CERO.read_csv`` and compare with a hand-built frame."""
    values = np.array([[3.78981, 2.73377], [2.22027, 3.99257]])
    years = pd.to_datetime([2017, 2018], format="%Y")

    expected = pd.DataFrame(data=values,
                            index=[("a", "b"), "c"],
                            dtype=pd.np.float32)
    expected.columns = pd.DatetimeIndex(years)

    cero = CERO.read_csv(TestCERO._dd + "test_csv_complex.csv")
    self.assertTrue(expected.equals(cero))
def _exec_op(self, op: dict):
    """Apply a single operation to ``self.inputs`` and return its (optionally renamed) result.

    ``op`` is modified: the keys ``'func'``, ``'args'``, ``'rename'`` and ``'arrays'`` are popped, the
    remaining items are passed to the function as keyword arguments, and ``op['func']`` is finally set to
    the name of the function that was called.

    :param op: The operation definition. ``'func'`` (default ``"noop"``) names a function that is looked up
        in the modules of ``self["libfuncs"]``.
    :return: The return value of the function. If ``'rename'`` was given, the result (or, when the function
        returned ``None``, the result of ``libfuncs.noop``) restricted to the renamed rows.
    :raises AttributeError: If no module of ``self["libfuncs"]`` provides the function.
    """
    # Perform noop (no-operation) if no func provided.
    func_name = op.pop('func', "noop")
    positional = op.pop('args', [])
    rename = op.pop("rename", None)

    arrays = None
    if "arrays" in op:
        arrays = op.pop("arrays")
        if isinstance(arrays, str):
            arrays = [arrays]
        arrays = _Identifier.get_all_idents(arrays, sets=self["sets"])

    # The first module that provides ``func_name`` wins.
    not_found = object()
    func = not_found
    for module in self["libfuncs"]:
        if hasattr(module, func_name):
            func = getattr(module, func_name)
            break
    if func is not_found:
        msg = ("Invalid function name provided - '%s'. Function does not exist in any of the modules %s. "
               "It may be necessary to create a python module with the necessary functions and provide "
               "this file with the 'libfuncs' option." % (func_name, self["libfuncs"]))
        FromCERO._logger.error(msg)
        raise AttributeError(msg)

    FromCERO._logger.debug("Function call: %s(*arrays, **op)" % func.__name__)
    ret = func(self.inputs, *positional, locs=arrays, **op)

    # For cleanliness of presentation
    op['func'] = func.__name__

    if rename is None:
        return ret

    if ret is None:
        ret = libfuncs.noop(self.inputs, *positional, locs=arrays, **op)

    if isinstance(rename, str):
        # Rename the first index by default
        rename = {ret.index.tolist()[0]: rename}

    if isinstance(rename, list):
        # Build mapping dictionary
        rename = _Identifier.get_mapping_dict(ret.index.tolist(), rename, sets=self.get("sets"))
    elif isinstance(rename, dict):
        rename = _Identifier.get_one_to_one_mapping(rename, sets=self.get("sets"))

    # At this point, rename should be one-to-one mapping dict
    old_names = list(rename.keys())
    new_names = list(rename.values())
    renamed = CERO.rename_index_values(ret.loc[old_names], rename, inplace=False)

    # Note that the result is restricted to only those rows that have been renamed.
    return renamed.loc[new_names]
def init_df():
    """Return a fresh 3x3 float32 frame (rows A-C, yearly columns 2017-2019) after asserting it is a CERO.

    Relies on ``self`` being available from the enclosing scope.
    """
    rows = {"A": [1, 2, 3], "B": [3, 4, 5], "C": [6, 7, 8]}
    years = pd.to_datetime([2017, 2018, 2019], format="%Y")

    frame = pd.DataFrame.from_dict(rows, orient="index")
    frame.columns = pd.DatetimeIndex(years)
    frame.sort_index(inplace=True)
    frame = frame.astype(pd.np.float32)

    self.assertTrue(CERO.is_cero(frame))
    return frame
def test_export_to_csv(self):
    """Run ``test_procedure_export_csv.yaml`` and check the leading rows of ``csv_export.csv``."""
    # Rows "A".."F" holding the single values 1..6
    rows = {name: [value] for value, name in enumerate("ABCDEF", start=1)}
    cero = pd.DataFrame.from_dict(rows, orient='index', dtype=pd.np.float32)
    cero.columns = pd.DatetimeIndex(data=pd.to_datetime([2018], format="%Y"))
    cero.sort_index(inplace=True)
    self.assertTrue(CERO.is_cero(cero))

    FromCERO(cfg.d_td + "test_procedure_export_csv.yaml").exec_procedures(cero)

    exported = pd.read_csv("csv_export.csv", index_col=0)

    expected_vals = [1, 2, 3]
    first_col = [row[0] for row in exported.values.tolist()]
    self.assertTrue(all(np.isclose(want, got) for want, got in zip(expected_vals, first_col)))

    expected_names = ["A", "B", "C"]
    self.assertTrue(all(want == got for want, got in zip(expected_names, exported.index.tolist())))

    os.remove("csv_export.csv")
def test_regex_format(self):
    """Import ``test_csv_regex.csv`` using both ``time_regex`` and ``time_fmt`` and check the result."""
    file_conf = {
        "file": TestToCERO._dd + "test_csv_regex.csv",
        # Regex could pick out just the year, but want to test 'time_fmt' as well...
        "time_regex": r"(Y\d{4}).*",
        "time_fmt": r"Y%Y",
    }
    cero = ToCERO({"files": [file_conf]}).create_cero()

    expected = pd.DataFrame(data=[[1, 2], [3, 4]],
                            columns=[2016, 2017],
                            dtype=pd.np.float32)
    expected.index = CERO.create_cero_index(["A", "B"])
    expected.columns = pd.DatetimeIndex(
        pd.to_datetime([2016, 2017], format="%Y"))
    expected.sort_index(inplace=True)

    self.assertTrue(cero.equals(expected))
def test_sets_and_mapping(self):
    """Run ``test_fromcero_mapping.yaml`` and check the leading rows of both exported csv files."""
    # Rows "A".."F" holding the single values 1..6
    rows = {name: [value] for value, name in enumerate("ABCDEF", start=1)}
    cero = pd.DataFrame.from_dict(rows, orient='index', dtype=pd.np.float32)
    cero.sort_index(inplace=True)
    cero.columns = pd.DatetimeIndex(
        data=pd.to_datetime([2018], format="%Y"))
    self.assertTrue(CERO.is_cero(cero))

    FromCERO(TestFromCERO._dd + "test_fromcero_mapping.yaml").exec_procedures(cero)

    # First file: values compared with a tolerance
    first = pd.read_csv("test_fromcero_mapping1.csv", index_col=0)
    first_vals = [row[0] for row in first.values.tolist()]
    self.assertTrue(
        all(np.isclose(want, got) for want, got in zip([1, 2, 3], first_vals)))
    self.assertTrue(
        all(want == got for want, got in zip(["A", "B", "C"], first.index.tolist())))

    # Second file: values compared exactly
    second = pd.read_csv("test_fromcero_mapping2.csv", index_col=0)
    second_vals = [row[0] for row in second.values.tolist()]
    self.assertTrue(
        all(want == got for want, got in zip([4, 5, 6], second_vals)))
    self.assertTrue(
        all(want == got for want, got in zip(["G", "H", "I"], second.index.tolist())))

    for exported_file in ("test_fromcero_mapping1.csv", "test_fromcero_mapping2.csv"):
        os.remove(exported_file)
def test_csv_orientation(self):
    """Check that ``test_csv_orientation.csv`` only imports when ``"orientation": "cols"`` is given."""
    csv_path = TestToCERO_FileObj._dd + "test_csv_orientation.csv"

    # Without an orientation the import is expected to fail with a TypeError.
    file_obj = ToCERO._FileObj({"file": csv_path})
    with self.assertRaises(TypeError):
        file_obj._import_file()

    file_obj = ToCERO._FileObj({"file": csv_path, "orientation": "cols"})
    imported = file_obj._import_file()

    expected = pd.DataFrame(data=[[1, 2], [3, 4]],
                            columns=[2016, 2017],
                            dtype=pd.np.float32)
    expected.index = CERO.create_cero_index(["A", "B"])
    expected.sort_index(inplace=True)

    self.assertTrue(imported.equals(expected))
def test_stitch_time(self):
    """Check that renaming ``b_set`` to ``a_set`` followed by ``pc_change`` gives the same result for every
    supported way of specifying the initial column (``init_cols`` / ``init_icols``, scalar or list)."""
    f32 = pd.np.float32
    nan = pd.np.nan

    init = pd.DataFrame.from_dict({"A": [1], "B": [2], "C": [3]},
                                  orient='index', dtype=f32)
    init.sort_index(inplace=True)
    init.columns = pd.DatetimeIndex(data=pd.to_datetime([2018], format="%Y"))

    cero = pd.DataFrame.from_dict({"D": [100, 200], "E": [50, 0], "F": [-50, 200]},
                                  orient='index', dtype=f32)
    cero.sort_index(inplace=True)
    cero.columns = pd.DatetimeIndex(data=pd.to_datetime([2019, 2020], format="%Y"))
    cero = CERO.combine_ceros([init, cero])

    test_df = pd.DataFrame.from_dict({"A": [1, 2, 6],
                                      "B": [2, 3, 3],
                                      "C": [3, 1.5, 4.5],
                                      "D": [nan, 100, 200],
                                      "E": [nan, 50, 0],
                                      "F": [nan, -50, 200]},
                                     orient='index', dtype=f32)
    test_df.sort_index(inplace=True)
    test_df.columns = pd.DatetimeIndex(data=pd.to_datetime([2018, 2019, 2020], format="%Y"))

    # (output file, keyword name, keyword value) - one entry per way of naming the initial column
    scenarios = [
        ("test_stitch_time.csv", "init_cols", [2018]),
        ("test_stitch_time2.csv", "init_cols", 2018),
        ("test_stitch_time3.csv", "init_icols", 0),
        ("test_stitch_time4.csv", "init_icols", [0]),
        ("test_stitch_time5.csv", "init_icols", [-3]),
    ]

    for out_file, init_key, init_val in scenarios:
        # The procedure definition is rebuilt for every scenario, so each run starts from fresh dicts.
        pc_change_op = {"func": "pc_change", "arrays": ["a_set"]}
        pc_change_op[init_key] = init_val
        proc = FromCERO._Procedure({
            "name": "test_stitch_time",
            "file": out_file,
            "sets": {"a_set": ["A", "B", "C"],
                     "b_set": ["D", "E", "F"]},
            "inputs": ["a_set", "b_set"],
            "operations": [{"func": "noop",
                            "rename": {"b_set": "a_set"}},
                           pc_change_op],
            "ref_dir": "."})
        proc.exec_ops(cero)

        written = os.path.join(os.path.abspath("."), out_file)
        df = ToCERO({"files": [{"file": written}]}).create_cero()

        self.assertTrue(df.equals(test_df))
        os.remove(out_file)
def test_is_cero(self):
    """Tests the validation method by feeding it deliberately False data.

    One defect is repaired at a time; after each repair ``CERO.is_cero`` is expected to raise
    ``CERO.InvalidCERO``, until the object finally validates.
    """

    def expect_invalid(candidate, failure_msg):
        # ``failure_msg`` is only the message reported if the expected exception is *not* raised.
        with self.assertRaises(CERO.InvalidCERO, msg=failure_msg):
            CERO.is_cero(candidate)

    def yearly_columns(years):
        return pd.DatetimeIndex(pd.to_datetime(years, format="%Y"))

    # Not a data frame at all
    expect_invalid(None, CERO._msg_inv_type)

    df = pd.DataFrame.from_dict({"A": [1, 2, 3], "B": [4, 5, 6]},
                                orient="index", dtype=int)

    # ``df`` is not even close to being a CERO...
    df.index = pd.MultiIndex.from_tuples([("A", 1), ("A", 2)])
    expect_invalid(df, CERO._msg_bad_idx)

    df.index = pd.Index(["A", "A"])
    expect_invalid(df, CERO._msg_bad_col)

    df.columns = yearly_columns([2017, 2017, 2019])
    expect_invalid(df, CERO._msg_idx_nunique)

    df.index = pd.Index(["A", "B"])
    expect_invalid(df, CERO._msg_col_nunique)

    df.columns = yearly_columns([2017, 2018, 2019])
    expect_invalid(df, CERO._msg_val_type)

    df = df.astype(pd.np.float32, copy=False)
    self.assertTrue(CERO.is_cero(df))
def test_sets_and_mapping2(self):
    """Run ``test_fromcero_mapping2.yaml`` and check values and identifiers of both exported xlsx files."""
    # Identifiers ("A", "1") .. ("C", "3") holding the single values 1..9
    source_ids = [(grp, num) for grp in "ABC" for num in "123"]
    rows = {ident: [value] for value, ident in enumerate(source_ids, start=1)}

    cero = pd.DataFrame.from_dict(rows, orient='index', dtype=pd.np.float32)
    cero.sort_index(inplace=True)
    cero.columns = pd.DatetimeIndex(
        data=pd.to_datetime([2018], format="%Y"))
    self.assertTrue(CERO.is_cero(cero))

    FromCERO(TestFromCERO._dd + "test_fromcero_mapping2.yaml").exec_procedures(cero)

    # (exported file, identifiers expected in that file)
    exports = [
        ("test_fromcero_complexmapping1.xlsx",
         [(grp, num) for grp in "GHI" for num in "123"]),
        ("test_fromcero_complexmapping2.xlsx",
         [(grp, num) for grp in "ABC" for num in "GHI"]),
    ]

    for xlsx_file, expected_ids in exports:
        reader = ToCERO({
            "files": [{
                "file": xlsx_file,
                "sheet": "CERO",
                "index_col": [0, 1]
            }]
        })
        imported = reader.create_cero()

        expected_vals = list(range(1, 10))
        imported_vals = [row[0] for row in imported.values.tolist()]
        self.assertTrue(
            all(np.isclose(want, got) for want, got in zip(expected_vals, imported_vals)))
        self.assertTrue(
            all(want == got for want, got in zip(expected_ids, imported.index.tolist())))

    # Tidy up only after both files have been checked
    for xlsx_file, _ in exports:
        os.remove(xlsx_file)
def test_create_empty(self):
    """The object returned by ``CERO.create_empty`` must validate with ``CERO.is_cero``."""
    self.assertTrue(CERO.is_cero(CERO.create_empty()))
def test_name_map(self):
    """Check ``CERO.rename_index_values`` with full/partial mappings, in place and on a copy."""

    def fresh_cero():
        # A new 3x3 float32 CERO with rows A-C and yearly columns 2017-2019.
        frame = pd.DataFrame.from_dict(
            {"A": [1, 2, 3], "B": [3, 4, 5], "C": [6, 7, 8]},
            orient="index")
        frame.columns = pd.DatetimeIndex(
            pd.to_datetime([2017, 2018, 2019], format="%Y"))
        frame.sort_index(inplace=True)
        frame = frame.astype(pd.np.float32)
        self.assertTrue(CERO.is_cero(frame))
        return frame

    def same_names(index, names):
        return all(got == want for got, want in zip(index.tolist(), names))

    # (mapping, index values expected after renaming)
    scenarios = [
        ({"A": "D", "B": "E", "C": "F"}, ["D", "E", "F"]),
        ({"B": "E", "C": "F"}, ["A", "E", "F"]),
    ]

    # Tests 1 and 2 - default call: nothing is returned and the frame itself is renamed
    for mapping, renamed_names in scenarios:
        df = fresh_cero()
        res = CERO.rename_index_values(df, mapping)
        self.assertIsNone(res)
        self.assertTrue(same_names(df.index, renamed_names))

    # Tests 3 and 4 - ``inplace=False``: a renamed object is returned and the frame is untouched
    original_names = ["A", "B", "C"]
    for mapping, renamed_names in scenarios:
        df = fresh_cero()
        res = CERO.rename_index_values(df, mapping, inplace=False)
        self.assertTrue(same_names(res.index, renamed_names))
        self.assertTrue(same_names(df.index, original_names))
def run(self, cero) -> 'CERO':
    """
    Executes all data import/export operations (defined by ``input_conf`` and ``output_conf`` respectively)
    and the execution of any commands.

    Every item of ``self["cmds"]`` is either a string (run as a shell command) or a ``dict``. A ``dict``
    must provide ``"args"`` and may provide ``"type"`` (``"shell"``, the default, or ``"python_method"``),
    ``"wd"`` (directory to run in, default ``self["wd"]``) and ``"env_vars"``. For shell commands the
    remaining items are passed to ``subprocess.check_output``; for python methods ``"func"`` names a function
    of ``modfuncs`` which is called with ``"args"`` and (optional) ``"kwargs"``.

    :param pandas.DataFrame cero: A CERO that contains all necessary data for conversion to input files (for
        model execution).
    :return pandas.DataFrame: A CERO of relevant output data ('relevant' is defined by ``output_conf``). An
        empty CERO is returned if ``output_conf`` is empty.
    :raises ValueError: If a command lacks ``"args"``, a python method command lacks ``"func"``, or the
        command type is unsupported.
    :raises TypeError: If a command is neither a ``str`` nor a ``dict``.
    :raises subprocess.CalledProcessError: If a shell command returns a non-zero exit status.
    """
    import warnings  # local import - only needed for the duplicate-export warning below

    for input_conf in self["input_conf"]:
        input_conf.exec_procedures(cero)

    print(
        "Completed converting CERO to model input files (%s). Now processing commands..."
        % self["name"])

    with _modified_environ(wd=self["wd"], **self.get("env_vars", {})):

        for cmdobj in self["cmds"]:

            # Default command. NOTE(review): commands are run through the shell (``shell=True``), so the
            # configuration file must come from a trusted source.
            cmd = {"type": "shell", "shell": True}

            if isinstance(cmdobj, str):
                # A plain string is interpreted as shell command by default
                cmd.update({"args": cmdobj})
            elif isinstance(cmdobj, dict):
                cmd.update(cmdobj)  # Add user updates
                if "args" not in cmd:
                    raise ValueError(
                        "'args' must be provided for command of type 'dict'."
                    )
            else:
                raise TypeError(
                    "Invalid command object in configuration file.")

            # Change to command-specific directory
            cmd_run_dir = cmd.pop("wd", self["wd"])
            if cmd_run_dir:
                cmd_run_dir = os.path.abspath(cmd_run_dir)

            cmd_type = cmd.pop("type")

            # Execute commands
            msg = "In directory '%s', executing command '%s'." % (
                cmd_run_dir, cmd)
            Model._logger.info(msg)

            with _modified_environ(wd=cmd_run_dir, **cmd.pop("env_vars", {})):

                # Depending on cmd_type, execute command in different ways...
                if cmd_type in ["shell"]:
                    args = cmd.pop("args")
                    Model._logger.info(
                        "Executing shell command: %s, with keyword args: %s."
                        % (args, cmd))
                    try:
                        cmd["output"] = subprocess.check_output(
                            args=args,
                            stderr=subprocess.STDOUT,
                            universal_newlines=True,
                            **cmd)
                    except subprocess.CalledProcessError as e:
                        msg = (
                            "Command '%s' failed with returncode: %s, and message:\n"
                            + "%s\n" +
                            "Program logs may have more information.") % (
                                args, e.returncode, e.output)
                        Model._logger.error(msg)
                        print(msg)
                        raise
                    Model._logger.info(cmd["output"])
                    print("Command returned: \n%s" % cmd["output"], end="")

                elif cmd_type in ["python_method"]:
                    # Explicit check (rather than ``assert``) so that it also applies under ``python -O``.
                    if "func" not in cmd:
                        raise ValueError(
                            "'func' must be defined for commands of type 'python_method'."
                        )
                    func = getattr(modfuncs, cmd.pop("func"))
                    # ``kwargs`` is optional - default to no keyword arguments.
                    cmd["output"] = func(*cmd["args"], **cmd.get("kwargs", {}))

                else:
                    raise ValueError("Unsupported command type specified.")

    if not self["output_conf"]:
        return CERO.create_empty()

    ceros = [oc.create_cero() for oc in self["output_conf"]]

    try:
        cero = CERO.combine_ceros(ceros, overwrite=False)
    except CERO.CEROIndexConflict:
        # Issue a warning (instead of raising the warning class as an error) so that, as the message
        # states, the combination below is actually performed.
        warnings.warn(
            "Attempts to duplicate the export of data - i.e. one or more data series are being " +
            "exported more than once (which should be avoided). The last procedure will define " +
            "the intended data.", RuntimeWarning)
        cero = CERO.combine_ceros(ceros)

    return cero
def wrapper(df: pd.DataFrame, *args, locs: "List[Union[tuple, str]]" = None, ilocs: "List[int]" = None, start_year: "Union[pd.datetime, int]" = None, end_year: "Union[pd.datetime, int]" = None, **kwargs):
    """
    Restrict ``df`` to the requested rows and years, call the encapsulated function on that restriction and
    validate its return value.

    :param df: A CERO, which may or may not be a strict superset of data to perform the operation on.
    :param args: Passed to the encapsulated function as positional arguments, immediately after the restricted
        ``df``.
    :param locs: ``locs``, if provided, must be a list of identifiers that correspond to values of ``df.index``.
        It is ``df``, reduced to these specific indices, that a wrapped function will receive as an argument. An
        error is raised if both ``locs`` and ``ilocs`` is specified.
    :param ilocs: Identical in nature to ``locs``, though instead a list of integers (zero-indexed) is
        provided (corresponding to the row number of ``df``). An
        error is raised if both ``locs`` and ``ilocs`` is specified.
    :param start_year: Note that ``df`` is a CERO, and CEROs have a ``pandas.DatetimeIndex`` on columns.
        ``start_year`` restricts the CERO to years after and including ``start_year``.
    :param end_year: Note that ``df`` is a CERO, and CEROs have a ``pandas.DatetimeIndex`` on columns.
        ``end_year`` restricts the CERO to years up to and including ``end_year``.
    :param kwargs: Keyword arguments to be passed to the encapsulated function.
    :return: The return value of the encapsulated function (a returned ``pandas.Series`` is converted to a
        single-row ``pandas.DataFrame``).
    :raises TypeError: If ``df`` is not a ``pandas.DataFrame``, or if both ``locs`` and ``ilocs`` are given.
    """
    # Explicit check (rather than ``assert``) so that it also applies under ``python -O``.
    if not isinstance(df, pd.DataFrame):
        raise TypeError("First function argument must be of pandas.DataFrame type.")

    # Convert integer to datetime type. ``pd.Timestamp`` is used because the ``pd.datetime`` alias is not
    # available in recent pandas releases.
    if isinstance(start_year, int):
        start_year = pd.Timestamp(year=start_year, month=1, day=1)
    if isinstance(end_year, int):
        end_year = pd.Timestamp(year=end_year, month=1, day=1)

    # Get column positions
    start_col = None
    if start_year is not None:
        start_col = df.columns.get_loc(start_year)
    end_col = None
    if end_year is not None:
        # Positional slices exclude their end point, so step one past the column of ``end_year`` to keep
        # that year in the selection (as documented).
        end_col = df.columns.get_loc(end_year) + 1

    if (locs is not None) and (ilocs is not None):
        raise TypeError("Only one of 'locs' or 'ilocs' can be provided (not both).")

    if locs is not None:
        ilocs = [df.index.get_loc(loc) for loc in locs]

    if ilocs is None:
        # No row restriction - select every row
        ilocs = pd.IndexSlice[0:]

    # df_cp is always different object to df
    df_cp = df.iloc[ilocs, start_col:end_col].copy(deep=False)

    ret = func(df_cp, *args, **kwargs)
    if ret is None:
        return ret

    if isinstance(ret, pd.Series):
        # If series, convert to dataframe
        ret = pd.DataFrame(data=[ret])

    CERO.is_cero(ret)  # Performs checks to ensure ret is a valid CERO
    return ret
def test_idxconflict(self):
    """Combining a CERO with itself without overwriting must raise ``CERO.CEROIndexConflict``."""
    cero = DataTools.get_test_data(TestCERO._dd + "test_cero.pickle")
    duplicated = [cero, cero]

    with self.assertRaises(CERO.CEROIndexConflict):
        CERO.combine_ceros(duplicated, overwrite=False, verify_cero=True)
def test_multiindex_xlsx(self):
    """The CERO built from ``test_multiindex_xlsx.yaml`` must validate with ``CERO.is_cero``."""
    conf_path = TestToCERO._dd + r'test_multiindex_xlsx.yaml'
    cero = ToCERO(conf=conf_path).create_cero()
    self.assertTrue(CERO.is_cero(cero))