def groupby(df: "CERO", *args, key: "Union[int, list[int]]" = None, match: str = None, agg: str = None, **kwargs):
    """Aggregate groups of rows of ``df`` (a CERO) in place.

    Rows are grouped by the identifier fields selected by ``key``; each group is
    collapsed with the aggregation function ``agg`` and the result is written
    into the first row of the group, which is then renamed to only the ``key``
    fields. Note that ``df`` is mutated in place (and also returned).

    :param df: the CERO (a pandas DataFrame with tuple-style index) to operate on.
    :param key: an int, or list of ints, selecting the index field(s) to group by.
        Mandatory - a ``TypeError`` is raised if not provided.
    :param match: if given, restricts the operation to the single group whose
        ``key`` fields equal ``match``; otherwise every unique combination of
        ``key`` fields found in ``df.index`` forms a group.
    :param agg: name of the aggregation function passed to ``DataFrame.agg``.
    :param kwargs: forwarded to ``DataFrame.groupby`` (overriding the defaults
        ``axis=0, sort=False, group_keys=False``).
    :return: ``df`` (mutated in place).
    :raises TypeError: if ``key`` is ``None`` or contains non-int values.
    """
    if key is None:
        raise TypeError(
            "'key' must be provided to 'groupby' function as either an int or list of ints."
        )
    elif not issubclass(type(key), list):
        # Normalise a single int to a one-element list.
        key = [key]
    if not all([issubclass(type(k), int) for k in key]):
        raise TypeError(
            "'key' must be provided to 'groupby' function as either an int or list of ints."
        )

    defaults = {"axis": 0, "sort": False, "group_keys": False}
    defaults.update(kwargs)

    match = _Identifier.tupleize_name(match)
    m_ids = [match]
    if match is None:
        # No explicit match given - group over every unique combination of the
        # 'key' fields present in the index.
        m_ids = _Identifier.unique_id_fields(df.index.values, key=key)

    # Strings are expanded to tuples of characters so that positional indexing
    # (m[idx]) below behaves uniformly for string and tuple identifiers.
    conv = lambda x: tuple(x) if issubclass(type(x), str) else x
    m_ids = [conv(m) for m in m_ids]

    rename_dict = {}
    for m in m_ids:

        # Create func that identifies rows for grouping. Note: f closes over
        # the loop variable m, but it is only called within this iteration
        # (by groupby and by next() below), so late binding is safe here.
        def f(x):
            return all([x[k] == m[idx] for idx, k in enumerate(key)])

        # Groupby and apply aggregation function
        agg_df = df.groupby(by=f, **defaults).agg(agg)

        # Put aggregated calculation in first row that meets the condition.
        # agg_df is indexed by the boolean returned by f, hence .loc[True].
        row_loc = next(x for x in df.index.values if f(x))
        df.iloc[df.index.get_loc(row_loc)] = agg_df.loc[True]

        # Rename row (deferred to a single rename at the end).
        rename_dict.update(
            {row_loc: _Identifier.keep_only_fields(key, row_loc)[0]})

    CERO.rename_index_values(df, rename_dict, inplace=True)
    return df
def _set_inputs(self, cero: pd.DataFrame):
    """Copies each data series in ``cero`` indexed by the items in ``inp_list`` to an ``OrderedDict``. This \
    ensures that ``operations`` do not alter ``cero``.

    :param cero: the source CERO; it is never mutated (data is deep-copied).
    :raises KeyError: if any requested input identifier is absent from ``cero``.
    """

    if self["inputs"] == []:
        # Input is entire CERO unless otherwise specified
        self["inputs"] = cero.index.tolist()

    # Check values in dataframe - check is necessary because KeyError is not
    # thrown if some values are in index (pandas version 0.22). Validating up
    # front (instead of relying on try/except around .iloc) guarantees the
    # informative error is raised even when pandas silently tolerates a
    # partial match.
    invalid_inputs = [i for i in self["inputs"] if i not in cero.index]
    if invalid_inputs:
        msg = ("Inputs %s do not exist. The most likely reason is that the configuration file is " +
               "incorrectly specified, or lacks specification. If debugging level has been set to " +
               "'DEBUG', then the input list is in the log file - note that this list may be " +
               "extraordinarily long. Common causes of this problem include: \n" +
               " 1. Set definition in configuration file includes elements that do not exist in the CERO.\n" +
               " 2. Mis-spellings of identifiers in the configuration file (which includes names of sets for 'inputs' or 'arrays').\n" +
               " 3. Incorrect ordering of sets in the identifier."
               ) % invalid_inputs
        FromCERO._logger.error(msg)
        raise KeyError(msg)

    # Reduce data frame to necessary data and copy
    self.inputs = copy.deepcopy(cero.iloc[[cero.index.get_loc(loc) for loc in self["inputs"]]])
    assert isinstance(self.inputs, pd.DataFrame)

    # Apply any configured identifier-field mappings ("map" operations) to the
    # index of the copied inputs.
    map_dict = {}
    for map_op in self.get("map", []):
        idx = map_op.get("idx")
        orig_s = self["sets"][map_op["orig"]]
        ns = self["sets"][map_op["new"]]

        for val in self.inputs.index.values:
            new_val = val
            # The isinstance check comes *before* val[idx] - indexing a plain
            # string identifier at idx could otherwise raise IndexError (and a
            # matching character would be meaningless anyway).
            if idx is not None and not isinstance(val, str) and (val[idx] in orig_s):
                # Replace only the idx-th field of the tuple identifier.
                new_val = val[:idx] + (ns[orig_s.index(val[idx])],) + val[idx + 1:]
            elif val in orig_s:
                # Whole-identifier replacement.
                new_val = ns[orig_s.index(val)]
            map_dict.update({val: new_val})

    CERO.rename_index_values(self.inputs, map_dict, inplace=True)
def exec_procedures(self, cero):
    """ Execute all the procedures of the FromCERO object .

    :param pandas.DataFrame cero: A CERO to serve as input for the procedures. The argument is not mutated/modified.
    :raises ValueError: if the output file extension is unsupported for general export.
    """

    CERO.is_cero(cero, raise_exception=True, empty_ok=False)

    CERO.rename_index_values(cero, self.get("map", {}))

    self.output_procedures = OrderedDict()

    for procedure in self["procedures"]:
        try:
            # if ret is not None, should be dict with key: procedure["name"], value: resultant CERO
            ret = procedure.exec_ops(cero)
        except Exception as e:
            # Re-raise with the procedure name appended; 'from e' preserves the
            # original traceback for debugging (the original code dropped it).
            raise e.__class__(e.__str__() + " Error in procedure '%s'." % (procedure["name"])) from e

        if ret is None:
            ret = {}

        self.output_procedures.update(ret)

    # The original used a for...else here; with no 'break' in the loop the
    # 'else' always executes, so plain post-loop code is equivalent and clearer.
    if not self["procedures"]:  # If empty list
        self.output_procedures["default_output"] = cero

    if any([not procedure.get("file") for procedure in self["procedures"]]):
        msg = "It has been detected that not all procedures direct output to file. Therefore some output will go to \'%s\'." % self["file"]
        print(msg)
        FromCERO._logger.info(msg)

    if self.output_procedures != {}:
        file_ext = os.path.splitext(self["file"])[1][1:]
        if file_ext in FromCERO.sup_output_types:
            out_df = CERO.combine_ceros(list(self.output_procedures.values()))
            FromCERO.dataframe_out(out_df, self["file"], output_type=file_ext)
        elif file_ext in FromCERO._Procedure.sup_output_types:
            raise ValueError("This data type is not supported for general export, because it probably has a more than 2 dimensions - export using 'procedures' instead.")
        else:
            raise ValueError("Unsupported data type detected for general export.")
def _exec_op(self, op: dict):
    """Apply a single operation ``op`` to this procedure's inputs.

    ``op`` is a configuration dict; recognised keys ('func', 'args', 'rename',
    'arrays') are popped off before the remainder is forwarded as keyword
    arguments to the resolved function. Note that ``op`` is therefore mutated
    (and 'func' is written back afterwards).

    :param op: operation specification dict (mutated in place).
    :return: the operation's resultant CERO (possibly restricted/renamed), or
        ``None`` if the function returned nothing and no rename was requested.
    :raises AttributeError: if 'func' names a function not found in any of the
        modules listed in ``self["libfuncs"]``.
    """
    # Apply operation to procedure
    func_name = op.pop('func', "noop")  # Perform noop (no-operation) if no func provided.
    op_args = op.pop('args', [])
    rename = op.pop("rename", None)

    arrays = None
    if "arrays" in op:
        arrays = op.pop("arrays")
        if issubclass(type(arrays), str):
            # Normalise a single identifier string to a one-element list.
            arrays = [arrays]
        arrays = _Identifier.get_all_idents(arrays, sets=self["sets"])

    # Resolve func_name against the configured library modules, first match wins.
    for mod in self["libfuncs"]:
        if hasattr(mod, func_name):
            func = getattr(mod, func_name)
            break
    else:
        # for...else: no module provided the function.
        msg = ('Invalid function name provided - \'%s\'. Function does not exist in any of the modules %s. It may be necessary to create a python module with the necessary functions and provide this file with the \'libfuncs\' option.' % (func_name, self["libfuncs"]))
        FromCERO._logger.error(msg)
        raise AttributeError(msg)

    FromCERO._logger.debug("Function call: %s(*arrays, **op)" % func.__name__)

    ret = func(self.inputs, *op_args, locs=arrays, **op)

    op['func'] = func.__name__  # For cleanliness of presentation

    if rename is not None:
        if ret is None:
            # The function produced no result; fall back to a no-op pass over
            # the inputs so there is something to rename.
            ret = getattr(libfuncs, "noop")(self.inputs, *op_args, locs=arrays, **op)

        if isinstance(rename, str):
            rename = {ret.index.tolist()[0]: rename}  # Rename the first index by default

        if issubclass(type(rename), list):
            # Build mapping dictionary
            rename = _Identifier.get_mapping_dict(ret.index.tolist(), rename, sets=self.get("sets"))
        elif issubclass(type(rename), dict):
            # NOTE: a str rename (converted to a dict above) also flows through
            # this branch.
            rename = _Identifier.get_one_to_one_mapping(rename, sets=self.get("sets"))

        # At this point, rename should be one-to-one mapping dict
        renamed = CERO.rename_index_values(ret.loc[list(rename.keys())], rename, inplace=False)
        ret = renamed.loc[list(rename.values())]  # Restrict renamed to only the rows that have been specified
        # Note that ret will be restricted to only those values that have been renamed.

    return ret
def test_name_map(self):
    """Exercise ``CERO.rename_index_values`` for full/partial mappings, both
    in-place (returns None, mutates df) and with ``inplace=False`` (returns a
    renamed copy, leaves df untouched)."""

    def init_df():
        # Build a minimal valid CERO: string index, DatetimeIndex columns,
        # float32 values.
        df = pd.DataFrame.from_dict(
            {"A": [1, 2, 3], "B": [3, 4, 5], "C": [6, 7, 8]},
            orient="index")
        df.columns = pd.DatetimeIndex(
            pd.to_datetime([2017, 2018, 2019], format="%Y"))
        df.sort_index(inplace=True)
        # 'pd.np' was removed in pandas 1.0 - the dtype string is equivalent
        # and works on every pandas version.
        df = df.astype("float32")
        self.assertTrue(CERO.is_cero(df))
        return df

    # Test 1: full mapping, in place - returns None and mutates df.
    df = init_df()
    mapping = dict([("A", "D"), ("B", "E"), ("C", "F")])
    res = CERO.rename_index_values(df, mapping)
    test_names = ["D", "E", "F"]
    self.assertIsNone(res)
    self.assertTrue(
        all([x == y for (x, y) in zip(df.index.tolist(), test_names)]))

    # Test 2: partial mapping, in place - unmapped rows keep their names.
    df = init_df()
    mapping = dict([("B", "E"), ("C", "F")])
    res = CERO.rename_index_values(df, mapping)
    test_names = ["A", "E", "F"]
    self.assertIsNone(res)
    self.assertTrue(
        all([x == y for (x, y) in zip(df.index.tolist(), test_names)]))

    # Test 3: full mapping, inplace=False - original df unchanged.
    df = init_df()
    mapping = dict([("A", "D"), ("B", "E"), ("C", "F")])
    res = CERO.rename_index_values(df, mapping, inplace=False)
    test_names = ["D", "E", "F"]
    test_names_df = ["A", "B", "C"]
    self.assertTrue(
        all([x == y for (x, y) in zip(res.index.tolist(), test_names)]))
    self.assertTrue(
        all([x == y for (x, y) in zip(df.index.tolist(), test_names_df)]))

    # Test 4: partial mapping, inplace=False - original df unchanged.
    df = init_df()
    mapping = dict([("B", "E"), ("C", "F")])
    res = CERO.rename_index_values(df, mapping, inplace=False)
    test_names = ["A", "E", "F"]
    test_names_df = ["A", "B", "C"]
    self.assertTrue(
        all([x == y for (x, y) in zip(res.index.tolist(), test_names)]))
    self.assertTrue(
        all([x == y for (x, y) in zip(df.index.tolist(), test_names_df)]))