Example #1
def groupby(df: "CERO",
            *args,
            key: "Union[int, list[int]]" = None,
            match: str = None,
            agg: str = None,
            **kwargs):

    if key is None:
        raise TypeError(
            "'key' must be provided to 'groupby' function as either an int or list of ints."
        )
    elif not issubclass(type(key), list):
        key = [key]

    if not all([issubclass(type(k), int) for k in key]):
        raise TypeError(
            "'key' must be provided to 'groupby' function as either an int or list of ints."
        )

    defaults = {"axis": 0, "sort": False, "group_keys": False}
    defaults.update(kwargs)

    match = _Identifier.tupleize_name(match)
    m_ids = [match]
    if match is None:
        m_ids = _Identifier.unique_id_fields(df.index.values, key=key)

    # Wrap a bare string identifier in a 1-tuple so that per-field indexing (m[idx]) works below
    conv = lambda x: (x,) if issubclass(type(x), str) else x
    m_ids = [conv(m) for m in m_ids]

    rename_dict = {}
    for m in m_ids:

        # Create func that identifies rows for grouping
        def f(x):
            return all([x[k] == m[idx] for idx, k in enumerate(key)])

        # Groupby and apply aggregation function
        agg_df = df.groupby(by=f, **defaults).agg(agg)

        # Put aggregated calculation in first row that meets the condition
        row_loc = next(x for x in df.index.values if f(x))
        df.iloc[df.index.get_loc(row_loc)] = agg_df.loc[True]

        # Rename row
        rename_dict.update(
            {row_loc: _Identifier.keep_only_fields(key, row_loc)[0]})

    CERO.rename_index_values(df, rename_dict, inplace=True)
    return df
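
The row-selection mechanism used by ``groupby`` above can be illustrated with plain pandas. This is only a sketch with hypothetical identifiers and data, not part of the library:

import pandas as pd

# Hypothetical CERO-like frame: tuple identifiers in the index, years as columns.
df = pd.DataFrame(
    [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
    index=[("elec", "NSW"), ("elec", "VIC"), ("gas", "NSW")],
    columns=pd.DatetimeIndex(pd.to_datetime(["2017", "2018"], format="%Y")),
)

key = [0]        # group on the first identifier field
m = ("elec",)    # match tuple identifying one group

# Same selector as f(x) in groupby() above: True for rows belonging to the group.
f = lambda x: all(x[k] == m[idx] for idx, k in enumerate(key))

agg_df = df.groupby(by=f, sort=False, group_keys=False).agg("sum")
print(agg_df.loc[True])  # aggregated values of the ("elec", ...) rows
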
Example #2
        def _set_inputs(self, cero: pd.DataFrame):
            """Copies each data series in ``cero`` indexed by the items in ``inp_list`` to an ``OrderedDict``. This \
                    ensures that ``operations`` do not alter ``cero``.
                    """
            if self["inputs"] == []:
                # Input is entire CERO unless otherwise specified
                self["inputs"] = cero.index.tolist()

            # Check the values explicitly - a KeyError is not necessarily thrown if only some of the
            # requested values are missing from the index (pandas version 0.22).
            invalid_inputs = [i for i in self["inputs"] if i not in cero.index]
            if invalid_inputs:
                msg = ("Inputs %s do not exist. The most likely reason is that the configuration file is "
                       "incorrectly specified, or lacks specification. If the debugging level has been set to "
                       "'DEBUG', then the input list is in the log file - note that this list may be "
                       "extraordinarily long. Common causes of this problem include:\n"
                       " 1. A set definition in the configuration file includes elements that do not exist in the CERO.\n"
                       " 2. Mis-spelled identifiers in the configuration file (including names of sets for 'inputs' or 'arrays').\n"
                       " 3. Incorrect ordering of sets in the identifier."
                       ) % invalid_inputs
                FromCERO._logger.error(msg)
                raise KeyError(msg)

            # Reduce the data frame to the necessary data and copy, so that operations cannot alter ``cero``
            self.inputs = copy.deepcopy(cero.iloc[[cero.index.get_loc(loc) for loc in self["inputs"]]])

            assert (isinstance(self.inputs, pd.DataFrame))

            map_dict = {}
            for map_op in self.get("map", []):

                idx = map_op.get("idx")
                orig_s = self["sets"][map_op["orig"]]
                ns = self["sets"][map_op["new"]]

                for val in self.inputs.index.values:

                    new_val = val

                    if idx is not None and not isinstance(val, str) and val[idx] in orig_s:
                        new_val = val[:idx] + (ns[orig_s.index(val[idx])],) + val[idx + 1:]
                    elif val in orig_s:
                        new_val = ns[orig_s.index(val)]

                    map_dict.update({val: new_val})
                CERO.rename_index_values(self.inputs, map_dict, inplace=True)
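
The field-substitution step inside the mapping loop can be shown in isolation. A minimal sketch with hypothetical set names and identifiers:

# Hypothetical sets: an "orig" element is replaced by the "new" element at the same position.
orig_s = ["NSW", "VIC"]
ns = ["NewSouthWales", "Victoria"]
idx = 1  # the field of the tuple identifier to map

val = ("elec", "NSW")
new_val = val[:idx] + (ns[orig_s.index(val[idx])],) + val[idx + 1:]
print(new_val)  # ("elec", "NewSouthWales")

# A plain-string identifier is mapped as a whole (the elif branch above).
val = "NSW"
if val in orig_s:
    print(ns[orig_s.index(val)])  # "NewSouthWales"
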
Example #3
    def exec_procedures(self, cero):
        """ Execute all the procedures of the FromCERO object
        .
        :param pandas.DataFrame cero: A CERO to serve as input for the procedures. The argument is not mutated/modified.
        """

        CERO.is_cero(cero, raise_exception=True, empty_ok=False)

        cero = CERO.rename_index_values(cero, self.get("map", {}), inplace=False)  # rename on a copy so the caller's cero is not modified

        self.output_procedures = OrderedDict()

        for procedure in self["procedures"]:

            try:
                ret = procedure.exec_ops(cero)
                # if ret is not None, should be dict with key: procedure["name"], value: resultant CERO
            except Exception as e:
                raise e.__class__(e.__str__() + " Error in procedure '%s'." % (procedure["name"]))

            if ret is None:
                ret = {}

            self.output_procedures.update(ret)
        else:
            if not self["procedures"]: # If empty list
                self.output_procedures["default_output"] = cero

        if any([not procedure.get("file") for procedure in self["procedures"]]):
            msg = "It has been detected that not all procedures direct output to file. Therefore some output will go to \'%s\'." % self["file"]
            print(msg)
            FromCERO._logger.info(msg)

        if self.output_procedures != {}:
            file_ext = os.path.splitext(self["file"])[1][1:]
            if file_ext in FromCERO.sup_output_types:
                out_df = CERO.combine_ceros(list(self.output_procedures.values()))
                FromCERO.dataframe_out(out_df, self["file"], output_type=file_ext)
            elif file_ext in FromCERO._Procedure.sup_output_types:
                raise ValueError("This data type is not supported for general export, because it probably has a more than 2 dimensions - export using 'procedures' instead.")
            else:
                raise ValueError("Unsupported data type detected for general export.")
Example #4
        def _exec_op(self, op: dict):

            # Apply operation to procedure
            func_name = op.pop('func', "noop") # Perform noop (no-operation) if no func provided.
            op_args = op.pop('args', [])
            rename = op.pop("rename", None)

            arrays = None
            if "arrays" in op:
                arrays = op.pop("arrays")
                if issubclass(type(arrays), str):
                    arrays = [arrays]
                arrays = _Identifier.get_all_idents(arrays, sets=self["sets"])

            for mod in self["libfuncs"]:
                if hasattr(mod, func_name):
                    func = getattr(mod, func_name)
                    break
            else:
                msg = ('Invalid function name provided - \'%s\'. Function does not exist in any of the modules %s. It may be necessary to create a python module with the necessary functions and provide this file with the \'libfuncs\' option.' %
                            (func_name, self["libfuncs"]))
                FromCERO._logger.error(msg)
                raise AttributeError(msg)

            FromCERO._logger.debug("Function call: %s(*arrays, **op)" % func.__name__)

            ret = func(self.inputs, *op_args, locs=arrays, **op)
            op['func'] = func.__name__  # For cleanliness of presentation

            if rename is not None:

                if ret is None:
                    ret = getattr(libfuncs, "noop")(self.inputs, *op_args, locs=arrays, **op)

                if isinstance(rename, str):
                    rename = {ret.index.tolist()[0]: rename} # Rename the first index by default

                if issubclass(type(rename), list):
                    # Build mapping dictionary
                    rename = _Identifier.get_mapping_dict(ret.index.tolist(), rename, sets=self.get("sets"))
                elif issubclass(type(rename), dict):
                    rename = _Identifier.get_one_to_one_mapping(rename, sets=self.get("sets"))

                # At this point, rename should be one-to-one mapping dict

                renamed = CERO.rename_index_values(ret.loc[list(rename.keys())], rename, inplace=False)
                ret = renamed.loc[list(rename.values())]  # Restrict renamed to only the rows that have been specified

                # Note that ret will be restricted to only those values that have been renamed.

            return ret
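
The function lookup at the top of ``_exec_op`` scans a list of modules for the named function, using a for/else to report failure. A standalone sketch of the same pattern with hypothetical modules in place of self["libfuncs"]:

import math
import statistics

libfuncs = [math, statistics]  # hypothetical stand-in for self["libfuncs"]
func_name = "mean"

for mod in libfuncs:
    if hasattr(mod, func_name):
        func = getattr(mod, func_name)
        break
else:
    raise AttributeError("Invalid function name provided - '%s'." % func_name)

print(func([1, 2, 3]))  # statistics.mean -> 2
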
Example #5
    def test_name_map(self):
        def init_df():
            df = pd.DataFrame.from_dict(
                {
                    "A": [1, 2, 3],
                    "B": [3, 4, 5],
                    "C": [6, 7, 8]
                },
                orient="index")
            df.columns = pd.DatetimeIndex(
                pd.to_datetime([2017, 2018, 2019], format="%Y"))
            df.sort_index(inplace=True)
            df = df.astype(np.float32)  # pd.np was removed in pandas >= 1.0; assumes numpy is imported as np
            self.assertTrue(CERO.is_cero(df))
            return df

        df = init_df()
        mapping = dict([("A", "D"), ("B", "E"), ("C", "F")])

        res = CERO.rename_index_values(df, mapping)

        test_names = ["D", "E", "F"]

        self.assertIsNone(res)
        self.assertTrue(
            all([x == y for (x, y) in zip(df.index.tolist(), test_names)]))

        # Test 2

        df = init_df()
        mapping = dict([("B", "E"), ("C", "F")])

        res = CERO.rename_index_values(df, mapping)

        test_names = ["A", "E", "F"]

        self.assertIsNone(res)
        self.assertTrue(
            all([x == y for (x, y) in zip(df.index.tolist(), test_names)]))

        # Test 3

        df = init_df()
        mapping = dict([("A", "D"), ("B", "E"), ("C", "F")])

        res = CERO.rename_index_values(df, mapping, inplace=False)

        test_names = ["D", "E", "F"]
        test_names_df = ["A", "B", "C"]

        self.assertTrue(
            all([x == y for (x, y) in zip(res.index.tolist(), test_names)]))
        self.assertTrue(
            all([x == y for (x, y) in zip(df.index.tolist(), test_names_df)]))

        # Test 4

        df = init_df()
        mapping = dict([("B", "E"), ("C", "F")])

        res = CERO.rename_index_values(df, mapping, inplace=False)

        test_names = ["A", "E", "F"]
        test_names_df = ["A", "B", "C"]

        self.assertTrue(
            all([x == y for (x, y) in zip(res.index.tolist(), test_names)]))
        self.assertTrue(
            all([x == y for (x, y) in zip(df.index.tolist(), test_names_df)]))
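
For comparison, plain pandas renaming matches the expectations in Tests 2-4 above: labels missing from the mapping are left unchanged, and a non-inplace rename returns a new frame while leaving the original untouched. A minimal sketch (not using CERO):

import pandas as pd

df = pd.DataFrame({"2017": [1.0, 3.0, 6.0]}, index=["A", "B", "C"])
renamed = df.rename(index={"B": "E", "C": "F"})

print(renamed.index.tolist())  # ['A', 'E', 'F'] - "A" is not in the mapping, so it is unchanged
print(df.index.tolist())       # ['A', 'B', 'C'] - the original frame is untouched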