# Example 1
def _check_shared_axes_argument(shared_axes, datacols, indexcols):
    for key, shared in shared_axes.items():
        # must be dictionary
        if not isinstance(shared, dict):
            raise PuffbirdError("All shared axes arguments must be "
                                "dictionaries, but the value for key "
                                f"'{key}' is of type '{type(shared)}'")
        # keys must be in columns
        not_in_columns = set(shared) - set(datacols)
        if not_in_columns:
            raise PuffbirdError("All keys of the dictionary of a shared "
                                "axes argument must be present in the "
                                f"'datacols' {datacols}; these keys "
                                f"are not in columns: '{not_in_columns}'.")
        # keys must be unique
        key_is_unique = (all(
            _col_no_match(datacol, key) for datacol in datacols)
                         and key not in indexcols)
        if not key_is_unique:
            raise PuffbirdError(f"The keyword '{key}' is not unique: "
                                "It must not exist in in the 'datacols' "
                                f"{datacols} or the 'indexcols' "
                                f"{indexcols}, and it cannot start "
                                "the same way as any 'datacolumn' in "
                                "the dataframe.")
# Example 2
    def drop(self,
             *cols: str,
             skip: bool = False,
             skip_index: bool = False,
             skip_data: bool = False):
        """
        Drop columns in place.

        Parameters
        ----------
        cols : str
            Columns to drop.
        skip : bool
            If True, skip values in `cols` that do not match with
            any columns. Defaults to False.
        skip_index : bool
            If True, skip dropping *"index columns"*. Defaults to False.
        skip_data : bool
            If True, skip dropping *"data columns"*. Defaults to False.

        Returns
        -------
        self
            The same object, so that calls can be chained.

        Raises
        ------
        PuffbirdError
            If a column is not found and `skip` is False, or if dropping
            index columns leaves a non-unique index.
        """
        # substitute renamed columns
        columns = set(self._substitute_cols(cols))
        not_found = columns - (set(self.datacols) | set(self.indexcols))
        if not_found and not skip:
            raise PuffbirdError(f"Columns '{not_found}' are not in "
                                "'data columns' or 'index columns'.")
        datacols = columns & set(self.datacols)
        if datacols and not skip_data:
            self.table.drop(columns=datacols, inplace=True)
            # reassign renamed columns: forget renames of dropped columns
            self._datacols_rename = {
                original_value: renamed_value
                for original_value, renamed_value in
                self.datacols_rename.items()
                if renamed_value not in datacols
            }
        indexcols = columns & set(self.indexcols)
        if indexcols and not skip_index:
            # rebuild the index from a frame without the dropped levels
            index = self.table.index.to_frame()
            index.drop(columns=indexcols, inplace=True)
            index = pd.MultiIndex.from_frame(index)
            if not index.is_unique:
                raise PuffbirdError(f"Dropping index columns '{indexcols}' "
                                    "results in non-unique indices.")
            self.table.index = index
            # reassign renamed columns: forget renames of dropped columns
            self._indexcols_rename = {
                original_value: renamed_value
                for original_value, renamed_value in
                self.indexcols_rename.items()
                if renamed_value not in indexcols
            }
        return self
# Example 3
def _process_table_when_series(table):
    if table.name is None:
        if "data_column" in table.index.names:
            raise PuffbirdError("When table is a pandas.Series "
                                "object, the index names cannot "
                                "contain the name 'data_column'.")
        return table.to_frame(name="data_column")
    return table.to_frame()
# Example 4
def _check_table_column_types(table, enforce_identifier_string):
    # columns and index names must be identifier string types
    for datacol in table.columns:
        if not isinstance(datacol, str):
            raise PuffbirdError(f"Datacolumn '{datacol}' is not a "
                                f"string type: {type(datacol)}")
        if not datacol.isidentifier() and enforce_identifier_string:
            raise PuffbirdError(f"Datacolumn '{datacol}' is not a "
                                "identifier string type.")
    if len(set(table.columns)) != len(table.columns):
        raise PuffbirdError(f"Datacols '{tuple(table.columns)}' "
                            "are not unique.")
    for indexcol in table.index.names:
        if not isinstance(indexcol, str):
            raise PuffbirdError(f"Indexcolumn '{indexcol}' is not a "
                                f"string type: {type(indexcol)}")
        if not indexcol.isidentifier() and enforce_identifier_string:
            raise PuffbirdError(f"Indexcolumn '{indexcol}' is not a"
                                "identifier string type.")
        for datacol in table.columns:
            if not _col_no_match(datacol, indexcol):
                raise PuffbirdError(f"Indexcolumn '{indexcol}' matches "
                                    f"datacol '{datacol}': Indexcol "
                                    "cannot start the same way as "
    if len(set(table.index.names)) != len(table.index.names):
        raise PuffbirdError(f"Indexcols '{tuple(table.index.names)}' "
                            f"are not unique.")
# Example 5
    def apply(self,
              func: Callable,
              new_col_name: Optional[str],
              *args: str,
              assign_to_index: bool = False,
              map_kws: Optional[Dict[str, str]] = None,
              **kwargs):
        """
        Apply a function to each row in the `table`.

        Parameters
        ----------
        func : callable
            Function to apply. The function cannot return a
            :obj:`~pandas.Series` object.
        new_col_name : str
            Name of computed new column. If None, `new_col_name` will be
            ``"apply_result"``.
        args : tuple
            Arguments passed to function. Each argument should be an
            *"index column"* or *"data column"* in the `table`.
            Thus, the argument will correspond to the cell value for each row.
        assign_to_index : bool, optional
            Assign new column as *"index column"*,
            instead of as *"data column"*.
        map_kws : dict
            Same as args just as keyword arguments.
        kwargs: dict
            Keyword arguments passed to function as is.

        Returns
        -------
        self
            The same object, so that calls can be chained.

        Raises
        ------
        PuffbirdError
            If applying `func` produces a :obj:`~pandas.DataFrame`
            instead of a :obj:`~pandas.Series`.
        """
        map_kws = {} if map_kws is None else map_kws

        if new_col_name is None:
            new_col_name = "apply_result"

        # resolve renamed columns once instead of once per row
        arg_cols = [self._substitute_col(col) for col in args]
        kw_cols = {
            key: self._substitute_col(col)
            for key, col in map_kws.items()
        }

        def _call_on_row(row):
            return func(*(row[col] for col in arg_cols),
                        **{key: row[col] for key, col in kw_cols.items()},
                        **kwargs)

        # apply function row-wise; the index is reset so that index columns
        # can be looked up by name just like data columns.
        # NOTE(review): the trailing arguments of this call were missing
        # from the visible source. ``axis=1`` is required for row-wise
        # lookups; ``result_type="reduce"`` keeps the result a Series for
        # an empty table as well.
        series = self.table.reset_index().apply(_call_on_row,
                                                axis=1,
                                                result_type="reduce")
        if isinstance(series, pd.DataFrame):
            raise PuffbirdError("The function 'func' cannot return "
                                "a `pandas.Series` object.")
        # assign output
        self._assign_output_series(series, new_col_name, assign_to_index)
        return self
# Example 6
def _mapping_variable_converter(table, arg, default_arg, name):
    if isinstance(arg, dict):
        arg = arg.copy()
        default_arg = arg.pop("_default", default_arg)
        remaining = set(arg) - set(table.columns)
        if remaining:
            raise PuffbirdError(f"The '{name}' dictionary "
                                "contains keys that are not in "
                                f"the columns {tuple(table.columns)}: "
        default_arg = arg
        arg = {}
    return arg, default_arg
# Example 7
    def col_apply(self,
                  func: Callable,
                  col: str,
                  new_col_name: Optional[str] = None,
                  assign_to_index: Optional[bool] = None,
                  **kwargs):
        """
        Apply a function to a specific column in each row in the `table`.

        Parameters
        ----------
        func : callable
            Function to apply. The function cannot return a
            :obj:`~pandas.Series` object.
        col : str
            Name of the column to apply the function to.
        new_col_name : str, optional
            Name of computed new column. If None, this will be set
            to the name of the column; i.e. the name of the column will be
            overwritten. Defaults to None.
        assign_to_index : bool, optional
            Assign new column as *"index column"*,
            instead of as *"data column"*. If None, the result is
            assigned to the index exactly when `col` is itself an
            *"index column"*. Defaults to None.
        kwargs : dict
            Keyword arguments passed to each function call.

        Returns
        -------
        self
            The same object, so that calls can be chained.

        Raises
        ------
        PuffbirdError
            If applying `func` produces a :obj:`~pandas.DataFrame`
            instead of a :obj:`~pandas.Series`.
        """
        # substitute column if renamed
        col = self._substitute_col(col)

        if new_col_name is None:
            new_col_name = col
        if assign_to_index is None:
            assign_to_index = col in self.indexcols
        # apply function
        series = self._select_frame(col).apply(func, **kwargs)
        if isinstance(series, pd.DataFrame):
            raise PuffbirdError("The function 'func' cannot return "
                                "a `pandas.Series` object.")
        # assign output
        self._assign_output_series(series, new_col_name, assign_to_index)
        return self
# Example 8
 def _assign_output_series(self, series, new_col_name, assign_to_index):
     assign a series to a particular column or index name
     if assign_to_index:
         if new_col_name in self.indexcols:
             index = self.table.index.to_frame(False)
             index.loc[:, new_col_name] = series
             self.table.index = pd.MultiIndex.from_frame(index)
             self.table.loc[:, new_col_name] = series
         if new_col_name in self.indexcols:
             raise PuffbirdError(f"Column name '{new_col_name}' already "
                                 "assigned to index columns; cannot "
                                 "assign to data columns. Choose "
                                 "different name.")
         self.table.loc[:, new_col_name] = series
# Example 9
    # Expand each "data column" of `self.table` and merge the per-column
    # results into a single long-format DataFrame, which is returned.
    # NOTE(review): in this view the docstring below has lost its triple
    # quotes and section headers, several `else:` lines are absent, and the
    # `expand_col`, `rename` and `pd.merge` calls are cut off -- restore
    # these from the upstream file before relying on this block.
    # NOTE(review): `cond` is annotated with `int` but is documented and
    # used as a callable -- confirm the intended annotation.
    def to_long(self,
                *cols: str,
                iterable: Union[Callable, Dict[str,
                                               Callable]] = DEFAULT_ITERABLE,
                max_depth: Union[int, Dict[str, int]] = DEFAULT_MAX_DEPTH,
                dropna: bool = True,
                reindex: bool = False,
                cond: Union[int, Dict[str, int]] = DEFAULT_CONDITION,
                expand_cols: Optional[Sequence[str]] = None,
                **shared_axes: dict) -> pd.DataFrame:
        Transform the *"puffy"* table into a *long-format*

        cols : str
            A selection of *"data columns"* to create the long dataframe with.
            If not given, the algorithm will use all *"data columns"*.
        iterable : callable or dict of callables, optional
            This function is called on each cell for each *"data column"*
            to create a new :obj:`~pandas.Series` object.
            If the *"data columns"* contains :obj:`dict`, :obj:`list`,
            :obj:`int`, :obj:`float`,
            :obj:`~numpy.array`, :obj:`~numpy.recarray`,
            :obj:`~pandas.DataFrame`, or :obj:`~pandas.Series` object types
            than the default iterable will handle these appropriately.
            When passing a dictionary of iterables, the keys should
            correspond to values in :obj:`~FrameEngine.datacols` (i.e.
            the *"data columns"* of the `table`). In this case, each column can
            have a custom iterable used. If a column's iterable is not
            specified the default iterable is used.
        max_depth : int or dict of ints, optional
            Maximum depth of expanding each cell, before the algorithm stops
            for each *"data column"*. If we set the max_depth to 3,
            for example,
            a *"data column"* consisting of 4-D :obj:`~numpy.array` objects
            will result in a :obj:`~pandas.DataFrame`
            where the *"data column"* cells contain
            1-D :obj:`~numpy.array` objects.
            If the arrays were 3-D, it will result in a
            long dataframe with scalars in each cell.
            Defaults to 3.
        dropna : bool, optional
            Drop rows in *long-format* :obj:`~pandas.DataFrame`,
            where **all** *"data columns"* are NaNs.
        cond : callable or dict of callables, optional
            This function should return `True` or `False` and accept a
            :obj:`~pandas.Series` object as an argument. If True, the algorithm
            will stop *"exploding"* a *"data column"*. The default `cond`
            argument suffices for all non-hashable types, such as
            :obj:`list` or :obj:`~numpy.array` objects. If you want
            to *"explode"* hashable types such as :obj:`tuple` objects, a
            custom `cond` callable has to be defined. However, it is
            recommended that hashable types are first converted into non-hashable
            types using a custom conversion function and the
            :obj:`~FrameEngine.col_apply` method.
        expand_cols : list-like, optional
            Specify a list of *"data columns"* to apply the
            :obj:`~FrameEngine.expand_col` method instead of *"exploding"*
            the column in the table.
            If all cells within a *"data column"* contains similarly
            :obj:`~pandas.DataFrame` or :obj:`~pandas.Series` object types,
            the :obj:`~FrameEngine.expand_col` method can be used instead
            of *"exploding"* the *"data column"*. Default to None.
        shared_axes : dict, optional
            Specify if two or more *"data columns"* share axes
            (i.e. *"explosion"* iterations). The keyword
            will correspond to what the column will be called in the long
            dataframe. Each argument is a dictionary where the keys
            correspond to the names of the *"data columns"*, which share
            an axis, and the value correspond to the depth/axis is shared
            for each *"data column"*. `shared_axis` argument is usually defined
            for *"data columns"* that contain :obj:`~numpy.array` objects.
            For example, one *"data column"* may consists of one-dimensional
            timestamp arrays and another *"data column"* may consist of
            two-dimensional timeseries arrays where the first axis of the
            latter is shared with the zeroth axis of the former.

            A `long-format` :obj:`~pandas.DataFrame`.

        See Also

        If you find yourself writing custom `iterable` and `cond` arguments
        and believe these may be of general use, please open an
        `issue <https://github.com/gucky92/puffbird/issues>`_ or
        start a pull request.

        >>> import pandas as pd
        >>> import puffbird as pb
        >>> df = pd.DataFrame({
        ...     'a': [[1,2,3], [4,5,6,7], [3,4,5]],
        ...     'b': [{'c':['asdf'], 'd':['ret']}, {'d':['r']}, {'c':['ff']}],
        ... })
        >>> df
                      a                              b
        0     [1, 2, 3]  {'c': ['asdf'], 'd': ['ret']}
        1  [4, 5, 6, 7]                   {'d': ['r']}
        2     [3, 4, 5]                  {'c': ['ff']}
        >>> engine = pb.FrameEngine(df)

        Now we can use the :obj:`~FrameEngine.to_long` method to create
        a `long-format` :obj:`~pandas.DataFrame`:

        >>> engine.to_long()
            index_level0  a_level0    a b_level0  b_level1     b
        0              0         0  1.0        c         0  asdf
        1              0         0  1.0        d         0   ret
        2              0         1  2.0        c         0  asdf
        3              0         1  2.0        d         0   ret
        4              0         2  3.0        c         0  asdf
        5              0         2  3.0        d         0   ret
        6              1         0  4.0        d         0     r
        7              1         1  5.0        d         0     r
        8              1         2  6.0        d         0     r
        9              1         3  7.0        d         0     r
        10             2         0  3.0        c         0    ff
        11             2         1  4.0        c         0    ff
        12             2         2  5.0        c         0    ff
        # when specific columns are requested, continue with `self[cols]`
        # (rebinds the local name `self`; presumably a column subset -- verify)
        if cols:
            self = self[cols]

        # every name in `expand_cols` must be one of the data columns
        expand_cols = [] if expand_cols is None else expand_cols
        truth = set(expand_cols) - set(self.datacols)
        if truth:
            raise PuffbirdError(f"Keys '{truth}' in 'expand_cols' not "
                                f"in 'data columns' {self.datacols}")

        # check shared axes arguments for correct formatting
        _check_shared_axes_argument(shared_axes, self.datacols, self.indexcols)

        # each option below becomes a (per-column dict, default) pair
        # convert max_depth correctly
        max_depth, default_max_depth = _mapping_variable_converter(
            self.table, max_depth, DEFAULT_MAX_DEPTH, "max_depth")
        # iterable dictionary
        iterable, default_iterable = _mapping_variable_converter(
            self.table, iterable, DEFAULT_ITERABLE, "iterable")
        # get condition
        cond, default_cond = _mapping_variable_converter(
            self.table, cond, DEFAULT_CONDITION, "cond")

        # iterate of each data column
        for m, (datacol, series) in enumerate(self.table.items()):
            if datacol in expand_cols:
                # this only works if all objects within a series are dataframes
                names = set(series.index.names)
                _df = self.expand_col(datacol,
                # NOTE(review): the remaining arguments of the call above
                # are missing from this view.
                # handle multiindex
                if isinstance(_df.columns, pd.MultiIndex):
                    _df.columns = _df.columns.to_flat_index()
                    # this way it is known where the column came from
                    _df.rename(columns=lambda x: f"{datacol}_{x}",
                # NOTE(review): the end of the `rename` call and the `else:`
                # that should introduce the lines below are missing here.
                if dropna:
                    series = series.dropna()
                # set first depth
                n = 0
                # if series already not object skip
                # TODO different conditions meet
                # keep expanding until this column's `cond` returns True
                # or its depth limit `max_depth` is reached
                while (not cond.get(datacol, default_cond)(series)
                       and max_depth.get(datacol, default_max_depth) > n):
                    # superstack pandas.Series object
                    series = self._superstack_series(
                        series, datacol,
                        iterable.get(datacol, default_iterable), dropna,
                        _get_col_name(datacol, n, shared_axes))
                    n += 1

                # convert series to frame
                names = set(series.index.names)
                _df = series.reset_index()

            # merge with previous dataframe
            # the first column (m == 0) initialises `df`
            # NOTE(review): the `else:` before `on = ...` and the middle
            # arguments of `pd.merge` are missing from this view.
            if not m:
                df = _df
                on = list(names & set(df.columns))
                df = pd.merge(df,
                              suffixes=("", f"_{datacol}"))

        # reindex if necessary:
        if reindex:
            # this will not necessarily produce unique indices
            # every column that is not a data column becomes an index level
            index = list(set(df.columns) - set(self.datacols))
            df.set_index(index, inplace=True)

        return df
# Example 10
    def __init__(self,
                 table,
                 datacols: Optional[Collection] = None,
                 indexcols: Optional[Collection] = None,
                 inplace: bool = False,
                 handle_column_types: bool = True,
                 enforce_identifier_string: bool = False,
                 fastpath: bool = False):
        """Build the engine around a validated table with a MultiIndex.

        Parameters
        ----------
        table : object
            An instance of this class (its ``table`` is used), a
            :obj:`~pandas.Series`, a :obj:`~pandas.DataFrame`, or any
            other object handed to ``_process_table_when_unknown_object``.
        datacols, indexcols : collection, optional
            Forwarded to ``_process_column_types``.
            NOTE(review): that helper is not visible here; presumably
            these select the data and index columns -- confirm.
        inplace : bool
            If False, the table is copied before it is modified.
        handle_column_types : bool
            Forwarded to ``_enforce_identifier_column_types``.
        enforce_identifier_string : bool
            Forwarded to ``_enforce_identifier_column_types`` and
            ``_check_table_column_types``.
        fastpath : bool
            Used internally: store `table` as is and skip every check.

        Raises
        ------
        PuffbirdError
            If the table uses reserved column names or its index is
            not unique.
        """
        # used internally
        if fastpath:
            self._table = table
            self._datacols_rename = {}
            self._indexcols_rename = {}
            # NOTE(review): this early return was missing from the visible
            # source; without it the fastpath would run every check below.
            return

        if isinstance(table, type(self)):
            table = table.table

        # check table type
        if isinstance(table, pd.Series):
            table = _process_table_when_series(table)
        elif not isinstance(table, pd.DataFrame):
            table = _process_table_when_unknown_object(table)

        truth = RESERVED_COLUMNS & set(table.columns)
        if truth:
            raise PuffbirdError(f"Dataframe table has columns "
                                f"that are reserved: {truth}")

        if not inplace:
            table = table.copy()

        if isinstance(table.columns, pd.MultiIndex):
            table.columns = table.columns.to_flat_index()

        table, datacols, indexcols = _process_column_types(
            table, datacols, indexcols)

        # table index must be a multiindex
        if not isinstance(table.index, pd.MultiIndex):
            # pass the level name explicitly: the meaning of ``name=None``
            # in ``Index.to_frame`` differs between pandas versions
            level_name = ('index_level0' if table.index.name is None
                          else table.index.name)
            table.index = pd.MultiIndex.from_frame(
                table.index.to_frame(name=level_name))

        table, datacols_rename, indexcols_rename = \
            _enforce_identifier_column_types(
                table, handle_column_types, enforce_identifier_string)

        # check table index and column types
        _check_table_column_types(table, enforce_identifier_string)

        # check if index is unique
        if not table.index.is_unique:
            raise PuffbirdError("Each row for all index columns "
                                "must be a unique set.")

        # assign table
        self._table = table
        # used internally
        self._datacols_rename = datacols_rename
        self._indexcols_rename = indexcols_rename
# Example 11
def _enforce_identifier_column_types(table, handle_column_types,
    # if not handling column types
    if not handle_column_types:
        return table

    # convert data columns
    datacols_rename = {}
    for datacol in table.columns:
        if isinstance(datacol, tuple):
            if not enforce_identifier_string:
                new_datacol = str(datacol)
            elif all(str(idata).isdigit() for idata in datacol):
                new_datacol = "data_tuple_col_" + "_".join(datacol)
                # replace various characters
                new_datacol = _label_character_replacement("_".join(datacol))
        elif isinstance(datacol, str):
            if not enforce_identifier_string:
                new_datacol = datacol
            elif datacol.isdigit():
                new_datacol = "data_col_" + datacol
                # replace various characters
                new_datacol = _label_character_replacement(datacol)
        elif isinstance(datacol, Number):
            if not enforce_identifier_string:
                new_datacol = str(datacol)
                new_datacol = f"index_number_{datacol}"
                new_datacol = _label_character_replacement(new_datacol)
        elif not enforce_identifier_string:
            new_datacol = str(datacol)
            raise PuffbirdError("Datacolumn must string or integer "
                                f"but is type: {type(datacol)}.")

        if datacol != new_datacol:
            datacols_rename[datacol] = new_datacol

    # rename columns
    if datacols_rename:
        table.rename(columns=datacols_rename, inplace=True)

    # convert index columns
    indexcols_rename = {}
    for idx, indexcol in enumerate(table.index.names):
        if isinstance(indexcol, tuple):
            if not enforce_identifier_string:
                new_indexcol = str(indexcol)
            elif all(str(idx).isdigit() for idx in indexcol):
                new_indexcol = "index_tuple_col_" + "_".join(indexcol)
                # replace various characters
                new_indexcol = _label_character_replacement("_".join(indexcol))
        elif indexcol is None:
            new_indexcol = f"index_level{idx}"
        elif isinstance(indexcol, str):
            if not enforce_identifier_string:
                new_indexcol = indexcol
            elif indexcol.isdigit():
                new_indexcol = f"index_col_{indexcol}"
                # replace various characters
                new_indexcol = _label_character_replacement(indexcol)
        elif isinstance(indexcol, Number):
            if not enforce_identifier_string:
                new_indexcol = str(indexcol)
                new_indexcol = f"index_number_{indexcol}"
                new_indexcol = _label_character_replacement(new_indexcol)
        elif not enforce_identifier_string:
            new_indexcol = str(new_indexcol)
            raise PuffbirdError("Indexcolumn must string or integer "
                                f"but is type: {type(indexcol)}.")

        if indexcol != new_indexcol:
            indexcols_rename[indexcol] = new_indexcol

    # rename indices
    if indexcols_rename:
        table.rename_axis(index=indexcols_rename, inplace=True)

    return table, datacols_rename, indexcols_rename
# Example 12
def _process_table_when_unknown_object(table):
        return pd.DataFrame(table)
    except Exception as e:
        raise PuffbirdError("Cannot convert 'table' argument of type "
                            f"'{type(table)}' to dataframe: {e}")