Example #1
0
def normalize_keyword_aggregation(
    kwargs: dict,
) -> Tuple[dict, List[str], List[int]]:
    """
    Convert "named aggregation" kwargs into the legacy aggregation spec.

    The ``Mapping[str, NamedAgg]``-style ``kwargs`` (output name mapped to a
    ``(column, aggfunc)`` pair) is rewritten as a mapping from column to the
    list of functions applied to that column.

    Parameters
    ----------
    kwargs : dict
        Output name -> ``(column, aggfunc)`` pairs.

    Returns
    -------
    aggspec : dict
        Column -> list of aggregation functions.
    columns : tuple of str
        The user-provided keys of ``kwargs``.
    col_idx_order : array of int
        For each requested output, its position among the entries of
        ``aggspec`` flattened column by column.

    Examples
    --------
    >>> normalize_keyword_aggregation({"output": ("input", "sum")})
    (defaultdict(<class 'list'>, {'input': ['sum']}), ('output',), array([0]))
    """
    # TODO: aggspec type: typing.Dict[str, List[AggScalar]]
    # May be hitting https://github.com/python/mypy/issues/5958
    # saying it doesn't have an attribute __name__
    aggspec: DefaultDict = defaultdict(list)
    columns, pairs = list(zip(*kwargs.items()))

    # (column, function label) entries in the order the user requested them.
    requested = []
    for column, aggfunc in pairs:
        aggspec[column].append(aggfunc)
        label = com.get_callable_name(aggfunc) or aggfunc
        requested.append((column, label))

    # Duplicated labels are made unique before being compared.
    uniquified_order = _make_unique_kwarg_list(requested)

    # GH 25719: grouping by column in ``aggspec`` changes the order in which
    # the outputs come back, so rebuild the entries in that order too ...
    produced = []
    for column, aggfuncs in aggspec.items():
        for aggfunc in aggfuncs:
            produced.append(
                (column, com.get_callable_name(aggfunc) or aggfunc))
    uniquified_aggspec = _make_unique_kwarg_list(produced)

    # ... and locate each requested entry inside the produced ordering.
    col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order)
    return aggspec, columns, col_idx_order
Example #2
0
def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]:
    """
    Possibly mangle a list of aggfuncs.

    Parameters
    ----------
    aggfuncs : Sequence

    Returns
    -------
    mangled: list-like
        A new AggSpec sequence, where lambdas have been converted
        to have unique names.

    Notes
    -----
    If just one aggfunc is passed, the name will not be mangled.
    """
    if len(aggfuncs) <= 1:
        # don't mangle for .agg([lambda x: .])
        return aggfuncs
    i = 0
    mangled_aggfuncs = []
    for aggfunc in aggfuncs:
        if com.get_callable_name(aggfunc) == "<lambda>":
            aggfunc = partial(aggfunc)
            aggfunc.__name__ = f"<lambda_{i}>"
            i += 1
        mangled_aggfuncs.append(aggfunc)

    return mangled_aggfuncs
Example #3
0
    def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
        """
        Call ``f`` on every group of ``data`` and collect the results.

        ``splitter.fast_apply`` is tried first unless the sorted data is
        2-dimensional with a column for which ``is_extension_array_dtype``
        is true, ``f`` is a plotting method, the splitter is not a
        ``FrameSplitter``, ``axis`` is not 0, or the sorted data's index has
        complex internals. When the fast path is skipped, aborts, or does
        not return one result per group key, ``f`` is called group by group
        in a Python loop.

        Parameters
        ----------
        f : callable
            Function called with each group.
        data : Series or DataFrame
            Object passed to ``self._get_splitter``.
        axis : int, default 0
            Axis passed to ``self._get_splitter``.

        Returns
        -------
        group_keys
            Result of ``self._get_group_keys()``.
        result_values
            Results of ``f``; stays ``None`` when the fast path produced
            nothing and the loop body never ran.
        mutated
            Starts as ``self.mutated``; replaced by the fast path's flag or
            set to True when a result is not indexed like its group.

        Raises
        ------
        libreduction.InvalidApply
            Re-raised unless its message contains
            "Let this error raise above us".
        """
        mutated = self.mutated
        splitter = self._get_splitter(data, axis=axis)
        group_keys = self._get_group_keys()
        result_values = None

        sdata: FrameOrSeries = splitter._get_sorted_data()
        if sdata.ndim == 2 and np.any(
                sdata.dtypes.apply(is_extension_array_dtype)):
            # calling splitter.fast_apply will raise TypeError via apply_frame_axis0
            #  if we pass EA instead of ndarray
            #  TODO: can we have a workaround for EAs backed by ndarray?
            pass

        elif (com.get_callable_name(f) not in base.plotting_methods
              and isinstance(splitter, FrameSplitter) and axis == 0
              # fast_apply/libreduction doesn't allow non-numpy backed indexes
              and not sdata.index._has_complex_internals):
            try:
                result_values, mutated = splitter.fast_apply(
                    f, sdata, group_keys)

            except libreduction.InvalidApply as err:
                # This Exception is raised if `f` triggers an exception
                # but it is preferable to raise the exception in Python.
                if "Let this error raise above us" not in str(err):
                    # TODO: can we infer anything about whether this is
                    #  worth-retrying in pure-python?
                    raise
                # Otherwise fall through to the loop below, which calls `f`
                # again in Python.

            else:
                # If the fast apply path could be used we can return here.
                # Otherwise we need to fall back to the slow implementation.
                if len(result_values) == len(group_keys):
                    return group_keys, result_values, mutated

        for key, (i, group) in zip(group_keys, splitter):
            # Bypass the group's own __setattr__ when attaching the key.
            object.__setattr__(group, "name", key)

            # result_values is None if fast apply path wasn't taken
            # or fast apply aborted with an unexpected exception.
            # In either case, initialize the result list and perform
            # the slow iteration.
            if result_values is None:
                result_values = []

            # If result_values is not None we're in the case that the
            # fast apply loop was broken prematurely but we have
            # already the result for the first group which we can reuse.
            elif i == 0:
                continue

            # group might be modified
            # Capture the axes before calling `f` so the comparison below is
            # made against the group's original axes.
            group_axes = group.axes
            res = f(group)
            if not _is_indexed_like(res, group_axes):
                mutated = True
            result_values.append(res)

        return group_keys, result_values, mutated
Example #4
0
    def apply(self, f, data, axis=0):
        """
        Call ``f`` on every group of ``data`` and collect the results.

        ``splitter.fast_apply`` is tried first when ``f`` is not a plotting
        method, the splitter offers ``fast_apply`` and ``axis`` is 0. If it
        raises, ``f`` is called group by group in a Python loop instead.

        Parameters
        ----------
        f : callable
            Function called with each group.
        data : object
            Passed to ``self._get_splitter``.
        axis : int, default 0
            Passed to ``self._get_splitter``.

        Returns
        -------
        tuple
            ``(group_keys, result_values, mutated)``.
        """
        was_mutated = self.mutated
        splitter = self._get_splitter(data, axis=axis)
        group_keys = self._get_group_keys()

        func_name = com.get_callable_name(f)
        fast_path_allowed = (
            func_name not in base.plotting_methods
            and hasattr(splitter, 'fast_apply')
            and axis == 0
        )
        if fast_path_allowed:
            try:
                values, was_mutated = splitter.fast_apply(f, group_keys)
                return group_keys, values, was_mutated
            except reduction.InvalidApply:
                # a mutation of some kind was detected: use the slow path
                pass
            except Exception:
                # any other failure also falls through to the slow path,
                # where `f` is called again and may raise to the caller
                pass

        result_values = []
        for key, (_, group) in zip(group_keys, splitter):
            object.__setattr__(group, 'name', key)

            # `f` might modify the group, so take its axes beforehand
            axes_before = _get_axes(group)
            outcome = f(group)
            still_indexed_like = _is_indexed_like(outcome, axes_before)
            if not still_indexed_like:
                was_mutated = True
            result_values.append(outcome)

        return group_keys, result_values, was_mutated
Example #5
0
    def apply(self, f, data, axis=0):
        """
        Run ``f`` over each group produced by the splitter for ``data``.

        The fast path (``splitter.fast_apply``) is used only when ``f`` is
        not one of ``base.plotting_methods``, the splitter has a
        ``fast_apply`` attribute and ``axis == 0``. Any exception raised by
        the fast path is discarded and the per-group Python loop runs.

        Returns
        -------
        group_keys, result_values, mutated
            ``mutated`` starts as ``self.mutated`` and becomes True as soon
            as one result is not indexed like the group it came from.
        """
        mutated = self.mutated
        splitter = self._get_splitter(data, axis=axis)
        group_keys = self._get_group_keys()

        name_of_f = com.get_callable_name(f)
        if (name_of_f not in base.plotting_methods and
                hasattr(splitter, 'fast_apply') and axis == 0):
            try:
                fast_values, mutated = splitter.fast_apply(f, group_keys)
                return group_keys, fast_values, mutated
            except reduction.InvalidApply:
                # some kind of mutation was detected; take the slow path
                pass
            except Exception:
                # not re-raised here: the slow path below calls `f` again,
                # so a genuine error from `f` surfaces from Python code
                pass

        collected = []
        for group_key, (_idx, grp) in zip(group_keys, splitter):
            object.__setattr__(grp, 'name', group_key)

            # snapshot the axes first, since `f` may modify the group
            original_axes = _get_axes(grp)
            res = f(grp)
            if not _is_indexed_like(res, original_axes):
                mutated = True
            collected.append(res)

        return group_keys, collected, mutated
Example #6
0
    def transform(self) -> FrameOrSeriesUnion:
        """
        Transform a DataFrame or Series.

        Reads the object, function, axis and extra arguments from
        ``self.obj``, ``self.orig_f``, ``self.axis``, ``self.args`` and
        ``self.kwargs``.

        Returns
        -------
        DataFrame or Series
            Result of applying ``func`` along the given axis of the
            Series or DataFrame.

        Raises
        ------
        ValueError
            If the transform function fails or does not transform. When the
            function itself raised, that exception is attached as the cause.
        """
        obj = self.obj
        func = self.orig_f
        axis = self.axis
        args = self.args
        kwargs = self.kwargs

        is_series = obj.ndim == 1

        if obj._get_axis_number(axis) == 1:
            # Row-wise transform: transpose, transform along axis 0,
            # transpose back.
            assert not is_series
            return obj.T.transform(func, 0, *args, **kwargs).T

        if is_list_like(func) and not is_dict_like(func):
            func = cast(List[AggFuncTypeBase], func)
            # Convert func equivalent dict
            if is_series:
                func = {com.get_callable_name(v) or v: v for v in func}
            else:
                func = {col: func for col in obj}

        if is_dict_like(func):
            func = cast(AggFuncTypeDict, func)
            return self.transform_dict_like(func)

        # func is either str or callable
        func = cast(AggFuncTypeBase, func)
        try:
            result = self.transform_str_or_callable(func)
        except Exception as err:
            # Chain explicitly so the original failure is reported as the
            # direct cause of the ValueError.
            raise ValueError("Transform function failed") from err

        # Functions that transform may return empty Series/DataFrame
        # when the dtype is not appropriate
        if (
            isinstance(result, (ABCSeries, ABCDataFrame))
            and result.empty
            and not obj.empty
        ):
            raise ValueError("Transform function failed")
        if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals(
            obj.index
        ):
            raise ValueError("Function did not transform")

        return result
Example #7
0
def transform(obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args,
              **kwargs) -> FrameOrSeries:
    """
    Transform a DataFrame or Series

    Parameters
    ----------
    obj : DataFrame or Series
        Object to compute the transform on.
    func : string, function, list, or dictionary
        Function(s) to compute the transform with.
    axis : {0 or 'index', 1 or 'columns'}
        Axis along which the function is applied:

        * 0 or 'index': apply function to each column.
        * 1 or 'columns': apply function to each row.
    *args, **kwargs
        Forwarded to the transform helper that handles ``func``.

    Returns
    -------
    DataFrame or Series
        Result of applying ``func`` along the given axis of the
        Series or DataFrame.

    Raises
    ------
    ValueError
        If the transform function fails or does not transform. When the
        function itself raised, that exception is attached as the cause.
    """
    is_series = obj.ndim == 1

    if obj._get_axis_number(axis) == 1:
        # Row-wise transform: transpose, transform along axis 0,
        # transpose back.
        assert not is_series
        return transform(obj.T, func, 0, *args, **kwargs).T

    if isinstance(func, list):
        # Rewrite the list as the equivalent dict so one code path handles it.
        if is_series:
            func = {com.get_callable_name(v) or v: v for v in func}
        else:
            func = {col: func for col in obj}

    if isinstance(func, dict):
        return transform_dict_like(obj, func, *args, **kwargs)

    # func is either str or callable
    try:
        result = transform_str_or_callable(obj, func, *args, **kwargs)
    except Exception as err:
        # Chain explicitly so the original failure is reported as the
        # direct cause of the ValueError.
        raise ValueError("Transform function failed") from err

    is_frame_or_series = isinstance(result, (ABCSeries, ABCDataFrame))

    # Functions that transform may return empty Series/DataFrame
    # when the dtype is not appropriate. An empty result is only a failure
    # when the input had data; transforming an empty object legitimately
    # yields an empty result.
    if is_frame_or_series and result.empty and not obj.empty:
        raise ValueError("Transform function failed")
    if not is_frame_or_series or not result.index.equals(obj.index):
        raise ValueError("Function did not transform")

    return result
Example #8
0
def aggregation(
    input_df: pd.DataFrame,
    group_key: str,
    group_values: List[str],
    agg_methods: List[Union[str, FunctionType]],
) -> Tuple[pd.DataFrame, List[str]]:
    """
    Aggregate values after grouping table rows by a given key.

    For every (method, column) combination the column is aggregated per
    group and merged back onto a copy of ``input_df`` as a new column named
    ``agg_{method}_{column}_by_{group_key}``. ``input_df`` is not modified.

    Args:
        input_df:
            Input data frame.
        group_key:
            Used to determine the groups for the groupby.
        group_values:
            Used to aggregate values for the groupby.
        agg_methods:
            List of function or function names, e.g. ['mean', 'max', 'min', numpy.mean].
            Any callable is accepted, not only plain Python functions.
            Do not use a lambda function because the name attribute of the lambda function cannot generate a unique string of column names in <lambda>.
    Returns:
        Tuple of output dataframe and new column names.
    Raises:
        ValueError: If a method is a lambda, or is neither a string nor
            a callable.
    """
    # Validate every method before doing any work. Strings are checked
    # first; anything else must at least be callable. ``callable`` is used
    # rather than ``isinstance(..., FunctionType)`` because functions
    # implemented in C (built-ins, dispatcher objects) are not FunctionType.
    for agg_method in agg_methods:
        if isinstance(agg_method, str):
            continue
        if not callable(agg_method):
            raise ValueError('Supported types are: {} or {}.'
                             ' Got {} instead.'.format(str, Callable,
                                                       type(agg_method)))
        if _is_lambda_function(agg_method):
            raise ValueError('Not supported lambda function.')

    new_df = input_df.copy()
    new_cols = []
    for agg_method in agg_methods:
        # The method name does not depend on the column, so resolve it once.
        if isinstance(agg_method, str):
            agg_method_name = agg_method
        else:
            agg_method_name = get_callable_name(agg_method)

        for col in group_values:
            new_col = "agg_{}_{}_by_{}".format(agg_method_name, col, group_key)

            df_agg = (input_df[[col] + [group_key]].groupby(group_key)[[
                col
            ]].agg(agg_method))
            df_agg.columns = [new_col]
            new_cols.append(new_col)
            # df_agg is indexed by the group key, hence right_index=True.
            new_df = new_df.merge(df_agg,
                                  how="left",
                                  right_index=True,
                                  left_on=group_key)

    return new_df, new_cols
Example #9
0
File: ops.py Project: vdasu/pandas
    def apply(self, f, data, axis=0):
        """
        Call ``f`` on every group of ``data`` and collect the results.

        ``splitter.fast_apply`` is tried first when ``f`` is not a plotting
        method, the splitter has a ``fast_apply`` attribute and ``axis`` is
        0. When the fast path is skipped, aborts with a tolerated error, or
        does not return one result per group key, ``f`` is called group by
        group in a Python loop.

        Parameters
        ----------
        f : callable
            Function called with each group.
        data : object
            Passed to ``self._get_splitter``.
        axis : int, default 0
            Passed to ``self._get_splitter``.

        Returns
        -------
        group_keys
            Result of ``self._get_group_keys()``.
        result_values
            Results of ``f``; stays ``None`` when the fast path produced
            nothing and the loop body never ran.
        mutated
            Starts as ``self.mutated``; replaced by the fast path's flag or
            set to True when a result is not indexed like its group.

        Raises
        ------
        TypeError
            Re-raised from the fast path unless its message contains
            "Cannot convert".
        """
        mutated = self.mutated
        splitter = self._get_splitter(data, axis=axis)
        group_keys = self._get_group_keys()
        result_values = None

        # oh boy
        f_name = com.get_callable_name(f)
        if (
            f_name not in base.plotting_methods
            and hasattr(splitter, "fast_apply")
            and axis == 0
        ):
            try:
                result_values, mutated = splitter.fast_apply(f, group_keys)

                # If the fast apply path could be used we can return here.
                # Otherwise we need to fall back to the slow implementation.
                if len(result_values) == len(group_keys):
                    return group_keys, result_values, mutated

            except libreduction.InvalidApply:
                # Cannot fast apply on MultiIndex (_has_complex_internals).
                # This Exception is also raised if `f` triggers an exception
                # but it is preferable to raise the exception in Python.
                pass
            except TypeError as err:
                if "Cannot convert" in str(err):
                    # via apply_frame_axis0 if we pass a non-ndarray
                    pass
                else:
                    raise

        for key, (i, group) in zip(group_keys, splitter):
            # Bypass the group's own __setattr__ when attaching the key.
            object.__setattr__(group, "name", key)

            # result_values is None if fast apply path wasn't taken
            # or fast apply aborted with an unexpected exception.
            # In either case, initialize the result list and perform
            # the slow iteration.
            if result_values is None:
                result_values = []

            # If result_values is not None we're in the case that the
            # fast apply loop was broken prematurely but we have
            # already the result for the first group which we can reuse.
            elif i == 0:
                continue

            # group might be modified
            # Capture the axes before calling `f` so the comparison below is
            # made against the group's original axes.
            group_axes = _get_axes(group)
            res = f(group)
            if not _is_indexed_like(res, group_axes):
                mutated = True
            result_values.append(res)

        return group_keys, result_values, mutated
Example #10
0
    def apply(self, f, data, axis=0):
        """
        Call ``f`` on every group of ``data`` and collect the results.

        ``splitter.fast_apply`` is tried first when ``f`` is not a plotting
        method, the splitter has a ``fast_apply`` attribute and ``axis`` is
        0. Every exception raised by the fast path is swallowed; in that
        case, or when the fast path does not return one result per group
        key, ``f`` is called group by group in a Python loop.

        Parameters
        ----------
        f : callable
            Function called with each group.
        data : object
            Passed to ``self._get_splitter``.
        axis : int, default 0
            Passed to ``self._get_splitter``.

        Returns
        -------
        group_keys
            Result of ``self._get_group_keys()``.
        result_values
            Results of ``f``; stays ``None`` when the fast path produced
            nothing and the loop body never ran.
        mutated
            Starts as ``self.mutated``; replaced by the fast path's flag or
            set to True when a result is not indexed like its group.
        """
        mutated = self.mutated
        splitter = self._get_splitter(data, axis=axis)
        group_keys = self._get_group_keys()
        result_values = None

        # oh boy
        f_name = com.get_callable_name(f)
        if (f_name not in base.plotting_methods and
                hasattr(splitter, 'fast_apply') and axis == 0):
            try:
                result_values, mutated = splitter.fast_apply(f, group_keys)

                # If the fast apply path could be used we can return here.
                # Otherwise we need to fall back to the slow implementation.
                if len(result_values) == len(group_keys):
                    return group_keys, result_values, mutated

            except reduction.InvalidApply:
                # Cannot fast apply on MultiIndex (_has_complex_internals).
                # This Exception is also raised if `f` triggers an exception
                # but it is preferable to raise the exception in Python.
                pass
            except Exception:
                # Not re-raised here: the error is dropped and the slow loop
                # below calls `f` again, so a genuine failure in `f` reaches
                # the caller from Python code instead.
                pass

        for key, (i, group) in zip(group_keys, splitter):
            # Bypass the group's own __setattr__ when attaching the key.
            object.__setattr__(group, 'name', key)

            # result_values is None if fast apply path wasn't taken
            # or fast apply aborted with an unexpected exception.
            # In either case, initialize the result list and perform
            # the slow iteration.
            if result_values is None:
                result_values = []

            # If result_values is not None we're in the case that the
            # fast apply loop was broken prematurely but we have
            # already the result for the first group which we can reuse.
            elif i == 0:
                continue

            # group might be modified
            # Capture the axes before calling `f` so the comparison below is
            # made against the group's original axes.
            group_axes = _get_axes(group)
            res = f(group)
            if not _is_indexed_like(res, group_axes):
                mutated = True
            result_values.append(res)

        return group_keys, result_values, mutated
Example #11
0
    def agg_list_like(self) -> DataFrame | Series:
        """
        Compute aggregation in the case of a list-like argument.

        Each function in ``self.f`` is applied to the selected object (one
        column at a time when it is 2-dimensional) and the individual
        results are concatenated along axis 1. Functions or columns that
        fail with one of the tolerated errors are skipped and reported
        together in a single ``FutureWarning``.

        Returns
        -------
        Result of aggregation.

        Raises
        ------
        ValueError
            If nothing could be aggregated ("no results"), if a column fails
            with a ``ValueError`` that is not one of the tolerated messages,
            or if the results are nested objects that cannot be combined.
        """
        from pandas.core.reshape.concat import concat

        obj = self.obj
        arg = cast(List[AggFuncTypeBase], self.f)

        if not isinstance(obj, SelectionMixin):
            # i.e. obj is Series or DataFrame
            selected_obj = obj
        elif obj._selected_obj.ndim == 1:
            # For SeriesGroupBy this matches _obj_with_exclusions
            selected_obj = obj._selected_obj
        else:
            selected_obj = obj._obj_with_exclusions

        results = []
        keys = []
        # Names of functions (1-dim case) or columns (2-dim case) that were
        # skipped; reported in the FutureWarning further down.
        failed_names = []

        # Doubles as the regex template used to recognise this same warning
        # when it is emitted by a nested aggregate call.
        depr_nuisance_columns_msg = (
            "{} did not aggregate successfully. If any error is "
            "raised this will raise in a future version of pandas. "
            "Drop these columns/ops to avoid this warning.")

        # degenerate case
        if selected_obj.ndim == 1:
            for a in arg:
                colg = obj._gotitem(selected_obj.name,
                                    ndim=1,
                                    subset=selected_obj)
                try:
                    new_res = colg.aggregate(a)

                except TypeError:
                    failed_names.append(com.get_callable_name(a) or a)
                else:
                    results.append(new_res)

                    # make sure we find a good name
                    name = com.get_callable_name(a) or a
                    keys.append(name)

        # multiples
        else:
            # Positions of the columns that aggregated successfully.
            indices = []
            for index, col in enumerate(selected_obj):
                colg = obj._gotitem(col,
                                    ndim=1,
                                    subset=selected_obj.iloc[:, index])
                try:
                    # Capture and suppress any warnings emitted by us in the call
                    # to agg below, but pass through any warnings that were
                    # generated otherwise.
                    # This is necessary because of https://bugs.python.org/issue29672
                    # See GH #43741 for more details
                    with warnings.catch_warnings(record=True) as record:
                        new_res = colg.aggregate(arg)
                    if len(record) > 0:
                        match = re.compile(
                            depr_nuisance_columns_msg.format(".*"))
                        for warning in record:
                            if re.match(match, str(warning.message)):
                                # Our own nuisance-column warning: record the
                                # column instead of warning once per column.
                                failed_names.append(col)
                            else:
                                # Unrelated warning: re-emit it with its
                                # original category and location.
                                warnings.warn_explicit(
                                    message=warning.message,
                                    category=warning.category,
                                    filename=warning.filename,
                                    lineno=warning.lineno,
                                )

                except (TypeError, DataError):
                    failed_names.append(col)
                except ValueError as err:
                    # cannot aggregate
                    if "Must produce aggregated value" in str(err):
                        # raised directly in _aggregate_named
                        failed_names.append(col)
                    elif "no results" in str(err):
                        # reached in test_frame_apply.test_nuiscance_columns
                        #  where the colg.aggregate(arg) ends up going through
                        #  the selected_obj.ndim == 1 branch above with arg == ["sum"]
                        #  on a datetime64[ns] column
                        failed_names.append(col)
                    else:
                        raise
                else:
                    # Reached even when a nuisance warning was recorded above,
                    # because the aggregate call itself did not raise.
                    results.append(new_res)
                    indices.append(index)

            keys = selected_obj.columns.take(indices)

        # if we are empty
        if not len(results):
            raise ValueError("no results")

        if len(failed_names) > 0:
            warnings.warn(
                depr_nuisance_columns_msg.format(failed_names),
                FutureWarning,
                stacklevel=find_stack_level(),
            )

        try:
            concatenated = concat(results, keys=keys, axis=1, sort=False)
        except TypeError as err:
            # we are concatting non-NDFrame objects,
            # e.g. a list of scalars
            from pandas import Series

            result = Series(results, index=keys, name=obj.name)
            if is_nested_object(result):
                raise ValueError(
                    "cannot combine transform and aggregation operations"
                ) from err
            return result
        else:
            # Concat uses the first index to determine the final indexing order.
            # The union of a shorter first index with the other indices causes
            # the index sorting to be different from the order of the aggregating
            # functions. Reindex if this is the case.
            index_size = concatenated.index.size
            # First result whose index is as long as the concatenated one.
            full_ordered_index = next(result.index for result in results
                                      if result.index.size == index_size)
            return concatenated.reindex(full_ordered_index, copy=False)
Example #12
0
def relabel_result(
    result: DataFrame | Series,
    func: dict[str, list[Callable | str]],
    columns: Iterable[Hashable],
    order: Iterable[int],
) -> dict[Hashable, Series]:
    """
    Internal function to reorder result if relabelling is True for
    dataframe.agg, and return the reordered result in dict.

    Parameters
    ----------
    result: Result from aggregation
    func: Dict of (column name, funcs)
    columns: New columns name for relabelling
    order: New order for relabelling

    Returns
    -------
    dict
        Column name -> Series indexed by ``columns``.

    Examples
    --------
    >>> result = DataFrame({"A": [np.nan, 2, np.nan],
    ...       "C": [6, np.nan, np.nan], "B": [np.nan, 4, 2.5]})  # doctest: +SKIP
    >>> func = {"A": ["max"], "C": ["max"], "B": ["mean", "min"]}
    >>> columns = ("foo", "aab", "bar", "dat")
    >>> order = [0, 1, 2, 3]
    >>> relabel_result(result, func, columns, order)  # doctest: +SKIP
    dict(A=Series([2.0, NaN, NaN, NaN], index=["foo", "aab", "bar", "dat"]),
         C=Series([NaN, 6.0, NaN, NaN], index=["foo", "aab", "bar", "dat"]),
         B=Series([NaN, NaN, 2.5, 4.0], index=["foo", "aab", "bar", "dat"]))
    """
    from pandas.core.indexes.base import Index

    # User-provided names sorted by the position each one takes in `order`.
    reordered_indexes = [
        pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1])
    ]
    reordered_result_in_dict: dict[Hashable, Series] = {}
    # Offset into `reordered_indexes` of the first name for the current column.
    idx = 0

    reorder_mask = not isinstance(result,
                                  ABCSeries) and len(result.columns) > 1
    for col, fun in func.items():
        s = result[col].dropna()

        # In the `_aggregate`, the callable names are obtained and used in `result`, and
        # these names are ordered alphabetically. e.g.
        #           C2   C1
        # <lambda>   1  NaN
        # amax     NaN  4.0
        # max      NaN  4.0
        # sum     18.0  6.0
        # Therefore, the order of functions for each column could be shuffled
        # accordingly so need to get the callable name if it is not parsed names, and
        # reorder the aggregated result for each column.
        # e.g. if df.agg(c1=("C2", sum), c2=("C2", lambda x: min(x))), correct order is
        # [sum, <lambda>], but in `result`, it will be [<lambda>, sum], and we need to
        # reorder so that aggregated values map to their functions regarding the order.

        # However there is only one column being used for aggregation, not need to
        # reorder since the index is not sorted, and keep as is in `funcs`, e.g.
        #         A
        # min   1.0
        # mean  1.5
        # mean  1.5
        if reorder_mask:
            fun = [
                com.get_callable_name(f) if not isinstance(f, str) else f
                for f in fun
            ]
            col_idx_order = Index(s.index).get_indexer(fun)
            # `col_idx_order` holds positions, not labels, so select with
            # .iloc; plain `s[...]` with integers only works through the
            # positional fallback of Series.__getitem__.
            s = s.iloc[col_idx_order]

        # assign the new user-provided "named aggregation" as index names, and reindex
        # it based on the whole user-provided names.
        s.index = reordered_indexes[idx:idx + len(fun)]
        reordered_result_in_dict[col] = s.reindex(columns, copy=False)
        idx = idx + len(fun)
    return reordered_result_in_dict
Example #13
0
    def _aggregate_multiple_funcs(self, arg, _axis):
        from pandas.core.reshape.concat import concat

        if _axis != 0:
            raise NotImplementedError("axis other than 0 is not supported")

        if self._selected_obj.ndim == 1:
            obj = self._selected_obj
        else:
            obj = self._obj_with_exclusions

        results = []
        keys = []

        # degenerate case
        if obj.ndim == 1:
            for a in arg:
                colg = self._gotitem(obj.name, ndim=1, subset=obj)
                try:
                    new_res = colg.aggregate(a)

                except TypeError:
                    pass
                else:
                    results.append(new_res)

                    # make sure we find a good name
                    name = com.get_callable_name(a) or a
                    keys.append(name)

        # multiples
        else:
            for index, col in enumerate(obj):
                colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, index])
                try:
                    new_res = colg.aggregate(arg)
                except (TypeError, DataError):
                    pass
                except ValueError as err:
                    # cannot aggregate
                    if "Must produce aggregated value" in str(err):
                        # raised directly in _aggregate_named
                        pass
                    elif "no results" in str(err):
                        # raised direcly in _aggregate_multiple_funcs
                        pass
                    else:
                        raise
                else:
                    results.append(new_res)
                    keys.append(col)

        # if we are empty
        if not len(results):
            raise ValueError("no results")

        try:
            return concat(results, keys=keys, axis=1, sort=False)
        except TypeError as err:

            # we are concatting non-NDFrame objects,
            # e.g. a list of scalars

            from pandas import Series

            result = Series(results, index=keys, name=self.name)
            if is_nested_object(result):
                raise ValueError(
                    "cannot combine transform and aggregation operations"
                ) from err
            return result
Example #14
0
    def _aggregate_multiple_funcs(self, arg, _level, _axis):
        from pandas.core.reshape.concat import concat

        if _axis != 0:
            raise NotImplementedError("axis other than 0 is not supported")

        if self._selected_obj.ndim == 1:
            obj = self._selected_obj
        else:
            obj = self._obj_with_exclusions

        results = []
        keys = []

        # degenerate case
        if obj.ndim == 1:
            for a in arg:
                try:
                    colg = self._gotitem(obj.name, ndim=1, subset=obj)
                    results.append(colg.aggregate(a))

                    # make sure we find a good name
                    name = com.get_callable_name(a) or a
                    keys.append(name)
                except (TypeError, DataError):
                    pass
                except SpecificationError:
                    raise

        # multiples
        else:
            for index, col in enumerate(obj):
                try:
                    colg = self._gotitem(col,
                                         ndim=1,
                                         subset=obj.iloc[:, index])
                    results.append(colg.aggregate(arg))
                    keys.append(col)
                except (TypeError, DataError):
                    pass
                except ValueError:
                    # cannot aggregate
                    continue
                except SpecificationError:
                    raise

        # if we are empty
        if not len(results):
            raise ValueError("no results")

        try:
            return concat(results, keys=keys, axis=1, sort=False)
        except TypeError:

            # we are concatting non-NDFrame objects,
            # e.g. a list of scalars

            from pandas.core.dtypes.cast import is_nested_object
            from pandas import Series
            result = Series(results, index=keys, name=self.name)
            if is_nested_object(result):
                raise ValueError("cannot combine transform and "
                                 "aggregation operations")
            return result
    def agg_list_like(self) -> FrameOrSeriesUnion:
        """
        Compute aggregation in the case of a list-like argument.

        For a 1-dimensional selection each function in ``self.f`` is applied
        on its own; otherwise the whole list is applied to every column. The
        individual results are concatenated along axis 1.

        Returns
        -------
        Result of aggregation.

        Raises
        ------
        ValueError
            If nothing could be aggregated, if a column fails with an
            untolerated ``ValueError``, or if the results are nested objects.
        """
        from pandas.core.reshape.concat import concat

        obj = self.obj
        arg = cast(List[AggFuncTypeBase], self.f)

        if isinstance(obj, SelectionMixin):
            if obj._selected_obj.ndim == 1:
                # For SeriesGroupBy this matches _obj_with_exclusions
                selected_obj = obj._selected_obj
            else:
                selected_obj = obj._obj_with_exclusions
        else:
            # i.e. obj is Series or DataFrame
            selected_obj = obj

        results = []
        keys = []

        if selected_obj.ndim == 1:
            # degenerate case: one function at a time on the single column
            for func in arg:
                colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
                try:
                    new_res = colg.aggregate(func)
                except TypeError:
                    continue
                results.append(new_res)
                # make sure we find a good name
                keys.append(com.get_callable_name(func) or func)
        else:
            # multiples: all functions at once, column by column
            succeeded = []
            for position, col in enumerate(selected_obj):
                colg = obj._gotitem(
                    col, ndim=1, subset=selected_obj.iloc[:, position]
                )
                try:
                    new_res = colg.aggregate(arg)
                except (TypeError, DataError):
                    continue
                except ValueError as err:
                    # cannot aggregate
                    message = str(err)
                    # "Must produce aggregated value" is raised directly in
                    # _aggregate_named. "no results" is reached in
                    # test_frame_apply.test_nuiscance_columns, where
                    # colg.aggregate(arg) goes through the ndim == 1 branch
                    # above with arg == ["sum"] on a datetime64[ns] column.
                    tolerated = (
                        "Must produce aggregated value" in message
                        or "no results" in message
                    )
                    if not tolerated:
                        raise
                    continue
                results.append(new_res)
                succeeded.append(position)

            keys = selected_obj.columns.take(succeeded)

        # if we are empty
        if len(results) == 0:
            raise ValueError("no results")

        try:
            concatenated = concat(results, keys=keys, axis=1, sort=False)
        except TypeError as err:
            # we are concatting non-NDFrame objects,
            # e.g. a list of scalars
            from pandas import Series

            combined = Series(results, index=keys, name=obj.name)
            if is_nested_object(combined):
                raise ValueError(
                    "cannot combine transform and aggregation operations"
                ) from err
            return combined

        # Concat uses the first index to determine the final indexing order.
        # The union of a shorter first index with the other indices causes
        # the index sorting to be different from the order of the aggregating
        # functions. Reindex if this is the case.
        target_size = concatenated.index.size
        full_ordered_index = next(
            res.index for res in results if res.index.size == target_size
        )
        return concatenated.reindex(full_ordered_index, copy=False)
Example #16
0
def transform(obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args,
              **kwargs) -> FrameOrSeries:
    """
    Transform a DataFrame or Series

    Parameters
    ----------
    obj : DataFrame or Series
        Object to compute the transform on.
    func : string, function, list, or dictionary
        Function(s) to compute the transform with.
    axis : {0 or 'index', 1 or 'columns'}
        Axis along which the function is applied:

        * 0 or 'index': apply function to each column.
        * 1 or 'columns': apply function to each row.
    *args
        Positional arguments forwarded to ``func``.
    **kwargs
        Keyword arguments forwarded to ``func``.

    Returns
    -------
    DataFrame or Series
        Result of applying ``func`` along the given axis of the
        Series or DataFrame.

    Raises
    ------
    SpecificationError
        If a dictionary ``func`` names columns that do not exist in a
        DataFrame, or contains a nested dictionary.
    ValueError
        If the transform function fails or does not transform.
    """
    from pandas.core.reshape.concat import concat

    is_series = obj.ndim == 1

    if obj._get_axis_number(axis) == 1:
        # Row-wise transform: work on the transpose and flip the result back.
        assert not is_series
        return transform(obj.T, func, 0, *args, **kwargs).T

    if isinstance(func, list):
        # Normalize a list into the equivalent dictionary specification.
        if is_series:
            func = {com.get_callable_name(v) or v: v for v in func}
        else:
            func = {col: func for col in obj}

    if isinstance(func, dict):
        if not is_series:
            cols = sorted(set(func) - set(obj.columns))
            if cols:
                raise SpecificationError(f"Column(s) {cols} do not exist")

        if any(isinstance(v, dict) for v in func.values()):
            # GH 15931 - deprecation of renaming keys
            raise SpecificationError("nested renamer is not supported")

        results = {}
        for name, how in func.items():
            colg = obj._gotitem(name, ndim=1)
            try:
                results[name] = transform(colg, how, 0, *args, **kwargs)
            except Exception as err:
                if str(err) == "Function did not transform":
                    # Re-raise with the original traceback untouched.
                    raise
                # Any other failure is deliberately skipped (best effort);
                # an error is reported below only if every entry failed.

        # combine results
        if not results:
            raise ValueError("Transform function failed")
        return concat(results, axis=1)

    # func is either str or callable
    try:
        if isinstance(func, str):
            result = obj._try_aggregate_string_function(func, *args, **kwargs)
        else:
            f = obj._get_cython_func(func)
            if f and not args and not kwargs:
                result = getattr(obj, f)()
            else:
                try:
                    result = obj.apply(func, args=args, **kwargs)
                except Exception:
                    # Fall back to calling the function on the whole object.
                    result = func(obj, *args, **kwargs)
    except Exception as err:
        # Keep the underlying error attached as the explicit cause.
        raise ValueError("Transform function failed") from err

    # Functions that transform may return empty Series/DataFrame
    # when the dtype is not appropriate
    if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty:
        raise ValueError("Transform function failed")
    if not isinstance(result,
                      (ABCSeries, ABCDataFrame)) or not result.index.equals(
                          obj.index):
        raise ValueError("Function did not transform")

    return result
Example #17
0
    def agg_list_like(self) -> FrameOrSeriesUnion:
        """
        Aggregate ``self.obj`` with the list of functions held in ``self.f``.

        For a one-dimensional selection every function is applied on its
        own and labelled with its callable name (or the entry itself).
        Otherwise the whole list is applied to each column in turn. Entries
        whose aggregation raises one of the tolerated errors are left out.

        Returns
        -------
        The successful results concatenated along the columns, or a Series
        of the raw results when they cannot be concatenated.

        Raises
        ------
        ValueError
            If nothing could be aggregated, or if the raw results are
            nested objects.
        """
        from pandas.core.reshape.concat import concat

        obj = self.obj
        funcs = cast(List[AggFuncTypeBase], self.f)

        # Work out which object is actually being aggregated.
        if isinstance(obj, SelectionMixin):
            if obj._selected_obj.ndim == 1:
                selected_obj = obj._selected_obj
            else:
                selected_obj = obj._obj_with_exclusions
        else:
            # i.e. obj is Series or DataFrame
            selected_obj = obj

        results = []
        keys = []

        if selected_obj.ndim == 1:
            # degenerate case: a single column, one result per function
            for single_func in funcs:
                colg = obj._gotitem(selected_obj.name,
                                    ndim=1,
                                    subset=selected_obj)
                try:
                    new_res = colg.aggregate(single_func)
                except TypeError:
                    continue

                results.append(new_res)
                # make sure we find a good name
                keys.append(com.get_callable_name(single_func) or single_func)
        else:
            # multiples: one result per column, each using the full list
            for position, column in enumerate(selected_obj):
                colg = obj._gotitem(column,
                                    ndim=1,
                                    subset=selected_obj.iloc[:, position])
                try:
                    new_res = colg.aggregate(funcs)
                except (TypeError, DataError):
                    continue
                except ValueError as err:
                    # cannot aggregate
                    message = str(err)
                    if ("Must produce aggregated value" in message
                            or "no results" in message):
                        # raised directly in _aggregate_named /
                        # _aggregate_multiple_funcs respectively
                        continue
                    raise

                results.append(new_res)
                keys.append(column)

        # if we are empty
        if len(results) == 0:
            raise ValueError("no results")

        try:
            return concat(results, keys=keys, axis=1, sort=False)
        except TypeError as err:
            # we are concatting non-NDFrame objects,
            # e.g. a list of scalars
            from pandas import Series

            combined = Series(results, index=keys, name=obj.name)
            if is_nested_object(combined):
                raise ValueError(
                    "cannot combine transform and aggregation operations"
                ) from err
            return combined
Example #18
0
    def _aggregate_multiple_funcs(self, arg, _level, _axis):
        """
        Aggregate the selected object with several functions.

        Parameters
        ----------
        arg : iterable
            Aggregation specifications. For a one-dimensional selection
            each entry is applied on its own; otherwise ``arg`` as a whole
            is passed to the aggregation of every column.
        _level : object
            Not used by this method.
        _axis : int
            Only ``0`` is supported.

        Returns
        -------
        The successful results concatenated along the columns, or a Series
        of the raw results when they cannot be concatenated.

        Raises
        ------
        NotImplementedError
            If ``_axis`` is not 0.
        ValueError
            If nothing could be aggregated, or if the raw results are
            nested objects.
        """
        from pandas.core.reshape.concat import concat

        if _axis != 0:
            raise NotImplementedError("axis other than 0 is not supported")

        if self._selected_obj.ndim == 1:
            obj = self._selected_obj
        else:
            obj = self._obj_with_exclusions

        results = []
        keys = []

        # degenerate case
        if obj.ndim == 1:
            for a in arg:
                try:
                    colg = self._gotitem(obj.name, ndim=1, subset=obj)
                    new_res = colg.aggregate(a)
                except (TypeError, DataError):
                    # cannot aggregate with this function; skip it
                    pass
                else:
                    # Record result and key together, outside the guarded
                    # region, so the two lists can never get out of step.
                    results.append(new_res)

                    # make sure we find a good name
                    keys.append(com.get_callable_name(a) or a)

        # multiples
        else:
            for index, col in enumerate(obj):
                try:
                    colg = self._gotitem(col, ndim=1,
                                         subset=obj.iloc[:, index])
                    new_res = colg.aggregate(arg)
                except (TypeError, DataError, ValueError):
                    # cannot aggregate this column; skip it
                    pass
                else:
                    results.append(new_res)
                    keys.append(col)

        # if we are empty
        if not results:
            raise ValueError("no results")

        try:
            return concat(results, keys=keys, axis=1, sort=False)
        except TypeError as err:

            # we are concatting non-NDFrame objects,
            # e.g. a list of scalars

            from pandas.core.dtypes.cast import is_nested_object
            from pandas import Series
            result = Series(results, index=keys, name=self.name)
            if is_nested_object(result):
                raise ValueError("cannot combine transform and "
                                 "aggregation operations") from err
            return result
Example #19
0
    def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
        """
        Call ``f`` on every group of ``data`` and collect the results.

        A fast path (``splitter.fast_apply``) is attempted when its
        preconditions hold; otherwise, or when it does not yield one result
        per group, the groups are iterated in Python.

        Returns
        -------
        tuple
            ``(group_keys, result_values, mutated)``.
        """
        mutated = self.mutated
        splitter = self._get_splitter(data, axis=axis)
        group_keys = self._get_group_keys()
        result_values = None

        # The checks below short-circuit in the same order as they are listed:
        #  * calling splitter.fast_apply will raise TypeError via
        #    apply_frame_axis0 if we pass EA instead of ndarray
        #    TODO: can we have a workaround for EAs backed by ndarray?
        #  * TODO(ArrayManager) don't use fast_apply /
        #    libreduction.apply_frame_axis0 for now -> relies on BlockManager
        #    internals
        #  * fast_apply/libreduction doesn't allow non-numpy backed indexes
        try_fast_path = (
            not (data.ndim == 2
                 and np.any(data.dtypes.apply(is_extension_array_dtype)))
            and not isinstance(data._mgr, ArrayManager)
            and com.get_callable_name(f) not in base.plotting_methods
            and isinstance(splitter, FrameSplitter)
            and axis == 0
            and not data.index._has_complex_internals
        )

        if try_fast_path:
            try:
                sorted_data = splitter.sorted_data
                result_values, mutated = splitter.fast_apply(
                    f, sorted_data, group_keys)
            except IndexError:
                # This is a rare case in which re-running in python-space may
                #  make a difference, see  test_apply_mutate.test_mutate_groups
                pass
            else:
                # If the fast apply path could be used we can return here.
                # Otherwise we need to fall back to the slow implementation.
                if len(result_values) == len(group_keys):
                    return group_keys, result_values, mutated

        for key, item in zip(group_keys, splitter):
            i, group = item
            object.__setattr__(group, "name", key)

            if result_values is None:
                # The fast apply path wasn't taken or aborted with an
                # unexpected exception: start the result list and perform
                # the slow iteration.
                result_values = []
            elif i == 0:
                # The fast apply loop was broken prematurely but we already
                # have the result for the first group, which we can reuse.
                continue

            # group might be modified, so capture its axes before calling f
            axes_before = group.axes
            res = f(group)
            if not _is_indexed_like(res, axes_before, axis):
                mutated = True
            result_values.append(res)

        return group_keys, result_values, mutated
Example #20
0
    def transform(self) -> DataFrame | Series:
        """
        Transform the wrapped DataFrame or Series.

        The function specification is taken from ``self.orig_f`` and applied
        along ``self.axis`` with ``self.args`` / ``self.kwargs``. List-like
        specifications are first converted to the equivalent dictionary.

        Returns
        -------
        DataFrame or Series
            Result of applying ``func`` along the given axis of the
            Series or DataFrame.

        Raises
        ------
        ValueError
            If the transform function fails or does not transform.
        """
        obj, func, axis = self.obj, self.orig_f, self.axis
        args, kwargs = self.args, self.kwargs

        is_series = obj.ndim == 1

        if obj._get_axis_number(axis) == 1:
            # Row-wise transform: work on the transpose and flip back.
            assert not is_series
            transposed = obj.T
            return transposed.transform(func, 0, *args, **kwargs).T

        if is_list_like(func) and not is_dict_like(func):
            func = cast(List[AggFuncTypeBase], func)
            # Convert func equivalent dict
            if is_series:
                by_name = {}
                for entry in func:
                    by_name[com.get_callable_name(entry) or entry] = entry
                func = by_name
            else:
                func = dict.fromkeys(obj, func)

        if is_dict_like(func):
            func = cast(AggFuncTypeDict, func)
            return self.transform_dict_like(func)

        # func is either str or callable
        func = cast(AggFuncTypeBase, func)
        try:
            result = self.transform_str_or_callable(func)
        except Exception as err:
            # TypeError passes through untouched; anything else is wrapped.
            if isinstance(err, TypeError):
                raise
            raise ValueError("Transform function failed") from err

        is_ndframe = isinstance(result, (ABCSeries, ABCDataFrame))

        # Functions that transform may return empty Series/DataFrame
        # when the dtype is not appropriate
        if is_ndframe and result.empty and not obj.empty:
            raise ValueError("Transform function failed")

        # error: Argument 1 to "__get__" of "AxisProperty" has incompatible type
        # "Union[Series, DataFrame, GroupBy[Any], SeriesGroupBy,
        # DataFrameGroupBy, BaseWindow, Resampler]"; expected "Union[DataFrame,
        # Series]"
        if not is_ndframe or not result.index.equals(
            obj.index  # type:ignore[arg-type]
        ):
            raise ValueError("Function did not transform")

        return result