def normalize_keyword_aggregation(kwargs: dict) -> Tuple[dict, List[str], List[int]]:
    """
    Normalize user-provided "named aggregation" kwargs.

    Converts the new ``Mapping[str, NamedAgg]`` style kwargs into the old
    ``Dict[str, List[scalar]]`` shape, remembering the user-provided output
    names and the permutation needed to restore their order.

    Parameters
    ----------
    kwargs : dict

    Returns
    -------
    aggspec : dict
        The transformed kwargs.
    columns : List[str]
        The user-provided keys.
    col_idx_order : List[int]
        List of columns indices.

    Examples
    --------
    >>> normalize_keyword_aggregation({"output": ("input", "sum")})
    (defaultdict(<class 'list'>, {'input': ['sum']}), ('output',), array([0]))
    """
    # Re-shape as Mapping[column, List[func]]; names are fixed up afterwards.
    # TODO: aggspec type: typing.Dict[str, List[AggScalar]]
    # May be hitting https://github.com/python/mypy/issues/5958
    # saying it doesn't have an attribute __name__
    aggspec: DefaultDict = defaultdict(list)
    columns, pairs = list(zip(*kwargs.items()))

    for column, aggfunc in pairs:
        aggspec[column].append(aggfunc)

    # (column, func-name) pairs in the order the user supplied them
    order = [(column, com.get_callable_name(aggfunc) or aggfunc) for column, aggfunc in pairs]

    # uniquify aggfunc name if duplicated in order list
    uniquified_order = _make_unique_kwarg_list(order)

    # GH 25719, due to aggspec will change the order of assigned columns in aggregation
    # uniquified_aggspec will store uniquified order list and will compare it with order
    # based on index
    aggspec_order = [
        (column, com.get_callable_name(aggfunc) or aggfunc)
        for column, aggfuncs in aggspec.items()
        for aggfunc in aggfuncs
    ]
    uniquified_aggspec = _make_unique_kwarg_list(aggspec_order)

    # positions of the user-ordered pairs within the aggspec-ordered pairs
    col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order)
    return aggspec, columns, col_idx_order
def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]:
    """
    Possibly mangle a list of aggfuncs.

    Each lambda in ``aggfuncs`` is wrapped in a ``partial`` so it can be
    given a unique ``__name__`` of the form ``<lambda_i>``; non-lambdas are
    passed through untouched.

    Parameters
    ----------
    aggfuncs : Sequence

    Returns
    -------
    mangled: list-like
        A new AggSpec sequence, where lambdas have been converted
        to have unique names.

    Notes
    -----
    If just one aggfunc is passed, the name will not be mangled.
    """
    if len(aggfuncs) <= 1:
        # don't mangle for .agg([lambda x: .])
        return aggfuncs

    lambda_count = 0
    mangled = []
    for func in aggfuncs:
        if com.get_callable_name(func) != "<lambda>":
            mangled.append(func)
            continue
        # partial() gives us a fresh object whose __name__ we may set
        wrapped = partial(func)
        wrapped.__name__ = f"<lambda_{lambda_count}>"
        lambda_count += 1
        mangled.append(wrapped)
    return mangled
def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
    """
    Apply ``f`` to each group of ``data``, preferring the Cython
    ``fast_apply`` path and falling back to pure-Python iteration.

    Returns a 3-tuple ``(group_keys, result_values, mutated)`` where
    ``mutated`` reports whether ``f`` appears to have mutated its input
    (detected via ``_is_indexed_like``).
    """
    mutated = self.mutated
    splitter = self._get_splitter(data, axis=axis)
    group_keys = self._get_group_keys()
    result_values = None

    sdata: FrameOrSeries = splitter._get_sorted_data()
    if sdata.ndim == 2 and np.any(sdata.dtypes.apply(is_extension_array_dtype)):
        # calling splitter.fast_apply will raise TypeError via apply_frame_axis0
        # if we pass EA instead of ndarray
        # TODO: can we have a workaround for EAs backed by ndarray?
        pass

    elif (
        com.get_callable_name(f) not in base.plotting_methods
        and isinstance(splitter, FrameSplitter)
        and axis == 0
        # fast_apply/libreduction doesn't allow non-numpy backed indexes
        and not sdata.index._has_complex_internals
    ):
        try:
            result_values, mutated = splitter.fast_apply(f, sdata, group_keys)
        except libreduction.InvalidApply as err:
            # This Exception is raised if `f` triggers an exception
            # but it is preferable to raise the exception in Python.
            if "Let this error raise above us" not in str(err):
                # TODO: can we infer anything about whether this is
                # worth-retrying in pure-python?
                raise
        else:
            # If the fast apply path could be used we can return here.
            # Otherwise we need to fall back to the slow implementation.
            if len(result_values) == len(group_keys):
                return group_keys, result_values, mutated

    for key, (i, group) in zip(group_keys, splitter):
        object.__setattr__(group, "name", key)

        # result_values is None if fast apply path wasn't taken
        # or fast apply aborted with an unexpected exception.
        # In either case, initialize the result list and perform
        # the slow iteration.
        if result_values is None:
            result_values = []

        # If result_values is not None we're in the case that the
        # fast apply loop was broken prematurely but we have
        # already the result for the first group which we can reuse.
        elif i == 0:
            continue

        # group might be modified
        group_axes = group.axes
        res = f(group)
        if not _is_indexed_like(res, group_axes):
            mutated = True
        result_values.append(res)

    return group_keys, result_values, mutated
def apply(self, f, data, axis=0):
    """
    Apply ``f`` group-wise, attempting ``splitter.fast_apply`` first and
    falling back to slow Python iteration over the groups.

    Returns ``(group_keys, result_values, mutated)``.
    """
    mutated = self.mutated
    splitter = self._get_splitter(data, axis=axis)
    group_keys = self._get_group_keys()

    # oh boy
    fast_path_ok = (
        com.get_callable_name(f) not in base.plotting_methods
        and hasattr(splitter, 'fast_apply')
        and axis == 0
    )
    if fast_path_ok:
        try:
            fast_values, mutated = splitter.fast_apply(f, group_keys)
            return group_keys, fast_values, mutated
        except reduction.InvalidApply:
            # a mutation of some kind was detected; take the slow path
            pass
        except Exception:
            # fall through: the slow loop below will surface this error
            # to the caller when `f` is re-invoked in Python
            pass

    results = []
    for key, (i, group) in zip(group_keys, splitter):
        object.__setattr__(group, 'name', key)

        # capture axes first, since `f` might modify the group
        axes_before = _get_axes(group)
        res = f(group)
        if not _is_indexed_like(res, axes_before):
            mutated = True
        results.append(res)

    return group_keys, results, mutated
def transform(self) -> FrameOrSeriesUnion:
    """
    Transform a DataFrame or Series.

    Returns
    -------
    DataFrame or Series
        Result of applying ``func`` along the given axis of the
        Series or DataFrame.

    Raises
    ------
    ValueError
        If the transform function fails or does not transform.
    """
    obj = self.obj
    func = self.orig_f
    axis = self.axis
    args = self.args
    kwargs = self.kwargs

    is_series = obj.ndim == 1

    if obj._get_axis_number(axis) == 1:
        # axis=1: transform the transpose column-wise, then transpose back
        assert not is_series
        return obj.T.transform(func, 0, *args, **kwargs).T

    if is_list_like(func) and not is_dict_like(func):
        func = cast(List[AggFuncTypeBase], func)
        # Convert func equivalent dict
        if is_series:
            func = {com.get_callable_name(v) or v: v for v in func}
        else:
            func = {col: func for col in obj}

    if is_dict_like(func):
        func = cast(AggFuncTypeDict, func)
        return self.transform_dict_like(func)

    # func is either str or callable
    func = cast(AggFuncTypeBase, func)
    try:
        result = self.transform_str_or_callable(func)
    except Exception as err:
        # chain the original exception so the root cause is not lost
        raise ValueError("Transform function failed") from err

    # Functions that transform may return empty Series/DataFrame
    # when the dtype is not appropriate
    if (
        isinstance(result, (ABCSeries, ABCDataFrame))
        and result.empty
        and not obj.empty
    ):
        raise ValueError("Transform function failed")
    if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals(
        obj.index
    ):
        raise ValueError("Function did not transform")

    return result
def transform(obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args, **kwargs) -> FrameOrSeries:
    """
    Transform a DataFrame or Series

    Parameters
    ----------
    obj : DataFrame or Series
        Object to compute the transform on.
    func : string, function, list, or dictionary
        Function(s) to compute the transform with.
    axis : {0 or 'index', 1 or 'columns'}
        Axis along which the function is applied:

        * 0 or 'index': apply function to each column.
        * 1 or 'columns': apply function to each row.

    Returns
    -------
    DataFrame or Series
        Result of applying ``func`` along the given axis of the
        Series or DataFrame.

    Raises
    ------
    ValueError
        If the transform function fails or does not transform.
    """
    is_series = obj.ndim == 1

    if obj._get_axis_number(axis) == 1:
        # axis=1: recurse on the transpose, then transpose the result back
        assert not is_series
        return transform(obj.T, func, 0, *args, **kwargs).T

    if isinstance(func, list):
        # Convert a list of funcs to the equivalent dict form
        if is_series:
            func = {com.get_callable_name(v) or v: v for v in func}
        else:
            func = {col: func for col in obj}

    if isinstance(func, dict):
        return transform_dict_like(obj, func, *args, **kwargs)

    # func is either str or callable
    try:
        result = transform_str_or_callable(obj, func, *args, **kwargs)
    except Exception as err:
        # chain the original exception so the root cause is not lost
        raise ValueError("Transform function failed") from err

    # Functions that transform may return empty Series/DataFrame
    # when the dtype is not appropriate
    if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty:
        raise ValueError("Transform function failed")
    if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals(
        obj.index
    ):
        raise ValueError("Function did not transform")

    return result
def aggregation(
    input_df: pd.DataFrame,
    group_key: str,
    group_values: List[str],
    agg_methods: List[Union[str, FunctionType]],
) -> Tuple[pd.DataFrame, List[str]]:
    """
    Aggregate values after grouping table rows by a given key.

    Args:
        input_df:
            Input data frame.
        group_key:
            Used to determine the groups for the groupby.
        group_values:
            Used to aggregate values for the groupby.
        agg_methods:
            List of function or function names, e.g. ['mean', 'max', 'min', numpy.mean].
            Do not use a lambda function because the name attribute of the lambda function cannot generate a unique string of column names in <lambda>.
    Returns:
        Tuple of output dataframe and new column names.
    """
    new_df = input_df.copy()

    # Validate all methods up-front so no partial merge happens on bad input.
    new_cols = []
    for agg_method in agg_methods:
        if _is_lambda_function(agg_method):
            raise ValueError('Not supported lambda function.')
        elif isinstance(agg_method, str):
            pass
        elif callable(agg_method):
            # Accept any callable (not only FunctionType) so builtins such as
            # ``max`` and ``functools.partial`` objects work, matching the
            # "str or Callable" promise in the error message below.
            pass
        else:
            raise ValueError('Supported types are: {} or {}.'
                             ' Got {} instead.'.format(str, Callable, type(agg_method)))

    for agg_method in agg_methods:
        for col in group_values:
            # only str or callable reach this point
            if isinstance(agg_method, str):
                agg_method_name = agg_method
            else:
                agg_method_name = get_callable_name(agg_method)
            new_col = "agg_{}_{}_by_{}".format(agg_method_name, col, group_key)

            df_agg = (input_df[[col] + [group_key]].groupby(group_key)[[col]].agg(agg_method))
            df_agg.columns = [new_col]
            new_cols.append(new_col)
            # left-merge each aggregate back onto the (growing) output frame
            new_df = new_df.merge(df_agg, how="left", right_index=True, left_on=group_key)
    return new_df, new_cols
def apply(self, f, data, axis=0):
    """
    Apply ``f`` to each group of ``data`` with a ``fast_apply`` attempt
    first and a Python-level fallback loop.

    Returns ``(group_keys, result_values, mutated)``; ``mutated`` flags
    whether ``f`` appears to have modified a group in place.
    """
    mutated = self.mutated
    splitter = self._get_splitter(data, axis=axis)
    group_keys = self._get_group_keys()
    result_values = None

    # oh boy
    f_name = com.get_callable_name(f)
    if (
        f_name not in base.plotting_methods
        and hasattr(splitter, "fast_apply")
        and axis == 0
    ):
        try:
            result_values, mutated = splitter.fast_apply(f, group_keys)

            # If the fast apply path could be used we can return here.
            # Otherwise we need to fall back to the slow implementation.
            if len(result_values) == len(group_keys):
                return group_keys, result_values, mutated

        except libreduction.InvalidApply:
            # Cannot fast apply on MultiIndex (_has_complex_internals).
            # This Exception is also raised if `f` triggers an exception
            # but it is preferable to raise the exception in Python.
            pass
        except TypeError as err:
            if "Cannot convert" in str(err):
                # via apply_frame_axis0 if we pass a non-ndarray
                pass
            else:
                raise

    for key, (i, group) in zip(group_keys, splitter):
        object.__setattr__(group, "name", key)

        # result_values is None if fast apply path wasn't taken
        # or fast apply aborted with an unexpected exception.
        # In either case, initialize the result list and perform
        # the slow iteration.
        if result_values is None:
            result_values = []

        # If result_values is not None we're in the case that the
        # fast apply loop was broken prematurely but we have
        # already the result for the first group which we can reuse.
        elif i == 0:
            continue

        # group might be modified
        group_axes = _get_axes(group)
        res = f(group)
        if not _is_indexed_like(res, group_axes):
            mutated = True
        result_values.append(res)

    return group_keys, result_values, mutated
def apply(self, f, data, axis=0):
    """
    Apply ``f`` to each group, trying ``fast_apply`` first and falling
    back to pure-Python iteration over the splitter's groups.

    Returns ``(group_keys, result_values, mutated)``.
    """
    mutated = self.mutated
    splitter = self._get_splitter(data, axis=axis)
    group_keys = self._get_group_keys()
    result_values = None

    # oh boy
    f_name = com.get_callable_name(f)
    if (f_name not in base.plotting_methods
            and hasattr(splitter, 'fast_apply') and axis == 0):
        try:
            result_values, mutated = splitter.fast_apply(f, group_keys)

            # If the fast apply path could be used we can return here.
            # Otherwise we need to fall back to the slow implementation.
            if len(result_values) == len(group_keys):
                return group_keys, result_values, mutated

        except reduction.InvalidApply:
            # Cannot fast apply on MultiIndex (_has_complex_internals).
            # This Exception is also raised if `f` triggers an exception
            # but it is preferable to raise the exception in Python.
            pass
        except Exception:
            # raise this error to the caller
            # NOTE: the slow loop below re-invokes `f`, so the same error
            # surfaces from Python-level code rather than from Cython
            pass

    for key, (i, group) in zip(group_keys, splitter):
        object.__setattr__(group, 'name', key)

        # result_values is None if fast apply path wasn't taken
        # or fast apply aborted with an unexpected exception.
        # In either case, initialize the result list and perform
        # the slow iteration.
        if result_values is None:
            result_values = []

        # If result_values is not None we're in the case that the
        # fast apply loop was broken prematurely but we have
        # already the result for the first group which we can reuse.
        elif i == 0:
            continue

        # group might be modified
        group_axes = _get_axes(group)
        res = f(group)
        if not _is_indexed_like(res, group_axes):
            mutated = True
        result_values.append(res)

    return group_keys, result_values, mutated
def agg_list_like(self) -> DataFrame | Series:
    """
    Compute aggregation in the case of a list-like argument.

    Returns
    -------
    Result of aggregation.
    """
    from pandas.core.reshape.concat import concat

    obj = self.obj
    arg = cast(List[AggFuncTypeBase], self.f)

    if not isinstance(obj, SelectionMixin):
        # i.e. obj is Series or DataFrame
        selected_obj = obj
    elif obj._selected_obj.ndim == 1:
        # For SeriesGroupBy this matches _obj_with_exclusions
        selected_obj = obj._selected_obj
    else:
        selected_obj = obj._obj_with_exclusions

    results = []
    keys = []
    # columns/funcs that failed to aggregate; reported in a single
    # FutureWarning at the end instead of raising
    failed_names = []

    depr_nuisance_columns_msg = (
        "{} did not aggregate successfully. If any error is "
        "raised this will raise in a future version of pandas. "
        "Drop these columns/ops to avoid this warning."
    )

    # degenerate case
    if selected_obj.ndim == 1:
        for a in arg:
            colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
            try:
                new_res = colg.aggregate(a)
            except TypeError:
                failed_names.append(com.get_callable_name(a) or a)
            else:
                results.append(new_res)

                # make sure we find a good name
                name = com.get_callable_name(a) or a
                keys.append(name)

    # multiples
    else:
        indices = []
        for index, col in enumerate(selected_obj):
            colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
            try:
                # Capture and suppress any warnings emitted by us in the call
                # to agg below, but pass through any warnings that were
                # generated otherwise.
                # This is necessary because of https://bugs.python.org/issue29672
                # See GH #43741 for more details
                with warnings.catch_warnings(record=True) as record:
                    new_res = colg.aggregate(arg)
                if len(record) > 0:
                    match = re.compile(depr_nuisance_columns_msg.format(".*"))
                    for warning in record:
                        if re.match(match, str(warning.message)):
                            failed_names.append(col)
                        else:
                            # re-emit warnings that are not ours, preserving
                            # their original location metadata
                            warnings.warn_explicit(
                                message=warning.message,
                                category=warning.category,
                                filename=warning.filename,
                                lineno=warning.lineno,
                            )

            except (TypeError, DataError):
                failed_names.append(col)
            except ValueError as err:
                # cannot aggregate
                if "Must produce aggregated value" in str(err):
                    # raised directly in _aggregate_named
                    failed_names.append(col)
                elif "no results" in str(err):
                    # reached in test_frame_apply.test_nuiscance_columns
                    # where the colg.aggregate(arg) ends up going through
                    # the selected_obj.ndim == 1 branch above with arg == ["sum"]
                    # on a datetime64[ns] column
                    failed_names.append(col)
                else:
                    raise
            else:
                results.append(new_res)
                indices.append(index)

        keys = selected_obj.columns.take(indices)

    # if we are empty
    if not len(results):
        raise ValueError("no results")

    if len(failed_names) > 0:
        warnings.warn(
            depr_nuisance_columns_msg.format(failed_names),
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    try:
        concatenated = concat(results, keys=keys, axis=1, sort=False)
    except TypeError as err:
        # we are concatting non-NDFrame objects,
        # e.g. a list of scalars
        from pandas import Series

        result = Series(results, index=keys, name=obj.name)
        if is_nested_object(result):
            raise ValueError(
                "cannot combine transform and aggregation operations"
            ) from err
        return result
    else:
        # Concat uses the first index to determine the final indexing order.
        # The union of a shorter first index with the other indices causes
        # the index sorting to be different from the order of the aggregating
        # functions. Reindex if this is the case.
        index_size = concatenated.index.size
        full_ordered_index = next(
            result.index for result in results if result.index.size == index_size
        )
        return concatenated.reindex(full_ordered_index, copy=False)
def relabel_result(
    result: DataFrame | Series,
    func: dict[str, list[Callable | str]],
    columns: Iterable[Hashable],
    order: Iterable[int],
) -> dict[Hashable, Series]:
    """
    Internal function to reorder result if relabelling is True for
    dataframe.agg, and return the reordered result in dict.

    Parameters:
    ----------
    result: Result from aggregation
    func: Dict of (column name, funcs)
    columns: New columns name for relabelling
    order: New order for relabelling

    Examples:
    ---------
    >>> result = DataFrame({"A": [np.nan, 2, np.nan],
    ...       "C": [6, np.nan, np.nan], "B": [np.nan, 4, 2.5]})  # doctest: +SKIP
    >>> funcs = {"A": ["max"], "C": ["max"], "B": ["mean", "min"]}
    >>> columns = ("foo", "aab", "bar", "dat")
    >>> order = [0, 1, 2, 3]
    >>> _relabel_result(result, func, columns, order)  # doctest: +SKIP
    dict(A=Series([2.0, NaN, NaN, NaN], index=["foo", "aab", "bar", "dat"]),
         C=Series([NaN, 6.0, NaN, NaN], index=["foo", "aab", "bar", "dat"]),
         B=Series([NaN, NaN, 2.5, 4.0], index=["foo", "aab", "bar", "dat"]))
    """
    from pandas.core.indexes.base import Index

    # user-provided output names, sorted back into the user's order
    reordered_indexes = [
        pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1])
    ]
    reordered_result_in_dict: dict[Hashable, Series] = {}
    idx = 0

    # only reorder when several result columns exist (a DataFrame result)
    reorder_mask = not isinstance(result, ABCSeries) and len(result.columns) > 1
    for col, fun in func.items():
        s = result[col].dropna()

        # In the `_aggregate`, the callable names are obtained and used in `result`, and
        # these names are ordered alphabetically. e.g.
        #           C2   C1
        # <lambda>   1  NaN
        # amax     NaN  4.0
        # max      NaN  4.0
        # sum     18.0  6.0
        # Therefore, the order of functions for each column could be shuffled
        # accordingly so need to get the callable name if it is not parsed names, and
        # reorder the aggregated result for each column.
        # e.g. if df.agg(c1=("C2", sum), c2=("C2", lambda x: min(x))), correct order is
        # [sum, <lambda>], but in `result`, it will be [<lambda>, sum], and we need to
        # reorder so that aggregated values map to their functions regarding the order.

        # However there is only one column being used for aggregation, not need to
        # reorder since the index is not sorted, and keep as is in `funcs`, e.g.
        # A
        # min   1.0
        # mean  1.5
        # mean  1.5
        if reorder_mask:
            fun = [
                com.get_callable_name(f) if not isinstance(f, str) else f for f in fun
            ]
            col_idx_order = Index(s.index).get_indexer(fun)
            s = s[col_idx_order]

        # assign the new user-provided "named aggregation" as index names, and reindex
        # it based on the whole user-provided names.
        s.index = reordered_indexes[idx:idx + len(fun)]
        reordered_result_in_dict[col] = s.reindex(columns, copy=False)
        idx = idx + len(fun)
    return reordered_result_in_dict
def _aggregate_multiple_funcs(self, arg, _axis):
    """
    Aggregate with several functions, concatenating the per-function
    (or per-column) results along axis 1.
    """
    from pandas.core.reshape.concat import concat

    if _axis != 0:
        raise NotImplementedError("axis other than 0 is not supported")

    if self._selected_obj.ndim == 1:
        obj = self._selected_obj
    else:
        obj = self._obj_with_exclusions

    collected = []
    labels = []

    if obj.ndim == 1:
        # degenerate case: one column, one result per aggregation func
        for func in arg:
            colg = self._gotitem(obj.name, ndim=1, subset=obj)
            try:
                res = colg.aggregate(func)
            except TypeError:
                continue
            collected.append(res)
            # make sure we find a good name
            labels.append(com.get_callable_name(func) or func)
    else:
        # multiples: one result per column
        for pos, col in enumerate(obj):
            colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, pos])
            try:
                res = colg.aggregate(arg)
            except (TypeError, DataError):
                continue
            except ValueError as err:
                msg = str(err)
                if "Must produce aggregated value" in msg:
                    # raised directly in _aggregate_named
                    continue
                if "no results" in msg:
                    # raised direcly in _aggregate_multiple_funcs
                    continue
                raise
            collected.append(res)
            labels.append(col)

    if not len(collected):
        raise ValueError("no results")

    try:
        return concat(collected, keys=labels, axis=1, sort=False)
    except TypeError as err:
        # we are concatting non-NDFrame objects,
        # e.g. a list of scalars
        from pandas import Series

        result = Series(collected, index=labels, name=self.name)
        if is_nested_object(result):
            raise ValueError(
                "cannot combine transform and aggregation operations"
            ) from err
        return result
def _aggregate_multiple_funcs(self, arg, _level, _axis):
    """
    Aggregate using multiple functions, concatenating the results along
    axis 1.  Failing column/func combinations are silently dropped; an
    all-failed aggregation raises ``ValueError("no results")``.
    """
    from pandas.core.reshape.concat import concat

    if _axis != 0:
        raise NotImplementedError("axis other than 0 is not supported")

    if self._selected_obj.ndim == 1:
        obj = self._selected_obj
    else:
        obj = self._obj_with_exclusions

    results = []
    keys = []

    # degenerate case
    if obj.ndim == 1:
        for a in arg:
            try:
                colg = self._gotitem(obj.name, ndim=1, subset=obj)
                results.append(colg.aggregate(a))

                # make sure we find a good name
                name = com.get_callable_name(a) or a
                keys.append(name)
            except (TypeError, DataError):
                pass
            except SpecificationError:
                # explicit re-raise: spec errors must not be swallowed
                raise

    # multiples
    else:
        for index, col in enumerate(obj):
            try:
                colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, index])
                results.append(colg.aggregate(arg))
                keys.append(col)
            except (TypeError, DataError):
                pass
            except ValueError:
                # cannot aggregate
                continue
            except SpecificationError:
                # explicit re-raise: spec errors must not be swallowed
                raise

    # if we are empty
    if not len(results):
        raise ValueError("no results")

    try:
        return concat(results, keys=keys, axis=1, sort=False)
    except TypeError:
        # we are concatting non-NDFrame objects,
        # e.g. a list of scalars
        from pandas.core.dtypes.cast import is_nested_object
        from pandas import Series
        result = Series(results, index=keys, name=self.name)
        if is_nested_object(result):
            raise ValueError("cannot combine transform and "
                             "aggregation operations")
        return result
def agg_list_like(self) -> FrameOrSeriesUnion:
    """
    Compute aggregation in the case of a list-like argument.

    Returns
    -------
    Result of aggregation.
    """
    from pandas.core.reshape.concat import concat

    obj = self.obj
    arg = cast(List[AggFuncTypeBase], self.f)

    if not isinstance(obj, SelectionMixin):
        # i.e. obj is Series or DataFrame
        selected_obj = obj
    elif obj._selected_obj.ndim == 1:
        # For SeriesGroupBy this matches _obj_with_exclusions
        selected_obj = obj._selected_obj
    else:
        selected_obj = obj._obj_with_exclusions

    results = []
    keys = []

    # degenerate case
    if selected_obj.ndim == 1:
        for a in arg:
            colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
            try:
                new_res = colg.aggregate(a)
            except TypeError:
                pass
            else:
                results.append(new_res)

                # make sure we find a good name
                name = com.get_callable_name(a) or a
                keys.append(name)

    # multiples
    else:
        # track which columns succeeded so keys can be taken positionally
        indices = []
        for index, col in enumerate(selected_obj):
            colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
            try:
                new_res = colg.aggregate(arg)
            except (TypeError, DataError):
                pass
            except ValueError as err:
                # cannot aggregate
                if "Must produce aggregated value" in str(err):
                    # raised directly in _aggregate_named
                    pass
                elif "no results" in str(err):
                    # reached in test_frame_apply.test_nuiscance_columns
                    # where the colg.aggregate(arg) ends up going through
                    # the selected_obj.ndim == 1 branch above with arg == ["sum"]
                    # on a datetime64[ns] column
                    pass
                else:
                    raise
            else:
                results.append(new_res)
                indices.append(index)

        keys = selected_obj.columns.take(indices)

    # if we are empty
    if not len(results):
        raise ValueError("no results")

    try:
        concatenated = concat(results, keys=keys, axis=1, sort=False)
    except TypeError as err:
        # we are concatting non-NDFrame objects,
        # e.g. a list of scalars
        from pandas import Series

        result = Series(results, index=keys, name=obj.name)
        if is_nested_object(result):
            raise ValueError(
                "cannot combine transform and aggregation operations"
            ) from err
        return result
    else:
        # Concat uses the first index to determine the final indexing order.
        # The union of a shorter first index with the other indices causes
        # the index sorting to be different from the order of the aggregating
        # functions. Reindex if this is the case.
        index_size = concatenated.index.size
        full_ordered_index = next(
            result.index for result in results if result.index.size == index_size
        )
        return concatenated.reindex(full_ordered_index, copy=False)
def transform(obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args, **kwargs) -> FrameOrSeries:
    """
    Transform a DataFrame or Series

    Parameters
    ----------
    obj : DataFrame or Series
        Object to compute the transform on.
    func : string, function, list, or dictionary
        Function(s) to compute the transform with.
    axis : {0 or 'index', 1 or 'columns'}
        Axis along which the function is applied:

        * 0 or 'index': apply function to each column.
        * 1 or 'columns': apply function to each row.

    Returns
    -------
    DataFrame or Series
        Result of applying ``func`` along the given axis of the
        Series or DataFrame.

    Raises
    ------
    ValueError
        If the transform function fails or does not transform.
    """
    from pandas.core.reshape.concat import concat

    is_series = obj.ndim == 1

    if obj._get_axis_number(axis) == 1:
        # axis=1: recurse on the transpose, then transpose back
        assert not is_series
        return transform(obj.T, func, 0, *args, **kwargs).T

    if isinstance(func, list):
        if is_series:
            func = {com.get_callable_name(v) or v: v for v in func}
        else:
            func = {col: func for col in obj}

    if isinstance(func, dict):
        if not is_series:
            cols = sorted(set(func.keys()) - set(obj.columns))
            if len(cols) > 0:
                raise SpecificationError(f"Column(s) {cols} do not exist")

        if any(isinstance(v, dict) for v in func.values()):
            # GH 15931 - deprecation of renaming keys
            raise SpecificationError("nested renamer is not supported")

        results = {}
        for name, how in func.items():
            colg = obj._gotitem(name, ndim=1)
            try:
                results[name] = transform(colg, how, 0, *args, **kwargs)
            except Exception as e:
                # only "did not transform" is fatal; other per-column
                # failures are deliberately dropped from the result
                if str(e) == "Function did not transform":
                    raise e

        # combine results
        if len(results) == 0:
            raise ValueError("Transform function failed")
        return concat(results, axis=1)

    # func is either str or callable
    try:
        if isinstance(func, str):
            result = obj._try_aggregate_string_function(func, *args, **kwargs)
        else:
            f = obj._get_cython_func(func)
            if f and not args and not kwargs:
                # cython-optimized equivalent exists; use it directly
                result = getattr(obj, f)()
            else:
                try:
                    result = obj.apply(func, args=args, **kwargs)
                except Exception:
                    # fall back to calling func on the whole object
                    result = func(obj, *args, **kwargs)
    except Exception:
        raise ValueError("Transform function failed")

    # Functions that transform may return empty Series/DataFrame
    # when the dtype is not appropriate
    if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty:
        raise ValueError("Transform function failed")
    if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals(
        obj.index
    ):
        raise ValueError("Function did not transform")

    return result
def agg_list_like(self) -> FrameOrSeriesUnion:
    """
    Compute aggregation in the case of a list-like argument.

    Returns
    -------
    Result of aggregation.
    """
    from pandas.core.reshape.concat import concat

    obj = self.obj
    arg = cast(List[AggFuncTypeBase], self.f)

    if not isinstance(obj, SelectionMixin):
        # i.e. obj is Series or DataFrame
        selected_obj = obj
    elif obj._selected_obj.ndim == 1:
        selected_obj = obj._selected_obj
    else:
        selected_obj = obj._obj_with_exclusions

    results = []
    keys = []

    # degenerate case
    if selected_obj.ndim == 1:
        for a in arg:
            colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
            try:
                new_res = colg.aggregate(a)
            except TypeError:
                pass
            else:
                results.append(new_res)

                # make sure we find a good name
                name = com.get_callable_name(a) or a
                keys.append(name)

    # multiples
    else:
        for index, col in enumerate(selected_obj):
            colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
            try:
                new_res = colg.aggregate(arg)
            except (TypeError, DataError):
                pass
            except ValueError as err:
                # cannot aggregate
                if "Must produce aggregated value" in str(err):
                    # raised directly in _aggregate_named
                    pass
                elif "no results" in str(err):
                    # raised directly in _aggregate_multiple_funcs
                    pass
                else:
                    raise
            else:
                results.append(new_res)
                keys.append(col)

    # if we are empty
    if not len(results):
        raise ValueError("no results")

    try:
        return concat(results, keys=keys, axis=1, sort=False)
    except TypeError as err:
        # we are concatting non-NDFrame objects,
        # e.g. a list of scalars
        from pandas import Series

        result = Series(results, index=keys, name=obj.name)
        if is_nested_object(result):
            raise ValueError(
                "cannot combine transform and aggregation operations"
            ) from err
        return result
def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
    """
    Apply ``f`` to each group of ``data``, using the Cython
    ``fast_apply`` path when eligible and falling back to Python-level
    iteration otherwise.

    Returns ``(group_keys, result_values, mutated)``.
    """
    mutated = self.mutated
    splitter = self._get_splitter(data, axis=axis)
    group_keys = self._get_group_keys()
    result_values = None

    if data.ndim == 2 and np.any(data.dtypes.apply(is_extension_array_dtype)):
        # calling splitter.fast_apply will raise TypeError via apply_frame_axis0
        # if we pass EA instead of ndarray
        # TODO: can we have a workaround for EAs backed by ndarray?
        pass

    elif isinstance(data._mgr, ArrayManager):
        # TODO(ArrayManager) don't use fast_apply / libreduction.apply_frame_axis0
        # for now -> relies on BlockManager internals
        pass

    elif (
        com.get_callable_name(f) not in base.plotting_methods
        and isinstance(splitter, FrameSplitter)
        and axis == 0
        # fast_apply/libreduction doesn't allow non-numpy backed indexes
        and not data.index._has_complex_internals
    ):
        try:
            sdata = splitter.sorted_data
            result_values, mutated = splitter.fast_apply(f, sdata, group_keys)

        except IndexError:
            # This is a rare case in which re-running in python-space may
            # make a difference, see test_apply_mutate.test_mutate_groups
            pass

        else:
            # If the fast apply path could be used we can return here.
            # Otherwise we need to fall back to the slow implementation.
            if len(result_values) == len(group_keys):
                return group_keys, result_values, mutated

    for key, (i, group) in zip(group_keys, splitter):
        object.__setattr__(group, "name", key)

        # result_values is None if fast apply path wasn't taken
        # or fast apply aborted with an unexpected exception.
        # In either case, initialize the result list and perform
        # the slow iteration.
        if result_values is None:
            result_values = []

        # If result_values is not None we're in the case that the
        # fast apply loop was broken prematurely but we have
        # already the result for the first group which we can reuse.
        elif i == 0:
            continue

        # group might be modified
        group_axes = group.axes
        res = f(group)
        if not _is_indexed_like(res, group_axes, axis):
            mutated = True
        result_values.append(res)

    return group_keys, result_values, mutated
def transform(self) -> DataFrame | Series:
    """
    Transform a DataFrame or Series.

    Returns
    -------
    DataFrame or Series
        Result of applying ``func`` along the given axis of the
        Series or DataFrame.

    Raises
    ------
    ValueError
        If the transform function fails or does not transform.
    """
    obj = self.obj
    func = self.orig_f
    axis = self.axis
    args = self.args
    kwargs = self.kwargs

    is_series = obj.ndim == 1

    if obj._get_axis_number(axis) == 1:
        # axis=1: transform the transpose column-wise, then transpose back
        assert not is_series
        return obj.T.transform(func, 0, *args, **kwargs).T

    if is_list_like(func) and not is_dict_like(func):
        # convert a list of funcs into the equivalent dict spec
        func = cast(List[AggFuncTypeBase], func)
        if is_series:
            func = {com.get_callable_name(v) or v: v for v in func}
        else:
            func = {col: func for col in obj}

    if is_dict_like(func):
        return self.transform_dict_like(cast(AggFuncTypeDict, func))

    # func is either str or callable
    func = cast(AggFuncTypeBase, func)
    try:
        result = self.transform_str_or_callable(func)
    except TypeError:
        raise
    except Exception as err:
        raise ValueError("Transform function failed") from err

    # Functions that transform may return empty Series/DataFrame
    # when the dtype is not appropriate
    is_ndframe = isinstance(result, (ABCSeries, ABCDataFrame))
    if is_ndframe and result.empty and not obj.empty:
        raise ValueError("Transform function failed")

    # error: Argument 1 to "__get__" of "AxisProperty" has incompatible type
    # "Union[Series, DataFrame, GroupBy[Any], SeriesGroupBy,
    # DataFrameGroupBy, BaseWindow, Resampler]"; expected "Union[DataFrame,
    # Series]"
    if not is_ndframe or not result.index.equals(
        obj.index  # type:ignore[arg-type]
    ):
        raise ValueError("Function did not transform")

    return result