def _index_grouped(self):
    """
    Construct an index of group IDs.

    Returns
    -------
    dict
        A dict of {group name -> group labels} values.

    See Also
    --------
    pandas.core.groupby.GroupBy.groups
    """
    # The result is computed lazily and memoized in `_index_grouped_cache`.
    if self._index_grouped_cache is None:
        # A query-compiler `by` holding several columns means grouping on
        # multiple columns even if `self._is_multi_by` was not set.
        if hasattr(self._by, "columns") and len(self._by.columns) > 1:
            by = list(self._by.columns)
            is_multi_by = True
        else:
            by = self._by
            is_multi_by = self._is_multi_by
        if is_multi_by:
            # Because we are doing a collect (to_pandas) here and then groupby, we
            # end up using pandas implementation. Add the warning so the user is
            # aware.
            ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
            ErrorMessage.default_to_pandas("Groupby with multiple columns")
            if isinstance(by, list) and all(
                hashable(o)
                and (
                    o in self._df
                    or o in self._df._query_compiler.get_index_names(self._axis)
                )
                for o in by
            ):
                # Every key names a column or an index level of the frame
                # itself, so only those columns need to be collected.
                pandas_df = self._df._query_compiler.getitem_column_array(
                    by
                ).to_pandas()
            else:
                # Arbitrary keys (e.g. external Series): collect the whole frame.
                by = try_cast_to_pandas(by, squeeze=True)
                pandas_df = self._df._to_pandas()
            self._index_grouped_cache = pandas_df.groupby(by=by).groups
        else:
            if isinstance(self._by, type(self._query_compiler)):
                # Single-column query compiler: group on its raw values.
                by = self._by.to_pandas().squeeze().values
            else:
                by = self._by
            if self._axis == 0:
                self._index_grouped_cache = self._index.groupby(by)
            else:
                self._index_grouped_cache = self._columns.groupby(by)
    return self._index_grouped_cache
def caller(
    cls,
    query_compiler,
    by,
    axis,
    groupby_args,
    map_args,
    map_func,
    numeric_only=True,
    **kwargs,
):
    """
    Execute GroupBy aggregation with MapReduce approach.

    Parameters
    ----------
    query_compiler : BaseQueryCompiler
        Frame to group.
    by : BaseQueryCompiler, column or index label, Grouper or list of such
        Object that determines groups.
    axis : {0, 1}
        Axis to group and apply aggregation function along.
    groupby_args : dict
        Dictionary which carries arguments for ``pandas.DataFrame.groupby``.
    map_args : dict
        Arguments which will be passed to `map_func`.
    map_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame
        Function to apply to the grouped frame at the Map phase.
    numeric_only : bool, default: True
        Whether or not to drop non-numeric columns before executing GroupBy.
    **kwargs : dict
        Additional arguments passed to ``cls.build_map_reduce_functions``.

    Returns
    -------
    The same type as `query_compiler`
        QueryCompiler which carries the result of GroupBy aggregation.
    """
    # Default to pandas when `by` is neither a query compiler nor a hashable
    # label, or when it is a Grouper.
    # FIX: the original condition read
    # ``isinstance(by, (type(query_compiler)) or hashable(by))``; due to the
    # misplaced parenthesis, ``(type(query_compiler)) or hashable(by)``
    # evaluated first to ``type(query_compiler)`` (a class is always truthy),
    # so ``hashable(by)`` was never consulted. The corrected form matches the
    # sibling `caller` implementation in this file.
    if not (
        isinstance(by, type(query_compiler)) or hashable(by)
    ) or isinstance(by, pandas.Grouper):
        by = try_cast_to_pandas(by, squeeze=True)
        # A dict of functions is applied through ``.agg``; anything else is
        # assumed to be directly callable on the groupby object.
        default_func = (
            (lambda grp: grp.agg(map_func)) if isinstance(map_func, dict) else map_func
        )
        return query_compiler.default_to_pandas(
            lambda df: default_func(
                df.groupby(by=by, axis=axis, **groupby_args), **map_args
            )
        )
    assert axis == 0, "Can only groupby reduce with axis=0"

    if numeric_only:
        # Restrict the computation to numeric columns only.
        qc = query_compiler.getitem_column_array(
            query_compiler._modin_frame._numeric_columns(True)
        )
    else:
        qc = query_compiler
    map_fn, reduce_fn = cls.build_map_reduce_functions(
        by=by,
        axis=axis,
        groupby_args=groupby_args,
        map_func=map_func,
        map_args=map_args,
        **kwargs,
    )
    # If `by` is a Modin frame, its partitions are broadcast to every `self`
    # partition by the engine; otherwise `by` was already bound to the Map
    # function in `build_map_reduce_functions`.
    broadcastable_by = getattr(by, "_modin_frame", None)
    apply_indices = list(map_func.keys()) if isinstance(map_func, dict) else None
    new_modin_frame = qc._modin_frame.groupby_reduce(
        axis, broadcastable_by, map_fn, reduce_fn, apply_indices=apply_indices
    )
    result = query_compiler.__constructor__(new_modin_frame)
    # "__reduced__" is an internal sentinel index name; hide it from the user.
    if result.index.name == "__reduced__":
        result.index.name = None
    return result
def __init__(
    self,
    df,
    by,
    axis,
    level,
    as_index,
    sort,
    group_keys,
    squeeze,
    idx_name,
    drop,
    **kwargs,
):
    """
    Build a groupby object over `df` grouped by `by`.

    Stores the frame, its query compiler, the grouping keys and the
    pandas-style groupby parameters, and precomputes whether the grouping
    involves multiple columns (``self._is_multi_by``).
    """
    self._axis = axis
    self._idx_name = idx_name
    self._df = df
    self._query_compiler = self._df._query_compiler
    self._columns = self._query_compiler.columns
    self._by = by
    self._drop = drop

    qc_type = type(self._query_compiler)
    if (level is None and is_list_like(by)) or isinstance(by, qc_type):
        # This tells us whether or not there are multiple columns/rows in the groupby
        if isinstance(by, qc_type):
            # A 2D query compiler groups on several columns at once.
            self._is_multi_by = len(by.columns) > 1
        elif axis == 0:
            # A list groups on multiple keys when every entry is either a
            # frame column, a query compiler, or itself list-like.
            self._is_multi_by = all(
                (hashable(obj) and obj in self._query_compiler.columns)
                or isinstance(obj, qc_type)
                or is_list_like(obj)
                for obj in self._by
            )
        else:
            self._is_multi_by = False
    else:
        self._is_multi_by = False
    self._level = level
    self._kwargs = {
        "level": level,
        "sort": sort,
        "as_index": as_index,
        "group_keys": group_keys,
    }
    self._squeeze = squeeze
    self._kwargs.update(kwargs)
def __getitem__(self, key): kwargs = {**self._kwargs.copy(), "squeeze": self._squeeze} # Most of time indexing DataFrameGroupBy results in another DataFrameGroupBy object unless circumstances are # special in which case SeriesGroupBy has to be returned. Such circumstances are when key equals to a single # column name and is not a list of column names or list of one column name. make_dataframe = True if self._drop and self._as_index: if not isinstance(key, list): key = [key] kwargs["squeeze"] = True make_dataframe = False # When `as_index` is False, pandas will always convert to a `DataFrame`, we # convert to a list here so that the result will be a `DataFrame`. elif not self._as_index and not isinstance(key, list): # Sometimes `__getitem__` doesn't only get the item, it also gets the `by` # column. This logic is here to ensure that we also get the `by` data so # that it is there for `as_index=False`. if ( isinstance(self._by, type(self._query_compiler)) and all(c in self._columns for c in self._by.columns) and self._drop ): key = list(self._by.columns) + [key] else: key = [key] if isinstance(key, list) and (make_dataframe or not self._as_index): return DataFrameGroupBy( self._df[key], self._by, self._axis, idx_name=self._idx_name, drop=self._drop, **kwargs, ) if ( self._is_multi_by and isinstance(self._by, list) and not all(hashable(o) and o in self._df for o in self._by) ): raise NotImplementedError( "Column lookups on GroupBy with arbitrary Series in by" " is not yet supported." ) return SeriesGroupBy( self._df[key], self._by, self._axis, idx_name=self._idx_name, drop=False, **kwargs, )
def is_label(obj, label, axis=0):
    """
    Check whether or not 'obj' contain column or index level with name 'label'.

    Parameters
    ----------
    obj : modin.pandas.DataFrame, modin.pandas.Series or modin.core.storage_formats.base.BaseQueryCompiler
        Object to check.
    label : object
        Label name to check.
    axis : {0, 1}, default: 0
        Axis to search for `label` along.

    Returns
    -------
    bool
        True if check is successful, False otherwise.
    """
    # Fall back to `obj` itself when it is already a query compiler.
    qc = getattr(obj, "_query_compiler", obj)
    # ``axis ^ 1`` flips the axis: columns live on the axis opposite to the
    # one whose index levels are being searched.
    return hashable(label) and (
        label in qc.get_axis(axis ^ 1) or label in qc.get_index_names(axis)
    )
def is_label(obj, label, axis=0):
    """
    Tell whether `obj` has a column or an index level named `label`.

    Parameters
    ----------
    obj : modin.pandas.DataFrame, modin.pandas.Series or modin.core.storage_formats.base.BaseQueryCompiler
        Object to inspect.
    label : object
        Name to look for.
    axis : {0, 1}, default: 0
        Axis along which `label` is searched.

    Returns
    -------
    bool
        True if `label` names a column (on the opposite axis) or an index
        level of `axis`, False otherwise.
    """
    compiler = getattr(obj, "_query_compiler", obj)
    # Unhashable objects can never be labels.
    if not hashable(label):
        return False
    opposite_axis = axis ^ 1
    return label in compiler.get_axis(opposite_axis) or label in compiler.get_index_names(
        axis
    )
def caller(
    cls,
    query_compiler,
    by,
    map_func,
    reduce_func,
    axis,
    groupby_kwargs,
    agg_args,
    agg_kwargs,
    drop=False,
    method=None,
    default_to_pandas_func=None,
):
    """
    Execute GroupBy aggregation with TreeReduce approach.

    Parameters
    ----------
    query_compiler : BaseQueryCompiler
        Frame to group.
    by : BaseQueryCompiler, column or index label, Grouper or list of such
        Object that determine groups.
    map_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame
        Function to apply to the `GroupByObject` at the Map phase.
    reduce_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame
        Function to apply to the `GroupByObject` at the Reduce phase.
    axis : {0, 1}
        Axis to group and apply aggregation function along.
        0 means index axis when 1 means column axis.
    groupby_kwargs : dict
        Dictionary which carries arguments for pandas.DataFrame.groupby.
    agg_args : list-like
        Positional arguments to pass to the aggregation functions.
    agg_kwargs : dict
        Keyword arguments to pass to the aggregation functions.
    drop : bool, default: False
        Indicates whether or not by-data came from the `self` frame.
    method : str, optional
        Name of the GroupBy aggregation function. This is a hint to be able to do special casing.
    default_to_pandas_func : callable(pandas.DataFrameGroupBy) -> pandas.DataFrame, optional
        The pandas aggregation function equivalent to the `map_func + reduce_func`.
        Used in case of defaulting to pandas. If not specified `map_func` is used.

    Returns
    -------
    The same type as `query_compiler`
        QueryCompiler which carries the result of GroupBy aggregation.
    """
    # Default to pandas when TreeReduce cannot be applied. Mind the operator
    # precedence: `and` binds tighter than `or`, so this reads as
    # ``axis != 0 OR (no level given AND (by is neither a query compiler nor
    # a hashable label, or is a Grouper))``.
    if (axis != 0
            or groupby_kwargs.get("level", None) is None
            and (not (isinstance(by, (type(query_compiler))) or hashable(by))
                 or isinstance(by, pandas.Grouper))):
        by = try_cast_to_pandas(by, squeeze=True)
        # Since 'by' may be a 2D query compiler holding columns to group by,
        # to_pandas will also produce a pandas DataFrame containing them.
        # So splitting 2D 'by' into a list of 1D Series using 'GroupBy.validate_by':
        by = GroupBy.validate_by(by)
        if default_to_pandas_func is None:
            # A dict aggregation goes through ``.agg``; otherwise `map_func`
            # itself is used as the pandas-level aggregation.
            default_to_pandas_func = ((lambda grp: grp.agg(map_func))
                                      if isinstance(map_func, dict) else map_func)
        return query_compiler.default_to_pandas(
            lambda df: default_to_pandas_func(
                df.groupby(by=by, axis=axis, **groupby_kwargs),
                *agg_args,
                **agg_kwargs,
            ))
    # The bug only occurs in the case of Categorical 'by', so we might want to check whether any of
    # the 'by' dtypes is Categorical before going into this branch, however triggering 'dtypes'
    # computation if they're not computed may take time, so we don't do it
    if not groupby_kwargs.get("sort", True) and isinstance(
            by, type(query_compiler)):
        ErrorMessage.missmatch_with_pandas(
            operation="df.groupby(categorical_by, sort=False)",
            message=
            ("the groupby keys will be sorted anyway, although the 'sort=False' was passed. "
             "See the following issue for more details: "
             "https://github.com/modin-project/modin/issues/3571"),
        )
        # Copy before mutating so the caller's dict is left untouched.
        groupby_kwargs = groupby_kwargs.copy()
        groupby_kwargs["sort"] = True
    map_fn, reduce_fn = cls.build_map_reduce_functions(
        by=by,
        axis=axis,
        groupby_kwargs=groupby_kwargs,
        map_func=map_func,
        reduce_func=reduce_func,
        agg_args=agg_args,
        agg_kwargs=agg_kwargs,
        drop=drop,
        method=method,
    )
    # If `by` is a ModinFrame, then its partitions will be broadcasted to every
    # `self` partition in a way determined by engine (modin_frame.groupby_reduce)
    # Otherwise `by` was already bound to the Map function in `build_map_reduce_functions`.
    broadcastable_by = getattr(by, "_modin_frame", None)
    apply_indices = list(map_func.keys()) if isinstance(map_func, dict) else None
    new_modin_frame = query_compiler._modin_frame.groupby_reduce(
        axis, broadcastable_by, map_fn, reduce_fn, apply_indices=apply_indices)
    result = query_compiler.__constructor__(new_modin_frame)
    # "__reduced__" is an internal sentinel index name; hide it from the user.
    if result.index.name == "__reduced__":
        result.index.name = None
    return result
def _compute_index_grouped(self, numerical=False):
    """
    Construct an index of group IDs.

    Parameters
    ----------
    numerical : bool, default: False
        Whether a group indices should be positional (True) or label-based (False).

    Returns
    -------
    dict
        A dict of {group name -> group indices} values.

    See Also
    --------
    pandas.core.groupby.GroupBy.groups
    """
    # We end up using pure pandas to compute group indices, so raising a warning
    ErrorMessage.default_to_pandas("Group indices computation")

    # Splitting level-by and column-by since we serialize them in a different ways
    by = None
    level = []
    if self._level is not None:
        level = self._level
        if not isinstance(level, list):
            level = [level]
    elif isinstance(self._by, list):
        by = []
        for o in self._by:
            # A hashable key matching an index-level name is treated as a
            # level rather than a column.
            if hashable(o) and o in self._query_compiler.get_index_names(
                    self._axis):
                level.append(o)
            else:
                by.append(o)
    else:
        by = self._by

    is_multi_by = self._is_multi_by or (by is not None and len(level) > 0)
    # `dropna` param is the only one that matters for the group indices result
    dropna = self._kwargs.get("dropna", True)

    if hasattr(self._by, "columns") and is_multi_by:
        by = list(self._by.columns)

    if is_multi_by:
        # Because we are doing a collect (to_pandas) here and then groupby, we
        # end up using pandas implementation. Add the warning so the user is
        # aware.
        ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
        if isinstance(by, list) and all(
                is_label(self._df, o, self._axis) for o in by):
            # Every key names a column or index level of the frame itself,
            # so only the relevant columns need collecting.
            pandas_df = self._df._query_compiler.getitem_column_array(
                by).to_pandas()
        else:
            # Arbitrary keys (e.g. external Series): collect the whole frame.
            by = try_cast_to_pandas(by, squeeze=True)
            pandas_df = self._df._to_pandas()
        by = wrap_into_list(by, level)
        groupby_obj = pandas_df.groupby(by=by, dropna=dropna)
        return groupby_obj.indices if numerical else groupby_obj.groups
    else:
        if isinstance(self._by, type(self._query_compiler)):
            # Single-column query compiler: group on its raw values.
            by = self._by.to_pandas().squeeze().values
        elif self._by is None:
            index = self._query_compiler.get_axis(self._axis)
            # Keep only the requested levels; `level` may hold names or
            # positions, hence the double membership test.
            levels_to_drop = [
                i for i, name in enumerate(index.names)
                if name not in level and i not in level
            ]
            by = index.droplevel(levels_to_drop)
            if isinstance(by, pandas.MultiIndex):
                by = by.reorder_levels(level)
        else:
            by = self._by
        axis_labels = self._query_compiler.get_axis(self._axis)
        if numerical:
            # Since we want positional indices of the groups, we want to group
            # on a `RangeIndex`, not on the actual index labels
            axis_labels = pandas.RangeIndex(len(axis_labels))
        # `pandas.Index.groupby` doesn't take any parameters except `by`.
        # Have to convert an Index to a Series to be able to process `dropna=False`:
        if dropna:
            return axis_labels.groupby(by)
        else:
            groupby_obj = axis_labels.to_series().groupby(by, dropna=dropna)
            return groupby_obj.indices if numerical else groupby_obj.groups
def __getitem__(self, key):
    """
    Implement indexing operation on a DataFrameGroupBy object.

    Parameters
    ----------
    key : list or str
        Names of columns to use as subset of original object.

    Returns
    -------
    DataFrameGroupBy or SeriesGroupBy
        Result of indexing operation.

    Raises
    ------
    NotImplementedError
        Column lookups on GroupBy with arbitrary Series in by is not yet supported.
    """
    # These parameters are common for building the resulted Series or DataFrame groupby object
    kwargs = {
        **self._kwargs.copy(),
        "by": self._by,
        "axis": self._axis,
        "idx_name": self._idx_name,
        "squeeze": self._squeeze,
    }
    # The rules of type deduction for the resulted object is the following:
    #   1. If `key` is a list-like or `as_index is False`, then the resulted object is a DataFrameGroupBy
    #   2. Otherwise, the resulted object is SeriesGroupBy
    #   3. Result type does not depend on the `by` origin
    # Examples:
    #   - drop: any, as_index: any, __getitem__(key: list_like) -> DataFrameGroupBy
    #   - drop: any, as_index: False, __getitem__(key: any) -> DataFrameGroupBy
    #   - drop: any, as_index: True, __getitem__(key: label) -> SeriesGroupBy
    if is_list_like(key):
        make_dataframe = True
    else:
        if self._as_index:
            make_dataframe = False
        else:
            make_dataframe = True
            key = [key]
    if make_dataframe:
        internal_by = frozenset(self._internal_by)
        if len(internal_by.intersection(key)) != 0:
            ErrorMessage.missmatch_with_pandas(
                operation="GroupBy.__getitem__",
                message=
                ("intersection of the selection and 'by' columns is not yet supported, "
                 + "to achieve the desired result rewrite the original code from:\n"
                 + "df.groupby('by_column')['by_column']\n" + "to the:\n" +
                 "df.groupby(df['by_column'].copy())['by_column']"),
            )
        # Take the by-columns along with the selection, preserving the
        # original column order of the frame.
        cols_to_grab = internal_by.union(key)
        key = [col for col in self._df.columns if col in cols_to_grab]
        return DataFrameGroupBy(
            self._df[key],
            drop=self._drop,
            **kwargs,
        )
    # A Series result cannot be built when `by` holds arbitrary Series that
    # are not columns of the frame.
    if (self._is_multi_by and isinstance(self._by, list)
            and not all(hashable(o) and o in self._df for o in self._by)):
        raise NotImplementedError(
            "Column lookups on GroupBy with arbitrary Series in by" +
            " is not yet supported.")
    return SeriesGroupBy(
        self._df[key],
        drop=False,
        **kwargs,
    )
def _index_grouped(self):
    """
    Construct an index of group IDs.

    Returns
    -------
    dict
        A dict of {group name -> group labels} values.

    See Also
    --------
    pandas.core.groupby.GroupBy.groups
    """
    # The result is computed lazily and memoized in `_index_grouped_cache`.
    if self._index_grouped_cache is None:
        # Splitting level-by and column-by since we serialize them in a different ways
        by = None
        level = []
        if self._level is not None:
            level = self._level
            if not isinstance(level, list):
                level = [level]
        elif isinstance(self._by, list):
            by = []
            for o in self._by:
                # A hashable key matching an index-level name is treated as a
                # level rather than a column.
                if hashable(o) and o in self._query_compiler.get_index_names(
                    self._axis
                ):
                    level.append(o)
                else:
                    by.append(o)
        else:
            by = self._by

        is_multi_by = self._is_multi_by or (by is not None and len(level) > 0)

        if hasattr(self._by, "columns") and is_multi_by:
            by = list(self._by.columns)

        if is_multi_by:
            # Because we are doing a collect (to_pandas) here and then groupby, we
            # end up using pandas implementation. Add the warning so the user is
            # aware.
            ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
            ErrorMessage.default_to_pandas("Groupby with multiple columns")
            if isinstance(by, list) and all(
                is_label(self._df, o, self._axis) for o in by
            ):
                # Every key names a column or index level of the frame itself,
                # so only the relevant columns need collecting.
                pandas_df = self._df._query_compiler.getitem_column_array(
                    by
                ).to_pandas()
            else:
                # Arbitrary keys (e.g. external Series): collect the whole frame.
                by = try_cast_to_pandas(by, squeeze=True)
                pandas_df = self._df._to_pandas()
            by = wrap_into_list(by, level)
            self._index_grouped_cache = pandas_df.groupby(by=by).groups
        else:
            if isinstance(self._by, type(self._query_compiler)):
                # Single-column query compiler: group on its raw values.
                by = self._by.to_pandas().squeeze().values
            elif self._by is None:
                index = self._query_compiler.get_axis(self._axis)
                # Keep only the requested levels; `level` may hold names or
                # positions, hence the double membership test.
                levels_to_drop = [
                    i
                    for i, name in enumerate(index.names)
                    if name not in level and i not in level
                ]
                by = index.droplevel(levels_to_drop)
                if isinstance(by, pandas.MultiIndex):
                    by = by.reorder_levels(level)
            else:
                by = self._by
            if self._axis == 0:
                self._index_grouped_cache = self._index.groupby(by)
            else:
                self._index_grouped_cache = self._columns.groupby(by)
    return self._index_grouped_cache
def caller(
    cls,
    query_compiler,
    by,
    axis,
    groupby_args,
    map_args,
    map_func,
    reduce_func,
    reduce_args,
    numeric_only=True,
    drop=False,
    method=None,
    default_to_pandas_func=None,
):
    """
    Execute GroupBy aggregation with MapReduce approach.

    Parameters
    ----------
    query_compiler : BaseQueryCompiler
        Frame to group.
    by : BaseQueryCompiler, column or index label, Grouper or list of such
        Object that determine groups.
    axis : {0, 1}, default: 0
        Axis to group and apply aggregation function along.
        0 means index axis when 1 means column axis.
    groupby_args : dict
        Dictionary which carries arguments for pandas.DataFrame.groupby.
    map_args : dict
        Arguments which will be passed to `map_func`.
    map_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame
        Function to apply to the `GroupByObject` at the Map phase.
    reduce_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame
        Function to apply to the `GroupByObject` at the Reduce phase.
    reduce_args : dict
        Arguments which will be passed to `reduce_func`.
    numeric_only : bool, default: True
        Whether or not to drop non-numeric columns before executing GroupBy.
    drop : bool, default: False
        Indicates whether or not by-data came from the `self` frame.
    method : str, optional
        Name of the GroupBy aggregation function. This is a hint to be able to do special casing.
    default_to_pandas_func : callable(pandas.DataFrameGroupBy) -> pandas.DataFrame, optional
        The pandas aggregation function equivalent to the `map_func + reduce_func`.
        Used in case of defaulting to pandas. If not specified `map_func` is used.

    Returns
    -------
    The same type as `query_compiler`
        QueryCompiler which carries the result of GroupBy aggregation.
    """
    # Default to pandas when no level is given and `by` is neither a query
    # compiler nor a hashable label, or is a Grouper.
    if groupby_args.get("level", None) is None and (
            not (isinstance(by, (type(query_compiler))) or hashable(by))
            or isinstance(by, pandas.Grouper)):
        by = try_cast_to_pandas(by, squeeze=True)
        if default_to_pandas_func is None:
            # A dict aggregation goes through ``.agg``; otherwise `map_func`
            # itself is used as the pandas-level aggregation.
            default_to_pandas_func = ((lambda grp: grp.agg(map_func))
                                      if isinstance(map_func, dict) else map_func)
        return query_compiler.default_to_pandas(
            lambda df: default_to_pandas_func(
                df.groupby(by=by, axis=axis, **groupby_args), **map_args))
    assert axis == 0, "Can only groupby reduce with axis=0"

    if numeric_only:
        # Restrict the computation to numeric columns only.
        qc = query_compiler.getitem_column_array(
            query_compiler._modin_frame.numeric_columns(True))
    else:
        qc = query_compiler

    map_fn, reduce_fn = cls.build_map_reduce_functions(
        by=by,
        axis=axis,
        groupby_args=groupby_args,
        map_func=map_func,
        map_args=map_args,
        reduce_func=reduce_func,
        reduce_args=reduce_args,
        drop=drop,
        method=method,
    )

    # If `by` is a ModinFrame, then its partitions will be broadcasted to every
    # `self` partition in a way determined by engine (modin_frame.groupby_reduce)
    # Otherwise `by` was already bound to the Map function in `build_map_reduce_functions`.
    broadcastable_by = getattr(by, "_modin_frame", None)
    apply_indices = list(map_func.keys()) if isinstance(map_func, dict) else None
    new_modin_frame = qc._modin_frame.groupby_reduce(
        axis, broadcastable_by, map_fn, reduce_fn, apply_indices=apply_indices)
    result = query_compiler.__constructor__(new_modin_frame)
    # "__reduced__" is an internal sentinel index name; hide it from the user.
    if result.index.name == "__reduced__":
        result.index.name = None
    return result
def _index_grouped(self):
    """
    Construct an index of group IDs.

    Returns
    -------
    dict
        A dict of {group name -> group labels} values.

    See Also
    --------
    pandas.core.groupby.GroupBy.groups
    """
    # Memoized: compute once, then serve from the cache.
    if self._index_grouped_cache is not None:
        return self._index_grouped_cache

    # Grouping keys that name index levels are handled separately from
    # column-based keys, since the two are serialized differently.
    grouping = None
    levels = []
    if self._level is not None:
        levels = self._level if isinstance(self._level, list) else [self._level]
    elif isinstance(self._by, list):
        grouping = []
        for item in self._by:
            if hashable(item) and item in self._query_compiler.get_index_names(
                self._axis
            ):
                levels.append(item)
            else:
                grouping.append(item)
    else:
        grouping = self._by

    multi = self._is_multi_by or (grouping is not None and len(levels) > 0)

    if hasattr(self._by, "columns") and multi:
        grouping = list(self._by.columns)

    if multi:
        # Because we are doing a collect (to_pandas) here and then groupby, we
        # end up using pandas implementation. Add the warning so the user is
        # aware.
        ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
        ErrorMessage.default_to_pandas("Groupby with multiple columns")
        if isinstance(grouping, list) and all(
            is_label(self._df, item, self._axis) for item in grouping
        ):
            # Every key names a column/level of the frame itself, so only
            # the relevant columns need to be collected.
            pandas_df = self._df._query_compiler.getitem_column_array(
                grouping
            ).to_pandas()
        else:
            # Arbitrary keys: collect the whole frame.
            grouping = try_cast_to_pandas(grouping, squeeze=True)
            pandas_df = self._df._to_pandas()
        grouping = wrap_into_list(grouping, levels)
        self._index_grouped_cache = pandas_df.groupby(by=grouping).groups
    else:
        if isinstance(self._by, type(self._query_compiler)):
            # Single-column query compiler: group on its raw values.
            grouping = self._by.to_pandas().squeeze().values
        elif self._by is None:
            axis_index = self._query_compiler.get_axis(self._axis)
            # Keep only the requested levels; `levels` may hold both names
            # and positions, hence the double membership test.
            drop_positions = [
                pos
                for pos, name in enumerate(axis_index.names)
                if name not in levels and pos not in levels
            ]
            grouping = axis_index.droplevel(drop_positions)
            if isinstance(grouping, pandas.MultiIndex):
                grouping = grouping.reorder_levels(levels)
        else:
            grouping = self._by
        labels = self._index if self._axis == 0 else self._columns
        self._index_grouped_cache = labels.groupby(grouping)
    return self._index_grouped_cache