Example #1
        def args_cast(self, *args, **kwargs):
            """
            Preprocess `default_to_pandas` function arguments and apply the default function.

            Cast all Modin objects contained in the function arguments to their pandas representations.
            """
            args = try_cast_to_pandas(args)
            kwargs = try_cast_to_pandas(kwargs)
            return wrapper(self, *args, **kwargs)
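For context, `try_cast_to_pandas` is Modin's utility that walks a value and converts any Modin objects it finds to their pandas counterparts. Below is a minimal sketch of the idea, assuming only the `to_pandas` protocol visible in these examples; it is not Modin's actual implementation:

def try_cast_to_pandas_sketch(obj, squeeze=False):
    # Objects exposing 'to_pandas' (e.g. Modin DataFrame/Series) are converted;
    # with squeeze=True a single-column frame collapses to a Series (we assume
    # 'to_pandas' returns a DataFrame here).
    if hasattr(obj, "to_pandas"):
        result = obj.to_pandas()
        return result.squeeze(axis=1) if squeeze else result
    # Containers are traversed recursively so nested Modin objects are found.
    if isinstance(obj, (list, tuple)):
        return type(obj)(try_cast_to_pandas_sketch(o, squeeze) for o in obj)
    if isinstance(obj, dict):
        return {k: try_cast_to_pandas_sketch(v, squeeze) for k, v in obj.items()}
    # Scalars, pandas objects and callables pass through unchanged.
    return obj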
Example #2
    def _default_to_pandas(self, f, *args, **kwargs):
        """
        Execute function `f` in default-to-pandas way.

        Parameters
        ----------
        f : callable
            The function to apply to each group.
        *args : list
            Extra positional arguments to pass to `f`.
        **kwargs : dict
            Extra keyword arguments to pass to `f`.

        Returns
        -------
        modin.pandas.DataFrame
            A new Modin DataFrame with the result of the pandas function.
        """
        if (
            isinstance(self._by, type(self._query_compiler))
            and len(self._by.columns) == 1
        ):
            by = self._by.columns[0] if self._drop else self._by.to_pandas().squeeze()
        elif isinstance(self._by, type(self._query_compiler)):
            by = list(self._by.columns)
        else:
            by = self._by

        by = try_cast_to_pandas(by)

        def groupby_on_multiple_columns(df, *args, **kwargs):
            return f(df.groupby(by=by, axis=self._axis, **self._kwargs), *args,
                     **kwargs)

        return self._df._default_to_pandas(groupby_on_multiple_columns, *args,
                                           **kwargs)
Example #3
def align_datetime_dtypes(*dfs):
    """
    Make all of the passed frames have a DateTime dtype for the same columns.

    Cast a column of a frame to the DateTime type if any frame in
    the `dfs` sequence has the DateTime type for that column.

    Parameters
    ----------
    *dfs : iterable of DataFrames
        DataFrames whose DateTime dtypes should be aligned.

    Returns
    -------
    tuple of DataFrames
        The aligned frames.

    Notes
    -----
    Passed Modin frames may be cast to pandas in the result.
    """
    datetime_cols = {}
    for df in dfs:
        for col, dtype in df.dtypes.items():
            # If we already decided to cast this column to DateTime no more actions are needed
            if col not in datetime_cols and is_datetime64_any_dtype(dtype):
                datetime_cols[col] = dtype

    casted_dfs = (
        # OmniSci has difficulties with casting to certain dtypes (i.e. datetime64),
        # so casting it to pandas before doing 'astype'
        tuple(try_cast_to_pandas(df).astype(datetime_cols) for df in dfs)
        # This is required so we don't try to cast empty OmniSci frames to pandas:
        # https://github.com/modin-project/modin/issues/3428
        if len(datetime_cols)
        else dfs
    )
    return casted_dfs
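A hypothetical usage of the helper above: one frame has already parsed a column as datetime64 while the other still holds strings, and aligning casts both to the datetime dtype (plain pandas frames are used here for illustration):

import pandas

df1 = pandas.DataFrame({"when": pandas.to_datetime(["2021-01-01", "2021-01-02"])})
df2 = pandas.DataFrame({"when": ["2021-01-01", "2021-01-02"]})  # object dtype

aligned1, aligned2 = align_datetime_dtypes(df1, df2)
print(aligned2.dtypes["when"])  # datetime64[ns], matching aligned1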
Example #4
    def _index_grouped(self):
        """
        Construct an index of group IDs.

        Returns
        -------
        dict
            A dict of {group name -> group labels} values.

        See Also
        --------
        pandas.core.groupby.GroupBy.groups
        """
        if self._index_grouped_cache is None:
            if hasattr(self._by, "columns") and len(self._by.columns) > 1:
                by = list(self._by.columns)
                is_multi_by = True
            else:
                by = self._by
                is_multi_by = self._is_multi_by
            if is_multi_by:
                # Because we are doing a collect (to_pandas) here and then groupby, we
                # end up using pandas implementation. Add the warning so the user is
                # aware.
                ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
                ErrorMessage.default_to_pandas("Groupby with multiple columns")
                if isinstance(by, list) and all(isinstance(o, str) for o in by):
                    pandas_df = self._df._query_compiler.getitem_column_array(
                        by
                    ).to_pandas()
                else:
                    by = try_cast_to_pandas(by)
                    pandas_df = self._df._to_pandas()
                self._index_grouped_cache = pandas_df.groupby(by=by).groups
            else:
                if isinstance(self._by, type(self._query_compiler)):
                    by = self._by.to_pandas().squeeze().values
                else:
                    by = self._by
                if self._axis == 0:
                    self._index_grouped_cache = self._index.groupby(by)
                else:
                    self._index_grouped_cache = self._columns.groupby(by)
        return self._index_grouped_cache
Example #5
    def _index_grouped(self):
        """
        Implement [METHOD_NAME].

        TODO: Add more details for this docstring template.

        Parameters
        ----------
        What arguments does this function have.
        [
        PARAMETER_NAME: PARAMETERS TYPES
            Description.
        ]

        Returns
        -------
        What this returns (if anything)
        """
        if self._index_grouped_cache is None:
            if hasattr(self._by, "columns") and len(self._by.columns) > 1:
                by = list(self._by.columns)
                is_multi_by = True
            else:
                by = self._by
                is_multi_by = self._is_multi_by
            if is_multi_by:
                # Because we are doing a collect (to_pandas) here and then groupby, we
                # end up using pandas implementation. Add the warning so the user is
                # aware.
                ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
                ErrorMessage.default_to_pandas("Groupby with multiple columns")
                if isinstance(by, list) and all(
                    hashable(o)
                    and (
                        o in self._df
                        or o in self._df._query_compiler.get_index_names(self._axis)
                    )
                    for o in by
                ):
                    pandas_df = self._df._query_compiler.getitem_column_array(
                        by
                    ).to_pandas()
                else:
                    by = try_cast_to_pandas(by, squeeze=True)
                    pandas_df = self._df._to_pandas()
                self._index_grouped_cache = pandas_df.groupby(by=by).groups
            else:
                if isinstance(self._by, type(self._query_compiler)):
                    by = self._by.to_pandas().squeeze().values
                else:
                    by = self._by
                if self._axis == 0:
                    self._index_grouped_cache = self._index.groupby(by)
                else:
                    self._index_grouped_cache = self._columns.groupby(by)
        return self._index_grouped_cache
Example #6
    def caller(
        cls,
        query_compiler,
        by,
        axis,
        groupby_args,
        map_args,
        map_func,
        numeric_only=True,
        **kwargs,
    ):
        if not (
            isinstance(by, type(query_compiler)) or hashable(by)
        ) or isinstance(by, pandas.Grouper):
            by = try_cast_to_pandas(by, squeeze=True)
            default_func = (
                (lambda grp: grp.agg(map_func))
                if isinstance(map_func, dict)
                else map_func
            )
            return query_compiler.default_to_pandas(
                lambda df: default_func(
                    df.groupby(by=by, axis=axis, **groupby_args), **map_args
                )
            )
        assert axis == 0, "Can only groupby reduce with axis=0"

        if numeric_only:
            qc = query_compiler.getitem_column_array(
                query_compiler._modin_frame._numeric_columns(True)
            )
        else:
            qc = query_compiler

        map_fn, reduce_fn = cls.build_map_reduce_functions(
            by=by,
            axis=axis,
            groupby_args=groupby_args,
            map_func=map_func,
            map_args=map_args,
            **kwargs,
        )

        broadcastable_by = getattr(by, "_modin_frame", None)
        apply_indices = list(map_func.keys()) if isinstance(map_func, dict) else None
        new_modin_frame = qc._modin_frame.groupby_reduce(
            axis, broadcastable_by, map_fn, reduce_fn, apply_indices=apply_indices
        )

        result = query_compiler.__constructor__(new_modin_frame)
        if result.index.name == "__reduced__":
            result.index.name = None
        return result
Example #7
    def _default_to_pandas(self, f, *args, **kwargs):
        """
        Execute function `f` in default-to-pandas way.

        Parameters
        ----------
        f : callable
            The function to apply to each group.
        *args : list
            Extra positional arguments to pass to `f`.
        **kwargs : dict
            Extra keyword arguments to pass to `f`.

        Returns
        -------
        modin.pandas.DataFrame
            A new Modin DataFrame with the result of the pandas function.
        """
        if (
            isinstance(self._by, type(self._query_compiler))
            and len(self._by.columns) == 1
        ):
            by = self._by.columns[0] if self._drop else self._by.to_pandas().squeeze()
        # Convert a QC 'by' to a list of column labels only if this 'by' comes
        # from `self` (i.e. when `drop` is True).
        elif self._drop and isinstance(self._by, type(self._query_compiler)):
            by = list(self._by.columns)
        else:
            by = self._by

        by = try_cast_to_pandas(by, squeeze=True)
        # Since 'by' may be a 2D query compiler holding columns to group by,
        # to_pandas will also produce a pandas DataFrame containing them.
        # So splitting 2D 'by' into a list of 1D Series using 'GroupBy.validate_by':
        by = GroupBy.validate_by(by)

        def groupby_on_multiple_columns(df, *args, **kwargs):
            return f(
                df.groupby(by=by,
                           axis=self._axis,
                           squeeze=self._squeeze,
                           **self._kwargs),
                *args,
                **kwargs,
            )

        return self._df._default_to_pandas(groupby_on_multiple_columns, *args,
                                           **kwargs)
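The `GroupBy.validate_by` step above guards against passing a whole DataFrame as a single grouping key: a 2D `by` is split into its 1D columns, which pandas accepts as multiple keys. A rough sketch of that splitting (Modin's real implementation also handles labels and Groupers):

import pandas

def split_2d_by(by):
    # A DataFrame 'by' becomes a list of its column Series so that
    # pandas.DataFrame.groupby treats each column as a separate key.
    if isinstance(by, pandas.DataFrame):
        return [by[col] for col in by.columns]
    return by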
Example #8
    def _default_to_pandas(self, f, *args, **kwargs):
        """
        Execute function `f` in default-to-pandas way.

        Parameters
        ----------
        f : callable
            The function to apply to each group.
        *args : list
            Extra positional arguments to pass to `f`.
        **kwargs : dict
            Extra keyword arguments to pass to `f`.

        Returns
        -------
        modin.pandas.DataFrame
            A new Modin DataFrame with the result of the pandas function.
        """
        if (
            isinstance(self._by, type(self._query_compiler))
            and len(self._by.columns) == 1
        ):
            by = self._by.columns[0] if self._drop else self._by.to_pandas().squeeze()
        elif isinstance(self._by, type(self._query_compiler)):
            by = list(self._by.columns)
        else:
            by = self._by

        by = try_cast_to_pandas(by, squeeze=True)

        def groupby_on_multiple_columns(df, *args, **kwargs):
            return f(
                df.groupby(
                    by=by, axis=self._axis, squeeze=self._squeeze, **self._kwargs
                ),
                *args,
                **kwargs,
            )

        return self._df._default_to_pandas(groupby_on_multiple_columns, *args, **kwargs)
Example #9
def run_and_compare(
    fn,
    data,
    data2=None,
    force_lazy=True,
    force_arrow_execute=False,
    allow_subqueries=False,
    comparator=df_equals,
    **kwargs,
):
    """Verify equality of the results of the passed function executed against pandas and modin frame."""

    def run_modin(
        fn,
        data,
        data2,
        force_lazy,
        force_arrow_execute,
        allow_subqueries,
        constructor_kwargs,
        **kwargs,
    ):
        kwargs["df1"] = pd.DataFrame(data, **constructor_kwargs)
        kwargs["df2"] = pd.DataFrame(data2, **constructor_kwargs)
        kwargs["df"] = kwargs["df1"]

        if force_lazy:
            set_execution_mode(kwargs["df1"], "lazy")
            set_execution_mode(kwargs["df2"], "lazy")
        elif force_arrow_execute:
            set_execution_mode(kwargs["df1"], "arrow")
            set_execution_mode(kwargs["df2"], "arrow")

        exp_res = fn(lib=pd, **kwargs)

        if force_arrow_execute:
            set_execution_mode(exp_res, "arrow", allow_subqueries)
        elif force_lazy:
            set_execution_mode(exp_res, None, allow_subqueries)

        return exp_res

    constructor_kwargs = kwargs.pop("constructor_kwargs", {})
    try:
        kwargs["df1"] = pandas.DataFrame(data, **constructor_kwargs)
        kwargs["df2"] = pandas.DataFrame(data2, **constructor_kwargs)
        kwargs["df"] = kwargs["df1"]
        ref_res = fn(lib=pandas, **kwargs)
    except Exception as e:
        with pytest.raises(type(e)):
            exp_res = run_modin(
                fn=fn,
                data=data,
                data2=data2,
                force_lazy=force_lazy,
                force_arrow_execute=force_arrow_execute,
                allow_subqueries=allow_subqueries,
                constructor_kwargs=constructor_kwargs,
                **kwargs,
            )
            _ = exp_res.index
    else:
        exp_res = run_modin(
            fn=fn,
            data=data,
            data2=data2,
            force_lazy=force_lazy,
            force_arrow_execute=force_arrow_execute,
            allow_subqueries=allow_subqueries,
            constructor_kwargs=constructor_kwargs,
            **kwargs,
        )

        # Currently, strings are converted to categories when exported from OmniSci,
        # which makes the equality comparison fail. Converting string cols back to
        # their original dtypes until the issue is resolved:
        # https://github.com/modin-project/modin/issues/2747
        if isinstance(exp_res, pd.DataFrame):
            external_dtypes = exp_res.dtypes
            exp_res = try_cast_to_pandas(exp_res)
            internal_dtypes = exp_res.dtypes

            new_schema = {}
            for col in exp_res.columns:
                if (
                    internal_dtypes[col] == "category"
                    and external_dtypes[col] != "category"
                ):
                    new_schema[col] = external_dtypes[col]
            exp_res = exp_res.astype(new_schema)

        comparator(ref_res, exp_res)
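A hypothetical invocation; the `fn` signature with injected `df` and `lib` keyword arguments follows from how `run_and_compare` populates `kwargs` above:

def groupby_sum(df, lib, **kwargs):
    # 'lib' is either pandas or modin.pandas; 'df' is the frame built from 'data'.
    return df.groupby("a").agg({"b": "sum"})

run_and_compare(groupby_sum, data={"a": [1, 1, 2], "b": [10, 20, 30]})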
Example #10
    def _compute_index_grouped(self, numerical=False):
        """
        Construct an index of group IDs.

        Parameters
        ----------
        numerical : bool, default: False
            Whether group indices should be positional (True) or label-based (False).

        Returns
        -------
        dict
            A dict of {group name -> group indices} values.

        See Also
        --------
        pandas.core.groupby.GroupBy.groups
        """
        # We end up using pure pandas to compute group indices, so raise a warning
        ErrorMessage.default_to_pandas("Group indices computation")

        # Splitting level-by and column-by since we serialize them in different ways
        by = None
        level = []
        if self._level is not None:
            level = self._level
            if not isinstance(level, list):
                level = [level]
        elif isinstance(self._by, list):
            by = []
            for o in self._by:
                if hashable(o) and o in self._query_compiler.get_index_names(
                        self._axis):
                    level.append(o)
                else:
                    by.append(o)
        else:
            by = self._by

        is_multi_by = self._is_multi_by or (by is not None and len(level) > 0)
        # `dropna` param is the only one that matters for the group indices result
        dropna = self._kwargs.get("dropna", True)

        if hasattr(self._by, "columns") and is_multi_by:
            by = list(self._by.columns)

        if is_multi_by:
            # Because we are doing a collect (to_pandas) here and then groupby, we
            # end up using pandas implementation. Add the warning so the user is
            # aware.
            ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
            if isinstance(by, list) and all(
                    is_label(self._df, o, self._axis) for o in by):
                pandas_df = self._df._query_compiler.getitem_column_array(
                    by).to_pandas()
            else:
                by = try_cast_to_pandas(by, squeeze=True)
                pandas_df = self._df._to_pandas()
            by = wrap_into_list(by, level)
            groupby_obj = pandas_df.groupby(by=by, dropna=dropna)
            return groupby_obj.indices if numerical else groupby_obj.groups
        else:
            if isinstance(self._by, type(self._query_compiler)):
                by = self._by.to_pandas().squeeze().values
            elif self._by is None:
                index = self._query_compiler.get_axis(self._axis)
                levels_to_drop = [
                    i for i, name in enumerate(index.names)
                    if name not in level and i not in level
                ]
                by = index.droplevel(levels_to_drop)
                if isinstance(by, pandas.MultiIndex):
                    by = by.reorder_levels(level)
            else:
                by = self._by
            axis_labels = self._query_compiler.get_axis(self._axis)
            if numerical:
                # Since we want positional indices of the groups, we want to group
                # on a `RangeIndex`, not on the actual index labels
                axis_labels = pandas.RangeIndex(len(axis_labels))
            # `pandas.Index.groupby` doesn't take any parameters except `by`.
            # Have to convert an Index to a Series to be able to process `dropna=False`:
            if dropna:
                return axis_labels.groupby(by)
            else:
                groupby_obj = axis_labels.to_series().groupby(by,
                                                              dropna=dropna)
                return groupby_obj.indices if numerical else groupby_obj.groups
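The `numerical` flag maps directly onto the two pandas accessors used above: `.groups` is label-based while `.indices` is positional. A quick pandas illustration:

import pandas

df = pandas.DataFrame({"key": ["a", "b", "a"]}, index=["x", "y", "z"])
grouped = df.groupby("key")
print(grouped.groups)   # {'a': ['x', 'z'], 'b': ['y']} -- index labels
print(grouped.indices)  # {'a': array([0, 2]), 'b': array([1])} -- positions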
Example #11
    def _index_grouped(self):
        """
        Implement [METHOD_NAME].

        TODO: Add more details for this docstring template.

        Parameters
        ----------
        What arguments does this function have.
        [
        PARAMETER_NAME: PARAMETERS TYPES
            Description.
        ]

        Returns
        -------
        What this returns (if anything)
        """
        if self._index_grouped_cache is None:
            # Splitting level-by and column-by since we serialize them in different ways
            by = None
            level = []
            if self._level is not None:
                level = self._level
                if not isinstance(level, list):
                    level = [level]
            elif isinstance(self._by, list):
                by = []
                for o in self._by:
                    if hashable(o) and o in self._query_compiler.get_index_names(
                        self._axis
                    ):
                        level.append(o)
                    else:
                        by.append(o)
            else:
                by = self._by

            is_multi_by = self._is_multi_by or (by is not None and len(level) > 0)

            if hasattr(self._by, "columns") and is_multi_by:
                by = list(self._by.columns)

            if is_multi_by:
                # Because we are doing a collect (to_pandas) here and then groupby, we
                # end up using pandas implementation. Add the warning so the user is
                # aware.
                ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
                ErrorMessage.default_to_pandas("Groupby with multiple columns")
                if isinstance(by, list) and all(
                    is_label(self._df, o, self._axis) for o in by
                ):
                    pandas_df = self._df._query_compiler.getitem_column_array(
                        by
                    ).to_pandas()
                else:
                    by = try_cast_to_pandas(by, squeeze=True)
                    pandas_df = self._df._to_pandas()
                by = wrap_into_list(by, level)
                self._index_grouped_cache = pandas_df.groupby(by=by).groups
            else:
                if isinstance(self._by, type(self._query_compiler)):
                    by = self._by.to_pandas().squeeze().values
                elif self._by is None:
                    index = self._query_compiler.get_axis(self._axis)
                    levels_to_drop = [
                        i
                        for i, name in enumerate(index.names)
                        if name not in level and i not in level
                    ]
                    by = index.droplevel(levels_to_drop)
                    if isinstance(by, pandas.MultiIndex):
                        by = by.reorder_levels(level)
                else:
                    by = self._by
                if self._axis == 0:
                    self._index_grouped_cache = self._index.groupby(by)
                else:
                    self._index_grouped_cache = self._columns.groupby(by)
        return self._index_grouped_cache
Example #12
def test_simple_row_groupby(by, as_index, col1_category):
    pandas_df = pandas.DataFrame({
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, np.NaN, 7],
        "col3": [np.NaN, np.NaN, 12, 10],
        "col4": [17, 13, 16, 15],
        "col5": [-4, -5, -6, -7],
    })

    if col1_category:
        pandas_df = pandas_df.astype({"col1": "category"})

    modin_df = from_pandas(pandas_df)
    n = 1

    def maybe_get_columns(df, by):
        if isinstance(by, list):
            return [o(df) if isinstance(o, GetColumn) else o for o in by]
        else:
            return by

    modin_groupby = modin_df.groupby(by=maybe_get_columns(modin_df, by),
                                     as_index=as_index)

    pandas_by = maybe_get_columns(pandas_df, try_cast_to_pandas(by))
    pandas_groupby = pandas_df.groupby(by=pandas_by, as_index=as_index)

    modin_groupby_equals_pandas(modin_groupby, pandas_groupby)
    eval_ngroups(modin_groupby, pandas_groupby)
    eval_shift(modin_groupby, pandas_groupby)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.ffill(),
                 is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.sem(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_mean(modin_groupby, pandas_groupby)
    eval_any(modin_groupby, pandas_groupby)
    eval_min(modin_groupby, pandas_groupby)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.idxmax(),
                 is_default=True)
    eval_ndim(modin_groupby, pandas_groupby)
    if not check_df_columns_have_nans(modin_df, by):
        # cum* functions produce undefined results for columns with NaNs, so we run
        # them only when the "by" columns contain no NaNs
        eval_general(modin_groupby, pandas_groupby,
                     lambda df: df.cumsum(axis=0))
        eval_general(modin_groupby, pandas_groupby,
                     lambda df: df.cummax(axis=0))
        eval_general(modin_groupby, pandas_groupby,
                     lambda df: df.cummin(axis=0))
        eval_general(modin_groupby, pandas_groupby,
                     lambda df: df.cumprod(axis=0))

    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.pct_change(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )

    # Workaround for Pandas bug #34656. Recreate groupby object for Pandas
    pandas_groupby = pandas_df.groupby(by=pandas_by, as_index=as_index)
    apply_functions = [lambda df: df.sum(), min]
    for func in apply_functions:
        eval_apply(modin_groupby, pandas_groupby, func)

    eval_dtypes(modin_groupby, pandas_groupby)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.first(),
                 is_default=True)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.backfill(),
                 is_default=True)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.bfill(),
                 is_default=True)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.idxmin(),
                 is_default=True)
    eval_prod(modin_groupby, pandas_groupby)
    if as_index:
        eval_std(modin_groupby, pandas_groupby)
        eval_var(modin_groupby, pandas_groupby)
        eval_skew(modin_groupby, pandas_groupby)

    agg_functions = ["min", "max"]
    for func in agg_functions:
        eval_agg(modin_groupby, pandas_groupby, func)
        eval_aggregate(modin_groupby, pandas_groupby, func)

    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.last(),
                 is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.mad(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_general(modin_groupby, pandas_groupby, lambda df: df.rank())
    eval_max(modin_groupby, pandas_groupby)
    eval_len(modin_groupby, pandas_groupby)
    eval_sum(modin_groupby, pandas_groupby)
    eval_ngroup(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.nunique())
    eval_median(modin_groupby, pandas_groupby)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.head(n),
                 is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.cov(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )

    if not check_df_columns_have_nans(modin_df, by):
        # Pandas groupby.transform does not work correctly with NaN values in grouping columns. See Pandas bug 17093.
        transform_functions = [lambda df: df + 4, lambda df: -df - 10]
        for func in transform_functions:
            eval_general(
                modin_groupby,
                pandas_groupby,
                lambda df: df.transform(func),
                check_exception_type=None,
            )

    pipe_functions = [lambda dfgb: dfgb.sum()]
    for func in pipe_functions:
        eval_pipe(modin_groupby, pandas_groupby, func)

    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.corr(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_fillna(modin_groupby, pandas_groupby)
    eval_count(modin_groupby, pandas_groupby)
    if get_current_backend() != "BaseOnPython":
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.size(),
            check_exception_type=None,
        )
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.tail(n),
                 is_default=True)
    eval_quantile(modin_groupby, pandas_groupby)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.take(),
                 is_default=True)
    if isinstance(by, list) and not any(
            isinstance(o, (pd.Series, pandas.Series)) for o in by):
        # __getattr__ is not yet supported when 'by' contains Series that are not
        # original columns of the dataframe:
        eval___getattr__(modin_groupby, pandas_groupby, "col3")
    eval_groups(modin_groupby, pandas_groupby)
Example #13
    def caller(
        cls,
        query_compiler,
        by,
        axis,
        groupby_args,
        map_args,
        map_func,
        reduce_func,
        reduce_args,
        numeric_only=True,
        drop=False,
        method=None,
        default_to_pandas_func=None,
    ):
        """
        Execute GroupBy aggregation with MapReduce approach.

        Parameters
        ----------
        query_compiler : BaseQueryCompiler
            Frame to group.
        by : BaseQueryCompiler, column or index label, Grouper or list of such
            Object that determines groups.
        axis : {0, 1}, default: 0
            Axis to group and apply aggregation function along. 0 means index axis,
            1 means column axis.
        groupby_args : dict
            Dictionary which carries arguments for pandas.DataFrame.groupby.
        map_args : dict
            Arguments which will be passed to `map_func`.
        map_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame
            Function to apply to the `GroupByObject` at the Map phase.
        reduce_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame
            Function to apply to the `GroupByObject` at the Reduce phase.
        reduce_args : dict
            Arguments which will be passed to `reduce_func`.
        numeric_only : bool, default: True
            Whether or not to drop non-numeric columns before executing GroupBy.
        drop : bool, default: False
            Indicates whether or not by-data came from the `self` frame.
        method : str, optional
            Name of the GroupBy aggregation function. This is a hint to be able to do special casing.
        default_to_pandas_func : callable(pandas.DataFrameGroupBy) -> pandas.DataFrame, optional
            The pandas aggregation function equivalent to the `map_func + reduce_func`.
            Used in case of defaulting to pandas. If not specified, `map_func` is used.

        Returns
        -------
        The same type as `query_compiler`
            QueryCompiler which carries the result of GroupBy aggregation.
        """
        if groupby_args.get("level", None) is None and (
                not (isinstance(by, (type(query_compiler))) or hashable(by))
                or isinstance(by, pandas.Grouper)):
            by = try_cast_to_pandas(by, squeeze=True)
            if default_to_pandas_func is None:
                default_to_pandas_func = ((lambda grp: grp.agg(map_func))
                                          if isinstance(map_func, dict) else
                                          map_func)
            return query_compiler.default_to_pandas(
                lambda df: default_to_pandas_func(
                    df.groupby(by=by, axis=axis, **groupby_args), **map_args))
        assert axis == 0, "Can only groupby reduce with axis=0"

        if numeric_only:
            qc = query_compiler.getitem_column_array(
                query_compiler._modin_frame.numeric_columns(True))
        else:
            qc = query_compiler

        map_fn, reduce_fn = cls.build_map_reduce_functions(
            by=by,
            axis=axis,
            groupby_args=groupby_args,
            map_func=map_func,
            map_args=map_args,
            reduce_func=reduce_func,
            reduce_args=reduce_args,
            drop=drop,
            method=method,
        )

        # If `by` is a ModinFrame, then its partitions will be broadcasted to every
        # `self` partition in a way determined by engine (modin_frame.groupby_reduce)
        # Otherwise `by` was already bound to the Map function in `build_map_reduce_functions`.
        broadcastable_by = getattr(by, "_modin_frame", None)
        apply_indices = list(map_func.keys()) if isinstance(map_func,
                                                            dict) else None
        new_modin_frame = qc._modin_frame.groupby_reduce(
            axis,
            broadcastable_by,
            map_fn,
            reduce_fn,
            apply_indices=apply_indices)

        result = query_compiler.__constructor__(new_modin_frame)
        if result.index.name == "__reduced__":
            result.index.name = None
        return result
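The Map/Reduce split above relies on the aggregation being decomposable: each partition is aggregated independently, and the same (or a paired) aggregation over the concatenated partials yields the global result. A self-contained pandas sketch for `sum`:

import pandas

partitions = [
    pandas.DataFrame({"key": ["a", "b"], "val": [1, 2]}),
    pandas.DataFrame({"key": ["a", "b"], "val": [10, 20]}),
]

# Map phase: aggregate each partition on its own.
partials = [part.groupby("key").sum() for part in partitions]

# Reduce phase: concatenate the partial results and aggregate again by the
# (now index-level) key; this equals a single global groupby-sum.
result = pandas.concat(partials).groupby(level="key").sum()
print(result)  # a -> 11, b -> 22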
Example #14
def align_datetime_dtypes(*dfs):
    """
    Make all of the passed frames have a DateTime dtype for the same columns.

    Cast a column of a frame to the DateTime type if any frame in
    the `dfs` sequence has the DateTime type for that column.

    Parameters
    ----------
    *dfs : iterable of DataFrames
        DataFrames whose DateTime dtypes should be aligned.

    Returns
    -------
    tuple or list of DataFrames
        The aligned frames.

    Notes
    -----
    Passed Modin frames may be cast to pandas in the result.
    """
    datetime_cols = {}
    time_cols = set()
    for df in dfs:
        for col, dtype in df.dtypes.items():
            # If we already decided to cast this column to DateTime no more actions are needed
            if col not in datetime_cols and is_datetime64_any_dtype(dtype):
                datetime_cols[col] = dtype
            # datetime.time is considered to be an 'object' dtype in pandas that's why
            # we have to explicitly check the values type in the column
            elif (
                dtype == np.dtype("O")
                and col not in time_cols
                # OmniSci has difficulties with empty frames, so explicitly skip them
                # https://github.com/modin-project/modin/issues/3428
                and len(df) > 0
                and all(
                    isinstance(val, datetime.time) or pandas.isna(val)
                    for val in df[col]
                )
            ):
                time_cols.add(col)

    if len(datetime_cols) == 0 and len(time_cols) == 0:
        return dfs

    def convert_to_time(value):
        """Convert passed value to `datetime.time`."""
        if isinstance(value, datetime.time):
            return value
        elif isinstance(value, str):
            return datetime.time.fromisoformat(value)
        else:
            return datetime.time(value)

    time_cols_list = list(time_cols)
    casted_dfs = []
    for df in dfs:
        # OmniSci has difficulties with casting to certain dtypes (i.e. datetime64),
        # so casting it to pandas
        pandas_df = try_cast_to_pandas(df)
        if datetime_cols:
            pandas_df = pandas_df.astype(datetime_cols)
        if time_cols:
            pandas_df[time_cols_list] = pandas_df[time_cols_list].applymap(
                convert_to_time
            )
        casted_dfs.append(pandas_df)

    return casted_dfs
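The explicit value scan for `datetime.time` above is necessary because pandas files such values under the generic object dtype, so `df.dtypes` alone cannot reveal them:

import datetime
import pandas

s = pandas.Series([datetime.time(12, 30), datetime.time(8, 15)])
print(s.dtype)  # object -- indistinguishable from strings by dtype alone
print(all(isinstance(v, datetime.time) for v in s))  # True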
Example #15
        def caller(
            query_compiler,
            by,
            axis,
            groupby_args,
            map_args,
            reduce_args=None,
            numeric_only=True,
            drop=False,
        ):
            if not isinstance(by, (type(query_compiler), str)):
                by = try_cast_to_pandas(by, squeeze=True)
                return query_compiler.default_to_pandas(lambda df: map_func(
                    df.groupby(by=by, axis=axis, **groupby_args), **map_args))
            assert axis == 0, "Can only groupby reduce with axis=0"

            if numeric_only:
                qc = query_compiler.getitem_column_array(
                    query_compiler._modin_frame._numeric_columns(True))
            else:
                qc = query_compiler
            # Since we're going to modify the `groupby_args` dict inside `compute_map`,
            # copy it so that these changes don't propagate to the source dict if the
            # function fails partway.
            groupby_args = groupby_args.copy()

            as_index = groupby_args.get("as_index", True)
            observed = groupby_args.get("observed", False)

            if isinstance(by, str):

                def _map(df):
                    # Set `as_index` to True to track the metadata of the grouping
                    # object. It is used to make sure that between phases we are
                    # constructing the right index and placing columns in the correct
                    # order.
                    groupby_args["as_index"] = True
                    groupby_args["observed"] = True

                    result = map_func(
                        df.groupby(by=by, axis=axis, **groupby_args),
                        **map_args)
                    # The '_modin_groupby_' prefix marks the index name so that the
                    # grouping data can be recognized and re-inserted in the reduce
                    # phase.
                    if (not isinstance(result.index, pandas.MultiIndex)
                            and result.index.name is not None
                            and result.index.name in result.columns):
                        result.index.name = "{}{}".format(
                            "_modin_groupby_", result.index.name)
                    return result

            else:

                def _map(df, other):
                    def compute_map(df, other):
                        # Set `as_index` to True to track the metadata of the grouping object
                        # It is used to make sure that between phases we are constructing the
                        # right index and placing columns in the correct order.
                        groupby_args["as_index"] = True
                        groupby_args["observed"] = True

                        other = other.squeeze(axis=axis ^ 1)
                        if isinstance(other, pandas.DataFrame):
                            df = pandas.concat(
                                [df] +
                                [other[[o for o in other if o not in df]]],
                                axis=1,
                            )
                            other = list(other.columns)
                        result = map_func(
                            df.groupby(by=other, axis=axis, **groupby_args),
                            **map_args)
                        # If `other` has a category dtype, pandas will drop that
                        # column after the groupby, so insert it back to correctly
                        # process the reduce phase.
                        if (drop and not as_index
                                and isinstance(other, pandas.Series)
                                and isinstance(other.dtype,
                                               pandas.CategoricalDtype)
                                and result.index.name is not None
                                and result.index.name not in result.columns):
                            result.insert(loc=0,
                                          column=result.index.name,
                                          value=result.index)
                        # The '_modin_groupby_' prefix marks the index name so that the
                        # grouping data can be recognized and re-inserted in the reduce phase.
                        if (not isinstance(result.index, pandas.MultiIndex)
                                and result.index.name is not None
                                and result.index.name in result.columns):
                            result.index.name = "{}{}".format(
                                "_modin_groupby_", result.index.name)
                        return result

                    try:
                        return compute_map(df, other)
                    # This will happen with Arrow buffer read-only errors. We don't want to copy
                    # all the time, so this will try to fast-path the code first.
                    except ValueError:
                        return compute_map(df.copy(), other.copy())

            def _reduce(df):
                def compute_reduce(df):
                    other_len = len(df.index.names)
                    df = df.reset_index(drop=False)
                    # See note above about setting `as_index`
                    groupby_args["as_index"] = as_index
                    groupby_args["observed"] = observed
                    if other_len > 1:
                        by_part = list(df.columns[0:other_len])
                    else:
                        by_part = df.columns[0]
                    result = reduce_func(
                        df.groupby(by=by_part, axis=axis, **groupby_args),
                        **reduce_args)
                    if (not isinstance(result.index, pandas.MultiIndex)
                            and result.index.name is not None
                            and "_modin_groupby_" in result.index.name):
                        result.index.name = result.index.name[
                            len("_modin_groupby_"):]
                    if isinstance(by_part, str) and by_part in result.columns:
                        if "_modin_groupby_" in by_part and drop:
                            col_name = by_part[len("_modin_groupby_"):]
                            new_result = result.drop(columns=col_name,
                                                     errors="ignore")
                            new_result.columns = [
                                col_name if "_modin_groupby_" in c else c
                                for c in new_result.columns
                            ]
                            return new_result
                        else:
                            return (result.drop(columns=by_part)
                                    if call_kwds.get("method", None) != "size"
                                    else result)
                    return result

                try:
                    return compute_reduce(df)
                # This will happen with Arrow buffer read-only errors. We don't want to copy
                # all the time, so this will try to fast-path the code first.
                except ValueError:
                    return compute_reduce(df.copy())

            # TODO: try to precompute `new_index` and `new_columns`
            if isinstance(by, str):
                new_modin_frame = qc._modin_frame._map_reduce(
                    axis, _map, reduce_func=_reduce, preserve_index=False)
            else:
                new_modin_frame = qc._modin_frame.groupby_reduce(
                    axis, by._modin_frame, _map, _reduce)
            result = query_compiler.__constructor__(new_modin_frame)
            if result.index.name == "__reduced__":
                result.index.name = None
            return result
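The renaming with the `_modin_groupby_` prefix likely sidesteps a name collision in `_reduce`: `reset_index(drop=False)` refuses to insert an index whose name already exists as a column. A plain pandas illustration of the collision being avoided:

import pandas

df = pandas.DataFrame({"key": [1, 2]}, index=pandas.Index([10, 20], name="key"))
try:
    df.reset_index()  # raises: cannot insert key, already exists
except ValueError as err:
    print(err)
df.index.name = "_modin_groupby_key"
print(df.reset_index().columns.tolist())  # ['_modin_groupby_key', 'key']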
Example #16
    def _index_grouped(self):
        """
        Construct an index of group IDs.

        Returns
        -------
        dict
            A dict of {group name -> group labels} values.

        See Also
        --------
        pandas.core.groupby.GroupBy.groups
        """
        if self._index_grouped_cache is None:
            # Splitting level-by and column-by since we serialize them in different ways
            by = None
            level = []
            if self._level is not None:
                level = self._level
                if not isinstance(level, list):
                    level = [level]
            elif isinstance(self._by, list):
                by = []
                for o in self._by:
                    if hashable(o) and o in self._query_compiler.get_index_names(
                        self._axis
                    ):
                        level.append(o)
                    else:
                        by.append(o)
            else:
                by = self._by

            is_multi_by = self._is_multi_by or (by is not None and len(level) > 0)

            if hasattr(self._by, "columns") and is_multi_by:
                by = list(self._by.columns)

            if is_multi_by:
                # Because we are doing a collect (to_pandas) here and then groupby, we
                # end up using pandas implementation. Add the warning so the user is
                # aware.
                ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
                ErrorMessage.default_to_pandas("Groupby with multiple columns")
                if isinstance(by, list) and all(
                    is_label(self._df, o, self._axis) for o in by
                ):
                    pandas_df = self._df._query_compiler.getitem_column_array(
                        by
                    ).to_pandas()
                else:
                    by = try_cast_to_pandas(by, squeeze=True)
                    pandas_df = self._df._to_pandas()
                by = wrap_into_list(by, level)
                self._index_grouped_cache = pandas_df.groupby(by=by).groups
            else:
                if isinstance(self._by, type(self._query_compiler)):
                    by = self._by.to_pandas().squeeze().values
                elif self._by is None:
                    index = self._query_compiler.get_axis(self._axis)
                    levels_to_drop = [
                        i
                        for i, name in enumerate(index.names)
                        if name not in level and i not in level
                    ]
                    by = index.droplevel(levels_to_drop)
                    if isinstance(by, pandas.MultiIndex):
                        by = by.reorder_levels(level)
                else:
                    by = self._by
                if self._axis == 0:
                    self._index_grouped_cache = self._index.groupby(by)
                else:
                    self._index_grouped_cache = self._columns.groupby(by)
        return self._index_grouped_cache
Example #17
    def caller(
        cls,
        query_compiler,
        by,
        map_func,
        reduce_func,
        axis,
        groupby_kwargs,
        agg_args,
        agg_kwargs,
        drop=False,
        method=None,
        default_to_pandas_func=None,
    ):
        """
        Execute GroupBy aggregation with TreeReduce approach.

        Parameters
        ----------
        query_compiler : BaseQueryCompiler
            Frame to group.
        by : BaseQueryCompiler, column or index label, Grouper or list of such
            Object that determines groups.
        map_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame
            Function to apply to the `GroupByObject` at the Map phase.
        reduce_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame
            Function to apply to the `GroupByObject` at the Reduce phase.
        axis : {0, 1}
            Axis to group and apply aggregation function along. 0 means index axis,
            1 means column axis.
        groupby_kwargs : dict
            Dictionary which carries arguments for pandas.DataFrame.groupby.
        agg_args : list-like
            Positional arguments to pass to the aggregation functions.
        agg_kwargs : dict
            Keyword arguments to pass to the aggregation functions.
        drop : bool, default: False
            Indicates whether or not by-data came from the `self` frame.
        method : str, optional
            Name of the GroupBy aggregation function. This is a hint to be able to do special casing.
        default_to_pandas_func : callable(pandas.DataFrameGroupBy) -> pandas.DataFrame, optional
            The pandas aggregation function equivalent to the `map_func + reduce_func`.
            Used in case of defaulting to pandas. If not specified, `map_func` is used.

        Returns
        -------
        The same type as `query_compiler`
            QueryCompiler which carries the result of GroupBy aggregation.
        """
        if axis != 0 or (
            groupby_kwargs.get("level", None) is None
            and (
                not (isinstance(by, type(query_compiler)) or hashable(by))
                or isinstance(by, pandas.Grouper)
            )
        ):
            by = try_cast_to_pandas(by, squeeze=True)
            # Since 'by' may be a 2D query compiler holding columns to group by,
            # to_pandas will also produce a pandas DataFrame containing them.
            # So splitting 2D 'by' into a list of 1D Series using 'GroupBy.validate_by':
            by = GroupBy.validate_by(by)
            if default_to_pandas_func is None:
                default_to_pandas_func = ((lambda grp: grp.agg(map_func))
                                          if isinstance(map_func, dict) else
                                          map_func)
            return query_compiler.default_to_pandas(
                lambda df: default_to_pandas_func(
                    df.groupby(by=by, axis=axis, **groupby_kwargs),
                    *agg_args,
                    **agg_kwargs,
                ))

        # The bug only occurs with a Categorical 'by', so we might want to check whether
        # any of the 'by' dtypes is Categorical before going into this branch. However,
        # triggering the 'dtypes' computation when it is not yet available may take time,
        # so we don't do it.
        if not groupby_kwargs.get("sort", True) and isinstance(
                by, type(query_compiler)):
            ErrorMessage.missmatch_with_pandas(
                operation="df.groupby(categorical_by, sort=False)",
                message=
                ("the groupby keys will be sorted anyway, although the 'sort=False' was passed. "
                 "See the following issue for more details: "
                 "https://github.com/modin-project/modin/issues/3571"),
            )
            groupby_kwargs = groupby_kwargs.copy()
            groupby_kwargs["sort"] = True

        map_fn, reduce_fn = cls.build_map_reduce_functions(
            by=by,
            axis=axis,
            groupby_kwargs=groupby_kwargs,
            map_func=map_func,
            reduce_func=reduce_func,
            agg_args=agg_args,
            agg_kwargs=agg_kwargs,
            drop=drop,
            method=method,
        )

        # If `by` is a ModinFrame, then its partitions will be broadcasted to every
        # `self` partition in a way determined by engine (modin_frame.groupby_reduce)
        # Otherwise `by` was already bound to the Map function in `build_map_reduce_functions`.
        broadcastable_by = getattr(by, "_modin_frame", None)
        apply_indices = list(map_func.keys()) if isinstance(map_func,
                                                            dict) else None
        new_modin_frame = query_compiler._modin_frame.groupby_reduce(
            axis,
            broadcastable_by,
            map_fn,
            reduce_fn,
            apply_indices=apply_indices)

        result = query_compiler.__constructor__(new_modin_frame)
        if result.index.name == "__reduced__":
            result.index.name = None
        return result
Example #18
    def args_cast(self, *args, **kwargs):
        # Cast any Modin objects in the arguments to pandas before calling
        # the wrapped function.
        args = try_cast_to_pandas(args)
        kwargs = try_cast_to_pandas(kwargs)
        return wrapper(self, *args, **kwargs)