Example No. 1
    def test_get_dummies_boolean(self):
        pdf = pd.DataFrame({"b": [True, False, True]})
        kdf = ps.from_pandas(pdf)

        self.assert_eq(ps.get_dummies(kdf), pd.get_dummies(pdf, dtype=np.int8))
        self.assert_eq(ps.get_dummies(kdf.b), pd.get_dummies(pdf.b, dtype=np.int8))
Example No. 2
    def psser(self):
        return ps.from_pandas(self.pser)
Example No. 3
    def ks_start_date(self):
        return ps.from_pandas(self.pd_start_date)
Example No. 4
    def _test_groupby_expanding_func(self, f):
        pser = pd.Series([1, 2, 3, 2], index=np.random.rand(4), name="a")
        psser = ps.from_pandas(pser)
        self.assert_eq(
            getattr(psser.groupby(psser).expanding(2), f)().sort_index(),
            getattr(pser.groupby(pser).expanding(2), f)().sort_index(),
        )
        self.assert_eq(
            getattr(psser.groupby(psser).expanding(2), f)().sum(),
            getattr(pser.groupby(pser).expanding(2), f)().sum(),
        )

        # Multiindex
        pser = pd.Series(
            [1, 2, 3, 2],
            index=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z"), ("c", "z")]),
            name="a",
        )
        psser = ps.from_pandas(pser)
        self.assert_eq(
            getattr(psser.groupby(psser).expanding(2), f)().sort_index(),
            getattr(pser.groupby(pser).expanding(2), f)().sort_index(),
        )

        pdf = pd.DataFrame({"a": [1.0, 2.0, 3.0, 2.0], "b": [4.0, 2.0, 3.0, 1.0]})
        psdf = ps.from_pandas(pdf)

        # The behavior of GroupBy.expanding changed in pandas 1.3.
        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
            self.assert_eq(
                getattr(psdf.groupby(psdf.a).expanding(2), f)().sort_index(),
                getattr(pdf.groupby(pdf.a).expanding(2), f)().sort_index(),
            )
            self.assert_eq(
                getattr(psdf.groupby(psdf.a).expanding(2), f)().sum(),
                getattr(pdf.groupby(pdf.a).expanding(2), f)().sum(),
            )
            self.assert_eq(
                getattr(psdf.groupby(psdf.a + 1).expanding(2), f)().sort_index(),
                getattr(pdf.groupby(pdf.a + 1).expanding(2), f)().sort_index(),
            )
        else:
            self.assert_eq(
                getattr(psdf.groupby(psdf.a).expanding(2), f)().sort_index(),
                getattr(pdf.groupby(pdf.a).expanding(2), f)().drop("a", axis=1).sort_index(),
            )
            self.assert_eq(
                getattr(psdf.groupby(psdf.a).expanding(2), f)().sum(),
                getattr(pdf.groupby(pdf.a).expanding(2), f)().sum().drop("a"),
            )
            self.assert_eq(
                getattr(psdf.groupby(psdf.a + 1).expanding(2), f)().sort_index(),
                getattr(pdf.groupby(pdf.a + 1).expanding(2), f)().drop("a", axis=1).sort_index(),
            )

        self.assert_eq(
            getattr(psdf.b.groupby(psdf.a).expanding(2), f)().sort_index(),
            getattr(pdf.b.groupby(pdf.a).expanding(2), f)().sort_index(),
        )
        self.assert_eq(
            getattr(psdf.groupby(psdf.a)["b"].expanding(2), f)().sort_index(),
            getattr(pdf.groupby(pdf.a)["b"].expanding(2), f)().sort_index(),
        )
        self.assert_eq(
            getattr(psdf.groupby(psdf.a)[["b"]].expanding(2), f)().sort_index(),
            getattr(pdf.groupby(pdf.a)[["b"]].expanding(2), f)().sort_index(),
        )

        # Multiindex column
        columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
        pdf.columns = columns
        psdf.columns = columns

        # The behavior of GroupBy.expanding changed in pandas 1.3.
        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
            self.assert_eq(
                getattr(psdf.groupby(("a", "x")).expanding(2), f)().sort_index(),
                getattr(pdf.groupby(("a", "x")).expanding(2), f)().sort_index(),
            )

            self.assert_eq(
                getattr(psdf.groupby([("a", "x"), ("a", "y")]).expanding(2), f)().sort_index(),
                getattr(pdf.groupby([("a", "x"), ("a", "y")]).expanding(2), f)().sort_index(),
            )
        else:
            self.assert_eq(
                getattr(psdf.groupby(("a", "x")).expanding(2), f)().sort_index(),
                getattr(pdf.groupby(("a", "x")).expanding(2), f)()
                .drop(("a", "x"), axis=1)
                .sort_index(),
            )

            self.assert_eq(
                getattr(psdf.groupby([("a", "x"), ("a", "y")]).expanding(2), f)().sort_index(),
                getattr(pdf.groupby([("a", "x"), ("a", "y")]).expanding(2), f)()
                .drop([("a", "x"), ("a", "y")], axis=1)
                .sort_index(),
            )
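
A parameterized helper like this is presumably driven by small per-aggregation tests; a minimal sketch of hypothetical callers (the concrete test names are assumptions, not from the source):

    def test_groupby_expanding_min(self):
        self._test_groupby_expanding_func("min")

    def test_groupby_expanding_sum(self):
        self._test_groupby_expanding_func("sum")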
Example No. 5
    def test_eq(self):
        pdf, psdf = self.pdf, self.psdf

        pser, psser = pdf["this_numeric_cat"], psdf["this_numeric_cat"]
        ordered_pser, ordered_psser = (
            pdf["this_ordered_numeric_cat"],
            psdf["this_ordered_numeric_cat"],
        )
        self.assert_eq(ordered_pser == 1, ordered_psser == 1)
        self.assert_eq(pser == pser, psser == psser)
        self.assert_eq(ordered_pser == ordered_pser, ordered_psser == ordered_psser)

        pser, psser = pdf["this_string_cat"], psdf["this_string_cat"]
        ordered_pser, ordered_psser = (
            pdf["this_ordered_string_cat"],
            psdf["this_ordered_string_cat"],
        )
        self.assert_eq(pser == "x", psser == "x")
        self.assert_eq(pser == pser, psser == psser)
        self.assert_eq(ordered_pser == ordered_pser, ordered_psser == ordered_psser)

        self.assertRaisesRegex(
            TypeError,
            "Cannot compare a Categorical with a scalar, which is not a category",
            lambda: ordered_psser == 4,
        )
        self.assertRaisesRegex(
            TypeError,
            "Cannot compare a Categorical with a scalar, which is not a category",
            lambda: ordered_psser == "a",
        )
        self.assertRaisesRegex(
            TypeError,
            "Cannot compare a Categorical with the given type",
            lambda: ordered_psser == ps.Series([1, 2, 3]),
        )
        self.assertRaisesRegex(
            TypeError,
            "The operation can not be applied to list",
            lambda: ordered_psser == [1, 2, 3],
        )

        self.assert_eq(
            pdf["this_numeric_cat"] == pdf["that_numeric_cat"],
            psdf["this_numeric_cat"] == psdf["that_numeric_cat"],
        )
        self.assert_eq(
            pdf["this_string_cat"] == pdf["that_string_cat"],
            psdf["this_string_cat"] == psdf["that_string_cat"],
        )

        self.assert_eq(
            pdf["this_string_cat"] == pdf["this_given_cat_string_cat"],
            psdf["this_string_cat"] == psdf["this_given_cat_string_cat"],
        )

        pser1 = pd.Series(pd.Categorical(list("abca")))
        pser2 = pd.Series(pd.Categorical(list("bcaa"), categories=list("bca")))
        psser1 = ps.from_pandas(pser1)
        psser2 = ps.from_pandas(pser2)
        with option_context("compute.ops_on_diff_frames", True):
            self.assert_eq(pser1 == pser2, (psser1 == psser2).sort_index())
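
This test relies on a categorical fixture frame (self.pdf / self.psdf) defined elsewhere in the suite. A minimal sketch of the kind of fixture it assumes (hypothetical values; only the column names and categorical dtypes matter):

    pdf = pd.DataFrame(
        {
            "this_numeric_cat": pd.Categorical([1, 2, 3]),
            "that_numeric_cat": pd.Categorical([2, 3, 1]),
            "this_ordered_numeric_cat": pd.Categorical([1, 2, 3], ordered=True),
            "this_string_cat": pd.Categorical(["x", "y", "z"]),
            "that_string_cat": pd.Categorical(["y", "z", "x"]),
            "this_ordered_string_cat": pd.Categorical(["x", "y", "z"], ordered=True),
            "this_given_cat_string_cat": pd.Categorical(["x", "y", "z"], categories=["z", "y", "x"]),
        }
    )
    psdf = ps.from_pandas(pdf)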
Example No. 6
    def float_psser(self):
        return ps.from_pandas(self.float_pser)
Example No. 7
    def test_split_apply_combine_on_series(self):
        pdf1 = pd.DataFrame({
            "C": [0.362, 0.227, 1.267, -0.562],
            "B": [1, 2, 3, 4]
        })
        pdf2 = pd.DataFrame({"A": [1, 1, 2, 2]})
        psdf1 = ps.from_pandas(pdf1)
        psdf2 = ps.from_pandas(pdf2)

        for as_index in [True, False]:
            if as_index:
                sort = lambda df: df.sort_index()
            else:
                sort = lambda df: df.sort_values(list(df.columns)).reset_index(drop=True)

            with self.subTest(as_index=as_index):
                self.assert_eq(
                    sort(psdf1.groupby(psdf2.A, as_index=as_index).sum()),
                    sort(pdf1.groupby(pdf2.A, as_index=as_index).sum()),
                )
                self.assert_eq(
                    sort(psdf1.groupby(psdf2.A, as_index=as_index).B.sum()),
                    sort(pdf1.groupby(pdf2.A, as_index=as_index).B.sum()),
                )
                self.assert_eq(
                    sort(psdf1.groupby([psdf1.C, psdf2.A], as_index=as_index).sum()),
                    sort(pdf1.groupby([pdf1.C, pdf2.A], as_index=as_index).sum()),
                )
                self.assert_eq(
                    sort(psdf1.groupby([psdf1.C + 1, psdf2.A], as_index=as_index).sum()),
                    sort(pdf1.groupby([pdf1.C + 1, pdf2.A], as_index=as_index).sum()),
                )

        self.assert_eq(
            psdf1.B.groupby(psdf2.A).sum().sort_index(),
            pdf1.B.groupby(pdf2.A).sum().sort_index(),
        )
        self.assert_eq(
            (psdf1.B + 1).groupby(psdf2.A).sum().sort_index(),
            (pdf1.B + 1).groupby(pdf2.A).sum().sort_index(),
        )

        self.assert_eq(
            psdf1.B.groupby(psdf2.A.rename()).sum().sort_index(),
            pdf1.B.groupby(pdf2.A.rename()).sum().sort_index(),
        )
        self.assert_eq(
            psdf1.B.rename().groupby(psdf2.A).sum().sort_index(),
            pdf1.B.rename().groupby(pdf2.A).sum().sort_index(),
        )
        self.assert_eq(
            psdf1.B.rename().groupby(psdf2.A.rename()).sum().sort_index(),
            pdf1.B.rename().groupby(pdf2.A.rename()).sum().sort_index(),
        )
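
Grouping one frame by a key Series that lives in a different frame is only allowed when the option 'compute.ops_on_diff_frames' is enabled, which the enclosing test class presumably takes care of; a minimal sketch of the option in use:

        with option_context("compute.ops_on_diff_frames", True):
            psdf1.groupby(psdf2.A).sum()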
Example No. 8
    def test_axis_on_dataframe(self):
        # The row count is intentionally large: when the data has fewer rows
        # than 'compute.shortcut_limit', a shortcut runs the operation directly
        # on the collected pandas DataFrame. Here we set 'compute.shortcut_limit'
        # to 1000 explicitly so the distributed path is exercised.
        with option_context("compute.shortcut_limit", 1000):
            pdf = pd.DataFrame(
                {
                    "A": [1, -2, 3, -4, 5] * 300,
                    "B": [1.0, -2, 3, -4, 5] * 300,
                    "C": [-6.0, -7, -8, -9, 10] * 300,
                    "D": [True, False, True, False, False] * 300,
                },
                index=range(10, 15001, 10),
            )
            kdf = ps.from_pandas(pdf)
            self.assert_eq(kdf.count(axis=1), pdf.count(axis=1))
            self.assert_eq(kdf.var(axis=1), pdf.var(axis=1))
            self.assert_eq(kdf.var(axis=1, ddof=0), pdf.var(axis=1, ddof=0))
            self.assert_eq(kdf.std(axis=1), pdf.std(axis=1))
            self.assert_eq(kdf.std(axis=1, ddof=0), pdf.std(axis=1, ddof=0))
            self.assert_eq(kdf.max(axis=1), pdf.max(axis=1))
            self.assert_eq(kdf.min(axis=1), pdf.min(axis=1))
            self.assert_eq(kdf.sum(axis=1), pdf.sum(axis=1))
            self.assert_eq(kdf.product(axis=1), pdf.product(axis=1))
            self.assert_eq(kdf.kurtosis(axis=1), pdf.kurtosis(axis=1))
            self.assert_eq(kdf.skew(axis=1), pdf.skew(axis=1))
            self.assert_eq(kdf.mean(axis=1), pdf.mean(axis=1))
            self.assert_eq(kdf.sem(axis=1), pdf.sem(axis=1))
            self.assert_eq(kdf.sem(axis=1, ddof=0), pdf.sem(axis=1, ddof=0))

            self.assert_eq(kdf.count(axis=1, numeric_only=True),
                           pdf.count(axis=1, numeric_only=True))
            self.assert_eq(kdf.var(axis=1, numeric_only=True),
                           pdf.var(axis=1, numeric_only=True))
            self.assert_eq(
                kdf.var(axis=1, ddof=0, numeric_only=True),
                pdf.var(axis=1, ddof=0, numeric_only=True),
            )
            self.assert_eq(kdf.std(axis=1, numeric_only=True),
                           pdf.std(axis=1, numeric_only=True))
            self.assert_eq(
                kdf.std(axis=1, ddof=0, numeric_only=True),
                pdf.std(axis=1, ddof=0, numeric_only=True),
            )
            self.assert_eq(kdf.max(axis=1, numeric_only=True),
                           pdf.max(axis=1, numeric_only=True).astype(float))
            self.assert_eq(kdf.min(axis=1, numeric_only=True),
                           pdf.min(axis=1, numeric_only=True).astype(float))
            self.assert_eq(kdf.sum(axis=1, numeric_only=True),
                           pdf.sum(axis=1, numeric_only=True).astype(float))
            self.assert_eq(
                kdf.product(axis=1, numeric_only=True),
                pdf.product(axis=1, numeric_only=True).astype(float),
            )
            self.assert_eq(kdf.kurtosis(axis=1, numeric_only=True),
                           pdf.kurtosis(axis=1, numeric_only=True))
            self.assert_eq(kdf.skew(axis=1, numeric_only=True),
                           pdf.skew(axis=1, numeric_only=True))
            self.assert_eq(kdf.mean(axis=1, numeric_only=True),
                           pdf.mean(axis=1, numeric_only=True))
            self.assert_eq(kdf.sem(axis=1, numeric_only=True),
                           pdf.sem(axis=1, numeric_only=True))
            self.assert_eq(
                kdf.sem(axis=1, ddof=0, numeric_only=True),
                pdf.sem(axis=1, ddof=0, numeric_only=True),
            )
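
The same option can also be set and restored explicitly instead of through a context manager; a small sketch using the public option API:

    ps.set_option("compute.shortcut_limit", 1000)
    assert ps.get_option("compute.shortcut_limit") == 1000
    ps.reset_option("compute.shortcut_limit")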
Example No. 9
    def test_stats_on_non_numeric_columns_should_be_discarded_if_numeric_only_is_true(
            self):
        pdf = pd.DataFrame({
            "i": [0, 1, 2],
            "b": [False, False, True],
            "s": ["x", "y", "z"]
        })
        kdf = ps.from_pandas(pdf)

        self.assert_eq(kdf[["i", "s"]].max(numeric_only=True),
                       pdf[["i", "s"]].max(numeric_only=True))
        self.assert_eq(kdf[["b", "s"]].max(numeric_only=True),
                       pdf[["b", "s"]].max(numeric_only=True))
        self.assert_eq(kdf[["i", "s"]].min(numeric_only=True),
                       pdf[["i", "s"]].min(numeric_only=True))
        self.assert_eq(kdf[["b", "s"]].min(numeric_only=True),
                       pdf[["b", "s"]].min(numeric_only=True))
        self.assert_eq(kdf.count(numeric_only=True),
                       pdf.count(numeric_only=True))

        if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
            self.assert_eq(kdf.sum(numeric_only=True),
                           pdf.sum(numeric_only=True))
            self.assert_eq(kdf.product(numeric_only=True),
                           pdf.product(numeric_only=True))
        else:
            self.assert_eq(kdf.sum(numeric_only=True),
                           pdf.sum(numeric_only=True).astype(int))
            self.assert_eq(kdf.product(numeric_only=True),
                           pdf.product(numeric_only=True).astype(int))

        self.assert_eq(kdf.mean(numeric_only=True),
                       pdf.mean(numeric_only=True))

        self.assert_eq(kdf.var(numeric_only=True),
                       pdf.var(numeric_only=True),
                       check_exact=False)
        self.assert_eq(
            kdf.var(ddof=0, numeric_only=True),
            pdf.var(ddof=0, numeric_only=True),
            check_exact=False,
        )
        self.assert_eq(kdf.std(numeric_only=True),
                       pdf.std(numeric_only=True),
                       check_exact=False)
        self.assert_eq(
            kdf.std(ddof=0, numeric_only=True),
            pdf.std(ddof=0, numeric_only=True),
            check_exact=False,
        )
        self.assert_eq(kdf.sem(numeric_only=True),
                       pdf.sem(numeric_only=True),
                       check_exact=False)
        self.assert_eq(
            kdf.sem(ddof=0, numeric_only=True),
            pdf.sem(ddof=0, numeric_only=True),
            check_exact=False,
        )

        self.assert_eq(len(kdf.median(numeric_only=True)),
                       len(pdf.median(numeric_only=True)))
        self.assert_eq(len(kdf.kurtosis(numeric_only=True)),
                       len(pdf.kurtosis(numeric_only=True)))
        self.assert_eq(len(kdf.skew(numeric_only=True)),
                       len(pdf.skew(numeric_only=True)))

        self.assert_eq(len(kdf.quantile(q=0.5, numeric_only=True)),
                       len(pdf.quantile(q=0.5, numeric_only=True)))
        self.assert_eq(
            len(kdf.quantile(q=[0.25, 0.5, 0.75], numeric_only=True)),
            len(pdf.quantile(q=[0.25, 0.5, 0.75], numeric_only=True)),
        )
Example No. 10
    def test_from_to_pandas(self):
        data = [True, True, False, None]
        pser = pd.Series(data, dtype="boolean")
        psser = ps.Series(data, dtype="boolean")
        self.check_extension(pser, psser.to_pandas())
        self.check_extension(ps.from_pandas(pser), psser)
Example No. 11
    def decimal_withnan_psser(self):
        return ps.from_pandas(self.decimal_withnan_pser)
Example No. 12
    def integral_psdf(self):
        return ps.from_pandas(self.integral_pdf)
Example No. 13
    def numeric_psdf(self):
        return ps.from_pandas(self.numeric_pdf)
Example No. 14
    def test_get_dummies_decimal(self):
        pdf = pd.DataFrame({"d": [Decimal(1.0), Decimal(2.0), Decimal(1)]})
        kdf = ps.from_pandas(pdf)

        self.assert_eq(ps.get_dummies(kdf), pd.get_dummies(pdf, dtype=np.int8))
        self.assert_eq(ps.get_dummies(kdf.d), pd.get_dummies(pdf.d, dtype=np.int8), almost=True)
Example No. 15
    def non_numeric_array_pssers(self):
        pssers = {}

        for k, v in self.non_numeric_array_psers.items():
            pssers[k] = ps.from_pandas(v)
        return pssers
Example No. 16
    def kdf1(self):
        return ps.from_pandas(self.pdf1)
Example No. 17
    def test_from_to_pandas(self):
        for pser, psser in self.numeric_pser_psser_pairs:
            self.assert_eq(pser, psser.to_pandas())
            self.assert_eq(ps.from_pandas(pser), psser)
Example No. 18
    def transform_batch(
        self, func: Callable[..., Union[pd.DataFrame, pd.Series]], *args: Any, **kwargs: Any
    ) -> DataFrameOrSeries:
        """
        Transform chunks with a function that takes a pandas DataFrame and outputs a pandas
        DataFrame. The pandas DataFrame given to the function is a batch used internally. The
        lengths of each input batch and its output must be the same.

        See also `Transform and apply a function
        <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

        .. note:: `func` cannot access the whole input frame. pandas-on-Spark
            internally splits the input into multiple batches and calls `func` on each
            batch, possibly multiple times. Therefore, operations such as global aggregations
            are impossible. See the example below.

            >>> # This case does not return the length of whole frame but of the batch internally
            ... # used.
            ... def length(pdf) -> ps.DataFrame[int]:
            ...     return pd.DataFrame([len(pdf)] * len(pdf))
            ...
            >>> df = ps.DataFrame({'A': range(1000)})
            >>> df.pandas_on_spark.transform_batch(length)  # doctest: +SKIP
                c0
            0   83
            1   83
            2   83
            ...

        .. note:: this API executes the function once to infer the type, which is
            potentially expensive, for instance, when the dataset is created after
            aggregations or sorting.

            To avoid this, specify return type in ``func``, for instance, as below:

            >>> def plus_one(x) -> ps.DataFrame[int, [float, float]]:
            ...     return x + 1

            If the return type is specified, the output column names become
            `c0, c1, c2 ... cn`. These names are positionally mapped to the returned
            DataFrame in ``func``.

            To specify the column names, you can assign them in a NumPy compound type style
            as below:

            >>> def plus_one(x) -> ps.DataFrame[("index", int), [("a", float), ("b", float)]]:
            ...     return x + 1

            >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
            >>> def plus_one(x) -> ps.DataFrame[
            ...         (pdf.index.name, pdf.index.dtype), zip(pdf.dtypes, pdf.columns)]:
            ...     return x + 1

        Parameters
        ----------
        func : function
            Function to transform each pandas frame.
        *args
            Positional arguments to pass to func.
        **kwargs
            Keyword arguments to pass to func.

        Returns
        -------
        DataFrame or Series

        See Also
        --------
        DataFrame.pandas_on_spark.apply_batch: For row/columnwise operations.
        Series.pandas_on_spark.transform_batch: Transform each pandas batch of a Series.

        Examples
        --------
        >>> df = ps.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
        >>> df
           A  B
        0  1  2
        1  3  4
        2  5  6

        >>> def plus_one_func(pdf) -> ps.DataFrame[int, [int, int]]:
        ...     return pdf + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)
           c0  c1
        0   2   3
        1   4   5
        2   6   7

        >>> def plus_one_func(pdf) -> ps.DataFrame[("index", int), [('A', int), ('B', int)]]:
        ...     return pdf + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)  # doctest: +NORMALIZE_WHITESPACE
               A  B
        index
        0      2  3
        1      4  5
        2      6  7

        >>> def plus_one_func(pdf) -> ps.Series[int]:
        ...     return pdf.B + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)
        0    3
        1    5
        2    7
        dtype: int64

        You can also omit the type hints so pandas-on-Spark infers the return schema as below:

        >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf + 1)
           A  B
        0  2  3
        1  4  5
        2  6  7

        >>> (df * -1).pandas_on_spark.transform_batch(abs)
           A  B
        0  1  2
        1  3  4
        2  5  6

        Note that the function should not transform the index; the index information will
        not change.

        >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf.B + 1)
        0    3
        1    5
        2    7
        Name: B, dtype: int64

        You can also specify extra arguments as below.

        >>> df.pandas_on_spark.transform_batch(lambda pdf, a, b, c: pdf.B + a + b + c, 1, 2, c=3)
        0     8
        1    10
        2    12
        Name: B, dtype: int64
        """
        from pyspark.pandas.groupby import GroupBy
        from pyspark.pandas.frame import DataFrame
        from pyspark.pandas.series import first_series
        from pyspark import pandas as ps

        assert callable(func), "the first argument should be a callable function."
        spec = inspect.getfullargspec(func)
        return_sig = spec.annotations.get("return", None)
        should_infer_schema = return_sig is None
        should_retain_index = should_infer_schema
        original_func = func
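        # Bind the extra positional/keyword arguments up front so the helpers
        # below only need to deal with a single-DataFrame callable.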
        func = lambda o: original_func(o, *args, **kwargs)

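        # Used when `func` returns a Series: wrap the result in a one-column
        # frame so the pandas UDF machinery always receives a DataFrame.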
        def apply_func(pdf: pd.DataFrame) -> pd.DataFrame:
            return func(pdf).to_frame()

        def pandas_series_func(
            f: Callable[[pd.DataFrame], pd.DataFrame], return_type: DataType
        ) -> "UserDefinedFunctionLike":
            ff = f

            @pandas_udf(returnType=return_type)  # type: ignore[call-overload]
            def udf(pdf: pd.DataFrame) -> pd.Series:
                return first_series(ff(pdf))

            return udf

        if should_infer_schema:
            # Here we execute the function on the first `limit + 1` rows to infer the return
            # type. If there are no more than `limit` rows, the pandas result is used directly
            # as a shortcut (only when a frame is returned; see below).
            log_advice(
                "If the type hints is not specified for `transform_batch`, "
                "it is expensive to infer the data type internally."
            )
            limit = ps.get_option("compute.shortcut_limit")
            pdf = self._psdf.head(limit + 1)._to_internal_pandas()
            transformed = func(pdf)
            if not isinstance(transformed, (pd.DataFrame, pd.Series)):
                raise ValueError(
                    "The given function should return a frame; however, "
                    "the return type was %s." % type(transformed)
                )
            if len(transformed) != len(pdf):
                raise ValueError("transform_batch cannot produce aggregated results")
            psdf_or_psser = ps.from_pandas(transformed)

            if isinstance(psdf_or_psser, ps.Series):
                psser = cast(ps.Series, psdf_or_psser)

                field = psser._internal.data_fields[0].normalize_spark_type()

                return_schema = StructType([field.struct_field])
                output_func = GroupBy._make_pandas_df_builder_func(
                    self._psdf, apply_func, return_schema, retain_index=False
                )

                pudf = pandas_series_func(output_func, return_type=field.spark_type)
                columns = self._psdf._internal.spark_columns
                # TODO: Index will be lost in this case.
                internal = self._psdf._internal.copy(
                    column_labels=psser._internal.column_labels,
                    data_spark_columns=[pudf(F.struct(*columns)).alias(field.name)],
                    data_fields=[field],
                    column_label_names=psser._internal.column_label_names,
                )
                return first_series(DataFrame(internal))
            else:
                psdf = cast(DataFrame, psdf_or_psser)
                if len(pdf) <= limit:
                    # only do the short cut when it returns a frame to avoid
                    # operations on different dataframes in case of series.
                    return psdf

                index_fields = [
                    field.normalize_spark_type() for field in psdf._internal.index_fields
                ]
                data_fields = [field.normalize_spark_type() for field in psdf._internal.data_fields]

                return_schema = StructType(
                    [field.struct_field for field in index_fields + data_fields]
                )

                self_applied: DataFrame = DataFrame(self._psdf._internal.resolved_copy)

                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=True  # type: ignore[arg-type]
                )
                columns = self_applied._internal.spark_columns

                pudf = pandas_udf(  # type: ignore[call-overload]
                    output_func, returnType=return_schema
                )
                temp_struct_column = verify_temp_column_name(
                    self_applied._internal.spark_frame, "__temp_struct__"
                )
                applied = pudf(F.struct(*columns)).alias(temp_struct_column)
                sdf = self_applied._internal.spark_frame.select(applied)
                sdf = sdf.selectExpr("%s.*" % temp_struct_column)

                return DataFrame(
                    psdf._internal.with_new_sdf(
                        spark_frame=sdf, index_fields=index_fields, data_fields=data_fields
                    )
                )
        else:
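            # Type hints were specified: build the return schema from the
            # annotation instead of sampling the data.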
            return_type = infer_return_type(original_func)
            is_return_series = isinstance(return_type, SeriesType)
            is_return_dataframe = isinstance(return_type, DataFrameType)
            if not is_return_dataframe and not is_return_series:
                raise TypeError(
                    "The given function should specify a frame or series as its type "
                    "hints; however, the return type was %s." % return_sig
                )
            if is_return_series:
                field = InternalField(
                    dtype=cast(SeriesType, return_type).dtype,
                    struct_field=StructField(
                        name=SPARK_DEFAULT_SERIES_NAME,
                        dataType=cast(SeriesType, return_type).spark_type,
                    ),
                ).normalize_spark_type()

                return_schema = StructType([field.struct_field])
                output_func = GroupBy._make_pandas_df_builder_func(
                    self._psdf, apply_func, return_schema, retain_index=False
                )

                pudf = pandas_series_func(output_func, return_type=field.spark_type)
                columns = self._psdf._internal.spark_columns
                internal = self._psdf._internal.copy(
                    column_labels=[None],
                    data_spark_columns=[pudf(F.struct(*columns)).alias(field.name)],
                    data_fields=[field],
                    column_label_names=None,
                )
                return first_series(DataFrame(internal))
            else:
                index_fields = cast(DataFrameType, return_type).index_fields
                index_fields = [index_field.normalize_spark_type() for index_field in index_fields]
                data_fields = [
                    field.normalize_spark_type()
                    for field in cast(DataFrameType, return_type).data_fields
                ]
                normalized_fields = index_fields + data_fields
                return_schema = StructType([field.struct_field for field in normalized_fields])
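                # Retain the index only when the annotation declares index fields.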
                should_retain_index = len(index_fields) > 0

                self_applied = DataFrame(self._psdf._internal.resolved_copy)

                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=should_retain_index  # type: ignore[arg-type]
                )
                columns = self_applied._internal.spark_columns

                pudf = pandas_udf(  # type: ignore[call-overload]
                    output_func, returnType=return_schema
                )
                temp_struct_column = verify_temp_column_name(
                    self_applied._internal.spark_frame, "__temp_struct__"
                )
                applied = pudf(F.struct(*columns)).alias(temp_struct_column)
                sdf = self_applied._internal.spark_frame.select(applied)
                sdf = sdf.selectExpr("%s.*" % temp_struct_column)

                index_spark_columns = None
                index_names: Optional[List[Optional[Tuple[Any, ...]]]] = None

                if should_retain_index:
                    index_spark_columns = [
                        scol_for(sdf, index_field.struct_field.name) for index_field in index_fields
                    ]

                    if not any(
                        [
                            SPARK_INDEX_NAME_PATTERN.match(index_field.struct_field.name)
                            for index_field in index_fields
                        ]
                    ):
                        index_names = [
                            (index_field.struct_field.name,) for index_field in index_fields
                        ]
                internal = InternalFrame(
                    spark_frame=sdf,
                    index_names=index_names,
                    index_spark_columns=index_spark_columns,
                    index_fields=index_fields,
                    data_fields=data_fields,
                )
                return DataFrame(internal)
Example No. 19
    def test_from_to_pandas(self):
        data = pd.date_range("1994-1-31 10:30:15", periods=3, freq="M")
        pser = pd.Series(data)
        psser = ps.Series(data)
        self.assert_eq(pser, psser.to_pandas())
        self.assert_eq(ps.from_pandas(pser), psser)
Example No. 20
    def test_from_to_pandas(self):
        data = [b"1", b"2", b"3"]
        pser = pd.Series(data)
        psser = ps.Series(data)
        self.assert_eq(pser, psser.to_pandas())
        self.assert_eq(ps.from_pandas(pser), psser)
Example No. 21
    def test_aggregate(self):
        pdf1 = pd.DataFrame({
            "C": [0.362, 0.227, 1.267, -0.562],
            "B": [1, 2, 3, 4]
        })
        pdf2 = pd.DataFrame({"A": [1, 1, 2, 2]})
        psdf1 = ps.from_pandas(pdf1)
        psdf2 = ps.from_pandas(pdf2)

        for as_index in [True, False]:
            if as_index:
                sort = lambda df: df.sort_index()
            else:
                sort = lambda df: df.sort_values(list(df.columns)).reset_index(drop=True)

            with self.subTest(as_index=as_index):
                self.assert_eq(
                    sort(psdf1.groupby(psdf2.A, as_index=as_index).agg("sum")),
                    sort(pdf1.groupby(pdf2.A, as_index=as_index).agg("sum")),
                )
                self.assert_eq(
                    sort(psdf1.groupby(psdf2.A, as_index=as_index).agg({"B": "min", "C": "sum"})),
                    sort(pdf1.groupby(pdf2.A, as_index=as_index).agg({"B": "min", "C": "sum"})),
                )
                self.assert_eq(
                    sort(
                        psdf1.groupby(psdf2.A, as_index=as_index).agg(
                            {"B": ["min", "max"], "C": "sum"}
                        )
                    ),
                    sort(
                        pdf1.groupby(pdf2.A, as_index=as_index).agg(
                            {"B": ["min", "max"], "C": "sum"}
                        )
                    ),
                )
                self.assert_eq(
                    sort(psdf1.groupby([psdf1.C, psdf2.A], as_index=as_index).agg("sum")),
                    sort(pdf1.groupby([pdf1.C, pdf2.A], as_index=as_index).agg("sum")),
                )
                self.assert_eq(
                    sort(psdf1.groupby([psdf1.C + 1, psdf2.A], as_index=as_index).agg("sum")),
                    sort(pdf1.groupby([pdf1.C + 1, pdf2.A], as_index=as_index).agg("sum")),
                )

        # multi-index columns
        columns = pd.MultiIndex.from_tuples([("Y", "C"), ("X", "B")])
        pdf1.columns = columns
        psdf1.columns = columns

        columns = pd.MultiIndex.from_tuples([("X", "A")])
        pdf2.columns = columns
        psdf2.columns = columns

        for as_index in [True, False]:
            stats_psdf = psdf1.groupby(psdf2[("X", "A")], as_index=as_index).agg(
                {("X", "B"): "min", ("Y", "C"): "sum"}
            )
            stats_pdf = pdf1.groupby(pdf2[("X", "A")], as_index=as_index).agg(
                {("X", "B"): "min", ("Y", "C"): "sum"}
            )
            self.assert_eq(
                stats_psdf.sort_values(by=[("X", "B"), ("Y", "C")]).reset_index(drop=True),
                stats_pdf.sort_values(by=[("X", "B"), ("Y", "C")]).reset_index(drop=True),
            )

        stats_psdf = psdf1.groupby(psdf2[("X", "A")]).agg(
            {("X", "B"): ["min", "max"], ("Y", "C"): "sum"}
        )
        stats_pdf = pdf1.groupby(pdf2[("X", "A")]).agg(
            {("X", "B"): ["min", "max"], ("Y", "C"): "sum"}
        )
        self.assert_eq(
            stats_psdf.sort_values(
                by=[("X", "B", "min"), ("X", "B", "max"), ("Y", "C", "sum")]
            ).reset_index(drop=True),
            stats_pdf.sort_values(
                by=[("X", "B", "min"), ("X", "B", "max"), ("Y", "C", "sum")]
            ).reset_index(drop=True),
        )
Example No. 22
    def test_ror(self):
        pser = pd.Series([True, False, None], dtype="bool")
        psser = ps.from_pandas(pser)
        self.assert_eq(True | pser, True | psser)
        self.assert_eq(False | pser, False | psser)
Example No. 23
    def test_from_to_pandas(self):
        data = [1, "x", "y"]
        pser = pd.Series(data, dtype="category")
        psser = ps.Series(data, dtype="category")
        self.assert_eq(pser, psser.to_pandas())
        self.assert_eq(ps.from_pandas(pser), psser)
Example No. 24
    def other_psser(self):
        return ps.from_pandas(self.other_pser)
Example No. 25
    def test_from_to_pandas(self):
        data = ["x", "y", "z"]
        pser = pd.Series(data)
        psser = ps.Series(data)
        self.assert_eq(pser, psser.to_pandas())
        self.assert_eq(ps.from_pandas(pser), psser)
Example No. 26
    def test_from_to_pandas(self):
        data = [True, True, False]
        pser = pd.Series(data)
        psser = ps.Series(data)
        self.assert_eq(pser, psser.to_pandas())
        self.assert_eq(ps.from_pandas(pser), psser)
Example No. 27
    def _test_groupby_rolling_func(self, f):
        pser = pd.Series([1, 2, 3, 2], index=np.random.rand(4), name="a")
        kser = ps.from_pandas(pser)
        self.assert_eq(
            getattr(kser.groupby(kser).rolling(2), f)().sort_index(),
            getattr(pser.groupby(pser).rolling(2), f)().sort_index(),
        )
        self.assert_eq(
            getattr(kser.groupby(kser).rolling(2), f)().sum(),
            getattr(pser.groupby(pser).rolling(2), f)().sum(),
        )

        # Multiindex
        pser = pd.Series(
            [1, 2, 3, 2],
            index=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z"), ("c", "z")]),
            name="a",
        )
        kser = ps.from_pandas(pser)
        self.assert_eq(
            getattr(kser.groupby(kser).rolling(2), f)().sort_index(),
            getattr(pser.groupby(pser).rolling(2), f)().sort_index(),
        )

        pdf = pd.DataFrame({
            "a": [1.0, 2.0, 3.0, 2.0],
            "b": [4.0, 2.0, 3.0, 1.0]
        })
        kdf = ps.from_pandas(pdf)
        self.assert_eq(
            getattr(kdf.groupby(kdf.a).rolling(2), f)().sort_index(),
            getattr(pdf.groupby(pdf.a).rolling(2), f)().sort_index(),
        )
        self.assert_eq(
            getattr(kdf.groupby(kdf.a).rolling(2), f)().sum(),
            getattr(pdf.groupby(pdf.a).rolling(2), f)().sum(),
        )
        self.assert_eq(
            getattr(kdf.groupby(kdf.a + 1).rolling(2), f)().sort_index(),
            getattr(pdf.groupby(pdf.a + 1).rolling(2), f)().sort_index(),
        )
        self.assert_eq(
            getattr(kdf.b.groupby(kdf.a).rolling(2), f)().sort_index(),
            getattr(pdf.b.groupby(pdf.a).rolling(2), f)().sort_index(),
        )
        self.assert_eq(
            getattr(kdf.groupby(kdf.a)["b"].rolling(2), f)().sort_index(),
            getattr(pdf.groupby(pdf.a)["b"].rolling(2), f)().sort_index(),
        )
        self.assert_eq(
            getattr(kdf.groupby(kdf.a)[["b"]].rolling(2), f)().sort_index(),
            getattr(pdf.groupby(pdf.a)[["b"]].rolling(2), f)().sort_index(),
        )

        # Multiindex column
        columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
        pdf.columns = columns
        kdf.columns = columns
        self.assert_eq(
            getattr(kdf.groupby(("a", "x")).rolling(2), f)().sort_index(),
            getattr(pdf.groupby(("a", "x")).rolling(2), f)().sort_index(),
        )

        self.assert_eq(
            getattr(kdf.groupby([("a", "x"), ("a", "y")]).rolling(2), f)().sort_index(),
            getattr(pdf.groupby([("a", "x"), ("a", "y")]).rolling(2), f)().sort_index(),
        )
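
As with the expanding helper in Example No. 4, this rolling helper is presumably invoked by small per-aggregation tests; a hypothetical caller:

    def test_groupby_rolling_count(self):
        self._test_groupby_rolling_func("count")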
Example No. 28
    def numeric_array_pssers(self):
        return [ps.from_pandas(pser) for pser in self.numeric_array_psers]
Example No. 29
    def psdf(self):
        return ps.from_pandas(self.pdf)
Example No. 30
    def test_to_datetime(self):
        pdf = pd.DataFrame({
            "year": [2015, 2016],
            "month": [2, 3],
            "day": [4, 5]
        })
        psdf = ps.from_pandas(pdf)
        dict_from_pdf = pdf.to_dict()

        self.assert_eq(pd.to_datetime(pdf), ps.to_datetime(psdf))
        self.assert_eq(pd.to_datetime(dict_from_pdf),
                       ps.to_datetime(dict_from_pdf))

        self.assert_eq(pd.to_datetime(1490195805, unit="s"),
                       ps.to_datetime(1490195805, unit="s"))
        self.assert_eq(
            pd.to_datetime(1490195805433502912, unit="ns"),
            ps.to_datetime(1490195805433502912, unit="ns"),
        )

        self.assert_eq(
            pd.to_datetime([1, 2, 3],
                           unit="D",
                           origin=pd.Timestamp("1960-01-01")),
            ps.to_datetime([1, 2, 3],
                           unit="D",
                           origin=pd.Timestamp("1960-01-01")),
        )

        pdf = pd.DataFrame({
            "years": [2015, 2016],
            "month": [2, 3],
            "day": [4, 5]
        })
        psdf = ps.from_pandas(pdf)
        dict_from_pdf = pdf.to_dict()

        self.assert_eq(pd.to_datetime(pdf), ps.to_datetime(psdf))
        self.assert_eq(pd.to_datetime(dict_from_pdf),
                       ps.to_datetime(dict_from_pdf))

        pdf = pd.DataFrame({
            "years": [2015, 2016],
            "months": [2, 3],
            "day": [4, 5]
        })
        psdf = ps.from_pandas(pdf)
        dict_from_pdf = pdf.to_dict()

        self.assert_eq(pd.to_datetime(pdf), ps.to_datetime(psdf))
        self.assert_eq(pd.to_datetime(dict_from_pdf),
                       ps.to_datetime(dict_from_pdf))

        pdf = pd.DataFrame({
            "years": [2015, 2016],
            "months": [2, 3],
            "days": [4, 5]
        })
        psdf = ps.from_pandas(pdf)
        dict_from_pdf = pdf.to_dict()

        self.assert_eq(pd.to_datetime(pdf), ps.to_datetime(psdf))
        self.assert_eq(pd.to_datetime(dict_from_pdf),
                       ps.to_datetime(dict_from_pdf))

        # SPARK-36946: Support time for ps.to_datetime
        pdf = pd.DataFrame({
            "year": [2015, 2016],
            "month": [2, 3],
            "day": [4, 5],
            "hour": [2, 3],
            "minute": [10, 30],
            "second": [21, 25],
        })
        psdf = ps.from_pandas(pdf)
        dict_from_pdf = pdf.to_dict()

        self.assert_eq(pd.to_datetime(pdf), ps.to_datetime(psdf))
        self.assert_eq(pd.to_datetime(dict_from_pdf),
                       ps.to_datetime(dict_from_pdf))

        pdf = pd.DataFrame({
            "year": [2015, 2016],
            "month": [2, 3],
            "day": [4, 5],
            "hour": [2, 3],
            "minute": [10, 30],
        })
        psdf = ps.from_pandas(pdf)
        dict_from_pdf = pdf.to_dict()

        self.assert_eq(pd.to_datetime(pdf), ps.to_datetime(psdf))
        self.assert_eq(pd.to_datetime(dict_from_pdf),
                       ps.to_datetime(dict_from_pdf))

        pdf = pd.DataFrame({
            "year": [2015, 2016],
            "month": [2, 3],
            "day": [4, 5],
            "hour": [2, 3]
        })
        psdf = ps.from_pandas(pdf)
        dict_from_pdf = pdf.to_dict()

        self.assert_eq(pd.to_datetime(pdf), ps.to_datetime(psdf))
        self.assert_eq(pd.to_datetime(dict_from_pdf),
                       ps.to_datetime(dict_from_pdf))

        pdf = pd.DataFrame({
            "year": [2015, 2016],
            "month": [2, 3],
            "day": [4, 5],
            "hour": [2, 3],
            "minute": [10, 30],
            "second": [21, 25],
            "ms": [50, 69],
        })
        psdf = ps.from_pandas(pdf)
        dict_from_pdf = pdf.to_dict()

        self.assert_eq(pd.to_datetime(pdf), ps.to_datetime(psdf))
        self.assert_eq(pd.to_datetime(dict_from_pdf),
                       ps.to_datetime(dict_from_pdf))

        pdf = pd.DataFrame({
            "year": [2015, 2016],
            "month": [2, 3],
            "day": [4, 5],
            "hour": [2, 3],
            "minute": [10, 30],
            "second": [21, 25],
            "ms": [50, 69],
            "millisecond": [123, 678],
        })
        psdf = ps.from_pandas(pdf)
        dict_from_pdf = pdf.to_dict()

        self.assert_eq(pd.to_datetime(pdf), ps.to_datetime(psdf))
        self.assert_eq(pd.to_datetime(dict_from_pdf),
                       ps.to_datetime(dict_from_pdf))

        pdf = pd.DataFrame({
            "Year": [2015, 2016],
            "Month": [2, 3],
            "Day": [4, 5],
            "Hour": [2, 3],
            "Minute": [10, 30],
            "Second": [21, 25],
            "ms": [50, 69],
            "millisecond": [123, 678],
        })
        psdf = ps.from_pandas(pdf)
        dict_from_pdf = pdf.to_dict()

        self.assert_eq(pd.to_datetime(pdf), ps.to_datetime(psdf))
        self.assert_eq(pd.to_datetime(dict_from_pdf),
                       ps.to_datetime(dict_from_pdf))
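
Besides assembling datetimes from component columns, ps.to_datetime also accepts a pandas-on-Spark Series of date strings; a brief sketch:

    psser = ps.Series(["2015-02-04", "2016-03-05"])
    ps.to_datetime(psser)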