Example #1
    def test_infer_schema_with_names_pandas_instances(self):
        def func() -> 'pd.DataFrame["a" : np.float, "b":str]':  # noqa: F405
            pass

        expected = StructType([StructField("a", DoubleType()), StructField("b", StringType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.float64, np.unicode_])
        self.assertEqual(inferred.spark_type, expected)

        def func() -> "pd.DataFrame['a': np.float, 'b': int]":  # noqa: F405
            pass

        expected = StructType([StructField("a", DoubleType()), StructField("b", LongType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.float64, np.int64])
        self.assertEqual(inferred.spark_type, expected)

        pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})

        def func() -> pd.DataFrame[zip(pdf.columns, pdf.dtypes)]:
            pass

        expected = StructType([StructField("a", LongType()), StructField("b", LongType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.int64, np.int64])
        self.assertEqual(inferred.spark_type, expected)

        pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]})

        def func() -> pd.DataFrame[zip(pdf.columns, pdf.dtypes)]:
            pass

        expected = StructType(
            [StructField("(x, a)", LongType()), StructField("(y, b)", LongType())]
        )
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.int64, np.int64])
        self.assertEqual(inferred.spark_type, expected)

        pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical(["a", "b", "c"])})

        def func() -> pd.DataFrame[zip(pdf.columns, pdf.dtypes)]:
            pass

        expected = StructType([StructField("a", LongType()), StructField("b", LongType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.int64, CategoricalDtype(categories=["a", "b", "c"])])
        self.assertEqual(inferred.spark_type, expected)
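
A note on the NumPy aliases used above: np.float and np.unicode_ are deprecated aliases that recent NumPy releases have removed, so on such versions the same annotation is usually spelled with the builtin float and str. A minimal sketch of the equivalent check, assuming infer_return_type is importable from pyspark.pandas.typedef as in this test module:

import pandas as pd
from pyspark.pandas.typedef import infer_return_type

def annotated() -> 'pd.DataFrame["a": float, "b": str]':
    pass

inferred = infer_return_type(annotated)
print(inferred.dtypes)      # expected: float64 plus a string dtype
print(inferred.spark_type)  # expected: StructType with DoubleType("a") and StringType("b")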
Example #2
    def transform_batch(
        self, func: Callable[..., pd.Series], *args: Any, **kwargs: Any
    ) -> "Series":
        """
        Transform the data with the function that takes pandas Series and outputs pandas Series.
        The pandas Series given to the function is a batch used internally.

        See also `Transform and apply a function
        <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

        .. note:: the `func` is unable to access the whole input series. pandas-on-Spark
            internally splits the input series into multiple batches and calls `func` with each
            batch multiple times. Therefore, operations such as global aggregations are impossible.
            See the example below.

            >>> # This case does not return the length of whole frame but of the batch internally
            ... # used.
            ... def length(pser) -> ps.Series[int]:
            ...     return pd.Series([len(pser)] * len(pser))
            ...
            >>> df = ps.DataFrame({'A': range(1000)})
            >>> df.A.pandas_on_spark.transform_batch(length)  # doctest: +SKIP
                c0
            0   83
            1   83
            2   83
            ...

        .. note:: this API executes the function once to infer the type which is
            potentially expensive, for instance, when the dataset is created after
            aggregations or sorting.

            To avoid this, specify return type in ``func``, for instance, as below:

            >>> def plus_one(x) -> ps.Series[int]:
            ...     return x + 1

        Parameters
        ----------
        func : function
            Function to apply to each pandas Series batch.
        *args
            Positional arguments to pass to func.
        **kwargs
            Keyword arguments to pass to func.

        Returns
        -------
        Series

        See Also
        --------
        DataFrame.pandas_on_spark.apply_batch : Similar but it takes pandas DataFrame as its
            internal batch.

        Examples
        --------
        >>> df = ps.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
        >>> df
           A  B
        0  1  2
        1  3  4
        2  5  6

        >>> def plus_one_func(pser) -> ps.Series[np.int64]:
        ...     return pser + 1
        >>> df.A.pandas_on_spark.transform_batch(plus_one_func)
        0    2
        1    4
        2    6
        Name: A, dtype: int64

        You can also omit the type hints so pandas-on-Spark infers the return schema as below:

        >>> df.A.pandas_on_spark.transform_batch(lambda pser: pser + 1)
        0    2
        1    4
        2    6
        Name: A, dtype: int64

        You can also specify extra arguments.

        >>> def plus_one_func(pser, a, b, c=3) -> ps.Series[np.int64]:
        ...     return pser + a + b + c
        >>> df.A.pandas_on_spark.transform_batch(plus_one_func, 1, b=2)
        0     7
        1     9
        2    11
        Name: A, dtype: int64

        You can also use ``np.ufunc`` and built-in functions as input.

        >>> df.A.pandas_on_spark.transform_batch(np.add, 10)
        0    11
        1    13
        2    15
        Name: A, dtype: int64

        >>> (df * -1).A.pandas_on_spark.transform_batch(abs)
        0    1
        1    3
        2    5
        Name: A, dtype: int64
        """
        assert callable(func), "the first argument should be a callable function."

        return_sig = None
        try:
            spec = inspect.getfullargspec(func)
            return_sig = spec.annotations.get("return", None)
        except TypeError:
            # Falls back to schema inference if it fails to get signature.
            pass

        return_type = None
        if return_sig is not None:
            # Extract the signature arguments from this function.
            sig_return = infer_return_type(func)
            if not isinstance(sig_return, SeriesType):
                raise ValueError(
                    "Expected the return type of this function to be of type column,"
                    " but found type {}".format(sig_return)
                )
            return_type = cast(SeriesType, sig_return)

        return self._transform_batch(lambda c: func(c, *args, **kwargs), return_type)
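
For context on the annotated fast path above: when `func` carries a return annotation, `infer_return_type` must yield a `SeriesType`, which is then handed to `_transform_batch`. A minimal sketch of that check, assuming infer_return_type and SeriesType are importable from pyspark.pandas.typedef:

import numpy as np
import pyspark.pandas as ps
from pyspark.pandas.typedef import SeriesType, infer_return_type

def plus_one(pser) -> ps.Series[np.int64]:
    return pser + 1

sig_return = infer_return_type(plus_one)
print(isinstance(sig_return, SeriesType))  # expected: True
print(sig_return.spark_type)               # expected: LongType()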
Example #3
    def transform_batch(
        self, func: Callable[..., Union[pd.DataFrame, pd.Series]], *args: Any, **kwargs: Any
    ) -> DataFrameOrSeries:
        """
        Transform chunks with a function that takes pandas DataFrame and outputs pandas DataFrame.
        The pandas DataFrame given to the function is a batch used internally. The length of
        each input and output should be the same.

        See also `Transform and apply a function
        <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

        .. note:: the `func` is unable to access the whole input frame. pandas-on-Spark
            internally splits the input frame into multiple batches and calls `func` with each
            batch multiple times. Therefore, operations such as global aggregations are impossible.
            See the example below.

            >>> # This case does not return the length of whole frame but of the batch internally
            ... # used.
            ... def length(pdf) -> ps.DataFrame[int]:
            ...     return pd.DataFrame([len(pdf)] * len(pdf))
            ...
            >>> df = ps.DataFrame({'A': range(1000)})
            >>> df.pandas_on_spark.transform_batch(length)  # doctest: +SKIP
                c0
            0   83
            1   83
            2   83
            ...

        .. note:: this API executes the function once to infer the type which is
            potentially expensive, for instance, when the dataset is created after
            aggregations or sorting.

            To avoid this, specify return type in ``func``, for instance, as below:

            >>> def plus_one(x) -> ps.DataFrame[int, [float, float]]:
            ...     return x + 1

            If the return type is specified, the output column names become
            `c0, c1, c2 ... cn`. These names are positionally mapped to the returned
            DataFrame in ``func``.

            To specify the column names, you can assign them in a NumPy compound type style
            as below:

            >>> def plus_one(x) -> ps.DataFrame[("index", int), [("a", float), ("b", float)]]:
            ...     return x + 1

            >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
            >>> def plus_one(x) -> ps.DataFrame[
            ...         (pdf.index.name, pdf.index.dtype), zip(pdf.dtypes, pdf.columns)]:
            ...     return x + 1

        Parameters
        ----------
        func : function
            Function to transform each pandas frame.
        *args
            Positional arguments to pass to func.
        **kwargs
            Keyword arguments to pass to func.

        Returns
        -------
        DataFrame or Series

        See Also
        --------
        DataFrame.pandas_on_spark.apply_batch: For row/columnwise operations.
        Series.pandas_on_spark.transform_batch: Transforms each pandas Series batch.

        Examples
        --------
        >>> df = ps.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
        >>> df
           A  B
        0  1  2
        1  3  4
        2  5  6

        >>> def plus_one_func(pdf) -> ps.DataFrame[int, [int, int]]:
        ...     return pdf + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)
           c0  c1
        0   2   3
        1   4   5
        2   6   7

        >>> def plus_one_func(pdf) -> ps.DataFrame[("index", int), [('A', int), ('B', int)]]:
        ...     return pdf + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)  # doctest: +NORMALIZE_WHITESPACE
               A  B
        index
        0      2  3
        1      4  5
        2      6  7

        >>> def plus_one_func(pdf) -> ps.Series[int]:
        ...     return pdf.B + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)
        0    3
        1    5
        2    7
        dtype: int64

        You can also omit the type hints so pandas-on-Spark infers the return schema as below:

        >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf + 1)
           A  B
        0  2  3
        1  4  5
        2  6  7

        >>> (df * -1).pandas_on_spark.transform_batch(abs)
           A  B
        0  1  2
        1  3  4
        2  5  6

        Note that you should not transform the index. The index information will not change.

        >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf.B + 1)
        0    3
        1    5
        2    7
        Name: B, dtype: int64

        You can also specify extra arguments as below.

        >>> df.pandas_on_spark.transform_batch(lambda pdf, a, b, c: pdf.B + a + b + c, 1, 2, c=3)
        0     8
        1    10
        2    12
        Name: B, dtype: int64
        """
        from pyspark.pandas.groupby import GroupBy
        from pyspark.pandas.frame import DataFrame
        from pyspark.pandas.series import first_series
        from pyspark import pandas as ps

        assert callable(func), "the first argument should be a callable function."
        spec = inspect.getfullargspec(func)
        return_sig = spec.annotations.get("return", None)
        should_infer_schema = return_sig is None
        should_retain_index = should_infer_schema
        original_func = func
        func = lambda o: original_func(o, *args, **kwargs)

        def apply_func(pdf: pd.DataFrame) -> pd.DataFrame:
            return func(pdf).to_frame()

        def pandas_series_func(
            f: Callable[[pd.DataFrame], pd.DataFrame], return_type: DataType
        ) -> "UserDefinedFunctionLike":
            ff = f

            @pandas_udf(returnType=return_type)  # type: ignore[call-overload]
            def udf(pdf: pd.DataFrame) -> pd.Series:
                return first_series(ff(pdf))

            return udf

        if should_infer_schema:
            # Here we execute with the first 1000 to get the return type.
            # If the records were less than 1000, it uses pandas API directly for a shortcut.
            log_advice(
                "If the type hints is not specified for `transform_batch`, "
                "it is expensive to infer the data type internally."
            )
            limit = ps.get_option("compute.shortcut_limit")
            pdf = self._psdf.head(limit + 1)._to_internal_pandas()
            transformed = func(pdf)
            if not isinstance(transformed, (pd.DataFrame, pd.Series)):
                raise ValueError(
                    "The given function should return a frame; however, "
                    "the return type was %s." % type(transformed)
                )
            if len(transformed) != len(pdf):
                raise ValueError("transform_batch cannot produce aggregated results")
            psdf_or_psser = ps.from_pandas(transformed)

            if isinstance(psdf_or_psser, ps.Series):
                psser = cast(ps.Series, psdf_or_psser)

                field = psser._internal.data_fields[0].normalize_spark_type()

                return_schema = StructType([field.struct_field])
                output_func = GroupBy._make_pandas_df_builder_func(
                    self._psdf, apply_func, return_schema, retain_index=False
                )

                pudf = pandas_series_func(output_func, return_type=field.spark_type)
                columns = self._psdf._internal.spark_columns
                # TODO: Index will be lost in this case.
                internal = self._psdf._internal.copy(
                    column_labels=psser._internal.column_labels,
                    data_spark_columns=[pudf(F.struct(*columns)).alias(field.name)],
                    data_fields=[field],
                    column_label_names=psser._internal.column_label_names,
                )
                return first_series(DataFrame(internal))
            else:
                psdf = cast(DataFrame, psdf_or_psser)
                if len(pdf) <= limit:
                    # only do the short cut when it returns a frame to avoid
                    # operations on different dataframes in case of series.
                    return psdf

                index_fields = [
                    field.normalize_spark_type() for field in psdf._internal.index_fields
                ]
                data_fields = [field.normalize_spark_type() for field in psdf._internal.data_fields]

                return_schema = StructType(
                    [field.struct_field for field in index_fields + data_fields]
                )

                self_applied: DataFrame = DataFrame(self._psdf._internal.resolved_copy)

                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=True  # type: ignore[arg-type]
                )
                columns = self_applied._internal.spark_columns

                pudf = pandas_udf(  # type: ignore[call-overload]
                    output_func, returnType=return_schema
                )
                temp_struct_column = verify_temp_column_name(
                    self_applied._internal.spark_frame, "__temp_struct__"
                )
                applied = pudf(F.struct(*columns)).alias(temp_struct_column)
                sdf = self_applied._internal.spark_frame.select(applied)
                sdf = sdf.selectExpr("%s.*" % temp_struct_column)

                return DataFrame(
                    psdf._internal.with_new_sdf(
                        spark_frame=sdf, index_fields=index_fields, data_fields=data_fields
                    )
                )
        else:
            return_type = infer_return_type(original_func)
            is_return_series = isinstance(return_type, SeriesType)
            is_return_dataframe = isinstance(return_type, DataFrameType)
            if not is_return_dataframe and not is_return_series:
                raise TypeError(
                    "The given function should specify a frame or series as its type "
                    "hints; however, the return type was %s." % return_sig
                )
            if is_return_series:
                field = InternalField(
                    dtype=cast(SeriesType, return_type).dtype,
                    struct_field=StructField(
                        name=SPARK_DEFAULT_SERIES_NAME,
                        dataType=cast(SeriesType, return_type).spark_type,
                    ),
                ).normalize_spark_type()

                return_schema = StructType([field.struct_field])
                output_func = GroupBy._make_pandas_df_builder_func(
                    self._psdf, apply_func, return_schema, retain_index=False
                )

                pudf = pandas_series_func(output_func, return_type=field.spark_type)
                columns = self._psdf._internal.spark_columns
                internal = self._psdf._internal.copy(
                    column_labels=[None],
                    data_spark_columns=[pudf(F.struct(*columns)).alias(field.name)],
                    data_fields=[field],
                    column_label_names=None,
                )
                return first_series(DataFrame(internal))
            else:
                index_fields = cast(DataFrameType, return_type).index_fields
                index_fields = [index_field.normalize_spark_type() for index_field in index_fields]
                data_fields = [
                    field.normalize_spark_type()
                    for field in cast(DataFrameType, return_type).data_fields
                ]
                normalized_fields = index_fields + data_fields
                return_schema = StructType([field.struct_field for field in normalized_fields])
                should_retain_index = len(index_fields) > 0

                self_applied = DataFrame(self._psdf._internal.resolved_copy)

                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=should_retain_index  # type: ignore[arg-type]
                )
                columns = self_applied._internal.spark_columns

                pudf = pandas_udf(  # type: ignore[call-overload]
                    output_func, returnType=return_schema
                )
                temp_struct_column = verify_temp_column_name(
                    self_applied._internal.spark_frame, "__temp_struct__"
                )
                applied = pudf(F.struct(*columns)).alias(temp_struct_column)
                sdf = self_applied._internal.spark_frame.select(applied)
                sdf = sdf.selectExpr("%s.*" % temp_struct_column)

                index_spark_columns = None
                index_names: Optional[List[Optional[Tuple[Any, ...]]]] = None

                if should_retain_index:
                    index_spark_columns = [
                        scol_for(sdf, index_field.struct_field.name) for index_field in index_fields
                    ]

                    if not any(
                        [
                            SPARK_INDEX_NAME_PATTERN.match(index_field.struct_field.name)
                            for index_field in index_fields
                        ]
                    ):
                        index_names = [
                            (index_field.struct_field.name,) for index_field in index_fields
                        ]
                internal = InternalFrame(
                    spark_frame=sdf,
                    index_names=index_names,
                    index_spark_columns=index_spark_columns,
                    index_fields=index_fields,
                    data_fields=data_fields,
                )
                return DataFrame(internal)
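
The pack/unpack dance around `__temp_struct__` above is a generic Spark SQL pattern: all input columns are wrapped into a single struct column so one pandas UDF can see the whole row batch, and the struct result is expanded back afterwards. A small standalone sketch of just that mechanism (the column names here are illustrative):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1, 2), (3, 4), (5, 6)], ["A", "B"])

# pack every column into one struct column, as transform_batch does before calling the UDF
packed = sdf.select(F.struct("A", "B").alias("__temp_struct__"))
# expand the struct back into top-level columns, mirroring selectExpr("%s.*" % temp_struct_column)
unpacked = packed.selectExpr("__temp_struct__.*")
unpacked.show()  # prints the original A and B columns again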
Example #4
    def apply_batch(
        self, func: Callable[..., pd.DataFrame], args: Tuple = (), **kwds: Any
    ) -> "DataFrame":
        """
        Apply a function that takes pandas DataFrame and outputs pandas DataFrame. The pandas
        DataFrame given to the function is a batch used internally.

        See also `Transform and apply a function
        <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

        .. note:: the `func` is unable to access the whole input frame. pandas-on-Spark
            internally splits the input frame into multiple batches and calls `func` with each
            batch multiple times. Therefore, operations such as global aggregations are impossible.
            See the example below.

            >>> # This case does not return the length of whole frame but of the batch internally
            ... # used.
            ... def length(pdf) -> ps.DataFrame[int, [int]]:
            ...     return pd.DataFrame([len(pdf)])
            ...
            >>> df = ps.DataFrame({'A': range(1000)})
            >>> df.pandas_on_spark.apply_batch(length)  # doctest: +SKIP
                c0
            0   83
            1   83
            2   83
            ...
            10  83
            11  83

        .. note:: this API executes the function once to infer the type which is
            potentially expensive, for instance, when the dataset is created after
            aggregations or sorting.

            To avoid this, specify return type in ``func``, for instance, as below:

            >>> def plus_one(x) -> ps.DataFrame[int, [float, float]]:
            ...     return x + 1

            If the return type is specified, the output column names become
            `c0, c1, c2 ... cn`. These names are positionally mapped to the returned
            DataFrame in ``func``.

            To specify the column names, you can assign them in a NumPy compound type style
            as below:

            >>> def plus_one(x) -> ps.DataFrame[("index", int), [("a", float), ("b", float)]]:
            ...     return x + 1

            >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
            >>> def plus_one(x) -> ps.DataFrame[
            ...         (pdf.index.name, pdf.index.dtype), zip(pdf.dtypes, pdf.columns)]:
            ...     return x + 1

        Parameters
        ----------
        func : function
            Function to apply to each pandas frame.
        args : tuple
            Positional arguments to pass to `func` in addition to the
            array/series.
        **kwds
            Additional keyword arguments to pass as keywords arguments to
            `func`.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.apply: For row/columnwise operations.
        DataFrame.applymap: For elementwise operations.
        DataFrame.aggregate: Only perform aggregating type operations.
        DataFrame.transform: Only perform transforming type operations.
        Series.pandas_on_spark.transform_batch: Transforms each pandas Series batch.

        Examples
        --------
        >>> df = ps.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
        >>> df
           A  B
        0  1  2
        1  3  4
        2  5  6

        >>> def query_func(pdf) -> ps.DataFrame[int, [int, int]]:
        ...     return pdf.query('A == 1')
        >>> df.pandas_on_spark.apply_batch(query_func)
           c0  c1
        0   1   2

        >>> def query_func(pdf) -> ps.DataFrame[("idx", int), [("A", int), ("B", int)]]:
        ...     return pdf.query('A == 1')
        >>> df.pandas_on_spark.apply_batch(query_func)  # doctest: +NORMALIZE_WHITESPACE
             A  B
        idx
        0    1  2

        You can also omit the type hints so pandas-on-Spark infers the return schema as below:

        >>> df.pandas_on_spark.apply_batch(lambda pdf: pdf.query('A == 1'))
           A  B
        0  1  2

        You can also specify extra arguments.

        >>> def calculation(pdf, y, z) -> ps.DataFrame[int, [int, int]]:
        ...     return pdf ** y + z
        >>> df.pandas_on_spark.apply_batch(calculation, args=(10,), z=20)
                c0        c1
        0       21      1044
        1    59069   1048596
        2  9765645  60466196

        You can also use ``np.ufunc`` and built-in functions as input.

        >>> df.pandas_on_spark.apply_batch(np.add, args=(10,))
            A   B
        0  11  12
        1  13  14
        2  15  16

        >>> (df * -1).pandas_on_spark.apply_batch(abs)
           A  B
        0  1  2
        1  3  4
        2  5  6

        """
        # TODO: codes here partially duplicate `DataFrame.apply`. Can we deduplicate?

        from pyspark.pandas.groupby import GroupBy
        from pyspark.pandas.frame import DataFrame
        from pyspark import pandas as ps

        if not isinstance(func, FunctionType):
            assert callable(func), "the first argument should be a callable function."
            f = func
            func = lambda *args, **kwargs: f(*args, **kwargs)

        spec = inspect.getfullargspec(func)
        return_sig = spec.annotations.get("return", None)
        should_infer_schema = return_sig is None

        original_func = func
        func = lambda o: original_func(o, *args, **kwds)

        self_applied: DataFrame = DataFrame(self._psdf._internal.resolved_copy)

        if should_infer_schema:
            # Here we execute with the first 1000 to get the return type.
            # If the records were less than 1000, it uses pandas API directly for a shortcut.
            log_advice(
                "If the type hints is not specified for `apply_batch`, "
                "it is expensive to infer the data type internally."
            )
            limit = ps.get_option("compute.shortcut_limit")
            pdf = self_applied.head(limit + 1)._to_internal_pandas()
            applied = func(pdf)
            if not isinstance(applied, pd.DataFrame):
                raise ValueError(
                    "The given function should return a frame; however, "
                    "the return type was %s." % type(applied)
                )
            psdf: DataFrame = DataFrame(applied)
            if len(pdf) <= limit:
                return psdf

            index_fields = [field.normalize_spark_type() for field in psdf._internal.index_fields]
            data_fields = [field.normalize_spark_type() for field in psdf._internal.data_fields]

            return_schema = StructType([field.struct_field for field in index_fields + data_fields])

            output_func = GroupBy._make_pandas_df_builder_func(
                self_applied, func, return_schema, retain_index=True
            )
            sdf = self_applied._internal.spark_frame.mapInPandas(
                lambda iterator: map(output_func, iterator), schema=return_schema
            )

            # If schema is inferred, we can restore indexes too.
            internal = psdf._internal.with_new_sdf(
                spark_frame=sdf, index_fields=index_fields, data_fields=data_fields
            )
        else:
            return_type = infer_return_type(original_func)
            is_return_dataframe = isinstance(return_type, DataFrameType)
            if not is_return_dataframe:
                raise TypeError(
                    "The given function should specify a frame as its type "
                    "hints; however, the return type was %s." % return_sig
                )
            index_fields = cast(DataFrameType, return_type).index_fields
            should_retain_index = len(index_fields) > 0
            return_schema = cast(DataFrameType, return_type).spark_type

            output_func = GroupBy._make_pandas_df_builder_func(
                self_applied, func, return_schema, retain_index=should_retain_index
            )
            sdf = self_applied._internal.to_internal_spark_frame.mapInPandas(
                lambda iterator: map(output_func, iterator), schema=return_schema
            )

            index_spark_columns = None
            index_names: Optional[List[Optional[Tuple[Any, ...]]]] = None

            if should_retain_index:
                index_spark_columns = [
                    scol_for(sdf, index_field.struct_field.name) for index_field in index_fields
                ]

                if not any(
                    [
                        SPARK_INDEX_NAME_PATTERN.match(index_field.struct_field.name)
                        for index_field in index_fields
                    ]
                ):
                    index_names = [(index_field.struct_field.name,) for index_field in index_fields]
            internal = InternalFrame(
                spark_frame=sdf,
                index_names=index_names,
                index_spark_columns=index_spark_columns,
                index_fields=index_fields,
                data_fields=cast(DataFrameType, return_type).data_fields,
            )
        return DataFrame(internal)
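
Both branches above ultimately go through `mapInPandas`, where the wrapped function receives an iterator of pandas DataFrame batches and yields pandas DataFrames matching the declared schema. A minimal standalone sketch of that building block (the schema string and function are illustrative):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1, 2), (3, 4), (5, 6)], ["A", "B"])

def plus_one_batches(batches):
    # each element is a pandas DataFrame holding one Arrow batch
    for pdf in batches:
        yield pdf + 1

sdf.mapInPandas(plus_one_batches, schema="A long, B long").show()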
Example #5
    def transform_batch(self, func, *args, **kwargs) -> Union["DataFrame", "Series"]:
        """
        Transform chunks with a function that takes pandas DataFrame and outputs pandas DataFrame.
        The pandas DataFrame given to the function is a batch used internally. The length of
        each input and output should be the same.

        See also `Transform and apply a function
        <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

        .. note:: the `func` is unable to access the whole input frame. pandas-on-Spark
            internally splits the input frame into multiple batches and calls `func` with each
            batch multiple times. Therefore, operations such as global aggregations are impossible.
            See the example below.

            >>> # This case does not return the length of whole frame but of the batch internally
            ... # used.
            ... def length(pdf) -> ps.DataFrame[int]:
            ...     return pd.DataFrame([len(pdf)] * len(pdf))
            ...
            >>> df = ps.DataFrame({'A': range(1000)})
            >>> df.pandas_on_spark.transform_batch(length)  # doctest: +SKIP
                c0
            0   83
            1   83
            2   83
            ...

        .. note:: this API executes the function once to infer the type which is
            potentially expensive, for instance, when the dataset is created after
            aggregations or sorting.

            To avoid this, specify return type in ``func``, for instance, as below:

            >>> def plus_one(x) -> ps.DataFrame[float, float]:
            ...     return x + 1

            If the return type is specified, the output column names become
            `c0, c1, c2 ... cn`. These names are positionally mapped to the returned
            DataFrame in ``func``.

            To specify the column names, you can assign them in a pandas friendly style as below:

            >>> def plus_one(x) -> ps.DataFrame['a': float, 'b': float]:
            ...     return x + 1

            >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
            >>> def plus_one(x) -> ps.DataFrame[zip(pdf.dtypes, pdf.columns)]:
            ...     return x + 1

            When the given function returns DataFrame and has the return type annotated, the
            original index of the DataFrame will be lost and then a default index will be attached
            to the result. Please be careful about configuring the default index. See also
            `Default Index Type
            <https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type>`_.

        Parameters
        ----------
        func : function
            Function to transform each pandas frame.
        *args
            Positional arguments to pass to func.
        **kwargs
            Keyword arguments to pass to func.

        Returns
        -------
        DataFrame or Series

        See Also
        --------
        DataFrame.pandas_on_spark.apply_batch: For row/columnwise operations.
        Series.pandas_on_spark.transform_batch: Transforms each pandas Series batch.

        Examples
        --------
        >>> df = ps.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
        >>> df
           A  B
        0  1  2
        1  3  4
        2  5  6

        >>> def plus_one_func(pdf) -> ps.DataFrame[int, int]:
        ...     return pdf + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)
           c0  c1
        0   2   3
        1   4   5
        2   6   7

        >>> def plus_one_func(pdf) -> ps.DataFrame['A': int, 'B': int]:
        ...     return pdf + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)
           A  B
        0  2  3
        1  4  5
        2  6  7

        >>> def plus_one_func(pdf) -> ps.Series[int]:
        ...     return pdf.B + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)
        0    3
        1    5
        2    7
        dtype: int64

        You can also omit the type hints so pandas-on-Spark infers the return schema as below:

        >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf + 1)
           A  B
        0  2  3
        1  4  5
        2  6  7

        >>> (df * -1).pandas_on_spark.transform_batch(abs)
           A  B
        0  1  2
        1  3  4
        2  5  6

        Note that you should not transform the index. The index information will not change.

        >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf.B + 1)
        0    3
        1    5
        2    7
        Name: B, dtype: int64

        You can also specify extra arguments as below.

        >>> df.pandas_on_spark.transform_batch(lambda pdf, a, b, c: pdf.B + a + b + c, 1, 2, c=3)
        0     8
        1    10
        2    12
        Name: B, dtype: int64
        """
        from pyspark.pandas.groupby import GroupBy
        from pyspark.pandas.frame import DataFrame
        from pyspark.pandas.series import first_series
        from pyspark import pandas as ps

        assert callable(func), "the first argument should be a callable function."
        spec = inspect.getfullargspec(func)
        return_sig = spec.annotations.get("return", None)
        should_infer_schema = return_sig is None
        original_func = func
        func = lambda o: original_func(o, *args, **kwargs)

        names = self._psdf._internal.to_internal_spark_frame.schema.names

        def pandas_concat(series):
            # The input can only be a DataFrame for struct from Spark 3.0.
            # This works around to make the input as a frame. See SPARK-27240
            pdf = pd.concat(series, axis=1)
            pdf.columns = names
            return pdf

        def apply_func(pdf):
            return func(pdf).to_frame()

        def pandas_extract(pdf, name):
            # This is for output to work around a DataFrame for struct
            # from Spark 3.0.  See SPARK-23836
            return pdf[name]

        def pandas_series_func(f):
            ff = f
            return lambda *series: first_series(ff(*series))

        def pandas_frame_func(f, field_name):
            ff = f
            return lambda *series: pandas_extract(ff(pandas_concat(series)), field_name)

        if should_infer_schema:
            # Here we execute with the first 1000 to get the return type.
            # If the records were less than 1000, it uses pandas API directly for a shortcut.
            limit = ps.get_option("compute.shortcut_limit")
            pdf = self._psdf.head(limit + 1)._to_internal_pandas()
            transformed = func(pdf)
            if not isinstance(transformed, (pd.DataFrame, pd.Series)):
                raise ValueError(
                    "The given function should return a frame; however, "
                    "the return type was %s." % type(transformed)
                )
            if len(transformed) != len(pdf):
                raise ValueError("transform_batch cannot produce aggregated results")
            psdf_or_psser = ps.from_pandas(transformed)

            if isinstance(psdf_or_psser, ps.Series):
                psser = cast(ps.Series, psdf_or_psser)

                spark_return_type = force_decimal_precision_scale(
                    as_nullable_spark_type(psser.spark.data_type)
                )
                return_schema = StructType(
                    [StructField(SPARK_DEFAULT_SERIES_NAME, spark_return_type)]
                )
                output_func = GroupBy._make_pandas_df_builder_func(
                    self._psdf, apply_func, return_schema, retain_index=False
                )

                pudf = pandas_udf(returnType=spark_return_type, functionType=PandasUDFType.SCALAR)(
                    pandas_series_func(output_func)
                )
                columns = self._psdf._internal.spark_columns
                # TODO: Index will be lost in this case.
                internal = self._psdf._internal.copy(
                    column_labels=psser._internal.column_labels,
                    data_spark_columns=[
                        pudf(F.struct(*columns)).alias(psser._internal.data_spark_column_names[0])
                    ],
                    data_dtypes=psser._internal.data_dtypes,
                    column_label_names=psser._internal.column_label_names,
                )
                return first_series(DataFrame(internal))
            else:
                psdf = cast(DataFrame, psdf_or_psser)
                if len(pdf) <= limit:
                    # only do the short cut when it returns a frame to avoid
                    # operations on different dataframes in case of series.
                    return psdf

                # Force nullability.
                return_schema = force_decimal_precision_scale(
                    as_nullable_spark_type(psdf._internal.to_internal_spark_frame.schema)
                )

                self_applied = DataFrame(self._psdf._internal.resolved_copy)  # type: DataFrame

                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=True
                )
                columns = self_applied._internal.spark_columns

                pudf = pandas_udf(returnType=return_schema, functionType=PandasUDFType.SCALAR)(
                    output_func
                )
                temp_struct_column = verify_temp_column_name(
                    self_applied._internal.spark_frame, "__temp_struct__"
                )
                applied = pudf(F.struct(*columns)).alias(temp_struct_column)
                sdf = self_applied._internal.spark_frame.select(applied)
                sdf = sdf.selectExpr("%s.*" % temp_struct_column)

                return DataFrame(psdf._internal.with_new_sdf(sdf))
        else:
            return_type = infer_return_type(original_func)
            is_return_series = isinstance(return_type, SeriesType)
            is_return_dataframe = isinstance(return_type, DataFrameType)
            if not is_return_dataframe and not is_return_series:
                raise TypeError(
                    "The given function should specify a frame or series as its type "
                    "hints; however, the return type was %s." % return_sig
                )
            if is_return_series:
                spark_return_type = force_decimal_precision_scale(
                    as_nullable_spark_type(cast(SeriesType, return_type).spark_type)
                )
                return_schema = StructType(
                    [StructField(SPARK_DEFAULT_SERIES_NAME, spark_return_type)]
                )
                output_func = GroupBy._make_pandas_df_builder_func(
                    self._psdf, apply_func, return_schema, retain_index=False
                )

                pudf = pandas_udf(returnType=spark_return_type, functionType=PandasUDFType.SCALAR)(
                    pandas_series_func(output_func)
                )
                columns = self._psdf._internal.spark_columns
                internal = self._psdf._internal.copy(
                    column_labels=[None],
                    data_spark_columns=[pudf(F.struct(*columns)).alias(SPARK_DEFAULT_SERIES_NAME)],
                    data_dtypes=[cast(SeriesType, return_type).dtype],
                    column_label_names=None,
                )
                return first_series(DataFrame(internal))
            else:
                return_schema = cast(DataFrameType, return_type).spark_type

                self_applied = DataFrame(self._psdf._internal.resolved_copy)

                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=False
                )
                columns = self_applied._internal.spark_columns

                pudf = pandas_udf(returnType=return_schema, functionType=PandasUDFType.SCALAR)(
                    output_func
                )
                temp_struct_column = verify_temp_column_name(
                    self_applied._internal.spark_frame, "__temp_struct__"
                )
                applied = pudf(F.struct(*columns)).alias(temp_struct_column)
                sdf = self_applied._internal.spark_frame.select(applied)
                sdf = sdf.selectExpr("%s.*" % temp_struct_column)

                internal = InternalFrame(
                    spark_frame=sdf,
                    index_spark_columns=None,
                    data_dtypes=cast(DataFrameType, return_type).dtypes,
                )
                return DataFrame(internal)
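
This older variant still builds its UDFs in the legacy `PandasUDFType.SCALAR` style (the newer code above uses type-hinted `pandas_udf` and `mapInPandas` instead). A minimal standalone sketch of that legacy style, purely for orientation:

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1,), (2,), (3,)], ["A"])

@pandas_udf("long", PandasUDFType.SCALAR)  # deprecated in Spark 3.x in favour of type hints
def plus_one(s):
    # s is a pandas Series holding one batch of column A
    return s + 1

sdf.select(plus_one("A").alias("A_plus_one")).show()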
Example #6
    def test_infer_schema_from_pandas_instances(self):
        def func() -> pd.Series[int]:
            pass

        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtype, np.int64)
        self.assertEqual(inferred.spark_type, LongType())

        def func() -> pd.Series[np.float]:
            pass

        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtype, np.float64)
        self.assertEqual(inferred.spark_type, DoubleType())

        def func() -> "pd.DataFrame[np.float, str]":
            pass

        expected = StructType([StructField("c0", DoubleType()), StructField("c1", StringType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.float64, np.unicode_])
        self.assertEqual(inferred.spark_type, expected)

        def func() -> "pandas.DataFrame[np.float]":
            pass

        expected = StructType([StructField("c0", DoubleType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.float64])
        self.assertEqual(inferred.spark_type, expected)

        def func() -> "pd.Series[int]":
            pass

        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtype, np.int64)
        self.assertEqual(inferred.spark_type, LongType())

        def func() -> pd.DataFrame[np.float, str]:
            pass

        expected = StructType([StructField("c0", DoubleType()), StructField("c1", StringType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.float64, np.unicode_])
        self.assertEqual(inferred.spark_type, expected)

        def func() -> pd.DataFrame[np.float]:
            pass

        expected = StructType([StructField("c0", DoubleType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.float64])
        self.assertEqual(inferred.spark_type, expected)

        pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})

        def func() -> pd.DataFrame[pdf.dtypes]:  # type: ignore
            pass

        expected = StructType([StructField("c0", LongType()), StructField("c1", LongType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.int64, np.int64])
        self.assertEqual(inferred.spark_type, expected)

        pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical(["a", "b", "c"])})

        def func() -> pd.Series[pdf.b.dtype]:  # type: ignore
            pass

        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtype, CategoricalDtype(categories=["a", "b", "c"]))
        self.assertEqual(inferred.spark_type, LongType())

        def func() -> pd.DataFrame[pdf.dtypes]:  # type: ignore
            pass

        expected = StructType([StructField("c0", LongType()), StructField("c1", LongType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.int64, CategoricalDtype(categories=["a", "b", "c"])])
        self.assertEqual(inferred.spark_type, expected)
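
For the categorical cases above: pandas represents a categorical column as integer codes plus a categories index, which is presumably why the inferred dtype stays CategoricalDtype while the Spark type comes out as LongType(). A quick pandas-only illustration:

import pandas as pd

cat = pd.Categorical(["a", "b", "c"])
print(cat.codes)             # [0 1 2] -- the integer codes backing the categorical
print(cat.categories)        # Index(['a', 'b', 'c'], dtype='object')
print(pd.Series(cat).dtype)  # category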
Example #7
        def try_infer_return_type():
            def f() -> ps.Series[pdf.a.dtype]:  # type: ignore
                pass

            infer_return_type(f)
Example #8
        def try_infer_return_type():
            def f() -> ps.DataFrame[pdf.dtypes]:  # type: ignore
                pass

            infer_return_type(f)
Example #9
        def try_infer_return_type():
            def f() -> 'ps.DataFrame["a" : np.float : 1, "b":str:2]':  # noqa: F405
                pass

            infer_return_type(f)
Example #10
        def try_infer_return_type():
            def f() -> ps.DataFrame[A]:
                pass

            infer_return_type(f)
Example #11
    def apply_batch(self, func, args=(), **kwds) -> "DataFrame":
        """
        Apply a function that takes pandas DataFrame and outputs pandas DataFrame. The pandas
        DataFrame given to the function is a batch used internally.

        See also `Transform and apply a function
        <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

        .. note:: the `func` is unable to access the whole input frame. Koalas internally
            splits the input frame into multiple batches and calls `func` with each batch multiple
            times. Therefore, operations such as global aggregations are impossible. See the example
            below.

            >>> # This case does not return the length of whole frame but of the batch internally
            ... # used.
            ... def length(pdf) -> ps.DataFrame[int]:
            ...     return pd.DataFrame([len(pdf)])
            ...
            >>> df = ps.DataFrame({'A': range(1000)})
            >>> df.koalas.apply_batch(length)  # doctest: +SKIP
                c0
            0   83
            1   83
            2   83
            ...
            10  83
            11  83

        .. note:: this API executes the function once to infer the type which is
            potentially expensive, for instance, when the dataset is created after
            aggregations or sorting.

            To avoid this, specify return type in ``func``, for instance, as below:

            >>> def plus_one(x) -> ps.DataFrame[float, float]:
            ...     return x + 1

            If the return type is specified, the output column names become
            `c0, c1, c2 ... cn`. These names are positionally mapped to the returned
            DataFrame in ``func``.

            To specify the column names, you can assign them in a pandas friendly style as below:

            >>> def plus_one(x) -> ps.DataFrame["a": float, "b": float]:
            ...     return x + 1

            >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
            >>> def plus_one(x) -> ps.DataFrame[zip(pdf.dtypes, pdf.columns)]:
            ...     return x + 1

            When the given function has the return type annotated, the original index of the
            DataFrame will be lost and a default index will be attached to the result DataFrame.
            Please be careful about configuring the default index. See also `Default Index Type
            <https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type>`_.


        Parameters
        ----------
        func : function
            Function to apply to each pandas frame.
        args : tuple
            Positional arguments to pass to `func` in addition to the
            array/series.
        **kwds
            Additional keyword arguments to pass as keywords arguments to
            `func`.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.apply: For row/columnwise operations.
        DataFrame.applymap: For elementwise operations.
        DataFrame.aggregate: Only perform aggregating type operations.
        DataFrame.transform: Only perform transforming type operations.
        Series.koalas.transform_batch: Transforms each pandas Series batch.

        Examples
        --------
        >>> df = ps.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
        >>> df
           A  B
        0  1  2
        1  3  4
        2  5  6

        >>> def query_func(pdf) -> ps.DataFrame[int, int]:
        ...     return pdf.query('A == 1')
        >>> df.koalas.apply_batch(query_func)
           c0  c1
        0   1   2

        >>> def query_func(pdf) -> ps.DataFrame["A": int, "B": int]:
        ...     return pdf.query('A == 1')
        >>> df.koalas.apply_batch(query_func)
           A  B
        0  1  2

        You can also omit the type hints so Koalas infers the return schema as below:

        >>> df.koalas.apply_batch(lambda pdf: pdf.query('A == 1'))
           A  B
        0  1  2

        You can also specify extra arguments.

        >>> def calculation(pdf, y, z) -> ps.DataFrame[int, int]:
        ...     return pdf ** y + z
        >>> df.koalas.apply_batch(calculation, args=(10,), z=20)
                c0        c1
        0       21      1044
        1    59069   1048596
        2  9765645  60466196

        You can also use ``np.ufunc`` and built-in functions as input.

        >>> df.koalas.apply_batch(np.add, args=(10,))
            A   B
        0  11  12
        1  13  14
        2  15  16

        >>> (df * -1).koalas.apply_batch(abs)
           A  B
        0  1  2
        1  3  4
        2  5  6

        """
        # TODO: codes here partially duplicate `DataFrame.apply`. Can we deduplicate?

        from pyspark.pandas.groupby import GroupBy
        from pyspark.pandas.frame import DataFrame
        from pyspark import pandas as ps

        if not isinstance(func, types.FunctionType):
            assert callable(func), "the first argument should be a callable function."
            f = func
            func = lambda *args, **kwargs: f(*args, **kwargs)

        spec = inspect.getfullargspec(func)
        return_sig = spec.annotations.get("return", None)
        should_infer_schema = return_sig is None
        should_use_map_in_pandas = LooseVersion(pyspark.__version__) >= "3.0"

        original_func = func
        func = lambda o: original_func(o, *args, **kwds)

        self_applied = DataFrame(self._kdf._internal.resolved_copy)  # type: DataFrame

        if should_infer_schema:
            # Here we execute with the first 1000 to get the return type.
            # If the records were less than 1000, it uses pandas API directly for a shortcut.
            limit = ps.get_option("compute.shortcut_limit")
            pdf = self_applied.head(limit + 1)._to_internal_pandas()
            applied = func(pdf)
            if not isinstance(applied, pd.DataFrame):
                raise ValueError(
                    "The given function should return a frame; however, "
                    "the return type was %s." % type(applied))
            kdf = ps.DataFrame(applied)  # type: DataFrame
            if len(pdf) <= limit:
                return kdf

            return_schema = force_decimal_precision_scale(
                as_nullable_spark_type(
                    kdf._internal.to_internal_spark_frame.schema))
            if should_use_map_in_pandas:
                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=True)
                sdf = self_applied._internal.to_internal_spark_frame.mapInPandas(
                    lambda iterator: map(output_func, iterator),
                    schema=return_schema)
            else:
                sdf = GroupBy._spark_group_map_apply(
                    self_applied,
                    func, (F.spark_partition_id(), ),
                    return_schema,
                    retain_index=True)

            # If schema is inferred, we can restore indexes too.
            internal = kdf._internal.with_new_sdf(sdf)
        else:
            return_type = infer_return_type(original_func)
            is_return_dataframe = isinstance(return_type, DataFrameType)
            if not is_return_dataframe:
                raise TypeError(
                    "The given function should specify a frame as its type "
                    "hints; however, the return type was %s." % return_sig)
            return_schema = cast(DataFrameType, return_type).spark_type

            if should_use_map_in_pandas:
                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=False)
                sdf = self_applied._internal.to_internal_spark_frame.mapInPandas(
                    lambda iterator: map(output_func, iterator),
                    schema=return_schema)
            else:
                sdf = GroupBy._spark_group_map_apply(
                    self_applied,
                    func, (F.spark_partition_id(), ),
                    return_schema,
                    retain_index=False)

            # Otherwise, it loses index.
            internal = InternalFrame(
                spark_frame=sdf,
                index_spark_columns=None,
                data_dtypes=cast(DataFrameType, return_type).dtypes,
            )

        return DataFrame(internal)
Example #12
        def try_infer_return_type():
            def f() -> pd.Series[pdf.a.dtype]:  # type: ignore[name-defined]
                pass

            infer_return_type(f)
Example #13
        def try_infer_return_type():
            def f() -> pd.DataFrame[pdf.dtypes]:  # type: ignore[name-defined]
                pass

            infer_return_type(f)
Example #14
        def try_infer_return_type():
            def f() -> None:
                pass

            infer_return_type(f)
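
The last few snippets (`try_infer_return_type`) are shown without their surrounding assertions; presumably the enclosing tests call them under assertRaises-style checks, with the referenced `pdf` and `A` defined nearby. A hypothetical, self-contained way to probe the same behaviour without assuming the exact exception type:

from pyspark.pandas.typedef import infer_return_type

def f() -> None:  # taken from the snippet directly above
    pass

def try_infer_return_type():
    infer_return_type(f)

try:
    try_infer_return_type()
except Exception as exc:  # the original suite presumably asserts a specific error here
    print(type(exc).__name__, exc)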