def test_type_annotation_scalar_iter(self):
        """Iterator-style pandas annotations are inferred as ``SCALAR_ITER``."""

        def inferred(udf):
            # Inference takes the signature together with the resolved hints.
            return infer_eval_type(signature(udf), get_type_hints(udf))

        # Iterator of a single series in, iterator of series out.
        def func(iter: Iterator[pd.Series]) -> Iterator[pd.Series]:
            pass

        self.assertEqual(inferred(func), PandasUDFType.SCALAR_ITER)

        # Iterator of a fixed-size tuple mixing frame and series.
        def func(
            iter: Iterator[Tuple[pd.DataFrame, pd.Series]]
        ) -> Iterator[pd.DataFrame]:
            pass

        self.assertEqual(inferred(func), PandasUDFType.SCALAR_ITER)

        # Iterator of a variable-length tuple of frames.
        def func(
            iter: Iterator[Tuple[pd.DataFrame, ...]]
        ) -> Iterator[pd.Series]:
            pass

        self.assertEqual(inferred(func), PandasUDFType.SCALAR_ITER)

        # Iterator of a variable-length tuple whose items are a Union.
        def func(
            iter: Iterator[Tuple[Union[pd.DataFrame, pd.Series], ...]]
        ) -> Iterator[pd.Series]:
            pass

        self.assertEqual(inferred(func), PandasUDFType.SCALAR_ITER)
Exemple #2
0
    def test_type_annotation_scalar_iter(self):
        """Iterator-style pandas annotations are inferred as ``SCALAR_ITER``.

        Each definition is compiled from a source string into ``self.local``
        and the resulting ``func`` is inspected right away.
        """
        snippets = (
            # Iterator of a single series.
            "from typing import Iterator\n"
            "def func(iter: Iterator[pd.Series]) -> Iterator[pd.Series]: pass",

            # Iterator of a fixed-size tuple.
            "from typing import Iterator, Tuple\n"
            "def func(iter: Iterator[Tuple[pd.DataFrame, pd.Series]]) -> Iterator[pd.DataFrame]:\n"
            "    pass",

            # Iterator of a variable-length tuple.
            "from typing import Iterator, Tuple\n"
            "def func(iter: Iterator[Tuple[pd.DataFrame, ...]]) -> Iterator[pd.Series]: pass",

            # Iterator of a variable-length tuple of Union items.
            "from typing import Iterator, Tuple, Union\n"
            "def func(iter: Iterator[Tuple[Union[pd.DataFrame, pd.Series], ...]])"
            " -> Iterator[pd.Series]: pass",
        )

        for code in snippets:
            exec(code, self.local)
            self.assertEqual(
                infer_eval_type(inspect.signature(self.local['func'])),
                PandasUDFType.SCALAR_ITER)
def _create_pandas_udf(f, returnType, evalType):
    """Resolve the evaluation type for ``f``, validate its arity, and build the UDF.

    On Python 3.6+:

    * a scalar, scalar-iterator or grouped-aggregate ``evalType`` emits a
      ``UserWarning`` recommending type hints instead;
    * a grouped-map, map-iterator or cogrouped-map ``evalType`` is kept as is;
    * otherwise, when ``f`` has annotations, the type is inferred with
      ``infer_eval_type``.

    If ``evalType`` is still ``None`` afterwards it defaults to
    ``PythonEvalType.SQL_SCALAR_PANDAS_UDF``.

    :param f: the user function to wrap.
    :param returnType: passed through unchanged to ``_create_udf``.
    :param evalType: a ``PythonEvalType`` value, or ``None`` to infer/default.
    :return: whatever ``_create_udf(f, returnType, evalType)`` returns.
    :raises ValueError: if a scalar / scalar-iterator function takes no
        arguments and no ``*args``, if a grouped-map function does not take
        1 or 2 positional arguments, or if a cogrouped-map function does not
        take 2 or 3 positional arguments.
    """
    argspec = _get_argspec(f)

    # pandas UDF by type hints.
    if sys.version_info >= (3, 6):
        from inspect import signature

        # Types for which type hints are the preferred way to declare the UDF.
        hint_preferred = (
            PythonEvalType.SQL_SCALAR_PANDAS_UDF,
            PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
            PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
        )
        # In case of 'SQL_GROUPED_MAP_PANDAS_UDF', deprecation warning is being
        # triggered at `apply` instead.
        # In case of 'SQL_MAP_PANDAS_ITER_UDF' and 'SQL_COGROUPED_MAP_PANDAS_UDF',
        # the evaluation type will always be set.
        kept_as_given = (
            PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
            PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
            PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF,
        )

        if evalType in hint_preferred:
            warnings.warn(
                "In Python 3.6+ and Spark 3.0+, it is preferred to specify type hints for "
                "pandas UDF instead of specifying pandas UDF type which will be deprecated "
                "in the future releases. See SPARK-28264 for more details.",
                UserWarning)
        elif evalType not in kept_as_given and len(argspec.annotations) > 0:
            evalType = infer_eval_type(signature(f))
            assert evalType is not None

    if evalType is None:
        # Set default is scalar UDF.
        evalType = PythonEvalType.SQL_SCALAR_PANDAS_UDF

    scalar_like = (
        PythonEvalType.SQL_SCALAR_PANDAS_UDF,
        PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
    )
    if evalType in scalar_like:
        # Scalar variants need at least one argument, positional or *args.
        if len(argspec.args) == 0 and argspec.varargs is None:
            raise ValueError(
                "Invalid function: 0-arg pandas_udfs are not supported. "
                "Instead, create a 1-arg pandas_udf and ignore the arg in your function."
            )

    if evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        if len(argspec.args) not in (1, 2):
            raise ValueError(
                "Invalid function: pandas_udf with function type GROUPED_MAP or "
                "the function in groupby.applyInPandas "
                "must take either one argument (data) or two arguments (key, data)."
            )

    if evalType == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
        if len(argspec.args) not in (2, 3):
            raise ValueError(
                "Invalid function: the function in cogroup.applyInPandas "
                "must take either two arguments (left, right) "
                "or three arguments (key, left, right).")

    return _create_udf(f, returnType, evalType)
Exemple #4
0
    def test_type_annotation_group_agg(self):
        """Pandas inputs with a non-pandas return hint are inferred as ``GROUPED_AGG``."""

        def expect_grouped_agg(udf):
            self.assertEqual(
                infer_eval_type(inspect.signature(udf)),
                PandasUDFType.GROUPED_AGG)

        # Single series argument.
        def func(col: pd.Series) -> str:
            pass

        expect_grouped_agg(func)

        # Mixed frame and series arguments.
        def func(col: pd.DataFrame, col1: pd.Series) -> int:
            pass

        expect_grouped_agg(func)

        # Variadic positional arguments.
        def func(col: pd.DataFrame, *args: pd.Series) -> Row:
            pass

        expect_grouped_agg(func)

        # Variadic positional and keyword arguments.
        def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame) -> str:
            pass

        expect_grouped_agg(func)

        # Keyword-only argument.
        def func(col: pd.Series, *, col2: pd.DataFrame) -> float:
            pass

        expect_grouped_agg(func)

        # Union-typed argument alongside a keyword-only argument.
        def func(col: Union[pd.Series, pd.DataFrame], *, col2: pd.DataFrame) -> float:
            pass

        expect_grouped_agg(func)
Exemple #5
0
    def test_type_annotation_scalar(self):
        """Pandas inputs with a pandas return hint are inferred as ``SCALAR``."""

        def expect_scalar(udf):
            self.assertEqual(
                infer_eval_type(inspect.signature(udf)),
                PandasUDFType.SCALAR)

        # Single series argument.
        def func(col: pd.Series) -> pd.Series:
            pass

        expect_scalar(func)

        # Mixed frame and series arguments.
        def func(col: pd.DataFrame, col1: pd.Series) -> pd.DataFrame:
            pass

        expect_scalar(func)

        # Variadic positional arguments.
        def func(col: pd.DataFrame, *args: pd.Series) -> pd.Series:
            pass

        expect_scalar(func)

        # Variadic positional and keyword arguments.
        def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame) -> pd.Series:
            pass

        expect_scalar(func)

        # Keyword-only argument.
        def func(col: pd.Series, *, col2: pd.DataFrame) -> pd.DataFrame:
            pass

        expect_scalar(func)

        # Union-typed argument alongside a keyword-only argument.
        def func(col: Union[pd.Series, pd.DataFrame], *, col2: pd.DataFrame) -> pd.Series:
            pass

        expect_scalar(func)
    def test_string_type_annotation(self):
        """String (forward-reference) annotations are inferred as ``SCALAR``."""

        def expect_scalar(udf):
            # The hints are resolved with ``get_type_hints`` before inference.
            self.assertEqual(
                infer_eval_type(signature(udf), get_type_hints(udf)),
                PandasUDFType.SCALAR)

        # Single series argument.
        def func(col: "pd.Series") -> "pd.Series":
            pass

        expect_scalar(func)

        # Mixed frame and series arguments.
        def func(col: "pd.DataFrame", col1: "pd.Series") -> "pd.DataFrame":
            pass

        expect_scalar(func)

        # Variadic positional arguments.
        def func(col: "pd.DataFrame", *args: "pd.Series") -> "pd.Series":
            pass

        expect_scalar(func)

        # Variadic positional and keyword arguments.
        def func(col: "pd.Series", *args: "pd.Series",
                 **kwargs: "pd.DataFrame") -> "pd.Series":
            pass

        expect_scalar(func)

        # Keyword-only argument.
        def func(col: "pd.Series", *, col2: "pd.DataFrame") -> "pd.DataFrame":
            pass

        expect_scalar(func)

        # Union whose members are individually quoted.
        def func(col: Union["pd.Series", "pd.DataFrame"], *,
                 col2: "pd.DataFrame") -> "pd.Series":
            pass

        expect_scalar(func)

        # Union quoted as a whole.
        def func(col: "Union[pd.Series, pd.DataFrame]", *,
                 col2: "pd.DataFrame") -> "pd.Series":
            pass

        expect_scalar(func)
Exemple #7
0
    def test_type_annotation_scalar(self):
        """Pandas inputs with a pandas return hint are inferred as ``SCALAR``.

        Each definition is compiled from a source string into ``self.local``
        and the resulting ``func`` is inspected right away.
        """
        snippets = (
            # Single series argument.
            "def func(col: pd.Series) -> pd.Series: pass",

            # Mixed frame and series arguments.
            "def func(col: pd.DataFrame, col1: pd.Series) -> pd.DataFrame: pass",

            # Variadic positional arguments.
            "def func(col: pd.DataFrame, *args: pd.Series) -> pd.Series: pass",

            # Variadic positional and keyword arguments.
            "def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame) -> pd.Series:\n"
            "    pass",

            # Keyword-only argument.
            "def func(col: pd.Series, *, col2: pd.DataFrame) -> pd.DataFrame:\n"
            "    pass",

            # Union-typed argument alongside a keyword-only argument.
            "from typing import Union\n"
            "def func(col: Union[pd.Series, pd.DataFrame], *, col2: pd.DataFrame) -> pd.Series:\n"
            "    pass",
        )

        for code in snippets:
            exec(code, self.local)
            self.assertEqual(
                infer_eval_type(inspect.signature(self.local['func'])),
                PandasUDFType.SCALAR)
Exemple #8
0
    def test_type_annotation_group_agg(self):
        """Pandas inputs with a non-pandas return hint are inferred as ``GROUPED_AGG``.

        Each definition is compiled from a source string into ``self.local``
        and the resulting ``func`` is inspected right away.
        """
        snippets = (
            # Single series argument.
            "def func(col: pd.Series) -> str: pass",

            # Mixed frame and series arguments.
            "def func(col: pd.DataFrame, col1: pd.Series) -> int: pass",

            # Variadic positional arguments, returning a Row.
            "from pyspark.sql import Row\n"
            "def func(col: pd.DataFrame, *args: pd.Series) -> Row: pass",

            # Variadic positional and keyword arguments.
            "def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame) -> str:\n"
            "    pass",

            # Keyword-only argument.
            "def func(col: pd.Series, *, col2: pd.DataFrame) -> float:\n"
            "    pass",

            # Union-typed argument alongside a keyword-only argument.
            "from typing import Union\n"
            "def func(col: Union[pd.Series, pd.DataFrame], *, col2: pd.DataFrame) -> float:\n"
            "    pass",
        )

        for code in snippets:
            exec(code, self.local)
            self.assertEqual(
                infer_eval_type(inspect.signature(self.local['func'])),
                PandasUDFType.GROUPED_AGG)