Example 1
def _create_udf(f, returnType, evalType):

    if evalType in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                    PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
                    PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                    PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF,
                    PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                    PythonEvalType.SQL_MAP_PANDAS_ITER_UDF):

        from pyspark.sql.pandas.utils import require_minimum_pyarrow_version
        require_minimum_pyarrow_version()

        argspec = _get_argspec(f)

        if (evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF or
                evalType == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF) and \
                len(argspec.args) == 0 and \
                argspec.varargs is None:
            raise ValueError(
                "Invalid function: 0-arg pandas_udfs are not supported. "
                "Instead, create a 1-arg pandas_udf and ignore the arg in your function."
            )

        if evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF \
                and len(argspec.args) not in (1, 2):
            raise ValueError(
                "Invalid function: pandas_udfs with function type GROUPED_MAP "
                "must take either one argument (data) or two arguments (key, data)."
            )

        if evalType == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF \
                and len(argspec.args) not in (2, 3):
            raise ValueError(
                "Invalid function: pandas_udfs with function type COGROUPED_MAP "
                "must take either two arguments (left, right) "
                "or three arguments (key, left, right).")

    # Set the name of the UserDefinedFunction object to be the name of function f
    udf_obj = UserDefinedFunction(f,
                                  returnType=returnType,
                                  name=None,
                                  evalType=evalType,
                                  deterministic=True)
    return udf_obj._wrapped()
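
`_create_udf` is not called by users directly; it is reached through `pandas_udf` (see Example 5), which resolves the return and evaluation types before delegating here. A minimal usage sketch under the assumption that an active `SparkSession` named `spark` exists and pandas/PyArrow are installed:

import pandas as pd
from pyspark.sql.functions import pandas_udf

# The decorator path ends up in _create_udf, which enforces the minimum PyArrow
# version, validates the argument count for the eval type, and returns the
# wrapped UserDefinedFunction.
@pandas_udf("long")
def plus_one(s: pd.Series) -> pd.Series:
    return s + 1

df = spark.range(3)                 # assumption: `spark` is an existing SparkSession
df.select(plus_one("id")).show()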
Example 2
    def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
        """
        Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
        to Arrow data, then sending it to the JVM to parallelize. If a schema is passed in, its
        data types are used to coerce the data during the Pandas-to-Arrow conversion.
        """
        from pyspark.sql import SparkSession
        from pyspark.sql.dataframe import DataFrame

        assert isinstance(self, SparkSession)

        from pyspark.sql.pandas.serializers import ArrowStreamPandasSerializer
        from pyspark.sql.types import TimestampType
        from pyspark.sql.pandas.types import from_arrow_type, to_arrow_type
        from pyspark.sql.pandas.utils import require_minimum_pandas_version, \
            require_minimum_pyarrow_version

        require_minimum_pandas_version()
        require_minimum_pyarrow_version()

        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
        import pyarrow as pa

        # Create the Spark schema from list of names passed in with Arrow types
        if isinstance(schema, (list, tuple)):
            arrow_schema = pa.Schema.from_pandas(pdf, preserve_index=False)
            struct = StructType()
            prefer_timestamp_ntz = is_timestamp_ntz_preferred()
            for name, field in zip(schema, arrow_schema):
                struct.add(name,
                           from_arrow_type(field.type, prefer_timestamp_ntz),
                           nullable=field.nullable)
            schema = struct

        # Determine arrow types to coerce data when creating batches
        if isinstance(schema, StructType):
            arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
        elif isinstance(schema, DataType):
            raise ValueError(
                "Single data type %s is not supported with Arrow" %
                str(schema))
        else:
            # Any timestamps must be coerced to be compatible with Spark
            arrow_types = [
                to_arrow_type(TimestampType())
                if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                for t in pdf.dtypes
            ]

        # Slice the DataFrame to be batched
        step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
        pdf_slices = (pdf.iloc[start:start + step]
                      for start in range(0, len(pdf), step))

        # Create list of Arrow (columns, type) for serializer dump_stream
        arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]
                      for pdf_slice in pdf_slices]

        jsqlContext = self._wrapped._jsqlContext

        safecheck = self._wrapped._conf.arrowSafeTypeConversion()
        col_by_name = True  # col by name only applies to StructType columns, can't happen here
        ser = ArrowStreamPandasSerializer(timezone, safecheck, col_by_name)

        def reader_func(temp_filename):
            return self._jvm.PythonSQLUtils.readArrowStreamFromFile(
                jsqlContext, temp_filename)

        def create_RDD_server():
            return self._jvm.ArrowRDDServer(jsqlContext)

        # Create Spark DataFrame from Arrow stream file, using one batch per partition
        jrdd = self._sc._serialize_to_jvm(arrow_data, ser, reader_func,
                                          create_RDD_server)
        jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(),
                                                   jsqlContext)
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df
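
This helper is the Arrow fast path behind `SparkSession.createDataFrame` when it receives a `pandas.DataFrame`. A minimal sketch of triggering it, assuming an existing `SparkSession` named `spark` with pandas and PyArrow available:

import pandas as pd

# Assumption: `spark` is an existing SparkSession.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

pdf = pd.DataFrame({"id": [1, 2, 3], "v": [0.1, 0.2, 0.3]})
sdf = spark.createDataFrame(pdf)  # dispatches to _create_from_pandas_with_arrow
sdf.show()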
Example 3
    def toPandas(self):
        """
        Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``.

        This is only available if Pandas is installed and available.

        .. versionadded:: 1.3.0

        Notes
        -----
        This method should only be used if the resulting pandas :class:`DataFrame` is
        expected to be small, as all the data is loaded into the driver's memory.

        Usage with spark.sql.execution.arrow.pyspark.enabled=True is experimental.

        Examples
        --------
        >>> df.toPandas()  # doctest: +SKIP
           age   name
        0    2  Alice
        1    5    Bob
        """
        from pyspark.sql.dataframe import DataFrame

        assert isinstance(self, DataFrame)

        from pyspark.sql.pandas.utils import require_minimum_pandas_version
        require_minimum_pandas_version()

        import numpy as np
        import pandas as pd

        timezone = self.sql_ctx._conf.sessionLocalTimeZone()

        if self.sql_ctx._conf.arrowPySparkEnabled():
            use_arrow = True
            try:
                from pyspark.sql.pandas.types import to_arrow_schema
                from pyspark.sql.pandas.utils import require_minimum_pyarrow_version

                require_minimum_pyarrow_version()
                to_arrow_schema(self.schema)
            except Exception as e:

                if self.sql_ctx._conf.arrowPySparkFallbackEnabled():
                    msg = (
                        "toPandas attempted Arrow optimization because "
                        "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, "
                        "failed by the reason below:\n  %s\n"
                        "Attempting non-optimization as "
                        "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "
                        "true." % str(e))
                    warnings.warn(msg)
                    use_arrow = False
                else:
                    msg = (
                        "toPandas attempted Arrow optimization because "
                        "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
                        "reached the error below and will not continue because automatic fallback "
                        "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
                        "false.\n  %s" % str(e))
                    warnings.warn(msg)
                    raise

            # Try to use Arrow optimization when the schema is supported and the required version
            # of PyArrow is found, if 'spark.sql.execution.arrow.pyspark.enabled' is enabled.
            if use_arrow:
                try:
                    from pyspark.sql.pandas.types import _check_series_localize_timestamps, \
                        _convert_map_items_to_dict
                    import pyarrow
                    # Rename columns to avoid duplicated column names.
                    tmp_column_names = [
                        'col_{}'.format(i) for i in range(len(self.columns))
                    ]
                    self_destruct = self.sql_ctx._conf.arrowPySparkSelfDestructEnabled()
                    batches = self.toDF(*tmp_column_names)._collect_as_arrow(
                        split_batches=self_destruct)
                    if len(batches) > 0:
                        table = pyarrow.Table.from_batches(batches)
                        # Ensure only the table has a reference to the batches, so that
                        # self_destruct (if enabled) is effective
                        del batches
                        # Pandas DataFrame created from PyArrow uses datetime64[ns] for date type
                        # values, but we should use datetime.date to match the behavior with when
                        # Arrow optimization is disabled.
                        pandas_options = {'date_as_object': True}
                        if self_destruct:
                            # Configure PyArrow to use as little memory as possible:
                            # self_destruct - free columns as they are converted
                            # split_blocks - create a separate Pandas block for each column
                            # use_threads - convert one column at a time
                            pandas_options.update({
                                'self_destruct': True,
                                'split_blocks': True,
                                'use_threads': False,
                            })
                        pdf = table.to_pandas(**pandas_options)
                        # Rename back to the original column names.
                        pdf.columns = self.columns
                        for field in self.schema:
                            if isinstance(field.dataType, TimestampType):
                                pdf[field.name] = \
                                    _check_series_localize_timestamps(pdf[field.name], timezone)
                            elif isinstance(field.dataType, MapType):
                                pdf[field.name] = \
                                    _convert_map_items_to_dict(pdf[field.name])
                        return pdf
                    else:
                        return pd.DataFrame.from_records([],
                                                         columns=self.columns)
                except Exception as e:
                    # We might have to allow fallback here as well but multiple Spark jobs can
                    # be executed. So, simply fail in this case for now.
                    msg = (
                        "toPandas attempted Arrow optimization because "
                        "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
                        "reached the error below and can not continue. Note that "
                        "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an "
                        "effect on failures in the middle of "
                        "computation.\n  %s" % str(e))
                    warnings.warn(msg)
                    raise

        # Below is toPandas without Arrow optimization.
        pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
        column_counter = Counter(self.columns)

        dtype = [None] * len(self.schema)
        for fieldIdx, field in enumerate(self.schema):
            # For duplicate column name, we use `iloc` to access it.
            if column_counter[field.name] > 1:
                pandas_col = pdf.iloc[:, fieldIdx]
            else:
                pandas_col = pdf[field.name]

            pandas_type = PandasConversionMixin._to_corrected_pandas_type(
                field.dataType)
            # SPARK-21766: if an integer field is nullable and has null values, it can be
            # inferred by pandas as float column. Once we convert the column with NaN back
            # to integer type e.g., np.int16, we will hit exception. So we use the inferred
            # float type, not the corrected type from the schema in this case.
            if pandas_type is not None and \
                not(isinstance(field.dataType, IntegralType) and field.nullable and
                    pandas_col.isnull().any()):
                dtype[fieldIdx] = pandas_type
            # Ensure we fall back to nullable numpy types, even when whole column is null:
            if isinstance(field.dataType,
                          IntegralType) and pandas_col.isnull().any():
                dtype[fieldIdx] = np.float64
            if isinstance(field.dataType,
                          BooleanType) and pandas_col.isnull().any():
                dtype[fieldIdx] = np.object

        df = pd.DataFrame()
        for index, t in enumerate(dtype):
            column_name = self.schema[index].name

            # For duplicate column name, we use `iloc` to access it.
            if column_counter[column_name] > 1:
                series = pdf.iloc[:, index]
            else:
                series = pdf[column_name]

            if t is not None:
                series = series.astype(t, copy=False)

            # `insert` API makes copy of data, we only do it for Series of duplicate column names.
            # `pdf.iloc[:, index] = pdf.iloc[:, index]...` doesn't always work because `iloc` could
            # return a view or a copy depending by context.
            if column_counter[column_name] > 1:
                df.insert(index, column_name, series, allow_duplicates=True)
            else:
                df[column_name] = series

        pdf = df

        if timezone is None:
            return pdf
        else:
            from pyspark.sql.pandas.types import _check_series_convert_timestamps_local_tz
            for field in self.schema:
                # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
                if isinstance(field.dataType, TimestampType):
                    pdf[field.name] = \
                        _check_series_convert_timestamps_local_tz(pdf[field.name], timezone)
            return pdf
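
A minimal sketch exercising both branches of `toPandas`, assuming an existing `SparkSession` named `spark`; the configuration keys are the ones referenced in the warning messages above:

# Assumption: `spark` is an existing SparkSession with pandas installed.
df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ("age", "name"))

spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
pdf_arrow = df.toPandas()   # Arrow path; falls back only if fallback.enabled is true

spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")
pdf_plain = df.toPandas()   # plain collect() path with the dtype corrections above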
Example 4
"""
.. versionadded:: 3.2.0
    pandas API on Spark
"""

import os
import sys
from distutils.version import LooseVersion
import warnings

from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version

try:
    require_minimum_pandas_version()
    require_minimum_pyarrow_version()
except ImportError as e:
    if os.environ.get("SPARK_TESTING"):
        warnings.warn(str(e))
        sys.exit()
    else:
        raise

import pyarrow

if (LooseVersion(pyarrow.__version__) >= LooseVersion("2.0.0")
        and "PYARROW_IGNORE_TIMEZONE" not in os.environ):
    import logging

    logging.warning(
        "'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to "
Example 5
def pandas_udf(f=None, returnType=None, functionType=None):
    """
    Creates a pandas user defined function (a.k.a. vectorized user defined function).

    Pandas UDFs are user defined functions that are executed by Spark using Arrow to transfer
    data and Pandas to work with the data, which allows vectorized operations. A Pandas UDF
    is defined using `pandas_udf` as a decorator or to wrap the function, and no
    additional configuration is required. A Pandas UDF generally behaves like a regular
    PySpark function API.

    .. versionadded:: 2.3.0

    Parameters
    ----------
    f : function, optional
        user-defined function. A python function if used as a standalone function
    returnType : :class:`pyspark.sql.types.DataType` or str, optional
        the return type of the user-defined function. The value can be either a
        :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.
    functionType : int, optional
        an enum value in :class:`pyspark.sql.functions.PandasUDFType`.
        Default: SCALAR. This parameter exists for compatibility.
        Using Python type hints is encouraged.

    Examples
    --------
    In order to use this API, customarily the below are imported:

    >>> import pandas as pd
    >>> from pyspark.sql.functions import pandas_udf

    From Spark 3.0 with Python 3.6+, `Python type hints <https://www.python.org/dev/peps/pep-0484>`_
    detect the function types as below:

    >>> @pandas_udf(IntegerType())
    ... def slen(s: pd.Series) -> pd.Series:
    ...     return s.str.len()

    Prior to Spark 3.0, the pandas UDF used `functionType` to decide the execution type as below:

    >>> from pyspark.sql.functions import PandasUDFType
    >>> from pyspark.sql.types import IntegerType
    >>> @pandas_udf(IntegerType(), PandasUDFType.SCALAR)
    ... def slen(s):
    ...     return s.str.len()

    It is preferred to specify type hints for the pandas UDF instead of specifying pandas UDF
    type via `functionType`, which will be deprecated in future releases.

    Note that the type hint should use `pandas.Series` in all cases but one variant:
    `pandas.DataFrame` should be used as the input or output type hint instead when the input
    or output column is of :class:`pyspark.sql.types.StructType`. The following example shows
    a Pandas UDF which takes a long column, a string column and a struct column, and outputs a
    struct column. It requires the function to specify the type hints of `pandas.Series` and
    `pandas.DataFrame` as below:

    >>> @pandas_udf("col1 string, col2 long")
    ... def func(s1: pd.Series, s2: pd.Series, s3: pd.DataFrame) -> pd.DataFrame:
    ...     s3['col2'] = s1 + s2.str.len()
    ...     return s3
    ...
    >>> # Create a Spark DataFrame that has three columns including a struct column.
    ... df = spark.createDataFrame(
    ...     [[1, "a string", ("a nested string",)]],
    ...     "long_col long, string_col string, struct_col struct<col1:string>")
    >>> df.printSchema()
    root
    |-- long_col: long (nullable = true)
    |-- string_col: string (nullable = true)
    |-- struct_col: struct (nullable = true)
    |    |-- col1: string (nullable = true)
    >>> df.select(func("long_col", "string_col", "struct_col")).printSchema()
    root
    |-- func(long_col, string_col, struct_col): struct (nullable = true)
    |    |-- col1: string (nullable = true)
    |    |-- col2: long (nullable = true)

    The following sections describe the combinations of the supported type hints. For
    simplicity, the `pandas.DataFrame` variant is omitted.

    * Series to Series
        `pandas.Series`, ... -> `pandas.Series`

        The function takes one or more `pandas.Series` and outputs one `pandas.Series`.
        The output of the function should always be of the same length as the input.

        >>> @pandas_udf("string")
        ... def to_upper(s: pd.Series) -> pd.Series:
        ...     return s.str.upper()
        ...
        >>> df = spark.createDataFrame([("John Doe",)], ("name",))
        >>> df.select(to_upper("name")).show()
        +--------------+
        |to_upper(name)|
        +--------------+
        |      JOHN DOE|
        +--------------+

        >>> @pandas_udf("first string, last string")
        ... def split_expand(s: pd.Series) -> pd.DataFrame:
        ...     return s.str.split(expand=True)
        ...
        >>> df = spark.createDataFrame([("John Doe",)], ("name",))
        >>> df.select(split_expand("name")).show()
        +------------------+
        |split_expand(name)|
        +------------------+
        |       [John, Doe]|
        +------------------+

        .. note:: The length of the input is not that of the whole input column, but is the
            length of an internal batch used for each call to the function.

    * Iterator of Series to Iterator of Series
        `Iterator[pandas.Series]` -> `Iterator[pandas.Series]`

        The function takes an iterator of `pandas.Series` and outputs an iterator of
        `pandas.Series`. In this case, the created pandas UDF instance requires one input
        column when this is called as a PySpark column. The length of the entire output from
        the function should be the same as the length of the entire input; therefore, it can
        prefetch the data from the input iterator as long as the lengths are the same.

        It is also useful when the UDF execution requires initializing some state, although
        internally it works identically to the Series to Series case. The pseudocode below
        illustrates the example.

        .. highlight:: python
        .. code-block:: python

            @pandas_udf("long")
            def calculate(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
                # Do some expensive initialization with a state
                state = very_expensive_initialization()
                for x in iterator:
                    # Use that state for whole iterator.
                    yield calculate_with_state(x, state)

            df.select(calculate("value")).show()

        >>> from typing import Iterator
        >>> @pandas_udf("long")
        ... def plus_one(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
        ...     for s in iterator:
        ...         yield s + 1
        ...
        >>> df = spark.createDataFrame(pd.DataFrame([1, 2, 3], columns=["v"]))
        >>> df.select(plus_one(df.v)).show()
        +-----------+
        |plus_one(v)|
        +-----------+
        |          2|
        |          3|
        |          4|
        +-----------+

        .. note:: The length of each series is the length of a batch internally used.

    * Iterator of Multiple Series to Iterator of Series
        `Iterator[Tuple[pandas.Series, ...]]` -> `Iterator[pandas.Series]`

        The function takes an iterator of tuples of multiple `pandas.Series` and outputs an
        iterator of `pandas.Series`. In this case, the created pandas UDF instance requires
        as many input columns as there are series when this is called as a PySpark column.
        Otherwise, it has the same characteristics and restrictions as the Iterator of Series
        to Iterator of Series case.

        >>> from typing import Iterator, Tuple
        >>> from pyspark.sql.functions import struct, col
        >>> @pandas_udf("long")
        ... def multiply(iterator: Iterator[Tuple[pd.Series, pd.DataFrame]]) -> Iterator[pd.Series]:
        ...     for s1, df in iterator:
        ...         yield s1 * df.v
        ...
        >>> df = spark.createDataFrame(pd.DataFrame([1, 2, 3], columns=["v"]))
        >>> df.withColumn('output', multiply(col("v"), struct(col("v")))).show()
        +---+------+
        |  v|output|
        +---+------+
        |  1|     1|
        |  2|     4|
        |  3|     9|
        +---+------+

        .. note:: The length of each series is the length of a batch internally used.

    * Series to Scalar
        `pandas.Series`, ... -> `Any`

        The function takes `pandas.Series` and returns a scalar value. The `returnType`
        should be a primitive data type, and the returned scalar can be either a python primitive
        type, e.g., int or float, or a numpy data type, e.g., numpy.int64 or numpy.float64.
        `Any` should ideally be a specific scalar type accordingly.

        >>> @pandas_udf("double")
        ... def mean_udf(v: pd.Series) -> float:
        ...     return v.mean()
        ...
        >>> df = spark.createDataFrame(
        ...     [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v"))
        >>> df.groupby("id").agg(mean_udf(df['v'])).show()
        +---+-----------+
        | id|mean_udf(v)|
        +---+-----------+
        |  1|        1.5|
        |  2|        6.0|
        +---+-----------+

        This UDF can also be used as a window function as below:

        >>> from pyspark.sql import Window
        >>> @pandas_udf("double")
        ... def mean_udf(v: pd.Series) -> float:
        ...     return v.mean()
        ...
        >>> df = spark.createDataFrame(
        ...     [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v"))
        >>> w = Window.partitionBy('id').orderBy('v').rowsBetween(-1, 0)
        >>> df.withColumn('mean_v', mean_udf("v").over(w)).show()
        +---+----+------+
        | id|   v|mean_v|
        +---+----+------+
        |  1| 1.0|   1.0|
        |  1| 2.0|   1.5|
        |  2| 3.0|   3.0|
        |  2| 5.0|   4.0|
        |  2|10.0|   7.5|
        +---+----+------+

        .. note:: For performance reasons, the input series to window functions are not copied.
            Therefore, mutating the input series is not allowed and will cause incorrect results.
            For the same reason, users should also not rely on the index of the input series.

    Notes
    -----
    The user-defined functions do not support conditional expressions or short circuiting
    in boolean expressions; they end up being executed all internally. If the functions
    can fail on special rows, the workaround is to incorporate the condition into the functions.

    The user-defined functions do not take keyword arguments on the calling side.

    The data type of the returned `pandas.Series` from the user-defined functions should
    match the defined `returnType` (see :meth:`types.to_arrow_type` and
    :meth:`types.from_arrow_type`). When there is a mismatch between them, Spark might do
    conversion on the returned data. The conversion is not guaranteed to be correct and
    results should be checked for accuracy by users.

    Currently,
    :class:`pyspark.sql.types.MapType`,
    :class:`pyspark.sql.types.ArrayType` of :class:`pyspark.sql.types.TimestampType` and
    nested :class:`pyspark.sql.types.StructType`
    are not supported as output types.

    See Also
    --------
    pyspark.sql.GroupedData.agg
    pyspark.sql.DataFrame.mapInPandas
    pyspark.sql.GroupedData.applyInPandas
    pyspark.sql.PandasCogroupedOps.applyInPandas
    pyspark.sql.UDFRegistration.register
    """

    # The following table shows most of Pandas data and SQL type conversions in Pandas UDFs that
    # are not yet visible to the user. Some of behaviors are buggy and might be changed in the near
    # future. The table might have to be eventually documented externally.
    # Please see SPARK-28132's PR to see the codes in order to generate the table below.
    #
    # +-----------------------------+----------------------+------------------+------------------+------------------+--------------------+--------------------+------------------+------------------+------------------+------------------+--------------+--------------+--------------+-----------------------------------+-----------------------------------------------------+-----------------+--------------------+-----------------------------+--------------+-----------------+------------------+---------------+--------------------------------+  # noqa
    # |SQL Type \ Pandas Value(Type)|None(object(NoneType))|        True(bool)|           1(int8)|          1(int16)|            1(int32)|            1(int64)|          1(uint8)|         1(uint16)|         1(uint32)|         1(uint64)|  1.0(float16)|  1.0(float32)|  1.0(float64)|1970-01-01 00:00:00(datetime64[ns])|1970-01-01 00:00:00-05:00(datetime64[ns, US/Eastern])|a(object(string))|  1(object(Decimal))|[1 2 3](object(array[int32]))| 1.0(float128)|(1+0j)(complex64)|(1+0j)(complex128)|    A(category)|1 days 00:00:00(timedelta64[ns])|  # noqa
    # +-----------------------------+----------------------+------------------+------------------+------------------+--------------------+--------------------+------------------+------------------+------------------+------------------+--------------+--------------+--------------+-----------------------------------+-----------------------------------------------------+-----------------+--------------------+-----------------------------+--------------+-----------------+------------------+---------------+--------------------------------+  # noqa
    # |                      boolean|                  None|              True|              True|              True|                True|                True|              True|              True|              True|              True|          True|          True|          True|                                  X|                                                    X|                X|                   X|                            X|             X|                X|                 X|              X|                               X|  # noqa
    # |                      tinyint|                  None|                 1|                 1|                 1|                   1|                   1|                 1|                 1|                 1|                 1|             1|             1|             1|                                  X|                                                    X|                X|                   1|                            X|             X|                X|                 X|              X|                               X|  # noqa
    # |                     smallint|                  None|                 1|                 1|                 1|                   1|                   1|                 1|                 1|                 1|                 1|             1|             1|             1|                                  X|                                                    X|                X|                   1|                            X|             X|                X|                 X|              X|                               X|  # noqa
    # |                          int|                  None|                 1|                 1|                 1|                   1|                   1|                 1|                 1|                 1|                 1|             1|             1|             1|                                  X|                                                    X|                X|                   1|                            X|             X|                X|                 X|              X|                               X|  # noqa
    # |                       bigint|                  None|                 1|                 1|                 1|                   1|                   1|                 1|                 1|                 1|                 1|             1|             1|             1|                                  0|                                       18000000000000|                X|                   1|                            X|             X|                X|                 X|              X|                  86400000000000|  # noqa
    # |                        float|                  None|               1.0|               1.0|               1.0|                 1.0|                 1.0|               1.0|               1.0|               1.0|               1.0|           1.0|           1.0|           1.0|                                  X|                                                    X|                X|                   X|                            X|             X|                X|                 X|              X|                               X|  # noqa
    # |                       double|                  None|               1.0|               1.0|               1.0|                 1.0|                 1.0|               1.0|               1.0|               1.0|               1.0|           1.0|           1.0|           1.0|                                  X|                                                    X|                X|                   X|                            X|             X|                X|                 X|              X|                               X|  # noqa
    # |                         date|                  None|                 X|                 X|                 X|datetime.date(197...|                   X|                 X|                 X|                 X|                 X|             X|             X|             X|               datetime.date(197...|                                 datetime.date(197...|                X|datetime.date(197...|                            X|             X|                X|                 X|              X|                               X|  # noqa
    # |                    timestamp|                  None|                 X|                 X|                 X|                   X|datetime.datetime...|                 X|                 X|                 X|                 X|             X|             X|             X|               datetime.datetime...|                                 datetime.datetime...|                X|datetime.datetime...|                            X|             X|                X|                 X|              X|                               X|  # noqa
    # |                       string|                  None|                 X|                 X|                 X|                   X|                   X|                 X|                 X|                 X|                 X|             X|             X|             X|                                  X|                                                    X|              'a'|                   X|                            X|             X|                X|                 X|            'A'|                               X|  # noqa
    # |                decimal(10,0)|                  None|                 X|                 X|                 X|                   X|                   X|                 X|                 X|                 X|                 X|             X|             X|             X|                                  X|                                                    X|                X|        Decimal('1')|                            X|             X|                X|                 X|              X|                               X|  # noqa
    # |                   array<int>|                  None|                 X|                 X|                 X|                   X|                   X|                 X|                 X|                 X|                 X|             X|             X|             X|                                  X|                                                    X|                X|                   X|                    [1, 2, 3]|             X|                X|                 X|              X|                               X|  # noqa
    # |              map<string,int>|                     X|                 X|                 X|                 X|                   X|                   X|                 X|                 X|                 X|                 X|             X|             X|             X|                                  X|                                                    X|                X|                   X|                            X|             X|                X|                 X|              X|                               X|  # noqa
    # |               struct<_1:int>|                     X|                 X|                 X|                 X|                   X|                   X|                 X|                 X|                 X|                 X|             X|             X|             X|                                  X|                                                    X|                X|                   X|                            X|             X|                X|                 X|              X|                               X|  # noqa
    # |                       binary|                  None|bytearray(b'\x01')|bytearray(b'\x01')|bytearray(b'\x01')|  bytearray(b'\x01')|  bytearray(b'\x01')|bytearray(b'\x01')|bytearray(b'\x01')|bytearray(b'\x01')|bytearray(b'\x01')|bytearray(b'')|bytearray(b'')|bytearray(b'')|                     bytearray(b'')|                                       bytearray(b'')|  bytearray(b'a')|                   X|                            X|bytearray(b'')|   bytearray(b'')|    bytearray(b'')|bytearray(b'A')|                  bytearray(b'')|  # noqa
    # +-----------------------------+----------------------+------------------+------------------+------------------+--------------------+--------------------+------------------+------------------+------------------+------------------+--------------+--------------+--------------+-----------------------------------+-----------------------------------------------------+-----------------+--------------------+-----------------------------+--------------+-----------------+------------------+---------------+--------------------------------+  # noqa    #
    #
    # Note: DDL formatted string is used for 'SQL Type' for simplicity. This string can be
    #       used in `returnType`.
    # Note: The values inside of the table are generated by `repr`.
    # Note: Python 3.7.3, Pandas 1.1.1 and PyArrow 1.0.1 are used.
    # Note: Timezone is KST.
    # Note: 'X' means it throws an exception during the conversion.
    require_minimum_pandas_version()
    require_minimum_pyarrow_version()

    # decorator @pandas_udf(returnType, functionType)
    is_decorator = f is None or isinstance(f, (str, DataType))

    if is_decorator:
        # If DataType has been passed as a positional argument
        # for decorator use it as a returnType
        return_type = f or returnType

        if functionType is not None:
            # @pandas_udf(dataType, functionType=functionType)
            # @pandas_udf(returnType=dataType, functionType=functionType)
            eval_type = functionType
        elif returnType is not None and isinstance(returnType, int):
            # @pandas_udf(dataType, functionType)
            eval_type = returnType
        else:
            # @pandas_udf(dataType) or @pandas_udf(returnType=dataType)
            eval_type = None
    else:
        return_type = returnType

        if functionType is not None:
            eval_type = functionType
        else:
            eval_type = None

    if return_type is None:
        raise ValueError("Invalid return type: returnType can not be None")

    if eval_type not in [
            PythonEvalType.SQL_SCALAR_PANDAS_UDF,
            PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
            PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
            PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
            PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
            PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF, None
    ]:  # None means it should infer the type from type hints.

        raise ValueError(
            "Invalid function type: "
            "functionType must be one the values from PandasUDFType")

    if is_decorator:
        return functools.partial(_create_pandas_udf,
                                 returnType=return_type,
                                 evalType=eval_type)
    else:
        return _create_pandas_udf(f=f,
                                  returnType=return_type,
                                  evalType=eval_type)
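
The `is_decorator` branch above distinguishes two calling conventions for `pandas_udf`. A minimal sketch of both, assuming an existing `SparkSession` named `spark`:

import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType

# Decorator form: f is a DataType/str, so a functools.partial of _create_pandas_udf is returned.
@pandas_udf("string")
def to_upper(s: pd.Series) -> pd.Series:
    return s.str.upper()

# Wrapping form: f is the Python function, so _create_pandas_udf is called directly.
# A lambda carries no type hints, so the eval type is given explicitly here.
slen = pandas_udf(lambda s: s.str.len(), "int", PandasUDFType.SCALAR)

df = spark.createDataFrame([("John Doe",)], ("name",))  # assumption: `spark` exists
df.select(to_upper("name"), slen("name")).show()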
Example 6
    def toPandas(self):
        """
        Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``.

        This is only available if Pandas is installed and available.

        .. note:: This method should only be used if the resulting pandas :class:`DataFrame` is
            expected to be small, as all the data is loaded into the driver's memory.

        .. note:: Usage with spark.sql.execution.arrow.pyspark.enabled=True is experimental.

        >>> df.toPandas()  # doctest: +SKIP
           age   name
        0    2  Alice
        1    5    Bob
        """
        from pyspark.sql.dataframe import DataFrame

        assert isinstance(self, DataFrame)

        from pyspark.sql.pandas.utils import require_minimum_pandas_version
        require_minimum_pandas_version()

        import numpy as np
        import pandas as pd

        timezone = self.sql_ctx._conf.sessionLocalTimeZone()

        if self.sql_ctx._conf.arrowPySparkEnabled():
            use_arrow = True
            try:
                from pyspark.sql.pandas.types import to_arrow_schema
                from pyspark.sql.pandas.utils import require_minimum_pyarrow_version

                require_minimum_pyarrow_version()
                to_arrow_schema(self.schema)
            except Exception as e:

                if self.sql_ctx._conf.arrowPySparkFallbackEnabled():
                    msg = (
                        "toPandas attempted Arrow optimization because "
                        "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, "
                        "failed by the reason below:\n  %s\n"
                        "Attempting non-optimization as "
                        "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "
                        "true." % _exception_message(e))
                    warnings.warn(msg)
                    use_arrow = False
                else:
                    msg = (
                        "toPandas attempted Arrow optimization because "
                        "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
                        "reached the error below and will not continue because automatic fallback "
                        "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
                        "false.\n  %s" % _exception_message(e))
                    warnings.warn(msg)
                    raise

            # Try to use Arrow optimization when the schema is supported and the required version
            # of PyArrow is found, if 'spark.sql.execution.arrow.pyspark.enabled' is enabled.
            if use_arrow:
                try:
                    from pyspark.sql.pandas.types import _check_dataframe_localize_timestamps
                    import pyarrow
                    batches = self._collect_as_arrow()
                    if len(batches) > 0:
                        table = pyarrow.Table.from_batches(batches)
                        # Pandas DataFrame created from PyArrow uses datetime64[ns] for date type
                        # values, but we should use datetime.date to match the behavior with when
                        # Arrow optimization is disabled.
                        pdf = table.to_pandas(date_as_object=True)
                        return _check_dataframe_localize_timestamps(
                            pdf, timezone)
                    else:
                        return pd.DataFrame.from_records([],
                                                         columns=self.columns)
                except Exception as e:
                    # We might have to allow fallback here as well but multiple Spark jobs can
                    # be executed. So, simply fail in this case for now.
                    msg = (
                        "toPandas attempted Arrow optimization because "
                        "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
                        "reached the error below and can not continue. Note that "
                        "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an "
                        "effect on failures in the middle of "
                        "computation.\n  %s" % _exception_message(e))
                    warnings.warn(msg)
                    raise

        # Below is toPandas without Arrow optimization.
        pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)

        dtype = {}
        for field in self.schema:
            pandas_type = PandasConversionMixin._to_corrected_pandas_type(
                field.dataType)
            # SPARK-21766: if an integer field is nullable and has null values, it can be
            # inferred by pandas as float column. Once we convert the column with NaN back
            # to integer type e.g., np.int16, we will hit exception. So we use the inferred
            # float type, not the corrected type from the schema in this case.
            if pandas_type is not None and \
                not(isinstance(field.dataType, IntegralType) and field.nullable and
                    pdf[field.name].isnull().any()):
                dtype[field.name] = pandas_type
            # Ensure we fall back to nullable numpy types, even when whole column is null:
            if isinstance(field.dataType,
                          IntegralType) and pdf[field.name].isnull().any():
                dtype[field.name] = np.float64
            if isinstance(field.dataType,
                          BooleanType) and pdf[field.name].isnull().any():
                dtype[field.name] = np.object

        for f, t in dtype.items():
            pdf[f] = pdf[f].astype(t, copy=False)

        if timezone is None:
            return pdf
        else:
            from pyspark.sql.pandas.types import _check_series_convert_timestamps_local_tz
            for field in self.schema:
                # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
                if isinstance(field.dataType, TimestampType):
                    pdf[field.name] = \
                        _check_series_convert_timestamps_local_tz(pdf[field.name], timezone)
            return pdf
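
One observable consequence of the dtype handling above (SPARK-21766): a nullable integer column containing nulls is returned with the pandas-inferred float dtype rather than the schema's integer type. A minimal sketch, assuming an existing `SparkSession` named `spark`:

# Assumption: `spark` is an existing SparkSession; Arrow disabled to exercise this code path.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")

df = spark.createDataFrame([(1,), (None,)], "v int")
pdf = df.toPandas()
print(pdf.dtypes)   # v is float64 (NaN for the null row), not the schema's int type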
Example 7
def pandas_udf(f=None, returnType=None, functionType=None):
    """
    Creates a vectorized user defined function (UDF).

    :param f: user-defined function. A python function if used as a standalone function
    :param returnType: the return type of the user-defined function. The value can be either a
        :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.
    :param functionType: an enum value in :class:`pyspark.sql.functions.PandasUDFType`.
                         Default: SCALAR.

    .. seealso:: :meth:`pyspark.sql.DataFrame.mapInPandas`
    .. seealso:: :meth:`pyspark.sql.GroupedData.applyInPandas`
    .. seealso:: :meth:`pyspark.sql.PandasCogroupedOps.applyInPandas`

    The function type of the UDF can be one of the following:

    1. SCALAR

       A scalar UDF defines a transformation: One or more `pandas.Series` -> A `pandas.Series`.
       The length of the returned `pandas.Series` must be the same as that of the input `pandas.Series`.
       If the return type is :class:`StructType`, the returned value should be a `pandas.DataFrame`.

       :class:`MapType`, nested :class:`StructType` are currently not supported as output types.

       Scalar UDFs can be used with :meth:`pyspark.sql.DataFrame.withColumn` and
       :meth:`pyspark.sql.DataFrame.select`.

       >>> from pyspark.sql.functions import pandas_udf, PandasUDFType
       >>> from pyspark.sql.types import IntegerType, StringType
       >>> slen = pandas_udf(lambda s: s.str.len(), IntegerType())  # doctest: +SKIP
       >>> @pandas_udf(StringType())  # doctest: +SKIP
       ... def to_upper(s):
       ...     return s.str.upper()
       ...
       >>> @pandas_udf("integer", PandasUDFType.SCALAR)  # doctest: +SKIP
       ... def add_one(x):
       ...     return x + 1
       ...
       >>> df = spark.createDataFrame([(1, "John Doe", 21)],
       ...                            ("id", "name", "age"))  # doctest: +SKIP
       >>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \\
       ...     .show()  # doctest: +SKIP
       +----------+--------------+------------+
       |slen(name)|to_upper(name)|add_one(age)|
       +----------+--------------+------------+
       |         8|      JOHN DOE|          22|
       +----------+--------------+------------+
       >>> @pandas_udf("first string, last string")  # doctest: +SKIP
       ... def split_expand(n):
       ...     return n.str.split(expand=True)
       >>> df.select(split_expand("name")).show()  # doctest: +SKIP
       +------------------+
       |split_expand(name)|
       +------------------+
       |       [John, Doe]|
       +------------------+

       .. note:: The length of `pandas.Series` within a scalar UDF is not that of the whole input
           column, but is the length of an internal batch used for each call to the function.
           Therefore, this can be used, for example, to ensure the length of each returned
           `pandas.Series`, and can not be used as the column length.

    2. SCALAR_ITER

       A scalar iterator UDF is semantically the same as the scalar Pandas UDF above except that the
       wrapped Python function takes an iterator of batches as input instead of a single batch and,
       instead of returning a single output batch, it yields output batches or explicitly returns a
       generator or an iterator of output batches.
       It is useful when the UDF execution requires initializing some state, e.g., loading a machine
       learning model file to apply inference to every input batch.

       .. note:: It is not guaranteed that one invocation of a scalar iterator UDF will process all
           batches from one partition, although it is currently implemented this way.
           Your code shall not rely on this behavior because it might change in the future for
           further optimization, e.g., one invocation processes multiple partitions.

       Scalar iterator UDFs are used with :meth:`pyspark.sql.DataFrame.withColumn` and
       :meth:`pyspark.sql.DataFrame.select`.

       >>> import pandas as pd  # doctest: +SKIP
       >>> from pyspark.sql.functions import col, pandas_udf, struct, PandasUDFType
       >>> pdf = pd.DataFrame([1, 2, 3], columns=["x"])  # doctest: +SKIP
       >>> df = spark.createDataFrame(pdf)  # doctest: +SKIP

       When the UDF is called with a single column that is not `StructType`, the input to the
       underlying function is an iterator of `pd.Series`.

       >>> @pandas_udf("long", PandasUDFType.SCALAR_ITER)  # doctest: +SKIP
       ... def plus_one(batch_iter):
       ...     for x in batch_iter:
       ...         yield x + 1
       ...
       >>> df.select(plus_one(col("x"))).show()  # doctest: +SKIP
       +-----------+
       |plus_one(x)|
       +-----------+
       |          2|
       |          3|
       |          4|
       +-----------+

       When the UDF is called with more than one column, the input to the underlying function is an
       iterator of tuples of `pd.Series`.

       >>> @pandas_udf("long", PandasUDFType.SCALAR_ITER)  # doctest: +SKIP
       ... def multiply_two_cols(batch_iter):
       ...     for a, b in batch_iter:
       ...         yield a * b
       ...
       >>> df.select(multiply_two_cols(col("x"), col("x"))).show()  # doctest: +SKIP
       +-----------------------+
       |multiply_two_cols(x, x)|
       +-----------------------+
       |                      1|
       |                      4|
       |                      9|
       +-----------------------+

       When the UDF is called with a single column that is `StructType`, the input to the underlying
       function is an iterator of `pd.DataFrame`.

       >>> @pandas_udf("long", PandasUDFType.SCALAR_ITER)  # doctest: +SKIP
       ... def multiply_two_nested_cols(pdf_iter):
       ...    for pdf in pdf_iter:
       ...        yield pdf["a"] * pdf["b"]
       ...
       >>> df.select(
       ...     multiply_two_nested_cols(
       ...         struct(col("x").alias("a"), col("x").alias("b"))
       ...     ).alias("y")
       ... ).show()  # doctest: +SKIP
       +---+
       |  y|
       +---+
       |  1|
       |  4|
       |  9|
       +---+

       In the UDF, you can initialize some states before processing batches, wrap your code with
       `try ... finally ...` or use context managers to ensure the release of resources at the end
       or in case of early termination.

       >>> y_bc = spark.sparkContext.broadcast(1)  # doctest: +SKIP
       >>> @pandas_udf("long", PandasUDFType.SCALAR_ITER)  # doctest: +SKIP
       ... def plus_y(batch_iter):
       ...     y = y_bc.value  # initialize some state
       ...     try:
       ...         for x in batch_iter:
       ...             yield x + y
       ...     finally:
       ...         pass  # release resources here, if any
       ...
       >>> df.select(plus_y(col("x"))).show()  # doctest: +SKIP
       +---------+
       |plus_y(x)|
       +---------+
       |        2|
       |        3|
       |        4|
       +---------+

    3. GROUPED_MAP

       A grouped map UDF defines transformation: A `pandas.DataFrame` -> A `pandas.DataFrame`
       The returnType should be a :class:`StructType` describing the schema of the returned
       `pandas.DataFrame`. The column labels of the returned `pandas.DataFrame` must either match
       the field names in the defined returnType schema if specified as strings, or match the
       field data types by position if not strings, e.g. integer indices.
       The length of the returned `pandas.DataFrame` can be arbitrary.

       Grouped map UDFs are used with :meth:`pyspark.sql.GroupedData.apply`.

       >>> from pyspark.sql.functions import pandas_udf, PandasUDFType
       >>> df = spark.createDataFrame(
       ...     [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
       ...     ("id", "v"))  # doctest: +SKIP
       >>> @pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP)  # doctest: +SKIP
       ... def normalize(pdf):
       ...     v = pdf.v
       ...     return pdf.assign(v=(v - v.mean()) / v.std())
       >>> df.groupby("id").apply(normalize).show()  # doctest: +SKIP
       +---+-------------------+
       | id|                  v|
       +---+-------------------+
       |  1|-0.7071067811865475|
       |  1| 0.7071067811865475|
       |  2|-0.8320502943378437|
       |  2|-0.2773500981126146|
       |  2| 1.1094003924504583|
       +---+-------------------+

       Alternatively, the user can define a function that takes two arguments.
       In this case, the grouping key(s) will be passed as the first argument and the data will
       be passed as the second argument. The grouping key(s) will be passed as a tuple of numpy
       data types, e.g., `numpy.int32` and `numpy.float64`. The data will still be passed in
       as a `pandas.DataFrame` containing all columns from the original Spark DataFrame.
       This is useful when the user does not want to hardcode grouping key(s) in the function.

       >>> import pandas as pd  # doctest: +SKIP
       >>> from pyspark.sql.functions import pandas_udf, PandasUDFType
       >>> df = spark.createDataFrame(
       ...     [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
       ...     ("id", "v"))  # doctest: +SKIP
       >>> @pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP)  # doctest: +SKIP
       ... def mean_udf(key, pdf):
       ...     # key is a tuple of one numpy.int64, which is the value
       ...     # of 'id' for the current group
       ...     return pd.DataFrame([key + (pdf.v.mean(),)])
       >>> df.groupby('id').apply(mean_udf).show()  # doctest: +SKIP
       +---+---+
       | id|  v|
       +---+---+
       |  1|1.5|
       |  2|6.0|
       +---+---+
       >>> @pandas_udf(
       ...    "id long, `ceil(v / 2)` long, v double",
       ...    PandasUDFType.GROUPED_MAP)  # doctest: +SKIP
       ... def sum_udf(key, pdf):
       ...     # key is a tuple of two numpy.int64s, which is the values
       ...     # of 'id' and 'ceil(df.v / 2)' for the current group
       ...     return pd.DataFrame([key + (pdf.v.sum(),)])
       >>> df.groupby(df.id, ceil(df.v / 2)).apply(sum_udf).show()  # doctest: +SKIP
       +---+-----------+----+
       | id|ceil(v / 2)|   v|
       +---+-----------+----+
       |  2|          5|10.0|
       |  1|          1| 3.0|
       |  2|          3| 5.0|
       |  2|          2| 3.0|
       +---+-----------+----+

       .. note:: If returning a new `pandas.DataFrame` constructed with a dictionary, it is
           recommended to explicitly index the columns by name to ensure the positions are correct,
           or alternatively use an `OrderedDict`.
           For example, `pd.DataFrame({'id': ids, 'a': data}, columns=['id', 'a'])` or
           `pd.DataFrame(OrderedDict([('id', ids), ('a', data)]))`.
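
       A minimal sketch of that recommendation, reusing `df` and `pd` from the examples above
       (the function name `stats_udf` is only illustrative):

       >>> @pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP)  # doctest: +SKIP
       ... def stats_udf(pdf):
       ...     # Index the columns explicitly so their positions match the declared schema.
       ...     return pd.DataFrame(
       ...         {'v': [pdf.v.mean()], 'id': [pdf.id.iloc[0]]},
       ...         columns=['id', 'v'])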

       .. seealso:: :meth:`pyspark.sql.GroupedData.apply`

    4. GROUPED_AGG

       A grouped aggregate UDF defines a transformation: one or more `pandas.Series` -> a scalar.
       The `returnType` should be a primitive data type, e.g., :class:`DoubleType`.
       The returned scalar can be either a python primitive type, e.g., `int` or `float`,
       or a numpy data type, e.g., `numpy.int64` or `numpy.float64`.

       :class:`MapType` and :class:`StructType` are currently not supported as output types.

       Grouped aggregate UDFs are used with :meth:`pyspark.sql.GroupedData.agg` and
       :class:`pyspark.sql.Window`.

       This example shows using a grouped aggregate UDF with groupby:

       >>> from pyspark.sql.functions import pandas_udf, PandasUDFType
       >>> df = spark.createDataFrame(
       ...     [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
       ...     ("id", "v"))
       >>> @pandas_udf("double", PandasUDFType.GROUPED_AGG)  # doctest: +SKIP
       ... def mean_udf(v):
       ...     return v.mean()
       >>> df.groupby("id").agg(mean_udf(df['v'])).show()  # doctest: +SKIP
       +---+-----------+
       | id|mean_udf(v)|
       +---+-----------+
       |  1|        1.5|
       |  2|        6.0|
       +---+-----------+

       This example shows using a grouped aggregate UDF as a window function.

       >>> from pyspark.sql.functions import pandas_udf, PandasUDFType
       >>> from pyspark.sql import Window
       >>> df = spark.createDataFrame(
       ...     [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
       ...     ("id", "v"))
       >>> @pandas_udf("double", PandasUDFType.GROUPED_AGG)  # doctest: +SKIP
       ... def mean_udf(v):
       ...     return v.mean()
       >>> w = (Window.partitionBy('id')
       ...            .orderBy('v')
       ...            .rowsBetween(-1, 0))
       >>> df.withColumn('mean_v', mean_udf(df['v']).over(w)).show()  # doctest: +SKIP
       +---+----+------+
       | id|   v|mean_v|
       +---+----+------+
       |  1| 1.0|   1.0|
       |  1| 2.0|   1.5|
       |  2| 3.0|   3.0|
       |  2| 5.0|   4.0|
       |  2|10.0|   7.5|
       +---+----+------+

       .. note:: For performance reasons, the input series to window functions are not copied.
            Therefore, mutating the input series is not allowed and will cause incorrect results.
            For the same reason, users should also not rely on the index of the input series.
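
       If a modified series is needed inside such a UDF, a minimal sketch of one way to stay
       within this constraint (the name `clipped_mean` is only illustrative) is to work on an
       explicit copy instead of writing into the input:

       >>> @pandas_udf("double", PandasUDFType.GROUPED_AGG)  # doctest: +SKIP
       ... def clipped_mean(v):
       ...     v = v.copy()  # never write into the original, uncopied input series
       ...     v[v < 0] = 0.0
       ...     return v.mean()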

       .. seealso:: :meth:`pyspark.sql.GroupedData.agg` and :class:`pyspark.sql.Window`

    .. note:: The user-defined functions do not support conditional expressions or short-circuiting
        in boolean expressions; they end up being executed for all rows internally. If the functions
        can fail on special rows, the workaround is to incorporate the condition into the functions
        themselves.
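
        For example, instead of guarding a division with a conditional expression on the
        calling side, a sketch of that workaround (the function name `safe_ratio` is only
        illustrative) handles the special rows inside the function itself:

        >>> @pandas_udf("double", PandasUDFType.SCALAR)  # doctest: +SKIP
        ... def safe_ratio(x, y):
        ...     # Handle the problematic rows here instead of relying on short-circuiting
        ...     # in the surrounding SQL expression.
        ...     return (x / y).where(y != 0)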

    .. note:: The user-defined functions do not take keyword arguments on the calling side.

    .. note:: The data type of the `pandas.Series` returned by the user-defined functions should
        match the defined returnType (see :meth:`types.to_arrow_type` and
        :meth:`types.from_arrow_type`). When there is a mismatch between them, Spark might perform
        a conversion on the returned data. The conversion is not guaranteed to be correct and
        users should check the results for accuracy.
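
        For example, when the declared returnType is `double`, a minimal sketch of one way to
        avoid such a mismatch (the name `as_double` is only illustrative) is to cast the returned
        series explicitly:

        >>> @pandas_udf("double", PandasUDFType.SCALAR)  # doctest: +SKIP
        ... def as_double(v):
        ...     # Cast so the returned series dtype matches the declared return type.
        ...     return (v * 2).astype('float64')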
    """

    # The following table shows most of the Pandas data and SQL type conversions in Pandas UDFs
    # that are not yet visible to the user. Some of the behaviors are buggy and might be changed
    # in the near future. The table might eventually have to be documented externally.
    # Please see SPARK-28132's PR for the code used to generate the table below.
    #
    # +-----------------------------+----------------------+------------------+------------------+------------------+--------------------+--------------------+------------------+------------------+------------------+------------------+--------------+--------------+--------------+-----------------------------------+-----------------------------------------------------+-----------------+--------------------+-----------------------------+--------------+-----------------+------------------+-----------+--------------------------------+  # noqa
    # |SQL Type \ Pandas Value(Type)|None(object(NoneType))|        True(bool)|           1(int8)|          1(int16)|            1(int32)|            1(int64)|          1(uint8)|         1(uint16)|         1(uint32)|         1(uint64)|  1.0(float16)|  1.0(float32)|  1.0(float64)|1970-01-01 00:00:00(datetime64[ns])|1970-01-01 00:00:00-05:00(datetime64[ns, US/Eastern])|a(object(string))|  1(object(Decimal))|[1 2 3](object(array[int32]))| 1.0(float128)|(1+0j)(complex64)|(1+0j)(complex128)|A(category)|1 days 00:00:00(timedelta64[ns])|  # noqa
    # +-----------------------------+----------------------+------------------+------------------+------------------+--------------------+--------------------+------------------+------------------+------------------+------------------+--------------+--------------+--------------+-----------------------------------+-----------------------------------------------------+-----------------+--------------------+-----------------------------+--------------+-----------------+------------------+-----------+--------------------------------+  # noqa
    # |                      boolean|                  None|              True|              True|              True|                True|                True|              True|              True|              True|              True|          True|          True|          True|                                  X|                                                    X|                X|                   X|                            X|             X|                X|                 X|          X|                               X|  # noqa
    # |                      tinyint|                  None|                 1|                 1|                 1|                   1|                   1|                 1|                 1|                 1|                 1|             1|             1|             1|                                  X|                                                    X|                X|                   1|                            X|             X|                X|                 X|          0|                               X|  # noqa
    # |                     smallint|                  None|                 1|                 1|                 1|                   1|                   1|                 1|                 1|                 1|                 1|             1|             1|             1|                                  X|                                                    X|                X|                   1|                            X|             X|                X|                 X|          X|                               X|  # noqa
    # |                          int|                  None|                 1|                 1|                 1|                   1|                   1|                 1|                 1|                 1|                 1|             1|             1|             1|                                  X|                                                    X|                X|                   1|                            X|             X|                X|                 X|          X|                               X|  # noqa
    # |                       bigint|                  None|                 1|                 1|                 1|                   1|                   1|                 1|                 1|                 1|                 1|             1|             1|             1|                                  0|                                       18000000000000|                X|                   1|                            X|             X|                X|                 X|          X|                               X|  # noqa
    # |                        float|                  None|               1.0|               1.0|               1.0|                 1.0|                 1.0|               1.0|               1.0|               1.0|               1.0|           1.0|           1.0|           1.0|                                  X|                                                    X|                X|                   X|                            X|             X|                X|                 X|          X|                               X|  # noqa
    # |                       double|                  None|               1.0|               1.0|               1.0|                 1.0|                 1.0|               1.0|               1.0|               1.0|               1.0|           1.0|           1.0|           1.0|                                  X|                                                    X|                X|                   X|                            X|             X|                X|                 X|          X|                               X|  # noqa
    # |                         date|                  None|                 X|                 X|                 X|datetime.date(197...|                   X|                 X|                 X|                 X|                 X|             X|             X|             X|               datetime.date(197...|                                 datetime.date(197...|                X|datetime.date(197...|                            X|             X|                X|                 X|          X|                               X|  # noqa
    # |                    timestamp|                  None|                 X|                 X|                 X|                   X|datetime.datetime...|                 X|                 X|                 X|                 X|             X|             X|             X|               datetime.datetime...|                                 datetime.datetime...|                X|datetime.datetime...|                            X|             X|                X|                 X|          X|                               X|  # noqa
    # |                       string|                  None|                ''|                ''|                ''|              '\x01'|              '\x01'|                ''|                ''|            '\x01'|            '\x01'|            ''|            ''|            ''|                                  X|                                                    X|              'a'|                   X|                            X|            ''|                X|                ''|          X|                               X|  # noqa
    # |                decimal(10,0)|                  None|                 X|                 X|                 X|                   X|                   X|                 X|                 X|                 X|                 X|             X|             X|             X|                                  X|                                                    X|                X|        Decimal('1')|                            X|             X|                X|                 X|          X|                               X|  # noqa
    # |                   array<int>|                  None|                 X|                 X|                 X|                   X|                   X|                 X|                 X|                 X|                 X|             X|             X|             X|                                  X|                                                    X|                X|                   X|                    [1, 2, 3]|             X|                X|                 X|          X|                               X|  # noqa
    # |              map<string,int>|                     X|                 X|                 X|                 X|                   X|                   X|                 X|                 X|                 X|                 X|             X|             X|             X|                                  X|                                                    X|                X|                   X|                            X|             X|                X|                 X|          X|                               X|  # noqa
    # |               struct<_1:int>|                     X|                 X|                 X|                 X|                   X|                   X|                 X|                 X|                 X|                 X|             X|             X|             X|                                  X|                                                    X|                X|                   X|                            X|             X|                X|                 X|          X|                               X|  # noqa
    # |                       binary|                  None|bytearray(b'\x01')|bytearray(b'\x01')|bytearray(b'\x01')|  bytearray(b'\x01')|  bytearray(b'\x01')|bytearray(b'\x01')|bytearray(b'\x01')|bytearray(b'\x01')|bytearray(b'\x01')|bytearray(b'')|bytearray(b'')|bytearray(b'')|                     bytearray(b'')|                                       bytearray(b'')|  bytearray(b'a')|                   X|                            X|bytearray(b'')|   bytearray(b'')|    bytearray(b'')|          X|                  bytearray(b'')|  # noqa
    # +-----------------------------+----------------------+------------------+------------------+------------------+--------------------+--------------------+------------------+------------------+------------------+------------------+--------------+--------------+--------------+-----------------------------------+-----------------------------------------------------+-----------------+--------------------+-----------------------------+--------------+-----------------+------------------+-----------+--------------------------------+  # noqa
    #
    # Note: DDL formatted string is used for 'SQL Type' for simplicity. This string can be
    #       used in `returnType`.
    # Note: The values inside of the table are generated by `repr`.
    # Note: Python 3.7.3, Pandas 0.24.2 and PyArrow 0.13.0 are used.
    # Note: Timezone is KST.
    # Note: 'X' means it throws an exception during the conversion.
    require_minimum_pandas_version()
    require_minimum_pyarrow_version()

    # decorator @pandas_udf(returnType, functionType)
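    # Illustrative call forms this check distinguishes (a sketch, not exhaustive):
    #   @pandas_udf("double", PandasUDFType.SCALAR)                  # decorator: f is the return type
    #   def plus_one(v): ...
    #   pandas_udf(lambda v: v + 1, "double", PandasUDFType.SCALAR)  # direct call: f is the function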
    is_decorator = f is None or isinstance(f, (str, DataType))

    if is_decorator:
        # If a DataType has been passed as a positional argument
        # for the decorator, use it as the returnType
        return_type = f or returnType

        if functionType is not None:
            # @pandas_udf(dataType, functionType=functionType)
            # @pandas_udf(returnType=dataType, functionType=functionType)
            eval_type = functionType
        elif returnType is not None and isinstance(returnType, int):
            # @pandas_udf(dataType, functionType)
            eval_type = returnType
        else:
            # @pandas_udf(dataType) or @pandas_udf(returnType=dataType)
            eval_type = None
    else:
        return_type = returnType

        if functionType is not None:
            eval_type = functionType
        else:
            eval_type = None

    if return_type is None:
        raise ValueError("Invalid return type: returnType can not be None")

    if eval_type not in [
            PythonEvalType.SQL_SCALAR_PANDAS_UDF,
            PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
            PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
            PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
            PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
            PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF, None
    ]:  # None means it should infer the type from type hints.

        raise ValueError(
            "Invalid function type: "
            "functionType must be one of the values from PandasUDFType")

    if is_decorator:
        return functools.partial(_create_pandas_udf,
                                 returnType=return_type,
                                 evalType=eval_type)
    else:
        return _create_pandas_udf(f=f,
                                  returnType=return_type,
                                  evalType=eval_type)