Example #1
def read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index):
    num_arg = read_int(infile)
    arg_offsets = [read_int(infile) for _ in range(num_arg)]
    row_func = None
    for _ in range(read_int(infile)):
        f, return_type = read_command(pickleSer, infile)
        if row_func is None:
            row_func = f
        else:
            row_func = chain(row_func, f)

    # Make sure StopIterations raised in the user code are not swallowed when the
    # results are consumed in a for loop: re-raise them as RuntimeErrors instead.
    func = fail_on_stopiteration(row_func)

    # The last returnType is the return type of the UDF.
    if eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF:
        return arg_offsets, wrap_scalar_pandas_udf(func, return_type)
    elif eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        argspec = _get_argspec(row_func)  # signature was lost when wrapping it
        return arg_offsets, wrap_grouped_map_pandas_udf(func, return_type, argspec, runner_conf)
    elif eval_type == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF:
        return arg_offsets, wrap_grouped_agg_pandas_udf(func, return_type)
    elif eval_type == PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF:
        return arg_offsets, wrap_window_agg_pandas_udf(func, return_type, runner_conf, udf_index)
    elif eval_type == PythonEvalType.SQL_BATCHED_UDF:
        return arg_offsets, wrap_udf(func, return_type)
    else:
        raise ValueError("Unknown eval type: {}".format(eval_type))
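
For context, `chain` and `fail_on_stopiteration` are small helpers defined elsewhere in pyspark (worker.py and util.py, respectively). A minimal sketch of what they do, assuming these simplified forms:

def chain(f, g):
    # Compose two row functions: feed f's output into g.
    return lambda *a: g(f(*a))

def fail_on_stopiteration(f):
    # Re-raise StopIteration from user code as RuntimeError so the for loops
    # that consume UDF results do not silently treat it as end-of-iteration.
    def wrapper(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except StopIteration as exc:
            raise RuntimeError(
                "Caught StopIteration thrown from user's code; failing the task", exc)
    return wrapper
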
Example #2
def _create_pandas_udf(f, returnType, evalType):
    argspec = _get_argspec(f)

    # pandas UDF by type hints.
    if sys.version_info >= (3, 6):
        from inspect import signature

        if evalType in [
                PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
                PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF
        ]:
            warnings.warn(
                "In Python 3.6+ and Spark 3.0+, it is preferred to specify type hints for "
                "pandas UDF instead of specifying pandas UDF type which will be deprecated "
                "in the future releases. See SPARK-28264 for more details.",
                UserWarning)
        elif evalType in [
                PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
                PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF
        ]:
            # For 'SQL_GROUPED_MAP_PANDAS_UDF', the deprecation warning is
            # triggered at `apply` instead.
            # For 'SQL_MAP_PANDAS_ITER_UDF' and 'SQL_COGROUPED_MAP_PANDAS_UDF',
            # the evaluation type is always set explicitly.
            pass
        elif len(argspec.annotations) > 0:
            evalType = infer_eval_type(signature(f))
            assert evalType is not None

    if evalType is None:
        # Default to a scalar pandas UDF.
        evalType = PythonEvalType.SQL_SCALAR_PANDAS_UDF

    if (evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF or
            evalType == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF) and \
            len(argspec.args) == 0 and \
            argspec.varargs is None:
        raise ValueError(
            "Invalid function: 0-arg pandas_udfs are not supported. "
            "Instead, create a 1-arg pandas_udf and ignore the arg in your function."
        )

    if evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF \
            and len(argspec.args) not in (1, 2):
        raise ValueError(
            "Invalid function: pandas_udf with function type GROUPED_MAP or "
            "the function in groupby.applyInPandas "
            "must take either one argument (data) or two arguments (key, data)."
        )

    if evalType == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF \
            and len(argspec.args) not in (2, 3):
        raise ValueError(
            "Invalid function: the function in cogroup.applyInPandas "
            "must take either two arguments (left, right) "
            "or three arguments (key, left, right).")

    return _create_udf(f, returnType, evalType)
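
`_create_pandas_udf` sits behind the public `pandas_udf` decorator. A usage sketch in the Spark 3.0+ style that the warning above recommends, where the eval type is inferred from the type hints:

import pandas as pd
from pyspark.sql.functions import pandas_udf

@pandas_udf("long")  # only the return type is given; SCALAR is inferred from the hints
def plus_one(s: pd.Series) -> pd.Series:
    return s + 1

# df.select(plus_one("value")) then evaluates the UDF over Arrow batches of the column.
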
Example #3
File: udf.py Project: FUHENG0571/S
def _create_udf(f, returnType, evalType):

    if evalType in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                    PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                    PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF):

        from pyspark.sql.utils import require_minimum_pyarrow_version
        require_minimum_pyarrow_version()

        argspec = _get_argspec(f)

        if evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF and len(argspec.args) == 0 and \
                argspec.varargs is None:
            raise ValueError(
                "Invalid function: 0-arg pandas_udfs are not supported. "
                "Instead, create a 1-arg pandas_udf and ignore the arg in your function."
            )

        if evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF \
                and len(argspec.args) not in (1, 2):
            raise ValueError(
                "Invalid function: pandas_udfs with function type GROUPED_MAP "
                "must take either one argument (data) or two arguments (key, data)."
            )

    # Set the name of the UserDefinedFunction object to be the name of function f
    udf_obj = UserDefinedFunction(f,
                                  returnType=returnType,
                                  name=None,
                                  evalType=evalType,
                                  deterministic=True)
    return udf_obj._wrapped()
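
For orientation: `_create_udf` is the shared factory behind the public `udf` and `pandas_udf` helpers in the pyspark versions these examples come from. A usage sketch of the plain batched path:

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# `udf` routes through _create_udf with evalType=PythonEvalType.SQL_BATCHED_UDF.
slen = udf(lambda s: len(s), IntegerType())
# df.select(slen("name")) evaluates the Python function over batches of rows.
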
Example #4
    def wrapped(key_series, value_series):
        import pandas as pd
        argspec = _get_argspec(f)

        if len(argspec.args) == 1:
            result = f(pd.concat(value_series, axis=1))
        elif len(argspec.args) == 2:
            key = tuple(s[0] for s in key_series)
            result = f(key, pd.concat(value_series, axis=1))
        else:
            # Guard against an unbound `result` if upstream validation is bypassed.
            raise ValueError(
                "The user-defined function must take one argument (data) or "
                "two arguments (key, data), but takes {}".format(len(argspec.args)))

        if not isinstance(result, pd.DataFrame):
            raise TypeError("Return type of the user-defined function should be "
                            "pandas.DataFrame, but is {}".format(type(result)))
        if len(result.columns) != len(return_type):
            raise RuntimeError(
                "Number of columns of the returned pandas.DataFrame "
                "doesn't match specified schema. "
                "Expected: {} Actual: {}".format(len(return_type), len(result.columns)))
        arrow_return_types = (to_arrow_type(field.dataType) for field in return_type)
        return [(result[result.columns[i]], arrow_type)
                for i, arrow_type in enumerate(arrow_return_types)]
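
The one- vs. two-argument branch above mirrors the two signatures a grouped-map pandas UDF may have. A usage sketch with the two-argument (key, data) form:

import pandas as pd

def subtract_mean(key, pdf):
    # `key` is the grouping-key tuple; `pdf` holds the group's rows.
    return pdf.assign(v=pdf.v - pdf.v.mean())

# df.groupby("id").applyInPandas(subtract_mean, schema="id long, v double")
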
Example #5
File: udf.py Project: FUHENG0571/S
    def _create_judf(self):
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.getOrCreate()
        sc = spark.sparkContext

        func = fail_on_stopiteration(self.func)

        # for pandas UDFs the worker needs to know if the function takes
        # one or two arguments, but the signature is lost when wrapping with
        # fail_on_stopiteration, so we store it here
        if self.evalType in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                             PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                             PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF):
            func._argspec = _get_argspec(self.func)

        wrapped_func = _wrap_function(sc, func, self.returnType)
        jdt = spark._jsparkSession.parseDataType(self.returnType.json())
        judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction(
            self._name, wrapped_func, jdt, self.evalType, self.deterministic)
        return judf
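
For reference, `_wrap_function` (from pyspark.sql.udf) pickles the Python function together with its return type and hands it to the JVM. A rough sketch of its shape, simplified from pyspark and not authoritative:

def _wrap_function(sc, func, returnType):
    # Serialize (func, returnType) with cloudpickle and wrap it in the JVM-side
    # PythonFunction that UserDefinedPythonFunction expects.
    command = (func, returnType)
    pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
    return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes,
                                  sc.pythonExec, sc.pythonVer, broadcast_vars,
                                  sc._javaAccumulator)
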