Example #1
def read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index):
    num_arg = read_int(infile)
    arg_offsets = [read_int(infile) for i in range(num_arg)]
    row_func = None
    for i in range(read_int(infile)):
        f, return_type = read_command(pickleSer, infile)
        if row_func is None:
            row_func = f
        else:
            row_func = chain(row_func, f)

    # make sure StopIteration exceptions raised in the user code are not ignored:
    # when they are consumed in a for loop, re-raise them as RuntimeErrors instead
    func = fail_on_stopiteration(row_func)

    # the last returnType will be the return type of the UDF
    if eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF:
        return arg_offsets, wrap_scalar_pandas_udf(func, return_type)
    elif eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        argspec = _get_argspec(row_func)  # signature was lost when wrapping it
        return arg_offsets, wrap_grouped_map_pandas_udf(func, return_type, argspec, runner_conf)
    elif eval_type == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF:
        return arg_offsets, wrap_grouped_agg_pandas_udf(func, return_type)
    elif eval_type == PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF:
        return arg_offsets, wrap_window_agg_pandas_udf(func, return_type, runner_conf, udf_index)
    elif eval_type == PythonEvalType.SQL_BATCHED_UDF:
        return arg_offsets, wrap_udf(func, return_type)
    else:
        raise ValueError("Unknown eval type: {}".format(eval_type))
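All of these examples lean on the same helper: it wraps a user-supplied function so that a StopIteration escaping from it is converted into a RuntimeError. The real helper lives in pyspark.util as fail_on_stopiteration (see Example #7); the version below is only a minimal sketch of the behaviour described in the comment above, and its name and error message are illustrative rather than Spark's exact source:

import functools

def fail_on_stopiteration_sketch(f):
    # Illustrative only: mirrors the documented behaviour of pyspark.util.fail_on_stopiteration.
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except StopIteration as exc:
            # A StopIteration that escapes into a consuming for-loop would end the
            # loop early and silently drop data; surface it as a hard failure instead.
            raise RuntimeError(
                "Caught StopIteration thrown from user's code; failing the task") from exc
    return wrapper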
Example #2
    def _create_judf(self):
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.getOrCreate()
        sc = spark.sparkContext

        func = fail_on_stopiteration(self.func)
        wrapped_func = _wrap_function(sc, func, self.returnType)
        jdt = spark._jsparkSession.parseDataType(self.returnType.json())
        judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction(
            self._name, wrapped_func, jdt, self.evalType, self.deterministic)
        return judf
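For context, _create_judf is typically invoked the first time the Python UDF is actually applied to columns in a query. A minimal, hedged sketch of a call that exercises this path (the session, column names and function are made up for illustration):

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1,), (2,)], ["x"])

plus_one = udf(lambda x: x + 1, IntegerType())    # a plain (SQL_BATCHED_UDF) Python UDF
df.select(plus_one("x").alias("y")).show()        # applying it triggers _create_judf under the hood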
Example #3
File: udf.py Project: FUHENG0571/S
    def _create_judf(self):
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.getOrCreate()
        sc = spark.sparkContext

        func = fail_on_stopiteration(self.func)

        # for pandas UDFs the worker needs to know if the function takes
        # one or two arguments, but the signature is lost when wrapping with
        # fail_on_stopiteration, so we store it here
        if self.evalType in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                             PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                             PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF):
            func._argspec = _get_argspec(self.func)

        wrapped_func = _wrap_function(sc, func, self.returnType)
        jdt = spark._jsparkSession.parseDataType(self.returnType.json())
        judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction(
            self._name, wrapped_func, jdt, self.evalType, self.deterministic)
        return judf
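The stored argspec is what lets the worker accept both a one-argument and a two-argument grouped map function, since wrapping with fail_on_stopiteration hides the original signature. A hedged sketch of the two user-facing forms this accommodates, assuming the Spark 2.x-era GROUPED_MAP API that this code targets (data and column names are illustrative):

from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0)], ("id", "v"))

@pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP)
def subtract_mean(pdf):                  # one-argument form: the group as a pandas DataFrame
    return pdf.assign(v=pdf.v - pdf.v.mean())

@pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP)
def subtract_mean_keyed(key, pdf):       # two-argument form: (grouping key, pandas DataFrame)
    return pdf.assign(v=pdf.v - pdf.v.mean())

df.groupby("id").apply(subtract_mean).show()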
Example #4
    def _create_judf(self):
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.getOrCreate()
        sc = spark.sparkContext

        func = fail_on_stopiteration(self.func)

        # for pandas UDFs the worker needs to know if the function takes
        # one or two arguments, but the signature is lost when wrapping with
        # fail_on_stopiteration, so we store it here
        if self.evalType in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                             PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                             PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF):
            func._argspec = _get_argspec(self.func)

        wrapped_func = _wrap_function(sc, func, self.returnType)
        jdt = spark._jsparkSession.parseDataType(self.returnType.json())
        judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction(
            self._name, wrapped_func, jdt, self.evalType, self.deterministic)
        return judf
Example #5
def read_single_udf(pickleSer, infile, eval_type):
    num_arg = read_int(infile)
    arg_offsets = [read_int(infile) for i in range(num_arg)]
    row_func = None
    for i in range(read_int(infile)):
        f, return_type = read_command(pickleSer, infile)
        if row_func is None:
            row_func = f
        else:
            row_func = chain(row_func, f)

    # make sure StopIteration exceptions raised in the user code are not ignored:
    # when they are consumed in a for loop, re-raise them as RuntimeErrors instead
    func = fail_on_stopiteration(row_func)

    # the last returnType will be the return type of the UDF
    if eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF:
        return arg_offsets, wrap_scalar_pandas_udf(func, return_type)
    elif eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        return arg_offsets, wrap_grouped_map_pandas_udf(func, return_type)
    else:
        return arg_offsets, wrap_udf(func, return_type)
Example #6
    def __init__(self, createCombiner, mergeValue, mergeCombiners):
        self.createCombiner = fail_on_stopiteration(createCombiner)
        self.mergeValue = fail_on_stopiteration(mergeValue)
        self.mergeCombiners = fail_on_stopiteration(mergeCombiners)
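These are the combiner callbacks that RDD aggregation operators such as combineByKey supply; wrapping each one means a StopIteration raised inside, say, mergeValue fails the task instead of quietly ending the merge loop. A short, hedged usage sketch (data and lambdas are illustrative; sc is assumed to be an existing SparkContext):

pairs = sc.parallelize([("a", 1), ("a", 2), ("b", 3)])

sums_and_counts = pairs.combineByKey(
    lambda v: (v, 1),                            # createCombiner: start an accumulator from the first value
    lambda acc, v: (acc[0] + v, acc[1] + 1),     # mergeValue: fold another value into the accumulator
    lambda a, b: (a[0] + b[0], a[1] + b[1]))     # mergeCombiners: merge per-partition accumulators

print(sums_and_counts.collect())                 # e.g. [('a', (3, 2)), ('b', (3, 1))]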
Example #7
def _fail_on_stopiteration(fn):
    # noinspection PyPackageRequirements
    from pyspark import util

    return util.fail_on_stopiteration(fn)
Example #8
    def __init__(self, createCombiner, mergeValue, mergeCombiners):
        self.createCombiner = fail_on_stopiteration(createCombiner)
        self.mergeValue = fail_on_stopiteration(mergeValue)
        self.mergeCombiners = fail_on_stopiteration(mergeCombiners)
Example #9
    def _func(_, iterator):
        return filter(fail_on_stopiteration(_fn), iterator)
Example #10
    def _func(_, iterator):
        return chain.from_iterable(map(fail_on_stopiteration(_fn), iterator))
Example #11
    def _func(_, iterator):
        return map(util.fail_on_stopiteration(_fn), iterator)
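The last three examples look like the per-partition helpers behind RDD transformations such as filter, flatMap and map; because the user function _fn is wrapped, a StopIteration leaking out of it fails the task rather than being mistaken for end-of-iterator. A small, plain-Python sketch of the failure mode the wrapping prevents (values are illustrative):

def bad(x):
    if x == 3:
        raise StopIteration       # e.g. a helper misused inside the user function
    return x * 10

out = []
for v in map(bad, [1, 2, 3, 4]):  # without the wrapper, the for-loop treats the
    out.append(v)                 # StopIteration as normal exhaustion of the iterator
print(out)                        # [10, 20] -- elements 3 and 4 are silently dropped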