def read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index): num_arg = read_int(infile) arg_offsets = [read_int(infile) for i in range(num_arg)] row_func = None for i in range(read_int(infile)): f, return_type = read_command(pickleSer, infile) if row_func is None: row_func = f else: row_func = chain(row_func, f) # make sure StopIteration's raised in the user code are not ignored # when they are processed in a for loop, raise them as RuntimeError's instead func = fail_on_stopiteration(row_func) # the last returnType will be the return type of UDF if eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF: return arg_offsets, wrap_scalar_pandas_udf(func, return_type) elif eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF: argspec = _get_argspec(row_func) # signature was lost when wrapping it return arg_offsets, wrap_grouped_map_pandas_udf(func, return_type, argspec, runner_conf) elif eval_type == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF: return arg_offsets, wrap_grouped_agg_pandas_udf(func, return_type) elif eval_type == PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF: return arg_offsets, wrap_window_agg_pandas_udf(func, return_type, runner_conf, udf_index) elif eval_type == PythonEvalType.SQL_BATCHED_UDF: return arg_offsets, wrap_udf(func, return_type) else: raise ValueError("Unknown eval type: {}".format(eval_type))
def _create_pandas_udf(f, returnType, evalType): argspec = _get_argspec(f) # pandas UDF by type hints. if sys.version_info >= (3, 6): from inspect import signature if evalType in [ PythonEvalType.SQL_SCALAR_PANDAS_UDF, PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF, PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF ]: warnings.warn( "In Python 3.6+ and Spark 3.0+, it is preferred to specify type hints for " "pandas UDF instead of specifying pandas UDF type which will be deprecated " "in the future releases. See SPARK-28264 for more details.", UserWarning) elif evalType in [ PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, PythonEvalType.SQL_MAP_PANDAS_ITER_UDF, PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF ]: # In case of 'SQL_GROUPED_MAP_PANDAS_UDF', deprecation warning is being triggered # at `apply` instead. # In case of 'SQL_MAP_PANDAS_ITER_UDF' and 'SQL_COGROUPED_MAP_PANDAS_UDF', the # evaluation type will always be set. pass elif len(argspec.annotations) > 0: evalType = infer_eval_type(signature(f)) assert evalType is not None if evalType is None: # Set default is scalar UDF. evalType = PythonEvalType.SQL_SCALAR_PANDAS_UDF if (evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF or evalType == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF) and \ len(argspec.args) == 0 and \ argspec.varargs is None: raise ValueError( "Invalid function: 0-arg pandas_udfs are not supported. " "Instead, create a 1-arg pandas_udf and ignore the arg in your function." ) if evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF \ and len(argspec.args) not in (1, 2): raise ValueError( "Invalid function: pandas_udf with function type GROUPED_MAP or " "the function in groupby.applyInPandas " "must take either one argument (data) or two arguments (key, data)." ) if evalType == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF \ and len(argspec.args) not in (2, 3): raise ValueError( "Invalid function: the function in cogroup.applyInPandas " "must take either two arguments (left, right) " "or three arguments (key, left, right).") return _create_udf(f, returnType, evalType)
def _create_udf(f, returnType, evalType): if evalType in (PythonEvalType.SQL_SCALAR_PANDAS_UDF, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF): from pyspark.sql.utils import require_minimum_pyarrow_version require_minimum_pyarrow_version() argspec = _get_argspec(f) if evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF and len(argspec.args) == 0 and \ argspec.varargs is None: raise ValueError( "Invalid function: 0-arg pandas_udfs are not supported. " "Instead, create a 1-arg pandas_udf and ignore the arg in your function." ) if evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF \ and len(argspec.args) not in (1, 2): raise ValueError( "Invalid function: pandas_udfs with function type GROUPED_MAP " "must take either one argument (data) or two arguments (key, data)." ) # Set the name of the UserDefinedFunction object to be the name of function f udf_obj = UserDefinedFunction(f, returnType=returnType, name=None, evalType=evalType, deterministic=True) return udf_obj._wrapped()
def _create_udf(f, returnType, evalType): if evalType in (PythonEvalType.SQL_SCALAR_PANDAS_UDF, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF): from pyspark.sql.utils import require_minimum_pyarrow_version require_minimum_pyarrow_version() argspec = _get_argspec(f) if evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF and len(argspec.args) == 0 and \ argspec.varargs is None: raise ValueError( "Invalid function: 0-arg pandas_udfs are not supported. " "Instead, create a 1-arg pandas_udf and ignore the arg in your function." ) if evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF \ and len(argspec.args) not in (1, 2): raise ValueError( "Invalid function: pandas_udfs with function type GROUPED_MAP " "must take either one argument (data) or two arguments (key, data).") # Set the name of the UserDefinedFunction object to be the name of function f udf_obj = UserDefinedFunction( f, returnType=returnType, name=None, evalType=evalType, deterministic=True) return udf_obj._wrapped()
def wrapped(key_series, value_series): import pandas as pd argspec = _get_argspec(f) if len(argspec.args) == 1: result = f(pd.concat(value_series, axis=1)) elif len(argspec.args) == 2: key = tuple(s[0] for s in key_series) result = f(key, pd.concat(value_series, axis=1)) if not isinstance(result, pd.DataFrame): raise TypeError("Return type of the user-defined function should be " "pandas.DataFrame, but is {}".format(type(result))) if not len(result.columns) == len(return_type): raise RuntimeError( "Number of columns of the returned pandas.DataFrame " "doesn't match specified schema. " "Expected: {} Actual: {}".format(len(return_type), len(result.columns))) arrow_return_types = (to_arrow_type(field.dataType) for field in return_type) return [(result[result.columns[i]], arrow_type) for i, arrow_type in enumerate(arrow_return_types)]
def _create_judf(self): from pyspark.sql import SparkSession spark = SparkSession.builder.getOrCreate() sc = spark.sparkContext func = fail_on_stopiteration(self.func) # for pandas UDFs the worker needs to know if the function takes # one or two arguments, but the signature is lost when wrapping with # fail_on_stopiteration, so we store it here if self.evalType in (PythonEvalType.SQL_SCALAR_PANDAS_UDF, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF): func._argspec = _get_argspec(self.func) wrapped_func = _wrap_function(sc, func, self.returnType) jdt = spark._jsparkSession.parseDataType(self.returnType.json()) judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction( self._name, wrapped_func, jdt, self.evalType, self.deterministic) return judf
def _create_judf(self): from pyspark.sql import SparkSession spark = SparkSession.builder.getOrCreate() sc = spark.sparkContext func = fail_on_stopiteration(self.func) # for pandas UDFs the worker needs to know if the function takes # one or two arguments, but the signature is lost when wrapping with # fail_on_stopiteration, so we store it here if self.evalType in (PythonEvalType.SQL_SCALAR_PANDAS_UDF, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF): func._argspec = _get_argspec(self.func) wrapped_func = _wrap_function(sc, func, self.returnType) jdt = spark._jsparkSession.parseDataType(self.returnType.json()) judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction( self._name, wrapped_func, jdt, self.evalType, self.deterministic) return judf
def wrapped(key_series, value_series): import pandas as pd argspec = _get_argspec(f) if len(argspec.args) == 1: result = f(pd.concat(value_series, axis=1)) elif len(argspec.args) == 2: key = tuple(s[0] for s in key_series) result = f(key, pd.concat(value_series, axis=1)) if not isinstance(result, pd.DataFrame): raise TypeError( "Return type of the user-defined function should be " "pandas.DataFrame, but is {}".format(type(result))) if not len(result.columns) == len(return_type): raise RuntimeError( "Number of columns of the returned pandas.DataFrame " "doesn't match specified schema. " "Expected: {} Actual: {}".format(len(return_type), len(result.columns))) arrow_return_types = (to_arrow_type(field.dataType) for field in return_type) return [(result[result.columns[i]], arrow_type) for i, arrow_type in enumerate(arrow_return_types)]