Example #1
    def registerFunction(self, name, f, returnType=StringType()):
        """Registers a lambda function as a UDF so it can be used in SQL statements.

        In addition to a name and the function itself, the return type can be optionally specified.
        When the return type is not given, it defaults to a string and conversion will automatically
        be done.  For any other return type, the produced object must match the specified type.

        >>> sqlCtx.registerFunction("stringLengthString", lambda x: len(x))
        >>> sqlCtx.sql("SELECT stringLengthString('test')").collect()
        [Row(c0=u'4')]

        >>> from pyspark.sql.types import IntegerType
        >>> sqlCtx.registerFunction("stringLengthInt", lambda x: len(x), IntegerType())
        >>> sqlCtx.sql("SELECT stringLengthInt('test')").collect()
        [Row(c0=4)]
        """
        func = lambda _, it: imap(lambda x: f(*x), it)
        ser = AutoBatchedSerializer(PickleSerializer())
        command = (func, None, ser, ser)
        pickled_cmd, bvars, env, includes = _prepare_for_python_RDD(self._sc, command, self)
        self._ssql_ctx.udf().registerPython(name,
                                            bytearray(pickled_cmd),
                                            env,
                                            includes,
                                            self._sc.pythonExec,
                                            bvars,
                                            self._sc._javaAccumulator,
                                            returnType.json())
Example #2
def _wrap_function(sc, func, returnType):
    command = (func, returnType)
    pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(
        sc, command)
    return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes,
                                  sc.pythonExec, sc.pythonVer, broadcast_vars,
                                  sc._javaAccumulator)
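
For context, a minimal sketch of how a wrapper like this is typically consumed: the returned PythonFunction is paired with the parsed return type to build the Java-side UDF object. The class path, constructor arguments, and attribute names below (self._name, _jsparkSession, _create_judf_sketch itself) are assumptions modeled on Spark 2.x-era source, not a verbatim API:

def _create_judf_sketch(self):
    # Hypothetical caller of _wrap_function; the constructor argument
    # list of UserDefinedPythonFunction varies across Spark versions.
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext
    wrapped_func = _wrap_function(sc, self.func, self.returnType)
    jdt = spark._jsparkSession.parseDataType(self.returnType.json())
    return sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction(
        self._name, wrapped_func, jdt)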
Example #3
    def registerFunction(self, name, f, returnType=StringType()):
        """Registers a lambda function as a UDF so it can be used in SQL statements.

        In addition to a name and the function itself, the return type can be optionally specified.
        When the return type is not given, it defaults to a string and conversion will automatically
        be done.  For any other return type, the produced object must match the specified type.

        :param name: name of the UDF
        :param f: a Python function (typically a lambda)
        :param returnType: a :class:`DataType` object

        >>> sqlContext.registerFunction("stringLengthString", lambda x: len(x))
        >>> sqlContext.sql("SELECT stringLengthString('test')").collect()
        [Row(c0=u'4')]

        >>> from pyspark.sql.types import IntegerType
        >>> sqlContext.registerFunction("stringLengthInt", lambda x: len(x), IntegerType())
        >>> sqlContext.sql("SELECT stringLengthInt('test')").collect()
        [Row(c0=4)]

        >>> from pyspark.sql.types import IntegerType
        >>> sqlContext.udf.register("stringLengthInt", lambda x: len(x), IntegerType())
        >>> sqlContext.sql("SELECT stringLengthInt('test')").collect()
        [Row(c0=4)]
        """
        func = lambda _, it: map(lambda x: f(*x), it)
        ser = AutoBatchedSerializer(PickleSerializer())
        command = (func, None, ser, ser)
        pickled_cmd, bvars, env, includes = _prepare_for_python_RDD(
            self._sc, command, self)
        self._ssql_ctx.udf().registerPython(name, bytearray(pickled_cmd), env,
                                            includes, self._sc.pythonExec,
                                            self._sc.pythonVer, bvars,
                                            self._sc._javaAccumulator,
                                            returnType.json())
Example #4
    def _create_judf(self):
        f = self.func  # put it in closure `func`
        func = lambda _, it: imap(lambda x: f(*x), it)
        ser = AutoBatchedSerializer(PickleSerializer())
        command = (func, None, ser, ser)
        sc = SparkContext._active_spark_context
        pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command, self)
        ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc())
        jdt = ssql_ctx.parseDataType(self.returnType.json())
        judf = sc._jvm.UserDefinedPythonFunction(f.__name__, bytearray(pickled_command), env,
                                                 includes, sc.pythonExec, broadcast_vars,
                                                 sc._javaAccumulator, jdt)
        return judf
Example #5
def _wrap_function(sc: SparkContext, func: Callable[..., Any],
                   returnType: "DataTypeOrString") -> JavaObject:
    command = (func, returnType)
    pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(
        sc, command)
    return sc._jvm.PythonFunction(  # type: ignore[attr-defined]
        bytearray(pickled_command),
        env,
        includes,
        sc.pythonExec,  # type: ignore[attr-defined]
        sc.pythonVer,  # type: ignore[attr-defined]
        broadcast_vars,
        sc._javaAccumulator  # type: ignore[attr-defined]
    )
Example #6
    def _create_judf(self):
        f = self.func  # put it in closure `func`
        func = lambda _, it: map(lambda x: f(*x), it)
        ser = AutoBatchedSerializer(PickleSerializer())
        command = (func, None, ser, ser)
        sc = SparkContext._active_spark_context
        pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command, self)
        ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc())
        jdt = ssql_ctx.parseDataType(self.returnType.json())
        fname = f.__name__ if hasattr(f, '__name__') else f.__class__.__name__
        judf = sc._jvm.UserDefinedPythonFunction(fname, bytearray(pickled_command), env, includes,
                                                 sc.pythonExec, sc.pythonVer, broadcast_vars,
                                                 sc._javaAccumulator, jdt)
        return judf
Example #7
    def _create_judf(self, name):
        f, returnType = self.func, self.returnType  # put them in closure `func`
        func = lambda _, it: map(lambda x: returnType.toInternal(f(*x)), it)
        ser = AutoBatchedSerializer(PickleSerializer())
        command = (func, None, ser, ser)
        sc = SparkContext._active_spark_context
        pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command, self)
        ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc())
        jdt = ssql_ctx.parseDataType(self.returnType.json())
        if name is None:
            name = f.__name__ if hasattr(f, '__name__') else f.__class__.__name__
        judf = sc._jvm.UserDefinedPythonFunction(name, bytearray(pickled_command), env, includes,
                                                 sc.pythonExec, sc.pythonVer, broadcast_vars,
                                                 sc._javaAccumulator, jdt)
        return judf
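
Once built, the judf is applied to Java column expressions when the UDF object is called. A minimal sketch of that call path, assuming the private helpers _to_seq and _to_java_column from pyspark.sql.column and a self._judf attribute holding the object returned above:

def __call__(self, *cols):
    # Hypothetical __call__ on the UDF wrapper: convert each Python column
    # argument to a Java Column, apply the Java UDF, and wrap the result.
    from pyspark import SparkContext
    from pyspark.sql.column import Column, _to_seq, _to_java_column
    sc = SparkContext._active_spark_context
    jc = self._judf.apply(_to_seq(sc, cols, _to_java_column))
    return Column(jc)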
Example #8
def _wrap_function(sc: SparkContext, func: Callable[..., Any],
                   returnType: "DataTypeOrString") -> JavaObject:
    command = (func, returnType)
    pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(
        sc, command)
    assert sc._jvm is not None
    return sc._jvm.SimplePythonFunction(
        bytearray(pickled_command),
        env,
        includes,
        sc.pythonExec,
        sc.pythonVer,
        broadcast_vars,
        sc._javaAccumulator,
    )
Example #9
def _wrap_function(self, sc, func, returnType):
    if sc.profiler_collector:
        # Profiling is on: attach a fresh profiler to this UDF and record
        # which accumulator id and UDF name it belongs to (custom
        # bookkeeping via _udf_dic; not part of stock PySpark).
        profiler = sc.profiler_collector.new_profiler(sc)
        from pyspark.accumulators import _udf_dic
        # The first UDF profiler gets key 99999; later ones increment
        # from the current maximum key.
        if len(_udf_dic.keys()) == 0:
            key = 99999
        else:
            key = max(_udf_dic.keys()) + 1
        sc.profiler_collector.add_profiler(key, profiler)
        _udf_dic[key] = (SparkContext._next_accum_id - 1, self._name)
    else:
        profiler = None
    command = (func, returnType, profiler)
    pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(
        sc, command)
    return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes,
                                  sc.pythonExec, sc.pythonVer, broadcast_vars,
                                  sc._javaAccumulator)
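
The profiler branch above only runs when a profiler collector exists; in stock PySpark that is enabled with the spark.python.profile setting. A minimal, self-contained sketch of turning profiling on and dumping the collected stats (the _udf_dic bookkeeping is specific to this modified build and is omitted):

from pyspark import SparkConf, SparkContext

# With spark.python.profile=true, sc.profiler_collector is non-None.
conf = SparkConf().set("spark.python.profile", "true")
sc = SparkContext(master="local[1]", appName="profile-demo", conf=conf)
sc.parallelize(range(1000)).map(lambda x: x * x).count()
sc.show_profiles()  # print the accumulated cProfile stats
sc.stop()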
Example #10
def _wrap_function(sc, func, returnType):
    command = (func, returnType)
    pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
    return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes, sc.pythonExec,
                                  sc.pythonVer, broadcast_vars, sc._javaAccumulator)
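
All of these wrappers sit behind the public registration API. A minimal end-to-end usage sketch mirroring the doctests above (the column name in the returned Row varies by Spark version):

from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.master("local[1]").appName("udf-demo").getOrCreate()
# register() routes through a wrapper like _wrap_function under the hood.
spark.udf.register("stringLengthInt", lambda s: len(s), IntegerType())
print(spark.sql("SELECT stringLengthInt('test')").collect())  # [Row(...=4)]
spark.stop()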