Beispiel #1
0
    def _ensure_initialized(cls):
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway

        java_import(gw.jvm, "org.apache.spark.streaming.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

        # start callback server
        # getattr will fallback to JVM, so we cannot test by hasattr()
        if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
            gw.callback_server_parameters.eager_load = True
            gw.callback_server_parameters.daemonize = True
            gw.callback_server_parameters.daemonize_connections = True
            gw.callback_server_parameters.port = 0
            gw.start_callback_server(gw.callback_server_parameters)
            cbport = gw._callback_server.server_socket.getsockname()[1]
            gw._callback_server.port = cbport
            # gateway with real port
            gw._python_proxy_port = gw._callback_server.port
            # get the GatewayServer object in JVM by ID
            jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
            # update the port of CallbackClient with real port
            gw.jvm.PythonDStream.updatePythonGatewayPort(
                jgws, gw._python_proxy_port)
            _py4j_cleaner = Py4jCallbackConnectionCleaner(gw)
            _py4j_cleaner.start()

        # register serializer for TransformFunction
        # it happens before creating SparkContext when loading from checkpointing
        if cls._transformerSerializer is None:
            transformer_serializer = TransformFunctionSerializer()
            transformer_serializer.init(SparkContext._active_spark_context,
                                        CloudPickleSerializer(), gw)
            # SPARK-12511 streaming driver with checkpointing unable to finalize leading to OOM
            # There is an issue that Py4J's PythonProxyHandler.finalize blocks forever.
            # (https://github.com/bartdag/py4j/pull/184)
            #
            # Py4j will create a PythonProxyHandler in Java for "transformer_serializer" when
            # calling "registerSerializer". If we call "registerSerializer" twice, the second
            # PythonProxyHandler will override the first one, then the first one will be GCed and
            # trigger "PythonProxyHandler.finalize". To avoid that, we should not call
            # "registerSerializer" more than once, so that "PythonProxyHandler" in Java side won't
            # be GCed.
            #
            # TODO Once Py4J fixes this issue, we should upgrade Py4j to the latest version.
            transformer_serializer.gateway.jvm.PythonDStream.registerSerializer(
                transformer_serializer)
            cls._transformerSerializer = transformer_serializer
        else:
            cls._transformerSerializer.init(SparkContext._active_spark_context,
                                            CloudPickleSerializer(), gw)
Beispiel #2
0
    def _ensure_initialized(cls):
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway

        java_import(gw.jvm, "org.apache.spark.streaming.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

        # start callback server
        # getattr will fallback to JVM, so we cannot test by hasattr()
        if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
            gw.callback_server_parameters.eager_load = True
            gw.callback_server_parameters.daemonize = True
            gw.callback_server_parameters.daemonize_connections = True
            gw.callback_server_parameters.port = 0
            gw.start_callback_server(gw.callback_server_parameters)
            cbport = gw._callback_server.server_socket.getsockname()[1]
            gw._callback_server.port = cbport
            # gateway with real port
            gw._python_proxy_port = gw._callback_server.port
            # get the GatewayServer object in JVM by ID
            jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
            # update the port of CallbackClient with real port
            gw.jvm.PythonDStream.updatePythonGatewayPort(jgws, gw._python_proxy_port)
            _py4j_cleaner = Py4jCallbackConnectionCleaner(gw)
            _py4j_cleaner.start()

        # register serializer for TransformFunction
        # it happens before creating SparkContext when loading from checkpointing
        if cls._transformerSerializer is None:
            transformer_serializer = TransformFunctionSerializer()
            transformer_serializer.init(
                    SparkContext._active_spark_context, CloudPickleSerializer(), gw)
            # SPARK-12511 streaming driver with checkpointing unable to finalize leading to OOM
            # There is an issue that Py4J's PythonProxyHandler.finalize blocks forever.
            # (https://github.com/bartdag/py4j/pull/184)
            #
            # Py4j will create a PythonProxyHandler in Java for "transformer_serializer" when
            # calling "registerSerializer". If we call "registerSerializer" twice, the second
            # PythonProxyHandler will override the first one, then the first one will be GCed and
            # trigger "PythonProxyHandler.finalize". To avoid that, we should not call
            # "registerSerializer" more than once, so that "PythonProxyHandler" in Java side won't
            # be GCed.
            #
            # TODO Once Py4J fixes this issue, we should upgrade Py4j to the latest version.
            transformer_serializer.gateway.jvm.PythonDStream.registerSerializer(
                    transformer_serializer)
            cls._transformerSerializer = transformer_serializer
        else:
            cls._transformerSerializer.init(
                    SparkContext._active_spark_context, CloudPickleSerializer(), gw)
Beispiel #3
0
    def _ensure_initialized(cls):
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway

        java_import(gw.jvm, "org.apache.spark.streaming.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

        # start callback server
        # getattr will fallback to JVM, so we cannot test by hasattr()
        if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
            gw.callback_server_parameters.eager_load = True
            gw.callback_server_parameters.daemonize = True
            gw.callback_server_parameters.daemonize_connections = True
            gw.callback_server_parameters.port = 0
            gw.start_callback_server(gw.callback_server_parameters)
            cbport = gw._callback_server.server_socket.getsockname()[1]
            gw._callback_server.port = cbport
            # gateway with real port
            gw._python_proxy_port = gw._callback_server.port
            # get the GatewayServer object in JVM by ID
            jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
            # update the port of CallbackClient with real port
            gw.jvm.PythonDStream.updatePythonGatewayPort(
                jgws, gw._python_proxy_port)

        # register serializer for TransformFunction
        # it happens before creating SparkContext when loading from checkpointing
        cls._transformerSerializer = TransformFunctionSerializer(
            SparkContext._active_spark_context, CloudPickleSerializer(), gw)
Beispiel #4
0
    def _ensure_initialized(cls):
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway

        java_import(gw.jvm, "org.apache.spark.streaming.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

        from pyspark.java_gateway import ensure_callback_server_started
        ensure_callback_server_started(gw)

        # register serializer for TransformFunction
        # it happens before creating SparkContext when loading from checkpointing
        cls._transformerSerializer = TransformFunctionSerializer(
            SparkContext._active_spark_context, CloudPickleSerializer(), gw)