def _ensure_initialized(cls):
    """Initialize the Py4J gateway pieces that streaming depends on.

    Ensures the SparkContext-level gateway exists, imports the streaming
    Java packages into the JVM view, starts the Py4J callback server
    exactly once (propagating its real ephemeral port to the JVM side),
    and registers the TransformFunction serializer at most once.

    NOTE(review): intended to be idempotent — both guarded sections
    no-op on repeated calls; call order within each guard is significant.
    """
    SparkContext._ensure_initialized()
    gw = SparkContext._gateway

    # Make the streaming Java API classes resolvable via gw.jvm.
    java_import(gw.jvm, "org.apache.spark.streaming.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

    # start callback server
    # getattr will fallback to JVM, so we cannot test by hasattr();
    # check the instance __dict__ directly instead.
    if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
        gw.callback_server_parameters.eager_load = True
        gw.callback_server_parameters.daemonize = True
        gw.callback_server_parameters.daemonize_connections = True
        # Port 0 asks the OS for a free ephemeral port.
        gw.callback_server_parameters.port = 0
        gw.start_callback_server(gw.callback_server_parameters)
        # Read back the port actually bound by the OS.
        cbport = gw._callback_server.server_socket.getsockname()[1]
        gw._callback_server.port = cbport
        # gateway with real port
        gw._python_proxy_port = gw._callback_server.port
        # get the GatewayServer object in JVM by ID
        jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
        # update the port of CallbackClient with real port
        gw.jvm.PythonDStream.updatePythonGatewayPort(
            jgws, gw._python_proxy_port)
        # Background cleaner for leaked callback connections.
        _py4j_cleaner = Py4jCallbackConnectionCleaner(gw)
        _py4j_cleaner.start()

    # register serializer for TransformFunction
    # it happens before creating SparkContext when loading from checkpointing
    if cls._transformerSerializer is None:
        transformer_serializer = TransformFunctionSerializer()
        transformer_serializer.init(SparkContext._active_spark_context,
                                    CloudPickleSerializer(), gw)
        # SPARK-12511 streaming driver with checkpointing unable to finalize leading to OOM
        # There is an issue that Py4J's PythonProxyHandler.finalize blocks forever.
        # (https://github.com/bartdag/py4j/pull/184)
        #
        # Py4j will create a PythonProxyHandler in Java for "transformer_serializer" when
        # calling "registerSerializer". If we call "registerSerializer" twice, the second
        # PythonProxyHandler will override the first one, then the first one will be GCed and
        # trigger "PythonProxyHandler.finalize". To avoid that, we should not call
        # "registerSerializer" more than once, so that "PythonProxyHandler" in Java side won't
        # be GCed.
        #
        # TODO Once Py4J fixes this issue, we should upgrade Py4j to the latest version.
        transformer_serializer.gateway.jvm.PythonDStream.registerSerializer(
            transformer_serializer)
        cls._transformerSerializer = transformer_serializer
    else:
        # Already registered on the JVM side; just refresh the Python-side
        # references (context / serializer / gateway) without re-registering.
        cls._transformerSerializer.init(SparkContext._active_spark_context,
                                        CloudPickleSerializer(), gw)
def _ensure_initialized(cls):
    """Initialize gateway-side prerequisites for streaming.

    Ensures the SparkContext gateway exists, imports the streaming Java
    packages, starts the Py4J callback server once (pushing the real
    bound port to the JVM's CallbackClient), and registers the
    TransformFunction serializer at most once (see SPARK-12511 note).
    """
    SparkContext._ensure_initialized()
    gw = SparkContext._gateway

    # Expose the streaming Java API classes through gw.jvm.
    java_import(gw.jvm, "org.apache.spark.streaming.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

    # start callback server
    # getattr will fallback to JVM, so we cannot test by hasattr();
    # inspect the instance __dict__ directly instead.
    if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
        gw.callback_server_parameters.eager_load = True
        gw.callback_server_parameters.daemonize = True
        gw.callback_server_parameters.daemonize_connections = True
        # Port 0 lets the OS choose a free ephemeral port.
        gw.callback_server_parameters.port = 0
        gw.start_callback_server(gw.callback_server_parameters)
        # Recover the port the OS actually assigned.
        cbport = gw._callback_server.server_socket.getsockname()[1]
        gw._callback_server.port = cbport
        # gateway with real port
        gw._python_proxy_port = gw._callback_server.port
        # get the GatewayServer object in JVM by ID
        jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
        # update the port of CallbackClient with real port
        gw.jvm.PythonDStream.updatePythonGatewayPort(jgws, gw._python_proxy_port)
        # Background cleaner for leaked callback connections.
        _py4j_cleaner = Py4jCallbackConnectionCleaner(gw)
        _py4j_cleaner.start()

    # register serializer for TransformFunction
    # it happens before creating SparkContext when loading from checkpointing
    if cls._transformerSerializer is None:
        transformer_serializer = TransformFunctionSerializer()
        transformer_serializer.init(
            SparkContext._active_spark_context, CloudPickleSerializer(), gw)
        # SPARK-12511 streaming driver with checkpointing unable to finalize leading to OOM
        # There is an issue that Py4J's PythonProxyHandler.finalize blocks forever.
        # (https://github.com/bartdag/py4j/pull/184)
        #
        # Py4j will create a PythonProxyHandler in Java for "transformer_serializer" when
        # calling "registerSerializer". If we call "registerSerializer" twice, the second
        # PythonProxyHandler will override the first one, then the first one will be GCed and
        # trigger "PythonProxyHandler.finalize". To avoid that, we should not call
        # "registerSerializer" more than once, so that "PythonProxyHandler" in Java side won't
        # be GCed.
        #
        # TODO Once Py4J fixes this issue, we should upgrade Py4j to the latest version.
        transformer_serializer.gateway.jvm.PythonDStream.registerSerializer(
            transformer_serializer)
        cls._transformerSerializer = transformer_serializer
    else:
        # Serializer already registered with the JVM; only refresh its
        # Python-side state to avoid a second registerSerializer call.
        cls._transformerSerializer.init(
            SparkContext._active_spark_context, CloudPickleSerializer(), gw)
def _ensure_initialized(cls):
    """Prepare the Py4J gateway for streaming use.

    Guarantees a live SparkContext gateway, imports the streaming Java
    packages into the JVM view, lazily starts the Py4J callback server
    on an OS-assigned port (propagating that port to the JVM-side
    CallbackClient), and installs the TransformFunction serializer.
    """
    SparkContext._ensure_initialized()
    gateway = SparkContext._gateway

    # Make the streaming Java API classes visible through gateway.jvm.
    for package in ("org.apache.spark.streaming.*",
                    "org.apache.spark.streaming.api.java.*",
                    "org.apache.spark.streaming.api.python.*"):
        java_import(gateway.jvm, package)

    # Start the callback server exactly once.  getattr would fall back to
    # the JVM, so probe the instance __dict__ rather than using hasattr().
    if gateway.__dict__.get("_callback_server") is None:
        params = gateway.callback_server_parameters
        params.eager_load = True
        params.daemonize = True
        params.daemonize_connections = True
        params.port = 0  # let the OS pick a free ephemeral port
        gateway.start_callback_server(params)
        # Discover the port actually bound and record it on the gateway.
        bound_port = gateway._callback_server.server_socket.getsockname()[1]
        gateway._callback_server.port = bound_port
        gateway._python_proxy_port = gateway._callback_server.port
        # Look up the JVM GatewayServer by its well-known object id and
        # tell its CallbackClient about the real port.
        jvm_gateway_server = JavaObject("GATEWAY_SERVER", gateway._gateway_client)
        gateway.jvm.PythonDStream.updatePythonGatewayPort(
            jvm_gateway_server, gateway._python_proxy_port)

    # Register the TransformFunction serializer; this runs before a
    # SparkContext is created when restoring from a checkpoint.
    cls._transformerSerializer = TransformFunctionSerializer(
        SparkContext._active_spark_context, CloudPickleSerializer(), gateway)
def _ensure_initialized(cls):
    """Prepare the gateway-side state required by streaming.

    Ensures the SparkContext gateway is up, imports the streaming Java
    packages into the JVM view, starts the shared Py4J callback server
    if needed, and installs the TransformFunction serializer.
    """
    SparkContext._ensure_initialized()
    gateway = SparkContext._gateway

    # Expose the streaming Java API classes through gateway.jvm.
    for package in ("org.apache.spark.streaming.*",
                    "org.apache.spark.streaming.api.java.*",
                    "org.apache.spark.streaming.api.python.*"):
        java_import(gateway.jvm, package)

    from pyspark.java_gateway import ensure_callback_server_started
    ensure_callback_server_started(gateway)

    # Register the TransformFunction serializer; this happens before a
    # SparkContext is created when restoring from a checkpoint.
    cls._transformerSerializer = TransformFunctionSerializer(
        SparkContext._active_spark_context, CloudPickleSerializer(), gateway)