Ejemplo n.º 1
0
 def __get_random_port_for_callback_server(self) -> None:
     """Start the callback server on port 0 and tell the JVM its real port.

     The callback server is started as a daemon with ``port=0``; the port
     it actually listens on is then read back and pushed to the JVM-side
     gateway server by resetting its callback client.
     """
     gateway = self._gateway
     server_params = CallbackServerParameters(
         port=0, daemonize=True, daemonize_connections=True)
     gateway.start_callback_server(server_params)

     # Workaround described in https://github.com/bartdag/py4j/issues/147:
     # the JVM-side callback client has to be re-pointed at the port the
     # Python callback server is really listening on.
     java_gateway_server = JavaObject("GATEWAY_SERVER", gateway._gateway_client)
     callback_address = java_gateway_server.getCallbackClient().getAddress()
     listening_port = gateway.get_callback_server().get_listening_port()
     java_gateway_server.resetCallbackClient(callback_address, listening_port)
Ejemplo n.º 2
0
    def _ensure_initialized(cls):
        """Prepare the shared Spark gateway for streaming use.

        Imports the Spark streaming packages into the gateway's JVM view,
        starts the py4j callback server when the gateway does not have one
        yet, and installs a new ``TransformFunctionSerializer`` on the class.
        """
        SparkContext._ensure_initialized()
        gateway = SparkContext._gateway

        for package in ("org.apache.spark.streaming.*",
                        "org.apache.spark.streaming.api.java.*",
                        "org.apache.spark.streaming.api.python.*"):
            java_import(gateway.jvm, package)

        # Look in the instance dict directly: plain attribute access falls
        # back to the JVM, so hasattr() cannot tell us whether the callback
        # server exists.
        if gateway.__dict__.get("_callback_server") is None:
            params = gateway.callback_server_parameters
            params.eager_load = True
            params.daemonize = True
            params.daemonize_connections = True
            params.port = 0
            gateway.start_callback_server(params)

            # Read the port that was really bound and record it on both the
            # callback server and the gateway.
            server = gateway._callback_server
            server.port = server.server_socket.getsockname()[1]
            gateway._python_proxy_port = server.port

            # Fetch the JVM GatewayServer by its well-known ID and give its
            # CallbackClient the real port.
            gateway_server = JavaObject("GATEWAY_SERVER", gateway._gateway_client)
            gateway.jvm.PythonDStream.updatePythonGatewayPort(
                gateway_server, gateway._python_proxy_port)

        # Register the serializer for TransformFunction. When loading from a
        # checkpoint this happens before the SparkContext is created.
        cls._transformerSerializer = TransformFunctionSerializer(
            SparkContext._active_spark_context, CloudPickleSerializer(), gateway)
def convert_reference_type(target_id, gateway_client: GatewayClient,
                           java_gateway: JavaGateway):
    """Wrap a JVM reference, converting ``java.time.LocalDate`` to ``date``.

    Returns a Python ``date`` built from the Java object's year, month value
    and day of month when the reference is a ``java.time.LocalDate``;
    otherwise returns the ``JavaObject`` wrapper unchanged.
    """
    candidate = JavaObject(target_id, gateway_client)
    if not is_instance_of(java_gateway, candidate, "java.time.LocalDate"):
        return candidate

    year = candidate.getYear()
    month = candidate.getMonthValue()
    day = candidate.getDayOfMonth()
    return date(year, month, day)
Ejemplo n.º 4
0
 def testJavaErrorGC(self):
     """Using the Java error's id after ``doError`` must raise Py4JError.

     ``doError`` returns the id of the Java-side error object; a fresh
     ``JavaObject`` is built from that id and a call on it is expected to
     fail.
     """
     error_id = self.doError()
     java_object = JavaObject(error_id, self.gateway._gateway_client)
     # Should fail because it should have been garbage collected...
     with self.assertRaises(Py4JError):
         java_object.getCause()
Ejemplo n.º 5
0
    def _ensure_initialized(cls):
        """Prepare the shared Spark gateway for streaming use.

        Steps, in order:

        1. Make sure ``SparkContext`` is initialized and import the Spark
           streaming packages into the gateway's JVM view.
        2. If the gateway has no callback server yet, start one on port 0,
           record the port that was really bound, pass that port to the JVM
           through ``PythonDStream.updatePythonGatewayPort`` and start a
           ``Py4jCallbackConnectionCleaner`` for the gateway.
        3. Create and register the ``TransformFunctionSerializer`` the first
           time only; on later calls re-initialize the existing instance
           instead of registering a new one.
        """
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway

        java_import(gw.jvm, "org.apache.spark.streaming.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

        # start callback server
        # getattr will fallback to JVM, so we cannot test by hasattr()
        if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
            gw.callback_server_parameters.eager_load = True
            gw.callback_server_parameters.daemonize = True
            gw.callback_server_parameters.daemonize_connections = True
            # port 0: the real port is read back from the bound socket below
            gw.callback_server_parameters.port = 0
            gw.start_callback_server(gw.callback_server_parameters)
            cbport = gw._callback_server.server_socket.getsockname()[1]
            gw._callback_server.port = cbport
            # gateway with real port
            gw._python_proxy_port = gw._callback_server.port
            # get the GatewayServer object in JVM by ID
            jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
            # update the port of CallbackClient with real port
            gw.jvm.PythonDStream.updatePythonGatewayPort(
                jgws, gw._python_proxy_port)
            # NOTE(review): presumably cleans up stale callback connections
            # in the background -- confirm against Py4jCallbackConnectionCleaner.
            _py4j_cleaner = Py4jCallbackConnectionCleaner(gw)
            _py4j_cleaner.start()

        # register serializer for TransformFunction
        # it happens before creating SparkContext when loading from checkpointing
        if cls._transformerSerializer is None:
            transformer_serializer = TransformFunctionSerializer()
            transformer_serializer.init(SparkContext._active_spark_context,
                                        CloudPickleSerializer(), gw)
            # SPARK-12511 streaming driver with checkpointing unable to finalize leading to OOM
            # There is an issue that Py4J's PythonProxyHandler.finalize blocks forever.
            # (https://github.com/bartdag/py4j/pull/184)
            #
            # Py4j will create a PythonProxyHandler in Java for "transformer_serializer" when
            # calling "registerSerializer". If we call "registerSerializer" twice, the second
            # PythonProxyHandler will override the first one, then the first one will be GCed and
            # trigger "PythonProxyHandler.finalize". To avoid that, we should not call
            # "registerSerializer" more than once, so that "PythonProxyHandler" in Java side won't
            # be GCed.
            #
            # TODO Once Py4J fixes this issue, we should upgrade Py4j to the latest version.
            transformer_serializer.gateway.jvm.PythonDStream.registerSerializer(
                transformer_serializer)
            cls._transformerSerializer = transformer_serializer
        else:
            # Already registered: only refresh the existing serializer's state.
            cls._transformerSerializer.init(SparkContext._active_spark_context,
                                            CloudPickleSerializer(), gw)
Ejemplo n.º 6
0
    def init(self, arglist, _sc=None, _sqlContext=None):
        """Wire this object to Spark, the SMV JVM client and the callback server.

        Args:
            arglist: arguments handed to ``create_smv_pyclient``.
            _sc: existing SparkContext; a new one is created when ``None``.
            _sqlContext: existing SQL context; a ``HiveContext`` over the
                Spark context is created when ``None``.

        Returns:
            ``self``, so that calls can be chained.
        """
        if _sc is None:
            sc = SparkContext()
        else:
            sc = _sc

        if _sqlContext is None:
            sqlContext = HiveContext(sc)
        else:
            sqlContext = _sqlContext

        sc.setLogLevel("ERROR")

        self.sqlContext = sqlContext
        self.sc = sc
        self._jvm = sc._jvm

        from py4j.java_gateway import java_import
        for jvm_name in ("org.tresamigos.smv.ColumnHelper",
                         "org.tresamigos.smv.SmvDFHelper",
                         "org.tresamigos.smv.dqm.*",
                         "org.tresamigos.smv.python.SmvPythonHelper"):
            java_import(self._jvm, jvm_name)

        self.j_smvPyClient = self.create_smv_pyclient(arglist)

        # Shortcut kept for internal use only.
        self.j_smvApp = self.j_smvPyClient.j_smvApp()

        # Issue #429: take the application name from the SMV config.
        sc._conf.setAppName(self.appName())

        # The user may have configured a callback server port; otherwise use
        # the one the gateway already knows.
        gateway = sc._gateway
        configured_port = self.j_smvPyClient.callbackServerPort()
        if configured_port.isDefined():
            cbs_port = configured_port.get()
        else:
            cbs_port = gateway._python_proxy_port

        # Originally a workaround for py4j 0.8.2.1 (shipped with Spark 1.5.x)
        # so the callback server cannot hang the Python process, and with it
        # the Java process.
        from pyspark.streaming.context import _daemonize_callback_server
        _daemonize_callback_server()

        # Check the instance dict directly rather than relying on hasattr().
        if gateway.__dict__.get("_callback_server") is None:
            print("starting callback server on port {0}".format(cbs_port))
            # In case another one has already been started.
            gateway._shutdown_callback_server()
            gateway._start_callback_server(cbs_port)
            gateway._python_proxy_port = gateway._callback_server.port
            # Fetch the JVM GatewayServer by ID and hand its CallbackClient
            # the real port.
            gateway_server = JavaObject("GATEWAY_SERVER", gateway._gateway_client)
            gateway.jvm.SmvPythonHelper.updatePythonGatewayPort(
                gateway_server, gateway._python_proxy_port)

        self.repo = PythonDataSetRepository(self)
        self.j_smvPyClient.register('Python', self.repo)
        return self
Ejemplo n.º 7
0
Archivo: context.py Proyecto: kwop/cdap
    def __ensureGatewayInit(cls, gatewayPort, gatewaySecret, driver):
        """Create the py4j gateway once and cache it on the class.

        Does nothing when ``cls._gateway`` is already set. Otherwise, while
        holding ``cls._lock``, it connects a ``JavaGateway``, imports the CDAP
        Spark runtime packages, reports the callback port to the JVM when
        running as the driver, and stores the gateway, its JVM view and the
        runtime context on the class.

        Args:
            gatewayPort: port of the JVM gateway server to connect to.
            gatewaySecret: passed as ``auth_token`` to ``GatewayParameters``;
                not used by the legacy fallback connection.
            driver: when truthy, a callback server is configured and its
                port is pushed to the JVM.
        """
        with cls._lock:
            if not cls._gateway:
                # Spark 1.6 and Spark 2 are using later versions of py4j (0.9 and 0.10+ respectively),
                # which has better control on gateway client and callback server using
                # GatewayParameters and CallbackServerParameters. Try to use those,
                # as it'll be less hacky (it's still a bit hacky for Spark 1.6, see below)
                try:
                    from py4j.java_gateway import GatewayParameters, CallbackServerParameters
                    callbackServerParams = CallbackServerParameters(
                        port=0, daemonize=True,
                        daemonize_connections=True) if driver else None
                    gateway = JavaGateway(
                        gateway_parameters=GatewayParameters(
                            port=gatewayPort,
                            auto_convert=True,
                            auth_token=gatewaySecret),
                        callback_server_parameters=callbackServerParams)
                except Exception:
                    # Deliberate best-effort fallback to the legacy API for
                    # any failure of the modern path. Exception (rather than
                    # a bare except) lets KeyboardInterrupt and SystemExit
                    # propagate instead of being swallowed.
                    from py4j.java_gateway import GatewayClient
                    gateway = JavaGateway(
                        gateway_client=GatewayClient(port=gatewayPort),
                        auto_convert=True)
                    # NOTE(review): presumably the callback server is then
                    # started on demand elsewhere -- confirm against callers.
                    cls._onDemandCallback = True

                java_import(gateway.jvm, "io.cdap.cdap.app.runtime.spark.*")
                java_import(gateway.jvm,
                            "io.cdap.cdap.app.runtime.spark.python.*")

                if driver and not cls._onDemandCallback:
                    # For py4j 0.10+ (used by Spark 2.0), use the official API to set callback port on the gateway server
                    if "get_callback_server" in dir(gateway):
                        callbackPort = gateway.get_callback_server(
                        ).get_listening_port()
                        gateway.jvm.SparkPythonUtil.setGatewayCallbackPort(
                            gateway.java_gateway_server, callbackPort)
                    else:
                        # For py4j 0.9 (used by Spark 1.6), it doesn't have way to set the dynamic port of the callback server,
                        # hence we need the hack to call SparkPythonUtil to set it
                        callbackPort = gateway._callback_server.server_socket.getsockname(
                        )[1]
                        gateway.jvm.SparkPythonUtil.setGatewayCallbackPort(
                            JavaObject("GATEWAY_SERVER",
                                       gateway._gateway_client), callbackPort)
                cls._gateway = gateway
                cls._jvm = gateway.jvm
                cls._runtimeContext = cls._jvm.SparkRuntimeContextProvider.get(
                )
                # Single-argument print call: valid on Python 2 and 3, and it
                # emits the same text (including the double space) as the
                # former Python 2 print statement.
                print("Java gateway initialized with gateway port  {0}".format(gatewayPort))
Ejemplo n.º 8
0
def convert(target_id, gateway_client):
    """Wrap WarpScript stacks and GTS references in their Python classes.

    The reference is wrapped in a ``JavaObject`` and classified by the
    members it exposes: ``execMulti`` marks a WarpScript stack (returned as
    ``Stack``), ``hasElevations`` marks a GTS (returned as ``Gts``). Anything
    else is returned as the plain ``JavaObject``.
    """
    wrapped = JavaObject(target_id, gateway_client)

    # Checked in order: stack first, then GTS.
    for marker_method, wrapper in (('execMulti', Stack),
                                   ('hasElevations', Gts)):
        if marker_method in dir(wrapped):
            return wrapper(wrapped)

    return wrapped
Ejemplo n.º 9
0
def _ensure_callback_gateway_initialized(gw):
    """Start and configure the Python callback server of ``gw`` if needed.

    Nothing is done when the gateway already holds a callback server.
    Source: ``pyspark/streaming/context.py`` in ``StreamingContext._ensure_initialized``
    """
    # Consult the instance dict directly: attribute access falls back to the
    # JVM, so hasattr() is not a usable test here.
    if gw.__dict__.get("_callback_server") is not None:
        return

    server_params = gw.callback_server_parameters
    server_params.eager_load = True
    server_params.daemonize = True
    server_params.daemonize_connections = True
    server_params.port = 0
    gw.start_callback_server(server_params)

    # Record the port that was really bound on the server and the gateway.
    callback_server = gw._callback_server
    callback_server.port = callback_server.server_socket.getsockname()[1]
    gw._python_proxy_port = callback_server.port

    # Look up the JVM GatewayServer by ID and point its CallbackClient at
    # the real port.
    java_gateway_server = JavaObject("GATEWAY_SERVER", gw._gateway_client)
    callback_address = java_gateway_server.getCallbackClient().getAddress()
    java_gateway_server.resetCallbackClient(callback_address, gw._python_proxy_port)
Ejemplo n.º 10
0
def ensure_callback_server_started(gw):
    """
    Make sure the py4j callback server of ``gw`` is running.

    The callback server is what lets the Java driver process call back into
    the Python driver process to run Python code. If the gateway already has
    one, this function returns without touching anything.
    """
    # hasattr() is unusable because attribute access falls back to the JVM,
    # so the instance dict is inspected directly.
    if gw.__dict__.get("_callback_server") is not None:
        return

    params = gw.callback_server_parameters
    params.eager_load = True
    params.daemonize = True
    params.daemonize_connections = True
    params.port = 0
    gw.start_callback_server(params)

    # Store the port that was really bound on the server and the gateway.
    server = gw._callback_server
    server.port = server.server_socket.getsockname()[1]
    gw._python_proxy_port = server.port

    # Fetch the JVM GatewayServer by ID and give its CallbackClient the
    # real port.
    jvm_gateway_server = JavaObject("GATEWAY_SERVER", gw._gateway_client)
    client_address = jvm_gateway_server.getCallbackClient().getAddress()
    jvm_gateway_server.resetCallbackClient(client_address, gw._python_proxy_port)
Ejemplo n.º 11
0
    def __init__(self, arglist, _sc = None, _sqlContext = None):
        """Set up Spark, the SMV JVM client and the py4j callback server.

        Args:
            arglist: arguments handed to ``create_smv_pyclient``.
            _sc: existing SparkContext; a new one is created when ``None``.
            _sqlContext: existing SQL context; a ``HiveContext`` over the
                Spark context is created when ``None``.

        Raises:
            SmvRuntimeError: if ``check_socket`` rejects the starting port
                and every following port that was probed.
        """
        sc = SparkContext() if _sc is None else _sc
        sqlContext = HiveContext(sc) if _sqlContext is None else _sqlContext

        self.prepend_source("src/main/python")

        sc.setLogLevel("ERROR")

        self.sqlContext = sqlContext
        self.sc = sc
        self._jvm = sc._jvm

        from py4j.java_gateway import java_import
        java_import(self._jvm, "org.tresamigos.smv.ColumnHelper")
        java_import(self._jvm, "org.tresamigos.smv.SmvDFHelper")
        java_import(self._jvm, "org.tresamigos.smv.dqm.*")
        java_import(self._jvm, "org.tresamigos.smv.panel.*")
        java_import(self._jvm, "org.tresamigos.smv.python.SmvPythonHelper")

        self.j_smvPyClient = self.create_smv_pyclient(arglist)

        # shortcut is meant for internal use only
        self.j_smvApp = self.j_smvPyClient.j_smvApp()

        self.stages = self.j_smvPyClient.stages()

        # issue #429 set application name from smv config
        sc._conf.setAppName(self.appName())

        # user may choose a port for the callback server
        gw = sc._gateway
        cbsp = self.j_smvPyClient.callbackServerPort()
        cbs_port = cbsp.get() if cbsp.isDefined() else gw._python_proxy_port

        # check whether the port is in-use or not. Try 10 times, if all fail, error out
        # NOTE(review): presumably check_socket returns True when the port is
        # free -- confirm against its definition.
        check_counter = 0
        while(not check_socket(cbs_port) and check_counter < 10):
            cbs_port += 1
            check_counter += 1

        # Final probe of the port the loop stopped on; the message reports
        # the range from the starting port to that port.
        if (not check_socket(cbs_port)):
            raise SmvRuntimeError("Start Python callback server failed. Port {0}-{1} are all in use".format(cbs_port - check_counter, cbs_port))

        # this was a workaround for py4j 0.8.2.1, shipped with spark
        # 1.5.x, to prevent the callback server from hanging the
        # python, and hence the java, process
        from pyspark.streaming.context import _daemonize_callback_server
        _daemonize_callback_server()

        # The instance dict is inspected directly instead of using hasattr().
        if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
            print("SMV starting Py4j callback server on port {0}".format(cbs_port))
            gw._shutdown_callback_server() # in case another has already started
            gw._start_callback_server(cbs_port)
            gw._python_proxy_port = gw._callback_server.port
            # get the GatewayServer object in JVM by ID
            jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
            # update the port of CallbackClient with real port
            gw.jvm.SmvPythonHelper.updatePythonGatewayPort(jgws, gw._python_proxy_port)

        self.repoFactory = DataSetRepoFactory(self)
        self.j_smvPyClient.registerRepoFactory('Python', self.repoFactory)

        # Initialize DataFrame and Column with helper methods
        smv.helpers.init_helpers()