Example #1
    def init(self, driver_cores=0):
        """
        Initialize the Ray cluster.

        :param driver_cores: The number of cores for the raylet on the driver, for Spark cluster mode.
        Default is 0, in which case the local driver would not carry any Ray workload.

        :return: A dictionary of address information about the Ray cluster.
        The information contains node_ip_address, redis_address, object_store_address,
        raylet_socket_name, webui_url and session_dir.
        """
        if self.initialized:
            print("The Ray cluster has been launched.")
        else:
            if self.is_local:
                if self.env:
                    os.environ.update(self.env)
                import ray
                self._address_info = ray.init(num_cpus=self.ray_node_cpu_cores,
                                              object_store_memory=self.object_store_memory,
                                              resources=self.extra_params)
            else:
                self.cluster_ips = self._gather_cluster_ips()
                from bigdl.util.common import init_executor_gateway
                init_executor_gateway(self.sc)
                print("JavaGatewayServer has been successfully launched on executors")
                self._start_cluster()
                self._address_info = self._start_driver(num_cores=driver_cores)

            print(self._address_info)
            kill_redundant_log_monitors(self._address_info["redis_address"])
            self.initialized = True
        return self._address_info
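A minimal usage sketch of the init pattern above; the import path and the pre-existing SparkContext sc are assumptions and may differ between releases:

# Hedged sketch: import path and SparkContext setup are assumptions.
from zoo.ray import RayContext   # path may differ between releases
import ray

ray_ctx = RayContext(sc, object_store_memory="4g")
address_info = ray_ctx.init()                 # starts Ray locally or on the executors
print(address_info["redis_address"])          # e.g. "172.16.0.3:6379"

@ray.remote
def add(a, b):
    return a + b

print(ray.get(add.remote(1, 2)))              # 3, executed on the Ray cluster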
Example #2
    def __init__(self, sc, redis_port=None, password="******", object_store_memory=None,
                 verbose=False, env=None, extra_params=None):
        """
        The RayContext would initiate a ray cluster on top of the configuration of SparkContext.
        After creating RayContext, call the init method to set up the cluster.

        - For Spark local mode: The total number of available cores is equal to the number of Spark local cores.
        - For Spark cluster mode: The number of raylets is equal to the number of executors.
        The number of available cores for each raylet is equal to the number of executor cores.

        :param sc: An instance of SparkContext.
        :param redis_port: redis port for the "head" node.
        The value would be randomly picked if not specified.
        :param password: Password for the redis. Defaults to "123456" if not specified.
        :param object_store_memory: The memory size for the Ray object store, as a string.
        It can be specified in bytes (b), kilobytes (k), megabytes (m) or gigabytes (g).
        For example, 50b, 100k, 250m, 30g.
        :param verbose: True for more logs when starting ray. Default is False.
        :param env: The environment variable dict for running ray processes. Default is None.
        :param extra_params: The key value dict for extra options to launch ray.
        For example, extra_params={"temp-dir": "/tmp/ray/"}
        """
        assert sc is not None, "sc cannot be None, please create a SparkContext first"
        self.sc = sc
        self.stopped = False
        self.is_local = is_local(sc)
        self.verbose = verbose
        self.redis_password = password
        self.object_store_memory = resource_to_bytes(object_store_memory)
        self.ray_processesMonitor = None
        self.env = env
        self.extra_params = extra_params
        self._address_info = None
        if self.is_local:
            self.num_ray_nodes = 1
            self.ray_node_cpu_cores = self._get_spark_local_cores()
        # For Spark local mode, directly call ray.init() and ray.shutdown().
        # ray.shutdown() would clear up all the ray related processes.
        # Ray Manager is only needed for Spark cluster mode to monitor ray processes.
        else:
            self.num_ray_nodes = int(self.sc.getConf().get("spark.executor.instances"))
            self.ray_node_cpu_cores = int(self.sc.getConf().get("spark.executor.cores"))
            self.python_loc = os.environ['PYSPARK_PYTHON']
            self.redis_port = random.randint(10000, 65535) if not redis_port else redis_port
            self.ray_service = RayServiceFuncGenerator(
                python_loc=self.python_loc,
                redis_port=self.redis_port,
                ray_node_cpu_cores=self.ray_node_cpu_cores,
                password=self.redis_password,
                object_store_memory=self.object_store_memory,
                verbose=self.verbose,
                env=self.env,
                extra_params=self.extra_params)
            self._gather_cluster_ips()
            from bigdl.util.common import init_executor_gateway
            print("Start to launch the JVM guarding process")
            init_executor_gateway(sc)
            print("JVM guarding process has been successfully launched")
        RayContext._active_ray_context = self
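A hedged sketch of the two-step pattern this constructor implies (construct first, then call init); the master URL, app name and the stop() teardown call below are assumptions for illustration:

from pyspark import SparkContext

sc = SparkContext(master="local[4]", appName="ray-on-spark")
ray_ctx = RayContext(sc, object_store_memory="2g", verbose=True)
# Nothing Ray-related runs yet; the constructor only records the configuration.
ray_ctx.init()    # this call actually launches Ray
ray_ctx.stop()    # assumption: a stop/teardown method exists in this API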
Example #3
 def __init__(self,
              sc,
              redis_port=None,
              password="******",
              object_store_memory=None,
              verbose=False,
              env=None,
              local_ray_node_num=2,
              waiting_time_sec=8,
              extra_params=None):
     """
     The RayContext would initialize a Ray cluster on top of the configuration of SparkContext.
     For Spark cluster mode: The number of raylets is equal to the number of executors.
     For Spark local mode: The number of raylets is controlled by local_ray_node_num.
     The number of CPU cores for each raylet equals spark_cores/local_ray_node_num.
     :param sc: An instance of SparkContext.
     :param redis_port: redis port for the "head" node.
            The value would be randomly picked if not specified.
     :param password: [optional] password for the redis.
     :param object_store_memory: Memory size for the object_store.
     :param verbose: True for more logs.
     :param env: The environment variable dict for running Ray.
     :param local_ray_node_num: The number of raylets to be created.
     :param waiting_time_sec: Waiting time for the raylets before connecting to redis.
     :param extra_params: key value dictionary for extra options to launch Ray,
                          e.g. extra_params={"temp-dir": "/tmp/ray2/"}
     """
     self.sc = sc
     self.stopped = False
     self.is_local = is_local(sc)
     self.local_ray_node_num = local_ray_node_num
     self.ray_node_cpu_cores = self._get_ray_node_cpu_cores()
     self.num_ray_nodes = self._get_num_ray_nodes()
     self.python_loc = os.environ['PYSPARK_PYTHON']
     self.ray_processesMonitor = None
     self.verbose = verbose
     self.redis_password = password
     self.object_store_memory = object_store_memory
     self.redis_port = self._new_port() if not redis_port else redis_port
     self.ray_service = RayServiceFuncGenerator(
         python_loc=self.python_loc,
         redis_port=self.redis_port,
         ray_node_cpu_cores=self.ray_node_cpu_cores,
         mkl_cores=self._get_mkl_cores(),
         password=password,
         object_store_memory=self._enrich_object_sotre_memory(
             sc, object_store_memory),
         verbose=verbose,
         env=env,
         waitting_time_sec=waiting_time_sec,
         extra_params=extra_params)
     self._gather_cluster_ips()
     from bigdl.util.common import init_executor_gateway
     print("Start to launch the JVM guarding process")
     init_executor_gateway(sc)
     print("JVM guarding process has been successfully launched")
Example #4
    def init(self, driver_cores=0):
        """
        Initialize the Ray cluster.

        :param driver_cores: The number of cores for the raylet on the driver, for Spark cluster mode.
        Default is 0, in which case the local driver would not carry any Ray workload.

        :return: A dictionary of address information about the Ray cluster.
        The information contains node_ip_address, redis_address, object_store_address,
        raylet_socket_name, webui_url and session_dir.
        """
        if self.initialized:
            print("The Ray cluster has been launched.")
        else:
            if self.is_local:
                if self.env:
                    os.environ.update(self.env)
                import ray
                kwargs = {}
                if self.extra_params is not None:
                    for k, v in self.extra_params.items():
                        kw = k.replace("-", "_")
                        kwargs[kw] = v
                init_params = dict(
                    num_cpus=self.ray_node_cpu_cores,
                    _redis_password=self.redis_password,
                    object_store_memory=self.object_store_memory,
                    include_dashboard=self.include_webui,
                    dashboard_host="0.0.0.0",
                )
                init_params.update(kwargs)
                if version.parse(ray.__version__) >= version.parse("1.4.0"):
                    init_params["namespace"] = "az"
                self._address_info = ray.init(**init_params)
            else:
                self.cluster_ips = self._gather_cluster_ips()
                from bigdl.util.common import init_executor_gateway
                init_executor_gateway(self.sc)
                print(
                    "JavaGatewayServer has been successfully launched on executors"
                )
                redis_address = self._start_cluster()
                self._address_info = self._start_driver(
                    num_cores=driver_cores, redis_address=redis_address)

            print(self._address_info)
            kill_redundant_log_monitors(self._address_info["redis_address"])
            self.initialized = True
        return self._address_info
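In Spark local mode this version forwards extra_params to ray.init() after converting dashed keys to underscores. A standalone illustration of that conversion (the dict below is a made-up example, not part of the library):

extra_params = {"temp-dir": "/tmp/ray/", "num-gpus": 0}
kwargs = {k.replace("-", "_"): v for k, v in extra_params.items()}
assert kwargs == {"temp_dir": "/tmp/ray/", "num_gpus": 0}
# ray.init(**kwargs) would then receive temp_dir=... and num_gpus=... as keyword arguments.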
Example #5
 def __init__(self,
              sc,
              redis_port=None,
              password="******",
              object_store_memory=None,
              verbose=False,
              env=None,
              extra_params=None):
     """
     The RayContext would initialize a Ray cluster on top of the configuration of SparkContext.
     For Spark cluster mode: The number of raylets is equal to the number of executors,
     and the number of CPU cores for each raylet is equal to the number of executor cores.
     For Spark local mode: A single raylet is created, using all Spark local cores.
     :param sc: An instance of SparkContext.
     :param redis_port: redis port for the "head" node.
            The value would be randomly picked if not specified.
     :param password: [optional] password for the redis.
     :param object_store_memory: Memory size for the object_store.
     :param verbose: True for more logs.
     :param env: The environment variable dict for running Ray.
     :param extra_params: key value dictionary for extra options to launch Ray,
                          e.g. extra_params={"temp-dir": "/tmp/ray2/"}
     """
     assert sc is not None, "sc cannot be None, please create a SparkContext first"
     self.sc = sc
     self.stopped = False
     self.is_local = is_local(sc)
     self.verbose = verbose
     self.redis_password = password
     self.object_store_memory = resource_to_bytes(object_store_memory)
     self.ray_processesMonitor = None
     self.env = env
     self.extra_params = extra_params
     if self.is_local:
         self.num_ray_nodes = 1
         self.ray_node_cpu_cores = self._get_spark_local_cores()
     # For Spark local mode, directly call ray.init() and ray.shutdown().
     # ray.shutdown() would clear up all the ray related processes.
     # Ray Manager is only needed for Spark cluster mode to monitor ray processes.
     else:
         self.num_ray_nodes = int(
             self.sc.getConf().get("spark.executor.instances"))
         self.ray_node_cpu_cores = int(
             self.sc.getConf().get("spark.executor.cores"))
         self.python_loc = os.environ['PYSPARK_PYTHON']
         self.redis_port = random.randint(
             10000, 65535) if not redis_port else redis_port
         self.ray_service = RayServiceFuncGenerator(
             python_loc=self.python_loc,
             redis_port=self.redis_port,
             ray_node_cpu_cores=self.ray_node_cpu_cores,
             password=self.redis_password,
             object_store_memory=self.object_store_memory,
             verbose=self.verbose,
             env=self.env,
             extra_params=self.extra_params)
         self._gather_cluster_ips()
         from bigdl.util.common import init_executor_gateway
         print("Start to launch the JVM guarding process")
         init_executor_gateway(sc)
         print("JVM guarding process has been successfully launched")
Example #6
    def __init__(self,
                 sc,
                 redis_port=None,
                 password="******",
                 object_store_memory=None,
                 verbose=False,
                 env=None,
                 extra_params=None,
                 num_ray_nodes=None,
                 ray_node_cpu_cores=None):
        """
        The RayContext would initiate a ray cluster on top of the configuration of SparkContext.
        After creating RayContext, call the init method to set up the cluster.

        - For Spark local mode: The total number of cores available for Ray is equal to the number
        of Spark local cores.
        - For Spark cluster mode: The number of raylets to be created is equal to the number of
        Spark executors. The number of cores allocated for each raylet is equal to the number of
        cores for each Spark executor.
        You can also explicitly specify num_ray_nodes and ray_node_cpu_cores to configure
        how the raylets are started.

        :param sc: An instance of SparkContext.
        :param redis_port: redis port for the "head" node.
        The value would be randomly picked if not specified.
        :param password: Password for the redis. Defaults to "123456" if not specified.
        :param object_store_memory: The memory size for the Ray object store, as a string.
        It can be specified in bytes (b), kilobytes (k), megabytes (m) or gigabytes (g).
        For example, 50b, 100k, 250m, 30g.
        :param verbose: True for more logs when starting ray. Default is False.
        :param env: The environment variable dict for running ray processes. Default is None.
        :param extra_params: The key value dict for extra options to launch ray.
        For example, extra_params={"temp-dir": "/tmp/ray/"}
        :param num_ray_nodes: The number of raylets to start across the cluster.
        For Spark local mode, you don't need to specify this value.
        For Spark cluster mode, it defaults to the number of Spark executors. If
        spark.executor.instances can't be detected in your SparkContext, you need to explicitly
        specify this. It is recommended that num_ray_nodes is not larger than the number of
        Spark executors, to make sure there are enough resources in your cluster.
        :param ray_node_cpu_cores: The number of available cores for each raylet.
        For Spark local mode, it defaults to the number of Spark local cores.
        For Spark cluster mode, it defaults to the number of cores for each Spark executor. If
        spark.executor.cores or spark.cores.max can't be detected in your SparkContext, you need to
        explicitly specify this. It is recommended that ray_node_cpu_cores is not larger than the
        number of cores for each Spark executor, to make sure there are enough resources in your
        cluster.
        """
        assert sc is not None, "sc cannot be None, please create a SparkContext first"
        self.sc = sc
        self.initialized = False
        self.is_local = is_local(sc)
        self.verbose = verbose
        self.redis_password = password
        self.object_store_memory = resource_to_bytes(object_store_memory)
        self.ray_processesMonitor = None
        self.env = env
        self.extra_params = extra_params
        self._address_info = None
        if self.is_local:
            self.num_ray_nodes = 1
            spark_cores = self._get_spark_local_cores()
            if ray_node_cpu_cores:
                ray_node_cpu_cores = int(ray_node_cpu_cores)
                if ray_node_cpu_cores > spark_cores:
                    warnings.warn(
                        "ray_node_cpu_cores is larger than available Spark cores, "
                        "make sure there are enough resources on your machine")
                self.ray_node_cpu_cores = ray_node_cpu_cores
            else:
                self.ray_node_cpu_cores = spark_cores
        # For Spark local mode, directly call ray.init() and ray.shutdown().
        # ray.shutdown() would clear up all the ray related processes.
        # Ray Manager is only needed for Spark cluster mode to monitor ray processes.
        else:
            if self.sc.getConf().contains("spark.executor.cores"):
                executor_cores = int(
                    self.sc.getConf().get("spark.executor.cores"))
            else:
                executor_cores = None
            if ray_node_cpu_cores:
                ray_node_cpu_cores = int(ray_node_cpu_cores)
                if executor_cores and ray_node_cpu_cores > executor_cores:
                    warnings.warn(
                        "ray_node_cpu_cores is larger than Spark executor cores, "
                        "make sure there are enough resources on your cluster")
                self.ray_node_cpu_cores = ray_node_cpu_cores
            elif executor_cores:
                self.ray_node_cpu_cores = executor_cores
            else:
                raise Exception(
                    "spark.executor.cores not detected in the SparkContext, "
                    "you need to manually specify num_ray_nodes and ray_node_cpu_cores "
                    "for RayContext to start ray services")
            if self.sc.getConf().contains("spark.executor.instances"):
                num_executors = int(
                    self.sc.getConf().get("spark.executor.instances"))
            elif self.sc.getConf().contains("spark.cores.max"):
                import math
                num_executors = math.floor(
                    int(self.sc.getConf().get("spark.cores.max")) /
                    self.ray_node_cpu_cores)
            else:
                num_executors = None
            if num_ray_nodes:
                num_ray_nodes = int(num_ray_nodes)
                if num_executors and num_ray_nodes > num_executors:
                    warnings.warn(
                        "num_ray_nodes is larger than the number of Spark executors, "
                        "make sure there are enough resources on your cluster")
                self.num_ray_nodes = num_ray_nodes
            elif num_executors:
                self.num_ray_nodes = num_executors
            else:
                raise Exception(
                    "spark.executor.instances and spark.cores.max not detected in the "
                    "SparkContext, you need to manually specify num_ray_nodes and "
                    "ray_node_cpu_cores for RayContext to start ray services")

            self.python_loc = os.environ['PYSPARK_PYTHON']
            self.redis_port = random.randint(
                10000, 65535) if not redis_port else int(redis_port)
            self.ray_service = RayServiceFuncGenerator(
                python_loc=self.python_loc,
                redis_port=self.redis_port,
                ray_node_cpu_cores=self.ray_node_cpu_cores,
                password=self.redis_password,
                object_store_memory=self.object_store_memory,
                verbose=self.verbose,
                env=self.env,
                extra_params=self.extra_params)
            self._gather_cluster_ips()
            from bigdl.util.common import init_executor_gateway
            print("Start to launch the JVM guarding process")
            init_executor_gateway(sc)
            print("JVM guarding process has been successfully launched")
        RayContext._active_ray_context = self
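A hedged sketch of this variant's explicit sizing: when spark.executor.instances or spark.cores.max cannot be read from the SparkContext (for example under some dynamic-allocation setups), the raylet sizes can be passed directly. The numbers below are placeholders and sc is assumed to exist:

ray_ctx = RayContext(sc,
                     num_ray_nodes=4,         # at most the number of Spark executors
                     ray_node_cpu_cores=8)    # at most the cores per executor
ray_ctx.init()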