Example 1
    def __init__(self, sc, redis_port=None, password="******", object_store_memory=None,
                 verbose=False, env=None, extra_params=None):
        """
        The RayContext will initiate a ray cluster on top of the configuration of the SparkContext.
        After creating the RayContext, call the init method to set up the cluster.

        - For Spark local mode: the total number of available cores is equal to the number of
        Spark local cores.
        - For Spark cluster mode: the number of raylets is equal to the number of Spark executors.
        The number of available cores for each raylet is equal to the number of executor cores.

        :param sc: An instance of SparkContext.
        :param redis_port: The redis port for the "head" node.
        The value will be randomly picked if not specified.
        :param password: The password for redis. Defaults to "123456" if not specified.
        :param object_store_memory: The memory size for the ray object store, as a string.
        This can be specified in bytes (b), kilobytes (k), megabytes (m) or gigabytes (g).
        For example, "50b", "100k", "250m", "30g".
        :param verbose: True for more logs when starting ray. Default is False.
        :param env: The environment variable dict for running ray processes. Default is None.
        :param extra_params: The key value dict for extra options to launch ray.
        For example, extra_params={"temp-dir": "/tmp/ray/"}
        """
        assert sc is not None, "sc cannot be None, please create a SparkContext first"
        self.sc = sc
        self.stopped = False
        self.is_local = is_local(sc)
        self.verbose = verbose
        self.redis_password = password
        self.object_store_memory = resource_to_bytes(object_store_memory)
        self.ray_processesMonitor = None
        self.env = env
        self.extra_params = extra_params
        self._address_info = None
        if self.is_local:
            self.num_ray_nodes = 1
            self.ray_node_cpu_cores = self._get_spark_local_cores()
        # For Spark local mode, directly call ray.init() and ray.shutdown().
        # ray.shutdown() would clear up all the ray related processes.
        # Ray Manager is only needed for Spark cluster mode to monitor ray processes.
        else:
            self.num_ray_nodes = int(self.sc.getConf().get("spark.executor.instances"))
            self.ray_node_cpu_cores = int(self.sc.getConf().get("spark.executor.cores"))
            self.python_loc = os.environ['PYSPARK_PYTHON']
            self.redis_port = random.randint(10000, 65535) if not redis_port else redis_port
            self.ray_service = RayServiceFuncGenerator(
                python_loc=self.python_loc,
                redis_port=self.redis_port,
                ray_node_cpu_cores=self.ray_node_cpu_cores,
                password=self.redis_password,
                object_store_memory=self.object_store_memory,
                verbose=self.verbose,
                env=self.env,
                extra_params=self.extra_params)
            self._gather_cluster_ips()
            from bigdl.util.common import init_executor_gateway
            print("Start to launch the JVM guarding process")
            init_executor_gateway(sc)
            print("JVM guarding process has been successfully launched")
        RayContext._active_ray_context = self
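The docstring above says to call the init method after creating the RayContext. A minimal usage sketch for Spark local mode (the import path and the stop() call are assumptions, not taken from the snippet):

from pyspark import SparkContext
from zoo.ray import RayContext  # assumed import path

sc = SparkContext(master="local[4]", appName="ray-on-spark")
ray_ctx = RayContext(sc, object_store_memory="2g", verbose=True)
ray_ctx.init()   # sets up the ray cluster described in the docstring
# ... run ray remote functions / actors here ...
ray_ctx.stop()   # assumed tear-down method
sc.stop()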
Example 2
 def clean_fn(self):
     if not self.raycontext.initialized:
         return
     import ray
     ray.shutdown()
     if not self.sc:
         print(
             "WARNING: SparkContext has been stopped before cleaning the Ray resources"
         )
     if self.sc and (not is_local(self.sc)):
         self.ray_rdd.map(gen_shutdown_per_node(self.pgids,
                                                self.node_ips)).collect()
     else:
         gen_shutdown_per_node(self.pgids, self.node_ips)([])
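clean_fn relies on a gen_shutdown_per_node helper that is not shown here. A hypothetical sketch of such a helper, assuming each entry of pgids is the process group of the ray processes started on the node whose address sits at the same index of node_ips:

import os
import signal
import socket

def gen_shutdown_per_node(pgids, node_ips=None):
    # Return a function usable both with rdd.map(...) and with a direct call:
    # on each node it kills the process groups recorded for that node.
    # Sketch only, not the library's actual implementation.
    def _shutdown(_):
        local_ip = socket.gethostbyname(socket.gethostname())
        for i, pgid in enumerate(pgids):
            if node_ips and node_ips[i] != local_ip:
                continue  # this process group belongs to another node
            try:
                os.killpg(pgid, signal.SIGTERM)
            except OSError:
                pass  # the process group is already gone
        return []
    return _shutdown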
Example 3
 def __init__(self, process_infos, sc, ray_rdd, raycontext, verbose=False):
     self.sc = sc
     self.raycontext = raycontext
     self.verbose = verbose
     self.ray_rdd = ray_rdd
     self.master = []
     self.slaves = []
     self.pgids = []
     self.node_ips = []
     self.process_infos = process_infos
     for process_info in process_infos:
         self.pgids.append(process_info.pgid)
         self.node_ips.append(process_info.node_ip)
         if process_info.master_addr:
             self.master.append(process_info)
         else:
             self.slaves.append(process_info)
     assert len(self.master) == 1, \
         "There should be exactly 1 master, but got {}".format(len(self.master))
     self.master = self.master[0]
     if not is_local(self.sc):
         self.print_ray_remote_err_out()
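The monitor above only assumes that each element of process_infos exposes pgid, node_ip and master_addr. A minimal stand-in for such an object (hypothetical, not the library's actual class):

class ProcessInfo(object):
    # Hypothetical container mirroring the fields the monitor reads.
    def __init__(self, pgid, node_ip, master_addr=None):
        self.pgid = pgid                 # process group id of the launched ray processes
        self.node_ip = node_ip           # IP address of the executor node
        self.master_addr = master_addr   # set only for the head (master) node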
Example 4
    def __init__(self,
                 sc,
                 redis_port=None,
                 password="******",
                 object_store_memory=None,
                 verbose=False,
                 env=None,
                 extra_params=None,
                 include_webui=True,
                 num_ray_nodes=None,
                 ray_node_cpu_cores=None):
        """
        The RayContext would initiate a ray cluster on top of the configuration of SparkContext.
        After creating RayContext, call the init method to set up the cluster.

        - For Spark local mode: the total number of cores available to Ray is equal to the
        number of Spark local cores.
        - For Spark cluster mode: the number of raylets to be created is equal to the number of
        Spark executors, and the number of cores allocated to each raylet is equal to the number
        of cores of each Spark executor.
        You can also explicitly specify num_ray_nodes and ray_node_cpu_cores to configure how
        the raylets are started.

        :param sc: An instance of SparkContext.
        :param redis_port: The redis port for the ray head node. Default is None.
        The value would be randomly picked if not specified.
        :param password: The password for redis. Defaults to "123456" if not specified.
        :param object_store_memory: The memory size for the ray object store, as a string.
        This can be specified in bytes (b), kilobytes (k), megabytes (m) or gigabytes (g).
        For example, "50b", "100k", "250m", "30g".
        :param verbose: True for more logs when starting ray. Default is False.
        :param env: The environment variable dict for running ray processes. Default is None.
        :param extra_params: The key value dict for extra options to launch ray.
        For example, extra_params={"temp-dir": "/tmp/ray/"}
        :param include_webui: True for including the web ui when starting ray. Default is True.
        :param num_ray_nodes: The number of raylets to start across the cluster.
        For Spark local mode, you don't need to specify this value.
        For Spark cluster mode, it defaults to the number of Spark executors. If
        spark.executor.instances can't be detected in your SparkContext, you need to explicitly
        specify this. It is recommended that num_ray_nodes is not larger than the number of
        Spark executors to make sure there are enough resources in your cluster.
        :param ray_node_cpu_cores: The number of available cores for each raylet.
        For Spark local mode, it defaults to the number of Spark local cores.
        For Spark cluster mode, it defaults to the number of cores of each Spark executor. If
        spark.executor.cores or spark.cores.max can't be detected in your SparkContext, you need to
        explicitly specify this. It is recommended that ray_node_cpu_cores is not larger than the
        number of cores of each Spark executor to make sure there are enough resources in your
        cluster.
        """
        assert sc is not None, "sc cannot be None, please create a SparkContext first"
        self.sc = sc
        self.initialized = False
        self.is_local = is_local(sc)
        self.verbose = verbose
        self.redis_password = password
        self.object_store_memory = resource_to_bytes(object_store_memory)
        self.ray_processesMonitor = None
        self.env = env
        self.extra_params = extra_params
        self.include_webui = include_webui
        self._address_info = None
        if self.is_local:
            self.num_ray_nodes = 1
            spark_cores = self._get_spark_local_cores()
            if ray_node_cpu_cores:
                ray_node_cpu_cores = int(ray_node_cpu_cores)
                if ray_node_cpu_cores > spark_cores:
                    warnings.warn(
                        "ray_node_cpu_cores is larger than available Spark cores, "
                        "make sure there are enough resources on your machine")
                self.ray_node_cpu_cores = ray_node_cpu_cores
            else:
                self.ray_node_cpu_cores = spark_cores
        # For Spark local mode, directly call ray.init() and ray.shutdown().
        # ray.shutdown() would clear up all the ray related processes.
        # Ray Manager is only needed for Spark cluster mode to monitor ray processes.
        else:
            if self.sc.getConf().contains("spark.executor.cores"):
                executor_cores = int(
                    self.sc.getConf().get("spark.executor.cores"))
            else:
                executor_cores = None
            if ray_node_cpu_cores:
                ray_node_cpu_cores = int(ray_node_cpu_cores)
                if executor_cores and ray_node_cpu_cores > executor_cores:
                    warnings.warn(
                        "ray_node_cpu_cores is larger than Spark executor cores, "
                        "make sure there are enough resources on your cluster")
                self.ray_node_cpu_cores = ray_node_cpu_cores
            elif executor_cores:
                self.ray_node_cpu_cores = executor_cores
            else:
                raise Exception(
                    "spark.executor.cores not detected in the SparkContext, "
                    "you need to manually specify num_ray_nodes and ray_node_cpu_cores "
                    "for RayContext to start ray services")
            if self.sc.getConf().contains("spark.executor.instances"):
                num_executors = int(
                    self.sc.getConf().get("spark.executor.instances"))
            elif self.sc.getConf().contains("spark.cores.max"):
                import math
                num_executors = math.floor(
                    int(self.sc.getConf().get("spark.cores.max")) /
                    self.ray_node_cpu_cores)
            else:
                num_executors = None
            if num_ray_nodes:
                num_ray_nodes = int(num_ray_nodes)
                if num_executors and num_ray_nodes > num_executors:
                    warnings.warn(
                        "num_ray_nodes is larger than the number of Spark executors, "
                        "make sure there are enough resources on your cluster")
                self.num_ray_nodes = num_ray_nodes
            elif num_executors:
                self.num_ray_nodes = num_executors
            else:
                raise Exception(
                    "spark.executor.cores not detected in the SparkContext, "
                    "you need to manually specify num_ray_nodes and ray_node_cpu_cores "
                    "for RayContext to start ray services")

            from zoo.util.utils import detect_python_location
            self.python_loc = os.environ.get("PYSPARK_PYTHON",
                                             detect_python_location())
            self.redis_port = random.randint(
                10000, 65535) if not redis_port else int(redis_port)
            self.ray_service = RayServiceFuncGenerator(
                python_loc=self.python_loc,
                redis_port=self.redis_port,
                ray_node_cpu_cores=self.ray_node_cpu_cores,
                password=self.redis_password,
                object_store_memory=self.object_store_memory,
                verbose=self.verbose,
                env=self.env,
                include_webui=self.include_webui,
                extra_params=self.extra_params)
        RayContext._active_ray_context = self
        self.total_cores = self.num_ray_nodes * self.ray_node_cpu_cores
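If spark.executor.instances, spark.cores.max or spark.executor.cores cannot be detected from the SparkContext, the constructor above expects num_ray_nodes and ray_node_cpu_cores to be passed explicitly. A sketch of such a call, with the import path and the init()/stop() pair assumed rather than taken from the snippet:

from pyspark import SparkContext, SparkConf
from zoo.ray import RayContext  # assumed import path

conf = SparkConf().setMaster("spark://master:7077")   # placeholder standalone master URL
sc = SparkContext(conf=conf, appName="ray-on-spark")

ray_ctx = RayContext(sc,
                     object_store_memory="4g",   # parsed by resource_to_bytes
                     include_webui=False,
                     num_ray_nodes=4,            # raylets to start across the cluster
                     ray_node_cpu_cores=4)       # cores reserved for each raylet
ray_ctx.init()   # per the docstring, call init after constructing the RayContext
# ... ray workloads ...
ray_ctx.stop()   # assumed tear-down method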
Example 5
 def __init__(self,
              sc,
              redis_port=None,
              password="******",
              object_store_memory=None,
              verbose=False,
              env=None,
              extra_params=None):
     """
      The RayContext will init a ray cluster on top of the configuration of the SparkContext.
      For Spark cluster mode: the number of raylets is equal to the number of executors.
      For Spark local mode: the number of raylets is controlled by local_ray_node_num.
      The number of CPU cores for each raylet equals spark_cores/local_ray_node_num.
      :param sc: An instance of SparkContext.
     :param redis_port: redis port for the "head" node.
            The value would be randomly picked if not specified.
     :param password: [optional] password for the redis.
     :param object_store_memory: Memory size for the object_store.
     :param verbose: True for more logs.
     :param env: The environment variable dict for running Ray.
      :param extra_params: Key value dictionary for extra options to launch Ray,
                           e.g. extra_params={"temp-dir": "/tmp/ray2/"}
     """
     assert sc is not None, "sc cannot be None, please create a SparkContext first"
     self.sc = sc
     self.stopped = False
     self.is_local = is_local(sc)
     self.verbose = verbose
     self.redis_password = password
     self.object_store_memory = resource_to_bytes(object_store_memory)
     self.ray_processesMonitor = None
     self.env = env
     self.extra_params = extra_params
     if self.is_local:
         self.num_ray_nodes = 1
         self.ray_node_cpu_cores = self._get_spark_local_cores()
     # For Spark local mode, directly call ray.init() and ray.shutdown().
     # ray.shutdown() would clear up all the ray related processes.
     # Ray Manager is only needed for Spark cluster mode to monitor ray processes.
     else:
         self.num_ray_nodes = int(
             self.sc.getConf().get("spark.executor.instances"))
         self.ray_node_cpu_cores = int(
             self.sc.getConf().get("spark.executor.cores"))
         self.python_loc = os.environ['PYSPARK_PYTHON']
         self.redis_port = random.randint(
             10000, 65535) if not redis_port else redis_port
         self.ray_service = RayServiceFuncGenerator(
             python_loc=self.python_loc,
             redis_port=self.redis_port,
             ray_node_cpu_cores=self.ray_node_cpu_cores,
             password=self.redis_password,
             object_store_memory=self.object_store_memory,
             verbose=self.verbose,
             env=self.env,
             extra_params=self.extra_params)
         self._gather_cluster_ips()
         from bigdl.util.common import init_executor_gateway
         print("Start to launch the JVM guarding process")
         init_executor_gateway(sc)
         print("JVM guarding process has been successfully launched")