Beispiel #1
0
 def _start_restricted_worker(self, num_cores=0):
     """Build a raylet command, execute it and register its shutdown hook.

     The command string is produced by
     ``RayServiceFuncGenerator._get_raylet_command`` from this object's
     redis address, redis password, object store memory and extra params.

     :param num_cores: value forwarded as ``ray_node_cpu_cores`` when the
         command is built (defaults to 0).
     """
     raylet_options = dict(
         redis_address=self.redis_address,
         ray_exec="ray ",
         password=self.redis_password,
         ray_node_cpu_cores=num_cores,
         object_store_memory=self.object_store_memory,
         extra_params=self.extra_params,
     )
     raylet_cmd = RayServiceFuncGenerator._get_raylet_command(**raylet_options)
     print("Executing command: {}".format(raylet_cmd))

     worker_proc = session_execute(command=raylet_cmd, fail_fast=True)
     # The process group id of the launched command is handed to the monitor
     # so it can be cleaned up at shutdown.
     ProcessMonitor.register_shutdown_hook(pgid=worker_proc.pgid)
Beispiel #2
0
    def _start_cluster(self):
        """Launch ray on the Spark cluster and return the master address.

        An RDD with ``self.num_ray_nodes`` elements and the same number of
        slices is used to run the start functions produced by
        ``self.ray_service``.

        * With ``ZooContext.barrier_mode`` enabled, a single barrier stage
          runs ``gen_ray_start`` with the first cluster ip.
        * Otherwise the master is started first via ``gen_ray_master_start``,
          its ``master_addr`` is read back, and the raylets are then started
          against that address via ``gen_raylet_start``.

        The collected process infos are wrapped in a ``ProcessMonitor`` stored
        on ``self.ray_processesMonitor``.

        :return: ``master_addr`` of the monitor's master process info.
        :raises AssertionError: in non-barrier mode, when the number of
            non-empty master results is not exactly one, or the number of
            non-empty raylet results is not ``self.num_ray_nodes - 1``.
        """
        ray_rdd = self.sc.range(0,
                                self.num_ray_nodes,
                                numSlices=self.num_ray_nodes)
        from zoo import ZooContext
        if ZooContext.barrier_mode:
            print("Launching Ray on cluster with Spark barrier mode")
            # The first ip would be used to launch ray master.
            process_infos = ray_rdd.barrier().mapPartitions(
                self.ray_service.gen_ray_start(self.cluster_ips[0])).collect()
        else:
            print("Launching Ray on cluster without Spark barrier mode")
            master_process_infos = ray_rdd.mapPartitionsWithIndex(
                self.ray_service.gen_ray_master_start()).collect()
            # Drop falsy entries (presumably partitions that did not start
            # the master -- TODO confirm against gen_ray_master_start).
            master_process_infos = [
                process for process in master_process_infos if process
            ]
            assert len(master_process_infos) == 1, \
                "Expected exactly one ray master process, but got {}".format(
                    len(master_process_infos))
            master_process_info = master_process_infos[0]
            redis_address = master_process_info.master_addr

            raylet_process_infos = ray_rdd.mapPartitions(
                self.ray_service.gen_raylet_start(redis_address)).collect()
            # Same filtering as above: keep only non-empty results.
            raylet_process_infos = [
                process for process in raylet_process_infos if process
            ]
            # Every node except the one hosting the master should report a raylet.
            expected_raylets = self.num_ray_nodes - 1
            assert len(raylet_process_infos) == expected_raylets, \
                "Expected {} raylet processes, but got {}".format(
                    expected_raylets, len(raylet_process_infos))
            process_infos = master_process_infos + raylet_process_infos

        self.ray_processesMonitor = ProcessMonitor(process_infos,
                                                   self.sc,
                                                   ray_rdd,
                                                   self,
                                                   verbose=self.verbose)
        return self.ray_processesMonitor.master.master_addr
Beispiel #3
0
 def _start_restricted_worker(self, num_cores, node_ip_address):
     """Build a raylet command for the given ip, run it and register cleanup.

     ``node_ip_address`` is placed under the ``"node-ip-address"`` key of the
     extra parameters; entries from ``self.extra_params`` (when it is not
     ``None``) are applied on top and therefore win on key clashes.

     :param num_cores: value forwarded as ``ray_node_cpu_cores``.
     :param node_ip_address: value stored under ``"node-ip-address"``.
     """
     raylet_params = {"node-ip-address": node_ip_address}
     if self.extra_params is not None:
         raylet_params.update(self.extra_params)

     raylet_cmd = RayServiceFuncGenerator._get_raylet_command(
         redis_address=self.redis_address,
         ray_exec="ray",
         password=self.redis_password,
         ray_node_cpu_cores=num_cores,
         object_store_memory=self.object_store_memory,
         extra_params=raylet_params,
     )
     # Environment for the child process comes from the ray service helper.
     worker_env = self.ray_service._prepare_env()
     print("Executing command: {}".format(raylet_cmd))

     worker_proc = session_execute(
         command=raylet_cmd,
         env=worker_env,
         tag="raylet",
         fail_fast=True,
     )
     # Register the process group id so it is handled by the shutdown hook.
     ProcessMonitor.register_shutdown_hook(pgid=worker_proc.pgid)
Beispiel #4
0
    def _start_cluster(self):
        """Launch ray through a Spark barrier stage and record the master address.

        Builds an RDD with ``self.num_ray_nodes`` elements and slices, runs
        the function from ``self.ray_service.gen_ray_start()`` on every
        partition in barrier mode, and wraps the collected process infos in a
        ``ProcessMonitor`` kept on ``self.ray_processesMonitor``.

        :return: ``self``, after ``self.redis_address`` has been set to the
            monitor's ``master.master_addr``.
        """
        print("Start to launch ray on cluster")
        ray_rdd = self.sc.range(
            0,
            self.num_ray_nodes,
            numSlices=self.num_ray_nodes,
        )
        barrier_rdd = ray_rdd.barrier()
        launcher = self.ray_service.gen_ray_start()
        process_infos = barrier_rdd.mapPartitions(launcher).collect()

        monitor = ProcessMonitor(
            process_infos,
            self.sc,
            ray_rdd,
            self,
            verbose=self.verbose,
        )
        self.ray_processesMonitor = monitor
        self.redis_address = self.ray_processesMonitor.master.master_addr
        return self
Beispiel #5
0
    def _start_cluster(self):
        """Launch ray through a Spark barrier stage and return the master address.

        Builds an RDD with ``self.num_ray_nodes`` elements and slices, runs
        the function from ``self.ray_service.gen_ray_start`` on every
        partition in barrier mode, and wraps the collected process infos in a
        ``ProcessMonitor`` kept on ``self.ray_processesMonitor``.

        :return: ``master.master_addr`` of the created monitor.
        """
        print("Start to launch ray on cluster")
        ray_rdd = self.sc.range(
            0,
            self.num_ray_nodes,
            numSlices=self.num_ray_nodes,
        )
        barrier_rdd = ray_rdd.barrier()
        # The first cluster ip would be used to launch the ray master.
        launcher = self.ray_service.gen_ray_start(self.cluster_ips[0])
        process_infos = barrier_rdd.mapPartitions(launcher).collect()

        monitor = ProcessMonitor(
            process_infos,
            self.sc,
            ray_rdd,
            self,
            verbose=self.verbose,
        )
        self.ray_processesMonitor = monitor
        return self.ray_processesMonitor.master.master_addr