def start(self):
    """Start the rendezvous server and elastic driver, then build run env vars.

    The steps run in this order:

    1. Create ``self.rendezvous`` and ``self.driver`` from ``self.settings``.
    2. Start the rendezvous server with a handler bound to the driver.
    3. Call ``wait_for_available_slots`` for ``settings.num_proc`` slots,
       then call it again with the minimum host count.
    4. Detect NICs across the hosts returned by the second call.
    5. Store the result of ``create_run_env_vars`` on ``self.run_env_vars``.
    """
    self.rendezvous = RendezvousServer(self.settings.verbose)

    driver_options = {
        "rendezvous": self.rendezvous,
        "discovery": self.settings.discovery,
        "min_num_proc": self.settings.min_num_proc,
        "max_num_proc": self.settings.max_num_proc,
        "timeout": self.settings.elastic_timeout,
        "reset_limit": self.settings.reset_limit,
        "verbose": self.settings.verbose,
    }
    self.driver = ElasticDriver(**driver_options)

    rendezvous_handler = create_rendezvous_handler(self.driver)
    logger.debug("[ray] starting rendezvous")
    rendezvous_port = self.rendezvous.start(rendezvous_handler)

    logger.debug(f"[ray] waiting for {self.settings.num_proc} to start.")
    # The result of this first call is intentionally discarded.
    self.driver.wait_for_available_slots(self.settings.num_proc)

    # Host-to-host common interface detection
    # requires at least 2 hosts in an elastic job.
    required_hosts = _get_min_start_hosts(self.settings)
    available_hosts = self.driver.wait_for_available_slots(
        self.settings.num_proc, min_hosts=required_hosts)

    logger.debug("[ray] getting common interfaces")
    common_nics = detect_nics(
        self.settings,
        all_host_names=available_hosts.host_assignment_order,
    )

    logger.debug("[ray] getting driver IP")
    driver_hostname = socket.gethostname()
    driver_ip = socket.gethostbyname(driver_hostname)

    self.run_env_vars = create_run_env_vars(
        driver_ip, common_nics, rendezvous_port, elastic=True)
def start(self,
          executable_cls: type = None,
          executable_args: Optional[List] = None,
          executable_kwargs: Optional[Dict] = None,
          extra_env_vars: Optional[Dict] = None):
    """Starts the workers and colocates them on all machines.

    Node grouping is used because the implementation does not appear to
    work well for imbalanced nodes, and colocated workers typically
    perform much better than non-colocated ones.

    Args:
        executable_cls (type): The class that will be created within
            an actor (BaseHorovodWorker). This will allow Horovod
            to establish its connections and set env vars.
        executable_args (List): Arguments to be passed into the
            worker class upon initialization.
        executable_kwargs (Dict): Keyword arguments to be passed into the
            worker class upon initialization.
        extra_env_vars (Dict): Environment variables to be set
            on the actors (worker processes) before initialization.
    """
    if not extra_env_vars:
        extra_env_vars = {}

    self.coordinator = Coordinator(self.settings)

    if not executable_args:
        executable_args = []

    # Per-host resource request: every slot on the host contributes its
    # CPU share, and its GPU share only when GPUs are enabled.
    cpus_for_host = self.cpus_per_slot * self.num_slots
    gpus_for_host = self.gpus_per_slot * self.num_slots * int(self.use_gpu)
    host_resources = {"num_cpus": cpus_for_host, "num_gpus": gpus_for_host}
    self.workers = self._create_workers(host_resources)

    def _request_hostname(worker):
        return worker.hostname.remote()

    # Get all the hostnames of all workers
    worker_hostnames = map_blocking(_request_hostname, self.workers)

    # Register each hostname to the coordinator. assumes the hostname
    # ordering is the same.
    for worker_rank, worker_hostname in enumerate(worker_hostnames):
        self.coordinator.register(worker_hostname, worker_rank)
    rank_env_vars = self.coordinator.finalize_registration()

    # Push each rank's own env vars to the worker at that position.
    workers_by_rank = {
        position: worker for position, worker in enumerate(self.workers)
    }
    for worker_rank, rank_specific_env in rank_env_vars.items():
        workers_by_rank[worker_rank].update_env_vars.remote(rank_specific_env)

    shared_env = self.coordinator.establish_rendezvous()
    shared_env.update(extra_env_vars)

    common_nics = detect_nics(
        self.settings,
        all_host_names=list(self.coordinator.hostnames_by_rank),
        node_workers=self.colocators)
    shared_env.update(nics_to_env_var(common_nics))

    def _push_shared_env(worker):
        return worker.update_env_vars.remote(shared_env)

    map_blocking(_push_shared_env, self.workers)

    self._start_executables(
        executable_cls, executable_args, executable_kwargs)
def start(self,
          executable_cls: type = None,
          executable_args: Optional[List] = None,
          executable_kwargs: Optional[Dict] = None,
          extra_env_vars: Optional[Dict] = None):
    """Starts the Horovod driver and services.

    Creates the rendezvous server and elastic driver, starts the
    rendezvous, requests the available slots, detects NICs across the
    assigned hosts, and stores the run environment variables on
    ``self.run_env_vars``. The executable arguments are only recorded on
    the instance here.

    Args:
        executable_cls (type): The class that will be created within
            an actor (BaseHorovodWorker). This will allow Horovod
            to establish its connections and set env vars.
        executable_args (List): Arguments to be passed into the
            worker class upon initialization.
        executable_kwargs (Dict): Keyword arguments to be passed into the
            worker class upon initialization.
        extra_env_vars (Dict): Environment variables to be set
            on the actors (worker processes) before initialization.
    """
    self.rendezvous = RendezvousServer(self.settings.verbose)

    driver_options = {
        "rendezvous": self.rendezvous,
        "discovery": self.settings.discovery,
        "min_np": self.min_workers,
        "max_np": self.max_workers,
        "timeout": self.elastic_timeout,
        "reset_limit": self.reset_limit,
        "cooldown_range": self.cooldown_range,
        "verbose": self.settings.verbose,
    }
    self.driver = ElasticDriver(**driver_options)

    rendezvous_handler = create_rendezvous_handler(self.driver)
    logger.debug("[ray] starting rendezvous")
    rendezvous_port = self.rendezvous.start(rendezvous_handler)

    logger.debug(f"[ray] waiting for {self.num_workers} to start.")
    # The result of this first call is intentionally discarded.
    self.driver.wait_for_available_slots(self.num_workers)

    # Host-to-host common interface detection
    # requires at least 2 hosts in an elastic job.
    required_hosts = _get_min_start_hosts(self.settings)
    available_hosts = self.driver.wait_for_available_slots(
        self.num_workers, min_hosts=required_hosts)

    logger.debug("[ray] getting common interfaces")
    common_nics = detect_nics(
        self.settings,
        all_host_names=available_hosts.host_assignment_order,
    )

    logger.debug("[ray] getting driver IP")
    driver_hostname = socket.gethostname()
    driver_ip = socket.gethostbyname(driver_hostname)

    self.run_env_vars = create_run_env_vars(
        driver_ip, common_nics, rendezvous_port, elastic=True)

    # Record what to launch; nothing is started from these values here.
    self.executable_cls = executable_cls
    self.executable_args = executable_args
    self.executable_kwargs = executable_kwargs
    self.env_vars = extra_env_vars if extra_env_vars else {}
def on_start(self, worker_group: WorkerGroup, backend_config: HorovodConfig):
    """Set up Horovod environment variables on every worker in the group.

    Runs ``init_env_vars`` on each worker, registers every worker's
    hostname and node id with a new ``Coordinator``, pushes the per-rank
    env vars it returns, and finally pushes the rendezvous and NIC env
    vars to all workers.

    Args:
        worker_group: The group of workers to configure.
        backend_config: Passed to ``Coordinator`` and ``detect_nics`` as
            the settings object.
    """
    # TODO(matt): Implement placement group strategies in BackendExecutor.

    # Initialize workers with Horovod environment variables
    world_size = len(worker_group)
    init_futures = [
        worker_group.execute_single_async(
            rank,
            init_env_vars,
            rank,
            world_size,
            worker_group.workers[rank].metadata.node_id,
        )
        for rank in range(world_size)
    ]
    ray.get(init_futures)

    # Use Horovod Ray Coordinator
    # backend_config as settings
    self.coordinator = Coordinator(backend_config)

    # Get all the hostnames of all workers
    node_ids = [worker.metadata.node_id for worker in worker_group.workers]
    hostnames = [worker.metadata.hostname for worker in worker_group.workers]

    # Register each hostname to the coordinator. assumes the hostname
    # ordering is the same.
    for rank, hostname in enumerate(hostnames):
        self.coordinator.register(hostname, node_ids[rank], rank)
    rank_env_vars = self.coordinator.finalize_registration()

    update_futures = [
        worker_group.execute_single_async(
            rank, update_env_vars, rank_specific_env
        )
        for rank, rank_specific_env in rank_env_vars.items()
    ]
    ray.get(update_futures)

    shared_env = self.coordinator.establish_rendezvous()

    # Get one worker from each host/node: the first worker (by position)
    # that reports each distinct node id.
    node_workers = [
        HorovodWorkerWrapper(worker_group.workers[node_ids.index(node_id)])
        for node_id in set(node_ids)
    ]
    assert len(node_workers) == len(self.coordinator.hostnames)

    common_nics = detect_nics(
        backend_config,
        all_host_names=list(self.coordinator.hostnames),
        node_workers=node_workers,
    )
    shared_env.update(nics_to_env_var(common_nics))

    worker_group.execute(update_env_vars, shared_env)