def _start_executables(self, executable_cls, executable_args, executable_kwargs):
    """Launch the executable on every worker actor, blocking until all start.

    Args:
        executable_cls (type): Class to instantiate inside each worker actor.
        executable_args (List): Positional args forwarded to the executable.
        executable_kwargs (Dict): Keyword args forwarded to the executable.
    """
    map_blocking(
        lambda worker: worker.start_executable.remote(
            executable_cls, executable_args, executable_kwargs),
        self.workers)
def start(self,
          executable_cls: type = None,
          executable_args: Optional[List] = None,
          executable_kwargs: Optional[Dict] = None,
          extra_env_vars: Optional[Dict] = None):
    """Starts the workers and colocates them on all machines.

    We implement a node grouping because it seems like our implementation
    doesn't quite work for imbalanced nodes. Also, colocation performance
    is typically much better than non-colocated workers.

    Args:
        executable_cls (type): The class that will be created within
            an actor (BaseHorovodWorker). This will allow Horovod
            to establish its connections and set env vars.
        executable_args (List): Arguments to be passed into the
            worker class upon initialization.
        executable_kwargs (Dict): Keyword arguments to be passed into the
            worker class upon initialization.
        extra_env_vars (Dict): Environment variables to be set
            on the actors (worker processes) before initialization.
    """
    # Normalize all optional collection arguments up front so downstream
    # code (and the worker actors) never see None where a container is
    # expected. Fix: executable_kwargs was previously forwarded as None,
    # unlike executable_args which was defaulted to [].
    extra_env_vars = extra_env_vars or {}
    executable_args = executable_args or []
    executable_kwargs = executable_kwargs or {}

    self.strategy = self._create_strategy()
    self.coordinator = Coordinator(self.settings)
    self.workers, node_workers = self.strategy.create_workers()

    # Get all the hostnames of all workers.
    node_ids = map_blocking(lambda w: w.node_id.remote(), self.workers)
    hostnames = map_blocking(lambda w: w.hostname.remote(), self.workers)

    # Register each hostname to the coordinator. Assumes the hostname
    # ordering is the same as the worker ordering.
    for rank, (hostname, node_id) in enumerate(zip(hostnames, node_ids)):
        self.coordinator.register(hostname, node_id, rank)
    all_info = self.coordinator.finalize_registration()

    # Push per-rank cross-node env vars to the matching worker actor.
    indexed_runners = dict(enumerate(self.workers))
    for rank, local_cross_env_var in all_info.items():
        indexed_runners[rank].update_env_vars.remote(local_cross_env_var)

    # Rendezvous env vars first, then caller overrides, then NIC settings.
    coordinator_envs = self.coordinator.establish_rendezvous()
    coordinator_envs.update(extra_env_vars)
    nics = detect_nics(
        self.settings,
        all_host_names=list(self.coordinator.hostnames),
        node_workers=node_workers)
    coordinator_envs.update(nics_to_env_var(nics))

    map_blocking(lambda w: w.update_env_vars.remote(coordinator_envs),
                 self.workers)

    self._start_executables(executable_cls, executable_args,
                            executable_kwargs)
def get_node_workers(cls, workers):
    """Returns list of one worker per node to use for NIC detection.

    Args:
        workers: Worker actor handles whose hostnames will be resolved.
    """
    # In some setups (i.e., Peloton), ray nodes may not have unique host
    # names; collapsing through a dict keeps one worker per hostname
    # (the last one seen, matching the original loop's overwrite order).
    resolved_hosts = map_blocking(lambda w: w.hostname.remote(), workers)
    per_host = {host: worker for host, worker in zip(resolved_hosts, workers)}
    return list(per_host.values())