def __init__(self, ray_ctx, verbose=None, start_timeout=None):
    """Set up a Horovod (gloo controller) job on a Ray cluster.

    Creates one remote worker actor per Ray node, starts a gloo
    rendezvous server on the driver, and prepares the per-worker
    Horovod environment variables in ``self.per_worker_envs``.

    Args:
        ray_ctx: Ray context object; only ``ray_node_cpu_cores`` and
            ``num_ray_nodes`` are read from it here.
        verbose: if truthy, Horovod settings use verbosity level 2,
            otherwise 0.
        start_timeout: seconds to wait for Horovod startup activities;
            defaults to the ``HOROVOD_START_TIMEOUT`` env var, or 30.
    """
    self.cores_per_node = ray_ctx.ray_node_cpu_cores
    self.num_nodes = ray_ctx.num_ray_nodes
    # One Ray actor per node; the actor class is built to pin
    # cores_per_node CPUs per worker (see make_horovod_worker).
    self.worker_class = make_horovod_worker(self.cores_per_node)
    self.remote_workers = [self.worker_class.remote() for i in range(0, self.num_nodes)]
    # Resolve each worker's hostname so an allocation plan can be built.
    hosts = ray.get([worker.hostname.remote() for worker in self.remote_workers])
    # hosts_spec: "host:slots" strings; name_rank_to_id maps
    # (hostname, local_rank) -> worker index; host_to_size maps
    # hostname -> slot count. (Helper defined elsewhere in this file.)
    hosts_spec, name_rank_to_id, host_to_size = _hosts_to_hosts_spec(hosts)
    self.host_alloc_plan = _allocate(",".join(hosts_spec), self.num_nodes)
    # Gloo rendezvous server runs on the driver; workers connect to it
    # via the HOROVOD_GLOO_RENDEZVOUS_* env vars set below.
    global_rendezv = RendezvousServer(True)
    global_rendezv_port = global_rendezv.start_server(self.host_alloc_plan)
    if start_timeout is None:
        start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30'))
    # '{activity}' is a placeholder filled in by Horovod's Timeout class.
    tmout = timeout.Timeout(start_timeout,
                            message='Timed out waiting for {activity}. Please '
                                    'check connectivity between servers. You '
                                    'may need to increase the --start-timeout '
                                    'parameter if you have too many servers.')
    all_host_names = [k for k in host_to_size]
    settings = hvd_settings.Settings(verbose=2 if verbose else 0,
                                     key=secret.make_secret_key(),
                                     timeout=tmout,
                                     num_hosts=len(all_host_names),
                                     num_proc=self.num_nodes,
                                     hosts=",".join(hosts_spec))
    # Find a network interface visible to the driver and all workers,
    # then pick one (set iteration order is arbitrary — the first common
    # interface found is used). TODO confirm single-interface pick is OK.
    common_intfs = _find_common_network_interface(host_to_size, name_rank_to_id,
                                                  self.remote_workers, settings)
    iface = list(common_intfs)[0]
    driver_ip = _get_driver_ip([iface])
    # Base env shared by every worker; copied and specialized per rank below.
    common_envs = {
        "HOROVOD_GLOO_RENDEZVOUS_ADDR": driver_ip,
        "HOROVOD_GLOO_RENDEZVOUS_PORT": str(global_rendezv_port),
        "HOROVOD_CONTROLLER": "gloo",
        "HOROVOD_CPU_OPERATIONS": "gloo",
        "HOROVOD_GLOO_IFACE": iface,
        "PYTHONUNBUFFERED": '1',
    }
    # Forward any HOROVOD* env vars already set on the driver.
    for key in os.environ:
        if key.startswith("HOROVOD"):
            common_envs[key] = os.environ[key]
    # todo support other Horovod envs
    # Per-worker env dicts: copy the common env, then add the rank/size
    # variables from this worker's slot in the allocation plan.
    self.per_worker_envs = [common_envs.copy() for _ in range(self.num_nodes)]
    for alloc_info in self.host_alloc_plan:
        key = (alloc_info.hostname, alloc_info.local_rank)
        local_envs = self.per_worker_envs[name_rank_to_id[key]]
        local_envs["HOROVOD_RANK"] = str(alloc_info.rank)
        local_envs["HOROVOD_SIZE"] = str(alloc_info.size)
        local_envs["HOROVOD_LOCAL_RANK"] = str(alloc_info.local_rank)
        local_envs["HOROVOD_LOCAL_SIZE"] = str(alloc_info.local_size)
        local_envs["HOROVOD_CROSS_RANK"] = str(alloc_info.cross_rank)
        local_envs["HOROVOD_CROSS_SIZE"] = str(alloc_info.cross_size)
def _driver_fn(client, net_if): cluster_tasks = _task_commons._get_cluster_tasks(client) # Worker discovery worker_list = [f"{net_if[1]}:{N_PROCESS_PER_WORKER}"] n_workers = 1 for cluster_task in cluster_tasks: if 'worker' in cluster_task: worker_addr = event.wait(client, f"{cluster_task}/addr") logger.info(f"{cluster_task}: {worker_addr}") worker_list.append(f"{worker_addr}:{N_PROCESS_PER_WORKER}") n_workers += 1 # Worker task allocation to workers host_alloc_plan = gloo_run._allocate(','.join(worker_list), n_workers) for host in host_alloc_plan: host_info = f"""\ {host.rank},{host.size},{host.local_rank},\ {host.local_size},{host.cross_rank},{host.cross_size}\ """ event.broadcast(client, f"{cluster.get_task()}/{host.hostname}", host_info) global_rendezv = RendezvousServer(verbose=1) global_rendezv_port = global_rendezv.start_server(host_alloc_plan) event.broadcast(client, f"{cluster.get_task()}/sock_addr", f"{net_if[1]}:{global_rendezv_port}") return global_rendezv.listen_thread
def __init__(self, ray_ctx, worker_cls=None, worker_param=None, workers_per_node=1):
    """Set up a Horovod (gloo controller) runner on a Ray cluster.

    Unlike the single-worker-per-node variant, this supports multiple
    workers per Ray node: CPU cores are divided evenly among them.

    Args:
        ray_ctx: Ray context; ``ray_node_cpu_cores`` and
            ``num_ray_nodes`` are read from it.
        worker_cls: optional user worker class, merged with
            ``HorovodWorker`` by ``make_worker``.
        worker_param: keyword arguments passed to each worker actor's
            constructor; defaults to an empty dict.
        workers_per_node: number of Horovod workers per Ray node.
    """
    from horovod.run.gloo_run import RendezvousServer, _allocate
    # Split each node's cores among its workers (integer division —
    # leftover cores are unused).
    self.cores_per_node = ray_ctx.ray_node_cpu_cores // workers_per_node
    self.num_nodes = ray_ctx.num_ray_nodes * workers_per_node
    if worker_param is None:
        worker_param = {}
    worker_cls = make_worker(worker_cls, HorovodWorker)
    # Each actor reserves cores_per_node CPUs so Ray spreads workers
    # across nodes.
    self.worker_class = ray.remote(
        num_cpus=self.cores_per_node)(worker_cls)
    self.remote_workers = [
        self.worker_class.remote(**worker_param)
        for i in range(0, self.num_nodes)
    ]
    # Collect worker IPs to build the "host:slots" spec and allocation plan.
    hosts = ray.get(
        [worker.ip_addr.remote() for worker in self.remote_workers])
    hosts_spec, name_rank_to_id, host_to_size = _hosts_to_hosts_spec(hosts)
    self.host_alloc_plan = _allocate(",".join(hosts_spec), self.num_nodes)
    # Gloo rendezvous server runs on the driver node.
    global_rendezv = RendezvousServer(True)
    global_rendezv_port = global_rendezv.start_server(self.host_alloc_plan)
    driver_ip = ray.services.get_node_ip_address()
    # Base env shared by all workers. No HOROVOD_GLOO_IFACE here —
    # workers set their own interface via set_gloo_iface below.
    common_envs = {
        "HOROVOD_GLOO_RENDEZVOUS_ADDR": driver_ip,
        "HOROVOD_GLOO_RENDEZVOUS_PORT": str(global_rendezv_port),
        "HOROVOD_CONTROLLER": "gloo",
        "HOROVOD_CPU_OPERATIONS": "gloo",
        "PYTHONUNBUFFERED": '1',
        "OMP_NUM_THREADS": str(self.cores_per_node)
    }
    # Forward any HOROVOD* env vars already set on the driver.
    for key in os.environ:
        if key.startswith("HOROVOD"):
            common_envs[key] = os.environ[key]
    # todo support other Horovod envs
    # Per-worker env dicts: common env plus this slot's rank/size values.
    self.per_worker_envs = [
        common_envs.copy() for _ in range(self.num_nodes)
    ]
    for alloc_info in self.host_alloc_plan:
        key = (alloc_info.hostname, alloc_info.local_rank)
        local_envs = self.per_worker_envs[name_rank_to_id[key]]
        local_envs["HOROVOD_RANK"] = str(alloc_info.rank)
        local_envs["HOROVOD_SIZE"] = str(alloc_info.size)
        local_envs["HOROVOD_LOCAL_RANK"] = str(alloc_info.local_rank)
        local_envs["HOROVOD_LOCAL_SIZE"] = str(alloc_info.local_size)
        local_envs["HOROVOD_CROSS_RANK"] = str(alloc_info.cross_rank)
        local_envs["HOROVOD_CROSS_SIZE"] = str(alloc_info.cross_size)
    # Each worker picks its own gloo network interface locally.
    ray.get(
        [worker.set_gloo_iface.remote() for worker in self.remote_workers])
    # Smoke-test run to force worker initialization before returning.
    self.run(lambda: print("horovod worker initialized"))