class HorovodBackend(Backend):
    """Backend hook that configures a worker group for Horovod.

    ``on_start`` pushes the Horovod-related environment variables to every
    worker, using a ``Coordinator`` to derive the per-rank values and the
    rendezvous settings.
    """

    # Class-level flag; it is not read inside this class.
    share_cuda_visible_devices: bool = True

    def on_start(self, worker_group: WorkerGroup, backend_config: HorovodConfig):
        """Set up Horovod environment variables on all workers.

        Args:
            worker_group: The workers to configure. A worker's position in
                ``worker_group.workers`` is used as its rank.
            backend_config: Passed to ``Coordinator`` as its settings and to
                ``detect_nics``.

        Raises:
            AssertionError: If the number of distinct node ids among the
                workers differs from the number of hostnames known to the
                coordinator.
        """
        # TODO(matt): Implement placement group strategies in BackendExecutor.

        workers = worker_group.workers
        world_size = len(worker_group)

        # Initialize workers with Horovod environment variables
        setup_futures = [
            worker_group.execute_single_async(
                rank, init_env_vars, rank, world_size, worker.metadata.node_id
            )
            for rank, worker in enumerate(workers)
        ]
        ray.get(setup_futures)

        # Use Horovod Ray Coordinator
        # backend_config as settings
        self.coordinator = Coordinator(backend_config)

        # Get the node ids and hostnames of all workers
        node_ids = [w.metadata.node_id for w in workers]
        hostnames = [w.metadata.hostname for w in workers]

        # Register each hostname to the coordinator. assumes the hostname
        # ordering is the same.
        for rank, (hostname, node_id) in enumerate(zip(hostnames, node_ids)):
            self.coordinator.register(hostname, node_id, rank)
        all_info = self.coordinator.finalize_registration()

        # Push the per-rank variables computed by the coordinator.
        setup_futures = [
            worker_group.execute_single_async(rank, update_env_vars, local_cross_env_var)
            for rank, local_cross_env_var in all_info.items()
        ]
        ray.get(setup_futures)

        coordinator_envs = self.coordinator.establish_rendezvous()

        # Get one worker from each host/node: the first worker seen for each
        # node id. A dict keeps insertion order, so the selection is found in
        # a single pass and the resulting order is deterministic (iterating a
        # set of node ids is not).
        first_index_by_node = {}
        for worker_index, node_id in enumerate(node_ids):
            first_index_by_node.setdefault(node_id, worker_index)
        node_workers = [
            HorovodWorkerWrapper(workers[worker_index])
            for worker_index in first_index_by_node.values()
        ]
        # Sanity check: one representative worker per coordinator hostname.
        assert len(node_workers) == len(self.coordinator.hostnames)

        nics = detect_nics(
            backend_config,
            all_host_names=list(self.coordinator.hostnames),
            node_workers=node_workers,
        )
        coordinator_envs.update(nics_to_env_var(nics))

        # Broadcast the rendezvous + NIC variables to every worker.
        worker_group.execute(update_env_vars, coordinator_envs)
def test_cross_rank():
    """Check the cross-size / cross-rank values for unevenly filled hosts.

    Twelve world ranks are registered in ascending order: ranks 0-4 on
    "host1", 5-8 on "host2" and 9-11 on "host3".
    """
    coord = Coordinator(MiniSettings())
    assert coord.world_size == 0
    assert coord.hoststring == ""

    # Contiguous world-rank ranges per host; dict order keeps ranks ascending.
    ranks_by_host = {
        "host1": range(0, 5),
        "host2": range(5, 9),
        "host3": range(9, 12),
    }
    for hostname, host_ranks in ranks_by_host.items():
        for world_rank in host_ranks:
            coord.register(hostname, world_rank=world_rank)
    total_ranks = sum(len(host_ranks) for host_ranks in ranks_by_host.values())

    rank_to_info = coord.finalize_registration()
    assert len(rank_to_info) == total_ranks

    def infos_with_cross_size(size):
        # All per-rank info dicts reporting the given HOROVOD_CROSS_SIZE.
        return [
            info
            for info in rank_to_info.values()
            if info["HOROVOD_CROSS_SIZE"] == size
        ]

    # exactly one rank has cross_size == 1, and its cross_rank is 0
    cross_size_1 = infos_with_cross_size(1)
    assert len(cross_size_1) == 1
    assert cross_size_1[0]["HOROVOD_CROSS_RANK"] == 0

    # exactly two ranks have cross_size == 2 ...
    cross_size_2 = infos_with_cross_size(2)
    assert len(cross_size_2) == 2
    # ... and between them they hold cross ranks 0 and 1
    assert {info["HOROVOD_CROSS_RANK"] for info in cross_size_2} == {0, 1}

    # the remaining nine ranks have cross_size == 3
    cross_size_3 = infos_with_cross_size(3)
    assert len(cross_size_3) == 9
class HorovodBackend(Backend):
    """Backend hook that configures a worker group for Horovod."""

    def on_start(self, worker_group: WorkerGroup, backend_config: HorovodConfig):
        """Set up Horovod environment variables on all workers.

        Args:
            worker_group: The workers to configure; ranks run from 0 to
                ``len(worker_group) - 1``.
            backend_config: Passed to ``Coordinator`` as its settings and to
                ``detect_nics``.
        """
        # TODO(matt): Implement placement group strategies in BackendExecutor.

        # Step 1: seed every worker with its rank and the world size.
        world_size = len(worker_group)
        init_refs = [
            worker_group.execute_single_async(rank, init_env_vars, rank, world_size)
            for rank in range(world_size)
        ]
        ray.get(init_refs)

        # Step 2: the Horovod Ray Coordinator takes backend_config as settings.
        self.coordinator = Coordinator(backend_config)

        # Step 3: collect node ids and hostnames from the workers. The two
        # result lists are assumed to share the same (rank) ordering.
        node_ids = worker_group.execute(get_node_id)
        hostnames = worker_group.execute(get_hostname)
        for rank, host_and_node in enumerate(zip(hostnames, node_ids)):
            hostname, node_id = host_and_node
            self.coordinator.register(hostname, node_id, rank)
        env_by_rank = self.coordinator.finalize_registration()

        # Step 4: push the per-rank variables computed by the coordinator.
        update_refs = [
            worker_group.execute_single_async(rank, update_env_vars, rank_env)
            for rank, rank_env in env_by_rank.items()
        ]
        ray.get(update_refs)

        # Step 5: rendezvous + NIC detection, then broadcast the result.
        rendezvous_env = self.coordinator.establish_rendezvous()
        # NOTE(review): every worker in the group is passed as node_workers
        # here - confirm detect_nics does not expect one worker per host.
        detected_nics = detect_nics(
            backend_config,
            all_host_names=list(self.coordinator.hostnames),
            node_workers=worker_group.workers,
        )
        rendezvous_env.update(nics_to_env_var(detected_nics))
        worker_group.execute(update_env_vars, rendezvous_env)
def test_coordinator_registration():
    """Check the node/local size and rank values for evenly filled hosts.

    Twelve world ranks are spread round-robin over hosts "a", "b" and "c"
    (rank ``r`` goes to host number ``r % 3``), registered host by host.
    """
    coord = Coordinator(MiniSettings())
    assert coord.world_size == 0
    assert coord.hoststring == ""

    host_names = ["a", "b", "c"]
    total_ranks = 12
    for host_index, hostname in enumerate(host_names):
        # Every third rank starting at host_index, i.e. r % 3 == host_index.
        for world_rank in range(host_index, total_ranks, len(host_names)):
            coord.register(hostname, world_rank=world_rank)

    rank_to_info = coord.finalize_registration()
    assert len(rank_to_info) == total_ranks

    infos = rank_to_info.values()
    # Three nodes, so node ranks 0..2.
    assert all(info["NODE_WORLD_SIZE"] == 3 for info in infos)
    assert {info["NODE_WORLD_RANK"] for info in infos} == {0, 1, 2}
    # Four ranks per node, so local ranks 0..3.
    assert all(info["LOCAL_SIZE"] == 4 for info in infos)
    assert {info["LOCAL_RANK"] for info in infos} == {0, 1, 2, 3}