Example #1
0
class HorovodBackend(Backend):
    """Backend hook that prepares a worker group for Horovod.

    ``on_start`` pushes environment variables to the workers in three
    rounds: initial per-rank variables, the per-rank values computed by the
    Horovod Ray ``Coordinator``, and finally the rendezvous / NIC variables
    shared by every worker.
    """

    # Not read inside this class; presumably consumed by the surrounding
    # framework when assigning GPUs -- TODO confirm.
    share_cuda_visible_devices: bool = True

    def on_start(self, worker_group: WorkerGroup, backend_config: HorovodConfig):
        """Set up the Horovod environment on every worker of ``worker_group``.

        Args:
            worker_group: The workers to configure. A worker's position in
                ``worker_group.workers`` is used as its rank.
            backend_config: Handed to ``Coordinator`` as its settings object
                and to ``detect_nics``.

        Side effects:
            Stores the created ``Coordinator`` on ``self.coordinator`` and
            blocks (via ``ray.get``) until each round of per-worker updates
            has finished.
        """
        # TODO(matt): Implement placement group strategies in BackendExecutor.

        workers = worker_group.workers
        world_size = len(worker_group)
        node_ids = [w.metadata.node_id for w in workers]

        # Round 1: initialize workers with Horovod environment variables.
        ray.get(
            [
                worker_group.execute_single_async(
                    rank, init_env_vars, rank, world_size, node_ids[rank]
                )
                for rank in range(world_size)
            ]
        )

        # Use Horovod Ray Coordinator, with backend_config as its settings.
        self.coordinator = Coordinator(backend_config)

        # Register each hostname with the coordinator. Assumes hostnames and
        # node ids are listed in the same (rank) order.
        hostnames = [w.metadata.hostname for w in workers]
        for rank, (hostname, node_id) in enumerate(zip(hostnames, node_ids)):
            self.coordinator.register(hostname, node_id, rank)
        all_info = self.coordinator.finalize_registration()

        # Round 2: send each rank the env vars the coordinator computed for it.
        ray.get(
            [
                worker_group.execute_single_async(
                    rank, update_env_vars, local_cross_env_var
                )
                for rank, local_cross_env_var in all_info.items()
            ]
        )

        coordinator_envs = self.coordinator.establish_rendezvous()

        # Get one worker from each host/node: the first worker seen for each
        # node id. A dict keeps first-seen order, so the selection is
        # deterministic (iterating a set of node ids is not) and needs only a
        # single pass instead of one list.index() scan per node.
        first_index_by_node = {}
        for index, node_id in enumerate(node_ids):
            first_index_by_node.setdefault(node_id, index)
        node_workers = [
            HorovodWorkerWrapper(workers[index])
            for index in first_index_by_node.values()
        ]
        assert len(node_workers) == len(self.coordinator.hostnames), (
            "expected exactly one representative worker per registered host"
        )

        nics = detect_nics(
            backend_config,
            all_host_names=list(self.coordinator.hostnames),
            node_workers=node_workers,
        )
        coordinator_envs.update(nics_to_env_var(nics))

        # Round 3: broadcast the rendezvous and NIC settings to all workers.
        worker_group.execute(update_env_vars, coordinator_envs)
Example #2
0
def test_cross_rank():
    """Check HOROVOD_CROSS_SIZE / HOROVOD_CROSS_RANK for uneven hosts.

    Twelve ranks are registered on three hosts holding 5, 4 and 3 ranks
    respectively, then the per-rank info from ``finalize_registration`` is
    grouped by its cross size.
    """
    coord = Coordinator(MiniSettings())
    assert coord.world_size == 0
    assert coord.hoststring == ""

    # Ranks 0-4 live on host1, 5-8 on host2 and 9-11 on host3.
    host_by_rank = ["host1"] * 5 + ["host2"] * 4 + ["host3"] * 3
    ranks = list(range(12))
    for r in ranks:
        coord.register(host_by_rank[r], world_rank=r)

    rank_to_info = coord.finalize_registration()
    assert len(rank_to_info) == len(ranks)

    def infos_with_cross_size(size):
        # All per-rank info dicts whose HOROVOD_CROSS_SIZE equals ``size``.
        return [
            info for info in rank_to_info.values()
            if info["HOROVOD_CROSS_SIZE"] == size
        ]

    # Exactly one rank has cross_size == 1, and its cross_rank is 0.
    alone = infos_with_cross_size(1)
    assert len(alone) == 1
    assert alone[0]["HOROVOD_CROSS_RANK"] == 0

    # Exactly two ranks have cross_size == 2 ...
    pairs = infos_with_cross_size(2)
    assert len(pairs) == 2

    # ... and between them they cover cross ranks 0 and 1.
    assert {d["HOROVOD_CROSS_RANK"] for d in pairs} == {0, 1}

    # The remaining nine ranks have cross_size == 3.
    triples = infos_with_cross_size(3)
    assert len(triples) == 9
Example #3
0
class HorovodBackend(Backend):
    """Backend hook that prepares a worker group for Horovod."""

    def on_start(self, worker_group: WorkerGroup,
                 backend_config: HorovodConfig):
        """Set up the Horovod environment on every worker.

        Environment variables are pushed in three rounds: the initial
        per-rank variables, the per-rank values computed by the Horovod Ray
        ``Coordinator``, and the rendezvous / NIC variables shared by all
        workers. The coordinator is kept on ``self.coordinator``.

        Args:
            worker_group: The workers to configure.
            backend_config: Used as the ``Coordinator`` settings and passed
                to ``detect_nics``.
        """
        # TODO(matt): Implement placement group strategies in BackendExecutor.

        # Round 1: initialize workers with Horovod environment variables.
        world_size = len(worker_group)
        init_refs = [
            worker_group.execute_single_async(rank, init_env_vars, rank,
                                              world_size)
            for rank in range(world_size)
        ]
        ray.get(init_refs)

        # Use Horovod Ray Coordinator, with backend_config as its settings.
        coord = self.coordinator = Coordinator(backend_config)

        # Collect the node id and hostname reported by each worker.
        node_ids = worker_group.execute(get_node_id)
        hostnames = worker_group.execute(get_hostname)

        # Register each hostname with the coordinator. Assumes both lists
        # come back in the same (rank) order.
        for rank, host_and_node in enumerate(zip(hostnames, node_ids)):
            hostname, node_id = host_and_node
            coord.register(hostname, node_id, rank)
        all_info = coord.finalize_registration()

        # Round 2: send each rank the env vars computed for it.
        update_refs = [
            worker_group.execute_single_async(rank, update_env_vars,
                                              local_cross_env_var)
            for rank, local_cross_env_var in all_info.items()
        ]
        ray.get(update_refs)

        coordinator_envs = coord.establish_rendezvous()

        all_host_names = list(coord.hostnames)
        nics = detect_nics(backend_config,
                           all_host_names=all_host_names,
                           node_workers=worker_group.workers)
        coordinator_envs.update(nics_to_env_var(nics))

        # Round 3: broadcast the rendezvous and NIC settings to all workers.
        worker_group.execute(update_env_vars, coordinator_envs)
Example #4
0
def test_coordinator_registration():
    """Check node/local sizes and ranks for an even 3-host layout.

    Twelve ranks are registered round-robin on hosts "a", "b" and "c"
    (rank ``r`` goes to host number ``r % 3``), host by host, and the
    per-rank info from ``finalize_registration`` is then inspected.
    """
    coord = Coordinator(MiniSettings())
    assert coord.world_size == 0
    assert coord.hoststring == ""

    ranks = list(range(12))
    # Register host by host; host number i owns ranks i, i + 3, i + 6, ...
    for offset, hostname in enumerate(["a", "b", "c"]):
        for r in ranks[offset::3]:
            coord.register(hostname, world_rank=r)

    rank_to_info = coord.finalize_registration()
    assert len(rank_to_info) == len(ranks)

    infos = rank_to_info.values()
    # Three nodes in total, with node ranks 0..2 all present.
    assert all(info["NODE_WORLD_SIZE"] == 3 for info in infos)
    node_ranks = {info["NODE_WORLD_RANK"] for info in infos}
    assert node_ranks == {0, 1, 2}
    # Four ranks per node, with local ranks 0..3 all present.
    assert all(info["LOCAL_SIZE"] == 4 for info in infos)
    local_ranks = {info["LOCAL_RANK"] for info in infos}
    assert local_ranks == {0, 1, 2, 3}