Ejemplo n.º 1
0
    def test_host_shutdown_on_worker_failure(self):
        """Tests two hosts, two slots each, where a worker failure on 'host-1' leaves only 'host-2' results.

        The worker with local rank 0 on 'host-1' exits with a non-zero code
        without reporting ready. The other 'host-1' worker reports ready, waits
        on its events and then exits with a non-zero code as well. Workers on
        'host-2' report ready, record the slot info they were launched with
        next to the slot info the driver reports for them, and exit with 0.

        The assertions expect exactly two successful results and a final
        world of size 2 located on a single host (cross_size 1, cross_rank 0).
        """
        slots = {'host-1': 2, 'host-2': 2}
        discovery = FixedHosts(slots)

        driver = ElasticDriver(mock.Mock(), discovery, min_np=2, max_np=4)

        # Launch-time rank -> (slot info at launch, slot info reported by the driver).
        rank_results = {}

        def exec_command(slot_info, events):
            if slot_info.hostname == 'host-1':
                if slot_info.local_rank == 0:
                    # Fail right away, before ever reporting ready.
                    return 1, time.time()

                driver.record_ready(slot_info.hostname, slot_info.local_rank)
                # NOTE(review): presumably blocks until one of the events is set -- confirm
                # against wait_for_one's definition.
                wait_for_one(events)
                return 1, time.time()

            driver.record_ready(slot_info.hostname, slot_info.local_rank)
            updated_slot_info = driver.get_slot_info(slot_info.hostname, slot_info.local_rank)
            rank_results[slot_info.rank] = (slot_info, updated_slot_info)
            return 0, time.time()

        # Always stop the driver, so an error while waiting, starting or collecting
        # results does not leave it running behind a failed test.
        try:
            driver.wait_for_available_slots(min_np=2)
            driver.start(np=2, create_worker_fn=exec_command)
            res = driver.get_results()
        finally:
            driver.stop()

        assert len(res) == 2
        for name, (exit_code, timestamp) in res.items():
            assert exit_code == 0, name

        assert len(rank_results) == 2
        for rank, (slot_info, updated_slot_info) in rank_results.items():
            assert updated_slot_info.size == 2, rank
            assert updated_slot_info.rank == slot_info.rank % 2, rank
            assert updated_slot_info.local_size == slot_info.local_size, rank
            assert updated_slot_info.local_rank == slot_info.local_rank, rank
            assert updated_slot_info.cross_size == 1, rank
            assert updated_slot_info.cross_rank == 0, rank
Ejemplo n.º 2
0
    def test_shutdown_on_initial_discovery_failure(self):
        """Tests that the driver will shutdown immediately if initial host discovery fails.

        The discovery mock raises ``RuntimeError`` from
        ``find_available_hosts_and_slots``. Waiting for slots is then expected
        to raise ``RuntimeError`` and the driver is expected to report
        ``finished()``.
        """
        discovery = mock.Mock()
        discovery.find_available_hosts_and_slots.side_effect = RuntimeError()

        discover_hosts = ElasticDriver._discover_hosts

        def wrapped_discover_hosts(obj):
            try:
                discover_hosts(obj)
            except RuntimeError:
                # Suppress the error message from the background discovery thread to clean up unit tests
                pass

        # patch.object installs the wrapper for the duration of the block and
        # restores the original method on exit, including when an assertion fails.
        with mock.patch.object(ElasticDriver, '_discover_hosts', wrapped_discover_hosts):
            driver = ElasticDriver(mock.Mock(), discovery, min_np=2, max_np=4)
            with pytest.raises(RuntimeError):
                driver.wait_for_available_slots(min_np=2)
            assert driver.finished()
Ejemplo n.º 3
0
def gloo_run_elastic(settings, env, command):
    """Run an elastic Gloo job and block until the driver reports results.

    Starts a rendezvous server and an ``ElasticDriver``, waits for the
    requested slots, launches workers through the driver and collects their
    results. Once the rendezvous server is running, the driver and the server
    are always stopped, whether the job completes or an exception is raised.

    Args:
        settings: Job settings; this function reads ``output_filename``,
            ``verbose``, ``discovery``, ``min_np``, ``max_np``,
            ``elastic_timeout`` and ``num_proc``, and passes the object on to
            helper functions.
        env: Passed through to the elastic worker factory.
        command: Command handed to ``get_run_command``.

    Raises:
        RuntimeError: If any process exited with a non-zero status. The
            process reported is the failing one with the smallest timestamp.
    """
    # Make the output directory if it does not exist
    if settings.output_filename:
        _mkdir_p(settings.output_filename)

    rendezvous = RendezvousServer(settings.verbose)
    driver = ElasticDriver(rendezvous,
                           settings.discovery,
                           settings.min_np,
                           settings.max_np,
                           timeout=settings.elastic_timeout,
                           verbose=settings.verbose)

    handler = create_rendezvous_handler(driver)
    global_rendezv_port = rendezvous.start_server(handler)

    try:
        # Host-to-host common interface detection requires at least 2 hosts in an elastic job.
        min_hosts = _get_min_start_hosts(settings)
        current_hosts = driver.wait_for_available_slots(settings.num_proc,
                                                        min_hosts=min_hosts)

        nics = driver_service.get_common_interfaces(
            settings, current_hosts.host_assignment_order)
        server_ip = network.get_driver_ip(nics)

        exec_command = _exec_command_fn(settings)
        event = register_shutdown_event()
        run_command = get_run_command(command,
                                      server_ip,
                                      nics,
                                      global_rendezv_port,
                                      elastic=True)
        create_worker = _create_elastic_worker_fn(exec_command, run_command, env,
                                                  event)

        driver.start(settings.num_proc, create_worker)
        res = driver.get_results()
    finally:
        # Same order as the success path (driver first, then server). The nested
        # finally makes sure the server is stopped even if stopping the driver raises.
        try:
            driver.stop()
        finally:
            rendezvous.stop_server()

    # Walk results in timestamp order so the first failure found is the earliest one.
    for name, value in sorted(res.items(), key=lambda item: item[1][1]):
        exit_code, timestamp = value
        if exit_code != 0:
            raise RuntimeError(
                'Horovod detected that one or more processes exited with non-zero '
                'status, thus causing the job to be terminated. The first process '
                'to do so was:\nProcess name: {name}\nExit code: {code}\n'.
                format(name=name, code=exit_code))
Ejemplo n.º 4
0
    def test_worker_notification_manager(self):
        """Tests that host add events are sent to the worker notification service and consumed.

        Two workers run on 'host-1'. Each registers a listener with a
        ``WorkerNotificationManager``. The rank 0 worker changes the discovered
        hosts twice (adding 'host-2', then leaving only 'host-2'). The
        assertions expect rank 0 to have recorded two notifications and the
        other rank none.
        """
        slots = {'host-1': 2}
        discovery = FixedHosts(slots)

        rendezvous = RendezvousServer()
        driver = ElasticDriver(rendezvous, discovery, min_np=2, max_np=4)
        driver.wait_for_available_slots(min_np=2)
        handler = create_rendezvous_handler(driver)

        common_intfs = network.get_local_intfs()
        addr = network.get_driver_ip(common_intfs)
        # Pick the NIC before the server is started, so a failure here cannot
        # leave a running server behind.
        nic = list(common_intfs)[0]

        # Rank -> list of timestamps received by that worker's listener.
        rank_results = {}

        class NotificationReceiver:
            """Listener that records the timestamp of every hosts-updated notification."""

            def __init__(self):
                self.events = []

            def on_hosts_updated(self, timestamp):
                self.events.append(timestamp)

        def add_host():
            slots = {'host-1': 2, 'host-2': 2}
            discovery.set(slots)

        def remove_host():
            slots = {'host-2': 2}
            discovery.set(slots)

        def exec_command(slot_info, events):
            manager = WorkerNotificationManager()
            manager.init(rendezvous_addr=addr,
                         rendezvous_port=port,
                         nic=nic,
                         hostname=slot_info.hostname,
                         local_rank=slot_info.local_rank)

            notification_receiver = NotificationReceiver()
            manager.register_listener(notification_receiver)

            driver.record_ready(slot_info.hostname, slot_info.local_rank)

            # Only rank 0 mutates the discovered hosts; every worker waits for the effect.
            if slot_info.rank == 0:
                add_host()
            driver.wait_for_available_slots(4)

            if slot_info.rank == 0:
                remove_host()

            # Busy wait for the number of available slots to decrease
            # NOTE(review): this loop has no timeout and will spin forever if the
            # slot count never drops.
            while driver._host_manager.current_hosts.count_available_slots(
            ) > 2:
                time.sleep(0.01)

            rank_results[slot_info.rank] = notification_receiver.events
            return 0, time.time()

        port = rendezvous.start_server(handler)
        try:
            # Stop the driver even if starting or collecting results raises.
            try:
                driver.start(np=2, create_worker_fn=exec_command)
                res = driver.get_results()
            finally:
                driver.stop()

            assert len(res) == 2
            for name, (exit_code, timestamp) in res.items():
                assert exit_code == 0, name

            assert len(rank_results) == 2
            for rank, timestamps in rank_results.items():
                expected = 2 if rank == 0 else 0
                assert len(timestamps) == expected, rank
        finally:
            # Runs on assertion failure as well, so the server never outlives the test.
            rendezvous.stop_server()