Example #1
0
def _driver_fn(client, net_if):
    """Discover workers, publish per-host slot info and start the rendezvous server.

    The driver's own address (``net_if[1]``) is always the first host; every
    cluster task whose name contains ``'worker'`` contributes one more host
    once its ``<task>/addr`` event is available.

    :param client: handle passed through to ``event.wait`` / ``event.broadcast``
        and ``_task_commons._get_cluster_tasks``
    :param net_if: indexable whose element ``1`` is used as this driver's address
    :return: the ``listen_thread`` attribute of the started rendezvous server
    """
    cluster_tasks = _task_commons._get_cluster_tasks(client)

    # Worker discovery: the driver itself occupies the first entry.
    host_specs = [f"{net_if[1]}:{N_PROCESS_PER_WORKER}"]
    # Lazy filter so each task is tested right before its address is awaited.
    worker_tasks = (task for task in cluster_tasks if 'worker' in task)
    for task in worker_tasks:
        addr = event.wait(client, f"{task}/addr")
        logger.info(f"{task}: {addr}")
        host_specs.append(f"{addr}:{N_PROCESS_PER_WORKER}")
    # One process is requested per discovered host (driver included).
    n_workers = len(host_specs)

    # Worker task allocation to workers
    parsed_hosts = gloo_run.parse_hosts(','.join(host_specs))
    host_alloc_plan = gloo_run.get_host_assignments(parsed_hosts, n_workers)
    for host in host_alloc_plan:
        # NOTE(review): the continuation lines keep their leading spaces inside
        # the payload; presumably the receivers tolerate that -- confirm before
        # changing the format.
        host_info = f"""\
            {host.rank},{host.size},{host.local_rank},\
            {host.local_size},{host.cross_rank},{host.cross_size}\
            """
        event.broadcast(client, f"{get_task()}/{host.hostname}", host_info)

    # Start the rendezvous server, hand it the plan, then announce its address.
    rendezvous_server = RendezvousServer(verbose=1)
    rendezvous_port = rendezvous_server.start_server()
    rendezvous_server.httpd.init(host_alloc_plan)
    sock_addr_key = f"{get_task()}/sock_addr"
    sock_addr = f"{net_if[1]}:{rendezvous_port}"
    event.broadcast(client, sock_addr_key, sock_addr)
    return rendezvous_server.listen_thread
Example #2
0
def gloo_run(settings, remote_host_names, common_intfs, env, server_ip,
             command):
    """Allocate slots, start the rendezvous server and launch the gloo jobs.

    :param settings: object providing ``hosts``, ``num_proc`` and ``verbose``
    :param remote_host_names: forwarded unchanged to ``_launch_jobs``
    :param common_intfs: iterable of network interface names; the first one
        becomes ``HOROVOD_GLOO_IFACE``, all of them ``NCCL_SOCKET_IFNAME``
    :param env: environment forwarded to ``_launch_jobs``
    :param server_ip: address advertised as the rendezvous server address
    :param command: iterable of command parts; each part is shell-quoted
    """
    # Map the requested number of processes onto the available host slots.
    slot_plan = _allocate(settings.hosts, settings.num_proc)

    # Bring up the global rendezvous server; start_server reports its port.
    rendezvous_server = RendezvousServer(settings.verbose)
    rendezvous_port = rendezvous_server.start_server(slot_plan)

    # TODO: add multiple ifaces in future
    primary_iface = list(common_intfs)[0]

    # Environment assignments are prepended to the user command.
    command_template = (
        'HOROVOD_GLOO_RENDEZVOUS_ADDR={addr} '
        'HOROVOD_GLOO_RENDEZVOUS_PORT={port} '
        'HOROVOD_CONTROLLER=gloo '
        'HOROVOD_CPU_OPERATIONS=gloo '
        'HOROVOD_GLOO_IFACE={iface} '
        'NCCL_SOCKET_IFNAME={common_intfs} '
        '{command}')  # expect a lot of environment variables
    joined_intfs = ','.join(common_intfs)
    quoted_command = ' '.join(quote(par) for par in command)
    run_command = command_template.format(
        addr=server_ip,
        port=rendezvous_port,
        iface=primary_iface,
        common_intfs=joined_intfs,
        command=quoted_command)

    _launch_jobs(settings, env, slot_plan, remote_host_names, run_command)
Example #3
0
def gloo_run_elastic(settings, env, command):
    """Run ``command`` as an elastic gloo job and raise if any worker failed.

    A rendezvous server and an ``ElasticDriver`` are created, the driver waits
    for enough slots, the common network interfaces are detected, and workers
    are started through the driver.

    The rendezvous server is stopped in a ``finally`` clause so that it does
    not keep running when slot discovery, interface detection or the driver
    itself raises.

    :param settings: object providing ``output_filename``, ``verbose``,
        ``discovery``, ``min_np``, ``max_np``, ``elastic_timeout`` and
        ``num_proc``
    :param env: environment passed on to the worker factory
    :param command: command to launch on every worker
    :raises RuntimeError: if a worker reports a non-zero exit code; results are
        scanned in ascending order of their second tuple element and the first
        failing one is reported
    """
    # Make the output directory if it does not exist
    if settings.output_filename:
        _mkdir_p(settings.output_filename)

    rendezvous = RendezvousServer(settings.verbose)
    driver = ElasticDriver(rendezvous,
                           settings.discovery,
                           settings.min_np,
                           settings.max_np,
                           timeout=settings.elastic_timeout,
                           verbose=settings.verbose)

    handler = create_rendezvous_handler(driver)
    global_rendezv_port = rendezvous.start_server(handler)

    # Everything after start_server is guarded so the server is always stopped.
    try:
        # Host-to-host common interface detection requires at least 2 hosts in an elastic job.
        min_hosts = _get_min_start_hosts(settings)
        current_hosts = driver.wait_for_available_slots(settings.num_proc,
                                                        min_hosts=min_hosts)

        nics = driver_service.get_common_interfaces(
            settings, current_hosts.host_assignment_order)
        server_ip = network.get_driver_ip(nics)

        exec_command = _exec_command_fn(settings)
        event = register_shutdown_event()
        run_command = get_run_command(command,
                                      server_ip,
                                      nics,
                                      global_rendezv_port,
                                      elastic=True)
        create_worker = _create_elastic_worker_fn(exec_command, run_command,
                                                  env, event)

        driver.start(settings.num_proc, create_worker)
        res = driver.get_results()
        driver.stop()
    finally:
        rendezvous.stop_server()

    # Each result value is an (exit_code, timestamp) pair; sort by timestamp so
    # the earliest failure is the one reported.
    for name, (exit_code, _timestamp) in sorted(res.items(),
                                                key=lambda item: item[1][1]):
        if exit_code != 0:
            raise RuntimeError(
                'Horovod detected that one or more processes exited with non-zero '
                'status, thus causing the job to be terminated. The first process '
                'to do so was:\nProcess name: {name}\nExit code: {code}\n'.
                format(name=name, code=exit_code))
Example #4
0
def launch_gloo(command, exec_command, settings, nics, env, server_ip):
    """
    Runs the given command once per allocated slot using gloo.
    Every invocation goes through exec_command.

    :param command: command to launch
    :param exec_command: means to execute a single command
    :param settings: settings for the distribution
    :param nics: common interfaces
    :param env: environment to use
    :param server_ip: ip to use for rendezvous server
    :raises RuntimeError: if any launched process reports a non-zero exit code
    """
    # The output directory is created up front when one is configured.
    output_dir = settings.output_filename
    if output_dir:
        _mkdir_p(output_dir)

    # Only construct the rendezvous server here; it is started further down.
    rendezvous = RendezvousServer(settings.verbose)

    # Distribute the requested processes over the configured hosts.
    host_alloc_plan = get_host_assignments(parse_hosts(settings.hosts),
                                           settings.num_proc)

    # Start the server, note its port, then hand it the allocation plan.
    global_rendezv_port = rendezvous.start_server()
    rendezvous.httpd.init(host_alloc_plan)
    run_command = get_run_command(command, server_ip, nics,
                                  global_rendezv_port)

    to_command = _slot_info_to_command_fn(run_command, env)
    shutdown_event = register_shutdown_event()
    args_list = []
    for slot_info in host_alloc_plan:
        args_list.append([to_command(slot_info), slot_info, [shutdown_event]])

    # If an error occurs in one thread, entire process will be terminated.
    # Otherwise, threads will keep running.
    res = threads.execute_function_multithreaded(exec_command,
                                                 args_list,
                                                 block_until_all_done=True)

    # Results map a name to (exit_code, timestamp); walk them by timestamp so
    # the first failure to occur is the one that gets reported.
    by_timestamp = sorted(res.items(), key=lambda item: item[1][1])
    for name, (exit_code, _timestamp) in by_timestamp:
        if exit_code == 0:
            continue
        raise RuntimeError(
            'Horovod detected that one or more processes exited with non-zero '
            'status, thus causing the job to be terminated. The first process '
            'to do so was:\nProcess name: {name}\nExit code: {code}\n'.
            format(name=name, code=exit_code))
Example #5
0
def gloo_run_elastic(settings, env, command):
    """Launch ``command`` as an elastic gloo job via ``launch_gloo_elastic``.

    :param settings: settings object; ``num_proc`` and ``verbose`` are read
        here, the rest is consumed by the helpers it is forwarded to
    :param env: environment forwarded to ``launch_gloo_elastic``
    :param command: command forwarded to ``launch_gloo_elastic``
    """
    def get_common_interfaces(driver):
        """Wait for enough slots on ``driver`` and detect the common interfaces."""
        # Host-to-host common interface detection requires at least 2 hosts in an elastic job.
        available = driver.wait_for_available_slots(
            settings.num_proc, min_hosts=_get_min_start_hosts(settings))
        host_order = available.host_assignment_order
        return driver_service.get_common_interfaces(settings, host_order)

    exec_command = _exec_command_fn(settings)
    rendezvous_server = RendezvousServer(settings.verbose)
    launch_gloo_elastic(command, exec_command, settings, env,
                        get_common_interfaces, rendezvous_server)
Example #6
0
def launch_gloo(command, exec_command, settings, nics, env, server_ip):
    """
    Runs the given command once per allocated slot using gloo.
    Every invocation goes through exec_command.

    :param command: command to launch
    :param exec_command: means to execute a single command
    :param settings: settings for the distribution
    :param nics: common interfaces
    :param env: environment to use
    :param server_ip: ip to use for rendezvous server
    :raises RuntimeError: if any launched process reports a non-zero exit code
    """
    # Map the requested number of processes onto the available host slots.
    host_alloc_plan = _allocate(settings.hosts, settings.num_proc)

    # Bring up the global rendezvous server; start_server reports its port.
    rendezvous_server = RendezvousServer(settings.verbose)
    rendezvous_port = rendezvous_server.start_server(host_alloc_plan)

    # Environment assignments are prepended to the user command.
    command_template = (
        'HOROVOD_GLOO_RENDEZVOUS_ADDR={addr} '
        'HOROVOD_GLOO_RENDEZVOUS_PORT={port} '
        'HOROVOD_CONTROLLER=gloo '
        'HOROVOD_CPU_OPERATIONS=gloo '
        'HOROVOD_GLOO_IFACE={iface} '
        'NCCL_SOCKET_IFNAME={nics} '
        '{command}')  # expect a lot of environment variables
    # TODO: add multiple ifaces in future
    first_nic = list(nics)[0]
    joined_nics = ','.join(nics)
    quoted_command = ' '.join(quote(par) for par in command)
    run_command = command_template.format(
        addr=server_ip,
        port=rendezvous_port,
        iface=first_nic,
        nics=joined_nics,
        command=quoted_command)

    # Shared flag the worker threads receive; set when a signal arrives.
    stop_event = threading.Event()

    def _flag_termination(signum, frame):
        stop_event.set()

    # Both SIGINT and SIGTERM merely set the flag.
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, _flag_termination)

    # TODO: Workaround for over-buffered outputs. Investigate how mpirun avoids this problem.
    env = copy.copy(env)  # copy env so we do not leak env modifications
    env['PYTHONUNBUFFERED'] = '1'

    # In case, the main thread receives a SIGINT, the event will be set so the spawned threads can
    # kill their corresponding middleman processes so the jobs can be killed as well.
    to_command = _alloc_info_to_command_fn(run_command, env)
    args_list = []
    for alloc_info in host_alloc_plan:
        args_list.append([to_command(alloc_info), alloc_info, stop_event])

    # The output directory is created when one is configured.
    output_dir = settings.output_filename
    if output_dir:
        _mkdir_p(output_dir)

    # If an error occurs in one thread, entire process will be terminated.
    # Otherwise, threads will keep running.
    res = threads.execute_function_multithreaded(exec_command,
                                                 args_list,
                                                 block_until_all_done=True)

    # Results map a name to (exit_code, timestamp); walk them by timestamp so
    # the first failure to occur is the one that gets reported.
    by_timestamp = sorted(res.items(), key=lambda item: item[1][1])
    for name, (exit_code, _timestamp) in by_timestamp:
        if exit_code == 0:
            continue
        raise RuntimeError(
            'Gloo job detected that one or more processes exited with non-zero '
            'status, thus causing the job to be terminated. The first process '
            'to do so was:\nProcess name: {name}\nExit code: {code}\n'.
            format(name=name, code=exit_code))
    def test_worker_notification_manager(self):
        """Checks that host updates reach listeners registered on the worker notification manager.

        Two workers are started on ``host-1``. Rank 0 first adds ``host-2`` and
        later removes ``host-1``; each worker records the timestamps its
        listener received, and the counts are checked per rank afterwards.
        """
        discovery = FixedHosts({'host-1': 2})

        rendezvous = RendezvousServer()
        driver = ElasticDriver(rendezvous, discovery, min_np=2, max_np=4)
        driver.wait_for_available_slots(min_np=2)
        handler = create_rendezvous_handler(driver)

        local_intfs = network.get_local_intfs()
        addr = network.get_driver_ip(local_intfs)
        port = rendezvous.start_server(handler)
        nic = list(local_intfs)[0]

        # rank -> list of timestamps seen by that worker's listener
        events_by_rank = {}

        class NotificationReceiver:
            """Listener that stores every timestamp it is notified with."""

            def __init__(self):
                self.events = []

            def on_hosts_updated(self, timestamp):
                self.events.append(timestamp)

        def exec_command(slot_info, events):
            """Worker body handed to the driver; returns ``(exit_code, timestamp)``."""
            manager = WorkerNotificationManager()
            manager.init(rendezvous_addr=addr,
                         rendezvous_port=port,
                         nic=nic,
                         hostname=slot_info.hostname,
                         local_rank=slot_info.local_rank)

            receiver = NotificationReceiver()
            manager.register_listener(receiver)

            driver.record_ready(slot_info.hostname, slot_info.local_rank)

            # Only rank 0 changes the discovered hosts: first add host-2 ...
            if slot_info.rank == 0:
                discovery.set({'host-1': 2, 'host-2': 2})
            driver.wait_for_available_slots(4)

            # ... then drop host-1 so only host-2 remains.
            if slot_info.rank == 0:
                discovery.set({'host-2': 2})

            # Busy wait for the number of available slots to decrease
            while True:
                current = driver._host_manager.current_hosts
                if not current.count_available_slots() > 2:
                    break
                time.sleep(0.01)

            events_by_rank[slot_info.rank] = receiver.events
            return 0, time.time()

        driver.start(np=2, create_worker_fn=exec_command)
        res = driver.get_results()
        driver.stop()

        # Both workers must have finished successfully.
        assert len(res) == 2
        for name, result in res.items():
            exit_code, _timestamp = result
            assert exit_code == 0, name

        # Rank 0 is expected to have seen two updates, every other rank none.
        assert len(events_by_rank) == 2
        for rank, timestamps in events_by_rank.items():
            if rank == 0:
                expected = 2
            else:
                expected = 0
            assert len(timestamps) == expected, rank

        rendezvous.stop_server()