def _driver_fn(client, net_if):
    """Discover workers, publish their Gloo rank layout and start the rendezvous server.

    The driver waits for every cluster task whose name contains ``'worker'``
    to publish its address, computes the host assignments for the driver plus
    those workers, broadcasts one rank record per assignment and finally
    broadcasts the address of the rendezvous server it started.

    :param client: handle passed through to ``_task_commons`` and ``event``
        for task discovery and for waiting on / broadcasting events
    :param net_if: indexable value; element ``1`` is used as this driver's
        address, both in the worker list and in the ``sock_addr`` broadcast
    :return: the ``listen_thread`` attribute of the started rendezvous server
    """
    cluster_tasks = _task_commons._get_cluster_tasks(client)

    # Worker discovery: the driver itself always occupies the first entry.
    worker_list = [f"{net_if[1]}:{N_PROCESS_PER_WORKER}"]
    for cluster_task in cluster_tasks:
        if 'worker' in cluster_task:
            worker_addr = event.wait(client, f"{cluster_task}/addr")
            logger.info(f"{cluster_task}: {worker_addr}")
            worker_list.append(f"{worker_addr}:{N_PROCESS_PER_WORKER}")
    # One entry per participant, so no separate counter is needed.
    n_workers = len(worker_list)

    # Worker task allocation to workers
    hosts = gloo_run.parse_hosts(','.join(worker_list))
    host_alloc_plan = gloo_run.get_host_assignments(hosts, n_workers)
    for host in host_alloc_plan:
        # Build the record with join rather than a multi-line f-string so the
        # broadcast value never picks up whitespace from the source layout.
        host_info = ','.join(
            str(field) for field in (
                host.rank, host.size, host.local_rank,
                host.local_size, host.cross_rank, host.cross_size,
            )
        )
        event.broadcast(client, f"{get_task()}/{host.hostname}", host_info)

    global_rendezv = RendezvousServer(verbose=1)
    global_rendezv_port = global_rendezv.start_server()
    global_rendezv.httpd.init(host_alloc_plan)
    event.broadcast(client, f"{get_task()}/sock_addr",
                    f"{net_if[1]}:{global_rendezv_port}")
    return global_rendezv.listen_thread
def gloo_run(settings, remote_host_names, common_intfs, env, server_ip, command):
    """Start a Gloo rendezvous server and launch ``command`` on the allocated slots.

    :param settings: run settings; ``hosts``, ``num_proc`` and ``verbose`` are read here
    :param remote_host_names: forwarded unchanged to ``_launch_jobs``
    :param common_intfs: network interfaces; the first one becomes
        ``HOROVOD_GLOO_IFACE`` and all of them, comma-joined, ``NCCL_SOCKET_IFNAME``
    :param env: environment forwarded unchanged to ``_launch_jobs``
    :param server_ip: address exported as ``HOROVOD_GLOO_RENDEZVOUS_ADDR``
    :param command: iterable of command parts; each part is shell-quoted
    """
    # Distribute the requested processes over the hosts.
    slots = _allocate(settings.hosts, settings.num_proc)

    # Bring up the rendezvous server; the value returned by start_server is
    # exported to the workers as the rendezvous port.
    server = RendezvousServer(settings.verbose)
    port = server.start_server(slots)

    # TODO: add multiple ifaces in future
    primary_iface = list(common_intfs)[0]
    all_ifaces = ','.join(common_intfs)
    quoted_command = ' '.join(quote(par) for par in command)

    # The user command expects a lot of environment variables in front of it.
    run_command = ' '.join([
        f'HOROVOD_GLOO_RENDEZVOUS_ADDR={server_ip}',
        f'HOROVOD_GLOO_RENDEZVOUS_PORT={port}',
        'HOROVOD_CONTROLLER=gloo',
        'HOROVOD_CPU_OPERATIONS=gloo',
        f'HOROVOD_GLOO_IFACE={primary_iface}',
        f'NCCL_SOCKET_IFNAME={all_ifaces}',
        f'{quoted_command}',
    ])

    _launch_jobs(settings, env, slots, remote_host_names, run_command)
def gloo_run_elastic(settings, env, command):
    """Run ``command`` as an elastic Gloo job and wait for its workers to finish.

    Starts a rendezvous server backed by an ``ElasticDriver``, waits for
    enough slots, detects the common network interfaces, launches the
    workers and collects their results.

    The rendezvous server is stopped whenever it was started, and the driver
    is stopped whenever it was started, even if an intermediate step raises,
    so a failed launch does not leave them running.

    :param settings: run settings; ``output_filename``, ``verbose``,
        ``discovery``, ``min_np``, ``max_np``, ``elastic_timeout`` and
        ``num_proc`` are read here
    :param env: environment handed to the elastic worker factory
    :param command: command handed to ``get_run_command``
    :raises RuntimeError: if any worker reported a non-zero exit code; the
        message names the failing worker with the smallest timestamp
    """
    # Make the output directory if it does not exist
    if settings.output_filename:
        _mkdir_p(settings.output_filename)

    rendezvous = RendezvousServer(settings.verbose)
    driver = ElasticDriver(rendezvous, settings.discovery,
                           settings.min_np, settings.max_np,
                           timeout=settings.elastic_timeout,
                           verbose=settings.verbose)

    handler = create_rendezvous_handler(driver)
    global_rendezv_port = rendezvous.start_server(handler)

    try:
        # Host-to-host common interface detection requires at least 2 hosts
        # in an elastic job.
        min_hosts = _get_min_start_hosts(settings)
        current_hosts = driver.wait_for_available_slots(settings.num_proc,
                                                        min_hosts=min_hosts)

        nics = driver_service.get_common_interfaces(
            settings, current_hosts.host_assignment_order)
        server_ip = network.get_driver_ip(nics)

        exec_command = _exec_command_fn(settings)
        event = register_shutdown_event()
        run_command = get_run_command(command, server_ip, nics,
                                      global_rendezv_port, elastic=True)

        create_worker = _create_elastic_worker_fn(exec_command, run_command,
                                                  env, event)

        driver.start(settings.num_proc, create_worker)
        try:
            res = driver.get_results()
        finally:
            # Only a started driver is stopped here, matching the original
            # success-path call.
            driver.stop()
    finally:
        # Runs after driver.stop() on the success path, as before.
        rendezvous.stop_server()

    # Each value is an (exit_code, timestamp) pair; walk them by timestamp so
    # the first failure reported is the earliest one.
    for name, value in sorted(res.items(), key=lambda item: item[1][1]):
        exit_code, timestamp = value
        if exit_code != 0:
            raise RuntimeError(
                'Horovod detected that one or more processes exited with non-zero '
                'status, thus causing the job to be terminated. The first process '
                'to do so was:\nProcess name: {name}\nExit code: {code}\n'.
                format(name=name, code=exit_code))
def launch_gloo(command, exec_command, settings, nics, env, server_ip):
    """
    Runs the given command once per allocated slot using gloo.
    Every invocation goes through exec_command.

    :param command: command to launch
    :param exec_command: callable that executes a single command
    :param settings: settings for the distribution
    :param nics: common interfaces
    :param env: environment to use
    :param server_ip: ip to use for rendezvous server
    :raises RuntimeError: if any launched process reported a non-zero exit code
    """
    # Ensure the output directory exists when one was requested.
    output_dir = settings.output_filename
    if output_dir:
        _mkdir_p(output_dir)

    # Create the global rendezvous server (it is started further down).
    server = RendezvousServer(settings.verbose)

    # Allocate processes into slots.
    slots = get_host_assignments(parse_hosts(settings.hosts), settings.num_proc)

    # Start the rendezvous server, remember its port, and hand it the plan.
    port = server.start_server()
    server.httpd.init(slots)
    run_command = get_run_command(command, server_ip, nics, port)

    to_command = _slot_info_to_command_fn(run_command, env)
    shutdown_event = register_shutdown_event()
    args_list = []
    for slot in slots:
        args_list.append([to_command(slot), slot, [shutdown_event]])

    # If an error occurs in one thread, entire process will be terminated.
    # Otherwise, threads will keep running.
    results = threads.execute_function_multithreaded(exec_command,
                                                     args_list,
                                                     block_until_all_done=True)

    # Values are (exit_code, timestamp) pairs; inspect them in timestamp order.
    by_timestamp = sorted(results.items(), key=lambda item: item[1][1])
    for name, (exit_code, _timestamp) in by_timestamp:
        if exit_code == 0:
            continue
        raise RuntimeError(
            'Horovod detected that one or more processes exited with non-zero '
            'status, thus causing the job to be terminated. The first process '
            'to do so was:\nProcess name: {name}\nExit code: {code}\n'.
            format(name=name, code=exit_code))
def gloo_run_elastic(settings, env, command):
    """Launch ``command`` as an elastic gloo job via ``launch_gloo_elastic``.

    :param settings: run settings; ``num_proc`` and ``verbose`` are read here,
        the rest is consumed by the helpers this function calls
    :param env: environment forwarded to ``launch_gloo_elastic``
    :param command: command forwarded to ``launch_gloo_elastic``
    """
    def _common_interfaces_for(driver):
        # Host-to-host common interface detection requires at least 2 hosts
        # in an elastic job.
        start_hosts = _get_min_start_hosts(settings)
        available = driver.wait_for_available_slots(settings.num_proc,
                                                    min_hosts=start_hosts)
        return driver_service.get_common_interfaces(
            settings, available.host_assignment_order)

    exec_command = _exec_command_fn(settings)
    rendezvous = RendezvousServer(settings.verbose)
    launch_gloo_elastic(command, exec_command, settings, env,
                        _common_interfaces_for, rendezvous)
def launch_gloo(command, exec_command, settings, nics, env, server_ip):
    """
    Runs the given command once per allocated slot using gloo.
    Every invocation goes through exec_command.

    :param command: command to launch
    :param exec_command: callable that executes a single command
    :param settings: settings for the distribution
    :param nics: common interfaces
    :param env: environment to use (copied, the caller's object is not modified)
    :param server_ip: ip to use for rendezvous server
    :raises RuntimeError: if any launched process reported a non-zero exit code
    """
    # Allocate processes into slots.
    slots = _allocate(settings.hosts, settings.num_proc)

    # Create and start the global rendezvous server; keep the port it returns.
    server = RendezvousServer(settings.verbose)
    port = server.start_server(slots)

    # The user command expects a lot of environment variables in front of it.
    template = (
        'HOROVOD_GLOO_RENDEZVOUS_ADDR={addr} '
        'HOROVOD_GLOO_RENDEZVOUS_PORT={port} '
        'HOROVOD_CONTROLLER=gloo '
        'HOROVOD_CPU_OPERATIONS=gloo '
        'HOROVOD_GLOO_IFACE={iface} '
        'NCCL_SOCKET_IFNAME={nics} '
        '{command}')
    fields = {
        'addr': server_ip,
        'port': port,
        'iface': list(nics)[0],  # TODO: add multiple ifaces in future
        'nics': ','.join(nics),
        'command': ' '.join(quote(par) for par in command),
    }
    run_command = template.format(**fields)

    # Event shared with the worker threads. When the main thread receives
    # SIGINT or SIGTERM the event is set, so the spawned threads can kill
    # their corresponding middleman processes and the jobs get killed too.
    shutdown_event = threading.Event()

    def _request_shutdown(signum, frame):
        shutdown_event.set()

    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, _request_shutdown)

    # TODO: Workaround for over-buffered outputs. Investigate how mpirun avoids this problem.
    # Work on a copy so the modification does not leak into the caller's env.
    child_env = copy.copy(env)
    child_env['PYTHONUNBUFFERED'] = '1'

    to_command = _alloc_info_to_command_fn(run_command, child_env)
    args_list = []
    for alloc_info in slots:
        args_list.append([to_command(alloc_info), alloc_info, shutdown_event])

    # Ensure the output directory exists when one was requested.
    output_dir = settings.output_filename
    if output_dir:
        _mkdir_p(output_dir)

    # If an error occurs in one thread, entire process will be terminated.
    # Otherwise, threads will keep running.
    results = threads.execute_function_multithreaded(exec_command,
                                                     args_list,
                                                     block_until_all_done=True)

    # Values are (exit_code, timestamp) pairs; inspect them in timestamp order.
    by_timestamp = sorted(results.items(), key=lambda item: item[1][1])
    for name, (exit_code, _timestamp) in by_timestamp:
        if exit_code == 0:
            continue
        raise RuntimeError(
            'Gloo job detected that one or more processes exited with non-zero '
            'status, thus causing the job to be terminated. The first process '
            'to do so was:\nProcess name: {name}\nExit code: {code}\n'.
            format(name=name, code=exit_code))
def test_worker_notification_manager(self):
    """Checks which workers receive host-update notifications.

    Two workers are started on a single host. Rank 0 then changes the
    discovered hosts twice. Afterwards the listener registered by rank 0 must
    have recorded exactly two events and every other rank's listener none.
    """
    discovery = FixedHosts({'host-1': 2})

    rendezvous = RendezvousServer()
    driver = ElasticDriver(rendezvous, discovery, min_np=2, max_np=4)
    driver.wait_for_available_slots(min_np=2)
    handler = create_rendezvous_handler(driver)

    local_intfs = network.get_local_intfs()
    addr = network.get_driver_ip(local_intfs)
    port = rendezvous.start_server(handler)
    nic = list(local_intfs)[0]

    # rank -> list of timestamps seen by that rank's listener
    events_by_rank = {}

    class NotificationReceiver:
        def __init__(self):
            self.events = []

        def on_hosts_updated(self, timestamp):
            self.events.append(timestamp)

    def grow_cluster():
        discovery.set({'host-1': 2, 'host-2': 2})

    def shrink_cluster():
        discovery.set({'host-2': 2})

    def exec_command(slot_info, events):
        manager = WorkerNotificationManager()
        manager.init(rendezvous_addr=addr,
                     rendezvous_port=port,
                     nic=nic,
                     hostname=slot_info.hostname,
                     local_rank=slot_info.local_rank)

        receiver = NotificationReceiver()
        manager.register_listener(receiver)

        driver.record_ready(slot_info.hostname, slot_info.local_rank)

        if slot_info.rank == 0:
            grow_cluster()
        driver.wait_for_available_slots(4)

        if slot_info.rank == 0:
            shrink_cluster()

        # Busy wait for the number of available slots to decrease
        while True:
            available = driver._host_manager.current_hosts.count_available_slots()
            if available <= 2:
                break
            time.sleep(0.01)

        events_by_rank[slot_info.rank] = receiver.events
        return 0, time.time()

    driver.start(np=2, create_worker_fn=exec_command)
    res = driver.get_results()
    driver.stop()

    assert len(res) == 2
    for worker_name, outcome in res.items():
        exit_code, _timestamp = outcome
        assert exit_code == 0, worker_name

    assert len(events_by_rank) == 2
    for rank, timestamps in events_by_rank.items():
        if rank == 0:
            expected = 2
        else:
            expected = 0
        assert len(timestamps) == expected, rank

    rendezvous.stop_server()