def start(self):
    """Starts the Horovod driver and services.

    Creates the rendezvous server and the elastic driver, starts the
    rendezvous server with a handler built from the driver, waits for
    slots to become available, looks up the network interfaces common to
    the currently available hosts and finally stores the resulting run
    environment variables in ``self.run_env_vars``.
    """
    settings = self.settings

    self.rendezvous = RendezvousServer(settings.verbose)
    self.driver = ElasticDriver(
        rendezvous=self.rendezvous,
        discovery=settings.discovery,
        min_np=settings.min_np,
        max_np=settings.max_np,
        timeout=settings.elastic_timeout,
        reset_limit=settings.reset_limit,
        verbose=settings.verbose,
    )

    rendezvous_handler = create_rendezvous_handler(self.driver)
    rendezvous_port = self.rendezvous.start(rendezvous_handler)
    self.driver.wait_for_available_slots(settings.num_proc)

    # Host-to-host common interface detection
    # requires at least 2 hosts in an elastic job.
    required_hosts = _get_min_start_hosts(settings)
    available_hosts = self.driver.wait_for_available_slots(
        settings.num_proc, min_hosts=required_hosts)
    common_nics = driver_service.get_common_interfaces(
        settings, available_hosts.host_assignment_order)

    driver_ip = network.get_driver_ip(common_nics)
    self.run_env_vars = create_run_env_vars(
        driver_ip, common_nics, rendezvous_port, elastic=True)
def launch_gloo_elastic(command, exec_command, settings, env, get_common_interfaces, rendezvous):
    """Launch ``command`` through an ``ElasticDriver`` and wait for the results.

    The rendezvous server is started with a handler built from the driver,
    the driver waits for ``settings.num_proc`` slots, and workers are created
    from ``exec_command`` and the generated run command.

    Raises:
        RuntimeError: If the driver results carry an error message, or if any
            worker reported a non-zero exit code (the worker with the smallest
            timestamp among those is named in the message).
    """
    # Make the output directory if it does not exist
    if settings.output_filename:
        _mkdir_p(settings.output_filename)

    driver = ElasticDriver(
        rendezvous,
        settings.discovery,
        settings.min_np,
        settings.max_np,
        timeout=settings.elastic_timeout,
        reset_limit=settings.reset_limit,
        cooldown_range=settings.cooldown_range,
        verbose=settings.verbose,
    )

    rendezvous_handler = create_rendezvous_handler(driver)
    rendezvous_port = rendezvous.start(rendezvous_handler)
    driver.wait_for_available_slots(settings.num_proc)

    common_nics = get_common_interfaces(driver)
    driver_ip = network.get_driver_ip(common_nics)

    shutdown_event = register_shutdown_event()
    run_command = get_run_command(command, driver_ip, common_nics, rendezvous_port, elastic=True)
    worker_factory = _create_elastic_worker_fn(exec_command, run_command, env, shutdown_event)

    driver.start(settings.num_proc, worker_factory)
    outcome = driver.get_results()
    driver.stop()

    if outcome.error_message is not None:
        raise RuntimeError(outcome.error_message)

    # Each value is an (exit_code, timestamp) pair; walk them in timestamp order
    # so the first failure reported is the earliest one.
    ordered_results = sorted(outcome.worker_results.items(), key=lambda item: item[1][1])
    for name, (exit_code, _timestamp) in ordered_results:
        if exit_code == 0:
            continue
        raise RuntimeError(
            'Horovod detected that one or more processes exited with non-zero '
            'status, thus causing the job to be terminated. The first process '
            'to do so was:\nProcess name: {name}\nExit code: {code}\n'.format(
                name=name, code=exit_code))
def gloo_run_fn():
    """Resolve the driver IP from ``nics`` and hand off to ``gloo_run``.

    ``settings``, ``nics``, ``env`` and ``command`` are taken from the
    surrounding scope.
    """
    driver_address = network.get_driver_ip(nics)
    gloo_run(settings, nics, env, driver_address, command)
def _run_static(args):
    """Build the run settings from ``args``, run the pre-flight checks and launch the job.

    Steps, in order: resolve the start timeout, create the settings object,
    optionally create the cache for initialization checks, verify ssh access
    to the remote hosts, determine the common network interfaces and launch
    the job through ``_launch_job``.

    Args:
        args: Object exposing the options read here (``nics``,
            ``start_timeout``, ``verbose``, ``ssh_port``,
            ``ssh_identity_file``, ``mpi_args``, ``tcp_flag``,
            ``binding_args``, ``np``, ``hosts``, ``output_filename``,
            ``run_func``, ``disable_cache`` and ``command``).

    Returns:
        When ``args.run_func`` is truthy, a list with one entry per index in
        ``range(args.np)`` read back from the KV store; otherwise ``None``.

    Raises:
        RuntimeError: If the ssh check for the remote hosts fails.
    """
    nics_set = set(args.nics.split(',')) if args.nics else None

    # horovodrun has to finish all the checks before this timeout runs out.
    if args.start_timeout:
        start_timeout = args.start_timeout
    else:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30'))

    tmout = timeout.Timeout(start_timeout,
                            message='Timed out waiting for {activity}. Please '
                                    'check connectivity between servers. You '
                                    'may need to increase the --start-timeout '
                                    'parameter if you have too many servers.')
    settings = hvd_settings.Settings(verbose=2 if args.verbose else 0,
                                     ssh_port=args.ssh_port,
                                     ssh_identity_file=args.ssh_identity_file,
                                     extra_mpi_args=args.mpi_args,
                                     tcp_flag=args.tcp_flag,
                                     binding_args=args.binding_args,
                                     key=secret.make_secret_key(),
                                     start_timeout=tmout,
                                     num_proc=args.np,
                                     hosts=args.hosts,
                                     output_filename=args.output_filename,
                                     run_func_mode=args.run_func is not None,
                                     nics=nics_set)

    # This cache stores the results of checks performed by horovod
    # during the initialization step. It can be disabled by setting
    # --disable-cache flag.
    fn_cache = None
    if not args.disable_cache:
        # NOTE: the key layout is deliberately left unchanged so that the
        # resulting hash stays the same as before for identical arguments.
        params = ''
        if args.np:
            params += str(args.np) + ' '
        if args.hosts:
            params += str(args.hosts) + ' '
        if args.ssh_port:
            params += str(args.ssh_port)
        if args.ssh_identity_file:
            params += args.ssh_identity_file
        parameters_hash = hashlib.md5(params.encode('utf-8')).hexdigest()
        fn_cache = cache.Cache(CACHE_FOLDER, CACHE_STALENESS_THRESHOLD_MINUTES,
                               parameters_hash)

    all_host_names, _ = hosts.parse_hosts_and_slots(args.hosts)
    if settings.verbose >= 2:
        print('Filtering local host names.')
    remote_host_names = network.filter_local_addresses(all_host_names)
    if settings.verbose >= 2:
        print('Remote host found: ' + ' '.join(remote_host_names))

    if remote_host_names:
        if settings.verbose >= 2:
            print('Checking ssh on all remote hosts.')
        # Check if we can ssh into all remote hosts successfully.
        if not _check_all_hosts_ssh_successful(remote_host_names, args.ssh_port,
                                               args.ssh_identity_file,
                                               fn_cache=fn_cache):
            raise RuntimeError('could not connect to some hosts via ssh')
        if settings.verbose >= 2:
            print('SSH was successful into all the remote hosts.')

    nics = driver_service.get_common_interfaces(settings, all_host_names,
                                                remote_host_names, fn_cache)

    if not args.run_func:
        _launch_job(args, settings, nics, args.command)
        return None

    # get the driver IPv4 address
    driver_ip = network.get_driver_ip(nics)
    run_func_server = KVStoreServer(verbose=settings.verbose)
    run_func_server_port = run_func_server.start_server()

    try:
        # Publishing the function happens inside the try block so that the
        # KV store server is shut down even when this step fails.
        put_data_into_kvstore(driver_ip, run_func_server_port,
                              'runfunc', 'func', args.run_func)

        command = [sys.executable, '-m', 'horovod.runner.run_task',
                   str(driver_ip), str(run_func_server_port)]
        _launch_job(args, settings, nics, command)

        # TODO: make it parallel to improve performance
        return [read_data_from_kvstore(driver_ip, run_func_server_port,
                                       'runfunc_result', str(i))
                for i in range(args.np)]
    finally:
        run_func_server.shutdown_server()
def launch_gloo_elastic(command_or_func, exec_command, settings, env, get_common_interfaces, rendezvous, executable):
    """Launch a command or a function through an ``ElasticDriver`` and wait for the results.

    When ``settings.run_func_mode`` is truthy, ``command_or_func`` is
    published through a ``KVStoreServer`` and the workers run
    ``horovod.runner.run_task`` with ``executable``; otherwise
    ``command_or_func`` is used as the command directly.

    Returns:
        In func mode, a list with one entry per index in
        ``range(settings.min_num_proc)`` read back from the KV store;
        otherwise ``None``.

    Raises:
        RuntimeError: If the driver results carry an error message, or if any
            worker reported a non-zero exit code (the worker with the smallest
            timestamp among those is named in the message).
    """
    # Make the output directory if it does not exist
    if settings.output_filename:
        _mkdir_p(settings.output_filename)

    driver = ElasticDriver(rendezvous, settings.discovery,
                           settings.min_num_proc, settings.max_num_proc,
                           timeout=settings.elastic_timeout,
                           reset_limit=settings.reset_limit,
                           cooldown_range=settings.cooldown_range,
                           verbose=settings.verbose)

    handler = create_rendezvous_handler(driver)
    global_rendezv_port = rendezvous.start(handler)
    driver.wait_for_available_slots(settings.num_proc)

    nics = get_common_interfaces(driver)
    server_ip = network.get_driver_ip(nics)

    run_func_server = None
    run_func_server_port = None
    if settings.run_func_mode:
        # when running a func, we have to spin up the KVStoreServer
        # to get the func to the remote process and the result back
        run_func_server = KVStoreServer(verbose=settings.verbose)
        run_func_server_port = run_func_server.start_server()

    try:
        if run_func_server is not None:
            # Publishing the function happens inside the try block so that
            # the KV store server is shut down even when this step fails.
            put_data_into_kvstore(server_ip, run_func_server_port,
                                  'runfunc', 'func', command_or_func)
            command = [executable, '-m', 'horovod.runner.run_task',
                       server_ip, str(run_func_server_port)]
        else:
            command = command_or_func

        event = register_shutdown_event()
        run_command = get_run_command(command, server_ip, nics, global_rendezv_port, elastic=True)

        create_worker = _create_elastic_worker_fn(exec_command, run_command, env, event)

        driver.start(settings.num_proc, create_worker)
        res = driver.get_results()
        driver.stop()

        if res.error_message is not None:
            raise RuntimeError(res.error_message)

        # Each value is an (exit_code, timestamp) pair; walk them in timestamp
        # order so the first failure reported is the earliest one.
        for name, (exit_code, _timestamp) in sorted(res.worker_results.items(),
                                                    key=lambda item: item[1][1]):
            if exit_code != 0:
                raise RuntimeError('Horovod detected that one or more processes exited with non-zero '
                                   'status, thus causing the job to be terminated. The first process '
                                   'to do so was:\nProcess name: {name}\nExit code: {code}\n'
                                   .format(name=name, code=exit_code))

        # fetch the result if running a func
        if settings.run_func_mode:
            # TODO: make it parallel to improve performance
            return [read_data_from_kvstore(server_ip, run_func_server_port,
                                           'runfunc_result', str(i))
                    for i in range(settings.min_num_proc)]

        return None
    finally:
        if run_func_server is not None:
            run_func_server.shutdown_server()
def test_worker_notification_manager(self):
    """Tests that host add events are sent to the worker notification service and consumed."""
    discovery = FixedHosts({'host-1': 2})

    rendezvous = RendezvousServer()
    driver = ElasticDriver(rendezvous, discovery, min_np=2, max_np=4)
    driver.wait_for_available_slots(min_np=2)
    rendezvous_handler = create_rendezvous_handler(driver)

    local_intfs = network.get_local_intfs()
    driver_addr = network.get_driver_ip(local_intfs)
    rendezvous_port = rendezvous.start(rendezvous_handler)
    first_nic = list(local_intfs)[0]

    # Filled in by each worker: rank -> list of (timestamp, result) tuples it received
    events_by_rank = {}

    class NotificationReceiver:
        """Listener that records every host update it is notified about."""

        def __init__(self):
            self.events = []

        def on_hosts_updated(self, timestamp, res):
            self.events.append((timestamp, res))

    def add_host():
        discovery.set({'host-1': 2, 'host-2': 2})

    def remove_host():
        discovery.set({'host-2': 2})

    def exec_command(slot_info, events):
        manager = WorkerNotificationManager()
        manager.init(rendezvous_addr=driver_addr,
                     rendezvous_port=rendezvous_port,
                     nic=first_nic,
                     hostname=slot_info.hostname,
                     local_rank=slot_info.local_rank)

        receiver = NotificationReceiver()
        manager.register_listener(receiver)

        driver.record_ready(slot_info.hostname, slot_info.local_rank)

        # Only rank 0 changes the set of discovered hosts
        if slot_info.rank == 0:
            add_host()
        driver.wait_for_available_slots(4)

        if slot_info.rank == 0:
            remove_host()

        # Busy wait for the number of available slots to decrease
        while driver._host_manager.current_hosts.count_available_slots() > 2:
            time.sleep(0.01)

        events_by_rank[slot_info.rank] = receiver.events
        return 0, time.time()

    driver.start(np=2, create_worker_fn=exec_command)
    worker_results = driver.get_results().worker_results
    driver.stop()

    assert len(worker_results) == 2
    for worker_name, (exit_code, _timestamp) in worker_results.items():
        assert exit_code == 0, worker_name

    assert len(events_by_rank) == 2
    for rank, received in events_by_rank.items():
        if rank != 0:
            assert len(received) == 0, rank
            continue

        assert len(received) == 2, rank
        # First update is an add
        assert received[0][1] == HostUpdateResult.added
        # Second update is a removal
        assert received[1][1] == HostUpdateResult.removed

    rendezvous.stop()