def _check_all_hosts_ssh_successful(host_addresses, ssh_port=None,
                                    ssh_ports=None, ssh_identity_file=None):
    """
    Verifies that a trivial remote command can be run over ssh on every host.

    One probe command (the remote command `true`) is built per host and the
    probes are dispatched through threads.execute_function_multithreaded.
    Each probe is attempted up to SSH_ATTEMPTS times. For every host whose
    probe ends with a non-zero exit code, the captured output is printed.

    :param host_addresses: list of addresses to ssh into.
        for example,
            ['worker-0','worker-1']
            ['10.11.11.11', '10.11.11.12']
    :type host_addresses: list(strings)
    :param ssh_port: forwarded to driver_service.get_ssh_port_args.
    :param ssh_ports: forwarded to driver_service.get_ssh_port_args.
    :param ssh_identity_file: passed to get_remote_command as identity_file.
    :return: True if ssh was successful into all the addresses, None
        otherwise.
    """

    def _probe_with_retries(command):
        # Start from a failing code so that zero attempts counts as failure.
        last_code, failure_output = 1, ''

        for _ in range(SSH_ATTEMPTS):
            with io.StringIO() as captured:
                last_code = safe_shell_exec.execute(command,
                                                    stdout=captured,
                                                    stderr=captured)
                # Only failed attempts update the reported message.
                if last_code != 0:
                    failure_output = captured.getvalue()
            if last_code == 0:
                break
        return last_code, failure_output

    ports = driver_service.get_ssh_port_args(host_addresses,
                                             ssh_port=ssh_port,
                                             ssh_ports=ssh_ports)

    # One single-element argument list per host: the probe command itself.
    probe_args = []
    for address, port in zip(host_addresses, ports):
        probe = get_remote_command(local_command='true',
                                   host=address,
                                   port=port,
                                   identity_file=ssh_identity_file,
                                   timeout_s=SSH_CONNECT_TIMEOUT_S)
        probe_args.append([probe])

    outcomes = threads.execute_function_multithreaded(_probe_with_retries,
                                                      probe_args)

    any_failure = False
    for position, outcome in outcomes.items():
        code, message = outcome[0], outcome[1]
        if code == 0:
            continue
        print('ssh not successful for host {host}:\n{msg_output}'.format(
            host=host_addresses[position], msg_output=message))
        any_failure = True

    if any_failure:
        # we could return False here but do not want it to be cached
        return None
    return True
def _check_all_hosts_ssh_successful(host_addresses, ssh_port=None):
    """
    Verifies that `ssh <host> true` can be run without a password on every
    host.

    One ssh command is built per host and the commands are dispatched through
    threads.execute_function_multithreaded. Each command is attempted up to
    SSH_ATTEMPTS times. For every host whose command ends with a non-zero
    exit code, the captured output is printed.

    :param host_addresses: list of addresses to ssh into.
        for example,
            ['worker-0','worker-1']
            ['10.11.11.11', '10.11.11.12']
    :type host_addresses: list(strings)
    :param ssh_port: port passed to ssh via `-p`; when falsy no port option
        is added.
    :return: True if ssh was successful into all the addresses, None
        otherwise.
    """

    def _ssh_with_retries(command):
        # Start from a failing code so that zero attempts counts as failure.
        status = 1
        last_failure_text = ''

        for _ in range(SSH_ATTEMPTS):
            sink = io.StringIO()
            try:
                status = safe_shell_exec.execute(command,
                                                 stdout=sink,
                                                 stderr=sink)
                # Only failed attempts update the reported message.
                if status != 0:
                    last_failure_text = sink.getvalue()
            finally:
                sink.close()
            if status == 0:
                break
        return status, last_failure_text

    if ssh_port:
        port_option = '-p {ssh_port}'.format(ssh_port=ssh_port)
    else:
        port_option = ''

    command_template = 'ssh -o PasswordAuthentication=no -o StrictHostKeyChecking=no' \
                       ' {host} {ssh_port_arg} true'

    # One single-element argument list per host: the ssh command itself.
    per_host_args = []
    for address in host_addresses:
        per_host_args.append(
            [command_template.format(host=address, ssh_port_arg=port_option)])

    results = threads.execute_function_multithreaded(_ssh_with_retries,
                                                     per_host_args)

    every_host_ok = True
    for position, result in results.items():
        status, text = result[0], result[1]
        if status == 0:
            continue
        print('ssh not successful for host {host}:\n{msg_output}'.format(
            host=host_addresses[position], msg_output=text))
        every_host_ok = False

    if every_host_ok:
        return True
    # we could return False here but do not want it to be cached
    return None
def launch_gloo(command, exec_command, settings, nics, env, server_ip):
    """
    Launches the given command once per allocated slot using gloo and raises
    if any of the launched processes reports a non-zero exit code.

    :param command: command to launch
    :param exec_command: means to execute a single command; it is called with
        the slot's command, the slot info and a list holding the shutdown
        event, and its result is unpacked as (exit_code, timestamp)
    :param settings: settings for the distribution
    :param nics: common interfaces
    :param env: environment to use
    :param server_ip: ip to use for rendezvous server
    :raises RuntimeError: for the failing process with the smallest timestamp
    """

    def _timestamp_of(entry):
        # entry is (name, (exit_code, timestamp))
        return entry[1][1]

    # Make the output directory if it does not exist
    if settings.output_filename:
        _mkdir_p(settings.output_filename)

    # create the global rendezvous server (started further below)
    server = RendezvousServer(settings.verbose)

    # allocate processes into slots
    slots = get_host_assignments(parse_hosts(settings.hosts),
                                 settings.num_proc)

    # start the rendezvous server, passing the port taken from the
    # environment (0 when the variable is unset), and get the port that it is
    # listening on
    requested_port = int(
        os.environ.get('PEDL_HOROVOD_GLOO_RENDEZVOUS_PORT', 0))
    listening_port = server.start(pedl_provisioned_port=requested_port)
    server.init(slots)

    base_command = get_run_command(command, server_ip, nics, listening_port)
    command_for_slot = _slot_info_to_command_fn(base_command, env)
    shutdown_event = register_shutdown_event()

    launch_args = []
    for slot in slots:
        launch_args.append([command_for_slot(slot), slot, [shutdown_event]])

    # If an error occurs in one thread, entire process will be terminated.
    # Otherwise, threads will keep running.
    outcomes = threads.execute_function_multithreaded(
        exec_command, launch_args, block_until_all_done=True)

    # Walk the results in timestamp order so the reported failure is the
    # earliest one.
    for name, (exit_code, _timestamp) in sorted(outcomes.items(),
                                                key=_timestamp_of):
        if exit_code == 0:
            continue
        raise RuntimeError(
            'Horovod detected that one or more processes exited with non-zero '
            'status, thus causing the job to be terminated. The first process '
            'to do so was:\nProcess name: {name}\nExit code: {code}\n'.
            format(name=name, code=exit_code))
def filter_local_addresses(all_host_names):
    """
    Returns the host names that are not local, preserving input order.

    Every name is resolved with resolve_host_address through
    threads.execute_function_multithreaded. A name is kept when its resolved
    address is falsy or is not among the addresses returned by
    get_local_host_addresses().

    :param all_host_names: host names to examine
    :return: list of the host names that were kept
    """
    own_addresses = get_local_host_addresses()

    def _is_remote(address):
        return not address or address not in own_addresses

    # The result is a map keyed by each name's position in the argument list.
    resolved = threads.execute_function_multithreaded(
        resolve_host_address, [[name] for name in all_host_names])

    return [name
            for position, name in enumerate(all_host_names)
            if _is_remote(resolved[position])]
def _launch_task_servers(all_host_names, local_host_names, driver_addresses,
                         settings):
    """
    Executes the task server and service client task for registration on the
    hosts.

    One `horovod.runner.task_fn` command is built per host. For hosts that
    are not in local_host_names the command is wrapped with get_ssh_command.
    The commands are then handed to threads.execute_function_multithreaded
    without waiting for them to finish.

    :param all_host_names: list of addresses. for example,
        ['worker-0','worker-1']
        ['10.11.11.11', '10.11.11.12']
    :type all_host_names: list(string)
    :param local_host_names: names that are resolved to one of the addresses
        of local hosts interfaces. For example,
            set(['localhost', '127.0.0.1'])
    :type local_host_names: set
    :param driver_addresses: map of interfaces and their address and port for
        the service. For example:
            {
                'lo': [('127.0.0.1', 34588)],
                'docker0': [('172.122.10.1', 34588)],
                'eth0': [('11.111.33.73', 34588)]
            }
    :type driver_addresses: map
    :param settings: the object that contains the setting for running horovod
    :type settings: horovod.runner.common.util.settings.Settings
    :return:
    :rtype:
    """

    def _run_or_die(command):
        with io.StringIO() as captured:
            status = safe_shell_exec.execute(command,
                                             stdout=captured,
                                             stderr=captured)
            if status != 0:
                print('Launching horovod task function was not '
                      'successful:\n{host_output}'.format(
                          host_output=captured.getvalue()))
                # Exit immediately, taking the whole process down with the
                # failing command's exit code.
                os._exit(status)
        return status

    host_count = len(all_host_names)
    launch_args = []
    for position, host in enumerate(all_host_names):
        task_command = \
            '{python} -m horovod.runner.task_fn {index} {num_hosts} ' \
            '{driver_addresses} {settings}' \
            .format(python=sys.executable,
                    index=codec.dumps_base64(position),
                    num_hosts=codec.dumps_base64(host_count),
                    driver_addresses=codec.dumps_base64(driver_addresses),
                    settings=codec.dumps_base64(settings))

        is_local = host in local_host_names
        if not is_local:
            task_command = get_ssh_command(
                task_command,
                host=host,
                port=settings.ssh_port,
                identity_file=settings.ssh_identity_file)

        if settings.verbose >= 2:
            print('Launching horovod task function: {}'.format(task_command))
        launch_args.append([task_command])

    # Each thread will use an ssh command to launch the server on one task.
    # If an error occurs in one thread, the entire process will be
    # terminated. Otherwise, threads will keep running and the ssh session --
    # and the task server -- will be bound to the thread. In case the horovod
    # process dies, all the ssh sessions and all the task servers will die as
    # well.
    threads.execute_function_multithreaded(_run_or_die,
                                           launch_args,
                                           block_until_all_done=False)