def _get_host_assignments(self, current_hosts):
    """
    Recompute the slot assignments for the hosts that are currently available,
    so that added / removed hosts are accounted for.

    :param current_hosts: object exposing ``host_assignment_order`` (an iterable
                          of hostnames) and ``get_slots(hostname)``
    :return: tuple ``(assignments_by_host, assignments)`` where
             ``assignments`` is the list returned by
             ``hosts.get_host_assignments`` and ``assignments_by_host`` is a
             ``defaultdict(list)`` grouping those same entries by ``hostname``
    """
    # Describe every host, in assignment order, together with its slot count
    host_infos = []
    for hostname in current_hosts.host_assignment_order:
        slot_count = current_hosts.get_slots(hostname)
        host_infos.append(hosts.HostInfo(hostname, slot_count))

    # Let the hosts module distribute processes within the min / max bounds
    assignments = hosts.get_host_assignments(host_infos, self._min_np, self._max_np)

    # Index the flat assignment list by the host each slot lives on
    assignments_by_host = defaultdict(list)
    for assignment in assignments:
        assignments_by_host[assignment.hostname].append(assignment)

    return assignments_by_host, assignments
def launch_gloo(command, exec_command, settings, nics, env, server_ip):
    """
    Run the given command once per allocated slot using gloo.
    Every individual launch is delegated to exec_command.

    :param command: command to launch
    :param exec_command: means to execute a single command
    :param settings: settings for the distribution
    :param nics: common interfaces
    :param env: environment to use
    :param server_ip: ip to use for rendezvous server
    :raises RuntimeError: if any process reports a non-zero exit code; the
                          process named in the message is the first failing
                          one when results are ordered by their timestamp
    """
    # Create the output directory when one has been configured
    if settings.output_filename:
        _mkdir_p(settings.output_filename)

    # Create the global rendezvous server (it is started further below)
    rendezvous_server = RendezvousServer(settings.verbose)

    # Allocate processes into slots
    parsed_hosts = parse_hosts(settings.hosts)
    allocation_plan = get_host_assignments(parsed_hosts, settings.num_proc)

    # Start the global rendezvous server, remember the port it is listening
    # on, and initialise its httpd with the allocation plan
    rendezvous_port = rendezvous_server.start_server()
    rendezvous_server.httpd.init(allocation_plan)

    base_command = get_run_command(command, server_ip, nics, rendezvous_port)
    build_slot_command = _slot_info_to_command_fn(base_command, env)
    shutdown_event = register_shutdown_event()

    # One argument list per slot: [command for the slot, slot info, [event]]
    launch_args = []
    for slot in allocation_plan:
        launch_args.append([build_slot_command(slot), slot, [shutdown_event]])

    # If an error occurs in one thread, entire process will be terminated.
    # Otherwise, threads will keep running.
    results = threads.execute_function_multithreaded(
        exec_command, launch_args, block_until_all_done=True)

    # Walk the results ordered by timestamp and report the first failure
    ordered_results = sorted(results.items(), key=lambda entry: entry[1][1])
    for process_name, (exit_code, _timestamp) in ordered_results:
        if exit_code == 0:
            continue
        raise RuntimeError(
            'Horovod detected that one or more processes exited with non-zero '
            'status, thus causing the job to be terminated. The first process '
            'to do so was:\nProcess name: {name}\nExit code: {code}\n'.format(
                name=process_name, code=exit_code))