def establish_rendezvous(self) -> Dict[str, str]:
    """Bring up the global rendezvous server for this job.

    Builds a ``RendezvousServer``, turns ``self.node_id_string`` into a slot
    allocation for ``self.world_size`` processes, starts the server and
    initialises it with that allocation. The server is kept on
    ``self.rendezvous`` and its port on ``self.global_rendezv_port``.

    Returns:
        Environment variables for each worker.
    """
    self.rendezvous = RendezvousServer(self.settings.verbose)

    # Assign processes to slots. The hosts string has the form
    # "10.11.11.11:4,10.11.11.12:4".
    slot_plan = hosts.get_host_assignments(
        hosts.parse_hosts(hosts_string=self.node_id_string),
        self.world_size,
    )

    # Starting the server yields the port it is listening on; the allocation
    # plan is handed over only after the server has been started.
    self.global_rendezv_port = self.rendezvous.start()
    self.rendezvous.init(slot_plan)

    worker_env = {}
    # The rendezvous address needs to be a real address.
    worker_env["HOROVOD_GLOO_RENDEZVOUS_ADDR"] = ray.util.get_node_ip_address()
    worker_env["HOROVOD_GLOO_RENDEZVOUS_PORT"] = str(self.global_rendezv_port)
    worker_env["HOROVOD_CONTROLLER"] = "gloo"
    worker_env["HOROVOD_CPU_OPERATIONS"] = "gloo"
    return worker_env
def test_get_host_assignments_heterogeneous(self):
    # Two hosts with unequal slot counts (1 and 2) and three processes.
    num_proc = 3
    host_infos = parse_hosts('worker-0:1,worker-1:2')
    actual = get_host_assignments(host_infos, num_proc)

    # Columns: hostname, rank, local_rank, cross_rank, local_size, cross_size
    rows = [
        ('worker-0', 0, 0, 0, 1, 2),
        ('worker-1', 1, 0, 1, 2, 2),
        ('worker-1', 2, 1, 0, 2, 1),
    ]
    expected = [
        SlotInfo(hostname=hostname, rank=rank, local_rank=local_rank,
                 cross_rank=cross_rank, size=3, local_size=local_size,
                 cross_size=cross_size)
        for hostname, rank, local_rank, cross_rank, local_size, cross_size
        in rows
    ]
    self.assertListEqual(actual, expected)
def establish_rendezvous(self) -> Dict[str, str]:
    """Creates the rendezvous server and identifies the nics to be used.

    Builds a ``RendezvousServer``, turns ``self.hoststring`` into a slot
    allocation for ``self.world_size`` processes, starts the server and
    initialises it with that allocation. The common network interfaces for
    the hosts in ``self.hostnames_by_rank`` are then looked up and stored on
    ``self.nics``.

    Returns:
        Environment variables for each worker.
    """
    self.rendezvous = RendezvousServer(self.settings.verbose)

    # Assign processes to slots. The hosts string has the form
    # "10.11.11.11:4,10.11.11.12:4".
    slot_plan = hosts.get_host_assignments(
        hosts.parse_hosts(hosts_string=self.hoststring),
        self.world_size,
    )

    # Starting the server yields the port it is listening on; the allocation
    # plan is handed over only after the server has been started.
    self.global_rendezv_port = self.rendezvous.start()
    self.rendezvous.init(slot_plan)

    rank_hostnames = list(self.hostnames_by_rank)
    self.nics = driver_service.get_common_interfaces(
        self.settings, rank_hostnames)

    worker_env = {}
    worker_env["HOROVOD_GLOO_RENDEZVOUS_ADDR"] = (
        ray.services.get_node_ip_address())
    worker_env["HOROVOD_GLOO_RENDEZVOUS_PORT"] = str(self.global_rendezv_port)
    worker_env["HOROVOD_CONTROLLER"] = "gloo"
    worker_env["HOROVOD_CPU_OPERATIONS"] = "gloo"
    # Gloo gets only the first common interface; NCCL gets all of them.
    worker_env["HOROVOD_GLOO_IFACE"] = str(list(self.nics)[0])  # TODO
    worker_env["NCCL_SOCKET_IFNAME"] = ",".join(self.nics)  # TODO
    return worker_env
def test_get_host_assignments(self):
    # Two hosts with two slots each and four processes.
    num_proc = 4
    host_infos = parse_hosts('worker-0:2,worker-1:2')
    actual = get_host_assignments(host_infos, num_proc)

    # Columns: hostname, rank, local_rank, cross_rank. The size fields are
    # the same for every slot.
    placements = [
        ('worker-0', 0, 0, 0),
        ('worker-0', 1, 1, 0),
        ('worker-1', 2, 0, 1),
        ('worker-1', 3, 1, 1),
    ]
    expected = [
        SlotInfo(hostname=hostname, rank=rank, local_rank=local_rank,
                 cross_rank=cross_rank, size=4, local_size=2, cross_size=2)
        for hostname, rank, local_rank, cross_rank in placements
    ]
    self.assertListEqual(actual, expected)
def static_driver_fn():
    """Start a rendezvous server for the configured workers.

    Reads ``is_in_test_mode``, ``fake_server_port`` and ``worker_list`` from
    the enclosing scope.

    Returns:
        A ``(port, host_alloc_plan)`` tuple. In test mode no server is
        started and ``fake_server_port`` is returned as the port, unchanged.
    """
    if is_in_test_mode:
        # str() keeps the message working when the fake port is not already a
        # string; the non-test path below formats its port the same way.
        print("In unit test mode. fake port: " + str(fake_server_port))
        return (fake_server_port,
                get_host_assignments(parse_hosts(worker_list), 1))

    global_rendezv = RendezvousServer(verbose=1)
    global_rendezv_port = global_rendezv.start()
    print("Rendezvous server started, port: " + str(global_rendezv_port))

    # worker_list has the form "localhost:1"
    hosts = parse_hosts(worker_list)
    host_alloc_plan = get_host_assignments(hosts, 1)

    # The server receives the allocation plan only after it has been started.
    global_rendezv.init(host_alloc_plan)
    return (global_rendezv_port, host_alloc_plan)
def _get_host_plan(self):
    """Return the host allocation plan for ``self._worker_hosts``.

    Each worker host is rendered as ``<host>:<_WORKER_SLOT_NUMBER>``; the
    entries are joined with ``_HOST_SEP`` and parsed, and one process is
    requested per parsed host.
    """
    slot_count = str(_WORKER_SLOT_NUMBER)
    host_specs = [
        ":".join((worker_host, slot_count))
        for worker_host in self._worker_hosts
    ]
    parsed_hosts = parse_hosts(_HOST_SEP.join(host_specs))
    return get_host_assignments(parsed_hosts, len(parsed_hosts))
def _get_host_assignments(self, current_hosts):
    """Compute slot assignments for the current set of hosts.

    Hosts are taken in ``current_hosts.host_assignment_order``, each with the
    slot count reported by ``current_hosts.get_slots``, and assigned using
    ``self._min_np`` and ``self._max_np``.

    Returns:
        A ``(by_hostname, flat_list)`` tuple: a ``defaultdict`` mapping each
        hostname to its slot infos, and the flat list of slot infos as
        returned by ``hosts.get_host_assignments``.
    """
    # Rebuild the host list so added / removed hosts are accounted for.
    host_infos = []
    for hostname in current_hosts.host_assignment_order:
        slot_count = current_hosts.get_slots(hostname)
        host_infos.append(hosts.HostInfo(hostname, slot_count))

    slot_infos = hosts.get_host_assignments(
        host_infos, self._min_np, self._max_np)

    slots_by_host = defaultdict(list)
    for slot in slot_infos:
        slots_by_host[slot.hostname].append(slot)

    return slots_by_host, slot_infos
def test_get_host_assignments_elastic(self):
    # Elastic call: the process count is given as a min/max range rather than
    # a fixed number.
    host_infos = parse_hosts('worker-0:2,worker-1:2')
    actual = get_host_assignments(host_infos, min_np=1, max_np=2)

    # Both expected slots are on worker-0; rank and local_rank coincide.
    expected = [
        SlotInfo(hostname='worker-0', rank=index, local_rank=index,
                 cross_rank=0, size=2, local_size=2, cross_size=1)
        for index in (0, 1)
    ]
    self.assertListEqual(actual, expected)
def static_driver_fn():
    """Start a rendezvous server and register the worker allocation with it.

    Reads ``worker_list`` from the enclosing scope.

    Returns:
        A ``(port, host_alloc_plan)`` tuple.
    """
    server = RendezvousServer(verbose=1)
    port = server.start()
    print("Rendezvous server started, port: " + str(port))

    # worker_list has the form "localhost:1"
    plan = get_host_assignments(parse_hosts(worker_list), 1)

    # The server receives the allocation plan only after it has been started.
    server.init(plan)
    return (port, plan)
def launch_gloo(command, exec_command, settings, nics, env, server_ip):
    """
    Launches the given command once per allocated slot using gloo.
    Each command is launched via exec_command.

    :param command: command to launch
    :param exec_command: means to execute a single command
    :param settings: settings for the distribution
    :param nics: common interfaces
    :param env: environment to use
    :param server_ip: ip to use for rendezvous server
    :raises RuntimeError: if a launched process reports a non-zero exit code
    """
    # Make the output directory if it does not exist
    if settings.output_filename:
        _mkdir_p(settings.output_filename)

    server = RendezvousServer(settings.verbose)

    # Allocate processes into slots.
    slot_plan = get_host_assignments(
        parse_hosts(settings.hosts), settings.num_proc)

    # Start the rendezvous server and learn the port it is listening on. The
    # port from PEDL_HOROVOD_GLOO_RENDEZVOUS_PORT is passed along, or 0 when
    # that variable is unset.
    requested_port = int(
        os.environ.get('PEDL_HOROVOD_GLOO_RENDEZVOUS_PORT', 0))
    listen_port = server.start(pedl_provisioned_port=requested_port)
    server.init(slot_plan)

    to_command = _slot_info_to_command_fn(
        get_run_command(command, server_ip, nics, listen_port), env)
    shutdown_event = register_shutdown_event()

    launch_args = []
    for slot in slot_plan:
        launch_args.append([to_command(slot), slot, [shutdown_event]])

    # If an error occurs in one thread, entire process will be terminated.
    # Otherwise, threads will keep running.
    results = threads.execute_function_multithreaded(
        exec_command, launch_args, block_until_all_done=True)

    # Each result value unpacks as (exit code, timestamp). Walking the results
    # in timestamp order means the first failure found is the one reported.
    by_timestamp = sorted(results.items(), key=lambda entry: entry[1][1])
    for proc_name, (exit_code, _timestamp) in by_timestamp:
        if exit_code == 0:
            continue
        raise RuntimeError(
            'Horovod detected that one or more processes exited with non-zero '
            'status, thus causing the job to be terminated. The first process '
            'to do so was:\nProcess name: {name}\nExit code: {code}\n'.
            format(name=proc_name, code=exit_code))