Esempio n. 1
0
    def establish_rendezvous(self) -> Dict[str, str]:
        """Start the rendezvous server and build the gloo worker environment.

        The host string held in ``self.node_id_string`` is parsed and turned
        into a slot assignment plan for ``self.world_size`` processes. The
        plan is handed to the rendezvous server once it has been started.
        The server and its port are stored on ``self.rendezvous`` and
        ``self.global_rendezv_port``.

        Returns:
            Environment variables for each worker.
        """
        # Create the server object; it is started further below.
        server = RendezvousServer(self.settings.verbose)
        self.rendezvous = server

        # Allocate processes into slots. The host string has the form
        # "10.11.11.11:4,10.11.11.12:4".
        slot_plan = hosts.get_host_assignments(
            hosts.parse_hosts(hosts_string=self.node_id_string),
            self.world_size)

        # Start the server, remember the port it listens on, then publish
        # the slot plan to it.
        self.global_rendezv_port = server.start()
        server.init(slot_plan)

        worker_env = {
            # needs to be real address
            "HOROVOD_GLOO_RENDEZVOUS_ADDR": ray.util.get_node_ip_address(),
            "HOROVOD_GLOO_RENDEZVOUS_PORT": str(self.global_rendezv_port),
            "HOROVOD_CONTROLLER": "gloo",
            "HOROVOD_CPU_OPERATIONS": "gloo",
        }
        return worker_env
Esempio n. 2
0
    def test_get_host_assignments_heterogeneous(self):
        """Check slot assignment for hosts with different slot counts.

        'worker-0' offers one slot and 'worker-1' offers two; three
        processes are requested.
        """
        host_infos = parse_hosts('worker-0:1,worker-1:2')
        actual = get_host_assignments(host_infos, 3)

        # One row per expected slot:
        # (hostname, rank, local_rank, cross_rank, local_size, cross_size)
        layout = [
            ('worker-0', 0, 0, 0, 1, 2),
            ('worker-1', 1, 0, 1, 2, 2),
            ('worker-1', 2, 1, 0, 2, 1),
        ]
        expected = [
            SlotInfo(hostname=name,
                     rank=rank,
                     local_rank=local_rank,
                     cross_rank=cross_rank,
                     size=3,
                     local_size=local_size,
                     cross_size=cross_size)
            for name, rank, local_rank, cross_rank, local_size, cross_size
            in layout
        ]
        self.assertListEqual(actual, expected)
Esempio n. 3
0
    def establish_rendezvous(self) -> Dict[str, str]:
        """Creates the rendezvous server and identifies the nics to be used.

        The host string held in ``self.hoststring`` is parsed and turned into
        a slot assignment plan for ``self.world_size`` processes, which is
        given to the rendezvous server after it has been started. The common
        network interfaces for the hosts in ``self.hostnames_by_rank`` are
        then looked up and stored on ``self.nics``.

        Returns:
            Environment variables for each worker.
        """
        # Create the server object; it is started further below.
        server = RendezvousServer(self.settings.verbose)
        self.rendezvous = server

        # Allocate processes into slots. The host string has the form
        # "10.11.11.11:4,10.11.11.12:4".
        slot_plan = hosts.get_host_assignments(
            hosts.parse_hosts(hosts_string=self.hoststring),
            self.world_size)

        # Start the server, remember the port it listens on, then publish
        # the slot plan to it.
        self.global_rendezv_port = server.start()
        server.init(slot_plan)

        # Determine the interfaces shared by all participating hosts.
        self.nics = driver_service.get_common_interfaces(
            self.settings, list(self.hostnames_by_rank))

        worker_env = {
            "HOROVOD_GLOO_RENDEZVOUS_ADDR": ray.services.get_node_ip_address(),
            "HOROVOD_GLOO_RENDEZVOUS_PORT": str(self.global_rendezv_port),
            "HOROVOD_CONTROLLER": "gloo",
            "HOROVOD_CPU_OPERATIONS": "gloo",
        }
        # TODO: only the first common interface is used for gloo.
        worker_env["HOROVOD_GLOO_IFACE"] = str(list(self.nics)[0])
        # TODO: NCCL receives all common interfaces, comma separated.
        worker_env["NCCL_SOCKET_IFNAME"] = ",".join(self.nics)
        return worker_env
Esempio n. 4
0
    def test_get_host_assignments(self):
        """Check slot assignment for two hosts with two slots each.

        Four processes are requested, so every slot on both hosts is used.
        """
        host_infos = parse_hosts('worker-0:2,worker-1:2')
        actual = get_host_assignments(host_infos, 4)

        # Sizes shared by every expected slot.
        common = {'size': 4, 'local_size': 2, 'cross_size': 2}
        # One row per expected slot: (hostname, rank, local_rank, cross_rank)
        layout = [
            ('worker-0', 0, 0, 0),
            ('worker-0', 1, 1, 0),
            ('worker-1', 2, 0, 1),
            ('worker-1', 3, 1, 1),
        ]
        expected = [
            SlotInfo(hostname=name,
                     rank=rank,
                     local_rank=local_rank,
                     cross_rank=cross_rank,
                     **common)
            for name, rank, local_rank, cross_rank in layout
        ]
        self.assertListEqual(actual, expected)
Esempio n. 5
0
def static_driver_fn():
    """Return the rendezvous port and the host allocation plan.

    In unit test mode (module-level ``is_in_test_mode`` is truthy) no server
    is started: the module-level ``fake_server_port`` is returned together
    with a plan computed from ``worker_list``.

    Otherwise a ``RendezvousServer`` is started, initialised with the plan
    computed from ``worker_list``, and its port is returned.

    Returns:
        A ``(port, host_alloc_plan)`` tuple.
    """
    if is_in_test_mode:
        # str() keeps the message working when the fake port is an int,
        # matching how the real port is formatted below.
        print("In unit test mode. fake port: " + str(fake_server_port))
        return (fake_server_port,
                get_host_assignments(parse_hosts(worker_list), 1))

    global_rendezv = RendezvousServer(verbose=1)
    global_rendezv_port = global_rendezv.start()
    print("Rendezvous server started, port: " + str(global_rendezv_port))

    # worker_list is a host string such as "localhost:1"
    hosts = parse_hosts(worker_list)
    host_alloc_plan = get_host_assignments(hosts, 1)

    global_rendezv.init(host_alloc_plan)
    return (global_rendezv_port, host_alloc_plan)
Esempio n. 6
0
    def _get_host_plan(self):
        """Build the host allocation plan for the current worker hosts.

        Each entry of ``self._worker_hosts`` is formatted as
        ``"<host>:<_WORKER_SLOT_NUMBER>"``, the entries are joined with
        ``_HOST_SEP`` and parsed, and one process is requested per parsed
        host.

        Returns:
            The result of ``get_host_assignments`` for those hosts.
        """
        host_specs = [
            worker + ":" + str(_WORKER_SLOT_NUMBER)
            for worker in self._worker_hosts
        ]
        parsed = parse_hosts(_HOST_SEP.join(host_specs))
        return get_host_assignments(parsed, len(parsed))
Esempio n. 7
0
 def _get_host_assignments(self, current_hosts):
     """Recompute slot assignments for the currently known hosts.

     Called to adjust the assignments to account for added / removed hosts.

     :param current_hosts: object exposing ``host_assignment_order`` (hosts
         in the order they should be assigned) and ``get_slots(host)``.
     :return: tuple ``(by_host, flat)`` where ``flat`` is the list returned
         by ``hosts.get_host_assignments`` and ``by_host`` is a
         ``defaultdict(list)`` grouping those same slot infos by hostname.
     """
     host_infos = []
     for hostname in current_hosts.host_assignment_order:
         slot_count = current_hosts.get_slots(hostname)
         host_infos.append(hosts.HostInfo(hostname, slot_count))

     flat = hosts.get_host_assignments(host_infos, self._min_np, self._max_np)

     # Group the flat list by hostname, keeping the order within each host.
     by_host = defaultdict(list)
     for slot in flat:
         by_host[slot.hostname].append(slot)
     return by_host, flat
Esempio n. 8
0
    def test_get_host_assignments_elastic(self):
        """Check slot assignment when only min_np / max_np are given.

        Two hosts with two slots each are available, but ``max_np=2`` is
        expected to limit the result to the two slots on 'worker-0'.
        """
        host_infos = parse_hosts('worker-0:2,worker-1:2')
        actual = get_host_assignments(host_infos, min_np=1, max_np=2)

        # Sizes shared by every expected slot.
        common = {'size': 2, 'local_size': 2, 'cross_size': 1}
        expected = [
            SlotInfo(hostname='worker-0', rank=idx, local_rank=idx,
                     cross_rank=0, **common)
            for idx in (0, 1)
        ]
        self.assertListEqual(actual, expected)
Esempio n. 9
0
def static_driver_fn():
    """Start a rendezvous server and initialise it with the allocation plan.

    The plan is computed from the module-level ``worker_list`` host string
    for a single process.

    Returns:
        A ``(port, host_alloc_plan)`` tuple, where ``port`` is the value
        returned by starting the server.
    """
    server = RendezvousServer(verbose=1)
    port = server.start()
    print("Rendezvous server started, port: " + str(port))

    # worker_list is a host string such as "localhost:1"
    plan = get_host_assignments(parse_hosts(worker_list), 1)
    server.init(plan)

    return port, plan
Esempio n. 10
0
def launch_gloo(command, exec_command, settings, nics, env, server_ip):
    """
    Run ``command`` once per allocated slot, coordinated through a gloo
    rendezvous server. Each instance is launched via ``exec_command``.

    :param command: command to launch
    :param exec_command: means to execute a single command
    :param settings: settings for the distribution; ``output_filename``,
        ``verbose``, ``hosts`` and ``num_proc`` are read here
    :param nics: common interfaces
    :param env: environment to use
    :param server_ip: ip to use for rendezvous server
    :raises RuntimeError: if any process reports a non-zero exit code; the
        message names the failing process with the lowest timestamp
    """
    # Make the output directory if it does not exist
    if settings.output_filename:
        _mkdir_p(settings.output_filename)

    # Create the rendezvous server object; it is started further below.
    server = RendezvousServer(settings.verbose)

    # Allocate processes into slots.
    slot_plan = get_host_assignments(parse_hosts(settings.hosts),
                                     settings.num_proc)

    # Start the server on the port given in the environment (0 when the
    # variable is unset) and publish the slot plan to it.
    requested_port = int(
        os.environ.get('PEDL_HOROVOD_GLOO_RENDEZVOUS_PORT', 0))
    rendezvous_port = server.start(pedl_provisioned_port=requested_port)
    server.init(slot_plan)

    base_command = get_run_command(command, server_ip, nics, rendezvous_port)
    to_command = _slot_info_to_command_fn(base_command, env)
    shutdown_event = register_shutdown_event()

    # One argument list per slot: [command, slot_info, [shutdown event]].
    per_slot_args = []
    for slot in slot_plan:
        per_slot_args.append([to_command(slot), slot, [shutdown_event]])

    # If an error occurs in one thread, entire process will be terminated.
    # Otherwise, threads will keep running.
    results = threads.execute_function_multithreaded(
        exec_command, per_slot_args, block_until_all_done=True)

    failure_template = (
        'Horovod detected that one or more processes exited with non-zero '
        'status, thus causing the job to be terminated. The first process '
        'to do so was:\nProcess name: {name}\nExit code: {code}\n')

    # Walk the results in timestamp order so that the earliest failure is
    # the one reported.
    ordered = sorted(results.items(), key=lambda entry: entry[1][1])
    for proc_name, (exit_code, _timestamp) in ordered:
        if exit_code != 0:
            raise RuntimeError(
                failure_template.format(name=proc_name, code=exit_code))