Ejemplo n.º 1
0
def static_driver_fn():
    """Start a rendezvous server and initialise it with a host plan.

    The host string is read from the module-level ``worker_list`` (for
    example ``"localhost:1"``) and the plan is requested with a count of 1.

    Returns:
        A ``(port, host_alloc_plan)`` tuple: the value returned by the
        server's ``start()`` and the plan that was passed to ``init()``.
    """
    server = RendezvousServer(verbose=1)
    port = server.start()
    print("Rendezvous server started, port: %s" % (port,))

    # Parse the configured hosts and build the allocation plan from them.
    plan = get_host_assignments(parse_hosts(worker_list), 1)
    server.init(plan)

    return (port, plan)
Ejemplo n.º 2
0
class HorovodRendezvousServer(object):
    """Wrap a ``RendezvousServer`` and track the worker hosts it serves.

    A worker's rank is its position in the most recently applied host list,
    and the rendezvous id counts how many times that list has changed.
    """

    def __init__(self, server_host):
        self._rendezvous_host = server_host
        self._rendezvous_port = None  # assigned by start()
        self._rendezvous_server = RendezvousServer(verbose=True)
        self._worker_hosts = []
        self._rendezvous_id = 0

    def start(self):
        """Start the wrapped server and remember the port it returns."""
        self._rendezvous_port = self._rendezvous_server.start()

    def set_worker_hosts(self, worker_hosts):
        """
        Set worker hosts into RendezvousServer.

        Nothing happens when the new list holds the same hosts as the
        current one (order is ignored); otherwise the rendezvous id is
        bumped and the server is re-initialised with a fresh plan.

        Args:
            worker_hosts: List of host string.
        """
        hosts_changed = sorted(worker_hosts) != sorted(self._worker_hosts)
        if hosts_changed:
            self._rendezvous_id += 1
            self._worker_hosts = worker_hosts
            self._rendezvous_server.init(self._get_host_plan())

    def _get_host_plan(self):
        """Build the host allocation plan for the current worker hosts."""
        # Every host is rendered as "<host>:<slots>" before parsing.
        host_specs = [
            host + ":" + str(_WORKER_SLOT_NUMBER)
            for host in self._worker_hosts
        ]
        host_infos = parse_hosts(_HOST_SEP.join(host_specs))
        return get_host_assignments(host_infos, len(host_infos))

    def get_rendezvous_host(self):
        """Return the server host given at construction time."""
        return self._rendezvous_host

    def get_rendezvous_port(self):
        """Return the port recorded by ``start()`` (``None`` before that)."""
        return self._rendezvous_port

    def get_worker_host_rank(self, host):
        """Return the index of ``host`` in the worker list, or -1 if absent."""
        try:
            return self._worker_hosts.index(host)
        except ValueError:
            # Host is not part of the current worker list.
            return -1

    def get_size(self):
        """Return the number of current worker hosts."""
        return len(self._worker_hosts)

    def get_rendezvous_id(self):
        """Return the current rendezvous id."""
        return self._rendezvous_id
Ejemplo n.º 3
0
def launch_gloo(command, exec_command, settings, nics, env, server_ip):
    """
    Run ``command`` once per allocated slot, coordinated through gloo.

    A rendezvous server is created, started and initialised with the host
    allocation plan; each slot's command is then run through
    ``exec_command`` by the multithreaded executor.

    :param command: command to launch
    :param exec_command: means to execute a single command
    :param settings: settings for the distribution; ``output_filename``,
        ``verbose``, ``hosts`` and ``num_proc`` are read here
    :param nics: common interfaces
    :param env: environment to use
    :param server_ip: ip to use for rendezvous server
    :raises RuntimeError: if any process reports a non-zero exit code; the
        message names the failing process with the smallest timestamp
    """
    # Make the output directory if it does not exist
    if settings.output_filename:
        _mkdir_p(settings.output_filename)

    server = RendezvousServer(settings.verbose)

    # Spread the requested number of processes over the configured hosts.
    slot_plan = get_host_assignments(parse_hosts(settings.hosts),
                                     settings.num_proc)

    # The port may be provided through the environment; 0 is passed to the
    # server when the variable is not set.
    provisioned = os.environ.get('PEDL_HOROVOD_GLOO_RENDEZVOUS_PORT', 0)
    port = server.start(pedl_provisioned_port=int(provisioned))
    server.init(slot_plan)

    run_command = get_run_command(command, server_ip, nics, port)
    to_command = _slot_info_to_command_fn(run_command, env)
    shutdown_event = register_shutdown_event()

    # One argument triple per slot: the command, the slot, shared events.
    args_list = []
    for slot in slot_plan:
        args_list.append([to_command(slot), slot, [shutdown_event]])

    # If an error occurs in one thread, entire process will be terminated.
    # Otherwise, threads will keep running.
    res = threads.execute_function_multithreaded(exec_command,
                                                 args_list,
                                                 block_until_all_done=True)

    # Walk the results in timestamp order so the earliest failure is the
    # one that gets reported.
    by_timestamp = sorted(res.items(), key=lambda entry: entry[1][1])
    for name, (exit_code, _timestamp) in by_timestamp:
        if exit_code == 0:
            continue
        raise RuntimeError(
            'Horovod detected that one or more processes exited with non-zero '
            'status, thus causing the job to be terminated. The first process '
            'to do so was:\nProcess name: {name}\nExit code: {code}\n'.
            format(name=name, code=exit_code))
Ejemplo n.º 4
0
def static_driver_fn():
    """Return the rendezvous port and host allocation plan for the driver.

    When the module-level ``is_in_test_mode`` flag is truthy no server is
    created: ``fake_server_port`` is returned together with a plan built
    from ``worker_list``. Otherwise a ``RendezvousServer`` is started and
    initialised with that plan.

    Returns:
        A ``(port, host_alloc_plan)`` tuple.
    """
    if is_in_test_mode:
        # str() keeps the message printable when the fake port is an int,
        # matching how the real port is printed below.
        print("In unit test mode. fake port: " + str(fake_server_port))
        return (fake_server_port,
                get_host_assignments(parse_hosts(worker_list), 1))

    global_rendezv = RendezvousServer(verbose=1)
    global_rendezv_port = global_rendezv.start()
    print("Rendezvous server started, port: " + str(global_rendezv_port))

    # worker_list is a host string such as "localhost:1".
    hosts = parse_hosts(worker_list)
    host_alloc_plan = get_host_assignments(hosts, 1)

    global_rendezv.init(host_alloc_plan)
    return (global_rendezv_port, host_alloc_plan)
Ejemplo n.º 5
0
class Coordinator:
    """Responsible for instantiating the Rendezvous server.

    Workers register with a hostname, a node id and a world rank; the
    coordinator groups world ranks by node id and derives the per-rank
    Horovod topology values from that grouping.

    Args:
        settings: Horovod Settings object."""
    rendezvous = None
    global_rendezv_port = None
    nics = None
    node_id_by_rank = None

    def __init__(
        self,
        settings,
    ):
        self.settings = settings
        self._hostnames = set()
        # Maps node id -> world ranks registered on that node, in
        # registration order.
        self.node_id_by_rank = defaultdict(list)

    @property
    def world_size(self) -> int:
        """Total number of registered world ranks."""
        return sum(map(len, self.node_id_by_rank.values()))

    @property
    def hostnames(self):
        """Set of hostnames seen by ``register``."""
        return self._hostnames

    @property
    def node_id_string(self) -> str:
        """Comma-separated ``<node_id>:<rank count>`` entries."""
        entries = []
        for node_id, ranks in self.node_id_by_rank.items():
            entries.append(f"{node_id}:{len(ranks)}")
        return ",".join(entries)

    def register(self, hostname: str, node_id: str, world_rank: int):
        """Record that ``world_rank`` runs on ``node_id`` / ``hostname``."""
        self.node_id_by_rank[node_id].append(world_rank)
        self._hostnames.add(hostname)

    def finalize_registration(self) -> dict:
        """Return a dictionary for all ranks."""
        ranks_per_node = list(self.node_id_by_rank.values())

        # Pass 1: a rank's cross rank is the number of earlier nodes that
        # also have a worker at the same local rank; the running counter
        # ends up holding the cross size for each local rank.
        nodes_with_local_rank = defaultdict(int)
        cross_rank_of = {}
        for ranks in ranks_per_node:
            for local_rank, world_rank in enumerate(ranks):
                cross_rank_of[world_rank] = nodes_with_local_rank[local_rank]
                nodes_with_local_rank[local_rank] += 1

        # Pass 2: assemble the per-rank settings now that the totals are
        # known.
        rank_to_info = {}
        for ranks in ranks_per_node:
            local_size = len(ranks)
            for local_rank, world_rank in enumerate(ranks):
                rank_to_info[world_rank] = {
                    "HOROVOD_CROSS_RANK": cross_rank_of[world_rank],
                    "HOROVOD_CROSS_SIZE": nodes_with_local_rank[local_rank],
                    "HOROVOD_LOCAL_RANK": local_rank,
                    "HOROVOD_LOCAL_SIZE": local_size,
                }
        return rank_to_info

    def establish_rendezvous(self) -> Dict[str, str]:
        """Creates the rendezvous server and initialises it.

        Returns:
            Environment variables for each worker.
        """
        # Create the global rendezvous server.
        self.rendezvous = RendezvousServer(self.settings.verbose)

        # Allocate processes into slots; the host string has the form
        # "10.11.11.11:4,10.11.11.12:4".
        parsed_node_ids = hosts.parse_hosts(hosts_string=self.node_id_string)
        host_alloc_plan = hosts.get_host_assignments(parsed_node_ids,
                                                     self.world_size)

        # Start the server, keep the port it listens on, and hand it the
        # allocation plan.
        self.global_rendezv_port = self.rendezvous.start()
        self.rendezvous.init(host_alloc_plan)

        worker_env = {}
        # needs to be real address
        worker_env["HOROVOD_GLOO_RENDEZVOUS_ADDR"] = (
            ray.util.get_node_ip_address())
        worker_env["HOROVOD_GLOO_RENDEZVOUS_PORT"] = str(
            self.global_rendezv_port)
        worker_env["HOROVOD_CONTROLLER"] = "gloo"
        worker_env["HOROVOD_CPU_OPERATIONS"] = "gloo"
        return worker_env
Ejemplo n.º 6
0
class HorovodRendezvousServer(object):
    """Wrap a ``RendezvousServer`` with staged worker-host updates.

    A new host list is first stored as pending by ``set_worker_hosts``.
    It only becomes the active list inside ``get_worker_host_rank``, and
    only once the current rendezvous has been marked completed, i.e. every
    active host has queried its rank.
    """

    def __init__(self, server_host):
        self._rendezvous_host = server_host
        self._rendezvous_port = None
        self._rendezvous_server = RendezvousServer(verbose=True)
        self._rendezvous_id = 0
        self._rendezvous_completed = True
        self._worker_hosts = []
        self._next_worker_hosts = None
        self._ready_worker_hosts = set()
        self._lock = Lock()

    def start(self):
        """Start the wrapped server and remember the port it returns."""
        self._rendezvous_port = self._rendezvous_server.start()

    def set_worker_hosts(self, worker_hosts):
        """
        Set worker hosts into RendezvousServer.

        The list is only stored as the pending host list, and only when it
        differs (ignoring order) from the active one.

        Args:
            worker_hosts: List of host string.
        """
        if sorted(worker_hosts) == sorted(self._worker_hosts):
            return
        self._next_worker_hosts = worker_hosts

    def _init_rendezvous_server(self):
        """Promote the pending hosts and re-initialise the server."""
        self._worker_hosts, self._next_worker_hosts = (
            self._next_worker_hosts, None)
        self._rendezvous_server.init(self._get_host_plan())
        self._rendezvous_id += 1
        self._rendezvous_completed = False

    def _get_host_plan(self):
        """Build the host allocation plan for the active worker hosts."""
        # Every host is rendered as "<host>:<slots>" before parsing.
        host_specs = [
            host + ":" + str(_WORKER_SLOT_NUMBER)
            for host in self._worker_hosts
        ]
        host_infos = parse_hosts(_HOST_SEP.join(host_specs))
        return get_host_assignments(host_infos, len(host_infos))

    def get_rendezvous_host(self):
        """Return the server host given at construction time."""
        return self._rendezvous_host

    def get_rendezvous_port(self):
        """Return the port recorded by ``start()`` (``None`` before that)."""
        return self._rendezvous_port

    def get_worker_host_rank(self, host):
        """Return the rank of ``host`` in the active list, or -1 if absent.

        A pending host list is applied first when the current rendezvous
        is completed. Each query from an active host also counts towards
        completing the current rendezvous.
        """
        with self._lock:
            start_next = (self._next_worker_hosts
                          and self._rendezvous_completed)
            if start_next:
                time.sleep(2)  # Wait 2s for workers to complete rendezvous.
                self._init_rendezvous_server()

            try:
                rank = self._worker_hosts.index(host)
            except ValueError:
                # -1 if host not in worker_hosts list.
                return -1

            if not self._rendezvous_completed:
                self._ready_worker_hosts.add(host)
                # If all active workers in the rendezvous are ready,
                # the server can start to set hosts for the next rendezvous
                all_ready = (
                    self._ready_worker_hosts == set(self._worker_hosts))
                if all_ready:
                    self._rendezvous_completed = True
                    self._ready_worker_hosts = set()

            return rank

    def get_size(self):
        """Return the number of active worker hosts."""
        return len(self._worker_hosts)

    def get_rendezvous_id(self):
        """Return the current rendezvous id."""
        return self._rendezvous_id
Ejemplo n.º 7
0
class HorovodRendezvousServer(object):
    """The rendezvous server can collect worker hosts (ip) to
    help these workers to build an AllReduce ring using `hvd.init`.

    Membership changes made through ``add_worker`` / ``remove_worker`` are
    staged in a "next" host list. The staged list becomes the current one
    inside ``get_worker_host_rank`` once the current rendezvous has been
    completed, i.e. every current host has queried its rank.

    The state transition diagram of the server is:

                    |------------------|
                    |       start      |
                    |next_hosts = None |
                    |------------------|
                            | worker-0 sends the start
                            | message
                            |
                     |------------------|
                 |-- |next_hosts = [0]  |------------------|
                 |   |------------------|                  |
worker-1 sends   |                        worker-0 queries |
the start message|                                  a rank |
    |---------------------| worker-0 queries   |--------------------|
    |next_hosts = [0, 1]  |     a rank         |cur_hosts=next_hosts|
    |                     | ---------------->  |next_hosts=None     |
    |---------------------|                    | ready_hosts adds    |
                                               | the worker         |<---|
                                  |<---------  |--------------------|    |
                worker-2 sends    |                                      |
                the start message |              worker-2 queries        |
                    |-------------------------|  a rank and              |
                    |next_hosts=cur_hosts+[2] |  ready_hosts=cur_hosts   |
                    | ------------------------|  ----------------------->|
    """
    def __init__(self, server_host):
        self._rendezvous_host = server_host
        self._init_attributes()

    def _init_attributes(self):
        """Reset all rendezvous state and create a fresh wrapped server."""
        self._rendezvous_id = 0
        self._cur_rendezvous_hosts = []
        self._rendezvous_server = RendezvousServer(verbose=True)
        self._rendezvous_port = None
        # None means "no staged change"; a list is the membership that the
        # next rendezvous will use.
        self._next_rendezvous_hosts = None
        self._ready_worker_hosts = set()
        self._cur_rendezvous_completed = True
        self._lock = Lock()

    def start(self):
        """Start the wrapped server and remember the port it returns."""
        self._rendezvous_port = self._rendezvous_server.start()

    def _init_rendezvous_server(self):
        """Promote the staged hosts and re-initialise the wrapped server."""
        logger.info("Initialize rendezvous server with hosts {}".format(
            self._next_rendezvous_hosts))
        self._cur_rendezvous_hosts = self._next_rendezvous_hosts
        self._next_rendezvous_hosts = None
        host_alloc_plan = self._get_host_plan()
        self._rendezvous_server.init(host_alloc_plan)
        self._rendezvous_id += 1
        self._cur_rendezvous_completed = False

    def _get_host_plan(self):
        """Build the host allocation plan for the current hosts."""
        # Every host is rendered as "<host>:<slots>" before parsing.
        hosts = [
            host + ":" + str(_WORKER_SLOT_NUMBER)
            for host in self._cur_rendezvous_hosts
        ]
        host_infos = parse_hosts(_HOST_SEP.join(hosts))
        return get_host_assignments(host_infos, len(host_infos))

    def get_rendezvous_host(self):
        """Return the server host given at construction time."""
        return self._rendezvous_host

    def get_rendezvous_port(self):
        """Return the port recorded by ``start()`` (``None`` before that)."""
        return self._rendezvous_port

    def get_worker_host_rank(self, host):
        """Return the rank of ``host`` in the current rendezvous, or -1.

        Staged hosts are applied first when the current rendezvous is
        completed. Each query from a current host also counts towards
        completing the current rendezvous.
        """
        with self._lock:
            if self._next_rendezvous_hosts and self._cur_rendezvous_completed:
                time.sleep(2)  # Wait 2s for workers to complete rendezvous.
                self._init_rendezvous_server()

            # -1 if host not in worker_hosts list.
            if host not in self._cur_rendezvous_hosts:
                return -1

            if not self._cur_rendezvous_completed:
                self._ready_worker_hosts.add(host)
                # If all active workers in the rendezvous are ready,
                # the server can start to set hosts for the next rendezvous
                if self._ready_worker_hosts == set(self._cur_rendezvous_hosts):
                    self._cur_rendezvous_completed = True
                    self._ready_worker_hosts = set()

            return self._cur_rendezvous_hosts.index(host)

    def get_size(self):
        """Return the number of hosts in the current rendezvous."""
        return len(self._cur_rendezvous_hosts)

    def get_rendezvous_id(self):
        """Return the current rendezvous id."""
        return self._rendezvous_id

    def add_worker(self, worker_host):
        """Stage ``worker_host`` for the next rendezvous.

        Falsy hosts and hosts that are already staged are ignored.
        """
        with self._lock:
            logger.info(
                "Add worker host {} into rendenzvous and cur hosts {}.".format(
                    worker_host, self._cur_rendezvous_hosts))
            if worker_host:
                if self._next_rendezvous_hosts is None:
                    self._next_rendezvous_hosts = copy.deepcopy(
                        self._cur_rendezvous_hosts)
                # Master will not add any worker if the current rendezvous
                # hosts become empty after starting training.
                if self._rendezvous_id > 0 and not self._next_rendezvous_hosts:
                    return
                if worker_host not in self._next_rendezvous_hosts:
                    self._next_rendezvous_hosts.append(worker_host)

    def remove_worker(self, worker_host):
        """Stage the removal of ``worker_host`` from the next rendezvous.

        Only hosts in the current rendezvous are considered. Removing the
        same host again before the staged list is applied is a no-op.
        """
        with self._lock:
            logger.info(
                "Remove worker host {} from rendenzvous.".format(worker_host))
            if worker_host in self._cur_rendezvous_hosts:
                if self._next_rendezvous_hosts is None:
                    self._next_rendezvous_hosts = copy.deepcopy(
                        self._cur_rendezvous_hosts)
                # The host stays in the current list until the staged one is
                # applied, so it may already be gone from the staged list;
                # an unguarded lookup would raise ValueError in that case.
                if worker_host in self._next_rendezvous_hosts:
                    self._next_rendezvous_hosts.remove(worker_host)
Ejemplo n.º 8
0
class Coordinator:
    """Responsible for instantiating the Rendezvous server.

    Workers register with a hostname and a world rank; the coordinator
    groups world ranks by hostname and derives per-rank node/local
    rank information from that grouping.

    Args:
        settings: Horovod Settings object."""
    rendezvous = None
    global_rendezv_port = None
    nics = None
    hostnames = None

    def __init__(
        self,
        settings,
    ):
        self.settings = settings
        # Maps hostname -> world ranks registered on that host, in
        # registration order.
        self.hostnames_by_rank = defaultdict(list)

    @property
    def world_size(self) -> int:
        """Total number of registered world ranks."""
        return sum(map(len, self.hostnames_by_rank.values()))

    @property
    def hoststring(self) -> str:
        """Comma-separated ``<host>:<rank count>`` entries."""
        entries = []
        for host, ranks in self.hostnames_by_rank.items():
            entries.append(f"{host}:{len(ranks)}")
        return ",".join(entries)

    def register(self, hostname: str, world_rank: int):
        """Record that ``world_rank`` runs on ``hostname``."""
        self.hostnames_by_rank[hostname].append(world_rank)

    def finalize_registration(self) -> dict:
        """Return a dictionary for all ranks."""
        rank_to_info = {}
        # Hosts are numbered in the order they were first registered.
        rank_lists = self.hostnames_by_rank.values()
        for node_world_rank, ranks in enumerate(rank_lists):
            for local_rank, world_rank in enumerate(ranks):
                rank_to_info[world_rank] = {
                    "NODE_WORLD_RANK": node_world_rank,
                    "NODE_WORLD_SIZE": len(self.hostnames_by_rank),
                    "LOCAL_RANK": local_rank,
                    "LOCAL_SIZE": len(ranks),
                }
        return rank_to_info

    def establish_rendezvous(self) -> Dict[str, str]:
        """Creates the rendezvous server and identifies the nics to be used.

        Returns:
            Environment variables for each worker.
        """
        # Create the global rendezvous server.
        self.rendezvous = RendezvousServer(self.settings.verbose)

        # Allocate processes into slots; the host string has the form
        # "10.11.11.11:4,10.11.11.12:4".
        parsed_hosts = hosts.parse_hosts(hosts_string=self.hoststring)
        host_alloc_plan = hosts.get_host_assignments(parsed_hosts,
                                                     self.world_size)

        # Start the server, keep the port it listens on, and hand it the
        # allocation plan.
        self.global_rendezv_port = self.rendezvous.start()
        self.rendezvous.init(host_alloc_plan)

        # Interfaces shared by every registered host.
        registered_hosts = list(self.hostnames_by_rank)
        self.nics = driver_service.get_common_interfaces(
            self.settings, registered_hosts)

        worker_env = {}
        worker_env["HOROVOD_GLOO_RENDEZVOUS_ADDR"] = (
            ray.services.get_node_ip_address())
        worker_env["HOROVOD_GLOO_RENDEZVOUS_PORT"] = str(
            self.global_rendezv_port)
        worker_env["HOROVOD_CONTROLLER"] = "gloo"
        worker_env["HOROVOD_CPU_OPERATIONS"] = "gloo"
        # TODO: only the first common interface is used for gloo.
        worker_env["HOROVOD_GLOO_IFACE"] = str(list(self.nics)[0])
        # TODO
        worker_env["NCCL_SOCKET_IFNAME"] = ",".join(self.nics)
        return worker_env