def static_driver_fn():
    """Start a rendezvous server and register the worker allocation plan.

    Reads ``worker_list`` from the enclosing scope (e.g. "localhost:1").

    Returns:
        Tuple ``(port, host_alloc_plan)``: the port returned by the
        server's ``start()`` and the plan passed to its ``init()``.
    """
    server = RendezvousServer(verbose=1)
    port = server.start()
    print("Rendezvous server started, port: " + str(port))

    # The plan is built with a process count of 1.
    parsed_hosts = parse_hosts(worker_list)
    plan = get_host_assignments(parsed_hosts, 1)
    server.init(plan)
    return port, plan
class HorovodRendezvousServer(object):
    """Wrap a ``RendezvousServer`` and track the worker hosts installed in it.

    Every call to ``set_worker_hosts`` with a different set of hosts
    re-initializes the wrapped server and increments the rendezvous id.

    Args:
        server_host: Host value reported by ``get_rendezvous_host``.
    """

    def __init__(self, server_host):
        self._rendezvous_host = server_host
        # Incremented each time a different set of hosts is installed.
        self._rendezvous_id = 0
        self._worker_hosts = []
        self._rendezvous_server = RendezvousServer(verbose=True)
        # Stays None until start() is called.
        self._rendezvous_port = None

    def start(self):
        """Start the wrapped server and remember the port it returns."""
        self._rendezvous_port = self._rendezvous_server.start()

    def set_worker_hosts(self, worker_hosts):
        """
        Set worker hosts into RendezvousServer.

        Nothing happens when ``worker_hosts`` holds the same hosts as the
        current list (order is ignored). Otherwise the rendezvous id is
        incremented and the wrapped server is re-initialized.

        Args:
            worker_hosts: List of host string.
        """
        # Take a snapshot. Keeping a reference to the caller's list would let
        # a later in-place edit change self._worker_hosts too, so the equality
        # check below would compare the list with itself and miss the change.
        new_hosts = list(worker_hosts)
        if sorted(new_hosts) == sorted(self._worker_hosts):
            return

        self._rendezvous_id += 1
        self._worker_hosts = new_hosts
        self._rendezvous_server.init(self._get_host_plan())

    def _get_host_plan(self):
        """Build the host allocation plan for the current worker hosts."""
        host_specs = [
            host + ":" + str(_WORKER_SLOT_NUMBER)
            for host in self._worker_hosts
        ]
        host_infos = parse_hosts(_HOST_SEP.join(host_specs))
        return get_host_assignments(host_infos, len(host_infos))

    def get_rendezvous_host(self):
        """Return the host passed to the constructor."""
        return self._rendezvous_host

    def get_rendezvous_port(self):
        """Return the port stored by ``start``, or None before that."""
        return self._rendezvous_port

    def get_worker_host_rank(self, host):
        """Return the index of ``host`` in the worker host list, or -1."""
        try:
            return self._worker_hosts.index(host)
        except ValueError:
            # -1 if host not in worker_hosts list.
            return -1

    def get_size(self):
        """Return the number of current worker hosts."""
        return len(self._worker_hosts)

    def get_rendezvous_id(self):
        """Return how many times a new host set has been installed."""
        return self._rendezvous_id
def launch_gloo(command, exec_command, settings, nics, env, server_ip):
    """
    Run the given command once per allocated slot, coordinated through gloo.
    Every per-slot command is executed via exec_command.

    :param command: command to launch
    :param exec_command: means to execute a single command
    :param settings: settings for the distribution
    :param nics: common interfaces
    :param env: environment to use
    :param server_ip: ip to use for rendezvous server
    :raises RuntimeError: if any launched process reports a non-zero exit
        code; the message names the failing entry with the smallest
        recorded timestamp.
    """
    # Make the output directory if it does not exist
    output_dir = settings.output_filename
    if output_dir:
        _mkdir_p(output_dir)

    # create the global rendezvous server (it is started further down)
    server = RendezvousServer(settings.verbose)

    # allocate processes into slots
    slots = get_host_assignments(parse_hosts(settings.hosts),
                                 settings.num_proc)

    # start the server on the port from PEDL_HOROVOD_GLOO_RENDEZVOUS_PORT;
    # 0 is passed when the variable is unset
    requested_port = int(
        os.environ.get('PEDL_HOROVOD_GLOO_RENDEZVOUS_PORT', 0))
    listen_port = server.start(pedl_provisioned_port=requested_port)
    server.init(slots)

    base_command = get_run_command(command, server_ip, nics, listen_port)
    build_command = _slot_info_to_command_fn(base_command, env)
    shutdown_event = register_shutdown_event()

    # one [command, slot_info, [event]] argument triple per slot
    call_args = []
    for slot in slots:
        call_args.append([build_command(slot), slot, [shutdown_event]])

    # If an error occurs in one thread, entire process will be terminated.
    # Otherwise, threads will keep running.
    results = threads.execute_function_multithreaded(
        exec_command, call_args, block_until_all_done=True)

    # Each result value is an (exit_code, timestamp) pair; walk them in
    # timestamp order so the earliest failure is the one reported.
    ordered = sorted(results.items(), key=lambda entry: entry[1][1])
    for name, (exit_code, _timestamp) in ordered:
        if exit_code == 0:
            continue
        raise RuntimeError(
            'Horovod detected that one or more processes exited with non-zero '
            'status, thus causing the job to be terminated. The first process '
            'to do so was:\nProcess name: {name}\nExit code: {code}\n'.
            format(name=name, code=exit_code))
def static_driver_fn():
    """Return the rendezvous port and host allocation plan for the workers.

    Reads ``is_in_test_mode``, ``fake_server_port`` and ``worker_list``
    (e.g. "localhost:1") from the enclosing scope. In test mode no server
    is started and the fake port is returned as-is; otherwise a
    ``RendezvousServer`` is started and initialized with the plan.

    Returns:
        Tuple ``(port, host_alloc_plan)``.
    """
    if is_in_test_mode:
        # str() matches the real-server branch below and stops the message
        # from raising TypeError when the fake port is supplied as an int.
        print("In unit test mode. fake port: " + str(fake_server_port))
        return (fake_server_port,
                get_host_assignments(parse_hosts(worker_list), 1))

    global_rendezv = RendezvousServer(verbose=1)
    global_rendezv_port = global_rendezv.start()
    print("Rendezvous server started, port: " + str(global_rendezv_port))

    # The plan is built with a process count of 1.
    hosts = parse_hosts(worker_list)
    host_alloc_plan = get_host_assignments(hosts, 1)

    global_rendezv.init(host_alloc_plan)
    return (global_rendezv_port, host_alloc_plan)
class Coordinator:
    """Gathers worker registrations and creates the Rendezvous server.

    Args:
        settings: Horovod Settings object.
    """

    rendezvous = None
    global_rendezv_port = None
    nics = None
    node_id_by_rank = None

    def __init__(self, settings):
        self.settings = settings
        # Maps node id -> world ranks registered for that node, in
        # registration order (despite what the attribute name suggests).
        self.node_id_by_rank = defaultdict(list)
        self._hostnames = set()

    @property
    def world_size(self) -> int:
        """Total number of registered ranks across all nodes."""
        total = 0
        for world_ranks in self.node_id_by_rank.values():
            total += len(world_ranks)
        return total

    @property
    def hostnames(self):
        """Set of hostnames seen by ``register``."""
        return self._hostnames

    @property
    def node_id_string(self) -> str:
        """Comma-separated ``<node_id>:<rank count>`` entries."""
        return ",".join(
            f"{node_id}:{len(world_ranks)}"
            for node_id, world_ranks in self.node_id_by_rank.items())

    def register(self, hostname: str, node_id: str, world_rank: int):
        """Record ``world_rank`` under ``node_id`` and remember the host."""
        self._hostnames.add(hostname)
        self.node_id_by_rank[node_id].append(world_rank)

    def finalize_registration(self) -> dict:
        """Return a dictionary of Horovod rank settings keyed by world rank."""
        # First pass: a rank's cross rank is the number of earlier nodes
        # that also have a rank at the same local position. When the pass
        # is done, the counter holds the cross size for each local position.
        nodes_per_local_rank = defaultdict(int)
        cross_rank_of = {}
        for world_ranks in self.node_id_by_rank.values():
            for local_rank, world_rank in enumerate(world_ranks):
                cross_rank_of[world_rank] = nodes_per_local_rank[local_rank]
                nodes_per_local_rank[local_rank] += 1

        # Second pass: assemble the per-rank settings from the final counts.
        rank_to_info = {}
        for world_ranks in self.node_id_by_rank.values():
            local_size = len(world_ranks)
            for local_rank, world_rank in enumerate(world_ranks):
                rank_to_info[world_rank] = {
                    "HOROVOD_CROSS_RANK": cross_rank_of[world_rank],
                    "HOROVOD_CROSS_SIZE": nodes_per_local_rank[local_rank],
                    "HOROVOD_LOCAL_RANK": local_rank,
                    "HOROVOD_LOCAL_SIZE": local_size,
                }
        return rank_to_info

    def establish_rendezvous(self) -> Dict[str, str]:
        """Create, start and initialize the rendezvous server.

        Returns:
            Environment variables for each worker.
        """
        self.rendezvous = RendezvousServer(self.settings.verbose)

        # Allocate processes into slots. The host string has the form
        # "10.11.11.11:4,10.11.11.12:4", with node ids in place of hosts.
        node_slots = hosts.parse_hosts(hosts_string=self.node_id_string)
        allocation = hosts.get_host_assignments(node_slots, self.world_size)

        # Start the server and keep the port it is listening on.
        self.global_rendezv_port = self.rendezvous.start()
        self.rendezvous.init(allocation)

        worker_env = {}
        # needs to be real address
        worker_env["HOROVOD_GLOO_RENDEZVOUS_ADDR"] = (
            ray.util.get_node_ip_address())
        worker_env["HOROVOD_GLOO_RENDEZVOUS_PORT"] = str(
            self.global_rendezv_port)
        worker_env["HOROVOD_CONTROLLER"] = "gloo"
        worker_env["HOROVOD_CPU_OPERATIONS"] = "gloo"
        return worker_env
class HorovodRendezvousServer(object):
    """Wrap a ``RendezvousServer`` and install host changes lazily.

    ``set_worker_hosts`` only stages a new host list. The staged list is
    installed by ``get_worker_host_rank`` once the current rendezvous is
    marked completed, i.e. once every current host has queried its rank.

    Args:
        server_host: Host value reported by ``get_rendezvous_host``.
    """

    def __init__(self, server_host):
        self._rendezvous_host = server_host
        self._rendezvous_id = 0
        self._worker_hosts = []
        self._rendezvous_server = RendezvousServer(verbose=True)
        # Stays None until start() is called.
        self._rendezvous_port = None
        # Host list staged for the next rendezvous; None when nothing is
        # staged.
        self._next_worker_hosts = None
        # Current hosts that have queried their rank since the last init.
        self._ready_worker_hosts = set()
        self._rendezvous_completed = True
        self._lock = Lock()

    def start(self):
        """Start the wrapped server and remember the port it returns."""
        self._rendezvous_port = self._rendezvous_server.start()

    def set_worker_hosts(self, worker_hosts):
        """
        Set worker hosts into RendezvousServer.

        The hosts are staged and installed by a later
        ``get_worker_host_rank`` call. If they match the current hosts
        (order is ignored), any previously staged change is dropped.

        Args:
            worker_hosts: List of host string.
        """
        # Take a snapshot so later in-place edits by the caller cannot change
        # the staged (and eventually installed) host list.
        requested = list(worker_hosts)
        # get_worker_host_rank reads and clears the staged list under this
        # lock, so the write has to hold it too. This call can therefore
        # wait while a rank query is sleeping before a re-initialization.
        with self._lock:
            if sorted(requested) == sorted(self._worker_hosts):
                # The desired hosts are back to the current ones. A staged
                # list left in place would install stale hosts at the next
                # rendezvous.
                self._next_worker_hosts = None
            else:
                self._next_worker_hosts = requested

    def _init_rendezvous_server(self):
        """Install the staged hosts and start a new rendezvous round.

        Called with ``self._lock`` held and a staged host list present.
        """
        self._worker_hosts = self._next_worker_hosts
        self._next_worker_hosts = None
        self._rendezvous_server.init(self._get_host_plan())
        self._rendezvous_id += 1
        self._rendezvous_completed = False

    def _get_host_plan(self):
        """Build the host allocation plan for the current worker hosts."""
        host_specs = [
            host + ":" + str(_WORKER_SLOT_NUMBER)
            for host in self._worker_hosts
        ]
        host_infos = parse_hosts(_HOST_SEP.join(host_specs))
        return get_host_assignments(host_infos, len(host_infos))

    def get_rendezvous_host(self):
        """Return the host passed to the constructor."""
        return self._rendezvous_host

    def get_rendezvous_port(self):
        """Return the port stored by ``start``, or None before that."""
        return self._rendezvous_port

    def get_worker_host_rank(self, host):
        """Return the rank of ``host`` in the current rendezvous, or -1.

        If hosts are staged and the current rendezvous is completed, the
        staged hosts are installed first, after a 2 second sleep. The query
        also marks ``host`` as ready; once every current host is ready the
        rendezvous is marked completed.
        """
        with self._lock:
            if self._next_worker_hosts and self._rendezvous_completed:
                time.sleep(2)  # Wait 2s for workers to complete rendezvous.
                self._init_rendezvous_server()

            # -1 if host not in worker_hosts list.
            if host not in self._worker_hosts:
                return -1

            if not self._rendezvous_completed:
                self._ready_worker_hosts.add(host)
                # If all active workers in the rendezvous are ready,
                # the server can start to set hosts for the next rendezvous
                if self._ready_worker_hosts == set(self._worker_hosts):
                    self._rendezvous_completed = True
                    self._ready_worker_hosts = set()

            return self._worker_hosts.index(host)

    def get_size(self):
        """Return the number of hosts in the current rendezvous."""
        return len(self._worker_hosts)

    def get_rendezvous_id(self):
        """Return how many times the wrapped server has been initialized."""
        return self._rendezvous_id
class HorovodRendezvousServer(object):
    """The rendezvous server can collect worker hosts (ip) to help
    these workers to build an AllReduce ring using `hvd.init`.

    State transitions of the server:

    * Start: there are no current hosts, nothing is staged
      (``next_hosts = None``) and the current rendezvous counts as
      completed.
    * ``add_worker`` / ``remove_worker``: the first change after a
      rendezvous copies the current hosts into ``next_hosts``; each call
      then edits that staged list. The current hosts are not touched.
    * ``get_worker_host_rank``: if ``next_hosts`` is non-empty and the
      current rendezvous is completed, the server sleeps 2 seconds,
      promotes ``next_hosts`` to the current hosts, re-initializes the
      wrapped ``RendezvousServer``, increments the rendezvous id and marks
      the rendezvous as not completed.
    * Each current host that queries its rank is added to the ready hosts.
      When the ready hosts equal the current hosts, the rendezvous is
      marked completed and staged changes can be installed again.

    Args:
        server_host: Host value reported by ``get_rendezvous_host``.
    """

    def __init__(self, server_host):
        self._rendezvous_host = server_host
        self._init_attributes()

    def _init_attributes(self):
        """Create the wrapped server and reset all rendezvous state."""
        self._rendezvous_id = 0
        self._cur_rendezvous_hosts = []
        self._rendezvous_server = RendezvousServer(verbose=True)
        # Stays None until start() is called.
        self._rendezvous_port = None
        # Host list staged for the next rendezvous; None when nothing is
        # staged.
        self._next_rendezvous_hosts = None
        # Current hosts that have queried their rank since the last init.
        self._ready_worker_hosts = set()
        self._cur_rendezvous_completed = True
        self._lock = Lock()

    def start(self):
        """Start the wrapped server and remember the port it returns."""
        self._rendezvous_port = self._rendezvous_server.start()

    def _init_rendezvous_server(self):
        """Install the staged hosts and start a new rendezvous round.

        Called with ``self._lock`` held and a staged host list present.
        """
        logger.info("Initialize rendezvous server with hosts {}".format(
            self._next_rendezvous_hosts))
        self._cur_rendezvous_hosts = self._next_rendezvous_hosts
        self._next_rendezvous_hosts = None
        self._rendezvous_server.init(self._get_host_plan())
        self._rendezvous_id += 1
        self._cur_rendezvous_completed = False

    def _get_host_plan(self):
        """Build the host allocation plan for the current hosts."""
        host_specs = [
            host + ":" + str(_WORKER_SLOT_NUMBER)
            for host in self._cur_rendezvous_hosts
        ]
        host_infos = parse_hosts(_HOST_SEP.join(host_specs))
        return get_host_assignments(host_infos, len(host_infos))

    def get_rendezvous_host(self):
        """Return the host passed to the constructor."""
        return self._rendezvous_host

    def get_rendezvous_port(self):
        """Return the port stored by ``start``, or None before that."""
        return self._rendezvous_port

    def get_worker_host_rank(self, host):
        """Return the rank of ``host`` in the current rendezvous, or -1.

        If hosts are staged and the current rendezvous is completed, the
        staged hosts are installed first, after a 2 second sleep. The query
        also marks ``host`` as ready; once every current host is ready the
        rendezvous is marked completed.
        """
        with self._lock:
            if self._next_rendezvous_hosts and self._cur_rendezvous_completed:
                time.sleep(2)  # Wait 2s for workers to complete rendezvous.
                self._init_rendezvous_server()

            # -1 if host not in worker_hosts list.
            if host not in self._cur_rendezvous_hosts:
                return -1

            if not self._cur_rendezvous_completed:
                self._ready_worker_hosts.add(host)
                # If all active workers in the rendezvous are ready,
                # the server can start to set hosts for the next rendezvous
                if self._ready_worker_hosts == set(
                        self._cur_rendezvous_hosts):
                    self._cur_rendezvous_completed = True
                    self._ready_worker_hosts = set()

            return self._cur_rendezvous_hosts.index(host)

    def get_size(self):
        """Return the number of hosts in the current rendezvous."""
        return len(self._cur_rendezvous_hosts)

    def get_rendezvous_id(self):
        """Return how many times the wrapped server has been initialized."""
        return self._rendezvous_id

    def add_worker(self, worker_host):
        """Stage ``worker_host`` for the next rendezvous.

        Empty or None hosts are ignored, and so are hosts that are already
        staged.

        Args:
            worker_host: Host string of the worker to add.
        """
        with self._lock:
            logger.info(
                "Add worker host {} into rendenzvous and cur hosts {}.".format(
                    worker_host, self._cur_rendezvous_hosts))
            if not worker_host:
                return

            if self._next_rendezvous_hosts is None:
                # Hosts are strings (see _get_host_plan), so a shallow copy
                # fully separates the staged list from the current one.
                self._next_rendezvous_hosts = list(
                    self._cur_rendezvous_hosts)

            # Master will not add any worker if the current rendezvous
            # hosts become empty after starting training.
            if self._rendezvous_id > 0 and not self._next_rendezvous_hosts:
                return

            if worker_host not in self._next_rendezvous_hosts:
                self._next_rendezvous_hosts.append(worker_host)

    def remove_worker(self, worker_host):
        """Drop ``worker_host`` from the hosts of the next rendezvous.

        Removing a host that is neither current nor staged does nothing.

        Args:
            worker_host: Host string of the worker to remove.
        """
        with self._lock:
            logger.info(
                "Remove worker host {} from rendenzvous.".format(worker_host))

            if self._next_rendezvous_hosts is None:
                if worker_host not in self._cur_rendezvous_hosts:
                    return
                self._next_rendezvous_hosts = list(
                    self._cur_rendezvous_hosts)

            # Look in the staged list, not the current one. This way
            #  * removing the same current host twice before the next
            #    rendezvous no longer raises ValueError, and
            #  * a host that was added but never joined a rendezvous is
            #    also dropped. Otherwise the next rendezvous would wait on
            #    a host that is gone and never be marked completed.
            if worker_host in self._next_rendezvous_hosts:
                self._next_rendezvous_hosts.remove(worker_host)
class Coordinator:
    """Gathers worker registrations and creates the Rendezvous server.

    Args:
        settings: Horovod Settings object.
    """

    rendezvous = None
    global_rendezv_port = None
    nics = None
    hostnames = None

    def __init__(self, settings):
        self.settings = settings
        # Maps hostname -> world ranks registered on that host, in
        # registration order (despite what the attribute name suggests).
        self.hostnames_by_rank = defaultdict(list)

    @property
    def world_size(self) -> int:
        """Total number of registered ranks across all hosts."""
        total = 0
        for world_ranks in self.hostnames_by_rank.values():
            total += len(world_ranks)
        return total

    @property
    def hoststring(self) -> str:
        """Comma-separated ``<hostname>:<rank count>`` entries."""
        return ",".join(
            f"{hostname}:{len(world_ranks)}"
            for hostname, world_ranks in self.hostnames_by_rank.items())

    def register(self, hostname: str, world_rank: int):
        """Record ``world_rank`` as running on ``hostname``."""
        self.hostnames_by_rank[hostname].append(world_rank)

    def finalize_registration(self) -> dict:
        """Return a dictionary of placement info keyed by world rank."""
        node_count = len(self.hostnames_by_rank)
        rank_to_info = {}
        for node_index, world_ranks in enumerate(
                self.hostnames_by_rank.values()):
            local_size = len(world_ranks)
            for local_rank, world_rank in enumerate(world_ranks):
                rank_to_info[world_rank] = {
                    "NODE_WORLD_RANK": node_index,
                    "NODE_WORLD_SIZE": node_count,
                    "LOCAL_RANK": local_rank,
                    "LOCAL_SIZE": local_size,
                }
        return rank_to_info

    def establish_rendezvous(self) -> Dict[str, str]:
        """Creates the rendezvous server and identifies the nics to be used.

        Returns:
            Environment variables for each worker.
        """
        self.rendezvous = RendezvousServer(self.settings.verbose)

        # Allocate processes into slots. The host string has the form
        # "10.11.11.11:4,10.11.11.12:4".
        host_slots = hosts.parse_hosts(hosts_string=self.hoststring)
        allocation = hosts.get_host_assignments(host_slots, self.world_size)

        # Start the server and keep the port it is listening on.
        self.global_rendezv_port = self.rendezvous.start()
        self.rendezvous.init(allocation)

        registered_hosts = list(self.hostnames_by_rank)
        self.nics = driver_service.get_common_interfaces(
            self.settings, registered_hosts)

        worker_env = {}
        # NOTE(review): ray.services may not be a public module in newer
        # Ray releases - confirm against the Ray version in use.
        worker_env["HOROVOD_GLOO_RENDEZVOUS_ADDR"] = (
            ray.services.get_node_ip_address())
        worker_env["HOROVOD_GLOO_RENDEZVOUS_PORT"] = str(
            self.global_rendezv_port)
        worker_env["HOROVOD_CONTROLLER"] = "gloo"
        worker_env["HOROVOD_CPU_OPERATIONS"] = "gloo"
        # TODO: the first interface is taken as-is. This raises IndexError
        # when no common interface was found, and the choice presumably
        # depends on iteration order if nics is an unordered collection.
        worker_env["HOROVOD_GLOO_IFACE"] = str(list(self.nics)[0])
        # TODO: revisit which interfaces should be handed to NCCL.
        worker_env["NCCL_SOCKET_IFNAME"] = ",".join(self.nics)
        return worker_env