def init_horovod_if_needed(self):
    """Fetch this worker's rank from the master and (re)initialize Horovod.

    Polls the master for a communication rank, retrying while the master
    has not yet admitted this worker host into the rendezvous. When the
    master reports a rendezvous id different from the one we last joined,
    the whole Horovod context is torn down and rebuilt with the new
    rank/size.

    Raises:
        ValueError: if no valid rank was obtained after exhausting
            DEFAULT_MAX_ALLREDUCE_RETRY_NUM retries.
    """
    self._set_horovod_env()
    for _ in range(DEFAULT_MAX_ALLREDUCE_RETRY_NUM):
        rank_response = self._master_client.get_comm_rank()
        if rank_response.rank_id < 0:
            logger.warning("The master has not added the worker host into "
                           "rendezvous yet. Retrying to get rank")
            time.sleep(RETRY_ALLREDUCE_INTERVAL_SECS)
        else:
            break
    # Fix: previously the worker fell through after exhausting all retries
    # and initialized Horovod with a negative (invalid) rank. Fail fast
    # instead, matching the behavior of the newer variant of this method.
    if rank_response.rank_id < 0:
        raise ValueError("Invalid rank {}".format(rank_response.rank_id))
    # If the rendezvous from master is unequal to self._rendezvous_id,
    # the worker should rebuild the communication because the master
    # has updated the communication group.
    if rank_response.rendezvous_id != self._rendezvous_id:
        logger.info(
            "Initialize Horovod with rank = {} and size = {}".format(
                rank_response.rank_id, rank_response.world_size))
        os.environ[HorovodEnv.RENDEZVOUS_PORT] = str(
            rank_response.rendezvous_port)
        os.environ[HorovodEnv.RANK] = str(rank_response.rank_id)
        os.environ[HorovodEnv.SIZE] = str(rank_response.world_size)
        # Not using Horovod elastic feature in init, but need it for
        # allreduce to call allreduce op when size=1.
        os.environ[HorovodEnv.ELASTIC] = str(0)
        hvd.shutdown()
        hvd.init()
        os.environ[HorovodEnv.ELASTIC] = str(1)
        self._rendezvous_id = rank_response.rendezvous_id
        self.need_broadcast = True
def _init_rendezvous_server(self):
    """Start a new rendezvous round from the pending host list.

    Promotes ``_next_rendezvous_hosts`` to the current host set, clears
    the pending list, re-initializes the rendezvous server with a fresh
    host allocation plan, and bumps the rendezvous id so workers detect
    that the communication group changed.
    """
    pending_hosts = self._next_rendezvous_hosts
    logger.info("Initialize rendezvous server with hosts {}".format(
        pending_hosts))
    self._cur_rendezvous_hosts = pending_hosts
    self._next_rendezvous_hosts = None
    self._rendezvous_server.init(self._get_host_plan())
    self._rendezvous_id += 1
    self._cur_rendezvous_completed = False
def remove_worker(self, worker_host):
    """Schedule *worker_host* for removal from the next rendezvous round.

    Copies the current host list into the pending list on first
    modification, then drops the host from the pending list.
    """
    with self._lock:
        logger.info(
            "Remove worker host {} from rendenzvous.".format(worker_host))
        if worker_host not in self._cur_rendezvous_hosts:
            return
        if self._next_rendezvous_hosts is None:
            self._next_rendezvous_hosts = copy.deepcopy(
                self._cur_rendezvous_hosts)
        # remove() is equivalent to pop(index(...)) on the same element.
        self._next_rendezvous_hosts.remove(worker_host)
def add_worker(self, worker_host):
    """Schedule *worker_host* for inclusion in the next rendezvous round.

    Copies the current host list into the pending list on first
    modification, then appends the host if it is not already pending.
    """
    with self._lock:
        logger.info(
            "Add worker host {} into rendenzvous and cur hosts {}.".format(
                worker_host, self._cur_rendezvous_hosts))
        if worker_host:
            if self._next_rendezvous_hosts is None:
                self._next_rendezvous_hosts = copy.deepcopy(
                    self._cur_rendezvous_hosts)
            # Fix (consistency with the guarded variant of this method):
            # Master will not add any worker if the current rendezvous
            # hosts become empty after starting training.
            if self._rendezvous_id > 0 and not self._next_rendezvous_hosts:
                return
            if worker_host not in self._next_rendezvous_hosts:
                self._next_rendezvous_hosts.append(worker_host)
def add_worker(self, worker_host):
    """Schedule *worker_host* for inclusion in the next rendezvous round.

    Copies the current host list into the pending list on first
    modification, then appends the host if it is not already pending.
    Once training has started (rendezvous id > 0) and the pending host
    list has drained to empty, no further workers are admitted.
    """
    with self._lock:
        logger.info(
            "Add worker host {} into rendenzvous and cur hosts {}.".format(
                worker_host, self._cur_rendezvous_hosts))
        if not worker_host:
            return
        if self._next_rendezvous_hosts is None:
            self._next_rendezvous_hosts = copy.deepcopy(
                self._cur_rendezvous_hosts)
        # Master will not add any worker if the current rendezvous
        # hosts become empty after starting training.
        if self._rendezvous_id > 0 and not self._next_rendezvous_hosts:
            return
        if worker_host not in self._next_rendezvous_hosts:
            self._next_rendezvous_hosts.append(worker_host)
def reset_backward_passes_per_step(self):
    """Rebalance gradient-accumulation steps across the allreduce group.

    Only applies when the wrapped optimizer uses a fixed global batch
    size. The global number of micro-batches per step is split across
    workers; the first ``remainder`` ranks take one extra pass so the
    total is preserved. Updates the optimizer only when the value
    actually changed.
    """
    # Only reset backward_passes_per_step when using the optimizer
    # with fixed_global_batch_size
    if not (hasattr(self._optimizer, "fixed_global_batch_size")
            and self._optimizer.fixed_global_batch_size):
        return
    world_size = hvd.size()
    rank = hvd.rank()
    base, remainder = divmod(self.global_batch_num_per_step, world_size)
    self.backward_passes_per_step = base + 1 if rank < remainder else base
    if (self.backward_passes_per_step
            != self._optimizer.backward_passes_per_step):
        self._optimizer.set_backward_passes_per_step(
            self.backward_passes_per_step)
        logger.info("Backward passes per step = {}".format(
            self._optimizer.backward_passes_per_step))
def init_horovod_if_needed(self):
    """Fetch this worker's rank from the master and restart Horovod if
    the master has published a new rendezvous.

    Retries while the master has not yet admitted this worker host into
    the rendezvous, then fails fast on an invalid rank.

    Raises:
        ValueError: if no valid rank was obtained after exhausting
            DEFAULT_MAX_ALLREDUCE_RETRY_NUM retries.
    """
    self._set_horovod_env()
    for _ in range(DEFAULT_MAX_ALLREDUCE_RETRY_NUM):
        rank_response = self._master_client.get_comm_rank()
        if rank_response.rank_id >= 0:
            break
        logger.warning("The master has not added the worker host into "
                       "rendezvous yet. Retrying to get rank")
        time.sleep(RETRY_ALLREDUCE_INTERVAL_SECS)
    if rank_response.rank_id < 0:
        raise ValueError("Invalid rank {}".format(rank_response.rank_id))
    # If the rendezvous from master is unequal to self._rendezvous_id,
    # the worker should rebuild the communication because the master
    # has updated the communication group.
    if rank_response.rendezvous_id == self._rendezvous_id:
        return
    logger.info(
        "Initialize Horovod with rank = {} and size = {}".format(
            rank_response.rank_id, rank_response.world_size))
    self._restart_hvd(rank_response)
def _broadcast_if_needed(self):
    """Broadcast the model once after the rendezvous manager flags a
    rebuilt communication group, then clear the flag."""
    manager = self._rendezvous_manager
    if not manager.need_broadcast:
        return
    logger.info("Broadcast models")
    self.broadcast()
    manager.need_broadcast = False