def rendezvous_barrier(self): """ Main entry point for next rendezvous. This method is blocking until rendezvous succeeds or a timeout occurs. Returns: ``(rdzv_version, rank, world_size)`` Raises: RendezvousTimeoutError - timeout waiting for rendezvous RendezvousClosedError - rendezvous is or was closed while waiting RendezvousError - other persistent errors that render the rendezvous non-retryable """ self._rendezvous_deadline = time.time() + self._timeout while True: if time.time() > self._rendezvous_deadline: raise RendezvousTimeoutError() log.info("Attempting to join next rendezvous") try: # Dis-own our lease in the previous rendezvous, if exists if self._lease_this_rank_stop is not None: self._lease_this_rank_stop.set() return self.init_phase() except EtcdRendezvousRetryImmediately: # The type of failure suggests we can retry without delay pass except EtcdRendezvousRetryableFailure: # In case of retryable failure, wait a small delay # to avoid spamming etcd time.sleep(1) except RendezvousTimeoutError: log.info("Rendezvous timeout occured in EtcdRendezvousHandler") raise except RendezvousClosedError: log.info( f"Rendezvous for run_id={self._run_id} was observed to be closed" ) raise except RendezvousError: raise except Exception as e: # In case of a general exception, wait a small delay # to avoid spamming etcd # FIXME: there are a few things that fall under this like # etcd.EtcdKeyNotFound, etc, which could be handled more explicitly. log.info("Rendezvous attempt failed, will retry. Reason: " + str(e)) time.sleep(1)
def try_wait_for_state_change(self, etcd_index, timeout=None): # Don't sleep past the overall deadline (at least more than by 1s) overall_timeout = max(self._rendezvous_deadline - time.time(), 0.0) + 1.0 timeout = overall_timeout if timeout is None else min( timeout, overall_timeout) try: self.client.watch(self.get_path("/rdzv/active_version"), index=etcd_index, timeout=timeout) except (etcd.EtcdEventIndexCleared, etcd.EtcdWatchTimedOut): pass if time.time() > self._rendezvous_deadline: raise RendezvousTimeoutError() # Unfortunately, we have to do another fetch in order to get last etcd_index. return self.get_rdzv_state()
def wait_for_rendezvous_to_free(self, expected_version): """ When there's an existing valid rendezvous in state 'final', we have to wait until the next opportunity to join. Such opportunity may come from: 1. rendezvous state changed by someone else, in which case we unblock and retry. 2. rendezvous becomes invalid because at least one member failed to renew their leased keep_alive node. We detect this, and destroy the rendezvous. """ active_version, state = self.get_rdzv_state() while True: if state["status"] != "final" or state[ "version"] != expected_version: return # Check if current rendezvous state is valid, in the sense that all # its members are alive (renewing their lease). # If not, try destroy this rendezvous, so a new one can be created. alive_members = self.client.get( self.get_path( "/rdzv/v_{version}".format(version=expected_version))) keep_alive_keys = [ch.key for ch in alive_members.children] for key in state["keep_alives"]: if key not in keep_alive_keys: # This participant didn't renew their lease. We'll declare this # rendezvous version as dead (but only if it hadn't changed) log.info("Keep-alive key {} is not renewed.".format(key)) log.info("Rendevous version {} is incomplete. ".format( expected_version)) log.info("Attempting to destroy it.") # Compare-and-delete operation. Throws if compare failed, # which means rendezvous was already destroyed/re-created/closed, # and we can try to re-enter the barrier. self.client.delete( key=self.get_path("/rdzv/active_version"), prevValue=active_version.value, ) log.info( "Destroyed rendezvous version {} successfully.".format( expected_version)) # We can return (and retry) immediately return # Existing rendezvous seems valid, no reason to destroy it. # We just have to wait until something changes and re-check. try: overall_timeout = ( max(self._rendezvous_deadline - time.time(), 0.0) + 1.0) self.client.watch( key=self.get_path("/rdzv"), index=active_version.etcd_index + 1, recursive=True, timeout=overall_timeout, ) except (etcd.EtcdEventIndexCleared, etcd.EtcdWatchTimedOut): pass if time.time() > self._rendezvous_deadline: raise RendezvousTimeoutError() active_version, state = self.get_rdzv_state()