def wait_for_rendezvous_to_free(self, expected_version): active_version, state = self.get_rdzv_state() while True: if state["status"] != "final" or state["version"] != expected_version: return # Check if current rendezvous state is valid, in the sense that all # its members are alive (renewing their lease). # If not, try destroy this rendezvous, so a new one can be created. alive_members = self.client.get( self.get_path("/rdzv/v_{version}".format(version=expected_version)) ) keep_alive_keys = [ch.key for ch in alive_members.children] for key in state["keep_alives"]: if key not in keep_alive_keys: # This participant didn't renew their lease. We'll declare this # rendezvous version as dead (but only if it hadn't changed) log.info("Keep-alive key {} is not renewed.".format(key)) log.info( "Rendevous version {} is incomplete. ".format(expected_version) ) log.info("Attempting to destroy it.") # Compare-and-delete operation. Throws if compare failed, # which means rendezvous was already destroyed/re-created/closed, # and we can try to re-enter the barrier. self.client.delete( key=self.get_path("/rdzv/active_version"), prevValue=active_version.value, ) log.info( "Destroyed rendezvous version {} successfully.".format( expected_version ) ) # We can return (and retry) immediately return # Existing rendezvous seems valid, no reason to destroy it. # We just have to wait until something changes and re-check. try: overall_timeout = ( max(self._rendezvous_deadline - time.time(), 0.0) + 1.0 ) self.client.watch( key=self.get_path("/rdzv"), index=active_version.etcd_index + 1, recursive=True, timeout=overall_timeout, ) except (etcd.EtcdEventIndexCleared, etcd.EtcdWatchTimedOut): pass if time.time() > self._rendezvous_deadline: raise RendezvousTimeoutException() active_version, state = self.get_rdzv_state()
def rendezvous_barrier(self): """ Main entry point for next rendezvous. This method is blocking until rendezvous succeeds or a timeout occurs. Returns: ``(rdzv_version, rank, world_size)`` Raises: RendezvousTimeoutException - timeout waiting for rendezvous RendezvousNonRetryableError - other persistent errors that render the rendezvous non-retryable RendezvousClosedException - rendezvous is or was closed while waiting """ self._rendezvous_deadline = time.time() + self._timeout while True: if time.time() > self._rendezvous_deadline: raise RendezvousTimeoutException() log.info("Attempting to join next rendezvous") try: # Dis-own our lease in the previous rendezvous, if exists if self._lease_this_rank_stop is not None: self._lease_this_rank_stop.set() return self.init_phase() except EtcdRendezvousRetryImmediately: # The type of failure suggests we can retry without delay pass except EtcdRendezvousRetryableFailure: # In case of retryable failure, wait a small delay # to avoid spamming etcd time.sleep(1) except RendezvousTimeoutException: log.info("Rendezvous timeout occured in EtcdRendezvousHandler") raise except RendezvousClosedException: log.info( f"Rendezvous for run_id={self._run_id} was observed to be closed" ) raise except RendezvousNonRetryableError: raise except Exception as e: # In case of a general exception, wait a small delay # to avoid spamming etcd # FIXME: there are a few things that fall under this like # etcd.EtcdKeyNotFound, etc, which could be handled more explicitly. log.info("Rendezvous attempt failed, will retry. Reason: " + str(e)) time.sleep(1)
def try_wait_for_state_change(self, etcd_index, timeout=None): # Don't sleep past the overall deadline (at least more than by 1s) overall_timeout = max(self._rendezvous_deadline - time.time(), 0.0) + 1.0 timeout = overall_timeout if timeout is None else min(timeout, overall_timeout) try: self.client.watch( self.get_path("/rdzv/active_version"), index=etcd_index, timeout=timeout ) except (etcd.EtcdEventIndexCleared, etcd.EtcdWatchTimedOut): pass if time.time() > self._rendezvous_deadline: raise RendezvousTimeoutException() # Unfortunately, we have to do another fetch in order to get last etcd_index. return self.get_rdzv_state()