def WaitUntilRunning(self, deadline: float) -> None:
    """Poll the pod until it is running with an IP, or the deadline passes.

    On success, the serving endpoint (pod IP joined with the serving binary's
    container port) is stored in `self._endpoint`.

    Args:
        deadline: Timestamp, compared against `time.time()`, after which
            polling stops.

    Raises:
        JobAborted: If the pod phase reports that the pod is done.
        DeadlineExceeded: If the pod is not running before the deadline.
    """
    assert self._pod_name, (
        'Pod has not been created yet. You should call Start() first.')

    while time.time() < deadline:
        try:
            pod = self._k8s_core_api.read_namespaced_pod(
                name=self._pod_name,
                namespace=self._namespace)
        except rest.ApiException as api_error:
            # API errors are not fatal here; fall through to the sleep below
            # and poll again.
            logging.info(
                'Continue polling after getting ApiException(%s)', api_error)
        else:
            # Pod phase is one of Pending, Running, Succeeded, Failed, or
            # Unknown. Succeeded and Failed mean the pod lifecycle has ended,
            # whereas the job is expected to stay running. Unknown means the
            # pod state could not be obtained, so keep waiting until the
            # phase is confirmed.
            current_phase = _PodPhase(pod.status.phase)
            if current_phase == _PodPhase.RUNNING and pod.status.pod_ip:
                self._endpoint = '{}:{}'.format(
                    pod.status.pod_ip, self._serving_binary.container_port)
                return
            if current_phase.is_done:
                raise error_types.JobAborted(
                    'Job has been aborted. (phase={})'.format(current_phase))
            logging.info(
                'Waiting for the pod to be running. (phase=%s)', current_phase)
        # Shared back-off for both the API-error and the not-yet-running case.
        time.sleep(_DEFAULT_POLLING_INTERVAL_SEC)

    raise error_types.DeadlineExceeded(
        'Deadline exceeded while waiting for pod to be running.')
def WaitUntilModelLoaded(self, deadline: float,
                         polling_interval_sec: int) -> None:
    """Block until the model server reports that the model is loaded.

    Args:
        deadline: A deadline time in UTC timestamp (in seconds).
        polling_interval_sec: Seconds to sleep between serving status polls.

    Raises:
        DeadlineExceeded: When deadline exceeded before model is ready.
        ValidationFailed: If the model server reports the model as
            unavailable.
    """
    while time.time() < deadline:
        serving_status = self._GetServingStatus()

        # Not ready yet: log (rate-limited) and poll again after a pause.
        if serving_status == types.ModelServingStatus.NOT_READY:
            logging.log_every_n_seconds(
                level=logging.INFO,
                n_seconds=10,
                msg='Waiting for model to be loaded...')
            time.sleep(polling_interval_sec)
            continue

        # Explicit failure reported by the model server.
        if serving_status == types.ModelServingStatus.UNAVAILABLE:
            raise error_types.ValidationFailed(
                'Model server failed to load the model.')

        # Any other status is treated as a successful load.
        logging.info('Model is successfully loaded.')
        return

    raise error_types.DeadlineExceeded(
        'Deadline exceeded while waiting the model to be loaded.')
def WaitUntilRunning(self, deadline: float) -> None:
    """Poll the container until its status is 'running'.

    Args:
        deadline: Timestamp, compared against `time.time()`, after which
            polling stops.

    Raises:
        JobAborted: If the container can no longer be found, or if its status
            is anything other than 'created' or 'running'.
        DeadlineExceeded: If the container is not running before the deadline.
    """
    assert self._container is not None, 'container has not been started.'

    while time.time() < deadline:
        try:
            # Reload container attributes from server. This is the only right
            # way to retrieve the latest container status from docker engine.
            self._container.reload()
            status = self._container.status
        except docker_errors.NotFound as e:
            # If the job has been aborted and container has specified
            # auto_removal to True, we might get a NotFound error during
            # container.reload(). Chain explicitly so the original error is
            # recorded as the direct cause.
            raise error_types.JobAborted(
                'Container not found. Possibly removed after the job has been '
                'aborted.') from e

        # The container is just created and not yet in the running status.
        if status == 'created':
            time.sleep(_POLLING_INTERVAL_SEC)
            continue

        # The container is running.
        if status == 'running':
            return

        # Docker status is one of {'created', 'restarting', 'running',
        # 'removing', 'paused', 'exited', or 'dead'}. Status other than
        # 'created' and 'running' indicates the job has been aborted.
        raise error_types.JobAborted(
            'Job has been aborted (container status={})'.format(status))

    raise error_types.DeadlineExceeded(
        'Deadline exceeded while waiting for the container to be running.')