def WaitUntilRunning(self, deadline: float) -> None:
    """Block until the container reports the 'running' status.

    The container status is polled from the docker engine until it becomes
    'running', an abort is detected, or the deadline passes.

    Args:
        deadline: Absolute point in time, on the same clock as `time.time()`,
            after which polling stops.

    Raises:
        error_types.JobAborted: If the container can no longer be found, or
            if it reports any status other than 'created' or 'running'.
        error_types.DeadlineExceeded: If the container is still not running
            when `deadline` has passed.
    """
    assert self._container is not None, 'container has not been started.'

    while time.time() < deadline:
        try:
            # Reload container attributes from server. This is the only right
            # way to retrieve the latest container status from docker engine.
            self._container.reload()
            status = self._container.status
        except docker_errors.NotFound as not_found:
            # If the job has been aborted and container has specified
            # auto_removal to True, we might get a NotFound error during
            # container.reload(). Chain the original error so that the root
            # cause stays visible in the traceback.
            raise error_types.JobAborted(
                'Container not found. Possibly removed after the job has been '
                'aborted.') from not_found

        # The container is running :)
        if status == 'running':
            return

        # The container is just created and not yet in the running status.
        if status == 'created':
            time.sleep(_POLLING_INTERVAL_SEC)
            continue

        # Docker status is one of {'created', 'restarting', 'running',
        # 'removing', 'paused', 'exited', or 'dead'}. Status other than
        # 'created' and 'running' indicates the job has been aborted.
        raise error_types.JobAborted(
            'Job has been aborted (container status={})'.format(status))

    raise error_types.DeadlineExceeded(
        'Deadline exceeded while waiting for the container to be running.')
def WaitUntilRunning(self, deadline: float) -> None:
    """Poll the pod until it is running and has an IP address.

    On success, `self._endpoint` is set to '<pod_ip>:<container_port>' using
    the container port of `self._serving_binary`.

    Args:
        deadline: Absolute point in time, on the same clock as `time.time()`,
            after which polling stops.

    Raises:
        error_types.JobAborted: If the pod phase reports `is_done`.
        error_types.DeadlineExceeded: If the pod is still not running when
            `deadline` has passed.
    """
    assert self._pod_name, (
        'Pod has not been created yet. You should call Start() first.')

    while time.time() < deadline:
        try:
            pod = self._k8s_core_api.read_namespaced_pod(
                name=self._pod_name,
                namespace=self._namespace)
        except rest.ApiException as api_error:
            # An API error does not end the wait; log it and poll again after
            # the usual interval.
            logging.info(
                'Continue polling after getting ApiException(%s)', api_error)
        else:
            # Pod phase is one of Pending, Running, Succeeded, Failed, or
            # Unknown. Succeeded and Failed indicates the pod lifecycle has
            # reached its end, while we expect the job to be running and
            # hanging. Phase is Unknown if the state of the pod could not be
            # obtained, thus we can wait until we confirm the phase.
            phase = _PodPhase(pod.status.phase)

            if phase == _PodPhase.RUNNING:
                pod_ip = pod.status.pod_ip
                # A running pod without an IP yet is not reachable; keep
                # waiting in that case.
                if pod_ip:
                    self._endpoint = '{}:{}'.format(
                        pod_ip, self._serving_binary.container_port)
                    return

            if phase.is_done:
                raise error_types.JobAborted(
                    'Job has been aborted. (phase={})'.format(phase))

            logging.info(
                'Waiting for the pod to be running. (phase=%s)', phase)

        # Shared back-off for both the API-error and the not-yet-running case.
        time.sleep(_DEFAULT_POLLING_INTERVAL_SEC)

    raise error_types.DeadlineExceeded(
        'Deadline exceeded while waiting for pod to be running.')