def handle_pod_retry(
    self,
    pod_data,
    task_run,
    increment_try_number=False,
    reason=PodRetryReason.exit_code,
):
    """
    Decide whether a pod should be retried and, if so, schedule the retry.

    :param pod_data: pod object; its ``metadata`` is used for logging and
        is handed to ``_schedule_pod_for_retry``.
    :param task_run: task run whose Airflow task instance is updated.
    :param increment_try_number: forwarded as-is to ``_schedule_pod_for_retry``.
    :param reason: a ``PodRetryReason``; ``exit_code`` and ``err_image_pull``
        are the only values handled here.
    :return: ``False`` when no exit code is available yet or the exit code is
        not configured for retry; otherwise whatever
        ``_schedule_pod_for_retry`` returns. Any other ``reason`` yields ``None``.
    """
    pod_meta = pod_data.metadata
    logger.info(
        "Reached handle pod retry for pod %s for reason %s!", pod_meta.name, reason
    )

    if reason == PodRetryReason.exit_code:
        exit_code = self._try_get_pod_exit_code(pod_data)
        if not exit_code:
            # Couldn't find an exit code - container is still alive -
            # wait for the next event
            logger.debug("No exit code found for pod %s, doing nothing", pod_meta.name)
            return False

        logger.info("Found pod exit code %d for pod %s", exit_code, pod_meta.name)
        if not self._should_pod_be_retried(exit_code, self.kube_config):
            logger.debug(
                "Pod %s was not scheduled for retry because its exit code %d was not found in config",
                pod_meta.name,
                exit_code,
            )
            return False

        max_retries, delay = self._get_pod_retry_parameters(
            exit_code, self.kube_config
        )
    elif reason == PodRetryReason.err_image_pull:
        max_retries = self.kube_config.retry_on_image_pull_error_count
        delay = self.kube_config.pod_retry_delay
    else:
        # Reasons other than the two above are not handled by this method.
        return None

    airflow_ti = get_airflow_task_instance(task_run)
    airflow_ti.max_tries = max_retries
    # TaskInstance has no retry delay property, it is gathered from the
    # DbndOperator. The operator's values are overridden and taken from
    # configuration if the engine is running K8s. See dbnd_operator.py
    return self._schedule_pod_for_retry(
        pod_meta,
        max_retries,
        delay,
        airflow_ti,
        task_run,
        increment_try_number,
    )
def _handle_crashed_task_instance(self, task_run, task_run_error, failure_reason, session=None):
    # type: (TaskRun, TaskRunError, PodFailureReason, Optional[Session]) -> None
    """
    Fail the Airflow task instance of a crashed pod and mirror the outcome
    (UP_FOR_RETRY or FAILED) onto the dbnd task run.

    :param task_run: task run whose Airflow task instance is being failed.
    :param task_run_error: error attached to the resulting task run state;
        ``str(task_run_error.exception)`` is passed to ``handle_failure``.
    :param failure_reason: key used to look up the retry count in the
        engine's ``pod_retry_config``.
    :param session: DB session passed to the task-instance lookup and to
        ``handle_failure``.
        NOTE(review): defaults to None but is dereferenced in the QUEUED
        branch; presumably a session-injecting decorator that is not visible
        here supplies it - confirm.
    """
    ti = get_airflow_task_instance(task_run, session=session)
    ti.task = task_run.task.ctrl.airflow_op

    pod_retry_config = self.kube_dbnd.engine_config.pod_retry_config
    configured_retries = pod_retry_config.get_retry_count(failure_reason)
    if configured_retries is not None:
        # Push the configured retry count to the task, the operator and the
        # task instance so all three agree.
        task_run.task.task_retries = configured_retries
        ti.task.retries = configured_retries
        ti.max_tries = configured_retries

    self.log.info(
        "Retry %s task: max_retries=%s, task.retries=%s, current:%s state:%s",
        task_run,
        ti.max_tries,
        ti.task.retries,
        ti._try_number,
        ti.state,
    )

    if ti.state == State.QUEUED:
        # Special case - no airflow code has been run in the pod at all.
        # Usually the try number is increased the moment the state moves to
        # Running (and stays the same while running), so it has to be
        # incremented by hand here.
        ti._try_number += 1
        session.merge(ti)
        session.commit()

    # Airflow decides whether to retry the task by this condition:
    # >>> task_instance.task.retries and task_instance.try_number <= task_instance.max_tries
    ti.handle_failure(str(task_run_error.exception), session=session)

    # Logged to help debug why the task was or was not retried.
    retry_data = "task.retries={reties}, try_number={try_number}, max_tries={max_tries}.".format(
        reties=ti.task.retries,
        try_number=ti.try_number,
        max_tries=ti.max_tries,
    )

    if ti.state == State.UP_FOR_RETRY:
        outcome_template = "Set the task {task_id} to retry: {data}"
        final_state = TaskRunState.UP_FOR_RETRY
    else:
        outcome_template = "Task {task_id} is not set up to retry: {data}"
        final_state = TaskRunState.FAILED

    self.log.info(outcome_template.format(task_id=str(task_run), data=retry_data))
    task_run.set_task_run_state(final_state, track=True, error=task_run_error)
def _handle_crashed_task_instance(
    self, task_run, task_run_error, failure_reason, session=None
):
    """
    Fail the Airflow task instance of a crashed pod and set the dbnd task
    run to UP_FOR_RETRY or FAILED accordingly.

    :param task_run: task run whose Airflow task instance is being failed.
    :param task_run_error: error attached to the resulting task run state;
        ``str(task_run_error.exception)`` is passed to ``handle_failure``.
    :param failure_reason: key used to look up the retry count in the
        engine's ``pod_retry_config``.
    :param session: DB session passed to the task-instance lookup and to
        ``handle_failure``.
        NOTE(review): defaults to None but is dereferenced in the QUEUED
        branch; presumably a session-injecting decorator that is not visible
        here supplies it - confirm.
    """
    task_instance = get_airflow_task_instance(task_run, session=session)
    task_instance.task = task_run.task.ctrl.airflow_op

    retry_config = self.kube_dbnd.engine_config.pod_retry_config
    retry_count = retry_config.get_retry_count(failure_reason)
    if retry_count is not None:
        # Apply the configured retry count to the task, the operator and
        # the task instance.
        task_run.task.task_retries = retry_count
        task_instance.task.retries = retry_count
        task_instance.max_tries = retry_count

    self.log.info(
        "Retry %s task: max_retries=%s, task.retries=%s, current:%s state:%s",
        task_run,
        task_instance.max_tries,
        task_instance.task.retries,
        task_instance._try_number,
        task_instance.state,
    )

    if task_instance.state == State.QUEUED:
        # Special case - no airflow code has been run in the pod at all.
        # Usually the try number is increased the moment the state moves to
        # Running (and stays the same while running), so it has to be
        # incremented by hand here.
        task_instance._try_number += 1
        session.merge(task_instance)
        session.commit()

    # retry condition: self.task.retries and self.try_number <= self.max_tries
    task_instance.handle_failure(str(task_run_error.exception), session=session)

    if task_instance.state == State.UP_FOR_RETRY:
        task_run.set_task_run_state(
            TaskRunState.UP_FOR_RETRY, track=True, error=task_run_error
        )
    else:
        task_run.set_task_run_state(
            TaskRunState.FAILED, track=True, error=task_run_error
        )
def _process_pod_success(self, submitted_pod):
    """
    Handle a pod 'success' event by mapping the Airflow task instance state
    onto the dbnd task run state.

    If the pod was already processed the event is skipped. If the task
    instance is not in one of the states handled below, the task run is
    treated as crashed and passed to ``_handle_crashed_task_instance``
    with ``PodFailureReason.err_pod_evicted``.

    :param submitted_pod: object exposing ``task_run``, ``pod_name`` and
        ``processed``.
    """
    task_run = submitted_pod.task_run
    pod_name = submitted_pod.pod_name

    if submitted_pod.processed:
        # The format string has two placeholders; both values must be passed,
        # otherwise the log record fails to format.
        self.log.info(
            "%s Skipping pod 'success' event from %s: already processed",
            task_run,
            pod_name,
        )
        return

    ti = get_airflow_task_instance(task_run=task_run)

    # we print success message to the screen
    # we will not send it to databand tracking store (track=False below)
    if ti.state == State.SUCCESS:
        dbnd_state = TaskRunState.SUCCESS
    elif ti.state in {State.UP_FOR_RETRY, State.UP_FOR_RESCHEDULE}:
        dbnd_state = TaskRunState.UP_FOR_RETRY
    elif ti.state in {State.FAILED, State.SHUTDOWN}:
        dbnd_state = TaskRunState.FAILED
    else:
        # we got a corruption here: the pod reported success but the task
        # instance is in none of the states handled above.
        error_msg = (
            "Pod %s has finished with SUCCESS, but task instance state is %s, failing the job."
            % (pod_name, ti.state)
        )
        error_help = "Please check pod logs/eviction retry"
        task_run_error = TaskRunError.build_from_message(
            task_run, error_msg, help_msg=error_help
        )
        self._handle_crashed_task_instance(
            failure_reason=PodFailureReason.err_pod_evicted,
            task_run_error=task_run_error,
            task_run=task_run,
        )
        return

    task_run.set_task_run_state(dbnd_state, track=False)
    self.log.info(
        "%s has been completed at pod '%s' with state %s try_number=%s!",
        task_run,
        pod_name,
        ti.state,
        ti._try_number,
    )
def _process_pod_success(self, submitted_pod):
    # type: (SubmittedPodState) -> None
    """
    Handle a pod 'success' event by mapping the Airflow task instance state
    onto the dbnd task run state.

    If the pod was already processed the event is skipped. If the task
    instance is not in one of the states handled below, an error is logged
    and the pod is passed to ``_process_pod_failed`` with
    ``PodFailureReason.err_pod_evicted``.

    :param submitted_pod: pod state exposing ``task_run``, ``pod_name`` and
        ``processed``.
    """
    task_run = submitted_pod.task_run
    pod_name = submitted_pod.pod_name

    if submitted_pod.processed:
        # The format string has two placeholders; both values must be passed,
        # otherwise the log record fails to format.
        self.log.info(
            "%s Skipping pod 'success' event from %s: already processed",
            task_run,
            pod_name,
        )
        return

    # get refreshed TI from Airflow DB
    ti = get_airflow_task_instance(task_run=task_run)

    # we print success message to the screen
    # we will not send it to databand tracking store (track=False below)
    if ti.state == State.SUCCESS:
        dbnd_state = TaskRunState.SUCCESS
    elif ti.state in {State.UP_FOR_RETRY, State.UP_FOR_RESCHEDULE}:
        dbnd_state = TaskRunState.UP_FOR_RETRY
    elif ti.state in {State.FAILED, State.SHUTDOWN}:
        dbnd_state = TaskRunState.FAILED
    else:
        # we got a corruption here, pod has finished, but the AF state is not
        # "final" state, meaning: AF execution was interrupted in the middle
        self.log.error(
            "Pod %s has finished with SUCCESS, but task instance state is %s, failing the job.",
            pod_name,
            ti.state,
        )
        self._process_pod_failed(submitted_pod, PodFailureReason.err_pod_evicted)
        return

    # only print to console
    task_run.set_task_run_state(dbnd_state, track=False)
    self.log.info(
        "%s has been completed at pod '%s' with state %s try_number=%s!",
        task_run,
        pod_name,
        ti.state,
        ti._try_number,
    )
def _find_pending_zombies(self, session):
    # type: (Session) -> List[TaskInstance]
    """
    Find pods that are in the `pending` state but have disappeared.

    This is a very rare scenario where:
    1) The pod was pending.
    2) The pod disappeared and we did not see any event telling us that
       the pod failed.

    More info: https://app.asana.com/0/1141064349624642/1200130408884044/f

    :param session: DB session used to load the Airflow task instances.
    :return: task instances of the submitted pods that never started
        running, have been submitted for at least
        ``self._pending_zombies_timeout`` and for which ``get_pod_status``
        returns None.
    """
    checked_at = timezone.utcnow()
    zombies = []

    submitted = self.k8s_executor.kube_scheduler.submitted_pods
    for name, state in six.iteritems(submitted):
        # Only pods that never reached the running state are candidates.
        if state.is_started_running:
            continue

        # ... and only once they have been pending for too long.
        pending_for = checked_at - state.submitted_at
        if not pending_for >= self._pending_zombies_timeout:
            continue

        if self.k8s_executor.kube_dbnd.get_pod_status(name) is not None:
            continue

        # the pod doesn't exist anymore, so it is a pending zombie
        zombies.append(get_airflow_task_instance(state.task_run, session=session))

    if zombies:
        self.log.warning(
            "Failing pending pods for more than {timeout}".format(
                timeout=self._pending_zombies_timeout
            )
        )
        ti_lines = [self._build_ti_msg(ti) for ti in zombies]
        self.log.warning(
            "Detected pending zombies pods for task instance: \n\t\t\t%s",
            "\n\t\t\t".join(ti_lines),
        )

    return zombies
def dbnd_set_task_pending_fail(self, pod_data, ex):
    """
    Mark the task behind a pod that is Pending with an exception as failed,
    both in dbnd and in the Airflow DB.

    Does nothing when no task run can be resolved from ``pod_data``.

    :param pod_data: pod object; its ``metadata.name`` is logged and its
        status log is saved to the task run tracker.
    :param ex: exception used to build the ``TaskRunError`` attached to the
        failed state.
    """
    pod_meta = pod_data.metadata

    run = _get_task_run_from_pod_data(pod_data)
    if not run:
        return

    from dbnd._core.task_run.task_run_error import TaskRunError

    error = TaskRunError.build_from_ex(ex, run)
    pod_status_log = _get_status_log_safe(pod_data)
    logger.info(
        "Pod '%s' is Pending with exception, marking it as failed. Pod Status:\n%s",
        pod_meta.name,
        pod_status_log,
    )

    # dbnd side: fail the task run with the error built above.
    run.set_task_run_state(TaskRunState.FAILED, error=error)

    # Airflow side: fail the task instance and persist it.
    airflow_ti = get_airflow_task_instance(run)

    from airflow.utils.state import State

    airflow_ti.state = State.FAILED
    update_airflow_task_instance_in_db(airflow_ti)

    run.tracker.save_task_run_log(pod_status_log)