Example #1
 def handle_pod_retry(
     self,
     pod_data,
     task_run,
     increment_try_number=False,
     reason=PodRetryReason.exit_code,
 ):
     metadata = pod_data.metadata
     logger.info("Reached handle pod retry for pod %s for reason %s!",
                 metadata.name, reason)
     if reason == PodRetryReason.exit_code:
         pod_exit_code = self._try_get_pod_exit_code(pod_data)
         if not pod_exit_code:
             # Couldn't find an exit code - container is still alive - wait for the next event
             logger.debug("No exit code found for pod %s, doing nothing",
                          metadata.name)
             return False
         logger.info("Found pod exit code %d for pod %s", pod_exit_code,
                     metadata.name)
         if self._should_pod_be_retried(pod_exit_code, self.kube_config):
             retry_count, retry_delay = self._get_pod_retry_parameters(
                 pod_exit_code, self.kube_config)
             task_instance = get_airflow_task_instance(task_run)
             task_instance.max_tries = retry_count
             """
             TaskInstance has no retry delay property, it is gathered from the DbndOperator.
             The operator's values are overridden and taken from configuration if the engine is running K8s.
             See dbnd_operator.py
             """
             return self._schedule_pod_for_retry(
                 metadata,
                 retry_count,
                 retry_delay,
                 task_instance,
                 task_run,
                 increment_try_number,
             )
         else:
             logger.debug(
                 "Pod %s was not scheduled for retry because its exit code %d was not found in config",
                 metadata.name,
                 pod_exit_code,
             )
             return False
     elif reason == PodRetryReason.err_image_pull:
         retry_count = self.kube_config.retry_on_image_pull_error_count
         retry_delay = self.kube_config.pod_retry_delay
         task_instance = get_airflow_task_instance(task_run)
         task_instance.max_tries = retry_count
         return self._schedule_pod_for_retry(
             metadata,
             retry_count,
             retry_delay,
             task_instance,
             task_run,
             increment_try_number,
         )
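
Note: the helpers _should_pod_be_retried and _get_pod_retry_parameters are not shown in this example. The sketch below is only a guess at how such a lookup against the engine configuration could work; the field names retry_on_exit_codes and retry_on_exit_code_count are hypothetical placeholders, while pod_retry_delay does appear in the code above.

# Standalone sketch (not dbnd code): a plausible exit-code based retry lookup.
# retry_on_exit_codes / retry_on_exit_code_count are made-up field names.
from collections import namedtuple

KubeConfigStub = namedtuple(
    "KubeConfigStub",
    ["retry_on_exit_codes", "retry_on_exit_code_count", "pod_retry_delay"],
)


def should_pod_be_retried(pod_exit_code, kube_config):
    # Retry only exit codes that were explicitly listed in the configuration.
    return pod_exit_code in kube_config.retry_on_exit_codes


def get_pod_retry_parameters(pod_exit_code, kube_config):
    # Same retry count and delay for every configured exit code.
    return kube_config.retry_on_exit_code_count, kube_config.pod_retry_delay


config = KubeConfigStub(
    retry_on_exit_codes={137, 143},  # e.g. OOMKilled / SIGTERM
    retry_on_exit_code_count=3,
    pod_retry_delay=120,  # seconds
)
if should_pod_be_retried(137, config):
    print(get_pod_retry_parameters(137, config))  # (3, 120)
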
Example #2
    def _handle_crashed_task_instance(self,
                                      task_run,
                                      task_run_error,
                                      failure_reason,
                                      session=None):
        # type: (TaskRun, TaskRunError, PodFailureReason, Optional[Session]) -> None

        task_instance = get_airflow_task_instance(task_run, session=session)
        task_instance.task = task_run.task.ctrl.airflow_op

        retry_config = self.kube_dbnd.engine_config.pod_retry_config
        retry_count = retry_config.get_retry_count(failure_reason)
        if retry_count is not None:
            # update retries with the latest values from the pod retry config
            task_run.task.task_retries = retry_count
            task_instance.task.retries = retry_count
            task_instance.max_tries = retry_count

        self.log.info(
            "Retry %s  task: max_retries=%s, task.retries=%s, current:%s state:%s",
            task_run,
            task_instance.max_tries,
            task_instance.task.retries,
            task_instance._try_number,
            task_instance.state,
        )

        if task_instance.state == State.QUEUED:
            # Special case - no airflow code has been run in the pod at all.
            # _try_number is usually incremented the moment the state moves to Running,
            # and it stays the same while the task is in the Running state.
            # Since that never happened here, we must increment the try number ourselves.
            task_instance._try_number += 1
            session.merge(task_instance)
            session.commit()

        # Airflow decides whether to retry the task based on this condition:
        # >>> task_instance.task.retries and task_instance.try_number <= task_instance.max_tries
        task_instance.handle_failure(str(task_run_error.exception),
                                     session=session)

        # will be logged to help debug why we did or didn't retry the task
        retry_data = "task.retries={reties}, try_number={try_number}, max_tries={max_tries}.".format(
            reties=task_instance.task.retries,
            try_number=task_instance.try_number,
            max_tries=task_instance.max_tries,
        )

        if task_instance.state == State.UP_FOR_RETRY:
            self.log.info("Set the task {task_id} to retry: {data}".format(
                task_id=str(task_run), data=retry_data))
            task_run.set_task_run_state(TaskRunState.UP_FOR_RETRY,
                                        track=True,
                                        error=task_run_error)
        else:
            self.log.info(
                "Task {task_id} is not set up to retry: {data}".format(
                    task_id=str(task_run), data=retry_data))
            task_run.set_task_run_state(TaskRunState.FAILED,
                                        track=True,
                                        error=task_run_error)
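
The comment above quotes the condition Airflow uses to decide whether a failed task goes to UP_FOR_RETRY. The tiny standalone sketch below (not Airflow or dbnd code) only makes that condition and the QUEUED special case concrete.

# Illustration of the retry condition quoted in the comment above.
def will_airflow_retry(retries, try_number, max_tries):
    return bool(retries) and try_number <= max_tries


# The QUEUED special case bumps _try_number manually because the increment
# that normally happens when the task moves to Running never took place.
print(will_airflow_retry(retries=3, try_number=1, max_tries=3))  # True  -> UP_FOR_RETRY
print(will_airflow_retry(retries=3, try_number=4, max_tries=3))  # False -> FAILED
print(will_airflow_retry(retries=0, try_number=1, max_tries=3))  # False -> FAILED
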
Example #3
    def _handle_crashed_task_instance(
        self, task_run, task_run_error, failure_reason, session=None
    ):

        task_instance = get_airflow_task_instance(task_run, session=session)
        task_instance.task = task_run.task.ctrl.airflow_op

        retry_config = self.kube_dbnd.engine_config.pod_retry_config
        retry_count = retry_config.get_retry_count(failure_reason)
        if retry_count is not None:
            # update retries with the latest values from the pod retry config
            task_run.task.task_retries = retry_count
            task_instance.task.retries = retry_count
            task_instance.max_tries = retry_count

        self.log.info(
            "Retry %s  task: max_retries=%s, task.retries=%s, current:%s state:%s",
            task_run,
            task_instance.max_tries,
            task_instance.task.retries,
            task_instance._try_number,
            task_instance.state,
        )
        # retry condition: self.task.retries and self.try_number <= self.max_tries
        increase_try_number = False

        if task_instance.state == State.QUEUED:
            # Special case - no airflow code has been run in the pod at all.
            # _try_number is usually incremented the moment the state moves to Running,
            # and it stays the same while the task is in the Running state.
            # Since that never happened here, we must increment the try number ourselves.
            task_instance._try_number += 1
            session.merge(task_instance)
            session.commit()

        task_instance.handle_failure(str(task_run_error.exception), session=session)

        if task_instance.state == State.UP_FOR_RETRY:
            task_run.set_task_run_state(
                TaskRunState.UP_FOR_RETRY, track=True, error=task_run_error
            )
        else:
            task_run.set_task_run_state(
                TaskRunState.FAILED, track=True, error=task_run_error
            )
Example #4
    def _process_pod_success(self, submitted_pod):
        task_run = submitted_pod.task_run
        pod_name = submitted_pod.pod_name

        if submitted_pod.processed:
            self.log.info(
                "%s Skipping pod 'success' event from %s: already processed", pod_name
            )
            return
        ti = get_airflow_task_instance(task_run=task_run)

        # we print a success message to the screen,
        # but we will not send it to the databand tracking store

        if ti.state == State.SUCCESS:
            dbnd_state = TaskRunState.SUCCESS
        elif ti.state in {State.UP_FOR_RETRY, State.UP_FOR_RESCHEDULE}:
            dbnd_state = TaskRunState.UP_FOR_RETRY
        elif ti.state in {State.FAILED, State.SHUTDOWN}:
            dbnd_state = TaskRunState.FAILED
        else:
            # we hit a corrupted state here:
            error_msg = (
                "Pod %s has finished with SUCCESS, but task instance state is %s, failing the job."
                % (pod_name, ti.state)
            )
            error_help = "Please check pod logs/eviction retry"
            task_run_error = TaskRunError.build_from_message(
                task_run, error_msg, help_msg=error_help
            )
            self._handle_crashed_task_instance(
                failure_reason=PodFailureReason.err_pod_evicted,
                task_run_error=task_run_error,
                task_run=task_run,
            )
            return

        task_run.set_task_run_state(dbnd_state, track=False)
        self.log.info(
            "%s has been completed at pod '%s' with state %s try_number=%s!"
            % (task_run, pod_name, ti.state, ti._try_number)
        )
Example #5
    def _process_pod_success(self, submitted_pod):
        # type: (SubmittedPodState) -> None

        task_run = submitted_pod.task_run
        pod_name = submitted_pod.pod_name

        if submitted_pod.processed:
            self.log.info(
                "%s Skipping pod 'success' event from %s: already processed",
                pod_name)
            return

        # get refreshed TI from Airflow DB
        ti = get_airflow_task_instance(task_run=task_run)

        # we print a success message to the screen,
        # but we will not send it to the databand tracking store

        if ti.state == State.SUCCESS:
            dbnd_state = TaskRunState.SUCCESS
        elif ti.state in {State.UP_FOR_RETRY, State.UP_FOR_RESCHEDULE}:
            dbnd_state = TaskRunState.UP_FOR_RETRY
        elif ti.state in {State.FAILED, State.SHUTDOWN}:
            dbnd_state = TaskRunState.FAILED
        else:
            # we hit a corrupted state here: the pod has finished, but the AF state is not a "final" state,
            # meaning the AF execution was interrupted in the middle
            self.log.error(
                "Pod %s has finished with SUCCESS, but task instance state is %s, failing the job."
                % (pod_name, ti.state))

            self._process_pod_failed(submitted_pod,
                                     PodFailureReason.err_pod_evicted)
            return

        # only print to console
        task_run.set_task_run_state(dbnd_state, track=False)
        self.log.info(
            "%s has been completed at pod '%s' with state %s try_number=%s!" %
            (task_run, pod_name, ti.state, ti._try_number))
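
Both versions of _process_pod_success translate the refreshed Airflow task-instance state into a dbnd TaskRunState. The sketch below restates that mapping as a standalone dict over plain strings (stand-ins for the Airflow State constants and dbnd enum members) so it runs without Airflow installed; it is illustrative only.

# Airflow TI state -> dbnd state for a pod that reported SUCCESS.
# Any state not listed here is treated as a corrupted/interrupted run.
AF_TO_DBND_FINAL_STATE = {
    "success": "SUCCESS",
    "up_for_retry": "UP_FOR_RETRY",
    "up_for_reschedule": "UP_FOR_RETRY",
    "failed": "FAILED",
    "shutdown": "FAILED",
}


def map_final_state(airflow_ti_state):
    # None means the TI never reached a final state, so the caller falls
    # back to the pod-failure path (err_pod_evicted above).
    return AF_TO_DBND_FINAL_STATE.get(airflow_ti_state)


print(map_final_state("success"))       # SUCCESS
print(map_final_state("up_for_retry"))  # UP_FOR_RETRY
print(map_final_state("queued"))        # None -> handled as a failure
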
Example #6
    def _find_pending_zombies(self, session):
        # type: (Session) -> List[TaskInstance]
        """
        Find pods that are in `pending` state but have disappeared.

        This is a very specific scenario where:
            1) The pod was pending
            2) The pod disappeared and we never saw any event telling us that it failed

        More info:
            https://app.asana.com/0/1141064349624642/1200130408884044/f
        """
        now = timezone.utcnow()
        pending_zombies = []

        for pod_name, pod_state in six.iteritems(
                self.k8s_executor.kube_scheduler.submitted_pods):
            # we look for a state where the pod is pending for too long
            if (not pod_state.is_started_running
                    and (now - pod_state.submitted_at) >=
                    self._pending_zombies_timeout):
                pod_status = self.k8s_executor.kube_dbnd.get_pod_status(
                    pod_name)
                if pod_status is None:
                    # the pod doesn't exist anymore, so it's a pending zombie
                    af_ti = get_airflow_task_instance(pod_state.task_run,
                                                      session=session)
                    pending_zombies.append(af_ti)

        if pending_zombies:
            self.log.warning(
                "Failing pending pods for more than {timeout}".format(
                    timeout=self._pending_zombies_timeout))
            self.log.warning(
                "Detected pending zombies pods for task instance: \n\t\t\t%s",
                "\n\t\t\t".join(
                    self._build_ti_msg(ti) for ti in pending_zombies),
            )

        return pending_zombies
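
The pending-zombie check above boils down to one predicate: the pod never started running and has been pending longer than self._pending_zombies_timeout. A self-contained sketch of that predicate, with a made-up PodState stand-in for the scheduler's submitted-pod state, is below.

# Standalone sketch of the pending-zombie predicate (not dbnd code).
from collections import namedtuple
from datetime import datetime, timedelta

PodState = namedtuple("PodState", ["is_started_running", "submitted_at"])


def is_pending_too_long(pod_state, now, pending_timeout):
    # Candidate zombie: never reached Running and pending past the timeout.
    return (not pod_state.is_started_running
            and (now - pod_state.submitted_at) >= pending_timeout)


now = datetime.now()
stuck = PodState(is_started_running=False, submitted_at=now - timedelta(minutes=10))
fresh = PodState(is_started_running=False, submitted_at=now - timedelta(seconds=30))
print(is_pending_too_long(stuck, now, timedelta(minutes=5)))  # True  -> check if the pod still exists
print(is_pending_too_long(fresh, now, timedelta(minutes=5)))  # False -> keep waiting

Only pods that pass this cheap time check trigger the get_pod_status lookup; a missing pod at that point is the zombie case.
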
Example #7
    def dbnd_set_task_pending_fail(self, pod_data, ex):
        metadata = pod_data.metadata

        task_run = _get_task_run_from_pod_data(pod_data)
        if not task_run:
            return
        from dbnd._core.task_run.task_run_error import TaskRunError

        task_run_error = TaskRunError.build_from_ex(ex, task_run)

        status_log = _get_status_log_safe(pod_data)
        logger.info(
            "Pod '%s' is Pending with exception, marking it as failed. Pod Status:\n%s",
            metadata.name,
            status_log,
        )
        task_run.set_task_run_state(TaskRunState.FAILED, error=task_run_error)
        task_instance = get_airflow_task_instance(task_run)
        from airflow.utils.state import State

        task_instance.state = State.FAILED
        update_airflow_task_instance_in_db(task_instance)
        task_run.tracker.save_task_run_log(status_log)
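
_get_status_log_safe is not shown in this example. A defensive helper in that spirit might serialize whatever status the kubernetes client's pod model carries and never raise, so the failure path above can always attach something to the task-run log. The sketch below is a guess under that assumption, not the dbnd implementation.

# Hypothetical stand-in for _get_status_log_safe (illustrative only).
import pprint


def get_status_log_safe(pod_data):
    try:
        # kubernetes client models (e.g. V1Pod) expose to_dict() on their fields
        return pprint.pformat(pod_data.status.to_dict())
    except Exception as ex:
        # building the log must never break the failure handler itself
        return "failed to read pod status: %s" % ex
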