Beispiel #1
0
    def sync(self):
        """Run every queued task instance sequentially and record its final state.

        Iterates over ``self.tasks_to_run`` (entries are ``(key, ti, pool)``) and,
        for each entry:

        * marks it ``UPSTREAM_FAILED`` without running it when fail-fast is active
          and an earlier task in this batch has already failed;
        * marks it ``FAILED`` without running it when ``current.is_killed()``
          reports that the context was killed;
        * otherwise runs it via ``self._run_task_instance`` and reports
          ``SUCCESS`` or ``FAILED`` through ``self.change_state``.

        A ``KeyboardInterrupt`` raised while running a task is not propagated; it
        fails that task and turns on fail-fast for the remainder of this batch so
        the remaining queued tasks are not started.

        The queue is emptied once all entries have been handled.
        """
        task_failed = False
        # Local copy: an interrupt must be able to force fail-fast for the rest of
        # this batch without permanently changing the executor's configuration.
        fail_fast = self.fail_fast
        for key, ti, pool in self.tasks_to_run:
            if fail_fast and task_failed:
                logger.info("Setting %s to %s", key, State.UPSTREAM_FAILED)
                ti.set_state(State.UPSTREAM_FAILED)
                self.change_state(key, State.UPSTREAM_FAILED)
                continue

            if current.is_killed():
                logger.info("Databand Context is killed! Stopping %s to %s",
                            key, State.FAILED)
                ti.set_state(State.FAILED)
                self.change_state(key, State.FAILED)
                continue

            self.log.debug("Executing task: %s", ti)

            try:
                self._run_task_instance(ti, mark_success=False, pool=pool)
                self.change_state(key, State.SUCCESS)
            except subprocess.CalledProcessError as e:
                task_failed = True
                self.change_state(key, State.FAILED)
                self.log.error("Failed to execute task: %s.", str(e))
            except DatabandError as e:
                task_failed = True
                self.change_state(key, State.FAILED)
                self.log.error("Failed to execute task: %s.", str(e))
            except KeyboardInterrupt as e:
                # KeyboardInterrupt is not an Exception subclass, so it needs its
                # own handler. The user asked to stop: skip everything still queued.
                task_failed = True
                fail_fast = True
                self.change_state(key, State.FAILED)
                self.log.exception("Interrupted to execute task: %s.", str(e))
            except Exception as e:
                task_failed = True
                self.change_state(key, State.FAILED)
                show_error_once.log_error(self.log, e,
                                          "Failed to execute task %s: %s.",
                                          ti.task_id, str(e))

        self.tasks_to_run = []
Beispiel #2
0
    def _process_dag_task_instances(self, ti_status, executor, pickle_id, session=None):
        """
        Process a set of task instances from a set of dag runs. Special handling is done
        to account for different task instance states that could be present when running
        them in a backfill process.

        The loop keeps going while there are task instances to run or running and
        nothing has been marked as deadlocked. Each pass refreshes task instance
        state, queues runnable task instances on the executor, heartbeats, checks
        for a deadlock, reconciles executor results and updates dag run state.

        :param ti_status: the internal status of the job
        :type ti_status: DagRunJob._DagRunTaskStatus
        :param executor: the executor to run the task instances
        :type executor: BaseExecutor
        :param pickle_id: the pickle_id if dag is pickled, None otherwise
        :type pickle_id: int
        :param session: the current session object
        :type session: Session
        :return: the list of execution_dates for the finished dag runs
        :rtype: list
        :raises DatabandFailFastError: when ``self.fail_fast`` is set and at least
            one task instance has failed
        """

        executed_run_dates = []

        # values() returns a view so we copy to maintain a full list of the TIs to run
        all_ti = list(ti_status.to_run.values())
        # task instances handed to the executor whose result has not been processed yet
        waiting_for_executor_result = {}

        while (len(ti_status.to_run) > 0 or len(ti_status.running) > 0) and len(
            ti_status.deadlocked
        ) == 0:
            if current.is_killed():
                raise friendly_error.task_execution.databand_context_killed(
                    "SingleDagRunJob scheduling main loop"
                )
            self.log.debug("*** Clearing out not_ready list ***")
            ti_status.not_ready.clear()

            self.ti_state_manager.refresh_task_instances_state(
                all_ti, self.dag.dag_id, self.execution_date, session=session
            )

            # we need to execute the tasks bottom to top
            # or leaf to root, as otherwise tasks might be
            # determined deadlocked while they are actually
            # waiting for their upstream to finish
            for task in self.dag.topological_sort():

                # TODO: too complicated mechanism,
                # it's not possible that we have multiple tasks with the same id in to run
                # Iterate over a copy: entries are popped from to_run inside the loop.
                for key, ti in list(ti_status.to_run.items()):
                    if task.task_id != ti.task_id:
                        continue

                    if not self._optimize:
                        ti.refresh_from_db()

                    # Re-attach the task object from the DAG without rebinding the
                    # outer loop variable.
                    ti.task = self.dag.get_task(ti.task_id)

                    # TODO : do we need that?
                    # ignore_depends_on_past = (
                    #     self.ignore_first_depends_on_past and
                    #     ti.execution_date == (start_date or ti.start_date))
                    ignore_depends_on_past = False
                    self.log.debug("Task instance to run %s state %s", ti, ti.state)

                    # guard against externally modified tasks instances or
                    # in case max concurrency has been reached at task runtime
                    if ti.state == State.NONE:
                        self.log.warning(
                            "FIXME: task instance %s state was set to None "
                            "externally. This should not happen",
                            ti,
                        )
                        ti.set_state(State.SCHEDULED, session=session)

                    # The task was already marked successful or skipped by a
                    # different Job. Don't rerun it.
                    if ti.state == State.SUCCESS:
                        ti_status.succeeded.add(key)
                        self.log.debug("Task instance %s succeeded. Don't rerun.", ti)
                        ti_status.to_run.pop(key)
                        if key in ti_status.running:
                            ti_status.running.pop(key)
                        continue
                    elif ti.state == State.SKIPPED:
                        ti_status.skipped.add(key)
                        self.log.debug("Task instance %s skipped. Don't rerun.", ti)
                        ti_status.to_run.pop(key)
                        if key in ti_status.running:
                            ti_status.running.pop(key)
                        continue
                    elif ti.state == State.FAILED:
                        self.log.error("Task instance %s failed", ti)
                        ti_status.failed.add(key)
                        ti_status.to_run.pop(key)
                        if key in ti_status.running:
                            ti_status.running.pop(key)
                        continue
                    elif ti.state == State.UPSTREAM_FAILED:
                        self.log.error("Task instance %s upstream failed", ti)
                        ti_status.failed.add(key)
                        ti_status.to_run.pop(key)
                        if key in ti_status.running:
                            ti_status.running.pop(key)
                        continue

                    if self.airflow_config.disable_dag_concurrency_rules:
                        # RUN Deps validate dag and task concurrency
                        # It's less relevant when we run in stand along mode with SingleDagRunJob
                        # from airflow.ti_deps.deps.runnable_exec_date_dep import RunnableExecDateDep
                        from airflow.ti_deps.deps.valid_state_dep import ValidStateDep

                        # from airflow.ti_deps.deps.dag_ti_slots_available_dep import DagTISlotsAvailableDep
                        # from airflow.ti_deps.deps.task_concurrency_dep import TaskConcurrencyDep
                        # from airflow.ti_deps.deps.pool_slots_available_dep import PoolSlotsAvailableDep
                        runtime_deps = {
                            # RunnableExecDateDep(),
                            ValidStateDep(SCHEDUALED_OR_RUNNABLE),
                            # DagTISlotsAvailableDep(),
                            # TaskConcurrencyDep(),
                            # PoolSlotsAvailableDep(),
                        }
                    else:
                        runtime_deps = RUNNING_DEPS

                    dagrun_dep_context = DepContext(
                        deps=runtime_deps,
                        ignore_depends_on_past=ignore_depends_on_past,
                        ignore_task_deps=self.ignore_task_deps,
                        flag_upstream_failed=True,
                    )

                    # Is the task runnable? -- then run it
                    # the dependency checker can change states of tis
                    if ti.are_dependencies_met(
                        dep_context=dagrun_dep_context,
                        session=session,
                        verbose=self.verbose,
                    ):
                        ti.refresh_from_db(lock_for_update=True, session=session)
                        if (
                            ti.state == State.SCHEDULED
                            or ti.state == State.UP_FOR_RETRY
                        ):
                            if executor.has_task(ti):
                                self.log.debug(
                                    "Task Instance %s already in executor "
                                    "waiting for queue to clear",
                                    ti,
                                )
                            else:
                                self.log.debug("Sending %s to executor", ti)
                                # if ti.state == State.UP_FOR_RETRY:
                                #     ti._try_number += 1
                                # Skip scheduled state, we are executing immediately
                                ti.state = State.QUEUED
                                session.merge(ti)

                                # Only the local and sequential executors get a
                                # temporary copy of the configuration.
                                cfg_path = None
                                if executor.__class__ in (
                                    executors.LocalExecutor,
                                    executors.SequentialExecutor,
                                ):
                                    cfg_path = tmp_configuration_copy()

                                executor.queue_task_instance(
                                    ti,
                                    mark_success=self.mark_success,
                                    pickle_id=pickle_id,
                                    ignore_task_deps=self.ignore_task_deps,
                                    ignore_depends_on_past=ignore_depends_on_past,
                                    pool=self.pool,
                                    cfg_path=cfg_path,
                                )

                                ti_status.to_run.pop(key)
                                ti_status.running[key] = ti
                                waiting_for_executor_result[key] = ti
                        # commit also releases the row lock taken by refresh_from_db
                        session.commit()
                        continue

                    # The dependency check above may have flagged the task
                    # instance as upstream-failed (flag_upstream_failed=True).
                    if ti.state == State.UPSTREAM_FAILED:
                        self.log.error("Task instance %s upstream failed", ti)
                        ti_status.failed.add(key)
                        ti_status.to_run.pop(key)
                        if key in ti_status.running:
                            ti_status.running.pop(key)
                        continue

                    # special case
                    if ti.state == State.UP_FOR_RETRY:
                        self.log.debug(
                            "Task instance %s retry period not " "expired yet", ti
                        )
                        if key in ti_status.running:
                            ti_status.running.pop(key)
                        ti_status.to_run[key] = ti
                        continue

                    # all remaining tasks
                    self.log.debug("Adding %s to not_ready", ti)
                    ti_status.not_ready.add(key)
            # execute the tasks in the queue
            self.heartbeat()
            executor.heartbeat()

            # If the set of tasks that aren't ready ever equals the set of
            # tasks to run and there are no running tasks then the backfill
            # is deadlocked
            if (
                ti_status.not_ready
                and ti_status.not_ready == set(ti_status.to_run)
                and len(ti_status.running) == 0
            ):
                self.log.warning(
                    "scheduler: Deadlock discovered for ti_status.to_run=%s",
                    ti_status.to_run.values(),
                )
                ti_status.deadlocked.update(ti_status.to_run.values())
                ti_status.to_run.clear()

            self.ti_state_manager.refresh_task_instances_state(
                all_ti, self.dag.dag_id, self.execution_date, session=session
            )

            # check executor state
            self._manage_executor_state(ti_status.running, waiting_for_executor_result)

            if self._zombie_cleaner:
                # this code exists in airflow original scheduler
                # clean zombies ( we don't need multiple runs here actually
                self._zombie_cleaner.find_and_clean_dag_zombies(
                    dag=self.dag, execution_date=self.execution_date
                )

            # update the task counters
            self._update_counters(ti_status, waiting_for_executor_result)

            # update dag run state
            # iterate over a copy: finished runs are removed from active_runs below
            _dag_runs = ti_status.active_runs[:]
            for run in _dag_runs:
                run.update_state(session=session)

                self._update_databand_task_run_states(run)

                if run.state in State.finished():
                    ti_status.finished_runs += 1
                    ti_status.active_runs.remove(run)
                    executed_run_dates.append(run.execution_date)

            self._log_progress(ti_status)

            if self.fail_fast and ti_status.failed:
                # second element of each failed key is used as its label in the message
                msg = ",".join([t[1] for t in ti_status.failed])
                logger.error(
                    "scheduler: Terminating executor because a task failed and fail_fast mode is enabled %s",
                    msg,
                )
                raise DatabandFailFastError(
                    "Failing whole pipeline as it has failed/canceled tasks %s" % msg,
                )

        # return updated status
        return executed_run_dates