Exemple #1
0
 def kill_run(self):
     """Mark the run as killed and ask the kill API client to stop it.

     Calls ``set()`` on ``_is_killed`` first, then sends the kill request
     for ``self.run_uid`` through ``self.context.kill_api_client``.

     :raises DatabandFailFastError: if anything goes wrong while sending
         the request; the original exception is passed as the second
         constructor argument.
     """
     _is_killed.set()
     try:
         api_client = self.context.kill_api_client
         api_client.kill_run(str(self.run_uid))
     except Exception as err:
         failure = DatabandFailFastError(
             "Could not send request to kill databand run!", err
         )
         raise failure
Exemple #2
0
 def kill_run(self):
     """Mark the run as killed and forward the kill request.

     Calls ``set()`` on ``_is_killed`` first, then delegates to the
     ``kill_run`` function with this run's uid (as a string) and context.

     :return: whatever the ``kill_run`` function returns
     :raises DatabandFailFastError: if the request fails for any reason;
         the original exception is passed as the second constructor argument.
     """
     _is_killed.set()
     try:
         outcome = kill_run(str(self.run_uid), ctx=self.context)
     except Exception as err:
         failure = DatabandFailFastError(
             "Could not send request to kill databand run!", err
         )
         raise failure
     return outcome
Exemple #3
0
    def kill_run(self, message=None):
        """Kill the databand run, failing the initiating task run first.

        :param message: optional error text; when falsy,
            ``DEFAULT_TASK_CANCELED_ERR_MSG`` is used instead
        :raises DatabandFailFastError: if the kill request cannot be sent
        :raises DatabandError: after a successful kill request, when the
            current task run belongs to this run
        """
        _is_killed.set()

        # The kill request signals every running run to shut down, which
        # eventually marks them as cancelled. The task run that asked for the
        # kill (the current one) must instead end up Failed, carrying an error,
        # so that its message can be shown in the UI as the error_message of
        # the whole run.
        initiator = current_task_run()
        if initiator.run == self.run:
            cancel_error = TaskRunError.build_from_message(
                task_run=initiator,
                msg=message or DEFAULT_TASK_CANCELED_ERR_MSG,
                help_msg="task with task_run_uid:%s initiated kill_run"
                % initiator.task_run_uid,
                ex_class=DbndCanceledRunError,
            )
            initiator.set_task_run_state(
                TaskRunState.FAILED, track=True, error=cancel_error
            )

        try:
            kill_run(str(self.run.run_uid), ctx=self.run.context)
        except Exception as err:
            failure = DatabandFailFastError(
                "Could not send request to kill databand run!", err
            )
            raise failure

        # Only the initiating task run has to surface the cancellation itself.
        if not (initiator.run == self.run):
            return
        raise DatabandError(message or DEFAULT_TASK_CANCELED_ERR_MSG)
Exemple #4
0
    def _process_dag_task_instances(self, ti_status, executor, pickle_id, session=None):
        """
        Process a set of task instances from a set of dag runs. Special handling is done
        to account for different task instance states that could be present when running
        them in a backfill process.

        The loop keeps running while there is something to run or something
        running, and stops as soon as a deadlock has been recorded.

        :param ti_status: the internal status of the job
        :type ti_status: DagRunJob._DagRunTaskStatus
        :param executor: the executor to run the task instances
        :type executor: BaseExecutor
        :param pickle_id: the pickle_id if dag is pickled, None otherwise
        :type pickle_id: int
        :param session: the current session object
        :type session: Session
        :return: the list of execution_dates for the finished dag runs
        :rtype: list
        :raises DatabandFailFastError: when ``self.fail_fast`` is set and at
            least one task instance was recorded as failed
        :raises: the error built by
            ``friendly_error.task_execution.databand_context_killed`` when
            ``current.is_killed()`` reports true at the top of an iteration
        """

        executed_run_dates = []

        # values() returns a view so we copy to maintain a full list of the TIs to run
        all_ti = list(ti_status.to_run.values())
        waiting_for_executor_result = {}

        while (ti_status.to_run or ti_status.running) and not ti_status.deadlocked:
            if current.is_killed():
                raise friendly_error.task_execution.databand_context_killed(
                    "SingleDagRunJob scheduling main loop"
                )
            self.log.debug("*** Clearing out not_ready list ***")
            ti_status.not_ready.clear()

            self.ti_state_manager.refresh_task_instances_state(
                all_ti, self.dag.dag_id, self.execution_date, session=session
            )

            # we need to execute the tasks bottom to top
            # or leaf to root, as otherwise tasks might be
            # determined deadlocked while they are actually
            # waiting for their upstream to finish
            for task in self.dag.topological_sort():

                # TODO: too complicated mechanism,
                # it's not possible that we have multiple tasks with the same id in to run
                # Iterate over a copy: entries are popped from to_run below.
                for key, ti in list(ti_status.to_run.items()):
                    if task.task_id != ti.task_id:
                        continue

                    if not self._optimize:
                        ti.refresh_from_db()

                    # Attach the task definition looked up from the dag; the
                    # outer loop variable is deliberately left untouched.
                    ti.task = self.dag.get_task(ti.task_id)

                    # TODO : do we need that?
                    # ignore_depends_on_past = (
                    #     self.ignore_first_depends_on_past and
                    #     ti.execution_date == (start_date or ti.start_date))
                    ignore_depends_on_past = False
                    self.log.debug("Task instance to run %s state %s", ti, ti.state)

                    # guard against externally modified tasks instances or
                    # in case max concurrency has been reached at task runtime
                    if ti.state == State.NONE:
                        self.log.warning(
                            "FIXME: task instance %s state was set to None "
                            "externally. This should not happen",
                            ti,
                        )
                        ti.set_state(State.SCHEDULED, session=session)

                    # The task was already marked successful or skipped by a
                    # different Job. Don't rerun it.
                    if ti.state == State.SUCCESS:
                        ti_status.succeeded.add(key)
                        self.log.debug("Task instance %s succeeded. Don't rerun.", ti)
                        ti_status.to_run.pop(key)
                        ti_status.running.pop(key, None)
                        continue
                    elif ti.state == State.SKIPPED:
                        ti_status.skipped.add(key)
                        self.log.debug("Task instance %s skipped. Don't rerun.", ti)
                        ti_status.to_run.pop(key)
                        ti_status.running.pop(key, None)
                        continue
                    elif ti.state == State.FAILED:
                        self.log.error("Task instance %s failed", ti)
                        ti_status.failed.add(key)
                        ti_status.to_run.pop(key)
                        ti_status.running.pop(key, None)
                        continue
                    elif ti.state == State.UPSTREAM_FAILED:
                        self.log.error("Task instance %s upstream failed", ti)
                        ti_status.failed.add(key)
                        ti_status.to_run.pop(key)
                        ti_status.running.pop(key, None)
                        continue

                    if self.airflow_config.disable_dag_concurrency_rules:
                        # RUN Deps validate dag and task concurrency
                        # It's less relevant when we run in stand along mode with SingleDagRunJob
                        # from airflow.ti_deps.deps.runnable_exec_date_dep import RunnableExecDateDep
                        from airflow.ti_deps.deps.valid_state_dep import ValidStateDep

                        # from airflow.ti_deps.deps.dag_ti_slots_available_dep import DagTISlotsAvailableDep
                        # from airflow.ti_deps.deps.task_concurrency_dep import TaskConcurrencyDep
                        # from airflow.ti_deps.deps.pool_slots_available_dep import PoolSlotsAvailableDep
                        runtime_deps = {
                            # RunnableExecDateDep(),
                            ValidStateDep(SCHEDUALED_OR_RUNNABLE),
                            # DagTISlotsAvailableDep(),
                            # TaskConcurrencyDep(),
                            # PoolSlotsAvailableDep(),
                        }
                    else:
                        runtime_deps = RUNNING_DEPS

                    dagrun_dep_context = DepContext(
                        deps=runtime_deps,
                        ignore_depends_on_past=ignore_depends_on_past,
                        ignore_task_deps=self.ignore_task_deps,
                        flag_upstream_failed=True,
                    )

                    # Is the task runnable? -- then run it
                    # the dependency checker can change states of tis
                    if ti.are_dependencies_met(
                        dep_context=dagrun_dep_context,
                        session=session,
                        verbose=self.verbose,
                    ):
                        ti.refresh_from_db(lock_for_update=True, session=session)
                        if (
                            ti.state == State.SCHEDULED
                            or ti.state == State.UP_FOR_RETRY
                        ):
                            if executor.has_task(ti):
                                self.log.debug(
                                    "Task Instance %s already in executor "
                                    "waiting for queue to clear",
                                    ti,
                                )
                            else:
                                self.log.debug("Sending %s to executor", ti)
                                # if ti.state == State.UP_FOR_RETRY:
                                #     ti._try_number += 1
                                # Skip scheduled state, we are executing immediately
                                ti.state = State.QUEUED
                                session.merge(ti)

                                # Only these two executor classes get a
                                # temporary copy of the configuration.
                                cfg_path = None
                                if executor.__class__ in (
                                    executors.LocalExecutor,
                                    executors.SequentialExecutor,
                                ):
                                    cfg_path = tmp_configuration_copy()

                                executor.queue_task_instance(
                                    ti,
                                    mark_success=self.mark_success,
                                    pickle_id=pickle_id,
                                    ignore_task_deps=self.ignore_task_deps,
                                    ignore_depends_on_past=ignore_depends_on_past,
                                    pool=self.pool,
                                    cfg_path=cfg_path,
                                )

                                ti_status.to_run.pop(key)
                                ti_status.running[key] = ti
                                waiting_for_executor_result[key] = ti
                        session.commit()
                        continue

                    # The dependency check above may have flipped the state.
                    if ti.state == State.UPSTREAM_FAILED:
                        self.log.error("Task instance %s upstream failed", ti)
                        ti_status.failed.add(key)
                        ti_status.to_run.pop(key)
                        ti_status.running.pop(key, None)
                        continue

                    # special case
                    if ti.state == State.UP_FOR_RETRY:
                        self.log.debug(
                            "Task instance %s retry period not expired yet", ti
                        )
                        ti_status.running.pop(key, None)
                        ti_status.to_run[key] = ti
                        continue

                    # all remaining tasks
                    self.log.debug("Adding %s to not_ready", ti)
                    ti_status.not_ready.add(key)
            # execute the tasks in the queue
            self.heartbeat()
            executor.heartbeat()

            # If the set of tasks that aren't ready ever equals the set of
            # tasks to run and there are no running tasks then the backfill
            # is deadlocked
            if (
                ti_status.not_ready
                and ti_status.not_ready == set(ti_status.to_run)
                and not ti_status.running
            ):
                self.log.warning(
                    "scheduler: Deadlock discovered for ti_status.to_run=%s",
                    ti_status.to_run.values(),
                )
                ti_status.deadlocked.update(ti_status.to_run.values())
                ti_status.to_run.clear()

            self.ti_state_manager.refresh_task_instances_state(
                all_ti, self.dag.dag_id, self.execution_date, session=session
            )

            # check executor state
            self._manage_executor_state(ti_status.running, waiting_for_executor_result)

            if self._zombie_cleaner:
                # this code exists in airflow original scheduler
                # clean zombies ( we don't need multiple runs here actually
                self._zombie_cleaner.find_and_clean_dag_zombies(
                    dag=self.dag, execution_date=self.execution_date
                )

            # update the task counters
            self._update_counters(ti_status, waiting_for_executor_result)

            # update dag run state
            # Iterate over a copy: finished runs are removed from active_runs.
            _dag_runs = ti_status.active_runs[:]
            for run in _dag_runs:
                run.update_state(session=session)

                self._update_databand_task_run_states(run)

                if run.state in State.finished():
                    ti_status.finished_runs += 1
                    ti_status.active_runs.remove(run)
                    executed_run_dates.append(run.execution_date)

            self._log_progress(ti_status)

            if self.fail_fast and ti_status.failed:
                # The second element of each failed key is joined into the message.
                msg = ",".join([t[1] for t in ti_status.failed])
                logger.error(
                    "scheduler: Terminating executor because a task failed and fail_fast mode is enabled %s",
                    msg,
                )
                raise DatabandFailFastError(
                    "Failing whole pipeline as it has failed/canceled tasks %s" % msg,
                )

        # return updated status
        return executed_run_dates