def _set_unfinished_dag_runs_to_failed(self, dag_runs, session=None):
    """
    Go through the dag_runs and update the state based on the task_instance state.
    Then set DAG runs that are not finished to failed.

    :param dag_runs: DAG runs
    :param session: session
    :return: None
    """
    for dag_run in dag_runs:
        dag_run.update_state()
        if dag_run.state not in State.finished():
            dag_run.set_state(State.FAILED)
        session.merge(dag_run)
def poke(self, context, session=None):
    ti = context['ti']
    tasks = (
        session.query(ErgoTask)
        .options(joinedload('job'))
        .filter_by(task_id=self.ergo_task_id, ti_dag_id=ti.dag_id)
        # TODO filter on execution date (otherwise there's no point)
    )
    tasks = list(tasks)
    self.log.info('Received %d results...', len(tasks))
    for task in tasks:
        job = task.job
        if job is None or task.state in State.unfinished():
            return False
    return True
def _draw_task(
    task: Union[MappedOperator, BaseOperator],
    parent_graph: graphviz.Digraph,
    states_by_task_id: Optional[Dict[Any, Any]],
) -> None:
    """Draw a single task on the given parent_graph"""
    if states_by_task_id:
        state = states_by_task_id.get(task.task_id, State.NONE)
        color = State.color_fg(state)
        fill_color = State.color(state)
    else:
        color = task.ui_fgcolor
        fill_color = task.ui_color
    parent_graph.node(
        task.task_id,
        _attributes={
            "label": task.label,
            "shape": "rectangle",
            "style": "filled,rounded",
            "color": _refine_color(color),
            "fillcolor": _refine_color(fill_color),
        },
    )
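# A minimal, hedged sketch of the graphviz call that _draw_task relies on.
# It assumes only the `graphviz` PyPI package; the node name and colors are
# made up for illustration, not Airflow's real state palette.
import graphviz

g = graphviz.Digraph()
g.node(
    'extract',
    _attributes={
        'label': 'extract',
        'shape': 'rectangle',
        'style': 'filled,rounded',
        'color': '#000000',      # hypothetical foreground color
        'fillcolor': '#d6f5d6',  # hypothetical "success" fill
    },
)
print(g.source)  # emits the DOT text for the single rounded node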
def verify_state(self, dag, task_ids, execution_dates, state, old_tis, session=None):
    TI = models.TaskInstance
    tis = session.query(TI).filter(
        TI.dag_id == dag.dag_id,
        TI.execution_date.in_(execution_dates)
    ).all()
    self.assertTrue(len(tis) > 0)

    for ti in tis:  # pylint: disable=too-many-nested-blocks
        if ti.task_id in task_ids and ti.execution_date in execution_dates:
            self.assertEqual(ti.state, state)
            if state in State.finished():
                self.assertIsNotNone(ti.end_date)
        else:
            for old_ti in old_tis:
                if old_ti.task_id == ti.task_id and old_ti.execution_date == ti.execution_date:
                    self.assertEqual(ti.state, old_ti.state)
def ensure_finished_tasks(self, dag, execution_date, session):
    """
    This method makes sure finished_tasks is populated if it's currently None.
    This is for the strange feature of running tasks without a dag_run.

    :param dag: The DAG for which to find finished tasks
    :type dag: airflow.models.DAG
    :param execution_date: The execution_date to look for
    :param session: Database session to use
    :return: A list of all the finished tasks of this DAG and execution_date
    :rtype: list[airflow.models.TaskInstance]
    """
    if self.finished_tasks is None:
        self.finished_tasks = dag.get_task_instances(
            start_date=execution_date,
            end_date=execution_date,
            state=State.finished() + [State.UPSTREAM_FAILED],
            session=session,
        )
    return self.finished_tasks
def get_backlog(session):
    backlog = dict(
        session.query(TI.queue, func.count()).filter(
            or_(
                # Tasks that are unfinished but not yet queued.
                # The task state can be null in some cases, which
                # we choose to ignore here.
                and_(
                    TI.queued_dttm.is_(None),
                    TI.state.isnot(None),
                    TI.state.notin_(State.finished()),
                ),
                TI.state.in_([State.QUEUED]),
            )
        ).group_by(TI.queue).all()
    )
    queues = []
    for queue, count in backlog.items():
        q = Queue(name=queue, backlog=count)
        queues.append(q)
    return queues
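# A hedged, pure-Python sketch of the aggregation step above: the query
# yields (queue, count) rows, which dict() folds into a mapping that is then
# wrapped in Queue objects. `Queue` here is a stand-in dataclass, not the
# real model from the source.
from dataclasses import dataclass

@dataclass
class Queue:
    name: str
    backlog: int

rows = [('default', 12), ('gpu', 3)]  # what .group_by(TI.queue).all() might yield
backlog = dict(rows)
queues = [Queue(name=q, backlog=c) for q, c in backlog.items()]
print(queues)  # [Queue(name='default', backlog=12), Queue(name='gpu', backlog=3)]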
def _get_states_count_upstream_ti(ti, finished_tasks, session):
    """
    This function returns the states of the upstream tis for a specific ti,
    in order to determine whether this ti can run in this iteration.

    :param ti: the ti that we want to calculate deps for
    :type ti: airflow.models.TaskInstance
    :param finished_tasks: all the finished tasks of the dag_run
    :type finished_tasks: list[airflow.models.TaskInstance]
    """
    if finished_tasks is None:
        # this is for the strange feature of running tasks without a dag_run
        finished_tasks = ti.task.dag.get_task_instances(
            start_date=ti.execution_date,
            end_date=ti.execution_date,
            state=State.finished() + [State.UPSTREAM_FAILED],
            session=session)
    counter = Counter(
        task.state for task in finished_tasks
        if task.task_id in ti.task.upstream_task_ids)
    return (
        counter.get(State.SUCCESS, 0),
        counter.get(State.SKIPPED, 0),
        counter.get(State.FAILED, 0),
        counter.get(State.UPSTREAM_FAILED, 0),
        sum(counter.values()),
    )
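# A hedged, self-contained sketch of the Counter tally above, with plain
# strings standing in for State members and task instances:
from collections import Counter

upstream_ids = {'A', 'B', 'C'}
finished = [('A', 'success'), ('B', 'failed'), ('C', 'success'), ('Z', 'success')]
counter = Counter(state for task_id, state in finished if task_id in upstream_ids)
print(counter.get('success', 0),  # 2
      counter.get('skipped', 0),  # 0
      counter.get('failed', 0),   # 1
      sum(counter.values()))      # 3 -- 'Z' is not upstream, so it is excluded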
def _get_dag_runs(self, event, session):
    dag_runs = []
    if EventType.is_in(event.event_type) and EventType(event.event_type) != EventType.UNDEFINED:
        if EventType(event.event_type) == EventType.DAG_RUN_EXECUTABLE:
            dag_run_id = int(event.key)
            dag_run = session.query(DagRun).filter(DagRun.id == dag_run_id).first()
            if dag_run is None:
                self.log.error("DagRun is None for id {0}".format(dag_run_id))
                return dag_runs
            simple_dag = event.simple_dag
            dag_run.pickle_id = None
            # create route
            self.dagrun_route.add_dagrun(dag_run, simple_dag, session)
            dag_runs.append(dag_run)
        elif EventType(event.event_type) == EventType.TASK_STATUS_CHANGED:
            dag_id, task_id, execution_date = TaskInstanceHelper.from_task_key(event.key)
            state, try_num = TaskInstanceHelper.from_event_value(event.value)
            dag_run = self.dagrun_route.find_dagrun(dag_id, execution_date)
            if dag_run is None:
                return dag_runs
            self._set_task_instance_state(dag_run, dag_id, task_id, execution_date, state, try_num)

            sync_dag_run = session.query(DagRun).filter(DagRun.id == dag_run.id).first()
            if sync_dag_run.state in State.finished():
                self.log.info(
                    "DagRun finished dag_id {0} execution_date {1} state {2}".format(
                        dag_run.dag_id, dag_run.execution_date, sync_dag_run.state))
                if self.dagrun_route.find_dagrun_by_id(sync_dag_run.id) is not None:
                    self.dagrun_route.remove_dagrun(dag_run, session)
                    self.log.debug("Route remove dag run {0}".format(sync_dag_run.id))
                    self.mail_box.send_message(
                        DagRunFinishedEvent(dag_run.id, sync_dag_run.state))
            else:
                dag_runs.append(dag_run)
        elif EventType(event.event_type) == EventType.DAG_RUN_FINISHED:
            self.log.debug("DagRun {0} finished".format(event.key))
        elif EventType(event.event_type) == EventType.STOP_SCHEDULER_CMD:
            if self.unit_test_mode:
                self.running = False
            return dag_runs
    else:
        runs = self.dagrun_route.find_dagruns_by_event(
            event_key=event.key, event_type=event.event_type)
        if runs is not None:
            for run in runs:
                task_deps = load_task_dependencies(dag_id=run.dag_id, session=session)
                tis = run.get_task_instances(session=session)
                for ti in tis:
                    if ti.task_id not in task_deps:
                        continue
                    if (event.key, event.event_type) in task_deps[ti.task_id]:
                        self.log.debug("{0} handle event {1}".format(ti.task_id, event))
                        ts = TaskState.query_task_state(ti, session=session)
                        handler = ts.event_handler
                        if handler is not None:
                            action = handler.handle_event(event, ti=ti, ts=ts, session=session)
                            ts.action = action
                            session.merge(ts)
                            session.commit()
                            self.log.debug("set task action {0} {1}".format(ti.task_id, action))
            dag_runs.extend(runs)
    session.commit()
    for dag_run in dag_runs:
        run_process_func(target=process_tasks, args=(
            dag_run,
            self.dagrun_route.find_simple_dag(dag_run.id),
            self.log,
        ))
    return dag_runs
def set_state(self, state):
    if self._state != state:
        self._state = state
        self.end_date = timezone.utcnow() if self._state in State.finished() else None
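# A hedged illustration of the end_date convention above, with plain strings
# standing in for airflow.utils.state.State and its finished() list:
from datetime import datetime, timezone

FINISHED = ('success', 'failed')  # stand-in for State.finished()

class Run:
    _state = None
    end_date = None

    def set_state(self, state):
        if self._state != state:
            self._state = state
            self.end_date = datetime.now(timezone.utc) if self._state in FINISHED else None

run = Run()
run.set_state('running')
print(run.end_date)  # None -- the run is still unfinished
run.set_state('success')
print(run.end_date)  # a timestamp -- the state just became terminal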
def update_state(self, session=None):
    """
    Determines the overall state of the DagRun based on the state
    of its TaskInstances.

    :return: ready_tis: the tis that can be scheduled in the current loop
    :rtype ready_tis: list[airflow.models.TaskInstance]
    """
    dag = self.get_dag()
    ready_tis = []
    tis = list(self.get_task_instances(
        session=session, state=State.task_states + (State.SHUTDOWN,)))
    self.log.debug("number of tis tasks for %s: %s task(s)", self, len(tis))
    for ti in tis:
        ti.task = dag.get_task(ti.task_id)

    start_dttm = timezone.utcnow()
    unfinished_tasks = [t for t in tis if t.state in State.unfinished()]
    finished_tasks = [t for t in tis if t.state in State.finished() + [State.UPSTREAM_FAILED]]
    none_depends_on_past = all(not t.task.depends_on_past for t in unfinished_tasks)
    none_task_concurrency = all(t.task.task_concurrency is None for t in unfinished_tasks)
    if unfinished_tasks:
        scheduleable_tasks = [ut for ut in unfinished_tasks if ut.state in SCHEDULEABLE_STATES]
        self.log.debug("number of scheduleable tasks for %s: %s task(s)",
                       self, len(scheduleable_tasks))
        ready_tis, changed_tis = self._get_ready_tis(scheduleable_tasks, finished_tasks, session)
        self.log.debug("ready tis length for %s: %s task(s)", self, len(ready_tis))
        if none_depends_on_past and none_task_concurrency:
            # small speed up
            are_runnable_tasks = ready_tis or self._are_premature_tis(
                unfinished_tasks, finished_tasks, session) or changed_tis

    duration = (timezone.utcnow() - start_dttm)
    Stats.timing("dagrun.dependency-check.{}".format(self.dag_id), duration)

    leaf_task_ids = {t.task_id for t in dag.leaves}
    leaf_tis = [ti for ti in tis if ti.task_id in leaf_task_ids]

    # if all leaves finished and at least one failed, the run failed
    if not unfinished_tasks and any(
            leaf_ti.state in {State.FAILED, State.UPSTREAM_FAILED} for leaf_ti in leaf_tis):
        self.log.error('Marking run %s failed', self)
        self.set_state(State.FAILED)
        dag.handle_callback(self, success=False, reason='task_failure', session=session)

    # if all leaves succeeded and no unfinished tasks, the run succeeded
    elif not unfinished_tasks and all(
            leaf_ti.state in {State.SUCCESS, State.SKIPPED} for leaf_ti in leaf_tis):
        self.log.info('Marking run %s successful', self)
        self.set_state(State.SUCCESS)
        dag.handle_callback(self, success=True, reason='success', session=session)

    # if *all tasks* are deadlocked, the run failed
    elif (unfinished_tasks and none_depends_on_past and
            none_task_concurrency and not are_runnable_tasks):
        self.log.error('Deadlock; marking run %s failed', self)
        self.set_state(State.FAILED)
        dag.handle_callback(self, success=False, reason='all_tasks_deadlocked', session=session)

    # finally, if the leaves aren't done, the dag is still running
    else:
        self.set_state(State.RUNNING)

    self._emit_duration_stats_for_finished_state()

    # todo: determine whether we want to use with_for_update to make sure to lock the run
    session.merge(self)
    session.commit()

    return ready_tis
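# A hedged, dependency-free sketch of the decision table above: once nothing
# is unfinished, the run state is derived from the leaf task states.
def decide_run_state(leaf_states, unfinished, deadlocked):
    if not unfinished and any(s in ('failed', 'upstream_failed') for s in leaf_states):
        return 'failed'   # at least one leaf failed
    if not unfinished and all(s in ('success', 'skipped') for s in leaf_states):
        return 'success'  # every leaf succeeded or was skipped
    if unfinished and deadlocked:
        return 'failed'   # nothing can ever become runnable
    return 'running'      # leaves not done yet

print(decide_run_state(['success', 'skipped'], unfinished=[], deadlocked=False))  # success
print(decide_run_state(['success', 'failed'], unfinished=[], deadlocked=False))   # failed
print(decide_run_state(['success'], unfinished=['x'], deadlocked=False))          # running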
def state_token(state):
    color = State.color(state)
    return Markup('<span class="label" style="background-color:{color};">'
                  '{state}</span>').format(color=color, state=state)
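# A hedged note on why Markup.format is used above: markupsafe escapes the
# interpolated values, so a hostile state string cannot inject HTML. This
# snippet assumes only the `markupsafe` package.
from markupsafe import Markup

badge = Markup('<span class="label" style="background-color:{color};">'
               '{state}</span>').format(color='green', state='<script>x</script>')
print(badge)  # the <script> tag comes out HTML-escaped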
def _process_dag_task_instances(self, ti_status, executor, pickle_id, session=None):
    """
    Process a set of task instances from a set of dag runs. Special handling is done
    to account for different task instance states that could be present when running
    them in a backfill process.

    :param ti_status: the internal status of the job
    :type ti_status: DagRunJob._DagRunTaskStatus
    :param executor: the executor to run the task instances
    :type executor: BaseExecutor
    :param pickle_id: the pickle_id if dag is pickled, None otherwise
    :type pickle_id: int
    :param session: the current session object
    :type session: Session
    :return: the list of execution_dates for the finished dag runs
    :rtype: list
    """
    executed_run_dates = []

    # values() returns a view so we copy to maintain a full list of the TIs to run
    all_ti = list(ti_status.to_run.values())
    waiting_for_executor_result = {}

    while (len(ti_status.to_run) > 0 or len(ti_status.running) > 0) and \
            len(ti_status.deadlocked) == 0:
        if current.is_killed():
            raise friendly_error.task_execution.databand_context_killed(
                "SingleDagRunJob scheduling main loop"
            )
        self.log.debug("*** Clearing out not_ready list ***")
        ti_status.not_ready.clear()

        self.ti_state_manager.refresh_task_instances_state(
            all_ti, self.dag.dag_id, self.execution_date, session=session
        )

        # we need to execute the tasks bottom to top
        # or leaf to root, as otherwise tasks might be
        # determined deadlocked while they are actually
        # waiting for their upstream to finish
        for task in self.dag.topological_sort():
            # TODO: too complicated mechanism,
            # it's not possible that we have multiple tasks with the same id in to_run
            for key, ti in list(ti_status.to_run.items()):
                if task.task_id != ti.task_id:
                    continue
                if not self._optimize:
                    ti.refresh_from_db()

                task = self.dag.get_task(ti.task_id)
                ti.task = task

                # TODO: do we need that?
                # ignore_depends_on_past = (
                #     self.ignore_first_depends_on_past and
                #     ti.execution_date == (start_date or ti.start_date))
                ignore_depends_on_past = False
                self.log.debug("Task instance to run %s state %s", ti, ti.state)

                # guard against externally modified task instances or
                # in case max concurrency has been reached at task runtime
                if ti.state == State.NONE:
                    self.log.warning(
                        "FIXME: task instance %s state was set to None "
                        "externally. This should not happen", ti
                    )
                    ti.set_state(State.SCHEDULED, session=session)

                # The task was already marked successful or skipped by a
                # different Job. Don't rerun it.
                if ti.state == State.SUCCESS:
                    ti_status.succeeded.add(key)
                    self.log.debug("Task instance %s succeeded. Don't rerun.", ti)
                    ti_status.to_run.pop(key)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    continue
                elif ti.state == State.SKIPPED:
                    ti_status.skipped.add(key)
                    self.log.debug("Task instance %s skipped. Don't rerun.", ti)
                    ti_status.to_run.pop(key)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    continue
                elif ti.state == State.FAILED:
                    self.log.error("Task instance %s failed", ti)
                    ti_status.failed.add(key)
                    ti_status.to_run.pop(key)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    continue
                elif ti.state == State.UPSTREAM_FAILED:
                    self.log.error("Task instance %s upstream failed", ti)
                    ti_status.failed.add(key)
                    ti_status.to_run.pop(key)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    continue

                runtime_deps = []
                if self.airflow_config.disable_dag_concurrency_rules:
                    # RUN Deps validate dag and task concurrency;
                    # less relevant when we run in standalone mode with SingleDagRunJob
                    # from airflow.ti_deps.deps.runnable_exec_date_dep import RunnableExecDateDep
                    from airflow.ti_deps.deps.valid_state_dep import ValidStateDep
                    # from airflow.ti_deps.deps.dag_ti_slots_available_dep import DagTISlotsAvailableDep
                    # from airflow.ti_deps.deps.task_concurrency_dep import TaskConcurrencyDep
                    # from airflow.ti_deps.deps.pool_slots_available_dep import PoolSlotsAvailableDep

                    runtime_deps = {
                        # RunnableExecDateDep(),
                        ValidStateDep(SCHEDUALED_OR_RUNNABLE),
                        # DagTISlotsAvailableDep(),
                        # TaskConcurrencyDep(),
                        # PoolSlotsAvailableDep(),
                    }
                else:
                    runtime_deps = RUNNING_DEPS

                dagrun_dep_context = DepContext(
                    deps=runtime_deps,
                    ignore_depends_on_past=ignore_depends_on_past,
                    ignore_task_deps=self.ignore_task_deps,
                    flag_upstream_failed=True,
                )

                # Is the task runnable? -- then run it
                # the dependency checker can change states of tis
                if ti.are_dependencies_met(
                    dep_context=dagrun_dep_context, session=session, verbose=self.verbose
                ):
                    ti.refresh_from_db(lock_for_update=True, session=session)
                    if ti.state == State.SCHEDULED or ti.state == State.UP_FOR_RETRY:
                        if executor.has_task(ti):
                            self.log.debug(
                                "Task Instance %s already in executor "
                                "waiting for queue to clear",
                                ti,
                            )
                        else:
                            self.log.debug("Sending %s to executor", ti)
                            # if ti.state == State.UP_FOR_RETRY:
                            #     ti._try_number += 1
                            # Skip scheduled state, we are executing immediately
                            ti.state = State.QUEUED
                            session.merge(ti)

                            cfg_path = None
                            if executor.__class__ in (
                                executors.LocalExecutor,
                                executors.SequentialExecutor,
                            ):
                                cfg_path = tmp_configuration_copy()

                            executor.queue_task_instance(
                                ti,
                                mark_success=self.mark_success,
                                pickle_id=pickle_id,
                                ignore_task_deps=self.ignore_task_deps,
                                ignore_depends_on_past=ignore_depends_on_past,
                                pool=self.pool,
                                cfg_path=cfg_path,
                            )
                            ti_status.to_run.pop(key)
                            ti_status.running[key] = ti
                            waiting_for_executor_result[key] = ti
                    session.commit()
                    continue

                if ti.state == State.UPSTREAM_FAILED:
                    self.log.error("Task instance %s upstream failed", ti)
                    ti_status.failed.add(key)
                    ti_status.to_run.pop(key)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    continue

                # special case
                if ti.state == State.UP_FOR_RETRY:
                    self.log.debug("Task instance %s retry period not expired yet", ti)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    ti_status.to_run[key] = ti
                    continue

                # all remaining tasks
                self.log.debug("Adding %s to not_ready", ti)
                ti_status.not_ready.add(key)

        # execute the tasks in the queue
        self.heartbeat()
        executor.heartbeat()

        # If the set of tasks that aren't ready ever equals the set of
        # tasks to run and there are no running tasks then the backfill
        # is deadlocked
        if (
            ti_status.not_ready
            and ti_status.not_ready == set(ti_status.to_run)
            and len(ti_status.running) == 0
        ):
            self.log.warning(
                "scheduler: Deadlock discovered for ti_status.to_run=%s",
                ti_status.to_run.values(),
            )
            ti_status.deadlocked.update(ti_status.to_run.values())
            ti_status.to_run.clear()

        self.ti_state_manager.refresh_task_instances_state(
            all_ti, self.dag.dag_id, self.execution_date, session=session
        )

        # check executor state
        self._manage_executor_state(ti_status.running, waiting_for_executor_result)

        if self._zombie_cleaner:
            # this code exists in the original airflow scheduler:
            # clean zombies (we don't need multiple runs here actually)
            self._zombie_cleaner.find_and_clean_dag_zombies(
                dag=self.dag, execution_date=self.execution_date
            )

        # update the task counters
        self._update_counters(ti_status, waiting_for_executor_result)

        # update dag run state
        _dag_runs = ti_status.active_runs[:]
        for run in _dag_runs:
            run.update_state(session=session)
            self._update_databand_task_run_states(run)
            if run.state in State.finished():
                ti_status.finished_runs += 1
                ti_status.active_runs.remove(run)
                executed_run_dates.append(run.execution_date)

        self._log_progress(ti_status)

        if self.fail_fast and ti_status.failed:
            msg = ",".join([t[1] for t in ti_status.failed])
            logger.error(
                "scheduler: Terminating executor because a task failed "
                "and fail_fast mode is enabled %s",
                msg,
            )
            raise DatabandFailFastError(
                "Failing whole pipeline as it has failed/canceled tasks %s" % msg,
            )

    # return updated status
    return executed_run_dates
import logging
import subprocess
from datetime import datetime, timedelta

import pendulum

from airflow.models import DAG, DagRun, TaskInstance
from airflow.operators.python_operator import PythonOperator
from airflow.operators.subdag_operator import SubDagOperator
from airflow.utils.db import provide_session
from airflow.utils.state import State

unfinished_states = State.unfinished()
delta_from_max_date = timedelta(minutes=10)
delta_to_mark_as_stuck = timedelta(hours=12)


@provide_session
def find_dag_runs(first, dag_id=None, execution_date=None, state=None, session=None):
    query = session.query(DagRun)
    query = query if dag_id is None else query.filter(DagRun.dag_id == dag_id)
    query = query if execution_date is None else query.filter(DagRun.execution_date == execution_date)
    query = query if state is None else query.filter(DagRun.state == state)
    return query.first() if first else query.all()


@provide_session
def find_task_instances(first, task_id=None, dag_id=None, execution_date=None,
                        state=None, operator=None, session=None):
    query = session.query(TaskInstance)
    query = query if task_id is None else query.filter(TaskInstance.task_id == task_id)
    query = query if dag_id is None else query.filter(TaskInstance.dag_id == dag_id)
    query = query if execution_date is None else query.filter(TaskInstance.execution_date == execution_date)
    # (tail reconstructed to follow the same filter pattern as find_dag_runs above)
    query = query if state is None else query.filter(TaskInstance.state == state)
    query = query if operator is None else query.filter(TaskInstance.operator == operator)
    return query.first() if first else query.all()
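# A hedged usage sketch for the helpers above (assumes a configured Airflow
# metadata DB; @provide_session injects `session` when it is not passed, and
# 'example_dag' is an illustrative dag_id, not one from the source):
running_runs = find_dag_runs(first=False, dag_id='example_dag', state=State.RUNNING)
queued_tis = find_task_instances(first=False, dag_id='example_dag', state=State.QUEUED)
logging.info("running runs=%d queued tis=%d", len(running_runs), len(queued_tis))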
def mark_state(ti, sensor_instance):
    # NOTE: `state` and `end_date` are captured from the enclosing scope;
    # this helper is defined inside a function that receives them.
    ti.state = state
    sensor_instance.state = state
    if state in State.finished():
        ti.end_date = end_date
        ti.set_duration()
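# A hedged sketch of the enclosing-scope pattern assumed above: the outer
# function fixes `state` and `end_date` once, and the closure applies them to
# many (ti, sensor_instance) pairs. All names here are illustrative.
from datetime import datetime, timezone

def make_mark_state(state, end_date, finished_states):
    def mark_state(ti, sensor_instance):
        ti['state'] = state
        sensor_instance['state'] = state
        if state in finished_states:
            ti['end_date'] = end_date
    return mark_state

mark = make_mark_state('success', datetime.now(timezone.utc), {'success', 'failed'})
ti, si = {}, {}
mark(ti, si)
print(ti)  # {'state': 'success', 'end_date': ...}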
def _is_finished_wait_for_gapped_task(self, context):
    '''
    @return: bool - whether there are tasks to wait for.
    '''
    if self._gapped_root_dag_run is None:
        self._init_gapped_root_dag_run(context['execution_date'])
        if self._gapped_root_dag_run is None:
            # The start time of the external gapped dag run is more recent
            # than the needed gap, so there is nothing to sense here.
            return True

    self.log.info(
        'Poking for the following '
        '{self._external_dag_id}.'
        '{self._external_task_id} on '
        '{self._gapped_root_dag_run.execution_date} ... '.format(**locals()))
    self._gapped_root_dag_run.refresh_from_db()
    is_finished_wait_for_gapped_task = True
    root_state = self._gapped_root_dag_run.get_state()
    if root_state == State.RUNNING:
        is_finished_wait_for_gapped_task = False
        self._refresh_gapped_dag_run()
        if self._gapped_dag_run is not None:
            gapped_dag_run_state = self._gapped_dag_run.get_state()
            if gapped_dag_run_state in State.unfinished():
                external_task_instance = self._gapped_dag_run.get_task_instance(
                    task_id=self._external_task_id)
                if external_task_instance is None:
                    self.log.info(
                        'Still poking since the dag run has not finished and the '
                        'gapped task instance still has not started: '
                        'dag_id: {self._gapped_dag_run.dag_id} '
                        'run_id: {self._gapped_dag_run.run_id} '
                        'state: {gapped_dag_run_state} '.format(**locals()))
                elif external_task_instance.state in State.unfinished():
                    self.log.info(
                        'Still poking since the gapped task instance has not finished: '
                        'dag_id: {self._gapped_dag_run.dag_id} '
                        'run_id: {self._gapped_dag_run.run_id} '
                        'start_date: {external_task_instance.start_date} '
                        'end_date: {external_task_instance.end_date} '
                        'task_state: {external_task_instance.state} '.format(**locals()))
                else:
                    is_finished_wait_for_gapped_task = True
                    self.log.info(
                        'Finish poking since the gapped task instance has finished: '
                        'dag_id: {self._gapped_dag_run.dag_id} '
                        'run_id: {self._gapped_dag_run.run_id} '
                        'start_date: {external_task_instance.start_date} '
                        'end_date: {external_task_instance.end_date} '
                        'task_state: {external_task_instance.state} '.format(**locals()))
            else:
                is_finished_wait_for_gapped_task = True
                self.log.info(
                    'Finish poking since the gapped dag run is not running any more: '
                    'dag_id: {self._gapped_dag_run.dag_id} '
                    'run_id: {self._gapped_dag_run.run_id} '
                    'state: {gapped_dag_run_state} '.format(**locals()))
        else:
            self.log.info(
                'Still poking since the root dag is still running and the gapped '
                'dag run does not exist: '
                'dag_id: {self._gapped_root_dag_run.dag_id} '
                'run_id: {self._gapped_root_dag_run.run_id} '
                'state: {root_state} '.format(**locals()))
    else:
        self.log.info(
            'Finish poking since the root dag is not running any more: '
            'dag_id: {self._gapped_root_dag_run.dag_id} '
            'run_id: {self._gapped_root_dag_run.run_id} '
            'state: {root_state} '.format(**locals()))

    return is_finished_wait_for_gapped_task
def state_token(state): """Returns a formatted string with HTML for a given State""" color = State.color(state) return Markup('<span class="label" style="background-color:{color};">' '{state}</span>').format(color=color, state=state)
def state_f(v, c, m, p):
    state = m.state
    color = State.color(m.state)
    return Markup('<span class="label" style="background-color:{color};">'
                  '{state}</span>').format(**locals())
def _process_backfill_task_instances(self,
                                     ti_status,
                                     executor,
                                     pickle_id,
                                     start_date=None, session=None):
    """
    Process a set of task instances from a set of dag runs. Special handling is done
    to account for different task instance states that could be present when running
    them in a backfill process.

    :param ti_status: the internal status of the job
    :type ti_status: BackfillJob._DagRunTaskStatus
    :param executor: the executor to run the task instances
    :type executor: BaseExecutor
    :param pickle_id: the pickle_id if dag is pickled, None otherwise
    :type pickle_id: int
    :param start_date: the start date of the backfill job
    :type start_date: datetime.datetime
    :param session: the current session object
    :type session: sqlalchemy.orm.session.Session
    :return: the list of execution_dates for the finished dag runs
    :rtype: list
    """
    executed_run_dates = []

    while ((len(ti_status.to_run) > 0 or len(ti_status.running) > 0) and
            len(ti_status.deadlocked) == 0):
        self.log.debug("*** Clearing out not_ready list ***")
        ti_status.not_ready.clear()

        # we need to execute the tasks bottom to top
        # or leaf to root, as otherwise tasks might be
        # determined deadlocked while they are actually
        # waiting for their upstream to finish
        @provide_session
        def _per_task_process(task, key, ti, session=None):
            ti.refresh_from_db()

            task = self.dag.get_task(ti.task_id)
            ti.task = task

            ignore_depends_on_past = (
                self.ignore_first_depends_on_past and
                ti.execution_date == (start_date or ti.start_date))
            self.log.debug("Task instance to run %s state %s", ti, ti.state)

            # The task was already marked successful or skipped by a
            # different Job. Don't rerun it.
            if ti.state == State.SUCCESS:
                ti_status.succeeded.add(key)
                self.log.debug("Task instance %s succeeded. Don't rerun.", ti)
                ti_status.to_run.pop(key)
                if key in ti_status.running:
                    ti_status.running.pop(key)
                return
            elif ti.state == State.SKIPPED:
                ti_status.skipped.add(key)
                self.log.debug("Task instance %s skipped. Don't rerun.", ti)
                ti_status.to_run.pop(key)
                if key in ti_status.running:
                    ti_status.running.pop(key)
                return

            # guard against externally modified task instances or
            # in case max concurrency has been reached at task runtime
            elif ti.state == State.NONE:
                self.log.warning(
                    "FIXME: task instance {} state was set to None "
                    "externally. This should not happen".format(ti))
                ti.set_state(State.SCHEDULED, session=session)
            if self.rerun_failed_tasks:
                # Rerun failed tasks or upstream-failed tasks
                if ti.state in (State.FAILED, State.UPSTREAM_FAILED):
                    self.log.error("Task instance {ti} "
                                   "with state {state}".format(ti=ti, state=ti.state))
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    # Reset the failed task in backfill to scheduled state
                    ti.set_state(State.SCHEDULED, session=session)
            else:
                # Default behaviour which works for subdag.
                if ti.state in (State.FAILED, State.UPSTREAM_FAILED):
                    self.log.error("Task instance {ti} "
                                   "with {state} state".format(ti=ti, state=ti.state))
                    ti_status.failed.add(key)
                    ti_status.to_run.pop(key)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    return

            backfill_context = DepContext(
                deps=BACKFILL_QUEUED_DEPS,
                ignore_depends_on_past=ignore_depends_on_past,
                ignore_task_deps=self.ignore_task_deps,
                flag_upstream_failed=True)

            ti.refresh_from_db(lock_for_update=True, session=session)
            # Is the task runnable? -- then run it
            # the dependency checker can change states of tis
            if ti.are_dependencies_met(
                    dep_context=backfill_context,
                    session=session,
                    verbose=self.verbose):
                if executor.has_task(ti):
                    self.log.debug(
                        "Task Instance %s already in executor "
                        "waiting for queue to clear", ti)
                else:
                    self.log.debug('Sending %s to executor', ti)
                    # Skip scheduled state, we are executing immediately
                    ti.state = State.QUEUED
                    ti.queued_dttm = timezone.utcnow() if not ti.queued_dttm else ti.queued_dttm
                    session.merge(ti)

                    cfg_path = None
                    if executor.__class__ in (executors.LocalExecutor,
                                              executors.SequentialExecutor):
                        cfg_path = tmp_configuration_copy()

                    executor.queue_task_instance(
                        ti,
                        mark_success=self.mark_success,
                        pickle_id=pickle_id,
                        ignore_task_deps=self.ignore_task_deps,
                        ignore_depends_on_past=ignore_depends_on_past,
                        pool=self.pool,
                        cfg_path=cfg_path)
                    ti_status.running[key] = ti
                    ti_status.to_run.pop(key)
                session.commit()
                return

            if ti.state == State.UPSTREAM_FAILED:
                self.log.error("Task instance %s upstream failed", ti)
                ti_status.failed.add(key)
                ti_status.to_run.pop(key)
                if key in ti_status.running:
                    ti_status.running.pop(key)
                return

            # special case
            if ti.state == State.UP_FOR_RETRY:
                self.log.debug("Task instance %s retry period not expired yet", ti)
                if key in ti_status.running:
                    ti_status.running.pop(key)
                ti_status.to_run[key] = ti
                return

            # special case
            if ti.state == State.UP_FOR_RESCHEDULE:
                self.log.debug("Task instance %s reschedule period not expired yet", ti)
                if key in ti_status.running:
                    ti_status.running.pop(key)
                ti_status.to_run[key] = ti
                return

            # all remaining tasks
            self.log.debug('Adding %s to not_ready', ti)
            ti_status.not_ready.add(key)

        try:
            for task in self.dag.topological_sort():
                for key, ti in list(ti_status.to_run.items()):
                    if task.task_id != ti.task_id:
                        continue

                    pool = session.query(models.Pool) \
                        .filter(models.Pool.pool == task.pool) \
                        .first()
                    if not pool:
                        raise PoolNotFound('Unknown pool: {}'.format(task.pool))

                    open_slots = pool.open_slots(session=session)
                    if open_slots <= 0:
                        raise NoAvailablePoolSlot(
                            "Not scheduling since there are "
                            "{0} open slots in pool {1}".format(open_slots, task.pool))

                    num_running_task_instances_in_dag = DAG.get_num_task_instances(
                        self.dag_id,
                        states=self.STATES_COUNT_AS_RUNNING,
                    )

                    if num_running_task_instances_in_dag >= self.dag.concurrency:
                        raise DagConcurrencyLimitReached(
                            "Not scheduling since DAG concurrency limit "
                            "is reached.")

                    if task.task_concurrency:
                        num_running_task_instances_in_task = DAG.get_num_task_instances(
                            dag_id=self.dag_id,
                            task_ids=[task.task_id],
                            states=self.STATES_COUNT_AS_RUNNING,
                        )

                        if num_running_task_instances_in_task >= task.task_concurrency:
                            raise TaskConcurrencyLimitReached(
                                "Not scheduling since Task concurrency limit "
                                "is reached.")

                    _per_task_process(task, key, ti)
        except (NoAvailablePoolSlot,
                DagConcurrencyLimitReached,
                TaskConcurrencyLimitReached) as e:
            self.log.debug(e)

        # execute the tasks in the queue
        self.heartbeat()
        executor.heartbeat()

        # If the set of tasks that aren't ready ever equals the set of
        # tasks to run and there are no running tasks then the backfill
        # is deadlocked
        if (ti_status.not_ready and
                ti_status.not_ready == set(ti_status.to_run) and
                len(ti_status.running) == 0):
            self.log.warning("Deadlock discovered for ti_status.to_run=%s",
                             ti_status.to_run.values())
            ti_status.deadlocked.update(ti_status.to_run.values())
            ti_status.to_run.clear()

        # check executor state
        self._manage_executor_state(ti_status.running)

        # update the task counters
        self._update_counters(ti_status=ti_status)

        # update dag run state
        _dag_runs = ti_status.active_runs[:]
        for run in _dag_runs:
            run.update_state(session=session)
            if run.state in State.finished():
                ti_status.finished_runs += 1
                ti_status.active_runs.remove(run)
                executed_run_dates.append(run.execution_date)

        self._log_progress(ti_status)

    # return updated status
    return executed_run_dates
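# A hedged, dependency-free sketch of the deadlock test used in the loop
# above: if every task still to run is "not ready" and nothing is running,
# no future iteration can make progress.
def is_deadlocked(not_ready, to_run, running):
    return bool(not_ready) and not_ready == set(to_run) and len(running) == 0

print(is_deadlocked({'a', 'b'}, {'a': 1, 'b': 2}, {}))        # True
print(is_deadlocked({'a'}, {'a': 1, 'b': 2}, {}))             # False -- 'b' may still run
print(is_deadlocked({'a', 'b'}, {'a': 1, 'b': 2}, {'c': 3}))  # False -- 'c' is running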
def set_state(tasks: Iterable[BaseOperator],
              execution_date: datetime.datetime,
              upstream: bool = False,
              downstream: bool = False,
              future: bool = False,
              past: bool = False,
              state: str = State.SUCCESS,
              commit: bool = False,
              session=None):  # pylint: disable=too-many-arguments,too-many-locals
    """
    Set the state of a task instance and if needed its relatives. Can set state
    for future tasks (calculated from execution_date) and retroactively
    for past tasks. Will verify integrity of past dag runs in order to create
    tasks that did not exist. It will not create dag runs that are missing
    on the schedule (but it will for subdag dag runs if needed).

    :param tasks: the iterable of tasks from which to work. task.task.dag needs to be set
    :param execution_date: the execution date from which to start looking
    :param upstream: Mark all parents (upstream tasks)
    :param downstream: Mark all siblings (downstream tasks) of task_id, including SubDags
    :param future: Mark all future tasks on the interval of the dag up until
        last execution date.
    :param past: Retroactively mark all tasks starting from start_date of the DAG
    :param state: State to which the tasks need to be set
    :param commit: Commit tasks to be altered to the database
    :param session: database session
    :return: list of tasks that have been created and updated
    """
    if not tasks:
        return []

    if not timezone.is_localized(execution_date):
        raise ValueError("Received non-localized date {}".format(execution_date))

    task_dags = {task.dag for task in tasks}
    if len(task_dags) > 1:
        raise ValueError("Received tasks from multiple DAGs: {}".format(task_dags))
    dag = next(iter(task_dags))
    if dag is None:
        raise ValueError("Received tasks with no DAG")

    dates = get_execution_dates(dag, execution_date, future, past)

    task_ids = list(find_task_relatives(tasks, downstream, upstream))

    confirmed_dates = verify_dag_run_integrity(dag, dates)

    sub_dag_run_ids = get_subdag_runs(dag, session, state, task_ids, commit, confirmed_dates)

    # now look for the task instances that are affected
    qry_dag = get_all_dag_task_query(dag, session, state, task_ids, confirmed_dates)

    if commit:
        tis_altered = qry_dag.with_for_update().all()
        if sub_dag_run_ids:
            qry_sub_dag = all_subdag_tasks_query(sub_dag_run_ids, session, state, confirmed_dates)
            tis_altered += qry_sub_dag.with_for_update().all()
        for task_instance in tis_altered:
            task_instance.state = state
            if state in State.finished():
                task_instance.end_date = timezone.utcnow()
                task_instance.set_duration()
    else:
        tis_altered = qry_dag.all()
        if sub_dag_run_ids:
            qry_sub_dag = all_subdag_tasks_query(sub_dag_run_ids, session, state, confirmed_dates)
            tis_altered += qry_sub_dag.all()

    return tis_altered
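# A hedged usage sketch for set_state above. It assumes an initialized
# Airflow environment with a loaded `dag`; the 'transform' task id is
# illustrative, and commit=False makes it a dry run.
import pendulum

altered = set_state(
    tasks=[dag.get_task('transform')],  # hypothetical task in a loaded `dag`
    execution_date=pendulum.datetime(2020, 1, 1, tz='UTC'),
    downstream=True,                    # also mark everything downstream
    state=State.SUCCESS,
    commit=False,                       # dry run: just list what would change
)
print(len(altered), "task instance(s) would be updated")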
def update_state(self, session=None):
    """
    Determines the overall state of the DagRun based on the state
    of its TaskInstances.

    :return: State
    """
    dag = self.get_dag()

    tis = self.get_task_instances(session=session)
    self.log.debug("Updating state for %s considering %s task(s)", self, len(tis))

    for ti in list(tis):
        # skip in db?
        if ti.state == State.REMOVED:
            tis.remove(ti)
        else:
            ti.task = dag.get_task(ti.task_id)

    # pre-calculate
    # db is faster
    start_dttm = timezone.utcnow()
    unfinished_tasks = self.get_task_instances(state=State.unfinished(), session=session)
    none_depends_on_past = all(not t.task.depends_on_past for t in unfinished_tasks)
    none_task_concurrency = all(t.task.task_concurrency is None for t in unfinished_tasks)
    # small speed up
    if unfinished_tasks and none_depends_on_past and none_task_concurrency:
        # todo: this can actually get pretty slow: one task costs between 0.01-0.15s
        no_dependencies_met = True
        for ut in unfinished_tasks:
            # We need to flag upstream and check for changes because upstream
            # failures/re-schedules can result in deadlock false positives
            old_state = ut.state
            deps_met = ut.are_dependencies_met(
                dep_context=DepContext(
                    flag_upstream_failed=True,
                    ignore_in_retry_period=True,
                    ignore_in_reschedule_period=True),
                session=session)
            if deps_met or old_state != ut.current_state(session=session):
                no_dependencies_met = False
                break

    duration = (timezone.utcnow() - start_dttm).total_seconds() * 1000
    Stats.timing("dagrun.dependency-check.{}".format(self.dag_id), duration)

    root_ids = [t.task_id for t in dag.roots]
    roots = [t for t in tis if t.task_id in root_ids]

    # if all roots finished and at least one failed, the run failed
    if (not unfinished_tasks and
            any(r.state in (State.FAILED, State.UPSTREAM_FAILED) for r in roots)):
        self.log.info('Marking run %s failed', self)
        self.set_state(State.FAILED)
        dag.handle_callback(self, success=False, reason='task_failure', session=session)

    # if all roots succeeded and no unfinished tasks, the run succeeded
    elif not unfinished_tasks and all(
            r.state in (State.SUCCESS, State.SKIPPED) for r in roots):
        self.log.info('Marking run %s successful', self)
        self.set_state(State.SUCCESS)
        dag.handle_callback(self, success=True, reason='success', session=session)

    # if *all tasks* are deadlocked, the run failed
    elif (unfinished_tasks and none_depends_on_past and
            none_task_concurrency and no_dependencies_met):
        self.log.info('Deadlock; marking run %s failed', self)
        self.set_state(State.FAILED)
        dag.handle_callback(self, success=False, reason='all_tasks_deadlocked', session=session)

    # finally, if the roots aren't done, the dag is still running
    else:
        self.set_state(State.RUNNING)

    self._emit_duration_stats_for_finished_state()

    # todo: determine whether we want to use with_for_update to make sure to lock the run
    session.merge(self)
    session.commit()

    return self.state
def update_state(
    self, session: Session = None, execute_callbacks: bool = True
) -> Tuple[List[TI], Optional[callback_requests.DagCallbackRequest]]:
    """
    Determines the overall state of the DagRun based on the state
    of its TaskInstances.

    :param session: Sqlalchemy ORM Session
    :type session: Session
    :param execute_callbacks: Should dag callbacks (success/failure, SLA etc) be invoked
        directly (default: true) or recorded as a pending request in the ``callback`` property
    :type execute_callbacks: bool
    :return: Tuple containing tis that can be scheduled in the current loop & `callback` that
        needs to be executed
    """
    # Callback to execute in case of Task Failures
    callback: Optional[callback_requests.DagCallbackRequest] = None

    start_dttm = timezone.utcnow()
    self.last_scheduling_decision = start_dttm

    dag = self.get_dag()
    ready_tis: List[TI] = []
    tis = list(self.get_task_instances(
        session=session, state=State.task_states + (State.SHUTDOWN,)))
    self.log.debug("number of tis tasks for %s: %s task(s)", self, len(tis))
    for ti in tis:
        ti.task = dag.get_task(ti.task_id)

    unfinished_tasks = [t for t in tis if t.state in State.unfinished()]
    finished_tasks = [t for t in tis if t.state in State.finished() + [State.UPSTREAM_FAILED]]
    none_depends_on_past = all(not t.task.depends_on_past for t in unfinished_tasks)
    none_task_concurrency = all(t.task.task_concurrency is None for t in unfinished_tasks)
    if unfinished_tasks:
        scheduleable_tasks = [ut for ut in unfinished_tasks if ut.state in SCHEDULEABLE_STATES]
        self.log.debug("number of scheduleable tasks for %s: %s task(s)",
                       self, len(scheduleable_tasks))
        ready_tis, changed_tis = self._get_ready_tis(scheduleable_tasks, finished_tasks, session)
        self.log.debug("ready tis length for %s: %s task(s)", self, len(ready_tis))
        if none_depends_on_past and none_task_concurrency:
            # small speed up
            are_runnable_tasks = ready_tis or self._are_premature_tis(
                unfinished_tasks, finished_tasks, session) or changed_tis

    duration = (timezone.utcnow() - start_dttm)
    Stats.timing("dagrun.dependency-check.{}".format(self.dag_id), duration)

    leaf_task_ids = {t.task_id for t in dag.leaves}
    leaf_tis = [ti for ti in tis if ti.task_id in leaf_task_ids]

    # if all leaves finished and at least one failed, the run failed
    if not unfinished_tasks and any(
            leaf_ti.state in {State.FAILED, State.UPSTREAM_FAILED} for leaf_ti in leaf_tis):
        self.log.error('Marking run %s failed', self)
        self.set_state(State.FAILED)
        if execute_callbacks:
            dag.handle_callback(self, success=False, reason='task_failure', session=session)
        else:
            callback = callback_requests.DagCallbackRequest(
                full_filepath=dag.fileloc,
                dag_id=self.dag_id,
                execution_date=self.execution_date,
                is_failure_callback=True,
                msg='task_failure')

    # if all leaves succeeded and no unfinished tasks, the run succeeded
    elif not unfinished_tasks and all(
            leaf_ti.state in {State.SUCCESS, State.SKIPPED} for leaf_ti in leaf_tis):
        self.log.info('Marking run %s successful', self)
        self.set_state(State.SUCCESS)
        if execute_callbacks:
            dag.handle_callback(self, success=True, reason='success', session=session)
        else:
            callback = callback_requests.DagCallbackRequest(
                full_filepath=dag.fileloc,
                dag_id=self.dag_id,
                execution_date=self.execution_date,
                is_failure_callback=False,
                msg='success')

    # if *all tasks* are deadlocked, the run failed
    elif (unfinished_tasks and none_depends_on_past and
            none_task_concurrency and not are_runnable_tasks):
        self.log.error('Deadlock; marking run %s failed', self)
        self.set_state(State.FAILED)
        if execute_callbacks:
            dag.handle_callback(self, success=False, reason='all_tasks_deadlocked',
                                session=session)
        else:
            callback = callback_requests.DagCallbackRequest(
                full_filepath=dag.fileloc,
                dag_id=self.dag_id,
                execution_date=self.execution_date,
                is_failure_callback=True,
                msg='all_tasks_deadlocked')

    # finally, if the leaves aren't done, the dag is still running
    else:
        self.set_state(State.RUNNING)

    self._emit_duration_stats_for_finished_state()

    session.merge(self)

    return ready_tis, callback
def test_get_states_count_upstream_ti(self):
    """
    this test tests the helper function '_get_states_count_upstream_ti'
    as a unit and inside update_state
    """
    from airflow.ti_deps.dep_context import DepContext

    get_states_count_upstream_ti = TriggerRuleDep._get_states_count_upstream_ti
    session = settings.Session()
    now = timezone.utcnow()
    dag = DAG('test_dagrun_with_pre_tis',
              start_date=DEFAULT_DATE,
              default_args={'owner': 'owner1'})

    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op3 = DummyOperator(task_id='C')
        op4 = DummyOperator(task_id='D')
        op5 = DummyOperator(task_id='E', trigger_rule=TriggerRule.ONE_FAILED)

        op1.set_downstream([op2, op3])  # op1 >> op2, op3
        op4.set_upstream([op3, op2])  # op3, op2 >> op4
        op5.set_upstream([op2, op3, op4])  # (op2, op3, op4) >> op5

    dag.clear()
    dr = dag.create_dagrun(run_id='test_dagrun_with_pre_tis',
                           state=State.RUNNING,
                           execution_date=now,
                           start_date=now)

    ti_op1 = TaskInstance(task=dag.get_task(op1.task_id), execution_date=dr.execution_date)
    ti_op2 = TaskInstance(task=dag.get_task(op2.task_id), execution_date=dr.execution_date)
    ti_op3 = TaskInstance(task=dag.get_task(op3.task_id), execution_date=dr.execution_date)
    ti_op4 = TaskInstance(task=dag.get_task(op4.task_id), execution_date=dr.execution_date)
    ti_op5 = TaskInstance(task=dag.get_task(op5.task_id), execution_date=dr.execution_date)

    ti_op1.set_state(state=State.SUCCESS, session=session)
    ti_op2.set_state(state=State.FAILED, session=session)
    ti_op3.set_state(state=State.SUCCESS, session=session)
    ti_op4.set_state(state=State.SUCCESS, session=session)
    ti_op5.set_state(state=State.SUCCESS, session=session)

    # check handling with cases that tasks are triggered from backfill with no finished tasks
    finished_tasks = DepContext().ensure_finished_tasks(
        ti_op2.task.dag, ti_op2.execution_date, session)
    self.assertEqual(
        get_states_count_upstream_ti(finished_tasks=finished_tasks, ti=ti_op2),
        (1, 0, 0, 0, 1))
    finished_tasks = dr.get_task_instances(
        state=State.finished() + [State.UPSTREAM_FAILED], session=session)
    self.assertEqual(
        get_states_count_upstream_ti(finished_tasks=finished_tasks, ti=ti_op4),
        (1, 0, 1, 0, 2))
    self.assertEqual(
        get_states_count_upstream_ti(finished_tasks=finished_tasks, ti=ti_op5),
        (2, 0, 1, 0, 3))

    dr.update_state()
    self.assertEqual(State.SUCCESS, dr.state)