def unfinished(self):
    """Report whether the sync task should be treated as unfinished.

    Reads the Airflow ``task_instance`` table twice: once for the latest
    successful run of the dependent task described by ``self.sync_task``
    (connection ``airflow_emr``) and once for the latest successful run of
    the sync task ``self.sync_task_dag_id`` / ``self.sync_task_id``
    (connection ``airflow_db``).

    :return: ``True`` when no new run is needed, i.e. the dependent task has
        no successful run, or the sync task's latest success is not older
        than the dependent task's latest success. Otherwise ``True`` only
        when more than one sync-task record whose ``execution_date`` is
        later than the sync task's latest success is in an unfinished state.
    :rtype: bool
    """
    dag_id = self.sync_task["dag_id"]
    task_id = self.sync_task["task_id"]

    # NOTE(review): the SQL below is built by string interpolation; the
    # identifiers presumably come from trusted operator configuration --
    # confirm before exposing them to external input.

    # latest successful run of the dependent task
    dep_sql = f"select end_date from task_instance where dag_id='{dag_id}' and task_id='{task_id}' and state='success' order by end_date desc limit 1"
    dep_res = get_mysql_dataset(mysql_conn_id="airflow_emr", schema="airflow", sql=dep_sql)
    # The separator used to be a mis-decoded full-width colon (mojibake).
    self.log.info(f'dependent task: {dep_sql} {dep_res}')

    # latest successful run of the sync task itself
    sql = f"select end_date from task_instance where dag_id='{self.sync_task_dag_id}' and task_id='{self.sync_task_id}' and state='success' order by end_date desc limit 1"
    res = get_mysql_dataset(mysql_conn_id="airflow_db", schema="airflow", sql=sql)
    self.log.info(f'recent task run: {sql} {res}')

    # A run is needed only if the dependent task succeeded and the sync task
    # either never succeeded or succeeded before that.
    need_running = bool(
        dep_res and (not res or res[0]["end_date"] < dep_res[0]["end_date"]))
    if not need_running:
        return True

    # Sync-task records created after its own latest success; the epoch is
    # used when the sync task has never succeeded.
    last_sync_success = (
        res[0]["end_date"].strftime("%Y-%m-%d %H:%M:%S")
        if res else '1970-01-01 00:00:00'
    )
    task_state_sql = f"select state from task_instance where dag_id='{self.sync_task_dag_id}' and task_id='{self.sync_task_id}' and execution_date > '{last_sync_success}'"
    target_tasks = get_mysql_dataset(mysql_conn_id="airflow_db", schema="airflow", sql=task_state_sql)

    unfinished_states = State.unfinished()  # evaluated once, not per row
    unfinished_tasks = [
        row for row in target_tasks if row["state"] in unfinished_states
    ]
    # if record unfinished > 1, means task is running
    return len(unfinished_tasks) > 1
def _tasks_finished(self, dag_run):
    """Tell whether every non-deferred task instance of ``dag_run`` is done.

    Task instances whose ``task_id`` is listed in ``self.deferred_task_ids``
    are ignored. The first remaining instance found in an unfinished state
    is logged and makes the method return ``False``.

    :param dag_run: object exposing ``get_task_instances()``
    :return: ``True`` if no non-deferred task instance is unfinished
    """
    # Lazy scan: stops at the first blocking task instance, like a loop
    # with an early return would.
    blocking = (
        candidate
        for candidate in dag_run.get_task_instances()
        if candidate.task_id not in self.deferred_task_ids
        and candidate.state in State.unfinished()
    )
    first_blocker = next(blocking, None)
    if first_blocker is None:
        return True

    logging.info("Deferred tasks are not yet executable. Found "
                 "unfinished task `{}`.".format(first_blocker.task_id))
    return False
def poke(self, context):
    """Notify about long-running task instances and about their completion.

    Collects unfinished task instances of the target DAGs (optionally
    restricted by ``self.task_ids`` and ``self.operator_ids``), excluding the
    sensor's own task instance. For each one whose runtime reached
    ``self.notify_after`` a "not finished" notification is sent, and repeated
    once the runtime has grown by ``self.notify_delta`` since the last one.
    Task instances that were notified about earlier but are no longer
    unfinished get a "finished" notification and are forgotten.

    :param context: task context; ``context['ti']`` is the sensor's own
        task instance
    :return: when there is nothing to watch and nothing was notified yet,
        whether ``self.start_wait`` has elapsed since the sensor started;
        otherwise always ``True`` (the sensor is meant to be scheduled
        regularly and exit every time)
    """
    TI = TaskInstance
    own_ti = context['ti']
    session = Session()
    try:
        query = session.query(TI)
        query = query.filter(TI.state.in_(State.unfinished()))
        query = query.filter(TI.dag_id.in_(self._get_target_dags()))
        if self.task_ids:
            query = query.filter(TI.task_id.in_(self.task_ids))

        # Operator filtering is done in Python rather than SQL because the
        # operator attribute might be None.
        own_key = own_ti.key
        tis = [
            ti for ti in query.all()
            if ti.key != own_key  # exclude self
            and (ti.operator is None
                 or self.operator_ids is None
                 or ti.operator in self.operator_ids)
        ]

        if not tis and not self.last_notifications:
            return datetime.now() >= own_ti.start_date + self.start_wait

        # NOTE(review): naive local time is used throughout; this assumes the
        # stored start dates are naive as well -- confirm.
        now = datetime.now()
        start_midnight = datetime.combine(own_ti.start_date,
                                          datetime.min.time())

        ti_keys = [(ti.dag_id, ti.task_id, ti.execution_date) for ti in tis]
        for ti, ti_key in zip(tis, ti_keys):
            start_date = (ti.start_date if self.check_execution_time
                          else start_midnight)
            if start_date is None:
                # Unfinished but not started yet: there is no runtime to
                # measure, and subtracting None would raise TypeError.
                continue
            runtime = now - start_date
            if runtime < self.notify_after:
                continue
            last_notification = self.last_notifications.get(ti_key)
            if (last_notification is None
                    or runtime >= last_notification + self.notify_delta):
                self.last_notifications[ti_key] = runtime
                self._send_notification(ti, ti_key, finished=False)

        # tis previously notified about but not found anymore -- finished
        for ti_key in set(self.last_notifications) - set(ti_keys):
            ti = self._get_task_instance(ti_key)
            if ti is not None:  # could be deleted from db (deleted via UI)
                self._send_notification(ti, ti_key, finished=True)
            del self.last_notifications[ti_key]

        return True  # schedule regularly, always exit
    finally:
        # The session used to be left open on every poke.
        session.close()
def poke(self, context, session=None):
    """Check whether all matching Ergo tasks have a job and are finished.

    Matches ``ErgoTask`` rows whose ``task_id`` equals ``self.ergo_task_id``
    and whose ``ti_dag_id`` equals the DAG id of ``context['ti']``.

    :param context: task context providing ``context['ti']``
    :param session: database session used for the query (presumably injected
        by a decorator outside this view -- confirm)
    :return: ``False`` if any matching task has no job or is in an unfinished
        state; ``True`` otherwise, including when nothing matches
    """
    current_ti = context['ti']
    query = session.query(ErgoTask).options(joinedload('job'))
    # TODO filter on execution date (otherwise there's no point)
    query = query.filter_by(task_id=self.ergo_task_id,
                            ti_dag_id=current_ti.dag_id)

    matches = list(query)
    self.log.info('Received %d results...', len(matches))

    still_pending = any(
        row.job is None or row.state in State.unfinished()
        for row in matches
    )
    return not still_pending
def update_state(self, session=None):
    """
    Determines the overall state of the DagRun based on the state
    of its TaskInstances.

    The run is marked FAILED when nothing is unfinished and a root task
    instance is FAILED/UPSTREAM_FAILED, SUCCESS when nothing is unfinished and
    every root task instance is SUCCESS/SKIPPED, FAILED when the unfinished
    task instances are considered deadlocked, and RUNNING otherwise. The DAG
    callback is invoked for the three terminal outcomes, and the run is merged
    into ``session`` and committed before returning.

    :param session: database session passed to every query and committed at
        the end
    :return: State
    """

    dag = self.get_dag()

    tis = self.get_task_instances(session=session)
    self.log.debug("Updating state for %s considering %s task(s)", self, len(tis))

    # Iterate over a copy because REMOVED entries are dropped from ``tis``
    # while looping; every remaining ti gets its task object attached.
    for ti in list(tis):
        # skip in db?
        if ti.state == State.REMOVED:
            tis.remove(ti)
        else:
            ti.task = dag.get_task(ti.task_id)

    # pre-calculate
    # db is faster
    start_dttm = timezone.utcnow()
    unfinished_tasks = self.get_task_instances(state=State.unfinished(), session=session)
    none_depends_on_past = all(not t.task.depends_on_past for t in unfinished_tasks)
    none_task_concurrency = all(t.task.task_concurrency is None for t in unfinished_tasks)
    # small speed up
    # ``no_dependencies_met`` is only bound inside this branch; the deadlock
    # ``elif`` below repeats the same guard before reading it.
    if unfinished_tasks and none_depends_on_past and none_task_concurrency:
        # todo: this can actually get pretty slow: one task costs between 0.01-015s
        no_dependencies_met = True
        for ut in unfinished_tasks:
            # We need to flag upstream and check for changes because upstream
            # failures/re-schedules can result in deadlock false positives
            old_state = ut.state
            deps_met = ut.are_dependencies_met(dep_context=DepContext(
                flag_upstream_failed=True,
                ignore_in_retry_period=True,
                ignore_in_reschedule_period=True),
                session=session)
            # One runnable (or state-changed) task is enough to rule out a
            # deadlock, so stop probing.
            if deps_met or old_state != ut.current_state(session=session):
                no_dependencies_met = False
                break

    # Elapsed time since ``start_dttm`` in milliseconds.
    duration = (timezone.utcnow() - start_dttm).total_seconds() * 1000
    Stats.timing("dagrun.dependency-check.{}".format(self.dag_id), duration)

    root_ids = [t.task_id for t in dag.roots]
    roots = [t for t in tis if t.task_id in root_ids]

    # if all roots finished and at least one failed, the run failed
    if (not unfinished_tasks and
            any(r.state in (State.FAILED, State.UPSTREAM_FAILED) for r in roots)):
        self.log.info('Marking run %s failed', self)
        self.set_state(State.FAILED)
        dag.handle_callback(self, success=False, reason='task_failure', session=session)

    # if all roots succeeded and no unfinished tasks, the run succeeded
    elif not unfinished_tasks and all(
            r.state in (State.SUCCESS, State.SKIPPED) for r in roots):
        self.log.info('Marking run %s successful', self)
        self.set_state(State.SUCCESS)
        dag.handle_callback(self, success=True, reason='success', session=session)

    # if *all tasks* are deadlocked, the run failed
    elif (unfinished_tasks and none_depends_on_past and
          none_task_concurrency and no_dependencies_met):
        self.log.info('Deadlock; marking run %s failed', self)
        self.set_state(State.FAILED)
        dag.handle_callback(self, success=False, reason='all_tasks_deadlocked',
                            session=session)

    # finally, if the roots aren't done, the dag is still running
    else:
        self.set_state(State.RUNNING)

    self._emit_duration_stats_for_finished_state()

    # todo: determine we want to use with_for_update to make sure to lock the run
    session.merge(self)
    session.commit()

    return self.state
def update_state(self, session=None):
    """
    Work out the overall DagRun state from the states of its TaskInstances
    and persist it through ``session``.

    :return: ready_tis: the tis that can be scheduled in the current loop
    :rtype ready_tis: list[airflow.models.TaskInstance]
    """

    dag = self.get_dag()
    ready_tis = []

    def _close_run(log_fn, message, final_state, succeeded, reason):
        # Shared tail of the three terminal outcomes: log, set state, callback.
        log_fn(message, self)
        self.set_state(final_state)
        dag.handle_callback(self, success=succeeded, reason=reason,
                            session=session)

    tis = list(self.get_task_instances(
        session=session, state=State.task_states + (State.SHUTDOWN, )))
    self.log.debug("number of tis tasks for %s: %s task(s)", self, len(tis))
    for task_instance in tis:
        task_instance.task = dag.get_task(task_instance.task_id)

    start_dttm = timezone.utcnow()
    unfinished_tasks = [
        candidate for candidate in tis
        if candidate.state in State.unfinished()
    ]
    finished_tasks = [
        candidate for candidate in tis
        if candidate.state in State.finished() + [State.UPSTREAM_FAILED]
    ]
    none_depends_on_past = not any(
        pending.task.depends_on_past for pending in unfinished_tasks)
    none_task_concurrency = not any(
        pending.task.task_concurrency is not None
        for pending in unfinished_tasks)

    if unfinished_tasks:
        scheduleable_tasks = [
            pending for pending in unfinished_tasks
            if pending.state in SCHEDULEABLE_STATES
        ]
        self.log.debug("number of scheduleable tasks for %s: %s task(s)",
                       self, len(scheduleable_tasks))
        ready_tis, changed_tis = self._get_ready_tis(
            scheduleable_tasks, finished_tasks, session)
        self.log.debug("ready tis length for %s: %s task(s)",
                       self, len(ready_tis))
        if none_depends_on_past and none_task_concurrency:
            # small speed up; this name is only bound here and the deadlock
            # branch below repeats the same guards before reading it
            are_runnable_tasks = (
                ready_tis
                or self._are_premature_tis(
                    unfinished_tasks, finished_tasks, session)
                or changed_tis
            )

    elapsed = timezone.utcnow() - start_dttm
    Stats.timing("dagrun.dependency-check.{}".format(self.dag_id), elapsed)

    leaf_task_ids = {leaf.task_id for leaf in dag.leaves}
    leaf_tis = [
        candidate for candidate in tis
        if candidate.task_id in leaf_task_ids
    ]
    nothing_pending = not unfinished_tasks

    # nothing unfinished and at least one leaf failed: the run failed
    if nothing_pending and any(
            leaf_ti.state in {State.FAILED, State.UPSTREAM_FAILED}
            for leaf_ti in leaf_tis):
        _close_run(self.log.error, 'Marking run %s failed',
                   State.FAILED, False, 'task_failure')

    # nothing unfinished and all leaves succeeded: the run succeeded
    elif nothing_pending and all(
            leaf_ti.state in {State.SUCCESS, State.SKIPPED}
            for leaf_ti in leaf_tis):
        _close_run(self.log.info, 'Marking run %s successful',
                   State.SUCCESS, True, 'success')

    # if *all tasks* are deadlocked, the run failed
    elif (unfinished_tasks and none_depends_on_past
          and none_task_concurrency and not are_runnable_tasks):
        _close_run(self.log.error, 'Deadlock; marking run %s failed',
                   State.FAILED, False, 'all_tasks_deadlocked')

    # otherwise the dag run is still running
    else:
        self.set_state(State.RUNNING)

    self._emit_duration_stats_for_finished_state()

    # todo: determine we want to use with_for_update to make sure to lock the run
    session.merge(self)
    session.commit()

    return ready_tis
def update_state(
    self, session: Session = None, execute_callbacks: bool = True
) -> Tuple[List[TI], Optional[callback_requests.DagCallbackRequest]]:
    """
    Work out the overall DagRun state from the states of its TaskInstances.

    :param session: Sqlalchemy ORM Session
    :type session: Session
    :param execute_callbacks: Should dag callbacks (success/failure, SLA etc)
        be invoked directly (default: true) or returned as a pending
        callback request
    :type execute_callbacks: bool
    :return: Tuple containing tis that can be scheduled in the current loop
        & `callback` that needs to be executed
    """
    # Callback to execute in case of Task Failures
    callback: Optional[callback_requests.DagCallbackRequest] = None

    start_dttm = timezone.utcnow()
    self.last_scheduling_decision = start_dttm

    dag = self.get_dag()
    ready_tis: List[TI] = []

    def _conclude(log_fn, message, final_state, succeeded, reason):
        # Shared tail of the terminal outcomes: log, set the state, then
        # either run the DAG callback now or hand back a request for it.
        log_fn(message, self)
        self.set_state(final_state)
        if execute_callbacks:
            dag.handle_callback(self, success=succeeded, reason=reason,
                                session=session)
            return None
        return callback_requests.DagCallbackRequest(
            full_filepath=dag.fileloc,
            dag_id=self.dag_id,
            execution_date=self.execution_date,
            is_failure_callback=not succeeded,
            msg=reason)

    tis = list(
        self.get_task_instances(session=session,
                                state=State.task_states + (State.SHUTDOWN, )))
    self.log.debug("number of tis tasks for %s: %s task(s)", self, len(tis))
    for task_instance in tis:
        task_instance.task = dag.get_task(task_instance.task_id)

    unfinished_tasks = [
        candidate for candidate in tis
        if candidate.state in State.unfinished()
    ]
    finished_tasks = [
        candidate for candidate in tis
        if candidate.state in State.finished() + [State.UPSTREAM_FAILED]
    ]
    none_depends_on_past = not any(
        pending.task.depends_on_past for pending in unfinished_tasks)
    none_task_concurrency = not any(
        pending.task.task_concurrency is not None
        for pending in unfinished_tasks)

    if unfinished_tasks:
        scheduleable_tasks = [
            pending for pending in unfinished_tasks
            if pending.state in SCHEDULEABLE_STATES
        ]
        self.log.debug("number of scheduleable tasks for %s: %s task(s)",
                       self, len(scheduleable_tasks))
        ready_tis, changed_tis = self._get_ready_tis(
            scheduleable_tasks, finished_tasks, session)
        self.log.debug("ready tis length for %s: %s task(s)",
                       self, len(ready_tis))
        if none_depends_on_past and none_task_concurrency:
            # small speed up; this name is only bound here and the deadlock
            # branch below repeats the same guards before reading it
            are_runnable_tasks = (
                ready_tis
                or self._are_premature_tis(
                    unfinished_tasks, finished_tasks, session)
                or changed_tis
            )

    elapsed = timezone.utcnow() - start_dttm
    Stats.timing("dagrun.dependency-check.{}".format(self.dag_id), elapsed)

    leaf_task_ids = {leaf.task_id for leaf in dag.leaves}
    leaf_tis = [
        candidate for candidate in tis
        if candidate.task_id in leaf_task_ids
    ]
    nothing_pending = not unfinished_tasks

    # nothing unfinished and at least one leaf failed: the run failed
    if nothing_pending and any(
            leaf_ti.state in {State.FAILED, State.UPSTREAM_FAILED}
            for leaf_ti in leaf_tis):
        callback = _conclude(self.log.error, 'Marking run %s failed',
                             State.FAILED, False, 'task_failure')

    # nothing unfinished and all leaves succeeded: the run succeeded
    elif nothing_pending and all(
            leaf_ti.state in {State.SUCCESS, State.SKIPPED}
            for leaf_ti in leaf_tis):
        callback = _conclude(self.log.info, 'Marking run %s successful',
                             State.SUCCESS, True, 'success')

    # if *all tasks* are deadlocked, the run failed
    elif (unfinished_tasks and none_depends_on_past
          and none_task_concurrency and not are_runnable_tasks):
        callback = _conclude(self.log.error,
                             'Deadlock; marking run %s failed',
                             State.FAILED, False, 'all_tasks_deadlocked')

    # otherwise the dag run is still running
    else:
        self.set_state(State.RUNNING)

    self._emit_duration_stats_for_finished_state()

    session.merge(self)

    return ready_tis, callback
def _is_finished_wait_for_gapped_task(self, context):
    '''
    Decide whether the sensor can stop waiting for the gapped external task.

    The gapped root dag run is initialised from ``context['execution_date']``
    on first use and refreshed from the database on every call.

    @param context: task context; only ``context['execution_date']`` is read.
    @return: bool - True when there is nothing (left) to wait for, i.e.
             no gapped root dag run exists, the root dag run is not running,
             the gapped dag run is no longer in an unfinished state, or the
             gapped task instance has finished. False when poking must
             continue.
    '''
    if self._gapped_root_dag_run is None:
        self._init_gapped_root_dag_run(context['execution_date'])
        if self._gapped_root_dag_run is None:
            # The start time of the external gapped dag run is more recent than the needed gap.
            # so there is nothing to sense here.
            return True

    # Format arguments are passed explicitly instead of via locals(), so the
    # messages no longer depend on local variable names.
    self.log.info(
        'Poking for the following '
        '{self._external_dag_id}.'
        '{self._external_task_id} on '
        '{self._gapped_root_dag_run.execution_date} ... '.format(self=self))

    self._gapped_root_dag_run.refresh_from_db()
    root_state = self._gapped_root_dag_run.get_state()
    if root_state != State.RUNNING:
        self.log.info(
            'Finish poking since the root dag is not running any more: '
            'dag_id: {self._gapped_root_dag_run.dag_id} '
            'run_id: {self._gapped_root_dag_run.run_id} '
            'state: {root_state} '.format(self=self, root_state=root_state))
        return True

    self._refresh_gapped_dag_run()
    if self._gapped_dag_run is None:
        self.log.info(
            'Still poking since the root dag is still running and the gapped dag run does not exist: '
            'dag_id: {self._gapped_root_dag_run.dag_id} '
            'run_id: {self._gapped_root_dag_run.run_id} '
            'state: {root_state} '.format(self=self, root_state=root_state))
        return False

    gapped_dag_run_state = self._gapped_dag_run.get_state()
    if gapped_dag_run_state not in State.unfinished():
        self.log.info(
            'Finish poking since the gapped dag run is not running any more: '
            'dag_id: {self._gapped_dag_run.dag_id} '
            'run_id: {self._gapped_dag_run.run_id} '
            'state: {gapped_dag_run_state} '.format(
                self=self, gapped_dag_run_state=gapped_dag_run_state))
        return True

    external_task_instance = self._gapped_dag_run.get_task_instance(
        task_id=self._external_task_id)
    if external_task_instance is None:
        self.log.info(
            'Still poking since the dag run has not finished and the gapped task instance still have not started: '
            'dag_id: {self._gapped_dag_run.dag_id} '
            'run_id: {self._gapped_dag_run.run_id} '
            'state: {gapped_dag_run_state} '.format(
                self=self, gapped_dag_run_state=gapped_dag_run_state))
        return False

    if external_task_instance.state in State.unfinished():
        self.log.info(
            'Still poking since the gapped task instance has not finished: '
            'dag_id: {self._gapped_dag_run.dag_id} '
            'run_id: {self._gapped_dag_run.run_id} '
            'start_date: {external_task_instance.start_date} '
            'end_date: {external_task_instance.end_date} '
            'task_state: {external_task_instance.state} '.format(
                self=self, external_task_instance=external_task_instance))
        return False

    self.log.info(
        'Finish poking since the gapped task instance has finished: '
        'dag_id: {self._gapped_dag_run.dag_id} '
        'run_id: {self._gapped_dag_run.run_id} '
        'start_date: {external_task_instance.start_date} '
        'end_date: {external_task_instance.end_date} '
        'task_state: {external_task_instance.state} '.format(
            self=self, external_task_instance=external_task_instance))
    return True
import logging
import subprocess

from airflow.models import DAG, DagRun, TaskInstance
from airflow.operators.python_operator import PythonOperator
from airflow.operators.subdag_operator import SubDagOperator
from airflow.utils.db import provide_session
from airflow.utils.state import State
from datetime import datetime, timedelta
import pendulum

# Unfinished task states, evaluated once when the module is imported.
unfinished_states = State.unfinished()
# NOTE(review): the two deltas below are not used in the visible part of the
# file; their names suggest a tolerance around a maximum date and a threshold
# for treating something as stuck -- confirm against the callers.
delta_from_max_date = timedelta(minutes=10)
delta_to_mark_as_stuck = timedelta(hours=12)


@provide_session
def find_dag_runs(first, dag_id=None, execution_date=None, state=None, session=None):
    """Query ``DagRun`` rows, filtering on every argument that is not None.

    :param first: when truthy return ``query.first()``, otherwise
        ``query.all()``
    :param dag_id: optional equality filter on ``DagRun.dag_id``
    :param execution_date: optional equality filter on
        ``DagRun.execution_date``
    :param state: optional equality filter on ``DagRun.state``
    :param session: database session the query runs on (presumably supplied
        by ``provide_session`` when omitted -- confirm)
    :return: the result of ``query.first()`` or ``query.all()``
    """
    query = session.query(DagRun)
    # Each filter is applied only when the matching argument was given.
    query = query if dag_id is None else query.filter(DagRun.dag_id == dag_id)
    query = query if execution_date is None else query.filter(DagRun.execution_date == execution_date)
    query = query if state is None else query.filter(DagRun.state == state)
    return query.first() if first else query.all()


@provide_session
def find_task_instances(first, task_id=None, dag_id=None, execution_date=None, state=None, operator=None, session=None):
    # NOTE(review): only the part of this function shown here is visible. It
    # filters ``TaskInstance`` rows on task_id, dag_id and execution_date when
    # they are not None. No use of ``first``, ``state`` or ``operator`` and no
    # return statement is visible, so the definition looks truncated --
    # confirm against the full file.
    query = session.query(TaskInstance)
    query = query if task_id is None else query.filter(TaskInstance.task_id == task_id)
    query = query if dag_id is None else query.filter(TaskInstance.dag_id == dag_id)
    query = query if execution_date is None else query.filter(TaskInstance.execution_date == execution_date)
def update_state(self, session=None):
    """
    Determines the overall state of the DagRun based on the state
    of its TaskInstances.

    Outcomes, checked in this order: FAILED when no task instance is
    unfinished and some root task instance is FAILED/UPSTREAM_FAILED; SUCCESS
    when no task instance is unfinished and all root task instances are
    SUCCESS/SKIPPED; FAILED when the unfinished task instances are judged
    deadlocked; RUNNING otherwise. The DAG callback is invoked for the first
    three outcomes. The run is merged into ``session`` and committed.

    :param session: database session used for all queries and for the final
        merge/commit
    :return: State
    """

    dag = self.get_dag()

    tis = self.get_task_instances(session=session)
    self.log.debug("Updating state for %s considering %s task(s)", self, len(tis))

    # A copy is iterated so that REMOVED entries can be dropped from ``tis``
    # in place; all other entries get their task object attached.
    for ti in list(tis):
        # skip in db?
        if ti.state == State.REMOVED:
            tis.remove(ti)
        else:
            ti.task = dag.get_task(ti.task_id)

    # pre-calculate
    # db is faster
    start_dttm = timezone.utcnow()
    unfinished_tasks = self.get_task_instances(
        state=State.unfinished(),
        session=session
    )
    none_depends_on_past = all(not t.task.depends_on_past for t in unfinished_tasks)
    none_task_concurrency = all(t.task.task_concurrency is None
                                for t in unfinished_tasks)
    # small speed up
    # ``no_dependencies_met`` is bound only in this branch; the deadlock
    # ``elif`` further down checks the same three conditions before using it.
    if unfinished_tasks and none_depends_on_past and none_task_concurrency:
        # todo: this can actually get pretty slow: one task costs between 0.01-015s
        no_dependencies_met = True
        for ut in unfinished_tasks:
            # We need to flag upstream and check for changes because upstream
            # failures/re-schedules can result in deadlock false positives
            old_state = ut.state
            deps_met = ut.are_dependencies_met(
                dep_context=DepContext(
                    flag_upstream_failed=True,
                    ignore_in_retry_period=True,
                    ignore_in_reschedule_period=True),
                session=session)
            # A single task that can run, or whose state just changed, means
            # the run is not deadlocked; no need to look further.
            if deps_met or old_state != ut.current_state(session=session):
                no_dependencies_met = False
                break

    # Time spent since ``start_dttm``, converted to milliseconds.
    duration = (timezone.utcnow() - start_dttm).total_seconds() * 1000
    Stats.timing("dagrun.dependency-check.{}".format(self.dag_id), duration)

    root_ids = [t.task_id for t in dag.roots]
    roots = [t for t in tis if t.task_id in root_ids]

    # if all roots finished and at least one failed, the run failed
    if (not unfinished_tasks and
            any(r.state in (State.FAILED, State.UPSTREAM_FAILED) for r in roots)):
        self.log.info('Marking run %s failed', self)
        self.set_state(State.FAILED)
        dag.handle_callback(self, success=False, reason='task_failure',
                            session=session)

    # if all roots succeeded and no unfinished tasks, the run succeeded
    elif not unfinished_tasks and all(r.state in (State.SUCCESS, State.SKIPPED)
                                      for r in roots):
        self.log.info('Marking run %s successful', self)
        self.set_state(State.SUCCESS)
        dag.handle_callback(self, success=True, reason='success', session=session)

    # if *all tasks* are deadlocked, the run failed
    elif (unfinished_tasks and none_depends_on_past and
          none_task_concurrency and no_dependencies_met):
        self.log.info('Deadlock; marking run %s failed', self)
        self.set_state(State.FAILED)
        dag.handle_callback(self, success=False, reason='all_tasks_deadlocked',
                            session=session)

    # finally, if the roots aren't done, the dag is still running
    else:
        self.set_state(State.RUNNING)

    self._emit_duration_stats_for_finished_state()

    # todo: determine we want to use with_for_update to make sure to lock the run
    session.merge(self)
    session.commit()

    return self.state