Example 1
    def unfinished(self):
        dag_id = self.sync_task["dag_id"]
        task_id = self.sync_task["task_id"]
        # fetch the latest successful run of the dependent (upstream) task
        dep_sql = f"select end_date from task_instance where dag_id='{dag_id}' and task_id='{task_id}' and state='success' order by end_date desc limit 1"
        dep_res = get_mysql_dataset(mysql_conn_id="airflow_emr",
                                    schema="airflow",
                                    sql=dep_sql)
        self.log.info(f'dependent task: {dep_sql} {dep_res}')
        # fetch the latest successful run of the sync task itself
        sql = f"select end_date from task_instance where dag_id='{self.sync_task_dag_id}' and task_id='{self.sync_task_id}' and state='success' order by end_date desc limit 1"
        res = get_mysql_dataset(mysql_conn_id="airflow_db",
                                schema="airflow",
                                sql=sql)
        self.log.info(f'recent task run: {sql} {res}')

        need_running = bool(
            dep_res
            and (not res or res[0]["end_date"] < dep_res[0]["end_date"]))
        if not need_running:
            return True

        # look at the sync task's instances executed since its last recorded success
        latest_success_time = res[0]["end_date"].strftime(
            "%Y-%m-%d %H:%M:%S") if res else '1970-01-01 00:00:00'
        task_state_sql = f"select state from task_instance where dag_id='{self.sync_task_dag_id}' and task_id='{self.sync_task_id}' and execution_date > '{latest_success_time}'"
        target_tasks = get_mysql_dataset(mysql_conn_id="airflow_db",
                                         schema="airflow",
                                         sql=task_state_sql)
        unfinished_tasks = [
            _ for _ in target_tasks if _["state"] in State.unfinished()
        ]

        # more than one unfinished record means another instance is still running
        return len(unfinished_tasks) > 1
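A note on the raw f-string SQL above: interpolating dag_id and task_id straight into the query is brittle and injection-prone. A minimal sketch of the same lookup as a parameterized query, assuming the project-local get_mysql_dataset helper forwards a parameters argument to the underlying hook (Airflow's MySqlHook.get_records does accept one):

# Sketch: parameterized variant of the dependent-task lookup.
# Assumes get_mysql_dataset forwards `parameters` to
# MySqlHook.get_records(sql, parameters) -- an assumption, since
# get_mysql_dataset is a project-local helper.
dep_sql = (
    "select end_date from task_instance "
    "where dag_id=%s and task_id=%s and state='success' "
    "order by end_date desc limit 1"
)
dep_res = get_mysql_dataset(mysql_conn_id="airflow_emr",
                            schema="airflow",
                            sql=dep_sql,
                            parameters=(dag_id, task_id))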
Example 2
    def _tasks_finished(self, dag_run):
        for ti in dag_run.get_task_instances():
            if ti.task_id in self.deferred_task_ids:
                continue

            if ti.state in State.unfinished():
                logging.info("Deferred tasks are not yet executable. Found "
                             "unfinished task `{}`.".format(ti.task_id))
                return False

        return True
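Every example here keys off State.unfinished(), the set of states a task instance may still move out of. A quick interactive check (Airflow 1.10.x; the listed values are roughly what that release returns):

from airflow.utils.state import State

# In Airflow 1.10.x this returns roughly:
# [None, 'scheduled', 'queued', 'running', 'shutdown',
#  'up_for_retry', 'up_for_reschedule']
print(State.unfinished())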
Example 3
    def poke(self, context):
        TI = TaskInstance
        session = Session()
        tis = session.query(TI)

        tis = tis.filter(TI.state.in_(State.unfinished()))
        tis = tis.filter(TI.dag_id.in_(self._get_target_dags()))
        if self.task_ids:
            tis = tis.filter(TI.task_id.in_(self.task_ids))
        if self.operator_ids:
            # tis = tis.filter(TI.operator.in_(self.operator_ids))
            pass  # TI.operator may be None, so the filter is applied in Python below

        tis = tis.all()
        tis = [ti for ti in tis if ti.key != context['ti'].key]  # exclude self
        tis = [
            ti for ti in tis
            if (ti.operator is None or self.operator_ids is None
                or ti.operator in self.operator_ids)
        ]

        if len(tis) == 0 and len(self.last_notifications) == 0:
            return datetime.now() >= context['ti'].start_date + self.start_wait

        now = datetime.now()
        start_midnight = datetime.combine(context['ti'].start_date,
                                          datetime.min.time())

        ti_keys = [(ti.dag_id, ti.task_id, ti.execution_date) for ti in tis]
        for ti, ti_key in zip(tis, ti_keys):
            if self.check_execution_time:
                start_date = ti.start_date
            else:
                start_date = start_midnight

            runtime = now - start_date
            if runtime >= self.notify_after:
                last_notification = self.last_notifications.get(ti_key)
                if (last_notification is None
                        or runtime >= last_notification + self.notify_delta):
                    self.last_notifications[ti_key] = runtime
                    self._send_notification(ti, ti_key, finished=False)

        # tis previously notified about but not found anymore -- finished
        ti_keys_to_delete = set(self.last_notifications) - set(ti_keys)
        for ti_key in ti_keys_to_delete:
            ti = self._get_task_instance(ti_key)
            if ti is not None:  # could be deleted from db (deleted via UI)
                self._send_notification(ti, ti_key, finished=True)
            del self.last_notifications[ti_key]

        # return len(self.last_notifications) == 0
        return True  # schedule regularly, always exit
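The cadence driven by notify_after and notify_delta above is easiest to see with concrete numbers; a small illustration (the timedelta values below are hypothetical, not from the original sensor):

from datetime import timedelta

# Hypothetical settings: the first notification fires once a task's
# runtime reaches notify_after, then again each time the runtime has
# grown by at least notify_delta since the last notification.
notify_after = timedelta(hours=1)
notify_delta = timedelta(minutes=30)
# A task instance running for 95 minutes would be reported at roughly
# the 60- and 90-minute marks, then once more with finished=True when
# it drops out of the unfinished set.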
Example 4
    def poke(self, context, session=None):
        ti = context['ti']
        tasks = (
            session.query(ErgoTask)
            .options(joinedload('job'))
            .filter_by(task_id=self.ergo_task_id, ti_dag_id=ti.dag_id)
            # TODO filter on execution date (otherwise there's no point)
        )
        tasks = list(tasks)
        self.log.info('Received %d results...', len(tasks))
        for task in tasks:
            job = task.job
            if job is None or task.state in State.unfinished():
                return False
        return True
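The TODO above notes that without an execution-date constraint the query matches ErgoTask rows from every run. A sketch of the missing filter, where ti_execution_date is a hypothetical column name (ErgoTask is a project-local model, so the real attribute may differ):

# Hypothetical: pin the query to the current run's execution date.
tasks = (
    session.query(ErgoTask)
    .options(joinedload('job'))
    .filter_by(task_id=self.ergo_task_id,
               ti_dag_id=ti.dag_id,
               ti_execution_date=ti.execution_date)  # assumed column
)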
Example 5
    def update_state(self, session=None):
        """
        Determines the overall state of the DagRun based on the state
        of its TaskInstances.

        :return: State
        """

        dag = self.get_dag()

        tis = self.get_task_instances(session=session)
        self.log.debug("Updating state for %s considering %s task(s)", self,
                       len(tis))

        for ti in list(tis):
            # todo: filter REMOVED tis out in the db query instead?
            if ti.state == State.REMOVED:
                tis.remove(ti)
            else:
                ti.task = dag.get_task(ti.task_id)

        # pre-calculate unfinished tasks (the db query is faster than filtering in Python)
        start_dttm = timezone.utcnow()
        unfinished_tasks = self.get_task_instances(state=State.unfinished(),
                                                   session=session)
        none_depends_on_past = all(not t.task.depends_on_past
                                   for t in unfinished_tasks)
        none_task_concurrency = all(t.task.task_concurrency is None
                                    for t in unfinished_tasks)
        # small speed up
        if unfinished_tasks and none_depends_on_past and none_task_concurrency:
            # todo: this can actually get pretty slow: one task costs between 0.01-0.15s
            no_dependencies_met = True
            for ut in unfinished_tasks:
                # We need to flag upstream and check for changes because upstream
                # failures/re-schedules can result in deadlock false positives
                old_state = ut.state
                deps_met = ut.are_dependencies_met(dep_context=DepContext(
                    flag_upstream_failed=True,
                    ignore_in_retry_period=True,
                    ignore_in_reschedule_period=True),
                                                   session=session)
                if deps_met or old_state != ut.current_state(session=session):
                    no_dependencies_met = False
                    break

        duration = (timezone.utcnow() - start_dttm).total_seconds() * 1000
        Stats.timing("dagrun.dependency-check.{}".format(self.dag_id),
                     duration)

        root_ids = [t.task_id for t in dag.roots]
        roots = [t for t in tis if t.task_id in root_ids]

        # if all roots finished and at least one failed, the run failed
        if (not unfinished_tasks
                and any(r.state in (State.FAILED, State.UPSTREAM_FAILED)
                        for r in roots)):
            self.log.info('Marking run %s failed', self)
            self.set_state(State.FAILED)
            dag.handle_callback(self,
                                success=False,
                                reason='task_failure',
                                session=session)

        # if all roots succeeded and no unfinished tasks, the run succeeded
        elif not unfinished_tasks and all(
                r.state in (State.SUCCESS, State.SKIPPED) for r in roots):
            self.log.info('Marking run %s successful', self)
            self.set_state(State.SUCCESS)
            dag.handle_callback(self,
                                success=True,
                                reason='success',
                                session=session)

        # if *all tasks* are deadlocked, the run failed
        elif (unfinished_tasks and none_depends_on_past
              and none_task_concurrency and no_dependencies_met):
            self.log.info('Deadlock; marking run %s failed', self)
            self.set_state(State.FAILED)
            dag.handle_callback(self,
                                success=False,
                                reason='all_tasks_deadlocked',
                                session=session)

        # finally, if the roots aren't done, the dag is still running
        else:
            self.set_state(State.RUNNING)

        self._emit_duration_stats_for_finished_state()

        # todo: determine whether we want to use with_for_update to make sure to lock the run
        session.merge(self)
        session.commit()

        return self.state
Example 6
    def update_state(self, session=None):
        """
        Determines the overall state of the DagRun based on the state
        of its TaskInstances.

        :return: ready_tis: the tis that can be scheduled in the current loop
        :rtype: list[airflow.models.TaskInstance]
        """

        dag = self.get_dag()
        ready_tis = []
        tis = list(
            self.get_task_instances(
                session=session, state=State.task_states + (State.SHUTDOWN, )))
        self.log.debug("number of tis tasks for %s: %s task(s)", self,
                       len(tis))
        for ti in tis:
            ti.task = dag.get_task(ti.task_id)

        start_dttm = timezone.utcnow()
        unfinished_tasks = [t for t in tis if t.state in State.unfinished()]
        finished_tasks = [
            t for t in tis
            if t.state in State.finished() + [State.UPSTREAM_FAILED]
        ]
        none_depends_on_past = all(not t.task.depends_on_past
                                   for t in unfinished_tasks)
        none_task_concurrency = all(t.task.task_concurrency is None
                                    for t in unfinished_tasks)
        if unfinished_tasks:
            scheduleable_tasks = [
                ut for ut in unfinished_tasks
                if ut.state in SCHEDULEABLE_STATES
            ]
            self.log.debug("number of scheduleable tasks for %s: %s task(s)",
                           self, len(scheduleable_tasks))
            ready_tis, changed_tis = self._get_ready_tis(
                scheduleable_tasks, finished_tasks, session)
            self.log.debug("ready tis length for %s: %s task(s)", self,
                           len(ready_tis))
            if none_depends_on_past and none_task_concurrency:
                # small speed up
                are_runnable_tasks = ready_tis or self._are_premature_tis(
                    unfinished_tasks, finished_tasks, session) or changed_tis

        duration = (timezone.utcnow() - start_dttm)
        Stats.timing("dagrun.dependency-check.{}".format(self.dag_id),
                     duration)

        leaf_task_ids = {t.task_id for t in dag.leaves}
        leaf_tis = [ti for ti in tis if ti.task_id in leaf_task_ids]

        # if all leaves finished and at least one failed, the run failed
        if not unfinished_tasks and any(
                leaf_ti.state in {State.FAILED, State.UPSTREAM_FAILED}
                for leaf_ti in leaf_tis):
            self.log.error('Marking run %s failed', self)
            self.set_state(State.FAILED)
            dag.handle_callback(self,
                                success=False,
                                reason='task_failure',
                                session=session)

        # if all leaves succeeded and no unfinished tasks, the run succeeded
        elif not unfinished_tasks and all(
                leaf_ti.state in {State.SUCCESS, State.SKIPPED}
                for leaf_ti in leaf_tis):
            self.log.info('Marking run %s successful', self)
            self.set_state(State.SUCCESS)
            dag.handle_callback(self,
                                success=True,
                                reason='success',
                                session=session)

        # if *all tasks* are deadlocked, the run failed
        elif (unfinished_tasks and none_depends_on_past
              and none_task_concurrency and not are_runnable_tasks):
            self.log.error('Deadlock; marking run %s failed', self)
            self.set_state(State.FAILED)
            dag.handle_callback(self,
                                success=False,
                                reason='all_tasks_deadlocked',
                                session=session)

        # finally, if the leaves aren't done, the dag is still running
        else:
            self.set_state(State.RUNNING)

        self._emit_duration_stats_for_finished_state()

        # todo: determine whether we want to use with_for_update to make sure to lock the run
        session.merge(self)
        session.commit()

        return ready_tis
Example 7
    def update_state(
        self,
        session: Session = None,
        execute_callbacks: bool = True
    ) -> Tuple[List[TI], Optional[callback_requests.DagCallbackRequest]]:
        """
        Determines the overall state of the DagRun based on the state
        of its TaskInstances.

        :param session: Sqlalchemy ORM Session
        :type session: Session
        :param execute_callbacks: Should dag callbacks (success/failure, SLA etc) be invoked
            directly (default: true) or recorded as a pending request in the ``callback`` property
        :type execute_callbacks: bool
        :return: Tuple containing tis that can be scheduled in the current loop & `callback` that
            needs to be executed
        """
        # Callback to execute in case of Task Failures
        callback: Optional[callback_requests.DagCallbackRequest] = None

        start_dttm = timezone.utcnow()
        self.last_scheduling_decision = start_dttm

        dag = self.get_dag()
        ready_tis: List[TI] = []
        tis = list(
            self.get_task_instances(session=session,
                                    state=State.task_states +
                                    (State.SHUTDOWN, )))
        self.log.debug("number of tis tasks for %s: %s task(s)", self,
                       len(tis))
        for ti in tis:
            ti.task = dag.get_task(ti.task_id)

        unfinished_tasks = [t for t in tis if t.state in State.unfinished()]
        finished_tasks = [
            t for t in tis
            if t.state in State.finished() + [State.UPSTREAM_FAILED]
        ]
        none_depends_on_past = all(not t.task.depends_on_past
                                   for t in unfinished_tasks)
        none_task_concurrency = all(t.task.task_concurrency is None
                                    for t in unfinished_tasks)
        if unfinished_tasks:
            scheduleable_tasks = [
                ut for ut in unfinished_tasks
                if ut.state in SCHEDULEABLE_STATES
            ]
            self.log.debug("number of scheduleable tasks for %s: %s task(s)",
                           self, len(scheduleable_tasks))
            ready_tis, changed_tis = self._get_ready_tis(
                scheduleable_tasks, finished_tasks, session)
            self.log.debug("ready tis length for %s: %s task(s)", self,
                           len(ready_tis))
            if none_depends_on_past and none_task_concurrency:
                # small speed up
                are_runnable_tasks = ready_tis or self._are_premature_tis(
                    unfinished_tasks, finished_tasks, session) or changed_tis

        duration = (timezone.utcnow() - start_dttm)
        Stats.timing("dagrun.dependency-check.{}".format(self.dag_id),
                     duration)

        leaf_task_ids = {t.task_id for t in dag.leaves}
        leaf_tis = [ti for ti in tis if ti.task_id in leaf_task_ids]

        # if all leaves finished and at least one failed, the run failed
        if not unfinished_tasks and any(
                leaf_ti.state in {State.FAILED, State.UPSTREAM_FAILED}
                for leaf_ti in leaf_tis):
            self.log.error('Marking run %s failed', self)
            self.set_state(State.FAILED)
            if execute_callbacks:
                dag.handle_callback(self,
                                    success=False,
                                    reason='task_failure',
                                    session=session)
            else:
                callback = callback_requests.DagCallbackRequest(
                    full_filepath=dag.fileloc,
                    dag_id=self.dag_id,
                    execution_date=self.execution_date,
                    is_failure_callback=True,
                    msg='task_failure')

        # if all leaves succeeded and no unfinished tasks, the run succeeded
        elif not unfinished_tasks and all(
                leaf_ti.state in {State.SUCCESS, State.SKIPPED}
                for leaf_ti in leaf_tis):
            self.log.info('Marking run %s successful', self)
            self.set_state(State.SUCCESS)
            if execute_callbacks:
                dag.handle_callback(self,
                                    success=True,
                                    reason='success',
                                    session=session)
            else:
                callback = callback_requests.DagCallbackRequest(
                    full_filepath=dag.fileloc,
                    dag_id=self.dag_id,
                    execution_date=self.execution_date,
                    is_failure_callback=False,
                    msg='success')

        # if *all tasks* are deadlocked, the run failed
        elif (unfinished_tasks and none_depends_on_past
              and none_task_concurrency and not are_runnable_tasks):
            self.log.error('Deadlock; marking run %s failed', self)
            self.set_state(State.FAILED)
            if execute_callbacks:
                dag.handle_callback(self,
                                    success=False,
                                    reason='all_tasks_deadlocked',
                                    session=session)
            else:
                callback = callback_requests.DagCallbackRequest(
                    full_filepath=dag.fileloc,
                    dag_id=self.dag_id,
                    execution_date=self.execution_date,
                    is_failure_callback=True,
                    msg='all_tasks_deadlocked')

        # finally, if the leaves aren't done, the dag is still running
        else:
            self.set_state(State.RUNNING)

        self._emit_duration_stats_for_finished_state()

        session.merge(self)

        return ready_tis, callback
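With execute_callbacks=False the caller has to dispatch the returned callback request itself. A rough sketch of the consuming side; dag_processor.send_callback_to_execute is a hypothetical stand-in for the scheduler's hand-off, while DagRun.schedule_tis is the method the released Airflow 2.0 scheduler uses for the ready tis:

# Sketch of a scheduler-side caller. The point is that user callback
# code runs outside the scheduling-critical path.
ready_tis, callback = dag_run.update_state(session=session,
                                           execute_callbacks=False)
if callback is not None:
    dag_processor.send_callback_to_execute(callback)  # hypothetical API
dag_run.schedule_tis(ready_tis, session=session)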
Example 8
    def _is_finished_wait_for_gapped_task(self, context):
        '''
        @return: bool - True when there is nothing left to wait for.
        '''
        if self._gapped_root_dag_run is None:
            self._init_gapped_root_dag_run(context['execution_date'])
            if self._gapped_root_dag_run is None:
                # The start time of the external gapped dag run is more recent
                # than the needed gap, so there is nothing to sense here.
                return True

        self.log.info(
            'Poking for the following '
            '{self._external_dag_id}.'
            '{self._external_task_id} on '
            '{self._gapped_root_dag_run.execution_date} ... '.format(**locals()))

        self._gapped_root_dag_run.refresh_from_db()
        is_finished_wait_for_gapped_task = True
        root_state = self._gapped_root_dag_run.get_state()
        if root_state == State.RUNNING:
            is_finished_wait_for_gapped_task = False
            self._refresh_gapped_dag_run()
            if self._gapped_dag_run is not None:
                gapped_dag_run_state = self._gapped_dag_run.get_state()
                if gapped_dag_run_state in State.unfinished():
                    external_task_instance = self._gapped_dag_run.get_task_instance(task_id=self._external_task_id)
                    if external_task_instance is None:
                        self.log.info(
                            'Still poking since the dag run has not finished and the gapped task instance has not started yet: '
                            'dag_id: {self._gapped_dag_run.dag_id} '
                            'run_id: {self._gapped_dag_run.run_id} '
                            'state: {gapped_dag_run_state} '.format(**locals()))
                    elif external_task_instance.state in State.unfinished():
                        self.log.info(
                            'Still poking since the gapped task instance has not finished: '
                            'dag_id: {self._gapped_dag_run.dag_id} '
                            'run_id: {self._gapped_dag_run.run_id} '
                            'start_date: {external_task_instance.start_date} '
                            'end_date: {external_task_instance.end_date} '
                            'task_state: {external_task_instance.state} '.format(**locals()))
                    else:
                        is_finished_wait_for_gapped_task = True
                        self.log.info(
                            'Finish poking since the gapped task instance has finished: '
                            'dag_id: {self._gapped_dag_run.dag_id} '
                            'run_id: {self._gapped_dag_run.run_id} '
                            'start_date: {external_task_instance.start_date} '
                            'end_date: {external_task_instance.end_date} '
                            'task_state: {external_task_instance.state} '.format(**locals()))
                else:
                    is_finished_wait_for_gapped_task = True
                    self.log.info(
                        'Finish poking since the gapped dag run is not running any more: '
                        'dag_id: {self._gapped_dag_run.dag_id} '
                        'run_id: {self._gapped_dag_run.run_id} '
                        'state: {gapped_dag_run_state} '.format(**locals()))
            else:
                self.log.info(
                    'Still poking since the root dag is still running and the gapped dag run does not exist: '
                    'dag_id: {self._gapped_root_dag_run.dag_id} '
                    'run_id: {self._gapped_root_dag_run.run_id} '
                    'state: {root_state} '.format(**locals()))
        else:
            self.log.info(
                'Finish poking since the root dag is not running any more: '
                'dag_id: {self._gapped_root_dag_run.dag_id} '
                'run_id: {self._gapped_root_dag_run.run_id} '
                'state: {root_state} '.format(**locals()))

        return is_finished_wait_for_gapped_task
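The .format(**locals()) pattern used throughout works because self sits in locals(), but it is easy to misread; the first log call above, rewritten as a plain f-string:

# Equivalent f-string form of the first log call.
self.log.info(
    f'Poking for the following {self._external_dag_id}.'
    f'{self._external_task_id} on '
    f'{self._gapped_root_dag_run.execution_date} ... ')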
Example 9
import logging
import subprocess

from airflow.models import DAG, DagRun, TaskInstance
from airflow.operators.python_operator import PythonOperator
from airflow.operators.subdag_operator import SubDagOperator
from airflow.utils.db import provide_session
from airflow.utils.state import State
from datetime import datetime, timedelta
import pendulum

unfinished_states = State.unfinished()
delta_from_max_date = timedelta(minutes=10)
delta_to_mark_as_stuck = timedelta(hours=12)


@provide_session
def find_dag_runs(first, dag_id=None, execution_date=None, state=None, session=None):
    query = session.query(DagRun)
    query = query if dag_id is None else query.filter(DagRun.dag_id == dag_id)
    query = query if execution_date is None else query.filter(DagRun.execution_date == execution_date)
    query = query if state is None else query.filter(DagRun.state == state)
    return query.first() if first else query.all()


@provide_session
def find_task_instances(first, task_id=None, dag_id=None, execution_date=None, state=None, operator=None, session=None):
    query = session.query(TaskInstance)
    query = query if task_id is None else query.filter(TaskInstance.task_id == task_id)
    query = query if dag_id is None else query.filter(TaskInstance.dag_id == dag_id)
    query = query if execution_date is None else query.filter(TaskInstance.execution_date == execution_date)
    query = query if state is None else query.filter(TaskInstance.state == state)
    query = query if operator is None else query.filter(TaskInstance.operator == operator)
    return query.first() if first else query.all()
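A short usage sketch for the two helpers above, tied back to unfinished_states (the dag id is hypothetical):

# Hypothetical usage: list the task instances of one dag that are
# still unfinished. provide_session injects the session argument.
candidates = find_task_instances(first=False, dag_id='my_etl_dag')
still_running = [ti for ti in candidates if ti.state in unfinished_states]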
Example 10
    def update_state(self, session=None):
        """
        Determines the overall state of the DagRun based on the state
        of its TaskInstances.

        :return: State
        """

        dag = self.get_dag()

        tis = self.get_task_instances(session=session)
        self.log.debug("Updating state for %s considering %s task(s)", self, len(tis))

        for ti in list(tis):
            # todo: filter REMOVED tis out in the db query instead?
            if ti.state == State.REMOVED:
                tis.remove(ti)
            else:
                ti.task = dag.get_task(ti.task_id)

        # pre-calculate unfinished tasks (the db query is faster than filtering in Python)
        start_dttm = timezone.utcnow()
        unfinished_tasks = self.get_task_instances(
            state=State.unfinished(),
            session=session
        )
        none_depends_on_past = all(not t.task.depends_on_past for t in unfinished_tasks)
        none_task_concurrency = all(t.task.task_concurrency is None
                                    for t in unfinished_tasks)
        # small speed up
        if unfinished_tasks and none_depends_on_past and none_task_concurrency:
            # todo: this can actually get pretty slow: one task costs between 0.01-0.15s
            no_dependencies_met = True
            for ut in unfinished_tasks:
                # We need to flag upstream and check for changes because upstream
                # failures/re-schedules can result in deadlock false positives
                old_state = ut.state
                deps_met = ut.are_dependencies_met(
                    dep_context=DepContext(
                        flag_upstream_failed=True,
                        ignore_in_retry_period=True,
                        ignore_in_reschedule_period=True),
                    session=session)
                if deps_met or old_state != ut.current_state(session=session):
                    no_dependencies_met = False
                    break

        duration = (timezone.utcnow() - start_dttm).total_seconds() * 1000
        Stats.timing("dagrun.dependency-check.{}".format(self.dag_id), duration)

        root_ids = [t.task_id for t in dag.roots]
        roots = [t for t in tis if t.task_id in root_ids]

        # if all roots finished and at least one failed, the run failed
        if (not unfinished_tasks and
                any(r.state in (State.FAILED, State.UPSTREAM_FAILED) for r in roots)):
            self.log.info('Marking run %s failed', self)
            self.set_state(State.FAILED)
            dag.handle_callback(self, success=False, reason='task_failure',
                                session=session)

        # if all roots succeeded and no unfinished tasks, the run succeeded
        elif not unfinished_tasks and all(r.state in (State.SUCCESS, State.SKIPPED)
                                          for r in roots):
            self.log.info('Marking run %s successful', self)
            self.set_state(State.SUCCESS)
            dag.handle_callback(self, success=True, reason='success', session=session)

        # if *all tasks* are deadlocked, the run failed
        elif (unfinished_tasks and none_depends_on_past and
              none_task_concurrency and no_dependencies_met):
            self.log.info('Deadlock; marking run %s failed', self)
            self.set_state(State.FAILED)
            dag.handle_callback(self, success=False, reason='all_tasks_deadlocked',
                                session=session)

        # finally, if the roots aren't done, the dag is still running
        else:
            self.set_state(State.RUNNING)

        self._emit_duration_stats_for_finished_state()

        # todo: determine whether we want to use with_for_update to make sure to lock the run
        session.merge(self)
        session.commit()

        return self.state