Example #1
    def test_next_retry_datetime(self):
        delay = datetime.timedelta(seconds=3)
        delay_squared = datetime.timedelta(seconds=9)
        max_delay = datetime.timedelta(seconds=10)

        dag = models.DAG(dag_id='fail_dag')
        task = BashOperator(task_id='task_with_exp_backoff_and_max_delay',
                            bash_command='exit 1',
                            retries=3,
                            retry_delay=delay,
                            retry_exponential_backoff=True,
                            max_retry_delay=max_delay,
                            dag=dag,
                            owner='airflow',
                            start_date=datetime.datetime(2016, 2, 1, 0, 0, 0))
        ti = TI(task=task, execution_date=datetime.datetime.now())
        ti.end_date = datetime.datetime.now()

        ti.try_number = 1
        dt = ti.next_retry_datetime()
        self.assertEqual(dt, ti.end_date + delay)

        ti.try_number = 2
        dt = ti.next_retry_datetime()
        self.assertEqual(dt, ti.end_date + delay_squared)

        ti.try_number = 3
        dt = ti.next_retry_datetime()
        self.assertEqual(dt, ti.end_date + max_delay)
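
The three assertions pin down the backoff model this older test expects: the retry delay in seconds is raised to the power of the try number, then clamped to max_retry_delay. A minimal sketch of that model (an illustration of what the test asserts, not Airflow's actual implementation, which later grew a jittered doubling scheme, see Example #16 below):

import datetime

def next_retry_delay(retry_delay, try_number, max_retry_delay):
    # Implied model: base delay (seconds) raised to the try number, capped.
    backoff = datetime.timedelta(seconds=retry_delay.total_seconds() ** try_number)
    return min(backoff, max_retry_delay)

delay = datetime.timedelta(seconds=3)
max_delay = datetime.timedelta(seconds=10)
assert next_retry_delay(delay, 1, max_delay) == datetime.timedelta(seconds=3)
assert next_retry_delay(delay, 2, max_delay) == datetime.timedelta(seconds=9)  # 3^2
assert next_retry_delay(delay, 3, max_delay) == max_delay  # 27s, clamped to 10s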
    def execute(self, context):
        # If the DAG Run is externally triggered, then return without
        # skipping downstream tasks
        if context['dag_run'].external_trigger:
            logging.info("""Externally triggered DAG_Run:
                         allowing execution to proceed.""")
            return

        now = datetime.datetime.now()
        left_window = context['dag'].following_schedule(
            context['execution_date'])
        right_window = context['dag'].following_schedule(left_window)
        logging.info(
            'Checking latest only with left_window: %s right_window: %s '
            'now: %s', left_window, right_window, now)
        if not left_window < now <= right_window:
            logging.info('Not latest execution, skipping downstream.')
            session = settings.Session()
            for task in context['task'].downstream_list:
                ti = TaskInstance(task,
                                  execution_date=context['ti'].execution_date)
                logging.info('Skipping task: %s', ti.task_id)
                ti.state = State.SKIPPED
                ti.start_date = now
                ti.end_date = now
                session.merge(ti)
            session.commit()
            session.close()
            logging.info('Done.')
        else:
            logging.info('Latest, allowing execution to proceed.')
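
The core of the operator is the window check: an execution counts as "latest" only while the wall clock sits inside the interval between the schedule point following execution_date and the schedule point after that. A self-contained illustration, using a hypothetical stand-in for dag.following_schedule() on an @daily schedule:

import datetime

def following_schedule(dt):
    # Hypothetical stand-in for dag.following_schedule() on an @daily DAG:
    # the schedule point following any timestamp is the next midnight.
    return datetime.datetime.combine(dt.date() + datetime.timedelta(days=1),
                                     datetime.time())

execution_date = datetime.datetime(2023, 5, 1)
left_window = following_schedule(execution_date)   # 2023-05-02 00:00
right_window = following_schedule(left_window)     # 2023-05-03 00:00

for now in (datetime.datetime(2023, 5, 2, 12, 0),   # inside the window: latest
            datetime.datetime(2023, 5, 4, 12, 0)):  # past the window: stale
    is_latest = left_window < now <= right_window
    print(now, 'latest' if is_latest else 'not latest, skip downstream')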
Example #3
    def kill_zombies(self, zombies, session=None):
        """
        Fail given zombie tasks, which are tasks that haven't
        had a heartbeat for too long, in the current DagBag.

        :param zombies: zombie task instances to kill.
        :type zombies: list[SimpleTaskInstance]
        :param session: DB session.
        :type session: sqlalchemy.orm.session.Session
        """
        for zombie in zombies:
            if zombie.dag_id in self.dags:
                dag = self.dags[zombie.dag_id]
                if zombie.task_id in dag.task_ids:
                    task = dag.get_task(zombie.task_id)
                    ti = TaskInstance(task, zombie.execution_date)
                    # Get properties needed for failure handling from SimpleTaskInstance.
                    ti.start_date = zombie.start_date
                    ti.end_date = zombie.end_date
                    ti.try_number = zombie.try_number
                    ti.state = zombie.state
                    ti.test_mode = configuration.getboolean(
                        'core', 'unit_test_mode')
                    ti.handle_failure("{} detected as zombie".format(ti),
                                      ti.test_mode, ti.get_template_context())
                    self.log.info('Marked zombie job %s as %s', ti, ti.state)
                    Stats.incr('zombies_killed')
        session.commit()
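
A hedged usage sketch: in Airflow itself the scheduler's file processor detects the zombies (task instances whose job stopped heartbeating) and hands them to kill_zombies; the empty list below only marks where they would come from:

from airflow import settings
from airflow.models import DagBag

dagbag = DagBag()             # parses DAG files into dagbag.dags
session = settings.Session()
zombies = []                  # normally SimpleTaskInstances detected upstream
dagbag.kill_zombies(zombies, session=session)
session.close()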
Example #4
    def test_next_retry_datetime(self):
        delay = datetime.timedelta(seconds=3)
        delay_squared = datetime.timedelta(seconds=9)
        max_delay = datetime.timedelta(seconds=10)

        dag = models.DAG(dag_id='fail_dag')
        task = BashOperator(
            task_id='task_with_exp_backoff_and_max_delay',
            bash_command='exit 1',
            retries=3,
            retry_delay=delay,
            retry_exponential_backoff=True,
            max_retry_delay=max_delay,
            dag=dag,
            owner='airflow',
            start_date=datetime.datetime(2016, 2, 1, 0, 0, 0))
        ti = TI(
            task=task, execution_date=datetime.datetime.now())
        ti.end_date = datetime.datetime.now()

        ti.try_number = 1
        dt = ti.next_retry_datetime()
        self.assertEqual(dt, ti.end_date + delay)

        ti.try_number = 2
        dt = ti.next_retry_datetime()
        self.assertEqual(dt, ti.end_date + delay_squared)

        ti.try_number = 3
        dt = ti.next_retry_datetime()
        self.assertEqual(dt, ti.end_date + max_delay)
 def _get_task_instance(self,
                        state,
                        end_date=None,
                        retry_delay=timedelta(minutes=15)):
     task = Mock(retry_delay=retry_delay, retry_exponential_backoff=False)
     ti = TaskInstance(task=task, state=state, execution_date=None)
     ti.end_date = end_date
     return ti
Example #6
 def test_set_duration(self):
     task = DummyOperator(task_id='op', email='*****@*****.**')
     ti = TI(
         task=task,
         execution_date=datetime.datetime.now(),
     )
     ti.start_date = datetime.datetime(2018, 10, 1, 1)
     ti.end_date = datetime.datetime(2018, 10, 1, 2)
     ti.set_duration()
     self.assertEqual(ti.duration, 3600)
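
The assertion implies that set_duration() stores the start-to-end span in seconds. The equivalent computation, as a sketch of what the test checks rather than the method body itself:

import datetime

start = datetime.datetime(2018, 10, 1, 1)
end = datetime.datetime(2018, 10, 1, 2)
duration = (end - start).total_seconds()  # one hour -> 3600.0 seconds
assert duration == 3600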
    def execute(self, context):
        # If the DAG Run is externally triggered, then return without
        # skipping downstream tasks
        if context['dag_run'] and context['dag_run'].external_trigger:
            logging.info("""Externally triggered DAG_Run:
                         allowing execution to proceed.""")
            return

        now = datetime.datetime.now()
        left_window = context['dag'].following_schedule(
            context['execution_date'])
        right_window = context['dag'].following_schedule(left_window)
        logging.info(
            'Checking latest only with left_window: %s right_window: %s '
            'now: %s', left_window, right_window, now)

        if not left_window < now <= right_window:
            logging.info('Not latest execution, skipping downstream.')
            downstream_task_ids = context['task'].downstream_task_ids
            if downstream_task_ids:
                session = settings.Session()
                TI = TaskInstance
                tis = session.query(TI).filter(
                    TI.execution_date == context['ti'].execution_date,
                    TI.task_id.in_(downstream_task_ids)
                ).with_for_update().all()

                for ti in tis:
                    logging.info('Skipping task: %s', ti.task_id)
                    ti.state = State.SKIPPED
                    ti.start_date = now
                    ti.end_date = now
                    session.merge(ti)

                # this is defensive against dag runs that are not complete;
                # compare against task_ids, not TaskInstance objects
                skipped_task_ids = {ti.task_id for ti in tis}
                for task in context['task'].downstream_list:
                    if task.task_id in skipped_task_ids:
                        continue

                    logging.warning("Task {} was not part of a dag run. "
                                    "This should not happen."
                                    .format(task))
                    now = datetime.datetime.now()
                    ti = TaskInstance(task, execution_date=context['ti'].execution_date)
                    ti.state = State.SKIPPED
                    ti.start_date = now
                    ti.end_date = now
                    session.merge(ti)

                session.commit()
                session.close()

            logging.info('Done.')
        else:
            logging.info('Latest, allowing execution to proceed.')
    def execute(self, context):
        # If the DAG Run is externally triggered, then return without
        # skipping downstream tasks
        if context['dag_run'] and context['dag_run'].external_trigger:
            logging.info("""Externally triggered DAG_Run:
                         allowing execution to proceed.""")
            return

        now = datetime.datetime.now()
        left_window = context['dag'].following_schedule(
            context['execution_date'])
        right_window = context['dag'].following_schedule(left_window)
        logging.info(
            'Checking latest only with left_window: %s right_window: %s '
            'now: %s', left_window, right_window, now)

        if not left_window < now <= right_window:
            logging.info('Not latest execution, skipping downstream.')
            session = settings.Session()

            TI = TaskInstance
            tis = session.query(TI).filter(
                TI.execution_date == context['ti'].execution_date,
                TI.task_id.in_(context['task'].downstream_task_ids)
            ).with_for_update().all()

            for ti in tis:
                logging.info('Skipping task: %s', ti.task_id)
                ti.state = State.SKIPPED
                ti.start_date = now
                ti.end_date = now
                session.merge(ti)

            # this is defensive against dag runs that are not complete;
            # compare against task_ids, not TaskInstance objects
            skipped_task_ids = {ti.task_id for ti in tis}
            for task in context['task'].downstream_list:
                if task.task_id in skipped_task_ids:
                    continue

                logging.warning("Task {} was not part of a dag run. "
                                "This should not happen."
                                .format(task))
                now = datetime.datetime.now()
                ti = TaskInstance(task, execution_date=context['ti'].execution_date)
                ti.state = State.SKIPPED
                ti.start_date = now
                ti.end_date = now
                session.merge(ti)

            session.commit()
            session.close()
            logging.info('Done.')
        else:
            logging.info('Latest, allowing execution to proceed.')
Example #10
 def execute(self, context):
     branch = super(BranchPythonOperator, self).execute(context)
     logging.info("Following branch " + branch)
     logging.info("Marking other directly downstream tasks as failed")
     session = settings.Session()
     for task in context['task'].downstream_list:
         if task.task_id != branch:
             ti = TaskInstance(task,
                               execution_date=context['ti'].execution_date)
             ti.state = State.SKIPPED
             ti.start_date = datetime.now()
             ti.end_date = datetime.now()
             session.merge(ti)
     session.commit()
     session.close()
     logging.info("Done.")
 def execute(self, context):
     branch = super(BranchPythonOperator, self).execute(context)
     logging.info("Following branch " + branch)
     logging.info("Marking other directly downstream tasks as skipped")
     session = settings.Session()
     for task in context['task'].downstream_list:
         if task.task_id != branch:
             ti = TaskInstance(
                 task, execution_date=context['ti'].execution_date)
             ti.state = State.SKIPPED
             ti.start_date = datetime.now()
             ti.end_date = datetime.now()
             session.merge(ti)
     session.commit()
     session.close()
     logging.info("Done.")
 def execute(self, context):
     condition = super(ShortCircuitOperator, self).execute(context)
     logging.info("Condition result is {}".format(condition))
     if condition:
         logging.info('Proceeding with downstream tasks...')
         return
     else:
         logging.info('Skipping downstream tasks...')
         session = settings.Session()
         for task in context['task'].downstream_list:
             ti = TaskInstance(
                 task, execution_date=context['ti'].execution_date)
             ti.state = State.SKIPPED
             ti.start_date = datetime.now()
             ti.end_date = datetime.now()
             session.merge(ti)
         session.commit()
         session.close()
         logging.info("Done.")
    def test_next_retry_datetime(self):
        delay = datetime.timedelta(seconds=30)
        max_delay = datetime.timedelta(minutes=60)

        dag = models.DAG(dag_id='fail_dag')
        task = BashOperator(
            task_id='task_with_exp_backoff_and_max_delay',
            bash_command='exit 1',
            retries=3,
            retry_delay=delay,
            retry_exponential_backoff=True,
            max_retry_delay=max_delay,
            dag=dag,
            owner='airflow',
            start_date=timezone.datetime(2016, 2, 1, 0, 0, 0))
        ti = TI(
            task=task, execution_date=DEFAULT_DATE)
        ti.end_date = pendulum.instance(timezone.utcnow())

        dt = ti.next_retry_datetime()
        # between 30 * 2^-1 and 30 * 2^0 (15 and 30)
        period = ti.end_date.add(seconds=30) - ti.end_date.add(seconds=15)
        self.assertTrue(dt in period)

        ti.try_number = 3
        dt = ti.next_retry_datetime()
        # between 30 * 2^2 and 30 * 2^3 (120 and 240)
        period = ti.end_date.add(seconds=240) - ti.end_date.add(seconds=120)
        self.assertTrue(dt in period)

        ti.try_number = 5
        dt = ti.next_retry_datetime()
        # between 30 * 2^4 and 30 * 2^5 (480 and 960)
        period = ti.end_date.add(seconds=960) - ti.end_date.add(seconds=480)
        self.assertTrue(dt in period)

        ti.try_number = 9
        dt = ti.next_retry_datetime()
        self.assertEqual(dt, ti.end_date + max_delay)

        ti.try_number = 50
        dt = ti.next_retry_datetime()
        self.assertEqual(dt, ti.end_date + max_delay)
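
The range assertions above lean on pendulum: subtracting one pendulum datetime from another yields a Period, and the in operator tests whether a datetime falls inside it. A minimal demonstration (assuming pendulum 2.x):

import pendulum

end = pendulum.datetime(2023, 1, 1)
period = end.add(seconds=30) - end.add(seconds=15)  # a pendulum Period
print(end.add(seconds=20) in period)  # True
print(end.add(seconds=40) in period)  # False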
Example #14
 def execute(self, context):
     condition = super(JollyShortCircuitOperator, self).execute(context)
     logging.info("Condition result is {}".format(condition))
     if condition:
         logging.info('Proceeding with downstream tasks...')
         return
     else:
         logging.info('Skipping downstream tasks...')
         session = settings.Session()
         for task in context['task'].downstream_list:
             ti = TaskInstance(task,
                               execution_date=context['ti'].execution_date)
             ti.state = State.SKIPPED
             ti.start_date = datetime.now()
             ti.end_date = datetime.now()
             session.merge(ti)
         session.commit()
         session.close()
         logging.info("Done.")
Example #16
    def test_next_retry_datetime(self):
        delay = datetime.timedelta(seconds=30)
        max_delay = datetime.timedelta(minutes=60)

        dag = models.DAG(dag_id='fail_dag')
        task = BashOperator(
            task_id='task_with_exp_backoff_and_max_delay',
            bash_command='exit 1',
            retries=3,
            retry_delay=delay,
            retry_exponential_backoff=True,
            max_retry_delay=max_delay,
            dag=dag,
            owner='airflow',
            start_date=datetime.datetime(2016, 2, 1, 0, 0, 0))
        ti = TI(
            task=task, execution_date=DEFAULT_DATE)
        ti.end_date = datetime.datetime.now()

        ti.try_number = 1
        dt = ti.next_retry_datetime()
        # between 30 * 2^-1 and 30 * 2^0 (15 and 30)
        self.assertEqual(dt, ti.end_date + datetime.timedelta(seconds=20.0))

        ti.try_number = 4
        dt = ti.next_retry_datetime()
        # between 30 * 2^2 and 30 * 2^3 (120 and 240)
        self.assertEqual(dt, ti.end_date + datetime.timedelta(seconds=181.0))

        ti.try_number = 6
        dt = ti.next_retry_datetime()
        # between 30 * 2^4 and 30 * 2^5 (480 and 960)
        self.assertEqual(dt, ti.end_date + datetime.timedelta(seconds=825.0))

        ti.try_number = 9
        dt = ti.next_retry_datetime()
        self.assertEqual(dt, ti.end_date + max_delay)

        ti.try_number = 50
        dt = ti.next_retry_datetime()
        self.assertEqual(dt, ti.end_date + max_delay)
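
Unlike the pendulum-based ranges earlier, this test pins exact offsets (20s, 181s, 825s), which only works if the jitter is deterministic: Airflow derives it from a hash of the task's identity and try number. A simplified sketch of that scheme (details approximate; see TaskInstance.next_retry_datetime for the real implementation):

import hashlib
import math
from datetime import timedelta

def jittered_backoff(retry_delay, try_number, max_retry_delay, identity):
    # The backoff floor doubles with each try; ceil() keeps short base
    # delays (e.g. 1s) from truncating to zero, and max(1, ...) guards
    # the modulo below against zero.
    min_backoff = max(1, int(math.ceil(
        retry_delay.total_seconds() * (2 ** (try_number - 2)))))
    # Deterministic jitter: a stable hash of the task identity, so the
    # same (task, try_number) pair always yields the same retry instant.
    digest = int(hashlib.sha1(identity.encode('utf-8')).hexdigest(), 16)
    backoff = timedelta(seconds=min_backoff + digest % min_backoff)
    return min(backoff, max_retry_delay)

identity = 'fail_dag#task_with_exp_backoff_and_max_delay#2016-02-01T00:00:00#1'
print(jittered_backoff(timedelta(seconds=30), 1, timedelta(minutes=60), identity))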
    def execute(self, context):
        condition = super(ShortCircuitOperator, self).execute(context)
        logging.info("Condition result is {}".format(condition))

        if condition:
            logging.info('Proceeding with downstream tasks...')
            return

        logging.info('Skipping downstream tasks...')
        session = settings.Session()

        TI = TaskInstance
        tis = session.query(TI).filter(
            TI.execution_date == context['ti'].execution_date,
            TI.task_id.in_(context['task'].downstream_task_ids),
        ).with_for_update().all()

        for ti in tis:
            logging.info('Skipping task: %s', ti.task_id)
            ti.state = State.SKIPPED
            ti.start_date = datetime.now()
            ti.end_date = datetime.now()

        # this is defensive against dag runs that are not complete;
        # compare against task_ids, not TaskInstance objects
        skipped_task_ids = {ti.task_id for ti in tis}
        for task in context['task'].downstream_list:
            if task.task_id in skipped_task_ids:
                continue

            logging.warning(
                "Task {} was not part of a dag run. This should not happen.".
                format(task))
            ti = TaskInstance(task,
                              execution_date=context['ti'].execution_date)
            ti.state = State.SKIPPED
            ti.start_date = datetime.now()
            ti.end_date = datetime.now()
            session.merge(ti)

        session.commit()
        session.close()
        logging.info("Done.")
Example #19
    def skip(self, dag_run, execution_date, tasks, session=None):
        """
        Set task instances from the same dag run to the SKIPPED state.

        :param dag_run: the DagRun for which to set the tasks to skipped
        :param execution_date: execution_date
        :param tasks: tasks to skip (not task_ids)
        :param session: db session to use
        """
        if not tasks:
            return

        task_ids = [d.task_id for d in tasks]
        now = timezone.utcnow()

        if dag_run:
            session.query(TaskInstance).filter(
                TaskInstance.dag_id == dag_run.dag_id,
                TaskInstance.execution_date == dag_run.execution_date,
                TaskInstance.task_id.in_(task_ids)).update(
                    {
                        TaskInstance.state: State.SKIPPED,
                        TaskInstance.start_date: now,
                        TaskInstance.end_date: now
                    },
                    synchronize_session=False)
            session.commit()
        else:
            assert execution_date is not None, "Execution date is None and no dag run"

            self.log.warning("No DAG RUN present this should not happen")
            # this is defensive against dag runs that are not complete
            for task in tasks:
                ti = TaskInstance(task, execution_date=execution_date)
                ti.state = State.SKIPPED
                ti.start_date = now
                ti.end_date = now
                session.merge(ti)

            session.commit()
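
A hedged usage sketch of calling skip() from inside an operator's execute(); dag_run, execution_date, and task are standard Airflow template-context keys, and the session handling mirrors the operators above:

from airflow import settings

def execute(self, context):
    downstream = context['task'].downstream_list   # tasks, not task_ids
    session = settings.Session()
    try:
        self.skip(context['dag_run'], context['execution_date'],
                  downstream, session=session)
    finally:
        session.close()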
    def execute(self, context):
        branch = super(BranchPythonOperator, self).execute(context)
        logging.info("Following branch " + branch)
        logging.info("Marking other directly downstream tasks as skipped")
        session = settings.Session()

        TI = TaskInstance
        tis = session.query(TI).filter(
            TI.execution_date == context['ti'].execution_date,
            TI.task_id.in_(context['task'].downstream_task_ids),
            TI.task_id != branch,
        ).with_for_update().all()

        for ti in tis:
            logging.info('Skipping task: %s', ti.task_id)
            ti.state = State.SKIPPED
            ti.start_date = datetime.now()
            ti.end_date = datetime.now()

        # this is defensive against dag runs that are not complete;
        # compare against task_ids, not TaskInstance objects
        skipped_task_ids = {ti.task_id for ti in tis}
        for task in context['task'].downstream_list:
            if task.task_id in skipped_task_ids:
                continue

            if task.task_id == branch:
                continue

            logging.warning(
                "Task {} was not part of a dag run. This should not happen.".
                format(task))
            ti = TaskInstance(task,
                              execution_date=context['ti'].execution_date)
            ti.state = State.SKIPPED
            ti.start_date = datetime.now()
            ti.end_date = datetime.now()
            session.merge(ti)

        session.commit()
        session.close()
        logging.info("Done.")
Example #21
    def execute(self, context):
        condition = super(ShortCircuitOperator, self).execute(context)
        logging.info("Condition result is {}".format(condition))

        if condition:
            logging.info('Proceeding with downstream tasks...')
            return

        logging.info('Skipping downstream tasks...')
        session = settings.Session()

        TI = TaskInstance
        tis = session.query(TI).filter(
            TI.execution_date == context['ti'].execution_date,
            TI.task_id.in_(context['task'].downstream_task_ids),
        ).with_for_update().all()

        for ti in tis:
            logging.info('Skipping task: %s', ti.task_id)
            ti.state = State.SKIPPED
            ti.start_date = datetime.now()
            ti.end_date = datetime.now()

        # this is defensive against dag runs that are not complete;
        # compare against task_ids, not TaskInstance objects
        skipped_task_ids = {ti.task_id for ti in tis}
        for task in context['task'].downstream_list:
            if task.task_id in skipped_task_ids:
                continue

            logging.warning("Task {} was not part of a dag run. This should not happen."
                            .format(task))
            ti = TaskInstance(task, execution_date=context['ti'].execution_date)
            ti.state = State.SKIPPED
            ti.start_date = datetime.now()
            ti.end_date = datetime.now()
            session.merge(ti)

        session.commit()
        session.close()
        logging.info("Done.")
Example #22
    def test_next_retry_datetime_short_intervals(self):
        delay = datetime.timedelta(seconds=1)
        max_delay = datetime.timedelta(minutes=60)

        dag = models.DAG(dag_id='fail_dag')
        task = BashOperator(
            task_id='task_with_exp_backoff_and_short_time_interval',
            bash_command='exit 1',
            retries=3,
            retry_delay=delay,
            retry_exponential_backoff=True,
            max_retry_delay=max_delay,
            dag=dag,
            owner='airflow',
            start_date=timezone.datetime(2016, 2, 1, 0, 0, 0))
        ti = TI(task=task, execution_date=DEFAULT_DATE)
        ti.end_date = pendulum.instance(timezone.utcnow())

        dt = ti.next_retry_datetime()
        # with a 1 second base delay, the first retry lands between 1 and 15 seconds out
        period = ti.end_date.add(seconds=15) - ti.end_date.add(seconds=1)
        self.assertTrue(dt in period)
Example #23
    def skip(self, dag_run, execution_date, tasks, session=None):
        """
        Set task instances from the same dag run to the SKIPPED state.

        :param dag_run: the DagRun for which to set the tasks to skipped
        :param execution_date: execution_date
        :param tasks: tasks to skip (not task_ids)
        :param session: db session to use
        """
        if not tasks:
            return

        task_ids = [d.task_id for d in tasks]
        now = timezone.utcnow()

        if dag_run:
            session.query(TaskInstance).filter(
                TaskInstance.dag_id == dag_run.dag_id,
                TaskInstance.execution_date == dag_run.execution_date,
                TaskInstance.task_id.in_(task_ids)
            ).update({TaskInstance.state: State.SKIPPED,
                      TaskInstance.start_date: now,
                      TaskInstance.end_date: now},
                     synchronize_session=False)
            session.commit()
        else:
            assert execution_date is not None, "Execution date is None and no dag run"

            self.log.warning("No DAG RUN present this should not happen")
            # this is defensive against dag runs that are not complete
            for task in tasks:
                ti = TaskInstance(task, execution_date=execution_date)
                ti.state = State.SKIPPED
                ti.start_date = now
                ti.end_date = now
                session.merge(ti)

            session.commit()
Example #24
    def execute(self, context):
        branch = super(BranchPythonOperator, self).execute(context)
        logging.info("Following branch " + branch)
        logging.info("Marking other directly downstream tasks as skipped")
        session = settings.Session()

        TI = TaskInstance
        tis = session.query(TI).filter(
            TI.execution_date == context['ti'].execution_date,
            TI.task_id.in_(context['task'].downstream_task_ids),
            TI.task_id != branch,
        ).with_for_update().all()

        for ti in tis:
            logging.info('Skipping task: %s', ti.task_id)
            ti.state = State.SKIPPED
            ti.start_date = datetime.now()
            ti.end_date = datetime.now()

        # this is defensive against dag runs that are not complete;
        # compare against task_ids, not TaskInstance objects
        skipped_task_ids = {ti.task_id for ti in tis}
        for task in context['task'].downstream_list:
            if task.task_id in skipped_task_ids:
                continue

            if task.task_id == branch:
                continue

            logging.warning("Task {} was not part of a dag run. This should not happen."
                            .format(task))
            ti = TaskInstance(task, execution_date=context['ti'].execution_date)
            ti.state = State.SKIPPED
            ti.start_date = datetime.now()
            ti.end_date = datetime.now()
            session.merge(ti)

        session.commit()
        session.close()
        logging.info("Done.")
Example #25
def test_extra_link_in_gantt_view(dag, viewer_client):
    exec_date = dates.days_ago(2)
    start_date = timezone.datetime(2020, 4, 10, 2, 0, 0)
    end_date = exec_date + datetime.timedelta(seconds=30)

    with create_session() as session:
        for task in dag.tasks:
            ti = TaskInstance(task=task,
                              execution_date=exec_date,
                              state="success")
            ti.start_date = start_date
            ti.end_date = end_date
            session.add(ti)

    url = f'gantt?dag_id={dag.dag_id}&execution_date={exec_date}'
    resp = viewer_client.get(url, follow_redirects=True)

    check_content_in_response('"extraLinks":', resp)

    extra_links_grps = re.search(r'extraLinks\": \[(\".*?\")\]',
                                 resp.get_data(as_text=True))
    extra_links = extra_links_grps.group(0)
    assert 'airflow' in extra_links
    assert 'github' in extra_links
Example #26
 def execute(self, context):
     now = datetime.datetime.now()
     left_window = context['dag'].following_schedule(
         context['execution_date'])
     right_window = context['dag'].following_schedule(left_window)
     logging.info(
         'Checking latest only with left_window: %s right_window: %s '
         'now: %s', left_window, right_window, now)
     if not left_window < now <= right_window:
         logging.info('Not latest execution, skipping downstream.')
         session = settings.Session()
         for task in context['task'].downstream_list:
             ti = TaskInstance(
                 task, execution_date=context['ti'].execution_date)
             logging.info('Skipping task: %s', ti.task_id)
             ti.state = State.SKIPPED
             ti.start_date = now
             ti.end_date = now
             session.merge(ti)
         session.commit()
         session.close()
         logging.info('Done.')
     else:
         logging.info('Latest, allowing execution to proceed.')
 def execute(self, context):
     now = datetime.datetime.now()
     left_window = context['dag'].following_schedule(
         context['execution_date'])
     right_window = context['dag'].following_schedule(left_window)
     _log.info(
         'Checking latest only with left_window: %s right_window: %s '
         'now: %s', left_window, right_window, now)
     if not left_window < now <= right_window:
         _log.info('Not latest execution, skipping downstream.')
         session = settings.Session()
         for task in context['task'].downstream_list:
             ti = TaskInstance(task,
                               execution_date=context['ti'].execution_date)
             _log.info('Skipping task: %s', ti.task_id)
             ti.state = State.SKIPPED
             ti.start_date = now
             ti.end_date = now
             session.merge(ti)
         session.commit()
         session.close()
         _log.info('Done.')
     else:
         _log.info('Latest, allowing execution to proceed.')
Example #28
def create_dagrun_from_dbnd_run(
    databand_run,
    dag,
    execution_date,
    run_id,
    state=State.RUNNING,
    external_trigger=False,
    conf=None,
    session=None,
):
    """
    Create new DagRun and all relevant TaskInstances
    """
    dagrun = (session.query(DagRun).filter(
        DagRun.dag_id == dag.dag_id,
        DagRun.execution_date == execution_date).first())
    if dagrun is None:
        dagrun = DagRun(
            run_id=run_id,
            execution_date=execution_date,
            start_date=dag.start_date,
            _state=state,
            external_trigger=external_trigger,
            dag_id=dag.dag_id,
            conf=conf,
        )
        session.add(dagrun)
    else:
        logger.warning("Running with existing airflow dag run %s", dagrun)

    dagrun.dag = dag
    dagrun.run_id = run_id
    session.commit()

    # create the associated task instances
    # state is None at the moment of creation

    # dagrun.verify_integrity(session=session)
    # fetches [TaskInstance] again
    # tasks_skipped = databand_run.tasks_skipped

    # we can find a source of the completion, but also,
    # sometimes we don't know the source of the "complete"
    TI = TaskInstance
    tis = (session.query(TI).filter(TI.dag_id == dag.dag_id,
                                    TI.execution_date == execution_date).all())
    tis = {ti.task_id: ti for ti in tis}

    for af_task in dag.tasks:
        ti = tis.get(af_task.task_id)
        if ti is None:
            ti = TaskInstance(af_task, execution_date=execution_date)
            ti.start_date = timezone.utcnow()
            ti.end_date = timezone.utcnow()
            session.add(ti)
        task_run = databand_run.get_task_run_by_af_id(af_task.task_id)
        # all tasks part of the backfill are scheduled to dagrun

        # Set log file path to expected airflow log file path
        task_run.log.local_log_file.path = ti.log_filepath.replace(
            ".log", "/{0}.log".format(ti.try_number))
        if task_run.is_reused:
            # this task is completed and we don't need to run it anymore
            ti.state = State.SUCCESS

    session.commit()

    return dagrun
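
A hypothetical call site; only the signature above is given by the source, so databand_run, dag, execution_date, and the run_id value are assumed to exist in scope:

from airflow.utils.db import create_session
from airflow.utils.state import State

with create_session() as session:
    dagrun = create_dagrun_from_dbnd_run(
        databand_run=databand_run,     # assumed: an existing databand run
        dag=dag,                       # the matching airflow DAG object
        execution_date=execution_date,
        run_id='dbnd__example_run',    # hypothetical run id
        state=State.RUNNING,
        session=session,
    )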
 def _get_task_instance(self, state, end_date=None,
                        retry_delay=timedelta(minutes=15)):
     task = Mock(retry_delay=retry_delay, retry_exponential_backoff=False)
     ti = TaskInstance(task=task, state=state, execution_date=None)
     ti.end_date = end_date
     return ti