Esempio n. 1
0
    def test_backfill_examples(self):
        """
        Test backfilling example dags
        """

        # some DAGs really are just examples... but try to make them work!
        skip_dags = [
            'example_http_operator',
            'example_twitter_dag',
            'example_trigger_target_dag',
            'example_trigger_controller_dag',  # tested above
            'test_utils',  # sleeps forever
        ]

        logger = logging.getLogger('BackfillJobTest.test_backfill_examples')
        dags = [
            dag for dag in self.dagbag.dags.values()
            if 'example_dags' in dag.full_filepath
            and dag.dag_id not in skip_dags
            ]

        for dag in dags:
            dag.clear(
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            logger.info('*** Running example DAG #{}: {}'.format(i, dag.dag_id))
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True)
            job.run()
Esempio n. 2
0
    def test_backfill_integration(self):
        """
        Test that DaskExecutor can be used to backfill example dags
        """
        cluster = LocalCluster()

        dags = [
            dag for dag in self.dagbag.dags.values()
            if dag.dag_id in [
                'example_bash_operator',
                # 'example_python_operator',
            ]
        ]

        for dag in dags:
            dag.clear(
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True,
                executor=DaskExecutor(
                    cluster_address=cluster.scheduler_address))
            job.run()

        cluster.close()
Esempio n. 3
0
    def test_backfill_pooled_tasks(self):
        """
        Test that queued tasks are executed by BackfillJob

        Test for https://github.com/airbnb/airflow/pull/1225
        """
        session = settings.Session()
        pool = Pool(pool='test_backfill_pooled_task_pool', slots=1)
        session.add(pool)
        session.commit()

        dag = self.dagbag.get_dag('test_backfill_pooled_task_dag')
        dag.clear()

        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE)

        # run with timeout because this creates an infinite loop if not
        # caught
        with timeout(seconds=30):
            job.run()

        ti = TI(
            task=dag.get_task('test_backfill_pooled_task'),
            execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
Esempio n. 4
0
    def test_trigger_controller_dag(self):
        dag = self.dagbag.get_dag('example_trigger_controller_dag')
        target_dag = self.dagbag.get_dag('example_trigger_target_dag')
        dag.clear()
        target_dag.clear()

        scheduler = SchedulerJob()
        queue = mock.Mock()
        scheduler._process_task_instances(target_dag, queue=queue)
        self.assertFalse(queue.append.called)

        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            ignore_first_depends_on_past=True
        )
        job.run()

        scheduler = SchedulerJob()
        queue = mock.Mock()
        scheduler._process_task_instances(target_dag, queue=queue)

        self.assertTrue(queue.append.called)
        target_dag.clear()
        dag.clear()
Esempio n. 5
0
 def test_backfill_examples(self):
     dagbag = DagBag(
         dag_folder=DEV_NULL, include_examples=True)
     dags = [
         dag for dag in dagbag.dags.values()
         if dag.dag_id in ('example_bash_operator',)]
     for dag in dags:
         dag.clear(
             start_date=DEFAULT_DATE,
             end_date=DEFAULT_DATE)
     for dag in dags:
         job = BackfillJob(
             dag=dag,
             start_date=DEFAULT_DATE,
             end_date=DEFAULT_DATE)
         job.run()
Esempio n. 6
0
 def test_backfill_examples(self):
     """
     Test backfilling example dags
     """
     dags = [
         dag for dag in self.dagbag.dags.values()
         if dag.dag_id in ('example_bash_operator',)]
     for dag in dags:
         dag.clear(
             start_date=DEFAULT_DATE,
             end_date=DEFAULT_DATE)
     for dag in dags:
         job = BackfillJob(
             dag=dag,
             start_date=DEFAULT_DATE,
             end_date=DEFAULT_DATE)
         job.run()
Esempio n. 7
0
    def test_scheduler_start_date(self):
        """
        Test that the scheduler respects start_dates, even when DAGS have run
        """

        dag_id = 'test_start_date_scheduling'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        self.assertTrue(dag.start_date > DEFAULT_DATE)

        scheduler = SchedulerJob(dag_id,
                                 num_runs=2,
                                 **self.default_scheduler_args)
        scheduler.run()

        # zero tasks ran
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 0)

        # previously, running this backfill would kick off the Scheduler
        # because it would take the most recent run and start from there
        # That behavior still exists, but now it will only do so if after the
        # start date
        backfill = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE)
        backfill.run()

        # one task ran
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)

        scheduler = SchedulerJob(dag_id,
                                 num_runs=2,
                                 **self.default_scheduler_args)
        scheduler.run()

        # still one task
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)
Esempio n. 8
0
    def test_backfill_pool_not_found(self):
        dag = self._get_dummy_dag(
            dag_id='test_backfill_pool_not_found',
            pool='king_pool',
        )

        executor = TestExecutor()

        job = BackfillJob(
            dag=dag,
            executor=executor,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + datetime.timedelta(days=7),
        )

        try:
            job.run()
        except AirflowException:
            return

        self.fail()
Esempio n. 9
0
    def test_backfill_run_backwards(self):
        dag = self.dagbag.get_dag("test_start_date_scheduling")
        dag.clear()

        job = BackfillJob(dag=dag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE + datetime.timedelta(days=1),
                          run_backwards=True)
        job.run()

        session = settings.Session()
        tis = session.query(TI).filter(
            TI.dag_id == 'test_start_date_scheduling'
            and TI.task_id == 'dummy').order_by(TI.execution_date).all()

        queued_times = [ti.queued_dttm for ti in tis]
        self.assertTrue(queued_times == sorted(queued_times, reverse=True))
        self.assertTrue(all([ti.state == State.SUCCESS for ti in tis]))

        dag.clear()
        session.close()
Esempio n. 10
0
    def test_trigger_controller_dag(self):
        dag = self.dagbag.get_dag('example_trigger_controller_dag')
        target_dag = self.dagbag.get_dag('example_trigger_target_dag')
        target_dag.sync_to_db()

        scheduler = SchedulerJob()
        task_instances_list = Mock()
        scheduler._process_task_instances(
            target_dag, task_instances_list=task_instances_list)
        self.assertFalse(task_instances_list.append.called)

        job = BackfillJob(dag=dag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE,
                          ignore_first_depends_on_past=True)
        job.run()

        scheduler._process_task_instances(
            target_dag, task_instances_list=task_instances_list)

        self.assertTrue(task_instances_list.append.called)
Esempio n. 11
0
    def test_dag_run_with_finished_tasks_set_to_success(self):
        dag = self._get_dummy_dag('dummy_dag')

        dag_run = dag.create_dagrun(
            run_id='test',
            state=State.RUNNING,
        )

        for ti in dag_run.get_task_instances():
            ti.set_state(State.SUCCESS)

        job = BackfillJob(dag=dag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE + datetime.timedelta(days=8),
                          ignore_first_depends_on_past=True)

        job._set_unfinished_dag_runs_to_failed([dag_run])

        dag_run.refresh_from_db()

        self.assertEquals(State.SUCCESS, dag_run.state)
    def test_backfill_max_limit_check_complete_loop(self):
        dag = self._get_dag_test_max_active_limits(
            'test_backfill_max_limit_check_complete_loop')
        start_date = DEFAULT_DATE - datetime.timedelta(hours=1)
        end_date = DEFAULT_DATE

        # Given the max limit to be 1 in active dag runs, we need to run the
        # backfill job 3 times
        success_expected = 2
        executor = TestExecutor()
        job = BackfillJob(dag=dag,
                          start_date=start_date,
                          end_date=end_date,
                          executor=executor,
                          donot_pickle=True)
        job.run()

        success_dagruns = len(DagRun.find(dag_id=dag.dag_id, state=State.SUCCESS))
        running_dagruns = len(DagRun.find(dag_id=dag.dag_id, state=State.RUNNING))
        self.assertEqual(success_expected, success_dagruns)
        self.assertEqual(0, running_dagruns)  # no dag_runs in running state are left
Esempio n. 13
0
    def test_backfill_execute_subdag_with_removed_task(self):
        """
        Ensure that subdag operators execute properly in the case where
        an associated task of the subdag has been removed from the dag
        definition, but has instances in the database from previous runs.
        """
        dag = self.dagbag.get_dag('example_subdag_operator')
        subdag = dag.get_task('section-1').subdag

        executor = MockExecutor()
        job = BackfillJob(dag=subdag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE,
                          executor=executor,
                          donot_pickle=True)

        removed_task_ti = TI(task=DummyOperator(task_id='removed_task'),
                             execution_date=DEFAULT_DATE,
                             state=State.REMOVED)
        removed_task_ti.dag_id = subdag.dag_id

        session = settings.Session()
        session.merge(removed_task_ti)

        with timeout(seconds=30):
            job.run()

        for task in subdag.tasks:
            instance = session.query(TI).filter(
                TI.dag_id == subdag.dag_id, TI.task_id == task.task_id,
                TI.execution_date == DEFAULT_DATE).first()

            self.assertIsNotNone(instance)
            self.assertEqual(instance.state, State.SUCCESS)

        removed_task_ti.refresh_from_db()
        self.assertEqual(removed_task_ti.state, State.REMOVED)

        subdag.clear()
        dag.clear()
Esempio n. 14
0
    def test_scheduler_start_date(self):
        """
        Test that the scheduler respects start_dates, even when DAGS have run
        """

        dag_id = 'test_start_date_scheduling'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        self.assertTrue(dag.start_date > DEFAULT_DATE)

        scheduler = SchedulerJob(dag_id, num_runs=2)
        scheduler.run()

        # zero tasks ran
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 0)

        # previously, running this backfill would kick off the Scheduler
        # because it would take the most recent run and start from there
        # That behavior still exists, but now it will only do so if after the
        # start date
        backfill = BackfillJob(dag=dag,
                               start_date=DEFAULT_DATE,
                               end_date=DEFAULT_DATE)
        backfill.run()

        # one task ran
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)

        scheduler = SchedulerJob(dag_id, num_runs=2)
        scheduler.run()

        # still one task
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)
Esempio n. 15
0
    def test_backfill_depends_on_past(self):
        """
        Test that backfill respects ignore_depends_on_past
        """
        dag = self.dagbag.get_dag('test_depends_on_past')
        dag.clear()
        run_date = DEFAULT_DATE + datetime.timedelta(days=5)

        # backfill should deadlock
        self.assertRaisesRegexp(
            AirflowException, 'BackfillJob is deadlocked',
            BackfillJob(dag=dag, start_date=run_date, end_date=run_date).run)

        BackfillJob(dag=dag,
                    start_date=run_date,
                    end_date=run_date,
                    ignore_first_depends_on_past=True).run()

        # ti should have succeeded
        ti = TI(dag.tasks[0], run_date)
        ti.refresh_from_db()
        self.assertEquals(ti.state, State.SUCCESS)
Esempio n. 16
0
    def test_backfill_execute_subdag(self):
        dag = self.dagbag.get_dag('example_subdag_operator')
        subdag_op_task = dag.get_task('section-1')

        subdag = subdag_op_task.subdag
        subdag.schedule_interval = '@daily'

        start_date = timezone.utcnow()
        executor = MockExecutor()
        job = BackfillJob(dag=subdag,
                          start_date=start_date,
                          end_date=start_date,
                          executor=executor,
                          donot_pickle=True)
        job.run()

        subdag_op_task.pre_execute(context={'execution_date': start_date})
        subdag_op_task.execute(context={'execution_date': start_date})
        subdag_op_task.post_execute(context={'execution_date': start_date})

        history = executor.history
        subdag_history = history[0]

        # check that all 5 task instances of the subdag 'section-1' were executed
        self.assertEqual(5, len(subdag_history))
        for sdh in subdag_history:
            ti = sdh[3]
            self.assertIn('section-1-task-', ti.task_id)

        with create_session() as session:
            successful_subdag_runs = (session.query(DagRun).filter(
                DagRun.dag_id == subdag.dag_id).filter(
                    DagRun.execution_date == start_date).filter(
                        DagRun.state == State.SUCCESS).count())

            self.assertEqual(1, successful_subdag_runs)

        subdag.clear()
        dag.clear()
Esempio n. 17
0
    def test_backfill_examples(self, dag_id, expected_execution_order):
        """
        Test backfilling example dags

        Try to backfill some of the example dags. Be careful, not all dags are suitable
        for doing this. For example, a dag that sleeps forever, or does not have a
        schedule won't work here since you simply can't backfill them.
        """
        self.maxDiff = None
        dag = self.dagbag.get_dag(dag_id)

        logger.info('*** Running example DAG: %s', dag.dag_id)
        executor = MockExecutor()
        job = BackfillJob(dag=dag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE,
                          executor=executor,
                          ignore_first_depends_on_past=True)

        job.run()
        self.assertListEqual(
            [((dag_id, task_id, DEFAULT_DATE, 1), State.SUCCESS)
             for task_id in expected_execution_order], executor.sorted_tasks)
    def test_backfill_integration(self):
        """
        Test that DaskExecutor can be used to backfill example dags
        """
        dags = [
            dag for dag in self.dagbag.dags.values() if dag.dag_id in [
                'example_bash_operator',
                # 'example_python_operator',
            ]
        ]

        for dag in dags:
            dag.clear(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        for dag in sorted(dags, key=lambda d: d.dag_id):
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True,
                executor=DaskExecutor(
                    cluster_address=self.cluster.scheduler_address))
            job.run()
Esempio n. 19
0
    def test_backfill_multi_dates(self):
        dag = self.dagbag.get_dag('example_bash_operator')
        dag.clear()

        job = BackfillJob(dag=dag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE + datetime.timedelta(days=1),
                          ignore_first_depends_on_past=True)
        job.run()

        session = settings.Session()
        drs = session.query(DagRun).filter(
            DagRun.dag_id == 'example_bash_operator').order_by(
                DagRun.execution_date).all()

        self.assertTrue(drs[0].execution_date == DEFAULT_DATE)
        self.assertTrue(drs[0].state == State.SUCCESS)
        self.assertTrue(drs[1].execution_date == DEFAULT_DATE +
                        datetime.timedelta(days=1))
        self.assertTrue(drs[1].state == State.SUCCESS)

        dag.clear()
        session.close()
Esempio n. 20
0
    def test_backfill_multi_dates(self):
        dag = self.dagbag.get_dag('example_bash_operator')
        dag.clear()

        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE+datetime.timedelta(days=1),
            ignore_first_depends_on_past=True
        )
        job.run()

        session = settings.Session()
        drs = session.query(DagRun).filter(
            DagRun.dag_id=='example_bash_operator'
        ).order_by(DagRun.execution_date).all()

        self.assertTrue(drs[0].execution_date == DEFAULT_DATE)
        self.assertTrue(drs[0].state == State.SUCCESS)
        self.assertTrue(drs[1].execution_date == DEFAULT_DATE+datetime.timedelta(days=1))
        self.assertTrue(drs[1].state == State.SUCCESS)

        dag.clear()
        session.close()
Esempio n. 21
0
    def test_trigger_controller_dag(self):
        dag = self.dagbag.get_dag('example_trigger_controller_dag')
        target_dag = self.dagbag.get_dag('example_trigger_target_dag')
        dag.clear()
        target_dag.clear()

        scheduler = SchedulerJob()
        queue = mock.Mock()
        scheduler._process_task_instances(target_dag, queue=queue)
        self.assertFalse(queue.append.called)

        job = BackfillJob(dag=dag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE,
                          ignore_first_depends_on_past=True)
        job.run()

        scheduler = SchedulerJob()
        queue = mock.Mock()
        scheduler._process_task_instances(target_dag, queue=queue)

        self.assertTrue(queue.append.called)
        target_dag.clear()
        dag.clear()
Esempio n. 22
0
    def test_backfill_rerun_upstream_failed_tasks(self):
        dag = DAG(dag_id='test_backfill_rerun_upstream_failed',
                  start_date=DEFAULT_DATE,
                  schedule_interval='@daily')

        with dag:
            t1 = DummyOperator(
                task_id='test_backfill_rerun_upstream_failed_task-1', dag=dag)
            t2 = DummyOperator(
                task_id='test_backfill_rerun_upstream_failed_task-2', dag=dag)
            t1.set_upstream(t2)

        dag.clear()
        executor = MockExecutor()

        job = BackfillJob(
            dag=dag,
            executor=executor,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + datetime.timedelta(days=2),
        )
        job.run()

        ti = TI(
            task=dag.get_task('test_backfill_rerun_upstream_failed_task-1'),
            execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        ti.set_state(State.UPSTREAM_FAILED)

        job = BackfillJob(dag=dag,
                          executor=executor,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE + datetime.timedelta(days=2),
                          rerun_failed_tasks=True)
        job.run()
        ti = TI(
            task=dag.get_task('test_backfill_rerun_upstream_failed_task-1'),
            execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
    def test_backfill_run_rescheduled(self):
        dag = DAG(
            dag_id='test_backfill_run_rescheduled',
            start_date=DEFAULT_DATE,
            schedule_interval='@daily')

        with dag:
            DummyOperator(
                task_id='test_backfill_run_rescheduled_task-1',
                dag=dag,
            )

        dag.clear()

        executor = TestExecutor()

        job = BackfillJob(dag=dag,
                          executor=executor,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE + datetime.timedelta(days=2),
                          )
        job.run()

        ti = TI(task=dag.get_task('test_backfill_run_rescheduled_task-1'),
                execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        ti.set_state(State.UP_FOR_RESCHEDULE)

        job = BackfillJob(dag=dag,
                          executor=executor,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE + datetime.timedelta(days=2),
                          rerun_failed_tasks=True
                          )
        job.run()
        ti = TI(task=dag.get_task('test_backfill_run_rescheduled_task-1'),
                execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
Esempio n. 24
0
    def test_backfill_depends_on_past_backwards(self):
        """
        Test that CLI respects -B argument and raises on interaction with depends_on_past
        """
        dag_id = 'test_depends_on_past'
        start_date = DEFAULT_DATE + datetime.timedelta(days=1)
        end_date = start_date + datetime.timedelta(days=1)
        kwargs = dict(
            start_date=start_date,
            end_date=end_date,
        )
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        executor = MockExecutor()
        job = BackfillJob(dag=dag,
                          executor=executor,
                          ignore_first_depends_on_past=True,
                          **kwargs)
        job.run()

        ti = TI(dag.get_task('test_dop_task'), end_date)
        ti.refresh_from_db()
        # runs fine forwards
        self.assertEqual(ti.state, State.SUCCESS)

        # raises backwards
        expected_msg = 'You cannot backfill backwards because one or more tasks depend_on_past: {}'.format(
            'test_dop_task')
        with self.assertRaisesRegex(AirflowException, expected_msg):
            executor = MockExecutor()
            job = BackfillJob(dag=dag,
                              executor=executor,
                              run_backwards=True,
                              **kwargs)
            job.run()
    def test_backfill_rerun_failed_tasks_without_flag(self):
        dag = DAG(
            dag_id='test_backfill_rerun_failed',
            start_date=DEFAULT_DATE,
            schedule_interval='@daily')

        with dag:
            DummyOperator(
                task_id='test_backfill_rerun_failed_task-1',
                dag=dag)

        dag.clear()

        executor = TestExecutor()

        job = BackfillJob(dag=dag,
                          executor=executor,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE + datetime.timedelta(days=2),
                          )
        job.run()

        ti = TI(task=dag.get_task('test_backfill_rerun_failed_task-1'),
                execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        ti.set_state(State.FAILED)

        job = BackfillJob(dag=dag,
                          executor=executor,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE + datetime.timedelta(days=2),
                          rerun_failed_tasks=False
                          )

        with self.assertRaises(AirflowException):
            job.run()
Esempio n. 26
0
    def test_update_counters(self):
        dag = DAG(dag_id='test_manage_executor_state', start_date=DEFAULT_DATE)

        task1 = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

        job = BackfillJob(dag=dag)

        session = settings.Session()
        dr = dag.create_dagrun(run_id=DagRun.ID_PREFIX,
                               state=State.RUNNING,
                               execution_date=DEFAULT_DATE,
                               start_date=DEFAULT_DATE,
                               session=session)
        ti = TI(task1, dr.execution_date)
        ti.refresh_from_db()

        ti_status = BackfillJob._DagRunTaskStatus()

        # test for success
        ti.set_state(State.SUCCESS, session)
        ti_status.running[ti.key] = ti
        job._update_counters(ti_status=ti_status)
        self.assertTrue(len(ti_status.running) == 0)
        self.assertTrue(len(ti_status.succeeded) == 1)
        self.assertTrue(len(ti_status.skipped) == 0)
        self.assertTrue(len(ti_status.failed) == 0)
        self.assertTrue(len(ti_status.to_run) == 0)

        ti_status.succeeded.clear()

        # test for skipped
        ti.set_state(State.SKIPPED, session)
        ti_status.running[ti.key] = ti
        job._update_counters(ti_status=ti_status)
        self.assertTrue(len(ti_status.running) == 0)
        self.assertTrue(len(ti_status.succeeded) == 0)
        self.assertTrue(len(ti_status.skipped) == 1)
        self.assertTrue(len(ti_status.failed) == 0)
        self.assertTrue(len(ti_status.to_run) == 0)

        ti_status.skipped.clear()

        # test for failed
        ti.set_state(State.FAILED, session)
        ti_status.running[ti.key] = ti
        job._update_counters(ti_status=ti_status)
        self.assertTrue(len(ti_status.running) == 0)
        self.assertTrue(len(ti_status.succeeded) == 0)
        self.assertTrue(len(ti_status.skipped) == 0)
        self.assertTrue(len(ti_status.failed) == 1)
        self.assertTrue(len(ti_status.to_run) == 0)

        ti_status.failed.clear()

        # test for retry
        ti.set_state(State.UP_FOR_RETRY, session)
        ti_status.running[ti.key] = ti
        job._update_counters(ti_status=ti_status)
        self.assertTrue(len(ti_status.running) == 0)
        self.assertTrue(len(ti_status.succeeded) == 0)
        self.assertTrue(len(ti_status.skipped) == 0)
        self.assertTrue(len(ti_status.failed) == 0)
        self.assertTrue(len(ti_status.to_run) == 1)

        ti_status.to_run.clear()

        # test for reschedule
        ti.set_state(State.UP_FOR_RESCHEDULE, session)
        ti_status.running[ti.key] = ti
        job._update_counters(ti_status=ti_status)
        self.assertTrue(len(ti_status.running) == 0)
        self.assertTrue(len(ti_status.succeeded) == 0)
        self.assertTrue(len(ti_status.skipped) == 0)
        self.assertTrue(len(ti_status.failed) == 0)
        self.assertTrue(len(ti_status.to_run) == 1)

        ti_status.to_run.clear()

        # test for none
        ti.set_state(State.NONE, session)
        ti_status.running[ti.key] = ti
        job._update_counters(ti_status=ti_status)
        self.assertTrue(len(ti_status.running) == 0)
        self.assertTrue(len(ti_status.succeeded) == 0)
        self.assertTrue(len(ti_status.skipped) == 0)
        self.assertTrue(len(ti_status.failed) == 0)
        self.assertTrue(len(ti_status.to_run) == 1)

        ti_status.to_run.clear()

        session.close()
Esempio n. 27
0
    def test_backfill_fill_blanks(self):
        dag = DAG(
            'test_backfill_fill_blanks',
            start_date=DEFAULT_DATE,
            default_args={'owner': 'owner1'},
        )

        with dag:
            op1 = DummyOperator(task_id='op1')
            op2 = DummyOperator(task_id='op2')
            op3 = DummyOperator(task_id='op3')
            op4 = DummyOperator(task_id='op4')
            op5 = DummyOperator(task_id='op5')
            op6 = DummyOperator(task_id='op6')

        dag.clear()
        dr = dag.create_dagrun(run_id='test',
                               state=State.RUNNING,
                               execution_date=DEFAULT_DATE,
                               start_date=DEFAULT_DATE)
        executor = MockExecutor()

        session = settings.Session()

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id == op1.task_id:
                ti.state = State.UP_FOR_RETRY
                ti.end_date = DEFAULT_DATE
            elif ti.task_id == op2.task_id:
                ti.state = State.FAILED
            elif ti.task_id == op3.task_id:
                ti.state = State.SKIPPED
            elif ti.task_id == op4.task_id:
                ti.state = State.SCHEDULED
            elif ti.task_id == op5.task_id:
                ti.state = State.UPSTREAM_FAILED
            # op6 = None
            session.merge(ti)
        session.commit()
        session.close()

        job = BackfillJob(dag=dag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE,
                          executor=executor)
        self.assertRaisesRegex(AirflowException, 'Some task instances failed',
                               job.run)

        self.assertRaises(sqlalchemy.orm.exc.NoResultFound, dr.refresh_from_db)
        # the run_id should have changed, so a refresh won't work
        drs = DagRun.find(dag_id=dag.dag_id, execution_date=DEFAULT_DATE)
        dr = drs[0]

        self.assertEqual(dr.state, State.FAILED)

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id in (op1.task_id, op4.task_id, op6.task_id):
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == op2.task_id:
                self.assertEqual(ti.state, State.FAILED)
            elif ti.task_id == op3.task_id:
                self.assertEqual(ti.state, State.SKIPPED)
            elif ti.task_id == op5.task_id:
                self.assertEqual(ti.state, State.UPSTREAM_FAILED)
Esempio n. 28
0
    def test_backfill_with_no_pool_limit(self, mock_getint, mock_log):
        non_pooled_backfill_task_slot_count = 2

        def getint(section, key):
            if section.lower() == 'core' and \
                    'non_pooled_backfill_task_slot_count' == key.lower():
                return non_pooled_backfill_task_slot_count
            else:
                return configuration.conf.getint(section, key)

        mock_getint.side_effect = getint

        dag = self._get_dummy_dag('test_backfill_with_no_pool_limit')

        executor = TestExecutor()

        job = BackfillJob(
            dag=dag,
            executor=executor,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + datetime.timedelta(days=7),
        )

        job.run()

        self.assertTrue(0 < len(executor.history))

        non_pooled_task_slot_count_reached_at_least_once = False

        num_running_task_instances = 0

        # if no pool is specified, the number of tasks running in
        # parallel per backfill should be less than
        # non_pooled_backfill_task_slot_count at any point of time.
        for running_task_instances in executor.history:
            self.assertLessEqual(
                len(running_task_instances),
                non_pooled_backfill_task_slot_count,
            )
            num_running_task_instances += len(running_task_instances)
            if len(running_task_instances
                   ) == non_pooled_backfill_task_slot_count:
                non_pooled_task_slot_count_reached_at_least_once = True

        self.assertEquals(8, num_running_task_instances)
        self.assertTrue(non_pooled_task_slot_count_reached_at_least_once)

        times_dag_concurrency_limit_reached_in_debug = self._times_called_with(
            mock_log.debug,
            DagConcurrencyLimitReached,
        )

        times_pool_limit_reached_in_debug = self._times_called_with(
            mock_log.debug,
            NoAvailablePoolSlot,
        )

        times_task_concurrency_limit_reached_in_debug = self._times_called_with(
            mock_log.debug,
            TaskConcurrencyLimitReached,
        )

        self.assertEquals(0, times_dag_concurrency_limit_reached_in_debug)
        self.assertEquals(0, times_task_concurrency_limit_reached_in_debug)
        self.assertGreater(times_pool_limit_reached_in_debug, 0)
Esempio n. 29
0
    def _execute(self, session=None):
        """
        Initializes all components required to run a dag for a specified date range and
        calls helper method to execute the tasks.
        """
        # Trigger cleaning
        if self.airflow_config.clean_zombies_during_backfill:
            ClearZombieJob().run()

        ti_status = BackfillJob._DagRunTaskStatus()

        # picklin'
        pickle_id = self.dag.pickle_id
        # We don't need to pickle our dag again as it already pickled on job creattion
        # also this will save it into databand table, that have no use for the airflow
        # if not self.donot_pickle and self.executor.__class__ not in (
        #     executors.LocalExecutor,
        #     executors.SequentialExecutor,
        # ):
        #     pickle_id = airflow_pickle(self.dag, session=session)

        executor = self.executor
        executor.start()

        ti_status.total_runs = 1  # total dag runs in backfill

        dag_run = None
        try:
            dag_run = self._get_dag_run(session=session)

            # Create relation DagRun <> Job
            dag_run.conf = {"job_id": self.id}
            session.merge(dag_run)
            session.commit()

            run_date = dag_run.execution_date
            if dag_run is None:
                raise DatabandSystemError("Can't build dagrun")

            tis_map = self._task_instances_for_dag_run(dag_run, session=session)

            if not tis_map:
                raise DatabandSystemError("There are no task instances to run!")
            ti_status.active_runs.append(dag_run)
            ti_status.to_run.update(tis_map or {})

            processed_dag_run_dates = self._process_dag_task_instances(
                ti_status=ti_status,
                executor=executor,
                pickle_id=pickle_id,
                session=session,
            )
            ti_status.executed_dag_run_dates.update(processed_dag_run_dates)

            err = self._collect_errors(ti_status=ti_status, session=session)
            if err:
                raise DatabandRunError("Airflow executor has failed to run the run")

            if run_date not in ti_status.executed_dag_run_dates:
                self.log.warning(
                    "Dag %s is not marked as completed!  %s not found in %s",
                    self.dag_id,
                    run_date,
                    ti_status.executed_dag_run_dates,
                )
        finally:
            # in sequential executor a keyboard interrupt would reach here and
            # then executor.end() -> heartbeat() -> sync() will cause the queued commands
            # to be run again before exiting
            if hasattr(executor, "commands_to_run"):
                executor.commands_to_run = []
            try:
                executor.end()
            except Exception:
                logger.exception("Failed to terminate executor")
            if dag_run and dag_run.state == State.RUNNING:
                _kill_dag_run_zombi(dag_run, session)
            session.commit()

        self.log.info("Run is completed. Exiting.")