def test_backfill_multi_dates(self):
        """
        Backfill ``example_bash_operator`` over two consecutive days.

        Checks the (task, execution date) pairs reported by the mock executor
        and that the first two dag runs of the DAG, ordered by execution date,
        both ended in the SUCCESS state.
        """
        dag = self.dagbag.get_dag('example_bash_operator')

        end_date = DEFAULT_DATE + datetime.timedelta(days=1)

        executor = MockExecutor()
        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=end_date,
            executor=executor,
            ignore_first_depends_on_past=True
        )

        job.run()

        expected_execution_order = [
            ("runme_0", DEFAULT_DATE),
            ("runme_1", DEFAULT_DATE),
            ("runme_2", DEFAULT_DATE),
            ("runme_0", end_date),
            ("runme_1", end_date),
            ("runme_2", end_date),
            ("also_run_this", DEFAULT_DATE),
            ("also_run_this", end_date),
            ("run_after_loop", DEFAULT_DATE),
            ("run_after_loop", end_date),
            ("run_this_last", DEFAULT_DATE),
            ("run_this_last", end_date),
        ]
        self.assertListEqual(
            [((dag.dag_id, task_id, when, 1), State.SUCCESS)
             for (task_id, when) in expected_execution_order],
            executor.sorted_tasks
        )

        session = settings.Session()
        try:
            drs = session.query(DagRun).filter(
                DagRun.dag_id == dag.dag_id
            ).order_by(DagRun.execution_date).all()

            # assertEqual reports both operands on failure, which
            # assertTrue(a == b) does not.
            self.assertEqual(DEFAULT_DATE, drs[0].execution_date)
            self.assertEqual(State.SUCCESS, drs[0].state)
            self.assertEqual(end_date, drs[1].execution_date)
            self.assertEqual(State.SUCCESS, drs[1].state)
        finally:
            # Reset the DAG state and release the session even when one of the
            # assertions above fails.
            dag.clear()
            session.close()
Beispiel #2
0
    def test_backfill_respect_dag_concurrency_limit(self, mock_log):
        """
        Backfill a dummy DAG with ``concurrency = 2`` from DEFAULT_DATE to
        seven days later and check the executor history.

        No recorded batch may exceed the DAG concurrency, at least one batch
        must reach it, and eight task instances must have run in total. Of the
        three limit exceptions looked up in the debug log calls, only
        ``DagConcurrencyLimitReached`` is expected to appear.
        """
        dag = self._get_dummy_dag('test_backfill_respect_concurrency_limit')
        dag.concurrency = 2

        executor = MockExecutor()
        BackfillJob(
            dag=dag,
            executor=executor,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + datetime.timedelta(days=7),
        ).run()

        self.assertGreater(len(executor.history), 0)

        # Size of every batch of task instances the executor recorded.
        batch_sizes = [len(batch) for batch in executor.history]
        for size in batch_sizes:
            self.assertLessEqual(size, dag.concurrency)

        self.assertEqual(8, sum(batch_sizes))
        self.assertTrue(any(size == dag.concurrency for size in batch_sizes))

        def debug_hits(limit_error):
            # Delegates to the test-class helper with the mocked debug logger.
            return self._times_called_with(mock_log.debug, limit_error)

        dag_limit_hits = debug_hits(DagConcurrencyLimitReached)
        pool_limit_hits = debug_hits(NoAvailablePoolSlot)
        task_limit_hits = debug_hits(TaskConcurrencyLimitReached)

        self.assertEqual(0, pool_limit_hits)
        self.assertEqual(0, task_limit_hits)
        self.assertGreater(dag_limit_hits, 0)
Beispiel #3
0
    def test_backfill_conf(self):
        """
        Run a backfill with a ``conf`` dict and check that the first dag run
        returned by ``DagRun.find`` carries the same conf.
        """
        dag_id = 'test_backfill_conf'
        run_conf = json.loads("""{"key": "value"}""")
        dag = self._get_dummy_dag(dag_id)

        backfill = BackfillJob(
            dag=dag,
            executor=MockExecutor(),
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + datetime.timedelta(days=2),
            conf=run_conf,
        )
        backfill.run()

        found_runs = DagRun.find(dag_id=dag_id)
        self.assertEqual(run_conf, found_runs[0].conf)
Beispiel #4
0
    def test_sub_set_subdag(self):
        """
        Backfill a sub-DAG selected by the ``leave*`` regex, without upstream
        or downstream tasks.

        A dag run with run_id ``"test"`` is created first. After the backfill,
        refreshing that run must raise ``NoResultFound``, the run found for the
        same execution date must carry the backfill run id, and only the two
        ``leave`` tasks may be in the SUCCESS state.
        """
        dag = DAG('test_sub_set_subdag',
                  start_date=DEFAULT_DATE,
                  default_args={'owner': 'owner1'})

        with dag:
            leave1 = DummyOperator(task_id='leave1')
            leave2 = DummyOperator(task_id='leave2')
            level_1 = DummyOperator(task_id='upstream_level_1')
            level_2 = DummyOperator(task_id='upstream_level_2')
            level_3 = DummyOperator(task_id='upstream_level_3')
            # dependencies are deliberately declared out of order
            leave2.set_downstream(level_1)
            leave1.set_downstream(level_1)
            level_2.set_downstream(level_3)
            level_1.set_downstream(level_2)

        dag.clear()
        original_run = dag.create_dagrun(
            run_id="test",
            state=State.RUNNING,
            execution_date=DEFAULT_DATE,
            start_date=DEFAULT_DATE,
        )

        leaves_only = dag.sub_dag(
            task_regex="leave*",
            include_downstream=False,
            include_upstream=False,
        )
        BackfillJob(
            dag=leaves_only,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            executor=MockExecutor(),
        ).run()

        # the run_id should have changed, so a refresh won't work
        with self.assertRaises(sqlalchemy.orm.exc.NoResultFound):
            original_run.refresh_from_db()

        backfilled_run = DagRun.find(
            dag_id=dag.dag_id, execution_date=DEFAULT_DATE)[0]
        self.assertEqual(
            DagRun.generate_run_id(DagRunType.BACKFILL_JOB, DEFAULT_DATE),
            backfilled_run.run_id)

        leave_ids = ('leave1', 'leave2')
        for ti in backfilled_run.get_task_instances():
            expected_state = (
                State.SUCCESS if ti.task_id in leave_ids else State.NONE)
            self.assertEqual(expected_state, ti.state)
Beispiel #5
0
    def test_backfill_max_limit_check_within_limit(self):
        """
        Backfill a DAG built with ``max_active_runs=16`` over the hour ending
        at DEFAULT_DATE and check that exactly two dag runs exist and all of
        them succeeded.
        """
        dag = self._get_dag_test_max_active_limits(
            'test_backfill_max_limit_check_within_limit', max_active_runs=16)

        backfill = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE - datetime.timedelta(hours=1),
            end_date=DEFAULT_DATE,
            executor=MockExecutor(),
            donot_pickle=True,
        )
        backfill.run()

        found_runs = DagRun.find(dag_id=dag.dag_id)
        self.assertEqual(2, len(found_runs))

        not_successful = [
            run for run in found_runs if not run.state == State.SUCCESS]
        self.assertFalse(not_successful)
Beispiel #6
0
    def test_backfill_execute_subdag(self):
        """
        Backfill the subdag of the ``section-1`` task of
        ``example_subdag_operator`` for a single execution date.

        Afterwards the subdag operator's pre_execute/execute/post_execute hooks
        are invoked, the first executor batch is checked to hold five
        ``section-1-task-*`` instances, and exactly one successful dag run must
        exist for the subdag at that date.
        """
        dag = self.dagbag.get_dag('example_subdag_operator')
        subdag_op_task = dag.get_task('section-1')

        subdag = subdag_op_task.subdag
        subdag.schedule_interval = '@daily'

        execution_date = timezone.utcnow()
        executor = MockExecutor()
        BackfillJob(
            dag=subdag,
            start_date=execution_date,
            end_date=execution_date,
            executor=executor,
            donot_pickle=True,
        ).run()

        # Call the operator hooks in order, each with its own fresh context.
        for hook_name in ('pre_execute', 'execute', 'post_execute'):
            hook = getattr(subdag_op_task, hook_name)
            hook(context={'execution_date': execution_date})

        first_batch = executor.history[0]

        # check that all 5 task instances of the subdag 'section-1' were executed
        self.assertEqual(5, len(first_batch))
        for entry in first_batch:
            # index 3 of each history entry is the task instance
            self.assertIn('section-1-task-', entry[3].task_id)

        with create_session() as session:
            successful_runs = session.query(DagRun).filter(
                DagRun.dag_id == subdag.dag_id,
                DagRun.execution_date == execution_date,
                # pylint: disable=comparison-with-callable
                DagRun.state == State.SUCCESS,
            )

            self.assertEqual(1, successful_runs.count())

        subdag.clear()
        dag.clear()
Beispiel #7
0
    def test_backfill_execute_subdag_with_removed_task(self):
        """
        Ensure that subdag operators execute properly in the case where
        an associated task of the subdag has been removed from the dag
        definition, but has instances in the database from previous runs.

        A task instance in the REMOVED state is merged into the database for
        the subdag before the backfill runs. Afterwards every task of the
        subdag must have a SUCCESS instance, and the extra instance must still
        be REMOVED.
        """
        dag = self.dagbag.get_dag('example_subdag_operator')
        subdag = dag.get_task('section-1').subdag

        executor = MockExecutor()
        job = BackfillJob(dag=subdag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE,
                          executor=executor,
                          donot_pickle=True)

        removed_task_ti = TI(
            task=DummyOperator(task_id='removed_task'),
            execution_date=DEFAULT_DATE,
            state=State.REMOVED)
        removed_task_ti.dag_id = subdag.dag_id

        session = settings.Session()
        try:
            session.merge(removed_task_ti)
            session.commit()

            # Bound the run so a hanging backfill fails instead of blocking.
            with timeout(seconds=30):
                job.run()

            for task in subdag.tasks:
                instance = session.query(TI).filter(
                    TI.dag_id == subdag.dag_id,
                    TI.task_id == task.task_id,
                    TI.execution_date == DEFAULT_DATE).first()

                self.assertIsNotNone(instance)
                self.assertEqual(instance.state, State.SUCCESS)

            removed_task_ti.refresh_from_db()
            self.assertEqual(removed_task_ti.state, State.REMOVED)
        finally:
            # Release the session first, then reset DAG state, so cleanup also
            # happens when an assertion above fails.
            session.close()
            subdag.clear()
            dag.clear()
Beispiel #8
0
    def test_backfill_pool_not_found(self):
        """
        Check that backfilling a dummy DAG created with ``pool='king_pool'``
        raises ``AirflowException``.

        NOTE(review): presumably ``king_pool`` is never created, which is what
        the test name implies -- confirm against the fixtures.
        """
        dag = self._get_dummy_dag(
            dag_id='test_backfill_pool_not_found',
            pool='king_pool',
        )

        executor = MockExecutor()

        job = BackfillJob(
            dag=dag,
            executor=executor,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + datetime.timedelta(days=7),
        )

        # assertRaises names the missing exception on failure, whereas a bare
        # self.fail() carries no message.
        with self.assertRaises(AirflowException):
            job.run()
Beispiel #9
0
    def test_backfill_max_limit_check_complete_loop(self):
        """
        Backfill the max-active-limits test DAG over the hour ending at
        DEFAULT_DATE and check that two dag runs succeeded and none is left
        in the RUNNING state.
        """
        dag = self._get_dag_test_max_active_limits(
            'test_backfill_max_limit_check_complete_loop')

        # Given the max limit to be 1 in active dag runs, we need to run the
        # backfill job 3 times
        # NOTE(review): the job is only run once here and two successful runs
        # are expected -- confirm what "3 times" refers to.
        backfill = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE - datetime.timedelta(hours=1),
            end_date=DEFAULT_DATE,
            executor=MockExecutor(),
            donot_pickle=True,
        )
        backfill.run()

        def count_runs(state):
            # Number of dag runs of this DAG currently in the given state.
            return len(DagRun.find(dag_id=dag.dag_id, state=state))

        succeeded = count_runs(State.SUCCESS)
        still_running = count_runs(State.RUNNING)
        self.assertEqual(2, succeeded)
        self.assertEqual(0, still_running)  # no dag_runs in running state are left
    def test_backfill_run_backwards(self):
        """
        Backfill ``test_start_date_scheduling`` over two days with
        ``run_backwards=True``.

        The ``dummy`` task instances are fetched in ascending execution-date
        order. Their queue times must then be in descending order, and every
        instance must have succeeded.
        """
        dag = self.dagbag.get_dag("test_start_date_scheduling")
        dag.clear()

        job = BackfillJob(dag=dag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE + datetime.timedelta(days=1),
                          run_backwards=True)
        job.run()

        session = settings.Session()
        try:
            # Pass both criteria to filter() so they are ANDed in SQL. Joining
            # them with Python's `and` evaluates to a single expression, so one
            # of the two conditions never reaches the query.
            tis = session.query(TI).filter(
                TI.dag_id == 'test_start_date_scheduling',
                TI.task_id == 'dummy',
            ).order_by(TI.execution_date).all()

            queued_times = [ti.queued_dttm for ti in tis]
            self.assertEqual(sorted(queued_times, reverse=True), queued_times)
            self.assertTrue(all(ti.state == State.SUCCESS for ti in tis))
        finally:
            # Reset the DAG state and release the session even when an
            # assertion above fails.
            dag.clear()
            session.close()
    def test_trigger_controller_dag(self):
        """
        Check that backfilling ``example_trigger_controller_dag`` makes the
        DAG file processor collect task instances for
        ``example_trigger_target_dag``.

        Before the backfill, processing the target DAG must not append
        anything to the mocked task-instance list; after it, it must.
        """
        controller_dag = self.dagbag.get_dag('example_trigger_controller_dag')
        target_dag = self.dagbag.get_dag('example_trigger_target_dag')
        target_dag.sync_to_db()

        processor = DagFileProcessor(dag_ids=[], log=Mock())
        collected = Mock()

        def process_target():
            # Same call before and after the backfill; `collected` records
            # whether anything was appended.
            processor._process_task_instances(
                target_dag, task_instances_list=collected)

        process_target()
        self.assertFalse(collected.append.called)

        BackfillJob(
            dag=controller_dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            ignore_first_depends_on_past=True,
        ).run()

        process_target()
        self.assertTrue(collected.append.called)
Beispiel #12
0
    def test_backfill_rerun_upstream_failed_tasks(self):
        """
        Check that a backfill with ``rerun_failed_tasks=True`` brings a task
        instance that was set to UPSTREAM_FAILED back to SUCCESS.

        The DAG is backfilled once, the DEFAULT_DATE instance of the first
        task is set to UPSTREAM_FAILED, and the same date range is backfilled
        again with the flag set.
        """
        first_task_id = 'test_backfill_rerun_upstream_failed_task-1'
        backfill_window = {
            'start_date': DEFAULT_DATE,
            'end_date': DEFAULT_DATE + datetime.timedelta(days=2),
        }

        dag = DAG(dag_id='test_backfill_rerun_upstream_failed',
                  start_date=DEFAULT_DATE,
                  schedule_interval='@daily')

        with dag:
            downstream = DummyOperator(task_id=first_task_id, dag=dag)
            upstream = DummyOperator(
                task_id='test_backfill_rerun_upstream_failed_task-2', dag=dag)
            downstream.set_upstream(upstream)

        dag.clear()
        executor = MockExecutor()

        BackfillJob(dag=dag, executor=executor, **backfill_window).run()

        def load_first_ti():
            # Build a task instance for the first task at DEFAULT_DATE and
            # load its current state from the database.
            ti = TI(task=dag.get_task(first_task_id),
                    execution_date=DEFAULT_DATE)
            ti.refresh_from_db()
            return ti

        load_first_ti().set_state(State.UPSTREAM_FAILED)

        BackfillJob(dag=dag,
                    executor=executor,
                    rerun_failed_tasks=True,
                    **backfill_window).run()

        self.assertEqual(load_first_ti().state, State.SUCCESS)
    def test_backfill_examples(self, dag_id, expected_execution_order):
        """
        Backfill one example DAG for DEFAULT_DATE and compare the executor's
        sorted tasks with the expected task order.

        Not every example DAG is suitable: a DAG that sleeps forever or has no
        schedule cannot be backfilled.

        :param dag_id: id of the example DAG to load from the dagbag
        :param expected_execution_order: task ids in the order they are
            expected to appear in ``executor.sorted_tasks``
        """
        dag = self.dagbag.get_dag(dag_id)
        logger.info('*** Running example DAG: %s', dag.dag_id)

        executor = MockExecutor()
        BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            executor=executor,
            ignore_first_depends_on_past=True,
        ).run()

        expected_tasks = []
        for task_id in expected_execution_order:
            task_key = (dag_id, task_id, DEFAULT_DATE, 1)
            expected_tasks.append((task_key, State.SUCCESS))

        self.assertListEqual(expected_tasks, executor.sorted_tasks)
Beispiel #14
0
    def test_backfill_depends_on_past_backwards(self):
        """
        Check that ``test_depends_on_past`` can be backfilled forwards but that
        a backfill with ``run_backwards=True`` raises ``AirflowException``
        naming the depends_on_past task.
        """
        task_id = 'test_dop_task'
        start_date = DEFAULT_DATE + datetime.timedelta(days=1)
        end_date = start_date + datetime.timedelta(days=1)
        date_range = {'start_date': start_date, 'end_date': end_date}

        dag = self.dagbag.get_dag('test_depends_on_past')
        dag.clear()

        forward_job = BackfillJob(
            dag=dag,
            executor=MockExecutor(),
            ignore_first_depends_on_past=True,
            **date_range
        )
        forward_job.run()

        ti = TI(dag.get_task(task_id), end_date)
        ti.refresh_from_db()
        # runs fine forwards
        self.assertEqual(ti.state, State.SUCCESS)

        # raises backwards
        expected_msg = (
            'You cannot backfill backwards because one or more tasks depend_on_past: {}'
        ).format(task_id)
        with self.assertRaisesRegex(AirflowException, expected_msg):
            # Job construction stays inside the context so an error raised at
            # either step is matched.
            backward_job = BackfillJob(
                dag=dag,
                executor=MockExecutor(),
                run_backwards=True,
                **date_range
            )
            backward_job.run()
Beispiel #15
0
    def test_backfill_rerun_failed_tasks_without_flag(self):
        """
        Check that re-running a backfill with ``rerun_failed_tasks=False``
        raises ``AirflowException`` when a task instance in the range has been
        set to FAILED.
        """
        task_id = 'test_backfill_rerun_failed_task-1'
        backfill_window = {
            'start_date': DEFAULT_DATE,
            'end_date': DEFAULT_DATE + datetime.timedelta(days=2),
        }

        dag = DAG(
            dag_id='test_backfill_rerun_failed',
            start_date=DEFAULT_DATE,
            schedule_interval='@daily')

        with dag:
            DummyOperator(task_id=task_id, dag=dag)

        dag.clear()

        executor = MockExecutor()

        BackfillJob(dag=dag, executor=executor, **backfill_window).run()

        # Mark the DEFAULT_DATE instance as failed after the first backfill.
        failed_ti = TI(task=dag.get_task(task_id), execution_date=DEFAULT_DATE)
        failed_ti.refresh_from_db()
        failed_ti.set_state(State.FAILED)

        rerun = BackfillJob(dag=dag,
                            executor=executor,
                            rerun_failed_tasks=False,
                            **backfill_window)

        with self.assertRaises(AirflowException):
            rerun.run()