Esempio n. 1
0
    def test_backfill_integration(self):
        """
        Test that DaskExecutor can be used to backfill example dags
        """
        cluster = LocalCluster()

        dags = [
            dag for dag in self.dagbag.dags.values()
            if dag.dag_id in [
                'example_bash_operator',
                # 'example_python_operator',
            ]
        ]

        for dag in dags:
            dag.clear(
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True,
                executor=DaskExecutor(
                    cluster_address=cluster.scheduler_address))
            job.run()

        cluster.close()
Esempio n. 2
0
    def test_backfill_pooled_tasks(self):
        """
        Test that queued tasks are executed by BackfillJob

        Test for https://github.com/airbnb/airflow/pull/1225
        """
        session = settings.Session()
        pool = Pool(pool='test_backfill_pooled_task_pool', slots=1)
        session.add(pool)
        session.commit()

        dag = self.dagbag.get_dag('test_backfill_pooled_task_dag')
        dag.clear()

        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE)

        # run with timeout because this creates an infinite loop if not
        # caught
        with timeout(seconds=30):
            job.run()

        ti = TI(
            task=dag.get_task('test_backfill_pooled_task'),
            execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
Esempio n. 3
0
    def test_backfill_execute_subdag(self):
        dag = self.dagbag.get_dag('example_subdag_operator')
        subdag_op_task = dag.get_task('section-1')

        subdag = subdag_op_task.subdag
        subdag.schedule_interval = '@daily'

        start_date = timezone.utcnow()
        executor = TestExecutor()
        job = BackfillJob(dag=subdag,
                          start_date=start_date,
                          end_date=start_date,
                          executor=executor,
                          donot_pickle=True)
        job.run()

        history = executor.history
        subdag_history = history[0]

        # check that all 5 task instances of the subdag 'section-1' were executed
        self.assertEqual(5, len(subdag_history))
        for sdh in subdag_history:
            ti = sdh[3]
            self.assertIn('section-1-task-', ti.task_id)

        subdag.clear()
        dag.clear()
Esempio n. 4
0
    def test_backfill_examples(self):
        """
        Test backfilling example dags
        """

        # some DAGs really are just examples... but try to make them work!
        skip_dags = [
            'example_http_operator',
            'example_twitter_dag',
            'example_trigger_target_dag',
            'example_trigger_controller_dag',  # tested above
            'test_utils',  # sleeps forever
        ]

        logger = logging.getLogger('BackfillJobTest.test_backfill_examples')
        dags = [
            dag for dag in self.dagbag.dags.values()
            if 'example_dags' in dag.full_filepath and dag.dag_id not in skip_dags
        ]

        for dag in dags:
            dag.clear(
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            logger.info('*** Running example DAG #{}: {}'.format(i, dag.dag_id))
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True)
            job.run()
Esempio n. 5
0
        def run_backfill(cond):
            cond.acquire()
            try:
                dag = self._get_dag_test_max_active_limits(dag_id)

                # this session object is different than the one in the main thread
                thread_session = settings.Session()

                # Existing dagrun that is not within the backfill range
                dag.create_dagrun(
                    run_id=run_id,
                    state=State.RUNNING,
                    execution_date=DEFAULT_DATE + datetime.timedelta(hours=1),
                    start_date=DEFAULT_DATE,
                )

                thread_session.commit()
                cond.notify()
            finally:
                cond.release()

            executor = TestExecutor()
            job = BackfillJob(dag=dag,
                              start_date=start_date,
                              end_date=end_date,
                              executor=executor,
                              donot_pickle=True)
            job.run()

            thread_session.close()
Esempio n. 6
0
    def test_trigger_controller_dag(self):
        dag = self.dagbag.get_dag('example_trigger_controller_dag')
        target_dag = self.dagbag.get_dag('example_trigger_target_dag')
        dag.clear()
        target_dag.clear()

        scheduler = SchedulerJob()
        queue = mock.Mock()
        scheduler._process_task_instances(target_dag, queue=queue)
        self.assertFalse(queue.append.called)

        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            ignore_first_depends_on_past=True
        )
        job.run()

        scheduler = SchedulerJob()
        queue = mock.Mock()
        scheduler._process_task_instances(target_dag, queue=queue)

        self.assertTrue(queue.append.called)
        target_dag.clear()
        dag.clear()
Esempio n. 7
0
    def test_backfill_rerun_failed_tasks_without_flag(self):
        dag = DAG(dag_id='test_backfill_rerun_failed',
                  start_date=DEFAULT_DATE,
                  schedule_interval='@daily')

        with dag:
            DummyOperator(task_id='test_backfill_rerun_failed_task-1', dag=dag)

        dag.clear()

        executor = TestExecutor()

        job = BackfillJob(
            dag=dag,
            executor=executor,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + datetime.timedelta(days=2),
        )
        job.run()

        ti = TI(task=dag.get_task('test_backfill_rerun_failed_task-1'),
                execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        ti.set_state(State.FAILED)

        job = BackfillJob(dag=dag,
                          executor=executor,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE + datetime.timedelta(days=2),
                          rerun_failed_tasks=False)

        with self.assertRaises(AirflowException):
            job.run()
Esempio n. 8
0
    def test_backfill_examples(self):
        """
        Test backfilling example dags
        """

        # some DAGs really are just examples... but try to make them work!
        skip_dags = [
            'example_http_operator',
            'example_twitter_dag',
            'example_trigger_target_dag',
            'example_trigger_controller_dag',  # tested above
            'test_utils',  # sleeps forever
        ]

        logger = logging.getLogger('BackfillJobTest.test_backfill_examples')
        dags = [
            dag for dag in self.dagbag.dags.values()
            if 'example_dags' in dag.full_filepath
            and dag.dag_id not in skip_dags
            ]

        for dag in dags:
            dag.clear(
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            logger.info('*** Running example DAG #{}: {}'.format(i, dag.dag_id))
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True)
            job.run()
Esempio n. 9
0
    def test_backfill_multi_dates(self):
        dag = self.dagbag.get_dag('example_bash_operator')
        dag.clear()

        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + datetime.timedelta(days=1),
            ignore_first_depends_on_past=True
        )
        job.run()

        session = settings.Session()
        drs = session.query(DagRun).filter(
            DagRun.dag_id=='example_bash_operator'
        ).order_by(DagRun.execution_date).all()

        self.assertTrue(drs[0].execution_date == DEFAULT_DATE)
        self.assertTrue(drs[0].state == State.SUCCESS)
        self.assertTrue(drs[1].execution_date ==
                        DEFAULT_DATE + datetime.timedelta(days=1))
        self.assertTrue(drs[1].state == State.SUCCESS)

        dag.clear()
        session.close()
    def test_backfill_pooled_tasks(self):
        """
        Test that queued tasks are executed by BackfillJob
        """
        session = settings.Session()
        pool = Pool(pool='test_backfill_pooled_task_pool', slots=1)
        session.add(pool)
        session.commit()
        session.close()

        dag = self.dagbag.get_dag('test_backfill_pooled_task_dag')
        dag.clear()

        executor = TestExecutor(do_update=True)
        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            executor=executor)

        # run with timeout because this creates an infinite loop if not
        # caught
        try:
            with timeout(seconds=5):
                job.run()
        except AirflowTaskTimeout:
            pass
        ti = TI(
            task=dag.get_task('test_backfill_pooled_task'),
            execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
Esempio n. 11
0
    def test_backfill_integration(self):
        """
        Test that DaskExecutor can be used to backfill example dags
        """
        dags = [
            dag for dag in self.dagbag.dags.values()
            if dag.dag_id in [
                'example_bash_operator',
                # 'example_python_operator',
            ]
        ]

        for dag in dags:
            dag.clear(
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE)

        for dag in sorted(dags, key=lambda d: d.dag_id):
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True,
                executor=DaskExecutor(
                    cluster_address=self.cluster.scheduler_address))
            job.run()
Esempio n. 12
0
    def test_backfill_pooled_tasks(self):
        """
        Test that queued tasks are executed by BackfillJob

        Test for https://github.com/airbnb/airflow/pull/1225
        """
        session = settings.Session()
        pool = Pool(pool='test_backfill_pooled_task_pool', slots=1)
        session.add(pool)
        session.commit()

        dag = self.dagbag.get_dag('test_backfill_pooled_task_dag')
        dag.clear()

        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE)

        # run with timeout because this creates an infinite loop if not
        # caught
        with timeout(seconds=30):
            job.run()

        ti = TI(
            task=dag.get_task('test_backfill_pooled_task'),
            execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
    def test_backfill_examples(self, dag_id, expected_execution_order):
        """
        Test backfilling example dags

        Try to backfill some of the example dags. Be careful, not all dags are suitable
        for doing this. For example, a dag that sleeps forever, or does not have a
        schedule won't work here since you simply can't backfill them.
        """
        self.maxDiff = None
        dag = self.dagbag.get_dag(dag_id)

        logger.info('*** Running example DAG: %s', dag.dag_id)
        executor = TestExecutor()
        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            executor=executor,
            ignore_first_depends_on_past=True)

        job.run()
        self.assertListEqual(
            [((dag_id, task_id, DEFAULT_DATE, 1), State.SUCCESS)
             for task_id in expected_execution_order],
            executor.sorted_tasks
        )
Esempio n. 14
0
    def test_trigger_controller_dag(self):
        dag = self.dagbag.get_dag('example_trigger_controller_dag')
        target_dag = self.dagbag.get_dag('example_trigger_target_dag')
        dag.clear()
        target_dag.clear()

        scheduler = SchedulerJob()
        queue = mock.Mock()
        scheduler._process_task_instances(target_dag, queue=queue)
        self.assertFalse(queue.append.called)

        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            ignore_first_depends_on_past=True
        )
        job.run()

        scheduler = SchedulerJob()
        queue = mock.Mock()
        scheduler._process_task_instances(target_dag, queue=queue)

        self.assertTrue(queue.append.called)
        target_dag.clear()
        dag.clear()
Esempio n. 15
0
    def test_backfill_max_limit_check_no_count_existing(self):
        dag = self._get_dag_test_max_active_limits(
            'test_backfill_max_limit_check_no_count_existing')
        start_date = DEFAULT_DATE
        end_date = DEFAULT_DATE

        # Existing dagrun that is within the backfill range
        dag.create_dagrun(run_id="test_existing_backfill",
                          state=State.RUNNING,
                          execution_date=DEFAULT_DATE,
                          start_date=DEFAULT_DATE)

        executor = TestExecutor()
        job = BackfillJob(dag=dag,
                          start_date=start_date,
                          end_date=end_date,
                          executor=executor,
                          donot_pickle=True)
        job.run()

        # BackfillJob will run since the existing DagRun does not count for the max
        # active limit since it's within the backfill date range.
        dagruns = DagRun.find(dag_id=dag.dag_id)
        # will only be able to run 1 (the existing one) since there's just
        # one dag run slot left given the max_active_runs limit
        self.assertEqual(1, len(dagruns))
        self.assertEqual(State.SUCCESS, dagruns[0].state)
Esempio n. 16
0
    def test_backfill_rerun_failed_tasks(self):
        dag = DAG(dag_id='test_backfill_rerun_failed',
                  start_date=DEFAULT_DATE,
                  schedule_interval='@daily')

        with dag:
            DummyOperator(task_id='test_backfill_rerun_failed_task-1', dag=dag)

        dag.clear()

        executor = TestExecutor()

        job = BackfillJob(
            dag=dag,
            executor=executor,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + datetime.timedelta(days=2),
        )
        job.run()

        ti = TI(task=dag.get_task('test_backfill_rerun_failed_task-1'),
                execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        ti.set_state(State.FAILED)

        job = BackfillJob(dag=dag,
                          executor=executor,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE + datetime.timedelta(days=2),
                          rerun_failed_tasks=True)
        job.run()
        ti = TI(task=dag.get_task('test_backfill_rerun_failed_task-1'),
                execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
Esempio n. 17
0
    def test_backfill_respect_pool_limit(self, mock_log):
        session = settings.Session()

        slots = 2
        pool = Pool(
            pool='pool_with_two_slots',
            slots=slots,
        )
        session.add(pool)
        session.commit()

        dag = self._get_dummy_dag(
            dag_id='test_backfill_respect_pool_limit',
            pool=pool.pool,
        )

        executor = MockExecutor()

        job = BackfillJob(
            dag=dag,
            executor=executor,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + datetime.timedelta(days=7),
        )

        job.run()

        self.assertTrue(0 < len(executor.history))

        pool_was_full_at_least_once = False
        num_running_task_instances = 0

        for running_task_instances in executor.history:
            self.assertLessEqual(len(running_task_instances), slots)
            num_running_task_instances += len(running_task_instances)
            if len(running_task_instances) == slots:
                pool_was_full_at_least_once = True

        self.assertEqual(8, num_running_task_instances)
        self.assertTrue(pool_was_full_at_least_once)

        times_dag_concurrency_limit_reached_in_debug = self._times_called_with(
            mock_log.debug,
            DagConcurrencyLimitReached,
        )

        times_pool_limit_reached_in_debug = self._times_called_with(
            mock_log.debug,
            NoAvailablePoolSlot,
        )

        times_task_concurrency_limit_reached_in_debug = self._times_called_with(
            mock_log.debug,
            TaskConcurrencyLimitReached,
        )

        self.assertEqual(0, times_task_concurrency_limit_reached_in_debug)
        self.assertEqual(0, times_dag_concurrency_limit_reached_in_debug)
        self.assertGreater(times_pool_limit_reached_in_debug, 0)
    def test_backfill_ordered_concurrent_execute(self):
        dag = DAG(
            dag_id='test_backfill_ordered_concurrent_execute',
            start_date=DEFAULT_DATE,
            schedule_interval="@daily")

        with dag:
            op1 = DummyOperator(task_id='leave1')
            op2 = DummyOperator(task_id='leave2')
            op3 = DummyOperator(task_id='upstream_level_1')
            op4 = DummyOperator(task_id='upstream_level_2')
            op5 = DummyOperator(task_id='upstream_level_3')
            # order randomly
            op2.set_downstream(op3)
            op1.set_downstream(op3)
            op4.set_downstream(op5)
            op3.set_downstream(op4)

        dag.clear()

        executor = TestExecutor()
        job = BackfillJob(dag=dag,
                          executor=executor,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE + datetime.timedelta(days=2),
                          )
        job.run()

        d0 = DEFAULT_DATE
        d1 = d0 + datetime.timedelta(days=1)
        d2 = d1 + datetime.timedelta(days=1)

        # test executor history keeps a list
        history = executor.history

        self.maxDiff = None
        self.assertListEqual(
            # key[0] is dag id, key[3] is try_number, we don't care about either of those here
            [sorted([item[-1].key[1:3] for item in batch]) for batch in history],
            [
                [
                    ('leave1', d0),
                    ('leave1', d1),
                    ('leave1', d2),
                    ('leave2', d0),
                    ('leave2', d1),
                    ('leave2', d2)
                ],
                [('upstream_level_1', d0), ('upstream_level_1', d1),
                 ('upstream_level_1', d2)],
                [('upstream_level_2', d0), ('upstream_level_2', d1),
                 ('upstream_level_2', d2)],
                [('upstream_level_3', d0), ('upstream_level_3', d1),
                 ('upstream_level_3', d2)],
            ]
        )
Esempio n. 19
0
    def test_backfill_respect_default_pool_limit(self, mock_log):
        default_pool_slots = 2
        set_default_pool_slots(default_pool_slots)

        dag = self._get_dummy_dag('test_backfill_with_no_pool_limit')

        executor = MockExecutor()

        job = BackfillJob(
            dag=dag,
            executor=executor,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + datetime.timedelta(days=7),
        )

        job.run()

        self.assertTrue(0 < len(executor.history))

        default_pool_task_slot_count_reached_at_least_once = False

        num_running_task_instances = 0

        # if no pool is specified, the number of tasks running in
        # parallel per backfill should be less than
        # default_pool slots at any point of time.
        for running_task_instances in executor.history:
            self.assertLessEqual(
                len(running_task_instances),
                default_pool_slots,
            )
            num_running_task_instances += len(running_task_instances)
            if len(running_task_instances) == default_pool_slots:
                default_pool_task_slot_count_reached_at_least_once = True

        self.assertEquals(8, num_running_task_instances)
        self.assertTrue(default_pool_task_slot_count_reached_at_least_once)

        times_dag_concurrency_limit_reached_in_debug = self._times_called_with(
            mock_log.debug,
            DagConcurrencyLimitReached,
        )

        times_pool_limit_reached_in_debug = self._times_called_with(
            mock_log.debug,
            NoAvailablePoolSlot,
        )

        times_task_concurrency_limit_reached_in_debug = self._times_called_with(
            mock_log.debug,
            TaskConcurrencyLimitReached,
        )

        self.assertEqual(0, times_dag_concurrency_limit_reached_in_debug)
        self.assertEqual(0, times_task_concurrency_limit_reached_in_debug)
        self.assertGreater(times_pool_limit_reached_in_debug, 0)
    def test_subdag_clear_parentdag_downstream_clear(self):
        dag = self.dagbag.get_dag('clear_subdag_test_dag')
        subdag_op_task = dag.get_task('daily_job')

        subdag = subdag_op_task.subdag

        executor = TestExecutor()
        job = BackfillJob(dag=dag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE,
                          executor=executor,
                          donot_pickle=True)

        with timeout(seconds=30):
            job.run()

        ti_subdag = TI(
            task=dag.get_task('daily_job'),
            execution_date=DEFAULT_DATE)
        ti_subdag.refresh_from_db()
        self.assertEqual(ti_subdag.state, State.SUCCESS)

        ti_irrelevant = TI(
            task=dag.get_task('daily_job_irrelevant'),
            execution_date=DEFAULT_DATE)
        ti_irrelevant.refresh_from_db()
        self.assertEqual(ti_irrelevant.state, State.SUCCESS)

        ti_downstream = TI(
            task=dag.get_task('daily_job_downstream'),
            execution_date=DEFAULT_DATE)
        ti_downstream.refresh_from_db()
        self.assertEqual(ti_downstream.state, State.SUCCESS)

        sdag = subdag.sub_dag(
            task_regex='daily_job_subdag_task',
            include_downstream=True,
            include_upstream=False)

        sdag.clear(
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            include_parentdag=True)

        ti_subdag.refresh_from_db()
        self.assertEqual(State.NONE, ti_subdag.state)

        ti_irrelevant.refresh_from_db()
        self.assertEqual(State.SUCCESS, ti_irrelevant.state)

        ti_downstream.refresh_from_db()
        self.assertEqual(State.NONE, ti_downstream.state)

        subdag.clear()
        dag.clear()
Esempio n. 21
0
    def test_subdag_clear_parentdag_downstream_clear(self):
        dag = self.dagbag.get_dag('example_subdag_operator')
        subdag_op_task = dag.get_task('section-1')

        subdag = subdag_op_task.subdag
        subdag.schedule_interval = '@daily'

        executor = TestExecutor()
        job = BackfillJob(dag=subdag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE,
                          executor=executor,
                          donot_pickle=True)

        with timeout(seconds=30):
            job.run()

        ti0 = TI(
            task=subdag.get_task('section-1-task-1'),
            execution_date=DEFAULT_DATE)
        ti0.refresh_from_db()
        self.assertEqual(ti0.state, State.SUCCESS)

        sdag = subdag.sub_dag(
            task_regex='section-1-task-1',
            include_downstream=True,
            include_upstream=False)

        sdag.clear(
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            include_parentdag=True)

        ti0.refresh_from_db()
        self.assertEqual(State.NONE, ti0.state)

        ti1 = TI(
            task=dag.get_task('some-other-task'),
            execution_date=DEFAULT_DATE)
        self.assertEqual(State.NONE, ti1.state)

        # Checks that all the Downstream tasks for Parent DAG
        # have been cleared
        for task in subdag_op_task.downstream_list:
            ti = TI(
                task=dag.get_task(task.task_id),
                execution_date=DEFAULT_DATE
            )
            self.assertEqual(State.NONE, ti.state)

        subdag.clear()
        dag.clear()
    def test_backfill_multi_dates(self):
        dag = self.dagbag.get_dag('example_bash_operator')

        end_date = DEFAULT_DATE + datetime.timedelta(days=1)

        executor = TestExecutor()
        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=end_date,
            executor=executor,
            ignore_first_depends_on_past=True
        )

        job.run()

        expected_execution_order = [
            ("runme_0", DEFAULT_DATE),
            ("runme_1", DEFAULT_DATE),
            ("runme_2", DEFAULT_DATE),
            ("runme_0", end_date),
            ("runme_1", end_date),
            ("runme_2", end_date),
            ("also_run_this", DEFAULT_DATE),
            ("also_run_this", end_date),
            ("run_after_loop", DEFAULT_DATE),
            ("run_after_loop", end_date),
            ("run_this_last", DEFAULT_DATE),
            ("run_this_last", end_date),
        ]
        self.maxDiff = None
        self.assertListEqual(
            [((dag.dag_id, task_id, when, 1), State.SUCCESS)
             for (task_id, when) in expected_execution_order],
            executor.sorted_tasks
        )

        session = settings.Session()
        drs = session.query(DagRun).filter(
            DagRun.dag_id == dag.dag_id
        ).order_by(DagRun.execution_date).all()

        self.assertTrue(drs[0].execution_date == DEFAULT_DATE)
        self.assertTrue(drs[0].state == State.SUCCESS)
        self.assertTrue(drs[1].execution_date ==
                        DEFAULT_DATE + datetime.timedelta(days=1))
        self.assertTrue(drs[1].state == State.SUCCESS)

        dag.clear()
        session.close()
Esempio n. 23
0
    def test_backfill_respect_task_concurrency_limit(self, mock_log):
        task_concurrency = 2
        dag = self._get_dummy_dag(
            'test_backfill_respect_task_concurrency_limit',
            task_concurrency=task_concurrency,
        )

        executor = MockExecutor()

        job = BackfillJob(
            dag=dag,
            executor=executor,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + datetime.timedelta(days=7),
        )

        job.run()

        self.assertTrue(0 < len(executor.history))

        task_concurrency_limit_reached_at_least_once = False

        num_running_task_instances = 0
        for running_task_instances in executor.history:
            self.assertLessEqual(len(running_task_instances), task_concurrency)
            num_running_task_instances += len(running_task_instances)
            if len(running_task_instances) == task_concurrency:
                task_concurrency_limit_reached_at_least_once = True

        self.assertEqual(8, num_running_task_instances)
        self.assertTrue(task_concurrency_limit_reached_at_least_once)

        times_dag_concurrency_limit_reached_in_debug = self._times_called_with(
            mock_log.debug,
            DagConcurrencyLimitReached,
        )

        times_pool_limit_reached_in_debug = self._times_called_with(
            mock_log.debug,
            NoAvailablePoolSlot,
        )

        times_task_concurrency_limit_reached_in_debug = self._times_called_with(
            mock_log.debug,
            TaskConcurrencyLimitReached,
        )

        self.assertEqual(0, times_pool_limit_reached_in_debug)
        self.assertEqual(0, times_dag_concurrency_limit_reached_in_debug)
        self.assertGreater(times_task_concurrency_limit_reached_in_debug, 0)
Esempio n. 24
0
 def test_backfill_examples(self):
     dagbag = DagBag(
         dag_folder=DEV_NULL, include_examples=True)
     dags = [
         dag for dag in dagbag.dags.values()
         if dag.dag_id in ('example_bash_operator',)]
     for dag in dags:
         dag.clear(
             start_date=DEFAULT_DATE,
             end_date=DEFAULT_DATE)
     for dag in dags:
         job = BackfillJob(
             dag=dag,
             start_date=DEFAULT_DATE,
             end_date=DEFAULT_DATE)
         job.run()
Esempio n. 25
0
    def test_backfill_conf(self):
        dag = self._get_dummy_dag('test_backfill_conf')

        executor = MockExecutor()

        conf = json.loads("""{"key": "value"}""")
        job = BackfillJob(dag=dag,
                          executor=executor,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE + datetime.timedelta(days=2),
                          conf=conf)
        job.run()

        dr = DagRun.find(dag_id='test_backfill_conf')

        self.assertEqual(conf, dr[0].conf)
Esempio n. 26
0
    def test_sub_set_subdag(self):
        dag = DAG('test_sub_set_subdag',
                  start_date=DEFAULT_DATE,
                  default_args={'owner': 'owner1'})

        with dag:
            op1 = DummyOperator(task_id='leave1')
            op2 = DummyOperator(task_id='leave2')
            op3 = DummyOperator(task_id='upstream_level_1')
            op4 = DummyOperator(task_id='upstream_level_2')
            op5 = DummyOperator(task_id='upstream_level_3')
            # order randomly
            op2.set_downstream(op3)
            op1.set_downstream(op3)
            op4.set_downstream(op5)
            op3.set_downstream(op4)

        dag.clear()
        dr = dag.create_dagrun(run_id="test",
                               state=State.RUNNING,
                               execution_date=DEFAULT_DATE,
                               start_date=DEFAULT_DATE)

        executor = MockExecutor()
        sub_dag = dag.sub_dag(task_regex="leave*",
                              include_downstream=False,
                              include_upstream=False)
        job = BackfillJob(dag=sub_dag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE,
                          executor=executor)
        job.run()

        self.assertRaises(sqlalchemy.orm.exc.NoResultFound, dr.refresh_from_db)
        # the run_id should have changed, so a refresh won't work
        drs = DagRun.find(dag_id=dag.dag_id, execution_date=DEFAULT_DATE)
        dr = drs[0]

        self.assertEqual(
            BackfillJob.ID_FORMAT_PREFIX.format(DEFAULT_DATE.isoformat()),
            dr.run_id)
        for ti in dr.get_task_instances():
            if ti.task_id == 'leave1' or ti.task_id == 'leave2':
                self.assertEqual(State.SUCCESS, ti.state)
            else:
                self.assertEqual(State.NONE, ti.state)
Esempio n. 27
0
 def test_backfill_examples(self):
     """
     Test backfilling example dags
     """
     dags = [
         dag for dag in self.dagbag.dags.values()
         if dag.dag_id in ('example_bash_operator',)]
     for dag in dags:
         dag.clear(
             start_date=DEFAULT_DATE,
             end_date=DEFAULT_DATE)
     for dag in dags:
         job = BackfillJob(
             dag=dag,
             start_date=DEFAULT_DATE,
             end_date=DEFAULT_DATE)
         job.run()
Esempio n. 28
0
    def test_backfill_max_limit_check_within_limit(self):
        dag = self._get_dag_test_max_active_limits(
            'test_backfill_max_limit_check_within_limit', max_active_runs=16)

        start_date = DEFAULT_DATE - datetime.timedelta(hours=1)
        end_date = DEFAULT_DATE

        executor = MockExecutor()
        job = BackfillJob(dag=dag,
                          start_date=start_date,
                          end_date=end_date,
                          executor=executor,
                          donot_pickle=True)
        job.run()

        dagruns = DagRun.find(dag_id=dag.dag_id)
        self.assertEqual(2, len(dagruns))
        self.assertTrue(all([run.state == State.SUCCESS for run in dagruns]))
Esempio n. 29
0
    def test_scheduler_start_date(self):
        """
        Test that the scheduler respects start_dates, even when DAGS have run
        """

        dag_id = 'test_start_date_scheduling'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        self.assertTrue(dag.start_date > DEFAULT_DATE)

        scheduler = SchedulerJob(dag_id,
                                 num_runs=2,
                                 **self.default_scheduler_args)
        scheduler.run()

        # zero tasks ran
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 0)

        # previously, running this backfill would kick off the Scheduler
        # because it would take the most recent run and start from there
        # That behavior still exists, but now it will only do so if after the
        # start date
        backfill = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE)
        backfill.run()

        # one task ran
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)

        scheduler = SchedulerJob(dag_id,
                                 num_runs=2,
                                 **self.default_scheduler_args)
        scheduler.run()

        # still one task
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)
Esempio n. 30
0
    def test_scheduler_start_date(self):
        """
        Test that the scheduler respects start_dates, even when DAGS have run
        """

        dag_id = 'test_start_date_scheduling'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        self.assertTrue(dag.start_date > DEFAULT_DATE)

        scheduler = SchedulerJob(dag_id,
                                 num_runs=2,
                                 **self.default_scheduler_args)
        scheduler.run()

        # zero tasks ran
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 0)

        # previously, running this backfill would kick off the Scheduler
        # because it would take the most recent run and start from there
        # That behavior still exists, but now it will only do so if after the
        # start date
        backfill = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE)
        backfill.run()

        # one task ran
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)

        scheduler = SchedulerJob(dag_id,
                                 num_runs=2,
                                 **self.default_scheduler_args)
        scheduler.run()

        # still one task
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)
    def test_backfill_execute_subdag(self):
        dag = self.dagbag.get_dag('example_subdag_operator')
        subdag_op_task = dag.get_task('section-1')

        subdag = subdag_op_task.subdag
        subdag.schedule_interval = '@daily'

        start_date = timezone.utcnow()
        executor = TestExecutor()
        job = BackfillJob(dag=subdag,
                          start_date=start_date,
                          end_date=start_date,
                          executor=executor,
                          donot_pickle=True)
        job.run()

        subdag_op_task.pre_execute(context={'execution_date': start_date})
        subdag_op_task.execute(context={'execution_date': start_date})
        subdag_op_task.post_execute(context={'execution_date': start_date})

        history = executor.history
        subdag_history = history[0]

        # check that all 5 task instances of the subdag 'section-1' were executed
        self.assertEqual(5, len(subdag_history))
        for sdh in subdag_history:
            ti = sdh[3]
            self.assertIn('section-1-task-', ti.task_id)

        with create_session() as session:
            successful_subdag_runs = (
                session
                .query(DagRun)
                .filter(DagRun.dag_id == subdag.dag_id)
                .filter(DagRun.execution_date == start_date)
                .filter(DagRun.state == State.SUCCESS)
                .count()
            )

            self.assertEqual(1, successful_subdag_runs)

        subdag.clear()
        dag.clear()
    def test_backfill_execute_subdag_with_removed_task(self):
        """
        Ensure that subdag operators execute properly in the case where
        an associated task of the subdag has been removed from the dag
        definition, but has instances in the database from previous runs.
        """
        dag = self.dagbag.get_dag('example_subdag_operator')
        subdag = dag.get_task('section-1').subdag

        executor = TestExecutor()
        job = BackfillJob(dag=subdag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE,
                          executor=executor,
                          donot_pickle=True)

        removed_task_ti = TI(
            task=DummyOperator(task_id='removed_task'),
            execution_date=DEFAULT_DATE,
            state=State.REMOVED)
        removed_task_ti.dag_id = subdag.dag_id

        session = settings.Session()
        session.merge(removed_task_ti)

        with timeout(seconds=30):
            job.run()

        for task in subdag.tasks:
            instance = session.query(TI).filter(
                TI.dag_id == subdag.dag_id,
                TI.task_id == task.task_id,
                TI.execution_date == DEFAULT_DATE).first()

            self.assertIsNotNone(instance)
            self.assertEqual(instance.state, State.SUCCESS)

        removed_task_ti.refresh_from_db()
        self.assertEqual(removed_task_ti.state, State.REMOVED)

        subdag.clear()
        dag.clear()
Esempio n. 33
0
def test_dag():
    with testing.postgresql.Postgresql() as postgresql:
        with patch.dict(
                os.environ, {
                    'API_V1_DB_URL': postgresql.url(),
                    'OUTPUT_FOLDER': 'tests/api_sync_v1/input'
                }):
            configuration.load_test_config()
            # the scheduler messages, which will show up if something
            # happens to screw up execution, are INFO level so save us
            # some headaches but switching to that loglevel here
            logging.basicConfig(level=logging.INFO)
            bag = models.DagBag()
            dag = bag.get_dag(dag_id='open_skills_master.api_v1_sync')
            # expire old DAG runs, otherwise the max of 16 will automatically get scheduled
            dag.dagrun_timeout = 1
            dag.clear()
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
            )
            job.run()
            engine = create_engine(postgresql.url())
            session = sessionmaker(engine)()
            num_jobs = session.query(JobMaster).count()
            assert num_jobs > 1
            num_skills = session.query(SkillMaster).count()
            assert num_skills > 1
            num_importances = session.query(SkillImportance).count()
            assert num_importances > 1
            assert session.query(GeoTitleCount).count() > 1
            assert session.query(TitleCount).count() > 1

            # make sure non-temporal data doesn't
            # load twice for a different quarter
            new_date = datetime(2014, 5, 1)
            dag.clear(start_date=new_date, end_date=new_date)
            dag.run(start_date=new_date, end_date=new_date, local=True)
            assert session.query(JobMaster).count() == num_jobs
            assert session.query(SkillMaster).count() == num_skills
            assert session.query(SkillImportance).count() == num_importances
    def test_backfill_max_limit_check_complete_loop(self):
        dag = self._get_dag_test_max_active_limits(
            'test_backfill_max_limit_check_complete_loop')
        start_date = DEFAULT_DATE - datetime.timedelta(hours=1)
        end_date = DEFAULT_DATE

        # Given the max limit to be 1 in active dag runs, we need to run the
        # backfill job 3 times
        success_expected = 2
        executor = TestExecutor()
        job = BackfillJob(dag=dag,
                          start_date=start_date,
                          end_date=end_date,
                          executor=executor,
                          donot_pickle=True)
        job.run()

        success_dagruns = len(DagRun.find(dag_id=dag.dag_id, state=State.SUCCESS))
        running_dagruns = len(DagRun.find(dag_id=dag.dag_id, state=State.RUNNING))
        self.assertEqual(success_expected, success_dagruns)
        self.assertEqual(0, running_dagruns)  # no dag_runs in running state are left
Esempio n. 35
0
    def test_backfill_pool_not_found(self):
        dag = self._get_dummy_dag(
            dag_id='test_backfill_pool_not_found',
            pool='king_pool',
        )

        executor = MockExecutor()

        job = BackfillJob(
            dag=dag,
            executor=executor,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + datetime.timedelta(days=7),
        )

        try:
            job.run()
        except AirflowException:
            return

        self.fail()
Esempio n. 36
0
    def test_backfill_run_backwards(self):
        dag = self.dagbag.get_dag("test_start_date_scheduling")
        dag.clear()

        job = BackfillJob(dag=dag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE + datetime.timedelta(days=1),
                          run_backwards=True)
        job.run()

        session = settings.Session()
        tis = session.query(TI).filter(
            TI.dag_id == 'test_start_date_scheduling'
            and TI.task_id == 'dummy').order_by(TI.execution_date).all()

        queued_times = [ti.queued_dttm for ti in tis]
        self.assertTrue(queued_times == sorted(queued_times, reverse=True))
        self.assertTrue(all([ti.state == State.SUCCESS for ti in tis]))

        dag.clear()
        session.close()
Esempio n. 37
0
    def test_trigger_controller_dag(self):
        dag = self.dagbag.get_dag('example_trigger_controller_dag')
        target_dag = self.dagbag.get_dag('example_trigger_target_dag')
        target_dag.sync_to_db()

        scheduler = SchedulerJob()
        task_instances_list = Mock()
        scheduler._process_task_instances(
            target_dag, task_instances_list=task_instances_list)
        self.assertFalse(task_instances_list.append.called)

        job = BackfillJob(dag=dag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE,
                          ignore_first_depends_on_past=True)
        job.run()

        scheduler._process_task_instances(
            target_dag, task_instances_list=task_instances_list)

        self.assertTrue(task_instances_list.append.called)
Esempio n. 38
0
    def test_backfill_multi_dates(self):
        dag = self.dagbag.get_dag('example_bash_operator')
        dag.clear()

        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE+datetime.timedelta(days=1),
            ignore_first_depends_on_past=True
        )
        job.run()

        session = settings.Session()
        drs = session.query(DagRun).filter(
            DagRun.dag_id=='example_bash_operator'
        ).order_by(DagRun.execution_date).all()

        self.assertTrue(drs[0].execution_date == DEFAULT_DATE)
        self.assertTrue(drs[0].state == State.SUCCESS)
        self.assertTrue(drs[1].execution_date == DEFAULT_DATE+datetime.timedelta(days=1))
        self.assertTrue(drs[1].state == State.SUCCESS)

        dag.clear()
        session.close()