def run_backfill(cond): cond.acquire() # this session object is different than the one in the main thread with create_session() as thread_session: try: dag = self._get_dag_test_max_active_limits(dag_id) # Existing dagrun that is not within the backfill range dag.create_dagrun( run_id=run_id, state=State.RUNNING, execution_date=DEFAULT_DATE + datetime.timedelta(hours=1), start_date=DEFAULT_DATE, ) thread_session.commit() cond.notify() finally: cond.release() thread_session.close() executor = MockExecutor() job = BackfillJob(dag=dag, start_date=start_date, end_date=end_date, executor=executor, donot_pickle=True) job.run()
def test_backfill_examples(self, dag_id, expected_execution_order): """ Test backfilling example dags Try to backfill some of the example dags. Be careful, not all dags are suitable for doing this. For example, a dag that sleeps forever, or does not have a schedule won't work here since you simply can't backfill them. """ dag = self.dagbag.get_dag(dag_id) logger.info('*** Running example DAG: %s', dag.dag_id) executor = MockExecutor() job = BackfillJob( dag=dag, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, executor=executor, ignore_first_depends_on_past=True) job.run() self.assertListEqual( [((dag_id, task_id, DEFAULT_DATE, 1), State.SUCCESS) for task_id in expected_execution_order], executor.sorted_tasks )
def test_backfill_max_limit_check_no_count_existing(self): dag = self._get_dag_test_max_active_limits( 'test_backfill_max_limit_check_no_count_existing') start_date = DEFAULT_DATE end_date = DEFAULT_DATE # Existing dagrun that is within the backfill range dag.create_dagrun(run_id="test_existing_backfill", state=State.RUNNING, execution_date=DEFAULT_DATE, start_date=DEFAULT_DATE) executor = MockExecutor() job = BackfillJob(dag=dag, start_date=start_date, end_date=end_date, executor=executor, donot_pickle=True) job.run() # BackfillJob will run since the existing DagRun does not count for the max # active limit since it's within the backfill date range. dagruns = DagRun.find(dag_id=dag.dag_id) # will only be able to run 1 (the existing one) since there's just # one dag run slot left given the max_active_runs limit self.assertEqual(1, len(dagruns)) self.assertEqual(State.SUCCESS, dagruns[0].state)
def test_backfill_rerun_failed_tasks(self): dag = DAG(dag_id='test_backfill_rerun_failed', start_date=DEFAULT_DATE, schedule_interval='@daily') with dag: DummyOperator(task_id='test_backfill_rerun_failed_task-1', dag=dag) dag.clear() executor = MockExecutor() job = BackfillJob( dag=dag, executor=executor, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE + datetime.timedelta(days=2), ) job.run() ti = TI(task=dag.get_task('test_backfill_rerun_failed_task-1'), execution_date=DEFAULT_DATE) ti.refresh_from_db() ti.set_state(State.FAILED) job = BackfillJob(dag=dag, executor=executor, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE + datetime.timedelta(days=2), rerun_failed_tasks=True) job.run() ti = TI(task=dag.get_task('test_backfill_rerun_failed_task-1'), execution_date=DEFAULT_DATE) ti.refresh_from_db() self.assertEqual(ti.state, State.SUCCESS)
def test_backfill_execute_subdag(self): dag = self.dagbag.get_dag('example_subdag_operator') subdag_op_task = dag.get_task('section-1') subdag = subdag_op_task.subdag subdag.schedule_interval = '@daily' start_date = timezone.utcnow() executor = MockExecutor() job = BackfillJob(dag=subdag, start_date=start_date, end_date=start_date, executor=executor, donot_pickle=True) job.run() history = executor.history subdag_history = history[0] # check that all 5 task instances of the subdag 'section-1' were executed self.assertEqual(5, len(subdag_history)) for sdh in subdag_history: ti = sdh[3] self.assertIn('section-1-task-', ti.task_id) subdag.clear() dag.clear()
def test_backfill_run_backwards(self): dag = self.dagbag.get_dag("test_start_date_scheduling") dag.clear() executor = MockExecutor(parallelism=16) job = BackfillJob( executor=executor, dag=dag, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE + datetime.timedelta(days=1), run_backwards=True ) job.run() session = settings.Session() tis = session.query(TI).filter( TI.dag_id == 'test_start_date_scheduling' and TI.task_id == 'dummy' ).order_by(TI.execution_date).all() queued_times = [ti.queued_dttm for ti in tis] self.assertTrue(queued_times == sorted(queued_times, reverse=True)) self.assertTrue(all([ti.state == State.SUCCESS for ti in tis])) dag.clear() session.close()
def test_backfill_depends_on_past(self): """ Test that backfill respects ignore_depends_on_past """ dag = self.dagbag.get_dag('test_depends_on_past') dag.clear() run_date = DEFAULT_DATE + datetime.timedelta(days=5) # backfill should deadlock self.assertRaisesRegex( AirflowException, 'BackfillJob is deadlocked', BackfillJob(dag=dag, start_date=run_date, end_date=run_date).run) BackfillJob( dag=dag, start_date=run_date, end_date=run_date, executor=MockExecutor(), ignore_first_depends_on_past=True).run() # ti should have succeeded ti = TI(dag.tasks[0], run_date) ti.refresh_from_db() self.assertEqual(ti.state, State.SUCCESS)
def test_backfill_rerun_failed_tasks_without_flag(self): dag = DAG(dag_id='test_backfill_rerun_failed', start_date=DEFAULT_DATE, schedule_interval='@daily') with dag: DummyOperator(task_id='test_backfill_rerun_failed_task-1', dag=dag) dag.clear() executor = MockExecutor() job = BackfillJob( dag=dag, executor=executor, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE + datetime.timedelta(days=2), ) job.run() ti = TI(task=dag.get_task('test_backfill_rerun_failed_task-1'), execution_date=DEFAULT_DATE) ti.refresh_from_db() ti.set_state(State.FAILED) job = BackfillJob(dag=dag, executor=executor, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE + datetime.timedelta(days=2), rerun_failed_tasks=False) with self.assertRaises(AirflowException): job.run()
def test_backfill_pooled_tasks(self): """ Test that queued tasks are executed by BackfillJob """ session = settings.Session() pool = Pool(pool='test_backfill_pooled_task_pool', slots=1) session.add(pool) session.commit() session.close() dag = self.dagbag.get_dag('test_backfill_pooled_task_dag') dag.clear() executor = MockExecutor(do_update=True) job = BackfillJob(dag=dag, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, executor=executor) # run with timeout because this creates an infinite loop if not # caught try: with timeout(seconds=5): job.run() except AirflowTaskTimeout: pass ti = TI(task=dag.get_task('test_backfill_pooled_task'), execution_date=DEFAULT_DATE) ti.refresh_from_db() self.assertEqual(ti.state, State.SUCCESS)
def test_backfill_respect_pool_limit(self, mock_log): session = settings.Session() slots = 2 pool = Pool( pool='pool_with_two_slots', slots=slots, ) session.add(pool) session.commit() dag = self._get_dummy_dag( dag_id='test_backfill_respect_pool_limit', pool=pool.pool, ) executor = MockExecutor() job = BackfillJob( dag=dag, executor=executor, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE + datetime.timedelta(days=7), ) job.run() self.assertTrue(0 < len(executor.history)) pool_was_full_at_least_once = False num_running_task_instances = 0 for running_task_instances in executor.history: self.assertLessEqual(len(running_task_instances), slots) num_running_task_instances += len(running_task_instances) if len(running_task_instances) == slots: pool_was_full_at_least_once = True self.assertEqual(8, num_running_task_instances) self.assertTrue(pool_was_full_at_least_once) times_dag_concurrency_limit_reached_in_debug = self._times_called_with( mock_log.debug, DagConcurrencyLimitReached, ) times_pool_limit_reached_in_debug = self._times_called_with( mock_log.debug, NoAvailablePoolSlot, ) times_task_concurrency_limit_reached_in_debug = self._times_called_with( mock_log.debug, TaskConcurrencyLimitReached, ) self.assertEqual(0, times_task_concurrency_limit_reached_in_debug) self.assertEqual(0, times_dag_concurrency_limit_reached_in_debug) self.assertGreater(times_pool_limit_reached_in_debug, 0)
def test_backfill_respect_default_pool_limit(self, mock_log): default_pool_slots = 2 set_default_pool_slots(default_pool_slots) dag = self._get_dummy_dag('test_backfill_with_no_pool_limit') executor = MockExecutor() job = BackfillJob( dag=dag, executor=executor, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE + datetime.timedelta(days=7), ) job.run() self.assertTrue(0 < len(executor.history)) default_pool_task_slot_count_reached_at_least_once = False num_running_task_instances = 0 # if no pool is specified, the number of tasks running in # parallel per backfill should be less than # default_pool slots at any point of time. for running_task_instances in executor.history: self.assertLessEqual( len(running_task_instances), default_pool_slots, ) num_running_task_instances += len(running_task_instances) if len(running_task_instances) == default_pool_slots: default_pool_task_slot_count_reached_at_least_once = True self.assertEquals(8, num_running_task_instances) self.assertTrue(default_pool_task_slot_count_reached_at_least_once) times_dag_concurrency_limit_reached_in_debug = self._times_called_with( mock_log.debug, DagConcurrencyLimitReached, ) times_pool_limit_reached_in_debug = self._times_called_with( mock_log.debug, NoAvailablePoolSlot, ) times_task_concurrency_limit_reached_in_debug = self._times_called_with( mock_log.debug, TaskConcurrencyLimitReached, ) self.assertEqual(0, times_dag_concurrency_limit_reached_in_debug) self.assertEqual(0, times_task_concurrency_limit_reached_in_debug) self.assertGreater(times_pool_limit_reached_in_debug, 0)
def test_backfill_ordered_concurrent_execute(self): dag = DAG( dag_id='test_backfill_ordered_concurrent_execute', start_date=DEFAULT_DATE, schedule_interval="@daily") with dag: op1 = DummyOperator(task_id='leave1') op2 = DummyOperator(task_id='leave2') op3 = DummyOperator(task_id='upstream_level_1') op4 = DummyOperator(task_id='upstream_level_2') op5 = DummyOperator(task_id='upstream_level_3') # order randomly op2.set_downstream(op3) op1.set_downstream(op3) op4.set_downstream(op5) op3.set_downstream(op4) dag.clear() executor = MockExecutor(parallelism=16) job = BackfillJob(dag=dag, executor=executor, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE + datetime.timedelta(days=2), ) job.run() date0 = DEFAULT_DATE date1 = date0 + datetime.timedelta(days=1) date2 = date1 + datetime.timedelta(days=1) # test executor history keeps a list history = executor.history self.assertListEqual( # key[0] is dag id, key[3] is try_number, we don't care about either of those here [sorted([item[-1].key[1:3] for item in batch]) for batch in history], [ [ ('leave1', date0), ('leave1', date1), ('leave1', date2), ('leave2', date0), ('leave2', date1), ('leave2', date2) ], [('upstream_level_1', date0), ('upstream_level_1', date1), ('upstream_level_1', date2)], [('upstream_level_2', date0), ('upstream_level_2', date1), ('upstream_level_2', date2)], [('upstream_level_3', date0), ('upstream_level_3', date1), ('upstream_level_3', date2)], ] )
def test_subdag_clear_parentdag_downstream_clear(self): dag = self.dagbag.get_dag('clear_subdag_test_dag') subdag_op_task = dag.get_task('daily_job') subdag = subdag_op_task.subdag executor = MockExecutor() job = BackfillJob(dag=dag, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, executor=executor, donot_pickle=True) with timeout(seconds=30): job.run() ti_subdag = TI( task=dag.get_task('daily_job'), execution_date=DEFAULT_DATE) ti_subdag.refresh_from_db() self.assertEqual(ti_subdag.state, State.SUCCESS) ti_irrelevant = TI( task=dag.get_task('daily_job_irrelevant'), execution_date=DEFAULT_DATE) ti_irrelevant.refresh_from_db() self.assertEqual(ti_irrelevant.state, State.SUCCESS) ti_downstream = TI( task=dag.get_task('daily_job_downstream'), execution_date=DEFAULT_DATE) ti_downstream.refresh_from_db() self.assertEqual(ti_downstream.state, State.SUCCESS) sdag = subdag.sub_dag( task_regex='daily_job_subdag_task', include_downstream=True, include_upstream=False) sdag.clear( start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, include_parentdag=True) ti_subdag.refresh_from_db() self.assertEqual(State.NONE, ti_subdag.state) ti_irrelevant.refresh_from_db() self.assertEqual(State.SUCCESS, ti_irrelevant.state) ti_downstream.refresh_from_db() self.assertEqual(State.NONE, ti_downstream.state) subdag.clear() dag.clear()
def test_backfill_respect_task_concurrency_limit(self, mock_log): task_concurrency = 2 dag = self._get_dummy_dag( 'test_backfill_respect_task_concurrency_limit', task_concurrency=task_concurrency, ) executor = MockExecutor() job = BackfillJob( dag=dag, executor=executor, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE + datetime.timedelta(days=7), ) job.run() self.assertTrue(0 < len(executor.history)) task_concurrency_limit_reached_at_least_once = False num_running_task_instances = 0 for running_task_instances in executor.history: self.assertLessEqual(len(running_task_instances), task_concurrency) num_running_task_instances += len(running_task_instances) if len(running_task_instances) == task_concurrency: task_concurrency_limit_reached_at_least_once = True self.assertEqual(8, num_running_task_instances) self.assertTrue(task_concurrency_limit_reached_at_least_once) times_dag_concurrency_limit_reached_in_debug = self._times_called_with( mock_log.debug, DagConcurrencyLimitReached, ) times_pool_limit_reached_in_debug = self._times_called_with( mock_log.debug, NoAvailablePoolSlot, ) times_task_concurrency_limit_reached_in_debug = self._times_called_with( mock_log.debug, TaskConcurrencyLimitReached, ) self.assertEqual(0, times_pool_limit_reached_in_debug) self.assertEqual(0, times_dag_concurrency_limit_reached_in_debug) self.assertGreater(times_task_concurrency_limit_reached_in_debug, 0)
def test_backfill_multi_dates(self): dag = self.dagbag.get_dag('example_bash_operator') end_date = DEFAULT_DATE + datetime.timedelta(days=1) executor = MockExecutor(parallelism=16) job = BackfillJob( dag=dag, start_date=DEFAULT_DATE, end_date=end_date, executor=executor, ignore_first_depends_on_past=True ) job.run() expected_execution_order = [ ("runme_0", DEFAULT_DATE), ("runme_1", DEFAULT_DATE), ("runme_2", DEFAULT_DATE), ("runme_0", end_date), ("runme_1", end_date), ("runme_2", end_date), ("also_run_this", DEFAULT_DATE), ("also_run_this", end_date), ("run_after_loop", DEFAULT_DATE), ("run_after_loop", end_date), ("run_this_last", DEFAULT_DATE), ("run_this_last", end_date), ] self.assertListEqual( [((dag.dag_id, task_id, when, 1), State.SUCCESS) for (task_id, when) in expected_execution_order], executor.sorted_tasks ) session = settings.Session() drs = session.query(DagRun).filter( DagRun.dag_id == dag.dag_id ).order_by(DagRun.execution_date).all() self.assertTrue(drs[0].execution_date == DEFAULT_DATE) self.assertTrue(drs[0].state == State.SUCCESS) self.assertTrue(drs[1].execution_date == DEFAULT_DATE + datetime.timedelta(days=1)) self.assertTrue(drs[1].state == State.SUCCESS) dag.clear() session.close()
def test_number_of_queries_single_loop(self, mock_get_task_runner, return_codes): unique_prefix = str(uuid.uuid4()) dag = DAG(dag_id=f'{unique_prefix}_test_number_of_queries', start_date=DEFAULT_DATE) task = DummyOperator(task_id='test_state_succeeded1', dag=dag) dag.clear() dag.create_dagrun(run_id=unique_prefix, state=State.NONE) ti = TaskInstance(task=task, execution_date=DEFAULT_DATE) mock_get_task_runner.return_value.return_code.side_effects = return_codes job = LocalTaskJob(task_instance=ti, executor=MockExecutor()) with assert_queries_count(13): job.run()
def test_backfill_depends_on_past_backwards(self): """ Test that CLI respects -B argument and raises on interaction with depends_on_past """ dag_id = 'test_depends_on_past' start_date = DEFAULT_DATE + datetime.timedelta(days=1) end_date = start_date + datetime.timedelta(days=1) kwargs = dict( start_date=start_date, end_date=end_date, ) dag = self.dagbag.get_dag(dag_id) dag.clear() executor = MockExecutor() job = BackfillJob(dag=dag, executor=executor, ignore_first_depends_on_past=True, **kwargs) job.run() ti = TI(dag.get_task('test_dop_task'), end_date) ti.refresh_from_db() # runs fine forwards self.assertEqual(ti.state, State.SUCCESS) # raises backwards expected_msg = 'You cannot backfill backwards because one or more tasks depend_on_past: {}'.format( 'test_dop_task') with self.assertRaisesRegex(AirflowException, expected_msg): executor = MockExecutor() job = BackfillJob(dag=dag, executor=executor, run_backwards=True, **kwargs) job.run()
def test_backfill_conf(self): dag = self._get_dummy_dag('test_backfill_conf') executor = MockExecutor() conf = json.loads("""{"key": "value"}""") job = BackfillJob(dag=dag, executor=executor, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE + datetime.timedelta(days=2), conf=conf) job.run() dr = DagRun.find(dag_id='test_backfill_conf') self.assertEqual(conf, dr[0].conf)
def test_sub_set_subdag(self): dag = DAG('test_sub_set_subdag', start_date=DEFAULT_DATE, default_args={'owner': 'owner1'}) with dag: op1 = DummyOperator(task_id='leave1') op2 = DummyOperator(task_id='leave2') op3 = DummyOperator(task_id='upstream_level_1') op4 = DummyOperator(task_id='upstream_level_2') op5 = DummyOperator(task_id='upstream_level_3') # order randomly op2.set_downstream(op3) op1.set_downstream(op3) op4.set_downstream(op5) op3.set_downstream(op4) dag.clear() dr = dag.create_dagrun(run_id="test", state=State.RUNNING, execution_date=DEFAULT_DATE, start_date=DEFAULT_DATE) executor = MockExecutor() sub_dag = dag.sub_dag(task_regex="leave*", include_downstream=False, include_upstream=False) job = BackfillJob(dag=sub_dag, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, executor=executor) job.run() self.assertRaises(sqlalchemy.orm.exc.NoResultFound, dr.refresh_from_db) # the run_id should have changed, so a refresh won't work drs = DagRun.find(dag_id=dag.dag_id, execution_date=DEFAULT_DATE) dr = drs[0] self.assertEqual( BackfillJob.ID_FORMAT_PREFIX.format(DEFAULT_DATE.isoformat()), dr.run_id) for ti in dr.get_task_instances(): if ti.task_id == 'leave1' or ti.task_id == 'leave2': self.assertEqual(State.SUCCESS, ti.state) else: self.assertEqual(State.NONE, ti.state)
def test_heartbeat_failed_fast(self, mock_getpid): """ Test that task heartbeat will sleep when it fails fast """ mock_getpid.return_value = 1 heartbeat_records = [] def heartbeat_recorder(**kwargs): heartbeat_records.append(timezone.utcnow()) with create_session() as session: dagbag = models.DagBag( dag_folder=TEST_DAG_FOLDER, include_examples=False, ) dag_id = 'test_heartbeat_failed_fast' task_id = 'test_heartbeat_failed_fast_op' dag = dagbag.get_dag(dag_id) task = dag.get_task(task_id) dag.create_dagrun(run_id="test_heartbeat_failed_fast_run", state=State.RUNNING, execution_date=DEFAULT_DATE, start_date=DEFAULT_DATE, session=session) ti = TI(task=task, execution_date=DEFAULT_DATE) ti.refresh_from_db() ti.state = State.RUNNING ti.hostname = get_hostname() ti.pid = 1 session.commit() job = LocalTaskJob(task_instance=ti, executor=MockExecutor(do_update=False)) job.heartrate = 2 job.heartbeat_callback = heartbeat_recorder job._execute() self.assertGreater(len(heartbeat_records), 1) for i in range(1, len(heartbeat_records)): time1 = heartbeat_records[i - 1] time2 = heartbeat_records[i] # Assert that difference small enough delta = (time2 - time1).total_seconds() self.assertAlmostEqual(delta, job.heartrate, delta=0.05)
def test_backfill_max_limit_check_within_limit(self): dag = self._get_dag_test_max_active_limits( 'test_backfill_max_limit_check_within_limit', max_active_runs=16) start_date = DEFAULT_DATE - datetime.timedelta(hours=1) end_date = DEFAULT_DATE executor = MockExecutor() job = BackfillJob(dag=dag, start_date=start_date, end_date=end_date, executor=executor, donot_pickle=True) job.run() dagruns = DagRun.find(dag_id=dag.dag_id) self.assertEqual(2, len(dagruns)) self.assertTrue(all([run.state == State.SUCCESS for run in dagruns]))
def test_heartbeat_failed_fast(self): """ Test that task heartbeat will sleep when it fails fast """ self.mock_base_job_sleep.side_effect = time.sleep with create_session() as session: dagbag = DagBag( dag_folder=TEST_DAG_FOLDER, include_examples=False, ) dag_id = 'test_heartbeat_failed_fast' task_id = 'test_heartbeat_failed_fast_op' dag = dagbag.get_dag(dag_id) task = dag.get_task(task_id) dag.create_dagrun( run_id="test_heartbeat_failed_fast_run", state=State.RUNNING, execution_date=DEFAULT_DATE, start_date=DEFAULT_DATE, session=session, ) ti = TaskInstance(task=task, execution_date=DEFAULT_DATE) ti.refresh_from_db() ti.state = State.RUNNING ti.hostname = get_hostname() ti.pid = 1 session.commit() job = LocalTaskJob(task_instance=ti, executor=MockExecutor(do_update=False)) job.heartrate = 2 heartbeat_records = [] job.heartbeat_callback = lambda session: heartbeat_records.append( job.latest_heartbeat) job._execute() self.assertGreater(len(heartbeat_records), 2) for i in range(1, len(heartbeat_records)): time1 = heartbeat_records[i - 1] time2 = heartbeat_records[i] # Assert that difference small enough delta = (time2 - time1).total_seconds() self.assertAlmostEqual(delta, job.heartrate, delta=0.05)
def test_backfill_execute_subdag(self): dag = self.dagbag.get_dag('example_subdag_operator') subdag_op_task = dag.get_task('section-1') subdag = subdag_op_task.subdag subdag.schedule_interval = '@daily' start_date = timezone.utcnow() executor = MockExecutor() job = BackfillJob(dag=subdag, start_date=start_date, end_date=start_date, executor=executor, donot_pickle=True) job.run() subdag_op_task.pre_execute(context={'execution_date': start_date}) subdag_op_task.execute(context={'execution_date': start_date}) subdag_op_task.post_execute(context={'execution_date': start_date}) history = executor.history subdag_history = history[0] # check that all 5 task instances of the subdag 'section-1' were executed self.assertEqual(5, len(subdag_history)) for sdh in subdag_history: ti = sdh[3] self.assertIn('section-1-task-', ti.task_id) with create_session() as session: successful_subdag_runs = ( session .query(DagRun) .filter(DagRun.dag_id == subdag.dag_id) .filter(DagRun.execution_date == start_date) # pylint: disable=comparison-with-callable .filter(DagRun.state == State.SUCCESS) .count() ) self.assertEqual(1, successful_subdag_runs) subdag.clear() dag.clear()
def test_backfill_execute_subdag_with_removed_task(self): """ Ensure that subdag operators execute properly in the case where an associated task of the subdag has been removed from the dag definition, but has instances in the database from previous runs. """ dag = self.dagbag.get_dag('example_subdag_operator') subdag = dag.get_task('section-1').subdag executor = MockExecutor() job = BackfillJob(dag=subdag, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, executor=executor, donot_pickle=True) removed_task_ti = TI( task=DummyOperator(task_id='removed_task'), execution_date=DEFAULT_DATE, state=State.REMOVED) removed_task_ti.dag_id = subdag.dag_id session = settings.Session() session.merge(removed_task_ti) session.commit() with timeout(seconds=30): job.run() for task in subdag.tasks: instance = session.query(TI).filter( TI.dag_id == subdag.dag_id, TI.task_id == task.task_id, TI.execution_date == DEFAULT_DATE).first() self.assertIsNotNone(instance) self.assertEqual(instance.state, State.SUCCESS) removed_task_ti.refresh_from_db() self.assertEqual(removed_task_ti.state, State.REMOVED) subdag.clear() dag.clear()
def test_backfill_max_limit_check_complete_loop(self): dag = self._get_dag_test_max_active_limits( 'test_backfill_max_limit_check_complete_loop') start_date = DEFAULT_DATE - datetime.timedelta(hours=1) end_date = DEFAULT_DATE # Given the max limit to be 1 in active dag runs, we need to run the # backfill job 3 times success_expected = 2 executor = MockExecutor() job = BackfillJob(dag=dag, start_date=start_date, end_date=end_date, executor=executor, donot_pickle=True) job.run() success_dagruns = len(DagRun.find(dag_id=dag.dag_id, state=State.SUCCESS)) running_dagruns = len(DagRun.find(dag_id=dag.dag_id, state=State.RUNNING)) self.assertEqual(success_expected, success_dagruns) self.assertEqual(0, running_dagruns) # no dag_runs in running state are left
def test_backfill_pool_not_found(self): dag = self._get_dummy_dag( dag_id='test_backfill_pool_not_found', pool='king_pool', ) executor = MockExecutor() job = BackfillJob( dag=dag, executor=executor, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE + datetime.timedelta(days=7), ) try: job.run() except AirflowException: return self.fail()
def test_backfill_fill_blanks(self): dag = DAG( 'test_backfill_fill_blanks', start_date=DEFAULT_DATE, default_args={'owner': 'owner1'}, ) with dag: op1 = DummyOperator(task_id='op1') op2 = DummyOperator(task_id='op2') op3 = DummyOperator(task_id='op3') op4 = DummyOperator(task_id='op4') op5 = DummyOperator(task_id='op5') op6 = DummyOperator(task_id='op6') dag.clear() dr = dag.create_dagrun(run_id='test', state=State.RUNNING, execution_date=DEFAULT_DATE, start_date=DEFAULT_DATE) executor = MockExecutor() session = settings.Session() tis = dr.get_task_instances() for ti in tis: if ti.task_id == op1.task_id: ti.state = State.UP_FOR_RETRY ti.end_date = DEFAULT_DATE elif ti.task_id == op2.task_id: ti.state = State.FAILED elif ti.task_id == op3.task_id: ti.state = State.SKIPPED elif ti.task_id == op4.task_id: ti.state = State.SCHEDULED elif ti.task_id == op5.task_id: ti.state = State.UPSTREAM_FAILED # op6 = None session.merge(ti) session.commit() session.close() job = BackfillJob(dag=dag, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, executor=executor) self.assertRaisesRegex(AirflowException, 'Some task instances failed', job.run) self.assertRaises(sqlalchemy.orm.exc.NoResultFound, dr.refresh_from_db) # the run_id should have changed, so a refresh won't work drs = DagRun.find(dag_id=dag.dag_id, execution_date=DEFAULT_DATE) dr = drs[0] self.assertEqual(dr.state, State.FAILED) tis = dr.get_task_instances() for ti in tis: if ti.task_id in (op1.task_id, op4.task_id, op6.task_id): self.assertEqual(ti.state, State.SUCCESS) elif ti.task_id == op2.task_id: self.assertEqual(ti.state, State.FAILED) elif ti.task_id == op3.task_id: self.assertEqual(ti.state, State.SKIPPED) elif ti.task_id == op5.task_id: self.assertEqual(ti.state, State.UPSTREAM_FAILED)