def test_scheduler_verify_max_active_runs(self):
    """
    Test that a dagrun will not be scheduled if max_active_runs
    has been reached
    """
    dag = DAG(
        dag_id='test_scheduler_verify_max_active_runs',
        start_date=DEFAULT_DATE)
    dag.max_active_runs = 1

    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)

    dr = scheduler.create_dag_run(dag)
    self.assertIsNone(dr)
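# NOTE: the snippets in this file assume a shared test-harness preamble along
# these lines (a sketch; the exact imports, paths, and constant values vary
# across Airflow versions and are assumptions, not part of the original file):
#
#     import datetime
#     from unittest import mock
#
#     from airflow import models, settings
#     from airflow.models import DAG, DagModel, TaskInstance as TI
#     from airflow.operators.dummy_operator import DummyOperator
#     from airflow.jobs import BackfillJob, LocalTaskJob, SchedulerJob
#     from airflow.utils.state import State
#
#     DEFAULT_DATE = datetime.datetime(2016, 1, 1)
#     INTERVAL = datetime.timedelta(hours=12)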
def test_scheduler_process_check_heartrate(self):
    """
    Test if process dag honors the heartrate
    """
    dag = DAG(
        dag_id='test_scheduler_process_check_heartrate',
        start_date=DEFAULT_DATE)

    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    orm_dag.last_scheduler_run = datetime.datetime.now()
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    scheduler.heartrate = 1000

    dag.clear()

    dr = scheduler.schedule_dag(dag)
    self.assertIsNotNone(dr)

    queue = mock.Mock()
    scheduler.process_dag(dag, queue=queue)

    queue.put.assert_not_called()
def test_scheduler_do_not_schedule_removed_task(self):
    dag = DAG(
        dag_id='test_scheduler_do_not_schedule_removed_task',
        start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.schedule_dag(dag)
    self.assertIsNotNone(dr)

    # Re-create the DAG without the task to simulate its removal
    dag = DAG(
        dag_id='test_scheduler_do_not_schedule_removed_task',
        start_date=DEFAULT_DATE)

    queue = mock.Mock()
    scheduler.process_dag(dag, queue=queue)

    queue.put.assert_not_called()
def test_scheduler_do_not_schedule_too_early(self):
    dag = DAG(
        dag_id='test_scheduler_do_not_schedule_too_early',
        start_date=datetime.datetime(2200, 1, 1))
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNone(dr)

    queue = mock.Mock()
    scheduler._process_task_instances(dag, queue=queue)

    queue.put.assert_not_called()
def test_scheduler_fail_dagrun_timeout(self):
    """
    Test that a dagrun will be set to failed if it times out
    """
    dag = DAG(
        dag_id='test_scheduler_fail_dagrun_timeout',
        start_date=DEFAULT_DATE)
    dag.dagrun_timeout = datetime.timedelta(seconds=60)

    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)
    # Backdate the start so the run is past its dagrun_timeout
    dr.start_date = datetime.datetime.now() - datetime.timedelta(days=1)
    session.merge(dr)
    session.commit()

    dr2 = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr2)

    dr.refresh_from_db(session=session)
    self.assertEqual(dr.state, State.FAILED)
def test_dag_clear(self):
    dag = DAG('test_dag_clear', start_date=DEFAULT_DATE,
              end_date=DEFAULT_DATE + datetime.timedelta(days=10))
    task0 = DummyOperator(task_id='test_dag_clear_task_0', owner='test', dag=dag)
    ti0 = TI(task=task0, execution_date=DEFAULT_DATE)
    # Next try to run will be try 1
    self.assertEqual(ti0.try_number, 1)
    ti0.run()
    self.assertEqual(ti0.try_number, 2)
    dag.clear()
    ti0.refresh_from_db()
    self.assertEqual(ti0.try_number, 2)
    self.assertEqual(ti0.state, State.NONE)
    self.assertEqual(ti0.max_tries, 1)

    task1 = DummyOperator(task_id='test_dag_clear_task_1', owner='test',
                          dag=dag, retries=2)
    ti1 = TI(task=task1, execution_date=DEFAULT_DATE)
    self.assertEqual(ti1.max_tries, 2)
    ti1.try_number = 1
    # Next try will be 2
    ti1.run()
    self.assertEqual(ti1.try_number, 3)
    self.assertEqual(ti1.max_tries, 2)

    dag.clear()
    ti0.refresh_from_db()
    ti1.refresh_from_db()
    # after clearing the dag, ti1 should show attempt 3 of 5
    self.assertEqual(ti1.max_tries, 4)
    self.assertEqual(ti1.try_number, 3)
    # after clearing the dag, ti0 should show attempt 2 of 2
    self.assertEqual(ti0.try_number, 2)
    self.assertEqual(ti0.max_tries, 1)
def test_scheduler_process_execute_task(self):
    """
    Test if process dag sends a task to the executor
    """
    dag = DAG(
        dag_id='test_scheduler_process_execute_task',
        start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.schedule_dag(dag)
    self.assertIsNotNone(dr)

    queue = mock.Mock()
    scheduler.process_dag(dag, queue=queue)

    queue.put.assert_called_with(
        ((dag.dag_id, dag_task1.task_id, DEFAULT_DATE), None)
    )

    tis = dr.get_task_instances(state=State.SCHEDULED)
    self.assertIsNotNone(tis)
def test_scheduler_process_task_instances(self):
    """
    Test if _process_task_instances puts the right task instances into the
    queue.
    """
    dag = DAG(
        dag_id='test_scheduler_process_task_instances',
        start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)

    queue = mock.Mock()
    scheduler._process_task_instances(dag, queue=queue)

    queue.append.assert_called_with(
        (dag.dag_id, dag_task1.task_id, DEFAULT_DATE)
    )
def test_scheduler_does_not_run_excluded(self):
    dag = DAG(
        dag_id='test_scheduler_does_not_run_excluded',
        start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)

    tis = dr.get_task_instances(session=session)
    for ti in tis:
        ti.state = State.EXCLUDED

    session.commit()
    session.close()

    queue = mock.Mock()
    scheduler._process_task_instances(dag, queue=queue)

    queue.put.assert_not_called()
def _test_schedule(self, interval, schedule_criteria, start_date=True,
                   default_start_date=-10):
    print("Test {} date of {} schedule job.".format(
        "start" if start_date else "end", interval))
    today = datetime.today()
    today_utc = datetime.now(timezone.utc)
    _start_date = today + timedelta(days=default_start_date)
    _end_date = None
    catchup = start_date

    def format(date):
        if date is None:
            return "-"
        return date.strftime("%Y/%m/%d %H:%M:%S")

    deltas = (
        (-2, "day_before_yesterday"),
        (-1, "yesterday"),
        (0, "today"),
        (1, "tomorrow"),
        (2, "day_after_tomorrow"),
    )

    for delta, description in deltas:
        date = today + timedelta(days=delta)
        if start_date:
            _start_date = datetime.today() + timedelta(days=delta)
        else:
            _end_date = date

        dag = DAG(f"dag_starts_{description}",
                  start_date=_start_date,
                  end_date=_end_date,
                  schedule_interval=interval,
                  catchup=catchup)

        scheduler = SchedulerJob()
        dag.clear()
        dr = scheduler.create_dag_run(dag)
        execution_date = None if dr is None else dr.execution_date

        dates = (f"today={format(today)} (UTC={format(today_utc)}), ",
                 f"start_date={format(dag.start_date)}, ",
                 f"end_date={format(dag.end_date)}, ",
                 f"execution_date={format(execution_date)}")
        dates = "\n\t" + "\n\t".join(dates)

        if schedule_criteria(delta):
            self.assertIsNotNone(dr)
            print(f"{description} is scheduled {dates}.")
        else:
            self.assertIsNone(dr)
            print(f"{description} is not scheduled {dates}.")
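# Hedged usage sketch (not part of the original suite): schedule_criteria is
# a predicate over the day offset fed into start_date/end_date above, so a
# caller asserting that only past start dates produce a run might look like
# this; the test name and the exact predicate are assumptions.
def test_daily_schedule_by_start_date(self):
    self._test_schedule(
        interval="@daily",
        schedule_criteria=lambda delta: delta < 0,
        start_date=True)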
def test_with_dag_run(self):
    value = False
    dag = DAG('shortcircuit_operator_test_with_dag_run',
              default_args={
                  'owner': 'airflow',
                  'start_date': DEFAULT_DATE
              },
              schedule_interval=INTERVAL)
    short_op = ShortCircuitOperator(task_id='make_choice',
                                    dag=dag,
                                    python_callable=lambda: value)
    branch_1 = DummyOperator(task_id='branch_1', dag=dag)
    branch_1.set_upstream(short_op)
    branch_2 = DummyOperator(task_id='branch_2', dag=dag)
    branch_2.set_upstream(branch_1)
    upstream = DummyOperator(task_id='upstream', dag=dag)
    upstream.set_downstream(short_op)
    dag.clear()

    logging.error("Tasks %s", dag.tasks)

    dr = dag.create_dagrun(run_id="manual__",
                           start_date=timezone.utcnow(),
                           execution_date=DEFAULT_DATE,
                           state=State.RUNNING)

    upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    tis = dr.get_task_instances()
    self.assertEqual(len(tis), 4)
    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.SKIPPED)
        else:
            raise Exception

    value = True
    dag.clear()
    dr.verify_integrity()
    upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    tis = dr.get_task_instances()
    self.assertEqual(len(tis), 4)
    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.NONE)
        else:
            raise Exception
def test_clear_skipped_downstream_task(self):
    """
    After a downstream task is skipped by ShortCircuitOperator, clearing the
    skipped task should not cause it to be executed.
    """
    dag = DAG('shortcircuit_clear_skipped_downstream_task',
              default_args={
                  'owner': 'airflow',
                  'start_date': DEFAULT_DATE
              },
              schedule_interval=INTERVAL)
    short_op = ShortCircuitOperator(task_id='make_choice',
                                    dag=dag,
                                    python_callable=lambda: False)
    downstream = DummyOperator(task_id='downstream', dag=dag)

    short_op >> downstream
    dag.clear()

    dr = dag.create_dagrun(
        run_type=DagRunType.MANUAL,
        start_date=timezone.utcnow(),
        execution_date=DEFAULT_DATE,
        state=State.RUNNING
    )

    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    downstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    tis = dr.get_task_instances()

    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'downstream':
            self.assertEqual(ti.state, State.SKIPPED)
        else:
            raise ValueError(f'Invalid task id {ti.task_id} found!')

    # Clear downstream
    with create_session() as session:
        clear_task_instances([t for t in tis if t.task_id == "downstream"],
                             session=session,
                             dag=dag)

    # Run downstream again
    downstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    # Check if the states are correct.
    for ti in dr.get_task_instances():
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'downstream':
            self.assertEqual(ti.state, State.SKIPPED)
        else:
            raise ValueError(f'Invalid task id {ti.task_id} found!')
def test_backfill_ordered_concurrent_execute(self):
    dag = DAG(
        dag_id='test_backfill_ordered_concurrent_execute',
        start_date=DEFAULT_DATE,
        schedule_interval="@daily")

    with dag:
        op1 = DummyOperator(task_id='leave1')
        op2 = DummyOperator(task_id='leave2')
        op3 = DummyOperator(task_id='upstream_level_1')
        op4 = DummyOperator(task_id='upstream_level_2')
        op5 = DummyOperator(task_id='upstream_level_3')
        # order randomly
        op2.set_downstream(op3)
        op1.set_downstream(op3)
        op4.set_downstream(op5)
        op3.set_downstream(op4)

    dag.clear()
    executor = TestExecutor()
    job = BackfillJob(dag=dag,
                      executor=executor,
                      start_date=DEFAULT_DATE,
                      end_date=DEFAULT_DATE + datetime.timedelta(days=2),
                      )
    job.run()

    d0 = DEFAULT_DATE
    d1 = d0 + datetime.timedelta(days=1)
    d2 = d1 + datetime.timedelta(days=1)

    # test executor history keeps a list
    history = executor.history

    self.maxDiff = None
    self.assertListEqual(
        # key[0] is the dag id, key[3] is the try_number; we don't care
        # about either of those here
        [sorted([item[-1].key[1:3] for item in batch]) for batch in history],
        [
            [
                ('leave1', d0),
                ('leave1', d1),
                ('leave1', d2),
                ('leave2', d0),
                ('leave2', d1),
                ('leave2', d2)
            ],
            [('upstream_level_1', d0), ('upstream_level_1', d1), ('upstream_level_1', d2)],
            [('upstream_level_2', d0), ('upstream_level_2', d1), ('upstream_level_2', d2)],
            [('upstream_level_3', d0), ('upstream_level_3', d1), ('upstream_level_3', d2)],
        ]
    )
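# The backfill tests here rely on a TestExecutor (elsewhere MockExecutor)
# test double whose definition is not part of this file. A minimal sketch of
# the shape test_backfill_ordered_concurrent_execute assumes (an
# approximation, not the real helper's code): it never launches processes,
# snapshots each heartbeat's queued batch into `history`, and reports every
# task instance as successful so the backfill can make progress.
class TestExecutor(BaseExecutor):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.history = []

    def heartbeat(self):
        # queued_tasks maps ti.key -> (command, priority, queue, simple_ti);
        # keep the whole tuple so callers can do item[-1].key as above.
        buffered = list(self.queued_tasks.values())
        if buffered:
            self.history.append(buffered)
        self.queued_tasks.clear()
        for item in buffered:
            self.change_state(item[-1].key, State.SUCCESS)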
def test_scheduler_verify_pool_full(self, mock_pool_full):
    """
    Test task instances not queued when pool is full
    """
    mock_pool_full.return_value = False

    dag = DAG(
        dag_id='test_scheduler_verify_pool_full',
        start_date=DEFAULT_DATE)

    DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow',
        pool='test_scheduler_verify_pool_full')

    session = settings.Session()
    pool = Pool(pool='test_scheduler_verify_pool_full', slots=1)
    session.add(pool)
    orm_dag = DagModel(dag_id=dag.dag_id)
    orm_dag.is_paused = False
    session.merge(orm_dag)
    session.commit()

    scheduler = SchedulerJob()
    dag.clear()

    # Create 2 dagruns, which will create 2 task instances.
    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)
    self.assertEqual(dr.execution_date, DEFAULT_DATE)
    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)
    queue = []
    scheduler._process_task_instances(dag, queue=queue)
    self.assertEqual(len(queue), 2)
    dagbag = SimpleDagBag([dag])

    # Recreated part of the scheduler here, to kick off tasks -> executor
    for ti_key in queue:
        task = dag.get_task(ti_key[1])
        ti = models.TaskInstance(task, ti_key[2])
        # Task starts out in the scheduled state. All tasks in the
        # scheduled state will be sent to the executor
        ti.state = State.SCHEDULED

        # Also save this task instance to the DB.
        session.merge(ti)
        session.commit()

    scheduler._execute_task_instances(dagbag,
                                      (State.SCHEDULED, State.UP_FOR_RETRY))

    # Only one task may be queued because the pool has a single slot
    self.assertEqual(len(scheduler.executor.queued_tasks), 1)
def test_without_dag_run(self):
    """This checks the defensive handling of non-existent tasks in a dag run"""
    value = False
    dag = DAG('shortcircuit_operator_test_without_dag_run',
              default_args={
                  'owner': 'airflow',
                  'start_date': DEFAULT_DATE
              },
              schedule_interval=INTERVAL)
    short_op = ShortCircuitOperator(task_id='make_choice',
                                    dag=dag,
                                    python_callable=lambda: value)
    branch_1 = DummyOperator(task_id='branch_1', dag=dag)
    branch_1.set_upstream(short_op)
    branch_2 = DummyOperator(task_id='branch_2', dag=dag)
    branch_2.set_upstream(branch_1)
    upstream = DummyOperator(task_id='upstream', dag=dag)
    upstream.set_downstream(short_op)
    dag.clear()

    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    session = Session()
    tis = session.query(TI).filter(
        TI.dag_id == dag.dag_id,
        TI.execution_date == DEFAULT_DATE
    )

    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            # should not exist
            raise Exception
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.SKIPPED)
        else:
            raise Exception

    value = True
    dag.clear()

    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            # should not exist
            raise Exception
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.NONE)
        else:
            raise Exception

    session.close()
@pytest.fixture  # assumed: the yield-style setup/teardown implies a pytest fixture
def dag(mocker):
    clear_session()
    configuration.load_test_config()
    dag = DAG(
        "test_dag",
        default_args=dict(owner="airflow", start_date=DEFAULT_DATE),
        schedule_interval=INTERVAL,
    )
    yield dag
    dag.clear()
    clear_session()
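# Hedged usage sketch (not in the original file): a pytest test pulls in the
# fixture above by naming it as a parameter; the operator and assertions here
# are illustrative assumptions.
def test_dag_fixture_provides_clean_dag(dag):
    task = DummyOperator(task_id='noop', dag=dag, owner='airflow')
    assert task.dag is dag
    assert dag.dag_id == "test_dag"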
def test_dag_get_active_runs(self):
    """
    Test to check that a DAG returns its active runs
    """
    now = datetime.datetime.now()
    six_hours_ago_to_the_hour = (now - datetime.timedelta(hours=6)).replace(
        minute=0, second=0, microsecond=0)

    START_DATE = six_hours_ago_to_the_hour
    DAG_NAME1 = 'get_active_runs_test'

    default_args = {
        'owner': 'airflow',
        'depends_on_past': False,
        'start_date': START_DATE
    }
    dag1 = DAG(DAG_NAME1,
               schedule_interval='* * * * *',
               max_active_runs=1,
               default_args=default_args)

    run_this_1 = DummyOperator(task_id='run_this_1', dag=dag1)
    run_this_2 = DummyOperator(task_id='run_this_2', dag=dag1)
    run_this_2.set_upstream(run_this_1)
    run_this_3 = DummyOperator(task_id='run_this_3', dag=dag1)
    run_this_3.set_upstream(run_this_2)

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag1.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag1.clear()

    dr = scheduler.create_dag_run(dag1)

    # We had better get a dag run
    self.assertIsNotNone(dr)

    execution_date = dr.execution_date

    running_dates = dag1.get_active_runs()

    try:
        running_date = running_dates[0]
    except IndexError:
        running_date = 'Except'

    self.assertEqual(execution_date, running_date,
                     'Running Date must match Execution Date')
def test_mark_failure_on_failure_callback(self):
    """
    Test that ensures that mark_failure in the UI fails
    the task, and executes on_failure_callback
    """
    data = {'called': False}

    def check_failure(context):
        self.assertEqual(context['dag_run'].dag_id, 'test_mark_failure')
        data['called'] = True

    dag = DAG(dag_id='test_mark_failure',
              start_date=DEFAULT_DATE,
              default_args={'owner': 'owner1'})

    task = DummyOperator(
        task_id='test_state_succeeded1',
        dag=dag,
        on_failure_callback=check_failure)

    session = settings.Session()

    dag.clear()
    dag.create_dagrun(run_id="test",
                      state=State.RUNNING,
                      execution_date=DEFAULT_DATE,
                      start_date=DEFAULT_DATE,
                      session=session)
    ti = TI(task=task, execution_date=DEFAULT_DATE)
    ti.refresh_from_db()

    job1 = LocalTaskJob(task_instance=ti,
                        ignore_ti_state=True,
                        executor=SequentialExecutor())
    from airflow.task.task_runner.standard_task_runner import StandardTaskRunner
    job1.task_runner = StandardTaskRunner(job1)
    process = multiprocessing.Process(target=job1.run)
    process.start()
    ti.refresh_from_db()
    for _ in range(0, 50):
        if ti.state == State.RUNNING:
            break
        time.sleep(0.1)
        ti.refresh_from_db()
    self.assertEqual(State.RUNNING, ti.state)
    ti.state = State.FAILED
    session.merge(ti)
    session.commit()

    job1.heartbeat_callback(session=None)
    self.assertTrue(data['called'])
    process.join(timeout=10)
    self.assertFalse(process.is_alive())
def test_dagrun_set_state_end_date(self):
    session = settings.Session()

    dag = DAG(
        'test_dagrun_set_state_end_date',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    dag.clear()

    now = timezone.utcnow()
    dr = dag.create_dagrun(run_id='test_dagrun_set_state_end_date',
                           state=State.RUNNING,
                           execution_date=now,
                           start_date=now)

    # Initial end_date should be NULL.
    # State.SUCCESS and State.FAILED are both ending states and should set
    # end_date; State.RUNNING sets end_date back to NULL.
    session.add(dr)
    session.commit()
    self.assertIsNone(dr.end_date)

    dr.set_state(State.SUCCESS)
    session.merge(dr)
    session.commit()

    dr_database = session.query(DagRun).filter(
        DagRun.run_id == 'test_dagrun_set_state_end_date'
    ).one()
    self.assertIsNotNone(dr_database.end_date)
    self.assertEqual(dr.end_date, dr_database.end_date)

    dr.set_state(State.RUNNING)
    session.merge(dr)
    session.commit()

    dr_database = session.query(DagRun).filter(
        DagRun.run_id == 'test_dagrun_set_state_end_date'
    ).one()
    self.assertIsNone(dr_database.end_date)

    dr.set_state(State.FAILED)
    session.merge(dr)
    session.commit()
    dr_database = session.query(DagRun).filter(
        DagRun.run_id == 'test_dagrun_set_state_end_date'
    ).one()
    self.assertIsNotNone(dr_database.end_date)
    self.assertEqual(dr.end_date, dr_database.end_date)
def _get_dummy_dag(self, dag_id, pool=None, task_concurrency=None):
    dag = DAG(
        dag_id=dag_id,
        start_date=DEFAULT_DATE,
        schedule_interval='@daily')

    with dag:
        DummyOperator(
            task_id='op',
            pool=pool,
            task_concurrency=task_concurrency,
            dag=dag)

    dag.clear()
    return dag
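# Hedged usage sketch (not in the original suite): everything asserted here
# follows directly from the helper above; the test name is an assumption.
def test_get_dummy_dag_shape(self):
    dag = self._get_dummy_dag('test_get_dummy_dag_shape')
    self.assertEqual(['op'], [t.task_id for t in dag.tasks])
    self.assertEqual('@daily', dag.schedule_interval)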
def test_get_states_count_upstream_ti(self):
    """
    This test tests the helper function '_get_states_count_upstream_ti' as
    a unit and inside update_state
    """
    from airflow.ti_deps.dep_context import DepContext

    get_states_count_upstream_ti = TriggerRuleDep._get_states_count_upstream_ti
    session = settings.Session()
    now = timezone.utcnow()
    dag = DAG('test_dagrun_with_pre_tis',
              start_date=DEFAULT_DATE,
              default_args={'owner': 'owner1'})

    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op3 = DummyOperator(task_id='C')
        op4 = DummyOperator(task_id='D')
        op5 = DummyOperator(task_id='E', trigger_rule=TriggerRule.ONE_FAILED)

        op1.set_downstream([op2, op3])  # op1 >> op2, op3
        op4.set_upstream([op3, op2])  # op3, op2 >> op4
        op5.set_upstream([op2, op3, op4])  # (op2, op3, op4) >> op5

    clear_db_runs()
    dag.clear()
    dr = dag.create_dagrun(
        run_id='test_dagrun_with_pre_tis',
        state=State.RUNNING,
        execution_date=now,
        start_date=now
    )

    ti_op1 = TaskInstance(task=dag.get_task(op1.task_id), execution_date=dr.execution_date)
    ti_op2 = TaskInstance(task=dag.get_task(op2.task_id), execution_date=dr.execution_date)
    ti_op3 = TaskInstance(task=dag.get_task(op3.task_id), execution_date=dr.execution_date)
    ti_op4 = TaskInstance(task=dag.get_task(op4.task_id), execution_date=dr.execution_date)
    ti_op5 = TaskInstance(task=dag.get_task(op5.task_id), execution_date=dr.execution_date)

    ti_op1.set_state(state=State.SUCCESS, session=session)
    ti_op2.set_state(state=State.FAILED, session=session)
    ti_op3.set_state(state=State.SUCCESS, session=session)
    ti_op4.set_state(state=State.SUCCESS, session=session)
    ti_op5.set_state(state=State.SUCCESS, session=session)

    session.commit()

    # check handling with cases where tasks are triggered from backfill with
    # no finished tasks; the returned 5-tuple counts the ti's upstream tasks
    # as (successes, skipped, failed, upstream_failed, done)
    finished_tasks = DepContext().ensure_finished_tasks(
        ti_op2.task.dag, ti_op2.execution_date, session)
    assert get_states_count_upstream_ti(finished_tasks=finished_tasks,
                                        ti=ti_op2) == (1, 0, 0, 0, 1)
    finished_tasks = dr.get_task_instances(state=State.finished, session=session)
    assert get_states_count_upstream_ti(finished_tasks=finished_tasks,
                                        ti=ti_op4) == (1, 0, 1, 0, 2)
    assert get_states_count_upstream_ti(finished_tasks=finished_tasks,
                                        ti=ti_op5) == (2, 0, 1, 0, 3)

    dr.update_state()
    assert State.SUCCESS == dr.state
def test_dagrun_success_conditions(self):
    session = settings.Session()

    dag = DAG(
        'test_dagrun_success_conditions',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    # A -> B
    # A -> C -> D
    # ordered: B, D, C, A or D, B, C, A or D, C, B, A
    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op3 = DummyOperator(task_id='C')
        op4 = DummyOperator(task_id='D')
        op1.set_upstream([op2, op3])
        op3.set_upstream(op4)

    dag.clear()

    now = datetime.datetime.now()
    dr = dag.create_dagrun(run_id='test_dagrun_success_conditions',
                           state=State.RUNNING,
                           execution_date=now,
                           start_date=now)

    # op1 = root
    ti_op1 = dr.get_task_instance(task_id=op1.task_id)
    ti_op1.set_state(state=State.SUCCESS, session=session)

    ti_op2 = dr.get_task_instance(task_id=op2.task_id)
    ti_op3 = dr.get_task_instance(task_id=op3.task_id)
    ti_op4 = dr.get_task_instance(task_id=op4.task_id)

    # root is successful, but unfinished tasks
    state = dr.update_state()
    self.assertEqual(State.RUNNING, state)

    # one has failed, but root is successful
    ti_op2.set_state(state=State.FAILED, session=session)
    ti_op3.set_state(state=State.SUCCESS, session=session)
    ti_op4.set_state(state=State.SUCCESS, session=session)
    state = dr.update_state()
    self.assertEqual(State.SUCCESS, state)

    # upstream dependency failed, root has not run
    ti_op1.set_state(State.NONE, session)
    state = dr.update_state()
    self.assertEqual(State.FAILED, state)
def test_scheduler_auto_align(self):
    """
    Test if the schedule_interval will be auto aligned with the start_date
    such that if the start_date coincides with the schedule the first
    execution_date will be start_date, otherwise it will be start_date +
    interval.
    """
    dag = DAG(
        dag_id='test_scheduler_auto_align_1',
        start_date=datetime.datetime(2016, 1, 1, 10, 10, 0),
        schedule_interval="4 5 * * *"
    )
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)
    self.assertEqual(dr.execution_date, datetime.datetime(2016, 1, 2, 5, 4))

    dag = DAG(
        dag_id='test_scheduler_auto_align_2',
        start_date=datetime.datetime(2016, 1, 1, 10, 10, 0),
        schedule_interval="10 10 * * *"
    )
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)
    self.assertEqual(dr.execution_date, datetime.datetime(2016, 1, 1, 10, 10))
def test_scheduler_reschedule(self):
    """
    Checks if tasks that are not taken up by the executor
    get rescheduled
    """
    executor = TestExecutor()

    dagbag = DagBag(executor=executor)
    dagbag.dags.clear()
    dagbag.executor = executor

    dag = DAG(
        dag_id='test_scheduler_reschedule',
        start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    dag.clear()
    dag.is_subdag = False

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    orm_dag.is_paused = False
    session.merge(orm_dag)
    session.commit()

    dagbag.bag_dag(dag=dag, root_dag=dag, parent_dag=dag)

    @mock.patch('airflow.models.DagBag', return_value=dagbag)
    @mock.patch('airflow.models.DagBag.collect_dags')
    def do_schedule(function, function2):
        # Use an empty file since the above mock will return the
        # expected DAGs. Also specify only a single file so that it doesn't
        # try to schedule the above DAG repeatedly.
        scheduler = SchedulerJob(num_runs=1,
                                 executor=executor,
                                 subdir=os.path.join(models.DAGS_FOLDER,
                                                     "no_dags.py"))
        scheduler.heartrate = 0
        scheduler.run()

    do_schedule()
    self.assertEqual(1, len(executor.queued_tasks))
    executor.queued_tasks.clear()

    do_schedule()
    self.assertEqual(2, len(executor.queued_tasks))
def test_scheduler_dagrun_once(self):
    """
    Test if the scheduler does not create multiple dagruns
    if a dag is scheduled with @once and a start_date
    """
    dag = DAG(
        'test_scheduler_dagrun_once',
        start_date=datetime.datetime(2015, 1, 1),
        schedule_interval="@once")

    scheduler = SchedulerJob()
    dag.clear()
    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)
    dr = scheduler.create_dag_run(dag)
    self.assertIsNone(dr)
def test_dag_clear(self):
    dag = DAG('test_dag_clear', start_date=DEFAULT_DATE,
              end_date=DEFAULT_DATE + datetime.timedelta(days=10))
    task0 = DummyOperator(task_id='test_dag_clear_task_0', owner='test', dag=dag)
    ti0 = TI(task=task0, execution_date=DEFAULT_DATE)
    dag.create_dagrun(
        execution_date=ti0.execution_date,
        state=State.RUNNING,
        run_type=DagRunType.SCHEDULED,
    )

    # Next try to run will be try 1
    assert ti0.try_number == 1
    ti0.run()
    assert ti0.try_number == 2
    dag.clear()
    ti0.refresh_from_db()
    assert ti0.try_number == 2
    assert ti0.state == State.NONE
    assert ti0.max_tries == 1

    task1 = DummyOperator(task_id='test_dag_clear_task_1', owner='test',
                          dag=dag, retries=2)
    ti1 = TI(task=task1, execution_date=DEFAULT_DATE)
    assert ti1.max_tries == 2
    ti1.try_number = 1
    # Next try will be 2
    ti1.run()
    assert ti1.try_number == 3
    assert ti1.max_tries == 2

    dag.clear()
    ti0.refresh_from_db()
    ti1.refresh_from_db()
    # after clearing the dag, ti1 should show attempt 3 of 5
    assert ti1.max_tries == 4
    assert ti1.try_number == 3
    # after clearing the dag, ti0 should show attempt 2 of 2
    assert ti0.try_number == 2
    assert ti0.max_tries == 1
def test_sub_set_subdag(self):
    dag = DAG(
        'test_sub_set_subdag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    with dag:
        op1 = DummyOperator(task_id='leave1')
        op2 = DummyOperator(task_id='leave2')
        op3 = DummyOperator(task_id='upstream_level_1')
        op4 = DummyOperator(task_id='upstream_level_2')
        op5 = DummyOperator(task_id='upstream_level_3')
        # order randomly
        op2.set_downstream(op3)
        op1.set_downstream(op3)
        op4.set_downstream(op5)
        op3.set_downstream(op4)

    dag.clear()
    dr = dag.create_dagrun(run_id="test",
                           state=State.RUNNING,
                           execution_date=DEFAULT_DATE,
                           start_date=DEFAULT_DATE)

    executor = MockExecutor()
    sub_dag = dag.sub_dag(task_regex="leave*",
                          include_downstream=False,
                          include_upstream=False)
    job = BackfillJob(dag=sub_dag,
                      start_date=DEFAULT_DATE,
                      end_date=DEFAULT_DATE,
                      executor=executor)
    job.run()

    # the run_id should have changed, so a refresh won't work
    self.assertRaises(sqlalchemy.orm.exc.NoResultFound, dr.refresh_from_db)

    drs = DagRun.find(dag_id=dag.dag_id, execution_date=DEFAULT_DATE)
    dr = drs[0]

    self.assertEqual(
        BackfillJob.ID_FORMAT_PREFIX.format(DEFAULT_DATE.isoformat()),
        dr.run_id)
    for ti in dr.get_task_instances():
        if ti.task_id == 'leave1' or ti.task_id == 'leave2':
            self.assertEqual(State.SUCCESS, ti.state)
        else:
            self.assertEqual(State.NONE, ti.state)
def _get_dag_test_max_active_limits(self, dag_id, max_active_runs=1):
    dag = DAG(
        dag_id=dag_id,
        start_date=DEFAULT_DATE,
        schedule_interval="@hourly",
        max_active_runs=max_active_runs)

    with dag:
        op1 = DummyOperator(task_id='leave1')
        op2 = DummyOperator(task_id='leave2')
        op3 = DummyOperator(task_id='upstream_level_1')
        op4 = DummyOperator(task_id='upstream_level_2')

        op1 >> op2 >> op3
        op4 >> op3

    dag.clear()
    return dag
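# Hedged usage sketch (not in the original suite): the checks follow directly
# from the helper above; the test name is an assumption.
def test_get_dag_test_max_active_limits_shape(self):
    dag = self._get_dag_test_max_active_limits(
        'test_get_dag_test_max_active_limits_shape', max_active_runs=2)
    self.assertEqual(2, dag.max_active_runs)
    self.assertEqual(
        {'leave1', 'leave2', 'upstream_level_1', 'upstream_level_2'},
        {t.task_id for t in dag.tasks})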
def test_process_bind_param_naive(self):
    """
    Check if naive datetimes are prevented from saving to the db
    """
    dag_id = 'test_process_bind_param_naive'

    # naive
    start_date = datetime.datetime.now()
    dag = DAG(dag_id=dag_id, start_date=start_date)
    dag.clear()

    with self.assertRaises((ValueError, StatementError)):
        dag.create_dagrun(
            run_id=start_date.isoformat(),
            state=State.NONE,
            execution_date=start_date,
            start_date=start_date,
            session=self.session
        )
    dag.clear()
def test_schedule_dag_once(self):
    """
    Tests scheduling a dag scheduled for @once - should be scheduled the
    first time it is called, and not scheduled the second.
    """
    dag_id = "test_schedule_dag_once"
    dag = DAG(dag_id=dag_id)
    dag.schedule_interval = '@once'
    dag.add_task(BaseOperator(
        task_id="faketastic",
        owner='Also fake',
        start_date=datetime_tz(2015, 1, 2, 0, 0)))
    dag_run = DagFileProcessor(dag_ids=[], log=mock.MagicMock()).create_dag_run(dag)
    dag_run2 = DagFileProcessor(dag_ids=[], log=mock.MagicMock()).create_dag_run(dag)

    self.assertIsNotNone(dag_run)
    self.assertIsNone(dag_run2)
    dag.clear()
    self._clean_up(dag_id)
def test_schedule_dag_relativedelta(self):
    """
    Tests scheduling a dag with a relativedelta schedule_interval
    """
    dag_id = "test_schedule_dag_relativedelta"
    delta = relativedelta(hours=+1)
    dag = DAG(dag_id=dag_id, schedule_interval=delta)
    dag.add_task(BaseOperator(
        task_id="faketastic",
        owner='Also fake',
        start_date=datetime_tz(2015, 1, 2, 0, 0)))

    dag_file_processor = DagFileProcessor(dag_ids=[], log=mock.MagicMock())
    dag_run = dag_file_processor.create_dag_run(dag)
    self.assertIsNotNone(dag_run)
    self.assertEqual(dag.dag_id, dag_run.dag_id)
    self.assertIsNotNone(dag_run.run_id)
    self.assertNotEqual('', dag_run.run_id)
    self.assertEqual(
        datetime_tz(2015, 1, 2, 0, 0),
        dag_run.execution_date,
        msg='dag_run.execution_date did not match expectation: {0}'
            .format(dag_run.execution_date)
    )
    self.assertEqual(State.RUNNING, dag_run.state)
    self.assertFalse(dag_run.external_trigger)

    dag_run2 = dag_file_processor.create_dag_run(dag)
    self.assertIsNotNone(dag_run2)
    self.assertEqual(dag.dag_id, dag_run2.dag_id)
    self.assertIsNotNone(dag_run2.run_id)
    self.assertNotEqual('', dag_run2.run_id)
    self.assertEqual(
        datetime_tz(2015, 1, 2, 0, 0) + delta,
        dag_run2.execution_date,
        msg='dag_run2.execution_date did not match expectation: {0}'
            .format(dag_run2.execution_date)
    )
    self.assertEqual(State.RUNNING, dag_run2.state)
    self.assertFalse(dag_run2.external_trigger)
    dag.clear()
    self._clean_up(dag_id)
def test_backfill_rerun_upstream_failed_tasks(self):
    dag = DAG(
        dag_id='test_backfill_rerun_upstream_failed',
        start_date=DEFAULT_DATE,
        schedule_interval='@daily')

    with dag:
        t1 = DummyOperator(task_id='test_backfill_rerun_upstream_failed_task-1',
                           dag=dag)
        t2 = DummyOperator(task_id='test_backfill_rerun_upstream_failed_task-2',
                           dag=dag)
        t1.set_upstream(t2)

    dag.clear()
    executor = MockExecutor()

    job = BackfillJob(dag=dag,
                      executor=executor,
                      start_date=DEFAULT_DATE,
                      end_date=DEFAULT_DATE + datetime.timedelta(days=2),
                      )
    job.run()

    ti = TI(task=dag.get_task('test_backfill_rerun_upstream_failed_task-1'),
            execution_date=DEFAULT_DATE)
    ti.refresh_from_db()
    ti.set_state(State.UPSTREAM_FAILED)

    job = BackfillJob(dag=dag,
                      executor=executor,
                      start_date=DEFAULT_DATE,
                      end_date=DEFAULT_DATE + datetime.timedelta(days=2),
                      rerun_failed_tasks=True)
    job.run()
    ti = TI(task=dag.get_task('test_backfill_rerun_upstream_failed_task-1'),
            execution_date=DEFAULT_DATE)
    ti.refresh_from_db()
    self.assertEqual(ti.state, State.SUCCESS)
def test_backfill_run_rescheduled(self):
    dag = DAG(
        dag_id='test_backfill_run_rescheduled',
        start_date=DEFAULT_DATE,
        schedule_interval='@daily')

    with dag:
        DummyOperator(
            task_id='test_backfill_run_rescheduled_task-1',
            dag=dag,
        )

    dag.clear()
    executor = TestExecutor()

    job = BackfillJob(dag=dag,
                      executor=executor,
                      start_date=DEFAULT_DATE,
                      end_date=DEFAULT_DATE + datetime.timedelta(days=2),
                      )
    job.run()

    ti = TI(task=dag.get_task('test_backfill_run_rescheduled_task-1'),
            execution_date=DEFAULT_DATE)
    ti.refresh_from_db()
    ti.set_state(State.UP_FOR_RESCHEDULE)

    job = BackfillJob(dag=dag,
                      executor=executor,
                      start_date=DEFAULT_DATE,
                      end_date=DEFAULT_DATE + datetime.timedelta(days=2),
                      rerun_failed_tasks=True)
    job.run()
    ti = TI(task=dag.get_task('test_backfill_run_rescheduled_task-1'),
            execution_date=DEFAULT_DATE)
    ti.refresh_from_db()
    self.assertEqual(ti.state, State.SUCCESS)
def test_scheduler_add_new_task(self):
    """
    Test if a task instance will be added if the dag is updated
    """
    dag = DAG(
        dag_id='test_scheduler_add_new_task',
        start_date=DEFAULT_DATE)

    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)

    tis = dr.get_task_instances()
    self.assertEqual(len(tis), 1)

    dag_task2 = DummyOperator(
        task_id='dummy2',
        dag=dag,
        owner='airflow')

    queue = mock.Mock()
    scheduler._process_task_instances(dag, queue=queue)

    tis = dr.get_task_instances()
    self.assertEqual(len(tis), 2)
def test_scheduler_verify_max_active_runs_and_dagrun_timeout(self):
    """
    Test that a dagrun will not be scheduled if max_active_runs has been
    reached and dagrun_timeout is not reached.
    Test that a dagrun will be scheduled if max_active_runs has been reached
    but dagrun_timeout is also reached.
    """
    dag = DAG(
        dag_id='test_scheduler_verify_max_active_runs_and_dagrun_timeout',
        start_date=DEFAULT_DATE)
    dag.max_active_runs = 1
    dag.dagrun_timeout = datetime.timedelta(seconds=60)

    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)

    # Should not be scheduled as the DagRun has not timed out and
    # max_active_runs is reached
    new_dr = scheduler.create_dag_run(dag)
    self.assertIsNone(new_dr)

    # Should be scheduled as dagrun_timeout has passed
    dr.start_date = datetime.datetime.now() - datetime.timedelta(days=1)
    session.merge(dr)
    session.commit()
    new_dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(new_dr)
def test_scheduler_max_active_runs_respected_after_clear(self):
    """
    Test if _process_task_instances only schedules ti's up to
    max_active_runs (related to issue AIRFLOW-137)
    """
    dag = DAG(
        dag_id='test_scheduler_max_active_runs_respected_after_clear',
        start_date=DEFAULT_DATE)
    dag.max_active_runs = 3

    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    # First create up to 3 dagruns in RUNNING state.
    scheduler.create_dag_run(dag)

    # Reduce max_active_runs to 1
    dag.max_active_runs = 1

    queue = mock.Mock()
    # and schedule them in, so we can check how many
    # tasks are put on the queue (should be one, not 3)
    scheduler._process_task_instances(dag, queue=queue)

    queue.append.assert_called_with(
        (dag.dag_id, dag_task1.task_id, DEFAULT_DATE)
    )
class BranchOperatorTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        super(BranchOperatorTest, cls).setUpClass()

        with create_session() as session:
            session.query(DagRun).delete()
            session.query(TI).delete()

    def setUp(self):
        self.dag = DAG('branch_operator_test',
                       default_args={
                           'owner': 'airflow',
                           'start_date': DEFAULT_DATE},
                       schedule_interval=INTERVAL)

        self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
        self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag)

    def tearDown(self):
        super().tearDown()

        with create_session() as session:
            session.query(DagRun).delete()
            session.query(TI).delete()

    def test_without_dag_run(self):
        """This checks the defensive handling of non-existent tasks in a dag run"""
        self.branch_op = BranchPythonOperator(task_id='make_choice',
                                              dag=self.dag,
                                              python_callable=lambda: 'branch_1')
        self.branch_1.set_upstream(self.branch_op)
        self.branch_2.set_upstream(self.branch_op)
        self.dag.clear()

        self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        with create_session() as session:
            tis = session.query(TI).filter(
                TI.dag_id == self.dag.dag_id,
                TI.execution_date == DEFAULT_DATE
            )

            for ti in tis:
                if ti.task_id == 'make_choice':
                    self.assertEqual(ti.state, State.SUCCESS)
                elif ti.task_id == 'branch_1':
                    # should exist with state None
                    self.assertEqual(ti.state, State.NONE)
                elif ti.task_id == 'branch_2':
                    self.assertEqual(ti.state, State.SKIPPED)
                else:
                    raise Exception

    def test_branch_list_without_dag_run(self):
        """This checks if the BranchPythonOperator supports branching off to a list of tasks."""
        self.branch_op = BranchPythonOperator(task_id='make_choice',
                                              dag=self.dag,
                                              python_callable=lambda: ['branch_1', 'branch_2'])
        self.branch_1.set_upstream(self.branch_op)
        self.branch_2.set_upstream(self.branch_op)
        self.branch_3 = DummyOperator(task_id='branch_3', dag=self.dag)
        self.branch_3.set_upstream(self.branch_op)
        self.dag.clear()

        self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        with create_session() as session:
            tis = session.query(TI).filter(
                TI.dag_id == self.dag.dag_id,
                TI.execution_date == DEFAULT_DATE
            )

            expected = {
                "make_choice": State.SUCCESS,
                "branch_1": State.NONE,
                "branch_2": State.NONE,
                "branch_3": State.SKIPPED,
            }

            for ti in tis:
                if ti.task_id in expected:
                    self.assertEqual(ti.state, expected[ti.task_id])
                else:
                    raise Exception

    def test_with_dag_run(self):
        self.branch_op = BranchPythonOperator(task_id='make_choice',
                                              dag=self.dag,
                                              python_callable=lambda: 'branch_1')
        self.branch_1.set_upstream(self.branch_op)
        self.branch_2.set_upstream(self.branch_op)
        self.dag.clear()

        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                self.assertEqual(ti.state, State.NONE)
            elif ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                raise Exception

    def test_with_skip_in_branch_downstream_dependencies(self):
        self.branch_op = BranchPythonOperator(task_id='make_choice',
                                              dag=self.dag,
                                              python_callable=lambda: 'branch_1')

        self.branch_op >> self.branch_1 >> self.branch_2
        self.branch_op >> self.branch_2
        self.dag.clear()

        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                self.assertEqual(ti.state, State.NONE)
            elif ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.NONE)
            else:
                raise Exception

    def test_with_skip_in_branch_downstream_dependencies2(self):
        self.branch_op = BranchPythonOperator(task_id='make_choice',
                                              dag=self.dag,
                                              python_callable=lambda: 'branch_2')

        self.branch_op >> self.branch_1 >> self.branch_2
        self.branch_op >> self.branch_2
        self.dag.clear()

        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                self.assertEqual(ti.state, State.SKIPPED)
            elif ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.NONE)
            else:
                raise Exception
class BigQueryOperatorTest(unittest.TestCase):
    def setUp(self):
        configuration.conf.load_test_config()
        self.dagbag = models.DagBag(
            dag_folder='/dev/null', include_examples=True)
        self.args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
        self.dag = DAG(TEST_DAG_ID, default_args=self.args)

    def tearDown(self):
        session = Session()
        session.query(models.TaskInstance).filter_by(
            dag_id=TEST_DAG_ID).delete()
        session.query(TaskFail).filter_by(
            dag_id=TEST_DAG_ID).delete()
        session.commit()
        session.close()

    @mock.patch('airflow.contrib.operators.bigquery_operator.BigQueryHook')
    def test_execute(self, mock_hook):
        operator = BigQueryOperator(
            task_id=TASK_ID,
            sql='Select * from test_table',
            destination_dataset_table=None,
            write_disposition='WRITE_EMPTY',
            allow_large_results=False,
            flatten_results=None,
            bigquery_conn_id='google_cloud_default',
            udf_config=None,
            use_legacy_sql=True,
            maximum_billing_tier=None,
            maximum_bytes_billed=None,
            create_disposition='CREATE_IF_NEEDED',
            schema_update_options=(),
            query_params=None,
            labels=None,
            priority='INTERACTIVE',
            time_partitioning=None,
            api_resource_configs=None,
            cluster_fields=None,
        )

        operator.execute(MagicMock())
        mock_hook.return_value \
            .get_conn.return_value \
            .cursor.return_value \
            .run_query \
            .assert_called_once_with(
                sql='Select * from test_table',
                destination_dataset_table=None,
                write_disposition='WRITE_EMPTY',
                allow_large_results=False,
                flatten_results=None,
                udf_config=None,
                maximum_billing_tier=None,
                maximum_bytes_billed=None,
                create_disposition='CREATE_IF_NEEDED',
                schema_update_options=(),
                query_params=None,
                labels=None,
                priority='INTERACTIVE',
                time_partitioning=None,
                api_resource_configs=None,
                cluster_fields=None,
            )

    @mock.patch('airflow.contrib.operators.bigquery_operator.BigQueryHook')
    def test_bigquery_operator_defaults(self, mock_hook):
        operator = BigQueryOperator(
            task_id=TASK_ID,
            sql='Select * from test_table',
            dag=self.dag,
            default_args=self.args
        )

        operator.execute(MagicMock())
        mock_hook.return_value \
            .get_conn.return_value \
            .cursor.return_value \
            .run_query \
            .assert_called_once_with(
                sql='Select * from test_table',
                destination_dataset_table=None,
                write_disposition='WRITE_EMPTY',
                allow_large_results=False,
                flatten_results=None,
                udf_config=None,
                maximum_billing_tier=None,
                maximum_bytes_billed=None,
                create_disposition='CREATE_IF_NEEDED',
                schema_update_options=(),
                query_params=None,
                labels=None,
                priority='INTERACTIVE',
                time_partitioning=None,
                api_resource_configs=None,
                cluster_fields=None,
            )
        self.assertTrue(isinstance(operator.sql, six.string_types))
        ti = TaskInstance(task=operator, execution_date=DEFAULT_DATE)
        ti.render_templates()
        self.assertTrue(isinstance(ti.task.sql, six.string_types))

    @mock.patch('airflow.contrib.operators.bigquery_operator.BigQueryHook')
    def test_bigquery_operator_extra_link(self, mock_hook):
        bigquery_task = BigQueryOperator(
            task_id=TASK_ID,
            sql='SELECT * FROM test_table',
            dag=self.dag,
        )
        self.dag.clear()

        ti = TaskInstance(
            task=bigquery_task,
            execution_date=DEFAULT_DATE,
        )

        job_id = '12345'
        ti.xcom_push(key='job_id', value=job_id)

        self.assertEqual(
            'https://console.cloud.google.com/bigquery?j={job_id}'.format(job_id=job_id),
            bigquery_task.get_extra_links(DEFAULT_DATE, BigQueryConsoleLink.name),
        )

        self.assertEqual(
            '',
            bigquery_task.get_extra_links(datetime(2019, 1, 1), BigQueryConsoleLink.name),
        )