def test_operator_clear(self):
    dag = DAG('test_operator_clear', start_date=DEFAULT_DATE,
              end_date=DEFAULT_DATE + datetime.timedelta(days=10))
    t1 = DummyOperator(task_id='bash_op', owner='test', dag=dag)
    t2 = DummyOperator(task_id='dummy_op', owner='test', dag=dag, retries=1)
    t2.set_upstream(t1)

    ti1 = TI(task=t1, execution_date=DEFAULT_DATE)
    ti2 = TI(task=t2, execution_date=DEFAULT_DATE)
    ti2.run()
    # Dependency not met
    self.assertEqual(ti2.try_number, 1)
    self.assertEqual(ti2.max_tries, 1)

    t2.clear(upstream=True)
    ti1.run()
    ti2.run()
    self.assertEqual(ti1.try_number, 2)
    # max_tries is 0 because there is no task instance in db for ti1
    # so clear won't change the max_tries.
    self.assertEqual(ti1.max_tries, 0)
    self.assertEqual(ti2.try_number, 2)
    # try_number (0) + retries (1)
    self.assertEqual(ti2.max_tries, 1)
def test_skipping(self):
    latest_task = LatestOnlyOperator(
        task_id='latest', dag=self.dag)
    downstream_task = DummyOperator(
        task_id='downstream', dag=self.dag)
    downstream_task.set_upstream(latest_task)

    latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)

    latest_instances = get_task_instances('latest')
    exec_date_to_latest_state = {
        ti.execution_date: ti.state for ti in latest_instances}
    assert exec_date_to_latest_state == {
        datetime.datetime(2016, 1, 1): 'success',
        datetime.datetime(2016, 1, 1, 12): 'success',
        datetime.datetime(2016, 1, 2): 'success',
    }

    downstream_instances = get_task_instances('downstream')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    assert exec_date_to_downstream_state == {
        datetime.datetime(2016, 1, 1): 'skipped',
        datetime.datetime(2016, 1, 1, 12): 'skipped',
        datetime.datetime(2016, 1, 2): 'success',
    }
class BranchOperatorTest(unittest.TestCase):
    def setUp(self):
        self.dag = DAG('branch_operator_test',
                       default_args={
                           'owner': 'airflow',
                           'start_date': DEFAULT_DATE},
                       schedule_interval=INTERVAL)
        self.branch_op = BranchPythonOperator(task_id='make_choice',
                                              dag=self.dag,
                                              python_callable=lambda: 'branch_1')

        self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
        self.branch_1.set_upstream(self.branch_op)
        self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag)
        self.branch_2.set_upstream(self.branch_op)
        self.dag.clear()

    def test_without_dag_run(self):
        """This checks the defense against non-existent tasks in a dag run"""
        self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        session = Session()
        tis = session.query(TI).filter(
            TI.dag_id == self.dag.dag_id,
            TI.execution_date == DEFAULT_DATE
        )
        session.close()

        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                # should exist with state None
                self.assertEqual(ti.state, State.NONE)
            elif ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                raise Exception

    def test_with_dag_run(self):
        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=datetime.datetime.now(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                self.assertEqual(ti.state, State.NONE)
            elif ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                raise Exception
def test_with_dag_run(self):
    value = False
    dag = DAG('shortcircuit_operator_test_with_dag_run',
              default_args={
                  'owner': 'airflow',
                  'start_date': DEFAULT_DATE
              },
              schedule_interval=INTERVAL)
    short_op = ShortCircuitOperator(task_id='make_choice',
                                    dag=dag,
                                    python_callable=lambda: value)
    branch_1 = DummyOperator(task_id='branch_1', dag=dag)
    branch_1.set_upstream(short_op)
    branch_2 = DummyOperator(task_id='branch_2', dag=dag)
    branch_2.set_upstream(branch_1)
    upstream = DummyOperator(task_id='upstream', dag=dag)
    upstream.set_downstream(short_op)
    dag.clear()

    logging.error("Tasks {}".format(dag.tasks))
    dr = dag.create_dagrun(
        run_id="manual__",
        start_date=datetime.datetime.now(),
        execution_date=DEFAULT_DATE,
        state=State.RUNNING
    )

    upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    tis = dr.get_task_instances()
    self.assertEqual(len(tis), 4)
    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.SKIPPED)
        else:
            raise Exception

    value = True
    dag.clear()
    dr.verify_integrity()
    upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    tis = dr.get_task_instances()
    self.assertEqual(len(tis), 4)
    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.NONE)
        else:
            raise Exception
def test_get_states_count_upstream_ti(self):
    """
    Test the helper function '_get_states_count_upstream_ti'
    as a unit and inside update_state.
    """
    from airflow.ti_deps.dep_context import DepContext

    get_states_count_upstream_ti = TriggerRuleDep._get_states_count_upstream_ti
    session = settings.Session()
    now = timezone.utcnow()
    dag = DAG(
        'test_dagrun_with_pre_tis',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op3 = DummyOperator(task_id='C')
        op4 = DummyOperator(task_id='D')
        op5 = DummyOperator(task_id='E', trigger_rule=TriggerRule.ONE_FAILED)

        op1.set_downstream([op2, op3])  # op1 >> op2, op3
        op4.set_upstream([op3, op2])  # op3, op2 >> op4
        op5.set_upstream([op2, op3, op4])  # (op2, op3, op4) >> op5

    clear_db_runs()
    dag.clear()
    dr = dag.create_dagrun(run_id='test_dagrun_with_pre_tis',
                           state=State.RUNNING,
                           execution_date=now,
                           start_date=now)

    ti_op1 = TaskInstance(task=dag.get_task(op1.task_id),
                          execution_date=dr.execution_date)
    ti_op2 = TaskInstance(task=dag.get_task(op2.task_id),
                          execution_date=dr.execution_date)
    ti_op3 = TaskInstance(task=dag.get_task(op3.task_id),
                          execution_date=dr.execution_date)
    ti_op4 = TaskInstance(task=dag.get_task(op4.task_id),
                          execution_date=dr.execution_date)
    ti_op5 = TaskInstance(task=dag.get_task(op5.task_id),
                          execution_date=dr.execution_date)

    ti_op1.set_state(state=State.SUCCESS, session=session)
    ti_op2.set_state(state=State.FAILED, session=session)
    ti_op3.set_state(state=State.SUCCESS, session=session)
    ti_op4.set_state(state=State.SUCCESS, session=session)
    ti_op5.set_state(state=State.SUCCESS, session=session)
    session.commit()

    # check the case where tasks are triggered from backfill
    # with no finished tasks; the returned tuple counts upstream
    # (successes, skipped, failed, upstream_failed, done)
    finished_tasks = DepContext().ensure_finished_tasks(
        ti_op2.task.dag, ti_op2.execution_date, session)
    self.assertEqual(
        get_states_count_upstream_ti(finished_tasks=finished_tasks, ti=ti_op2),
        (1, 0, 0, 0, 1))
    finished_tasks = dr.get_task_instances(
        state=State.finished() + [State.UPSTREAM_FAILED], session=session)
    self.assertEqual(
        get_states_count_upstream_ti(finished_tasks=finished_tasks, ti=ti_op4),
        (1, 0, 1, 0, 2))
    self.assertEqual(
        get_states_count_upstream_ti(finished_tasks=finished_tasks, ti=ti_op5),
        (2, 0, 1, 0, 3))

    dr.update_state()
    self.assertEqual(State.SUCCESS, dr.state)
def test_dagrun_update_state_end_date(self):
    session = settings.Session()
    dag = DAG(
        'test_dagrun_update_state_end_date',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    # A -> B
    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op1.set_upstream(op2)

    dag.clear()

    now = timezone.utcnow()
    dr = dag.create_dagrun(run_id='test_dagrun_update_state_end_date',
                           state=State.RUNNING,
                           execution_date=now,
                           start_date=now)

    # Initial end_date should be NULL.
    # State.SUCCESS and State.FAILED are both terminal states and should set end_date.
    # State.RUNNING sets end_date back to NULL.
    session.merge(dr)
    session.commit()
    self.assertIsNone(dr.end_date)

    ti_op1 = dr.get_task_instance(task_id=op1.task_id)
    ti_op1.set_state(state=State.SUCCESS, session=session)
    ti_op2 = dr.get_task_instance(task_id=op2.task_id)
    ti_op2.set_state(state=State.SUCCESS, session=session)

    dr.update_state()

    dr_database = session.query(DagRun).filter(
        DagRun.run_id == 'test_dagrun_update_state_end_date').one()
    self.assertIsNotNone(dr_database.end_date)
    self.assertEqual(dr.end_date, dr_database.end_date)

    ti_op1.set_state(state=State.RUNNING, session=session)
    ti_op2.set_state(state=State.RUNNING, session=session)
    dr.update_state()

    dr_database = session.query(DagRun).filter(
        DagRun.run_id == 'test_dagrun_update_state_end_date').one()

    self.assertEqual(dr._state, State.RUNNING)
    self.assertIsNone(dr.end_date)
    self.assertIsNone(dr_database.end_date)

    ti_op1.set_state(state=State.FAILED, session=session)
    ti_op2.set_state(state=State.FAILED, session=session)
    dr.update_state()

    dr_database = session.query(DagRun).filter(
        DagRun.run_id == 'test_dagrun_update_state_end_date').one()

    self.assertIsNotNone(dr_database.end_date)
    self.assertEqual(dr.end_date, dr_database.end_date)
def test_without_dag_run(self):
    """This checks the defense against non-existent tasks in a dag run"""
    value = False
    dag = DAG('shortcircuit_operator_test_without_dag_run',
              default_args={
                  'owner': 'airflow',
                  'start_date': DEFAULT_DATE
              },
              schedule_interval=INTERVAL)
    short_op = ShortCircuitOperator(task_id='make_choice',
                                    dag=dag,
                                    python_callable=lambda: value)
    branch_1 = DummyOperator(task_id='branch_1', dag=dag)
    branch_1.set_upstream(short_op)
    branch_2 = DummyOperator(task_id='branch_2', dag=dag)
    branch_2.set_upstream(branch_1)
    upstream = DummyOperator(task_id='upstream', dag=dag)
    upstream.set_downstream(short_op)
    dag.clear()

    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    session = Session()
    tis = session.query(TI).filter(
        TI.dag_id == dag.dag_id,
        TI.execution_date == DEFAULT_DATE
    )

    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            # should not exist
            raise Exception
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.SKIPPED)
        else:
            raise Exception

    value = True
    dag.clear()

    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            # should not exist
            raise Exception
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.NONE)
        else:
            raise Exception

    session.close()
def test_dag_get_active_runs(self):
    """
    Test to check that a DAG returns its active runs
    """
    now = datetime.datetime.now()
    six_hours_ago_to_the_hour = (now - datetime.timedelta(hours=6)).replace(
        minute=0, second=0, microsecond=0)

    START_DATE = six_hours_ago_to_the_hour
    DAG_NAME1 = 'get_active_runs_test'

    default_args = {
        'owner': 'airflow',
        'depends_on_past': False,
        'start_date': START_DATE
    }
    dag1 = DAG(DAG_NAME1,
               schedule_interval='* * * * *',
               max_active_runs=1,
               default_args=default_args)

    run_this_1 = DummyOperator(task_id='run_this_1', dag=dag1)
    run_this_2 = DummyOperator(task_id='run_this_2', dag=dag1)
    run_this_2.set_upstream(run_this_1)
    run_this_3 = DummyOperator(task_id='run_this_3', dag=dag1)
    run_this_3.set_upstream(run_this_2)

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag1.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag1.clear()

    dr = scheduler.create_dag_run(dag1)

    # We had better get a dag run
    self.assertIsNotNone(dr)

    execution_date = dr.execution_date

    running_dates = dag1.get_active_runs()

    try:
        running_date = running_dates[0]
    except IndexError:
        running_date = 'Except'

    self.assertEqual(execution_date, running_date,
                     'Running Date must match Execution Date')
def test_dagrun_success_conditions(self):
    session = settings.Session()

    dag = DAG(
        'test_dagrun_success_conditions',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    # A -> B
    # A -> C -> D
    # ordered: B, D, C, A or D, B, C, A or D, C, B, A
    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op3 = DummyOperator(task_id='C')
        op4 = DummyOperator(task_id='D')
        op1.set_upstream([op2, op3])
        op3.set_upstream(op4)

    dag.clear()

    now = datetime.datetime.now()
    dr = dag.create_dagrun(run_id='test_dagrun_success_conditions',
                           state=State.RUNNING,
                           execution_date=now,
                           start_date=now)

    # op1 = root
    ti_op1 = dr.get_task_instance(task_id=op1.task_id)
    ti_op1.set_state(state=State.SUCCESS, session=session)

    ti_op2 = dr.get_task_instance(task_id=op2.task_id)
    ti_op3 = dr.get_task_instance(task_id=op3.task_id)
    ti_op4 = dr.get_task_instance(task_id=op4.task_id)

    # root is successful, but unfinished tasks
    state = dr.update_state()
    self.assertEqual(State.RUNNING, state)

    # one has failed, but root is successful
    ti_op2.set_state(state=State.FAILED, session=session)
    ti_op3.set_state(state=State.SUCCESS, session=session)
    ti_op4.set_state(state=State.SUCCESS, session=session)
    state = dr.update_state()
    self.assertEqual(State.SUCCESS, state)

    # upstream dependency failed, root has not run
    ti_op1.set_state(State.NONE, session)
    state = dr.update_state()
    self.assertEqual(State.FAILED, state)
def _make_sensor(self, return_value, task_id=SENSOR_OP, **kwargs):
    poke_interval = 'poke_interval'
    timeout = 'timeout'

    if poke_interval not in kwargs:
        kwargs[poke_interval] = 0
    if timeout not in kwargs:
        kwargs[timeout] = 0

    sensor = DummySensor(task_id=task_id,
                         return_value=return_value,
                         dag=self.dag,
                         **kwargs)

    dummy_op = DummyOperator(task_id=DUMMY_OP, dag=self.dag)
    dummy_op.set_upstream(sensor)
    return sensor
def test_skipping_dagrun(self):
    latest_task = LatestOnlyOperator(
        task_id='latest', dag=self.dag)
    downstream_task = DummyOperator(
        task_id='downstream', dag=self.dag)
    downstream_task2 = DummyOperator(
        task_id='downstream_2', dag=self.dag)
    downstream_task.set_upstream(latest_task)
    downstream_task2.set_upstream(downstream_task)

    latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

    latest_instances = get_task_instances('latest')
    self.dag_file_processor._process_task_instances(
        self.dag, task_instances_list=latest_instances)
    exec_date_to_latest_state = {
        ti.execution_date: ti.state for ti in latest_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'success',
        timezone.datetime(2016, 1, 1, 12): 'success',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_latest_state)

    downstream_instances = get_task_instances('downstream')
    self.dag_file_processor._process_task_instances(
        self.dag, task_instances_list=downstream_instances)
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'skipped',
        timezone.datetime(2016, 1, 1, 12): 'skipped',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_downstream_state)

    downstream_instances = get_task_instances('downstream_2')
    self.dag_file_processor._process_task_instances(
        self.dag, task_instances_list=downstream_instances)
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'skipped',
        timezone.datetime(2016, 1, 1, 12): 'skipped',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_downstream_state)
def _create_events_branch(self, task_id):
    """Create the DAG branch with sensor and operator (to be called by each subclass)."""
    self.decrypt_connection()
    tables = self.get_events()
    tables_op = DummyOperator(task_id=task_id, dag=self.dag,
                              resources=dict(organizationId='astronomer'))
    tables_op.set_upstream(self.upstream_task)
    for table in tables:
        sensor = self.create_key_sensor(table=table)
        sensor.set_upstream(tables_op)
        copy_task = self.create_copy_operator(table=table)
        if not copy_task:
            logger.info('Skipping table due to invalid config')
            continue
        copy_task.set_upstream(sensor)
def _make_smart_operator(self, index, **kwargs):
    poke_interval = 'poke_interval'
    smart_sensor_timeout = 'smart_sensor_timeout'
    if poke_interval not in kwargs:
        kwargs[poke_interval] = 0
    if smart_sensor_timeout not in kwargs:
        kwargs[smart_sensor_timeout] = 0

    smart_task = DummySmartSensor(task_id=SMART_OP + "_" + str(index),
                                  dag=self.dag,
                                  **kwargs)

    dummy_op = DummyOperator(task_id=DUMMY_OP, dag=self.dag)
    dummy_op.set_upstream(smart_task)
    return smart_task
def generated_sub_dag(parent_dag_name, child_dag_name, start_date, schedule_interval):
    dag = DAG('%s.%s' % (parent_dag_name, child_dag_name),
              schedule_interval=schedule_interval,
              default_args=default_args)

    task_count = 3
    previous_task = None
    for i in range(task_count):
        task = DummyOperator(
            task_id='generated_task_' + str(i),
            dag=dag,
        )
        if previous_task:
            task.set_upstream(previous_task)
        previous_task = task

    return dag
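# Editor's sketch (not part of the original source): a factory like
# generated_sub_dag is typically attached to a parent DAG through a
# SubDagOperator whose task_id matches the child dag name, since the child's
# dag_id must be '<parent>.<task_id>'. The names parent_dag and 'generated'
# below are assumptions for illustration.
from airflow.operators.subdag_operator import SubDagOperator

generated = SubDagOperator(
    task_id='generated',
    subdag=generated_sub_dag(parent_dag.dag_id, 'generated',
                             parent_dag.start_date,
                             parent_dag.schedule_interval),
    dag=parent_dag,
)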
def test_dagrun_no_deadlock_with_shutdown(self):
    session = settings.Session()
    dag = DAG('test_dagrun_no_deadlock_with_shutdown',
              start_date=DEFAULT_DATE)
    with dag:
        op1 = DummyOperator(task_id='upstream_task')
        op2 = DummyOperator(task_id='downstream_task')
        op2.set_upstream(op1)

    dr = dag.create_dagrun(run_id='test_dagrun_no_deadlock_with_shutdown',
                           state=State.RUNNING,
                           execution_date=DEFAULT_DATE,
                           start_date=DEFAULT_DATE)
    upstream_ti = dr.get_task_instance(task_id='upstream_task')
    upstream_ti.set_state(State.SHUTDOWN, session=session)

    dr.update_state()
    self.assertEqual(dr.state, State.RUNNING)
def _make_sensor(self, return_value, **kwargs):
    poke_interval = 'poke_interval'
    timeout = 'timeout'
    if poke_interval not in kwargs:
        kwargs[poke_interval] = 0
    if timeout not in kwargs:
        kwargs[timeout] = 0

    sensor = DummySensor(
        task_id=SENSOR_OP,
        return_value=return_value,
        dag=self.dag,
        **kwargs
    )

    dummy_op = DummyOperator(
        task_id=DUMMY_OP,
        dag=self.dag
    )
    dummy_op.set_upstream(sensor)
    return sensor
def test_ExecutionDateBranchOperator(self, dag):
    date_branches = [
        (None, DEFAULT_DATE - INTERVAL, 'before'),
        (DEFAULT_DATE, DEFAULT_DATE, 'during'),
        (DEFAULT_DATE + INTERVAL, None, 'after'),
    ]
    op = ExecutionDateBranchOperator(
        date_branches=date_branches, task_id='date_branch', dag=dag)
    before = DummyOperator(task_id='before', dag=dag)
    before.set_upstream(op)
    during = DummyOperator(task_id='during', dag=dag)
    during.set_upstream(op)
    after = DummyOperator(task_id='after', dag=dag)
    after.set_upstream(op)

    dr = dag.create_dagrun(
        run_id="manual__",
        start_date=datetime.utcnow(),
        execution_date=DEFAULT_DATE,
        state=State.RUNNING
    )
    op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    expected = [
        ('date_branch', State.SUCCESS),
        ('before', State.SKIPPED),
        ('during', State.NONE),
        ('after', State.SKIPPED),
    ]
    actual = [(ti.task_id, ti.state) for ti in dr.get_task_instances()]
    assert set(expected) == set(actual)
def test_backfill_rerun_upstream_failed_tasks(self):
    dag = DAG(dag_id='test_backfill_rerun_upstream_failed',
              start_date=DEFAULT_DATE, schedule_interval='@daily')

    with dag:
        t1 = DummyOperator(task_id='test_backfill_rerun_upstream_failed_task-1',
                           dag=dag)
        t2 = DummyOperator(task_id='test_backfill_rerun_upstream_failed_task-2',
                           dag=dag)
        t1.set_upstream(t2)

    dag.clear()
    executor = MockExecutor()

    job = BackfillJob(dag=dag,
                      executor=executor,
                      start_date=DEFAULT_DATE,
                      end_date=DEFAULT_DATE + datetime.timedelta(days=2))
    job.run()

    ti = TI(task=dag.get_task('test_backfill_rerun_upstream_failed_task-1'),
            execution_date=DEFAULT_DATE)
    ti.refresh_from_db()
    ti.set_state(State.UPSTREAM_FAILED)

    job = BackfillJob(dag=dag,
                      executor=executor,
                      start_date=DEFAULT_DATE,
                      end_date=DEFAULT_DATE + datetime.timedelta(days=2),
                      rerun_failed_tasks=True)
    job.run()
    ti = TI(task=dag.get_task('test_backfill_rerun_upstream_failed_task-1'),
            execution_date=DEFAULT_DATE)
    ti.refresh_from_db()
    self.assertEqual(ti.state, State.SUCCESS)
def test_dag_topological_sort2(self):
    dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    # C -> (A u B) -> D
    # C -> E
    # ordered: E | D, A | B, C
    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op3 = DummyOperator(task_id='C')
        op4 = DummyOperator(task_id='D')
        op5 = DummyOperator(task_id='E')
        op1.set_downstream(op3)
        op2.set_downstream(op3)
        op1.set_upstream(op4)
        op2.set_upstream(op4)
        op5.set_downstream(op3)

    topological_list = dag.topological_sort()
    logging.info(topological_list)

    set1 = [op4, op5]
    self.assertTrue(topological_list[0] in set1)
    set1.remove(topological_list[0])

    set2 = [op1, op2]
    set2.extend(set1)
    self.assertTrue(topological_list[1] in set2)
    set2.remove(topological_list[1])

    self.assertTrue(topological_list[2] in set2)
    set2.remove(topological_list[2])

    self.assertTrue(topological_list[3] in set2)
    self.assertTrue(topological_list[4] == op3)
def test_operator_clear(self):
    dag = DAG('test_operator_clear', start_date=DEFAULT_DATE,
              end_date=DEFAULT_DATE + datetime.timedelta(days=10))
    op1 = DummyOperator(task_id='bash_op', owner='test', dag=dag)
    op2 = DummyOperator(task_id='dummy_op', owner='test', dag=dag, retries=1)
    op2.set_upstream(op1)

    ti1 = TI(task=op1, execution_date=DEFAULT_DATE)
    ti2 = TI(task=op2, execution_date=DEFAULT_DATE)

    dag.create_dagrun(
        execution_date=ti1.execution_date,
        state=State.RUNNING,
        run_type=DagRunType.SCHEDULED,
    )

    ti2.run()
    # Dependency not met
    self.assertEqual(ti2.try_number, 1)
    self.assertEqual(ti2.max_tries, 1)

    op2.clear(upstream=True)
    ti1.run()
    ti2.run(ignore_ti_state=True)
    self.assertEqual(ti1.try_number, 2)
    # max_tries is 0 because there is no task instance in db for ti1
    # so clear won't change the max_tries.
    self.assertEqual(ti1.max_tries, 0)
    self.assertEqual(ti2.try_number, 2)
    # try_number (0) + retries (1)
    self.assertEqual(ti2.max_tries, 1)
def create_test_pipeline(suffix, trigger_rule, dag):
    skip_operator = DummySkipOperator(
        task_id='skip_operator_{}'.format(suffix), dag=dag)
    always_true = DummyOperator(
        task_id='always_true_{}'.format(suffix), dag=dag)
    join = DummyOperator(task_id=trigger_rule, dag=dag,
                         trigger_rule=trigger_rule)
    join.set_upstream(skip_operator)
    join.set_upstream(always_true)
    final = DummyOperator(task_id='final_{}'.format(suffix), dag=dag)
    final.set_upstream(join)
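# Editor's sketch of how create_test_pipeline might be invoked, assuming a
# 'dag' object is in scope; one pipeline is built per trigger rule under test.
create_test_pipeline('1', 'all_success', dag)
create_test_pipeline('2', 'one_success', dag)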
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from datetime import datetime, timedelta

from airflow import utils
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator

now = datetime.now()
now_to_the_hour = (now - timedelta(hours=3)).replace(
    minute=0, second=0, microsecond=0)
START_DATE = now_to_the_hour
DAG_NAME = 'test_dag_v1'

default_args = {
    'owner': 'airflow',
    'depends_on_past': True,
    'start_date': utils.dates.days_ago(2)
}
dag = DAG(DAG_NAME, schedule_interval='*/10 * * * *', default_args=default_args)

run_this_1 = DummyOperator(task_id='run_this_1', dag=dag)
run_this_2 = DummyOperator(task_id='run_this_2', dag=dag)
run_this_2.set_upstream(run_this_1)
run_this_3 = DummyOperator(task_id='run_this_3', dag=dag)
run_this_3.set_upstream(run_this_2)
local_cleanup = DummyOperator(task_id='local_cleanup', dag=dag)

for entry in list(set(HDFS_DIR)):
    hdfs_single_cleanup = BashOperator(
        task_id="hdfs" + entry.replace("/", "_"),
        bash_command=hdfs_cleanup_command,
        params={
            'DIRECTORY': entry,
            'MAX_DAYS': DEFAULT_MAX_FILE_AGE_IN_DAYS,
            'ENABLE_DELETE': ENABLE_DELETE
        },
        dag=dag)
    hdfs_single_cleanup.set_upstream(hdfs_cleanup)

for entry in list(set(LOCAL_DIR)):
    local_single_cleanup = BashOperator(
        task_id="local" + entry.replace("/", "_"),
        bash_command=local_cleanup_command,
        params={
            'DIRECTORY': entry,
            'MAX_DAYS': DEFAULT_MAX_FILE_AGE_IN_DAYS,
            'ENABLE_DELETE': ENABLE_DELETE
        },
        dag=dag)
    local_single_cleanup.set_upstream(local_cleanup)

hdfs_cleanup.set_upstream(start)
local_cleanup.set_upstream(start)
# BranchPython operator that depends on past
# and where tasks may run or be skipped on
# alternating runs
dag = DAG(dag_id='example_branch_dop_operator_v3',
          schedule_interval='*/1 * * * *',
          default_args=args)


def should_run(ds, **kwargs):
    print("------------- exec dttm = {} and minute = {}".format(
        kwargs['execution_date'], kwargs['execution_date'].minute))
    if kwargs['execution_date'].minute % 2 == 0:
        return "oper_1"
    else:
        return "oper_2"


cond = BranchPythonOperator(
    task_id='condition',
    provide_context=True,
    python_callable=should_run,
    dag=dag)

oper_1 = DummyOperator(
    task_id='oper_1',
    dag=dag)
oper_1.set_upstream(cond)

oper_2 = DummyOperator(
    task_id='oper_2',
    dag=dag)
oper_2.set_upstream(cond)
def test_not_skipping_external(self):
    latest_task = LatestOnlyOperator(
        task_id='latest', dag=self.dag)
    downstream_task = DummyOperator(
        task_id='downstream', dag=self.dag)
    downstream_task2 = DummyOperator(
        task_id='downstream_2', dag=self.dag)

    downstream_task.set_upstream(latest_task)
    downstream_task2.set_upstream(downstream_task)

    self.dag.create_dagrun(
        run_id="manual__1",
        start_date=timezone.utcnow(),
        execution_date=DEFAULT_DATE,
        state=State.RUNNING,
        external_trigger=True,
    )

    self.dag.create_dagrun(
        run_id="manual__2",
        start_date=timezone.utcnow(),
        execution_date=timezone.datetime(2016, 1, 1, 12),
        state=State.RUNNING,
        external_trigger=True,
    )

    self.dag.create_dagrun(
        run_id="manual__3",
        start_date=timezone.utcnow(),
        execution_date=END_DATE,
        state=State.RUNNING,
        external_trigger=True,
    )

    latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

    latest_instances = get_task_instances('latest')
    exec_date_to_latest_state = {
        ti.execution_date: ti.state for ti in latest_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'success',
        timezone.datetime(2016, 1, 1, 12): 'success',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_latest_state)

    downstream_instances = get_task_instances('downstream')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'success',
        timezone.datetime(2016, 1, 1, 12): 'success',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_downstream_state)

    downstream_instances = get_task_instances('downstream_2')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'success',
        timezone.datetime(2016, 1, 1, 12): 'success',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_downstream_state)
class ShortCircuitOperatorTest(unittest.TestCase):
    def setUp(self):
        self.dag = DAG('shortcircuit_operator_test',
                       default_args={
                           'owner': 'airflow',
                           'start_date': DEFAULT_DATE},
                       schedule_interval=INTERVAL)
        self.short_op = ShortCircuitOperator(task_id='make_choice',
                                             dag=self.dag,
                                             python_callable=lambda: self.value)

        self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
        self.branch_1.set_upstream(self.short_op)
        self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag)
        self.branch_2.set_upstream(self.branch_1)
        self.upstream = DummyOperator(task_id='upstream', dag=self.dag)
        self.upstream.set_downstream(self.short_op)
        self.dag.clear()

        self.value = True

    def test_without_dag_run(self):
        """This checks the defense against non-existent tasks in a dag run"""
        self.value = False
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        session = Session()
        tis = session.query(TI).filter(
            TI.dag_id == self.dag.dag_id,
            TI.execution_date == DEFAULT_DATE
        )

        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                # should not exist
                raise Exception
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                raise Exception

        self.value = True
        self.dag.clear()

        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                # should not exist
                raise Exception
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.NONE)
            else:
                raise Exception

        session.close()

    def test_with_dag_run(self):
        self.value = False
        logging.error("Tasks {}".format(self.dag.tasks))
        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=datetime.datetime.now(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        self.upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 4)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                raise Exception

        self.value = True
        self.dag.clear()
        dr.verify_integrity()

        self.upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 4)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.NONE)
            else:
                raise Exception
class BranchOperatorTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        super(BranchOperatorTest, cls).setUpClass()

        with create_session() as session:
            session.query(DagRun).delete()
            session.query(TI).delete()

    def setUp(self):
        self.dag = DAG('branch_operator_test',
                       default_args={
                           'owner': 'airflow',
                           'start_date': DEFAULT_DATE},
                       schedule_interval=INTERVAL)

        self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
        self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag)

    def tearDown(self):
        super(BranchOperatorTest, self).tearDown()

        with create_session() as session:
            session.query(DagRun).delete()
            session.query(TI).delete()

    def test_without_dag_run(self):
        """This checks the defense against non-existent tasks in a dag run"""
        self.branch_op = BranchPythonOperator(task_id='make_choice',
                                              dag=self.dag,
                                              python_callable=lambda: 'branch_1')
        self.branch_1.set_upstream(self.branch_op)
        self.branch_2.set_upstream(self.branch_op)
        self.dag.clear()

        self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        with create_session() as session:
            tis = session.query(TI).filter(
                TI.dag_id == self.dag.dag_id,
                TI.execution_date == DEFAULT_DATE
            )

            for ti in tis:
                if ti.task_id == 'make_choice':
                    self.assertEqual(ti.state, State.SUCCESS)
                elif ti.task_id == 'branch_1':
                    # should exist with state None
                    self.assertEqual(ti.state, State.NONE)
                elif ti.task_id == 'branch_2':
                    self.assertEqual(ti.state, State.SKIPPED)
                else:
                    raise Exception

    def test_branch_list_without_dag_run(self):
        """This checks if the BranchPythonOperator supports branching off to a list of tasks."""
        self.branch_op = BranchPythonOperator(task_id='make_choice',
                                              dag=self.dag,
                                              python_callable=lambda: ['branch_1', 'branch_2'])
        self.branch_1.set_upstream(self.branch_op)
        self.branch_2.set_upstream(self.branch_op)
        self.branch_3 = DummyOperator(task_id='branch_3', dag=self.dag)
        self.branch_3.set_upstream(self.branch_op)
        self.dag.clear()

        self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        with create_session() as session:
            tis = session.query(TI).filter(
                TI.dag_id == self.dag.dag_id,
                TI.execution_date == DEFAULT_DATE
            )

            expected = {
                "make_choice": State.SUCCESS,
                "branch_1": State.NONE,
                "branch_2": State.NONE,
                "branch_3": State.SKIPPED,
            }

            for ti in tis:
                if ti.task_id in expected:
                    self.assertEqual(ti.state, expected[ti.task_id])
                else:
                    raise Exception

    def test_with_dag_run(self):
        self.branch_op = BranchPythonOperator(task_id='make_choice',
                                              dag=self.dag,
                                              python_callable=lambda: 'branch_1')
        self.branch_1.set_upstream(self.branch_op)
        self.branch_2.set_upstream(self.branch_op)
        self.dag.clear()

        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                self.assertEqual(ti.state, State.NONE)
            elif ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                raise Exception

    def test_with_skip_in_branch_downstream_dependencies(self):
        self.branch_op = BranchPythonOperator(task_id='make_choice',
                                              dag=self.dag,
                                              python_callable=lambda: 'branch_1')
        self.branch_op >> self.branch_1 >> self.branch_2
        self.branch_op >> self.branch_2
        self.dag.clear()

        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                self.assertEqual(ti.state, State.NONE)
            elif ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.NONE)
            else:
                raise Exception

    def test_with_skip_in_branch_downstream_dependencies2(self):
        self.branch_op = BranchPythonOperator(task_id='make_choice',
                                              dag=self.dag,
                                              python_callable=lambda: 'branch_2')
        self.branch_op >> self.branch_1 >> self.branch_2
        self.branch_op >> self.branch_2
        self.dag.clear()

        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                self.assertEqual(ti.state, State.SKIPPED)
            elif ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.NONE)
            else:
                raise Exception
                   schedule_interval=timedelta(seconds=1))
task_core = DummyOperator(task_id='task_core', dag=dag_core)

dag_first_child_id = TEST_DAG_ID + '_first_child'
dag_first_child = DAG(dag_first_child_id, default_args=args,
                      schedule_interval=timedelta(seconds=1))
t1_first_child = ExternalTaskSensor(task_id='t1_first_child',
                                    external_dag_id=dag_core_id,
                                    external_task_id='task_core',
                                    poke_interval=1,
                                    dag=dag_first_child,
                                    depends_on_past=True)
t2_first_child = DummyOperator(task_id='t2_first_child',
                               dag=dag_first_child,
                               depends_on_past=True)
t2_first_child.set_upstream(t1_first_child)

dag_second_child_id = TEST_DAG_ID + '_second_child'
dag_second_child = DAG(dag_second_child_id, default_args=args,
                       schedule_interval=timedelta(seconds=1))
t1_second_child = ExternalTaskSensor(task_id='t1_second_child',
                                     external_dag_id=dag_first_child_id,
                                     external_task_id='t2_first_child',
                                     poke_interval=1,
                                     dag=dag_second_child,
                                     depends_on_past=True)
t2_second_child = DummyOperator(task_id='t2_second_child', dag=dag_second_child)
from unicorn.airflow.util.unicorn_airflow_util import load_yaml

dag_id = "unicorn_daily_dag"
dir_path = os.path.dirname(os.path.realpath(__file__))
dag_config = load_yaml(os.path.join(dir_path, dag_id + ".yml"))

default_args = dag_config['default_args']
default_args['start_date'] = datetime.now()

dag = DAG(dag_id,
          default_args=dag_config["default_args"],
          schedule_interval=dag_config["schedule_interval"])
dag.doc_md = dag_config['doc_md']

task1 = BashOperator(task_id='TaskStart',
                     bash_command="echo {{params}}",
                     params={'cmd': dag_config["task1_cmd"]},
                     dag=dag)

task2 = BashOperator(task_id='UnicornDaily',
                     depends_on_past=False,
                     bash_command=dag_config["task1_cmd"],
                     params=dag_config["task1_params"],
                     dag=dag)

task3 = DummyOperator(task_id='TaskFinish', dag=dag)

task2.set_upstream(task1)
task3.set_upstream(task2)
import datetime
import random

import airflow
import airflow.utils.helpers
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import ShortCircuitOperator

args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(2)
}

dag = DAG(
    dag_id='more',
    default_args=args,
    schedule_interval=datetime.timedelta(seconds=10)
)

test = ShortCircuitOperator(
    task_id='test',
    python_callable=lambda: random.random() > 0.5,
    dag=dag)

node = DummyOperator(
    task_id='dosomething',
    dag=dag)
node.set_upstream(test)
        SUM(A.value) AS value,
        COUNT(1) AS games,
        A.match_date,
        '{{ds}}' AS process_date
    FROM {{stg_table_3}} A
    WHERE process_date = '{{ds}}'
    GROUP BY A.heroname, A.mapName, A.gameversion, A.gameType, A.toonHandle,
             A.battleTag, A.player_id, A.playerName, A.metric, A.match_date
    ON CONFLICT ON CONSTRAINT stg_player_stats_agg_pkey DO UPDATE
    SET value = excluded.value + {{agg_table}}.value,
        games = excluded.games + {{agg_table}}.games;
    """).render(stg_table_3=task_variables['STG_PLAYER_STATS_3'],
                agg_table=task_variables['STG_PLAYER_STATS_AGG'],
                ds="{{ ds }}"))

insert_stg_agg.set_upstream(insert_stg_player_stats_3)

end_task = DummyOperator(dag=dag, task_id='end_task')
end_task.set_upstream(insert_stg_agg)
                                     sql='/sql/fact_reviews.sql',
                                     postgres_conn_id='redshift')
process_fact_reviews.set_upstream(
    [process_dim_times, process_dim_users, process_dim_business])

process_fk = PostgresOperator(dag=dag,
                              task_id='process_foreign_keys',
                              sql='/sql/dim_fk.sql',
                              postgres_conn_id='redshift')
process_fk.set_upstream([process_fact_tips, process_fact_reviews])

run_quality_checks = DataQualityOperator(task_id='run_data_quality_checks',
                                         dag=dag,
                                         redshift_conn_id='redshift',
                                         queries=({
                                             "table": "dim_times",
                                             "where": "day IS NULL",
                                             "result": 0
                                         }, {
                                             "table": "fact_review",
                                             "where": "user_id IS NULL",
                                             "result": 0
                                         }, {
                                             "table": "fact_review",
                                             "result": 6685900
                                         }))
run_quality_checks.set_upstream(process_fk)

end_operator = DummyOperator(dag=dag, task_id='end_operator')
end_operator.set_upstream(run_quality_checks)
def test_dag_topological_sort(self):
    dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    # A -> B
    # A -> C -> D
    # ordered: B, D, C, A or D, B, C, A or D, C, B, A
    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op3 = DummyOperator(task_id='C')
        op4 = DummyOperator(task_id='D')
        op1.set_upstream([op2, op3])
        op3.set_upstream(op4)

    topological_list = dag.topological_sort()
    logging.info(topological_list)

    tasks = [op2, op3, op4]
    self.assertTrue(topological_list[0] in tasks)
    tasks.remove(topological_list[0])
    self.assertTrue(topological_list[1] in tasks)
    tasks.remove(topological_list[1])
    self.assertTrue(topological_list[2] in tasks)
    tasks.remove(topological_list[2])
    self.assertTrue(topological_list[3] == op1)

    dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    # C -> (A u B) -> D
    # C -> E
    # ordered: E | D, A | B, C
    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op3 = DummyOperator(task_id='C')
        op4 = DummyOperator(task_id='D')
        op5 = DummyOperator(task_id='E')
        op1.set_downstream(op3)
        op2.set_downstream(op3)
        op1.set_upstream(op4)
        op2.set_upstream(op4)
        op5.set_downstream(op3)

    topological_list = dag.topological_sort()
    logging.info(topological_list)

    set1 = [op4, op5]
    self.assertTrue(topological_list[0] in set1)
    set1.remove(topological_list[0])

    set2 = [op1, op2]
    set2.extend(set1)
    self.assertTrue(topological_list[1] in set2)
    set2.remove(topological_list[1])

    self.assertTrue(topological_list[2] in set2)
    set2.remove(topological_list[2])

    self.assertTrue(topological_list[3] in set2)
    self.assertTrue(topological_list[4] == op3)

    dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    # an empty DAG sorts to an empty tuple
    self.assertEqual(tuple(), dag.topological_sort())
from airflow.models import DAG

args = {'owner': 'airflow',
        'start_date': airflow.utils.dates.days_ago(12)}

dag = DAG(dag_id='example_branch_operator_further_back',
          default_args=args,
          schedule_interval="@daily")

cmd = 'ls -l'
run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

options = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN']


def return_current_day(**context):
    return options[context["execution_date"].weekday()]


branching = BranchPythonOperator(task_id='branching',
                                 python_callable=return_current_day,
                                 provide_context=True,
                                 dag=dag)
branching.set_upstream(run_this_first)

join = DummyOperator(task_id='join', trigger_rule='one_success', dag=dag)

for option in options:
    t = DummyOperator(task_id=option, dag=dag)
    t.set_upstream(branching)
    t.set_downstream(join)
def test_dag_catchup_option(self):
    """
    Test to check that a DAG with catchup = False only schedules
    beginning now, not back to the start date
    """
    now = datetime.datetime.now()
    six_hours_ago_to_the_hour = (now - datetime.timedelta(hours=6)).replace(
        minute=0, second=0, microsecond=0)
    three_minutes_ago = now - datetime.timedelta(minutes=3)
    two_hours_and_three_minutes_ago = three_minutes_ago - datetime.timedelta(
        hours=2)

    START_DATE = six_hours_ago_to_the_hour
    DAG_NAME1 = 'no_catchup_test1'
    DAG_NAME2 = 'no_catchup_test2'
    DAG_NAME3 = 'no_catchup_test3'

    default_args = {
        'owner': 'airflow',
        'depends_on_past': False,
        'start_date': START_DATE
    }
    dag1 = DAG(DAG_NAME1,
               schedule_interval='* * * * *',
               max_active_runs=1,
               default_args=default_args)

    default_catchup = configuration.getboolean('scheduler',
                                               'catchup_by_default')
    # Test configs have catchup by default ON
    self.assertEqual(default_catchup, True)

    # Correct default?
    self.assertEqual(dag1.catchup, True)

    dag2 = DAG(DAG_NAME2,
               schedule_interval='* * * * *',
               max_active_runs=1,
               catchup=False,
               default_args=default_args)

    run_this_1 = DummyOperator(task_id='run_this_1', dag=dag2)
    run_this_2 = DummyOperator(task_id='run_this_2', dag=dag2)
    run_this_2.set_upstream(run_this_1)
    run_this_3 = DummyOperator(task_id='run_this_3', dag=dag2)
    run_this_3.set_upstream(run_this_2)

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag2.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag2.clear()

    dr = scheduler.create_dag_run(dag2)

    # We had better get a dag run
    self.assertIsNotNone(dr)

    # The DR should be scheduled in the last 3 minutes, not 6 hours ago
    self.assertGreater(dr.execution_date, three_minutes_ago)

    # The DR should be scheduled BEFORE now
    self.assertLess(dr.execution_date, datetime.datetime.now())

    dag3 = DAG(DAG_NAME3,
               schedule_interval='@hourly',
               max_active_runs=1,
               catchup=False,
               default_args=default_args)

    run_this_1 = DummyOperator(task_id='run_this_1', dag=dag3)
    run_this_2 = DummyOperator(task_id='run_this_2', dag=dag3)
    run_this_2.set_upstream(run_this_1)
    run_this_3 = DummyOperator(task_id='run_this_3', dag=dag3)
    run_this_3.set_upstream(run_this_2)

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag3.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag3.clear()

    dr = scheduler.create_dag_run(dag3)

    # We had better get a dag run
    self.assertIsNotNone(dr)

    # The DR should be scheduled in the last two hours, not 6 hours ago
    self.assertGreater(dr.execution_date, two_hours_and_three_minutes_ago)

    # The DR should be scheduled BEFORE now
    self.assertLess(dr.execution_date, datetime.datetime.now())
def test_skipping_dagrun(self):
    latest_task = LatestOnlyOperator(
        task_id='latest', dag=self.dag)
    downstream_task = DummyOperator(
        task_id='downstream', dag=self.dag)
    downstream_task2 = DummyOperator(
        task_id='downstream_2', dag=self.dag)
    downstream_task.set_upstream(latest_task)
    downstream_task2.set_upstream(downstream_task)

    self.dag.create_dagrun(
        run_id="manual__1",
        start_date=timezone.utcnow(),
        execution_date=DEFAULT_DATE,
        state=State.RUNNING
    )

    self.dag.create_dagrun(
        run_id="manual__2",
        start_date=timezone.utcnow(),
        execution_date=timezone.datetime(2016, 1, 1, 12),
        state=State.RUNNING
    )

    self.dag.create_dagrun(
        run_id="manual__3",
        start_date=timezone.utcnow(),
        execution_date=END_DATE,
        state=State.RUNNING
    )

    latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

    latest_instances = get_task_instances('latest')
    exec_date_to_latest_state = {
        ti.execution_date: ti.state for ti in latest_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'success',
        timezone.datetime(2016, 1, 1, 12): 'success',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_latest_state)

    downstream_instances = get_task_instances('downstream')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'skipped',
        timezone.datetime(2016, 1, 1, 12): 'skipped',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_downstream_state)

    downstream_instances = get_task_instances('downstream_2')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'skipped',
        timezone.datetime(2016, 1, 1, 12): 'skipped',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_downstream_state)
def create_subdag(dag_parent, label, team):
    dag_id_child = "%s.%s" % (dag_parent.dag_id, label)
    schema = team["schema"][label]

    dag = DAG(
        dag_id=dag_id_child,
        default_args=dag_parent.default_args,
        schedule_interval=dag_parent.schedule_interval,
    )

    # Find the corresponding operator and its parameters
    fn, operator_params = find_label_operator(schema["qos"])

    # Label is declared but there is no node in Neo4j
    count = team["labels"][label]
    if not count:
        DummyOperator(task_id="{}.notask".format(label), dag=dag)
        return dag, operator_params.get("dependencies")

    if count < 100:
        length = count
    else:
        frac, length = math.modf(count / 100)
        if frac:
            length += 1

    chunks = {
        "{}.chunk.{}".format(label, i): i
        for i in range(0, count, int(length))
    }

    tasks = []
    for name, skip in chunks.items():
        # All custom operators share these parameters
        params = {
            "app": app,
            "team": team,
            "label": label,
            "skip": skip,
            "length": length,
            **operator_params,
        }
        tasks.append(fn(task_id=name, dag=dag, params=params))

    with dag:
        delete_redis_avg_op = PythonOperator(
            task_id="{}.del_redis_average".format(label),
            provide_context=True,
            python_callable=delete_redis_avg,
            params={"app": app, "team": team, "label": label},
        )
        before_subdag_task = BeforeSubdagOperator(
            task_id="{}.before_subdag".format(label),
            params={"app": app, "team": team, "label": label, "count": count},
        )
        after_subdag_task = AfterSubdagOperator(
            task_id="{}.after_subdag".format(label),
            params={"app": app, "team": team, "label": label},
        )
        after_chunks_task = DummyOperator(task_id="{}.dummy".format(label))
        average_op = AverageOperator(
            task_id="{}.average".format(label),
            params={"app": app, "team": team, "label": label},
        )
        daily_worst_op = DailyWorstOperator(
            task_id="{}.daily_worst".format(label),
            params={"app": app, "team": team, "label": label},
        )

        before_subdag_task.set_downstream(delete_redis_avg_op)
        delete_redis_avg_op.set_downstream(tasks)
        after_chunks_task.set_upstream(tasks)
        after_chunks_task.set_downstream([average_op, daily_worst_op])
        after_subdag_task.set_upstream([average_op, daily_worst_op])

    return dag, operator_params.get("dependencies")
def test_skipping_non_latest(self):
    latest_task = LatestOnlyOperator(
        task_id='latest', dag=self.dag)
    downstream_task = DummyOperator(
        task_id='downstream', dag=self.dag)
    downstream_task2 = DummyOperator(
        task_id='downstream_2', dag=self.dag)
    downstream_task3 = DummyOperator(
        task_id='downstream_3',
        trigger_rule=TriggerRule.NONE_FAILED,
        dag=self.dag)

    downstream_task.set_upstream(latest_task)
    downstream_task2.set_upstream(downstream_task)
    downstream_task3.set_upstream(downstream_task)

    self.dag.create_dagrun(
        run_id="scheduled__1",
        start_date=timezone.utcnow(),
        execution_date=DEFAULT_DATE,
        state=State.RUNNING,
    )

    self.dag.create_dagrun(
        run_id="scheduled__2",
        start_date=timezone.utcnow(),
        execution_date=timezone.datetime(2016, 1, 1, 12),
        state=State.RUNNING,
    )

    self.dag.create_dagrun(
        run_id="scheduled__3",
        start_date=timezone.utcnow(),
        execution_date=END_DATE,
        state=State.RUNNING,
    )

    latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task3.run(start_date=DEFAULT_DATE, end_date=END_DATE)

    latest_instances = get_task_instances('latest')
    exec_date_to_latest_state = {
        ti.execution_date: ti.state for ti in latest_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'success',
        timezone.datetime(2016, 1, 1, 12): 'success',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_latest_state)

    downstream_instances = get_task_instances('downstream')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'skipped',
        timezone.datetime(2016, 1, 1, 12): 'skipped',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_downstream_state)

    downstream_instances = get_task_instances('downstream_2')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): None,
        timezone.datetime(2016, 1, 1, 12): None,
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_downstream_state)

    downstream_instances = get_task_instances('downstream_3')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'success',
        timezone.datetime(2016, 1, 1, 12): 'success',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_downstream_state)
""" Example LatestOnlyOperator and TriggerRule interactions """ import datetime as dt import airflow from airflow.models import DAG from airflow.operators.dummy_operator import DummyOperator from airflow.operators.latest_only_operator import LatestOnlyOperator from airflow.utils.trigger_rule import TriggerRule dag = DAG( dag_id='latest_only_with_trigger', schedule_interval=dt.timedelta(hours=4), start_date=airflow.utils.dates.days_ago(2), ) latest_only = LatestOnlyOperator(task_id='latest_only', dag=dag) task1 = DummyOperator(task_id='task1', dag=dag) task1.set_upstream(latest_only) task2 = DummyOperator(task_id='task2', dag=dag) task3 = DummyOperator(task_id='task3', dag=dag) task3.set_upstream([task1, task2]) task4 = DummyOperator(task_id='task4', dag=dag, trigger_rule=TriggerRule.ALL_DONE) task4.set_upstream([task1, task2])
}

dag = DAG(
    dag_id='example_branch_operator',
    default_args=args,
    schedule_interval="@daily")

cmd = 'ls -l'
run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

options = ['branch_a', 'branch_b', 'branch_c', 'branch_d']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(run_this_first)

join = DummyOperator(
    task_id='join',
    trigger_rule='one_success',
    dag=dag
)

for option in options:
    t = DummyOperator(task_id=option, dag=dag)
    t.set_upstream(branching)
    dummy_follow = DummyOperator(task_id='follow_' + option, dag=dag)
    t.set_downstream(dummy_follow)
    dummy_follow.set_downstream(join)
dag2 = DAG(dag_id='test_depends_on_past', default_args=default_args)
dag2_task1 = DummyOperator(
    task_id='test_dop_task',
    dag=dag2,
    depends_on_past=True,
)

# DAG tests that a Dag run that doesn't complete is marked failed
dag3 = DAG(dag_id='test_dagrun_states_fail', default_args=default_args)
dag3_task1 = PythonOperator(
    task_id='test_dagrun_fail',
    dag=dag3,
    python_callable=fail)
dag3_task2 = DummyOperator(
    task_id='test_dagrun_succeed',
    dag=dag3,
)
dag3_task2.set_upstream(dag3_task1)

# DAG tests that a Dag run that completes but has a failure is marked success
dag4 = DAG(dag_id='test_dagrun_states_success', default_args=default_args)
dag4_task1 = PythonOperator(
    task_id='test_dagrun_fail',
    dag=dag4,
    python_callable=fail,
)
dag4_task2 = DummyOperator(
    task_id='test_dagrun_succeed',
    dag=dag4,
    trigger_rule=TriggerRule.ALL_FAILED
)
dag4_task2.set_upstream(dag4_task1)
from datetime import datetime, timedelta

from airflow.models.dag import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.dates import days_ago

now = datetime.now()
now_to_the_hour = (now - timedelta(hours=3)).replace(
    minute=0, second=0, microsecond=0)
START_DATE = now_to_the_hour
DAG_NAME = 'test_dag_v1'

default_args = {
    'owner': 'airflow',
    'depends_on_past': True,
    'start_date': days_ago(2)
}
dag = DAG(DAG_NAME, schedule_interval='*/10 * * * *', default_args=default_args)

run_this_1 = DummyOperator(task_id='run_this_1', dag=dag)
run_this_2 = DummyOperator(task_id='run_this_2', dag=dag)
run_this_2.set_upstream(run_this_1)
run_this_3 = DummyOperator(task_id='run_this_3', dag=dag)
run_this_3.set_upstream(run_this_2)
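# Editor's sketch (assumed names, not in the original file): the same
# three-task chain built with the bitshift operators, which are equivalent to
# set_upstream/set_downstream -- "a >> b" makes a upstream of b.
dag_v2 = DAG(DAG_NAME + '_bitshift', schedule_interval='*/10 * * * *',
             default_args=default_args)
t1 = DummyOperator(task_id='run_this_1', dag=dag_v2)
t2 = DummyOperator(task_id='run_this_2', dag=dag_v2)
t3 = DummyOperator(task_id='run_this_3', dag=dag_v2)
t1 >> t2 >> t3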
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Example of the LatestOnlyOperator
"""
import datetime as dt

from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.latest_only_operator import LatestOnlyOperator
from airflow.utils.trigger_rule import TriggerRule

dag = DAG(
    dag_id='latest_only',
    schedule_interval=dt.timedelta(hours=4),
    start_date=dt.datetime(2016, 9, 20),
)

latest_only = LatestOnlyOperator(task_id='latest_only', dag=dag)

task1 = DummyOperator(task_id='task1', dag=dag)
task1.set_upstream(latest_only)