def test_dagrun_update_state_with_handle_callback_failure(self):
    """A failed run updated with execute_callbacks=False must hand back a failure
    DagCallbackRequest instead of running the callback inline."""

    def on_failure_callable(context):
        self.assertEqual(
            context['dag_run'].dag_id, 'test_dagrun_update_state_with_handle_callback_failure')

    dag = DAG(
        dag_id='test_dagrun_update_state_with_handle_callback_failure',
        start_date=datetime.datetime(2017, 1, 1),
        on_failure_callback=on_failure_callable,
    )
    succeeded_task = DummyOperator(task_id='test_state_succeeded1', dag=dag)
    failed_task = DummyOperator(task_id='test_state_failed2', dag=dag)
    succeeded_task.set_downstream(failed_task)

    dag_run = self.create_dag_run(
        dag=dag,
        state=State.RUNNING,
        task_states={
            'test_state_succeeded1': State.SUCCESS,
            'test_state_failed2': State.FAILED,
        },
    )

    _, callback = dag_run.update_state(execute_callbacks=False)
    self.assertEqual(State.FAILED, dag_run.state)
    # Callbacks are not added until handle_callback = False is passed to dag_run.update_state()
    assert callback == DagCallbackRequest(
        full_filepath=dag_run.dag.fileloc,
        dag_id="test_dagrun_update_state_with_handle_callback_failure",
        execution_date=dag_run.execution_date,
        is_failure_callback=True,
        msg="task_failure",
    )
def test_dagrun_success_when_all_skipped(self):
    """A DAG run whose downstream tasks were all skipped still finishes as SUCCESS."""
    dag = DAG(
        dag_id='test_dagrun_success_when_all_skipped',
        start_date=timezone.datetime(2017, 1, 1),
    )
    short_circuit = ShortCircuitOperator(
        task_id='test_short_circuit_false',
        dag=dag,
        python_callable=lambda: False,
    )
    first_skipped = DummyOperator(task_id='test_state_skipped1', dag=dag)
    second_skipped = DummyOperator(task_id='test_state_skipped2', dag=dag)
    short_circuit.set_downstream(first_skipped)
    first_skipped.set_downstream(second_skipped)

    dag_run = self.create_dag_run(
        dag=dag,
        state=State.RUNNING,
        task_states={
            'test_short_circuit_false': State.SUCCESS,
            'test_state_skipped1': State.SKIPPED,
            'test_state_skipped2': State.SKIPPED,
        },
    )
    dag_run.update_state()
    assert State.SUCCESS == dag_run.state
def test_dagrun_failure_callback(self):
    """When update_state() executes callbacks inline (the default), no callback
    request is returned even though the run failed."""

    def on_failure_callable(context):
        assert context['dag_run'].dag_id == 'test_dagrun_failure_callback'

    dag = DAG(
        dag_id='test_dagrun_failure_callback',
        start_date=datetime.datetime(2017, 1, 1),
        on_failure_callback=on_failure_callable,
    )
    upstream_task = DummyOperator(task_id='test_state_succeeded1', dag=dag)
    downstream_task = DummyOperator(task_id='test_state_failed2', dag=dag)
    upstream_task.set_downstream(downstream_task)

    # Scheduler uses Serialized DAG -- so use that instead of the Actual DAG
    dag = SerializedDAG.from_dict(SerializedDAG.to_dict(dag))

    dag_run = self.create_dag_run(
        dag=dag,
        state=State.RUNNING,
        task_states={
            'test_state_succeeded1': State.SUCCESS,
            'test_state_failed2': State.FAILED,
        },
    )
    _, callback = dag_run.update_state()
    assert State.FAILED == dag_run.state
    # Callbacks are not added until handle_callback = False is passed to dag_run.update_state()
    assert callback is None
def test_dagrun_failure_callback(self):
    """Failure of a task fails the run; inline callback execution yields no
    callback request object."""

    def on_failure_callable(context):
        self.assertEqual(context['dag_run'].dag_id, 'test_dagrun_failure_callback')

    dag = DAG(
        dag_id='test_dagrun_failure_callback',
        start_date=datetime.datetime(2017, 1, 1),
        on_failure_callback=on_failure_callable,
    )
    parent_task = DummyOperator(task_id='test_state_succeeded1', dag=dag)
    child_task = DummyOperator(task_id='test_state_failed2', dag=dag)
    parent_task.set_downstream(child_task)

    dag_run = self.create_dag_run(
        dag=dag,
        state=State.RUNNING,
        task_states={
            'test_state_succeeded1': State.SUCCESS,
            'test_state_failed2': State.FAILED,
        },
    )
    _, callback = dag_run.update_state()
    self.assertEqual(State.FAILED, dag_run.state)
    # Callbacks are not added until handle_callback = False is passed to dag_run.update_state()
    self.assertIsNone(callback)
def test_dagrun_update_state_with_handle_callback_failure(self):
    """With execute_callbacks=False, a failed run must surface a failure
    DagCallbackRequest for the scheduler to dispatch later."""

    def on_failure_callable(context):
        assert context['dag_run'].dag_id == 'test_dagrun_update_state_with_handle_callback_failure'

    dag = DAG(
        dag_id='test_dagrun_update_state_with_handle_callback_failure',
        start_date=datetime.datetime(2017, 1, 1),
        on_failure_callback=on_failure_callable,
    )
    ok_task = DummyOperator(task_id='test_state_succeeded1', dag=dag)
    bad_task = DummyOperator(task_id='test_state_failed2', dag=dag)
    ok_task.set_downstream(bad_task)

    # Scheduler uses Serialized DAG -- so use that instead of the Actual DAG
    dag = SerializedDAG.from_dict(SerializedDAG.to_dict(dag))

    dag_run = self.create_dag_run(
        dag=dag,
        state=State.RUNNING,
        task_states={
            'test_state_succeeded1': State.SUCCESS,
            'test_state_failed2': State.FAILED,
        },
    )

    _, callback = dag_run.update_state(execute_callbacks=False)
    assert State.FAILED == dag_run.state
    # Callbacks are not added until handle_callback = False is passed to dag_run.update_state()
    assert callback == DagCallbackRequest(
        full_filepath=dag_run.dag.fileloc,
        dag_id="test_dagrun_update_state_with_handle_callback_failure",
        execution_date=dag_run.execution_date,
        is_failure_callback=True,
        msg="task_failure",
    )
def subdag_c():
    """Build the opSubdag_C sub-DAG, deliberately wired with a self-loop.

    Uses ``default_args`` from the enclosing scope.
    """
    dag_c = DAG('nested_cycle.op_subdag_1.opSubdag_C', default_args=default_args)
    looped_task = DummyOperator(task_id='subdag_c.task', dag=dag_c)
    # introduce a loop in opSubdag_C
    looped_task.set_downstream(looped_task)
    return dag_c
def test_without_dag_run(self):
    """This checks the defensive against non existent tasks in a dag run"""
    # `value` is closed over by the short-circuit callable: False -> downstream
    # tasks get skipped, True -> they are left untouched (state NONE).
    value = False
    dag = DAG(
        'shortcircuit_operator_test_without_dag_run',
        default_args={
            'owner': 'airflow',
            'start_date': DEFAULT_DATE
        },
        schedule_interval=INTERVAL,
    )
    short_op = ShortCircuitOperator(task_id='make_choice', dag=dag, python_callable=lambda: value)
    branch_1 = DummyOperator(task_id='branch_1', dag=dag)
    branch_1.set_upstream(short_op)
    branch_2 = DummyOperator(task_id='branch_2', dag=dag)
    branch_2.set_upstream(branch_1)
    upstream = DummyOperator(task_id='upstream', dag=dag)
    upstream.set_downstream(short_op)
    dag.clear()

    # Only short_op is run -- `upstream` never executes, so no TI row should
    # exist for it; seeing one means the run created phantom task instances.
    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    with create_session() as session:
        tis = session.query(TI).filter(TI.dag_id == dag.dag_id, TI.execution_date == DEFAULT_DATE)

        for ti in tis:
            if ti.task_id == 'make_choice':
                assert ti.state == State.SUCCESS
            elif ti.task_id == 'upstream':
                # should not exist
                raise ValueError(f'Invalid task id {ti.task_id} found!')
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                assert ti.state == State.SKIPPED
            else:
                raise ValueError(f'Invalid task id {ti.task_id} found!')

        # Flip the condition and re-run: downstream tasks must now be NONE,
        # not SKIPPED.  `tis` is a Query, so iterating it again re-executes
        # the SELECT against the same (still-open) session.
        value = True
        dag.clear()

        short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        for ti in tis:
            if ti.task_id == 'make_choice':
                assert ti.state == State.SUCCESS
            elif ti.task_id == 'upstream':
                # should not exist
                raise ValueError(f'Invalid task id {ti.task_id} found!')
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                assert ti.state == State.NONE
            else:
                raise ValueError(f'Invalid task id {ti.task_id} found!')
def test_cycle_loop(self):
    """A task that is its own downstream (A -> A) must raise AirflowDagCycleException."""
    dag = DAG('dag', start_date=DEFAULT_DATE, default_args={'owner': 'owner1'})

    # A -> A
    with dag:
        task_a = DummyOperator(task_id='A')
        task_a.set_downstream(task_a)

    with pytest.raises(AirflowDagCycleException):
        assert not _test_cycle(dag)
def test_get_states_count_upstream_ti(self):
    """
    this test tests the helper function '_get_states_count_upstream_ti' as a unit and inside update_state
    """
    from airflow.ti_deps.dep_context import DepContext

    get_states_count_upstream_ti = TriggerRuleDep._get_states_count_upstream_ti
    session = settings.Session()
    now = timezone.utcnow()
    dag = DAG('test_dagrun_with_pre_tis', start_date=DEFAULT_DATE, default_args={'owner': 'owner1'})

    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op3 = DummyOperator(task_id='C')
        op4 = DummyOperator(task_id='D')
        # E only needs one failed upstream to fire.
        op5 = DummyOperator(task_id='E', trigger_rule=TriggerRule.ONE_FAILED)

        op1.set_downstream([op2, op3])  # op1 >> op2, op3
        op4.set_upstream([op3, op2])  # op3, op2 >> op4
        op5.set_upstream([op2, op3, op4])  # (op2, op3, op4) >> op5

    clear_db_runs()
    dag.clear()
    dr = dag.create_dagrun(
        run_id='test_dagrun_with_pre_tis', state=State.RUNNING, execution_date=now, start_date=now
    )

    ti_op1 = TaskInstance(task=dag.get_task(op1.task_id), execution_date=dr.execution_date)
    ti_op2 = TaskInstance(task=dag.get_task(op2.task_id), execution_date=dr.execution_date)
    ti_op3 = TaskInstance(task=dag.get_task(op3.task_id), execution_date=dr.execution_date)
    ti_op4 = TaskInstance(task=dag.get_task(op4.task_id), execution_date=dr.execution_date)
    ti_op5 = TaskInstance(task=dag.get_task(op5.task_id), execution_date=dr.execution_date)

    # B fails; everything else succeeds.  Commit before querying so the
    # upstream-state counts below see the persisted states.
    ti_op1.set_state(state=State.SUCCESS, session=session)
    ti_op2.set_state(state=State.FAILED, session=session)
    ti_op3.set_state(state=State.SUCCESS, session=session)
    ti_op4.set_state(state=State.SUCCESS, session=session)
    ti_op5.set_state(state=State.SUCCESS, session=session)
    session.commit()

    # check handling with cases that tasks are triggered from backfill with no finished tasks
    finished_tasks = DepContext().ensure_finished_tasks(ti_op2.task.dag, ti_op2.execution_date, session)
    # Expected tuples are (success, skipped, failed, upstream_failed, done)
    # over each ti's direct upstream tasks.
    assert get_states_count_upstream_ti(finished_tasks=finished_tasks, ti=ti_op2) == (1, 0, 0, 0, 1)
    finished_tasks = dr.get_task_instances(state=State.finished, session=session)
    assert get_states_count_upstream_ti(finished_tasks=finished_tasks, ti=ti_op4) == (1, 0, 1, 0, 2)
    assert get_states_count_upstream_ti(finished_tasks=finished_tasks, ti=ti_op5) == (2, 0, 1, 0, 3)

    dr.update_state()
    # Despite B failing, E's ONE_FAILED rule lets the run complete as SUCCESS.
    assert State.SUCCESS == dr.state
def basic_cycle():
    """Return a one-task DAG whose single task A is wired to itself (A -> A)."""
    import datetime  # pylint: disable=redefined-outer-name,reimported

    from airflow.models import DAG
    from airflow.operators.dummy import DummyOperator

    dag = DAG(
        'cycle_dag',
        default_args={'owner': 'owner1', 'start_date': datetime.datetime(2016, 1, 1)},
    )

    # A -> A
    with dag:
        op_a = DummyOperator(task_id='A')
        op_a.set_downstream(op_a)

    return dag
def standard_subdag():
    """Build a 'parent' DAG: task A fans out into two SubDagOperators, each
    wrapping a one-task sub-DAG."""
    import datetime  # pylint: disable=redefined-outer-name,reimported

    from airflow.models import DAG
    from airflow.operators.dummy import DummyOperator
    from airflow.operators.subdag import SubDagOperator

    default_args = {
        'owner': 'owner1',
        'start_date': datetime.datetime(2016, 1, 1),
    }
    dag = DAG('parent', default_args=default_args)

    # parent:
    #   A -> opSubDag_0
    #        parent.opsubdag_0: -> subdag_0.task
    #   A -> opSubDag_1
    #        parent.opsubdag_1: -> subdag_1.task
    with dag:

        def build_subdag_0():
            child = DAG('parent.op_subdag_0', default_args=default_args)
            DummyOperator(task_id='subdag_0.task', dag=child)
            return child

        def build_subdag_1():
            child = DAG('parent.op_subdag_1', default_args=default_args)
            DummyOperator(task_id='subdag_1.task', dag=child)
            return child

        op_subdag_0 = SubDagOperator(task_id='op_subdag_0', dag=dag, subdag=build_subdag_0())
        op_subdag_1 = SubDagOperator(task_id='op_subdag_1', dag=dag, subdag=build_subdag_1())

        op_a = DummyOperator(task_id='A')
        op_a.set_downstream(op_subdag_0)
        op_a.set_downstream(op_subdag_1)

    return dag
def test_cycle_downstream_loop(self):
    """A linear chain whose tail depends on itself (E -> E) must be flagged as a cycle."""
    dag = DAG('dag', start_date=DEFAULT_DATE, default_args={'owner': 'owner1'})

    # A -> B -> C -> D -> E -> E
    with dag:
        chain = [DummyOperator(task_id=task_id) for task_id in 'ABCDE']
        for parent, child in zip(chain, chain[1:]):
            parent.set_downstream(child)
        # the self-loop at the tail
        chain[-1].set_downstream(chain[-1])

    with pytest.raises(AirflowDagCycleException):
        assert not _test_cycle(dag)
def test_cycle_arbitrary_loop(self):
    """A cycle reached through a longer path (A -> B/C -> F -> A) must be detected."""
    dag = DAG('dag', start_date=DEFAULT_DATE, default_args={'owner': 'owner1'})

    # E-> A -> B -> F -> A
    #      -> C -> F
    with dag:
        op_a = DummyOperator(task_id='A')
        op_b = DummyOperator(task_id='B')
        op_c = DummyOperator(task_id='C')
        op_e = DummyOperator(task_id='E')
        op_f = DummyOperator(task_id='F')

        op_a.set_downstream(op_b)
        op_a.set_downstream(op_c)
        op_e.set_downstream(op_a)
        op_c.set_downstream(op_f)
        op_b.set_downstream(op_f)
        # closes the loop back to A
        op_f.set_downstream(op_a)

    with self.assertRaises(AirflowDagCycleException):
        self.assertFalse(_test_cycle(dag))
def test_cycle_no_cycle(self):
    """Two disjoint acyclic components must pass the cycle check."""
    dag = DAG('dag', start_date=DEFAULT_DATE, default_args={'owner': 'owner1'})

    # A -> B -> C
    #      B -> D
    # E -> F
    with dag:
        ops = {task_id: DummyOperator(task_id=task_id) for task_id in 'ABCDEF'}
        ops['A'].set_downstream(ops['B'])
        ops['B'].set_downstream(ops['C'])
        ops['B'].set_downstream(ops['D'])
        ops['E'].set_downstream(ops['F'])

    assert not _test_cycle(dag)
def test_with_dag_run(self):
    """ShortCircuitOperator inside a real DagRun: a falsy callable skips
    downstream tasks; flipping it to truthy leaves them unset (NONE)."""
    # `value` is closed over by the short-circuit callable below.
    value = False
    dag = DAG(
        'shortcircuit_operator_test_with_dag_run',
        default_args={
            'owner': 'airflow',
            'start_date': DEFAULT_DATE
        },
        schedule_interval=INTERVAL,
    )
    short_op = ShortCircuitOperator(task_id='make_choice', dag=dag, python_callable=lambda: value)
    branch_1 = DummyOperator(task_id='branch_1', dag=dag)
    branch_1.set_upstream(short_op)
    branch_2 = DummyOperator(task_id='branch_2', dag=dag)
    branch_2.set_upstream(branch_1)
    upstream = DummyOperator(task_id='upstream', dag=dag)
    upstream.set_downstream(short_op)
    dag.clear()

    logging.error("Tasks %s", dag.tasks)
    dr = dag.create_dagrun(
        run_type=DagRunType.MANUAL,
        start_date=timezone.utcnow(),
        execution_date=DEFAULT_DATE,
        state=State.RUNNING,
    )

    upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    tis = dr.get_task_instances()
    # upstream, make_choice, branch_1, branch_2
    assert len(tis) == 4
    for ti in tis:
        if ti.task_id == 'make_choice':
            assert ti.state == State.SUCCESS
        elif ti.task_id == 'upstream':
            assert ti.state == State.SUCCESS
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            assert ti.state == State.SKIPPED
        else:
            raise ValueError(f'Invalid task id {ti.task_id} found!')

    # Second pass with the condition flipped: clear, rebuild the TI rows for
    # the existing run, and re-run -- downstream tasks must now stay NONE.
    value = True
    dag.clear()
    dr.verify_integrity()
    upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    tis = dr.get_task_instances()
    assert len(tis) == 4
    for ti in tis:
        if ti.task_id == 'make_choice':
            assert ti.state == State.SUCCESS
        elif ti.task_id == 'upstream':
            assert ti.state == State.SUCCESS
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            assert ti.state == State.NONE
        else:
            raise ValueError(f'Invalid task id {ti.task_id} found!')
# Example branching DAG: one of four branches is picked at random per run,
# and a trigger_rule='one_success' join rejoins whichever branch actually ran.
# NOTE(review): start_date derived from datetime.today() is dynamic -- confirm
# this is intentional for this example.
seven_days_ago = datetime.combine(datetime.today() - timedelta(7), datetime.min.time())

args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_branch_operator', default_args=args, schedule_interval="@daily")

cmd = 'ls -l'

run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

options = ['branch_a', 'branch_b', 'branch_c', 'branch_d']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag,
)
branching.set_upstream(run_this_first)

join = DummyOperator(task_id='join', trigger_rule='one_success', dag=dag)

# branching -> <option> -> follow_<option> -> join, for each option
for option in options:
    branch_task = DummyOperator(task_id=option, dag=dag)
    branch_task.set_upstream(branching)
    follow_task = DummyOperator(task_id='follow_' + option, dag=dag)
    branch_task.set_downstream(follow_task)
    follow_task.set_downstream(join)
def test_lineage(self):
    """End-to-end check of lineage inlet/outlet resolution: templated File
    outlets render against execution_date, AUTO inlets pick up upstream
    outlets, and string inlets resolve by task_id."""
    dag = DAG(dag_id='test_prepare_lineage', start_date=DEFAULT_DATE)

    f1s = "/tmp/does_not_exist_1-{}"
    f2s = "/tmp/does_not_exist_2-{}"
    f3s = "/tmp/does_not_exist_3"
    # file1/file2 carry a Jinja placeholder that renders to the execution date
    file1 = File(f1s.format("{{ execution_date }}"))
    file2 = File(f2s.format("{{ execution_date }}"))
    file3 = File(f3s)

    with dag:
        op1 = DummyOperator(
            task_id='leave1',
            inlets=file1,
            outlets=[
                file2,
            ],
        )
        op2 = DummyOperator(task_id='leave2')
        # AUTO: inherit inlets from direct upstream tasks' outlets
        op3 = DummyOperator(task_id='upstream_level_1', inlets=AUTO, outlets=file3)
        op4 = DummyOperator(task_id='upstream_level_2')
        # string inlets refer to other tasks by task_id
        op5 = DummyOperator(task_id='upstream_level_3', inlets=["leave1", "upstream_level_1"])

        op1.set_downstream(op3)
        op2.set_downstream(op3)
        op3.set_downstream(op4)
        op4.set_downstream(op5)

    dag.clear()

    # execution_date is set in the context in order to avoid creating task instances
    ctx1 = {"ti": TI(task=op1, execution_date=DEFAULT_DATE), "execution_date": DEFAULT_DATE}
    ctx2 = {"ti": TI(task=op2, execution_date=DEFAULT_DATE), "execution_date": DEFAULT_DATE}
    ctx3 = {"ti": TI(task=op3, execution_date=DEFAULT_DATE), "execution_date": DEFAULT_DATE}
    ctx5 = {"ti": TI(task=op5, execution_date=DEFAULT_DATE), "execution_date": DEFAULT_DATE}

    # prepare with manual inlets and outlets
    op1.pre_execute(ctx1)

    assert len(op1.inlets) == 1
    assert op1.inlets[0].url == f1s.format(DEFAULT_DATE)

    assert len(op1.outlets) == 1
    assert op1.outlets[0].url == f2s.format(DEFAULT_DATE)

    # post process with no backend
    op1.post_execute(ctx1)

    op2.pre_execute(ctx2)
    assert len(op2.inlets) == 0
    op2.post_execute(ctx2)

    op3.pre_execute(ctx3)
    # AUTO resolved op1's outlet (file2) as op3's inlet; op2 contributed none.
    assert len(op3.inlets) == 1
    assert op3.inlets[0].url == f2s.format(DEFAULT_DATE)
    assert op3.outlets[0] == file3
    op3.post_execute(ctx3)

    # skip 4

    op5.pre_execute(ctx5)
    # both task_id-string inlets resolved
    assert len(op5.inlets) == 2
    op5.post_execute(ctx5)
# Example DAG: task 1 fans out to tasks 2 and 3, which both fan out to 4/5/6.
#
# Fix: the original imported only `timedelta` from datetime but called
# `datetime(2021, 5, 14)`, which raised NameError when the module was parsed.
from datetime import datetime, timedelta

import airflow
from airflow import DAG
from airflow.operators.dummy import DummyOperator

args = {'owner': 'jakkie', 'start_date': datetime(2021, 5, 14)}

dag = DAG('task5-dag', default_args=args, description='dag for task 5')

with dag:
    task1 = DummyOperator(task_id='Task_1')
    task2 = DummyOperator(task_id='Task_2')
    task3 = DummyOperator(task_id='Task_3')
    task4 = DummyOperator(task_id='Task_4')
    task5 = DummyOperator(task_id='Task_5')
    task6 = DummyOperator(task_id='Task_6')

    # 1 -> (2, 3) -> (4, 5, 6)
    task1.set_downstream([task2, task3])
    task2.set_downstream([task4, task5, task6])
    task3.set_downstream([task4, task5, task6])
def nested_subdag_cycle():
    """Build a two-level nested SubDAG structure ('nested_cycle') whose deepest
    sub-DAG (opSubdag_C) contains a deliberate self-loop."""
    import datetime  # pylint: disable=redefined-outer-name,reimported

    from airflow.models import DAG
    from airflow.operators.dummy import DummyOperator
    from airflow.operators.subdag import SubDagOperator

    dag_name = 'nested_cycle'
    default_args = {
        'owner': 'owner1',
        'start_date': datetime.datetime(2016, 1, 1)
    }
    dag = DAG(dag_name, default_args=default_args)

    # cycle:
    #     A -> op_subdag_0
    #          cycle.op_subdag_0:
    #              -> opSubDag_A
    #                 cycle.op_subdag_0.opSubdag_A:
    #                     -> subdag_a.task
    #              -> opSubdag_B
    #                 cycle.op_subdag_0.opSubdag_B:
    #                     -> subdag_b.task
    #     A -> op_subdag_1
    #          cycle.op_subdag_1:
    #              -> opSubdag_C
    #                 cycle.op_subdag_1.opSubdag_C:
    #                     -> subdag_c.task -> subdag_c.task  >Invalid Loop<
    #              -> opSubDag_D
    #                 cycle.op_subdag_1.opSubdag_D:
    #                     -> subdag_d.task
    with dag:

        def subdag_a():
            # leaf sub-DAG, no cycle
            subdag_a = DAG('nested_cycle.op_subdag_0.opSubdag_A', default_args=default_args)
            DummyOperator(task_id='subdag_a.task', dag=subdag_a)
            return subdag_a

        def subdag_b():
            # leaf sub-DAG, no cycle
            subdag_b = DAG('nested_cycle.op_subdag_0.opSubdag_B', default_args=default_args)
            DummyOperator(task_id='subdag_b.task', dag=subdag_b)
            return subdag_b

        def subdag_c():
            # leaf sub-DAG carrying the deliberate self-loop
            subdag_c = DAG('nested_cycle.op_subdag_1.opSubdag_C', default_args=default_args)
            op_subdag_c_task = DummyOperator(task_id='subdag_c.task', dag=subdag_c)
            # introduce a loop in opSubdag_C
            op_subdag_c_task.set_downstream(op_subdag_c_task)
            return subdag_c

        def subdag_d():
            # leaf sub-DAG, no cycle
            subdag_d = DAG('nested_cycle.op_subdag_1.opSubdag_D', default_args=default_args)
            DummyOperator(task_id='subdag_d.task', dag=subdag_d)
            return subdag_d

        def subdag_0():
            # mid-level sub-DAG wrapping A and B
            subdag_0 = DAG('nested_cycle.op_subdag_0', default_args=default_args)
            SubDagOperator(task_id='opSubdag_A', dag=subdag_0, subdag=subdag_a())
            SubDagOperator(task_id='opSubdag_B', dag=subdag_0, subdag=subdag_b())
            return subdag_0

        def subdag_1():
            # mid-level sub-DAG wrapping C (cyclic) and D
            subdag_1 = DAG('nested_cycle.op_subdag_1', default_args=default_args)
            SubDagOperator(task_id='opSubdag_C', dag=subdag_1, subdag=subdag_c())
            SubDagOperator(task_id='opSubdag_D', dag=subdag_1, subdag=subdag_d())
            return subdag_1

        op_subdag_0 = SubDagOperator(task_id='op_subdag_0', dag=dag, subdag=subdag_0())
        op_subdag_1 = SubDagOperator(task_id='op_subdag_1', dag=dag, subdag=subdag_1())

        op_a = DummyOperator(task_id='A')
        op_a.set_downstream(op_subdag_0)
        op_a.set_downstream(op_subdag_1)

    return dag
# Launch a SageMaker hyper-parameter tuning job and block until it finishes.
# NOTE(review): `tuner_config`, `transform_config`, `dag`, `init`,
# `preprocess_task`, `prepare_task`, `branching` and `train_model_task` are
# defined earlier in this file, outside the visible chunk.
tune_model_task = SageMakerTuningOperator(
    task_id='model_tuning',
    dag=dag,
    config=tuner_config,
    # aws_conn_id='airflow-sagemaker',
    wait_for_completion=True,
    check_interval=30)

# launch sagemaker batch transform job and wait until it completes
# ONE_SUCCESS: fire after whichever of tuning/training branch actually ran.
batch_transform_task = SageMakerTransformOperator(
    task_id='predicting',
    dag=dag,
    config=transform_config,
    # aws_conn_id='airflow-sagemaker',
    wait_for_completion=True,
    check_interval=30,
    trigger_rule=TriggerRule.ONE_SUCCESS)

cleanup_task = DummyOperator(task_id='cleaning_up', dag=dag)

# set the dependencies between tasks
# init -> preprocess -> prepare -> branching -> (tune | train) -> transform -> cleanup
init.set_downstream(preprocess_task)
preprocess_task.set_downstream(prepare_task)
prepare_task.set_downstream(branching)
branching.set_downstream(tune_model_task)
branching.set_downstream(train_model_task)
tune_model_task.set_downstream(batch_transform_task)
train_model_task.set_downstream(batch_transform_task)
batch_transform_task.set_downstream(cleanup_task)