def test_check_task_dependencies(
    self,
    trigger_rule,
    successes,
    skipped,
    failed,
    upstream_failed,
    done,
    flag_upstream_failed,
    expect_state,
    expect_completed,
):
    start_date = datetime.datetime(2016, 2, 1, 0, 0, 0)
    dag = models.DAG("test-dag", start_date=start_date)
    downstream = DummyOperator(task_id="downstream", dag=dag, owner="airflow",
                               trigger_rule=trigger_rule)
    for i in range(5):
        task = DummyOperator(task_id="runme_{}".format(i), dag=dag, owner="airflow")
        task.set_downstream(downstream)
    run_date = task.start_date + datetime.timedelta(days=5)

    ti = TI(downstream, run_date)
    completed = ti.evaluate_trigger_rule(
        successes=successes,
        skipped=skipped,
        failed=failed,
        upstream_failed=upstream_failed,
        done=done,
        flag_upstream_failed=flag_upstream_failed,
    )

    self.assertEqual(completed, expect_completed)
    self.assertEqual(ti.state, expect_state)
def test_dagrun_success_when_all_skipped(self):
    """
    Tests that a DAG run succeeds when all tasks are skipped
    """
    dag = DAG(
        dag_id='test_dagrun_success_when_all_skipped',
        start_date=datetime.datetime(2017, 1, 1)
    )
    dag_task1 = ShortCircuitOperator(
        task_id='test_short_circuit_false',
        dag=dag,
        python_callable=lambda: False)
    dag_task2 = DummyOperator(
        task_id='test_state_skipped1',
        dag=dag)
    dag_task3 = DummyOperator(
        task_id='test_state_skipped2',
        dag=dag)
    dag_task1.set_downstream(dag_task2)
    dag_task2.set_downstream(dag_task3)

    initial_task_states = {
        'test_short_circuit_false': State.SUCCESS,
        'test_state_skipped1': State.SKIPPED,
        'test_state_skipped2': State.SKIPPED,
    }

    dag_run = self.create_dag_run(dag=dag,
                                  state=State.RUNNING,
                                  task_states=initial_task_states)
    updated_dag_state = dag_run.update_state()
    self.assertEqual(State.SUCCESS, updated_dag_state)
def test_check_task_dependencies(self, trigger_rule, successes, skipped,
                                 failed, upstream_failed, done,
                                 flag_upstream_failed,
                                 expect_state, expect_completed):
    start_date = datetime.datetime(2016, 2, 1, 0, 0, 0)
    dag = models.DAG('test-dag', start_date=start_date)
    downstream = DummyOperator(task_id='downstream',
                               dag=dag, owner='airflow',
                               trigger_rule=trigger_rule)
    for i in range(5):
        task = DummyOperator(task_id='runme_{}'.format(i),
                             dag=dag, owner='airflow')
        task.set_downstream(downstream)

    run_date = task.start_date + datetime.timedelta(days=5)
    ti = TI(downstream, run_date)
    dep_results = TriggerRuleDep()._evaluate_trigger_rule(
        ti=ti,
        successes=successes,
        skipped=skipped,
        failed=failed,
        upstream_failed=upstream_failed,
        done=done,
        flag_upstream_failed=flag_upstream_failed)
    completed = all([dep.passed for dep in dep_results])

    self.assertEqual(completed, expect_completed)
    self.assertEqual(ti.state, expect_state)
def test_with_dag_run(self):
    value = False
    dag = DAG('shortcircuit_operator_test_with_dag_run',
              default_args={
                  'owner': 'airflow',
                  'start_date': DEFAULT_DATE
              },
              schedule_interval=INTERVAL)
    short_op = ShortCircuitOperator(task_id='make_choice',
                                    dag=dag,
                                    python_callable=lambda: value)
    branch_1 = DummyOperator(task_id='branch_1', dag=dag)
    branch_1.set_upstream(short_op)
    branch_2 = DummyOperator(task_id='branch_2', dag=dag)
    branch_2.set_upstream(branch_1)
    upstream = DummyOperator(task_id='upstream', dag=dag)
    upstream.set_downstream(short_op)
    dag.clear()

    logging.error("Tasks {}".format(dag.tasks))
    dr = dag.create_dagrun(
        run_id="manual__",
        start_date=datetime.datetime.now(),
        execution_date=DEFAULT_DATE,
        state=State.RUNNING
    )

    upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    tis = dr.get_task_instances()
    self.assertEqual(len(tis), 4)
    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.SKIPPED)
        else:
            raise ValueError('Invalid task id {} found!'.format(ti.task_id))

    value = True
    dag.clear()
    dr.verify_integrity()
    upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    tis = dr.get_task_instances()
    self.assertEqual(len(tis), 4)
    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.NONE)
        else:
            raise ValueError('Invalid task id {} found!'.format(ti.task_id))
def subdag_C():
    subdag_C = DAG(
        'nested_cycle.opSubdag_1.opSubdag_C', default_args=DEFAULT_ARGS)
    opSubdag_C_task = DummyOperator(
        task_id='subdag_C.task', dag=subdag_C)
    # introduce a loop in opSubdag_C
    opSubdag_C_task.set_downstream(opSubdag_C_task)
    return subdag_C
def test_without_dag_run(self):
    """Checks the defensive handling of non-existent tasks in a dag run"""
    value = False
    dag = DAG('shortcircuit_operator_test_without_dag_run',
              default_args={
                  'owner': 'airflow',
                  'start_date': DEFAULT_DATE
              },
              schedule_interval=INTERVAL)
    short_op = ShortCircuitOperator(task_id='make_choice',
                                    dag=dag,
                                    python_callable=lambda: value)
    branch_1 = DummyOperator(task_id='branch_1', dag=dag)
    branch_1.set_upstream(short_op)
    branch_2 = DummyOperator(task_id='branch_2', dag=dag)
    branch_2.set_upstream(branch_1)
    upstream = DummyOperator(task_id='upstream', dag=dag)
    upstream.set_downstream(short_op)
    dag.clear()

    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    session = Session()
    tis = session.query(TI).filter(
        TI.dag_id == dag.dag_id,
        TI.execution_date == DEFAULT_DATE
    )

    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            # should not exist
            raise ValueError('Found unexpected task instance: upstream')
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.SKIPPED)
        else:
            raise ValueError('Invalid task id {} found!'.format(ti.task_id))

    value = True
    dag.clear()

    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            # should not exist
            raise ValueError('Found unexpected task instance: upstream')
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.NONE)
        else:
            raise ValueError('Invalid task id {} found!'.format(ti.task_id))

    session.close()
def test_backfill_ordered_concurrent_execute(self):
    dag = DAG(
        dag_id='test_backfill_ordered_concurrent_execute',
        start_date=DEFAULT_DATE,
        schedule_interval="@daily")

    with dag:
        op1 = DummyOperator(task_id='leave1')
        op2 = DummyOperator(task_id='leave2')
        op3 = DummyOperator(task_id='upstream_level_1')
        op4 = DummyOperator(task_id='upstream_level_2')
        op5 = DummyOperator(task_id='upstream_level_3')
        # order randomly
        op2.set_downstream(op3)
        op1.set_downstream(op3)
        op4.set_downstream(op5)
        op3.set_downstream(op4)

    dag.clear()
    executor = TestExecutor()
    job = BackfillJob(dag=dag,
                      executor=executor,
                      start_date=DEFAULT_DATE,
                      end_date=DEFAULT_DATE + datetime.timedelta(days=2))
    job.run()

    d0 = DEFAULT_DATE
    d1 = d0 + datetime.timedelta(days=1)
    d2 = d1 + datetime.timedelta(days=1)

    # test executor history keeps a list
    history = executor.history

    self.maxDiff = None
    self.assertListEqual(
        # key[0] is dag id, key[3] is try_number; we don't care about either of those here
        [sorted([item[-1].key[1:3] for item in batch]) for batch in history],
        [
            [
                ('leave1', d0),
                ('leave1', d1),
                ('leave1', d2),
                ('leave2', d0),
                ('leave2', d1),
                ('leave2', d2)
            ],
            [('upstream_level_1', d0), ('upstream_level_1', d1), ('upstream_level_1', d2)],
            [('upstream_level_2', d0), ('upstream_level_2', d1), ('upstream_level_2', d2)],
            [('upstream_level_3', d0), ('upstream_level_3', d1), ('upstream_level_3', d2)],
        ]
    )
def __apply_task_to_dag_multiple_executors(self):
    start_task = DummyOperator(task_id=f'{self.task_name}_parallelize',
                               trigger_rule=self.trigger_rule,
                               dag=self.dag)
    end_task = DummyOperator(task_id=self.task_name, dag=self.dag)
    if self.parent:
        self.parent.set_downstream(start_task)
    for i in range(self.executors):
        split_task = self.__create_pod_operator(image=self.image, task_id=i)
        start_task.set_downstream(split_task)
        split_task.set_downstream(end_task)
    return end_task
def basic_cycle():
    import datetime  # pylint: disable=redefined-outer-name,reimported

    from airflow.models import DAG
    from airflow.operators.dummy_operator import DummyOperator

    dag_name = 'cycle_dag'
    default_args = {
        'owner': 'owner1',
        'start_date': datetime.datetime(2016, 1, 1)
    }
    dag = DAG(dag_name, default_args=default_args)

    # A -> A
    with dag:
        op_a = DummyOperator(task_id='A')
        op_a.set_downstream(op_a)

    return dag
def basic_cycle():
    from airflow.models import DAG
    from airflow.operators.dummy_operator import DummyOperator
    import datetime

    DAG_NAME = 'cycle_dag'
    DEFAULT_ARGS = {
        'owner': 'owner1',
        'start_date': datetime.datetime(2016, 1, 1)
    }
    dag = DAG(
        DAG_NAME,
        default_args=DEFAULT_ARGS)

    # A -> A
    with dag:
        opA = DummyOperator(task_id='A')
        opA.set_downstream(opA)

    return dag
def standard_subdag():
    from airflow.models import DAG
    from airflow.operators.dummy_operator import DummyOperator
    from airflow.operators.subdag_operator import SubDagOperator
    import datetime  # pylint: disable=redefined-outer-name,reimported

    dag_name = 'master'
    default_args = {
        'owner': 'owner1',
        'start_date': datetime.datetime(2016, 1, 1)
    }
    dag = DAG(
        dag_name,
        default_args=default_args)

    # master:
    #     A -> opSubDag_0
    #          master.opsubdag_0:
    #              -> subdag_0.task
    #     A -> opSubDag_1
    #          master.opsubdag_1:
    #              -> subdag_1.task

    with dag:
        def subdag_0():
            subdag_0 = DAG('master.op_subdag_0', default_args=default_args)
            DummyOperator(task_id='subdag_0.task', dag=subdag_0)
            return subdag_0

        def subdag_1():
            subdag_1 = DAG('master.op_subdag_1', default_args=default_args)
            DummyOperator(task_id='subdag_1.task', dag=subdag_1)
            return subdag_1

        op_subdag_0 = SubDagOperator(
            task_id='op_subdag_0', dag=dag, subdag=subdag_0())
        op_subdag_1 = SubDagOperator(
            task_id='op_subdag_1', dag=dag, subdag=subdag_1())

        op_a = DummyOperator(task_id='A')
        op_a.set_downstream(op_subdag_0)
        op_a.set_downstream(op_subdag_1)

    return dag
def standard_subdag():
    from airflow.models import DAG
    from airflow.operators.dummy_operator import DummyOperator
    from airflow.operators.subdag_operator import SubDagOperator
    import datetime

    DAG_NAME = 'master'
    DEFAULT_ARGS = {
        'owner': 'owner1',
        'start_date': datetime.datetime(2016, 1, 1)
    }
    dag = DAG(
        DAG_NAME,
        default_args=DEFAULT_ARGS)

    # master:
    #     A -> opSubDag_0
    #          master.opsubdag_0:
    #              -> subdag_0.task
    #     A -> opSubDag_1
    #          master.opsubdag_1:
    #              -> subdag_1.task

    with dag:
        def subdag_0():
            subdag_0 = DAG('master.opSubdag_0', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_0.task', dag=subdag_0)
            return subdag_0

        def subdag_1():
            subdag_1 = DAG('master.opSubdag_1', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_1.task', dag=subdag_1)
            return subdag_1

        opSubdag_0 = SubDagOperator(
            task_id='opSubdag_0', dag=dag, subdag=subdag_0())
        opSubdag_1 = SubDagOperator(
            task_id='opSubdag_1', dag=dag, subdag=subdag_1())

        opA = DummyOperator(task_id='A')
        opA.set_downstream(opSubdag_0)
        opA.set_downstream(opSubdag_1)

    return dag
def test_cycle_large_loop(self):
    # large loop
    dag = DAG('dag', start_date=DEFAULT_DATE, default_args={'owner': 'owner1'})

    # A -> B -> C -> D -> E -> A
    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op3 = DummyOperator(task_id='C')
        op4 = DummyOperator(task_id='D')
        op5 = DummyOperator(task_id='E')
        op1.set_downstream(op2)
        op2.set_downstream(op3)
        op3.set_downstream(op4)
        op4.set_downstream(op5)
        op5.set_downstream(op1)

    with self.assertRaises(AirflowDagCycleException):
        dag.test_cycle()
def __apply_task_to_dag_multiple_executors(self, input_task):
    if not input_task:
        input_task = DummyOperator(task_id=self.input_task_id,
                                   trigger_rule=self.trigger_rule,
                                   dag=self.dag)
    end_task = DummyOperator(task_id=self.task_name, dag=self.dag)
    if self.parent:
        self.parent.set_downstream(input_task)
    for i in range(self.executors):
        split_task = self.__create_pod_operator(
            task_id=f'{self.task_name}_{i}',
            task_split=i,
            image=self.image)
        input_task.set_downstream(split_task)
        split_task.set_downstream(end_task)
    return end_task
def test_dagrun_success_when_all_skipped(self):
    """
    Tests that a DAG run succeeds when all tasks are skipped
    """
    dag = DAG(dag_id='test_dagrun_success_when_all_skipped',
              start_date=timezone.datetime(2017, 1, 1))
    dag_task1 = ShortCircuitOperator(
        task_id='test_short_circuit_false',
        dag=dag,
        python_callable=lambda: False
    )
    dag_task2 = DummyOperator(task_id='test_state_skipped1', dag=dag)
    dag_task3 = DummyOperator(task_id='test_state_skipped2', dag=dag)
    dag_task1.set_downstream(dag_task2)
    dag_task2.set_downstream(dag_task3)

    initial_task_states = {
        'test_short_circuit_false': State.SUCCESS,
        'test_state_skipped1': State.SKIPPED,
        'test_state_skipped2': State.SKIPPED,
    }

    dag_run = self.create_dag_run(dag=dag,
                                  state=State.RUNNING,
                                  task_states=initial_task_states)
    dag_run.update_state()
    self.assertEqual(State.SUCCESS, dag_run.state)
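# A minimal sketch (not one of the tests above) of the short-circuit pattern
# the two preceding tests rely on: when the callable returns False,
# ShortCircuitOperator marks every downstream task as skipped; when it
# returns True, execution continues normally. The dag id and task ids here
# are illustrative only.
from datetime import datetime

from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import ShortCircuitOperator

dag = DAG('sketch_short_circuit', start_date=datetime(2017, 1, 1))
check = ShortCircuitOperator(task_id='check',
                             python_callable=lambda: False,  # always short-circuits
                             dag=dag)
work = DummyOperator(task_id='work', dag=dag)
check.set_downstream(work)  # 'work' is skipped whenever 'check' returns False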
def test_cycle_arbitrary_loop(self):
    # test arbitrary loop
    dag = DAG('dag', start_date=DEFAULT_DATE, default_args={'owner': 'owner1'})

    # E -> A -> B -> F -> A
    #      A -> C -> F
    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op3 = DummyOperator(task_id='C')
        op4 = DummyOperator(task_id='E')
        op5 = DummyOperator(task_id='F')
        op1.set_downstream(op2)
        op1.set_downstream(op3)
        op4.set_downstream(op1)
        op3.set_downstream(op5)
        op2.set_downstream(op5)
        op5.set_downstream(op1)

    with self.assertRaises(AirflowDagCycleException):
        _test_cycle(dag)
def test_dagrun_failure_callback(self):
    def on_failure_callable(context):
        self.assertEqual(context['dag_run'].dag_id,
                         'test_dagrun_failure_callback')

    dag = DAG(
        dag_id='test_dagrun_failure_callback',
        start_date=datetime.datetime(2017, 1, 1),
        on_failure_callback=on_failure_callable,
    )
    dag_task1 = DummyOperator(task_id='test_state_succeeded1', dag=dag)
    dag_task2 = DummyOperator(task_id='test_state_failed2', dag=dag)

    initial_task_states = {
        'test_state_succeeded1': State.SUCCESS,
        'test_state_failed2': State.FAILED,
    }
    dag_task1.set_downstream(dag_task2)

    dag_run = self.create_dag_run(dag=dag,
                                  state=State.RUNNING,
                                  task_states=initial_task_states)
    _, callback = dag_run.update_state()
    self.assertEqual(State.FAILED, dag_run.state)
    # Callbacks are not added until handle_callback = False is passed to
    # dag_run.update_state()
    self.assertIsNone(callback)
def test_sub_set_subdag(self):
    dag = DAG(
        'test_sub_set_subdag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    with dag:
        op1 = DummyOperator(task_id='leave1')
        op2 = DummyOperator(task_id='leave2')
        op3 = DummyOperator(task_id='upstream_level_1')
        op4 = DummyOperator(task_id='upstream_level_2')
        op5 = DummyOperator(task_id='upstream_level_3')
        # order randomly
        op2.set_downstream(op3)
        op1.set_downstream(op3)
        op4.set_downstream(op5)
        op3.set_downstream(op4)

    dag.clear()
    dr = dag.create_dagrun(run_id="test",
                           state=State.RUNNING,
                           execution_date=DEFAULT_DATE,
                           start_date=DEFAULT_DATE)

    executor = MockExecutor()
    sub_dag = dag.sub_dag(task_regex="leave*",
                          include_downstream=False,
                          include_upstream=False)
    job = BackfillJob(dag=sub_dag,
                      start_date=DEFAULT_DATE,
                      end_date=DEFAULT_DATE,
                      executor=executor)
    job.run()

    # the run_id should have changed, so a refresh won't work
    self.assertRaises(sqlalchemy.orm.exc.NoResultFound, dr.refresh_from_db)

    drs = DagRun.find(dag_id=dag.dag_id, execution_date=DEFAULT_DATE)
    dr = drs[0]

    self.assertEqual(
        BackfillJob.ID_FORMAT_PREFIX.format(DEFAULT_DATE.isoformat()),
        dr.run_id)
    for ti in dr.get_task_instances():
        if ti.task_id == 'leave1' or ti.task_id == 'leave2':
            self.assertEqual(State.SUCCESS, ti.state)
        else:
            self.assertEqual(State.NONE, ti.state)
def test_infer_dag(self):
    dag = DAG('dag', start_date=DEFAULT_DATE)
    dag2 = DAG('dag2', start_date=DEFAULT_DATE)

    op1 = DummyOperator(task_id='test_op_1', owner='test')
    op2 = DummyOperator(task_id='test_op_2', owner='test')
    op3 = DummyOperator(task_id='test_op_3', owner='test', dag=dag)
    op4 = DummyOperator(task_id='test_op_4', owner='test', dag=dag2)

    # double check dags
    self.assertEqual(
        [i.has_dag() for i in [op1, op2, op3, op4]],
        [False, False, True, True])

    # can't combine operators with no dags
    self.assertRaises(AirflowException, op1.set_downstream, op2)

    # op2 should infer dag from op1
    op1.dag = dag
    op1.set_downstream(op2)
    self.assertIs(op2.dag, dag)

    # can't assign across multiple DAGs
    self.assertRaises(AirflowException, op1.set_downstream, op4)
    self.assertRaises(AirflowException, op1.set_downstream, [op3, op4])
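# A minimal sketch of the inference behavior the test above asserts:
# set_downstream() copies the DAG from the already-assigned operator onto the
# unassigned one, and the bitshift operators are shorthand for the same calls.
# The dag id and task ids here are illustrative, assuming a plain Airflow 1.x
# environment.
from datetime import datetime

from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator

dag = DAG('sketch_infer_dag', start_date=datetime(2016, 1, 1))
a = DummyOperator(task_id='a', dag=dag)
b = DummyOperator(task_id='b')  # no DAG yet
a.set_downstream(b)             # b infers its DAG from a
assert b.dag is dag

c = DummyOperator(task_id='c')  # the bitshift form does the same thing
b >> c                          # equivalent to b.set_downstream(c)
assert c.dag is dag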
def test_dagrun_success_callback(self):
    def on_success_callable(context):
        self.assertEqual(context['dag_run'].dag_id,
                         'test_dagrun_success_callback')

    dag = DAG(
        dag_id='test_dagrun_success_callback',
        start_date=datetime.datetime(2017, 1, 1),
        on_success_callback=on_success_callable,
    )
    dag_task1 = DummyOperator(task_id='test_state_succeeded1', dag=dag)
    dag_task2 = DummyOperator(task_id='test_state_succeeded2', dag=dag)
    dag_task1.set_downstream(dag_task2)

    initial_task_states = {
        'test_state_succeeded1': State.SUCCESS,
        'test_state_succeeded2': State.SUCCESS,
    }

    dag_run = self.create_dag_run(dag=dag,
                                  state=State.RUNNING,
                                  task_states=initial_task_states)
    updated_dag_state = dag_run.update_state()
    self.assertEqual(State.SUCCESS, updated_dag_state)
def test_cycle_no_cycle(self):
    # test no cycle
    dag = DAG('dag', start_date=DEFAULT_DATE, default_args={'owner': 'owner1'})

    # A -> B -> C
    #      B -> D
    # E -> F
    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op3 = DummyOperator(task_id='C')
        op4 = DummyOperator(task_id='D')
        op5 = DummyOperator(task_id='E')
        op6 = DummyOperator(task_id='F')
        op1.set_downstream(op2)
        op2.set_downstream(op3)
        op2.set_downstream(op4)
        op5.set_downstream(op6)

    self.assertFalse(dag.test_cycle())
def test_dag_topological_sort2(self):
    dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    # C -> (A u B) -> D
    # C -> E
    # ordered: E | D, A | B, C
    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op3 = DummyOperator(task_id='C')
        op4 = DummyOperator(task_id='D')
        op5 = DummyOperator(task_id='E')
        op1.set_downstream(op3)
        op2.set_downstream(op3)
        op1.set_upstream(op4)
        op2.set_upstream(op4)
        op5.set_downstream(op3)

    topological_list = dag.topological_sort()
    logging.info(topological_list)

    set1 = [op4, op5]
    self.assertTrue(topological_list[0] in set1)
    set1.remove(topological_list[0])

    set2 = [op1, op2]
    set2.extend(set1)
    self.assertTrue(topological_list[1] in set2)
    set2.remove(topological_list[1])

    self.assertTrue(topological_list[2] in set2)
    set2.remove(topological_list[2])

    self.assertTrue(topological_list[3] in set2)
    self.assertTrue(topological_list[4] == op3)
class ShortCircuitOperatorTest(unittest.TestCase):
    def setUp(self):
        self.dag = DAG('shortcircuit_operator_test',
                       default_args={
                           'owner': 'airflow',
                           'start_date': DEFAULT_DATE},
                       schedule_interval=INTERVAL)
        self.short_op = ShortCircuitOperator(task_id='make_choice',
                                             dag=self.dag,
                                             python_callable=lambda: self.value)

        self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
        self.branch_1.set_upstream(self.short_op)
        self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag)
        self.branch_2.set_upstream(self.branch_1)
        self.upstream = DummyOperator(task_id='upstream', dag=self.dag)
        self.upstream.set_downstream(self.short_op)
        self.dag.clear()

        self.value = True

    def test_without_dag_run(self):
        """Checks the defensive handling of non-existent tasks in a dag run"""
        self.value = False
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        session = Session()
        tis = session.query(TI).filter(
            TI.dag_id == self.dag.dag_id,
            TI.execution_date == DEFAULT_DATE
        )

        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                # should not exist
                raise ValueError('Found unexpected task instance: upstream')
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                raise ValueError('Invalid task id {} found!'.format(ti.task_id))

        self.value = True
        self.dag.clear()

        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                # should not exist
                raise ValueError('Found unexpected task instance: upstream')
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.NONE)
            else:
                raise ValueError('Invalid task id {} found!'.format(ti.task_id))

        session.close()

    def test_with_dag_run(self):
        self.value = False
        logging.error("Tasks {}".format(self.dag.tasks))
        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=datetime.datetime.now(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        self.upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 4)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                raise ValueError('Invalid task id {} found!'.format(ti.task_id))

        self.value = True
        self.dag.clear()
        dr.verify_integrity()

        self.upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 4)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.NONE)
            else:
                raise ValueError('Invalid task id {} found!'.format(ti.task_id))
def test_dag_topological_sort(self):
    dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    # A -> B
    # A -> C -> D
    # ordered: B, D, C, A or D, B, C, A or D, C, B, A
    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op3 = DummyOperator(task_id='C')
        op4 = DummyOperator(task_id='D')
        op1.set_upstream([op2, op3])
        op3.set_upstream(op4)

    topological_list = dag.topological_sort()
    logging.info(topological_list)

    tasks = [op2, op3, op4]
    self.assertTrue(topological_list[0] in tasks)
    tasks.remove(topological_list[0])
    self.assertTrue(topological_list[1] in tasks)
    tasks.remove(topological_list[1])
    self.assertTrue(topological_list[2] in tasks)
    tasks.remove(topological_list[2])
    self.assertTrue(topological_list[3] == op1)

    dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    # C -> (A u B) -> D
    # C -> E
    # ordered: E | D, A | B, C
    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op3 = DummyOperator(task_id='C')
        op4 = DummyOperator(task_id='D')
        op5 = DummyOperator(task_id='E')
        op1.set_downstream(op3)
        op2.set_downstream(op3)
        op1.set_upstream(op4)
        op2.set_upstream(op4)
        op5.set_downstream(op3)

    topological_list = dag.topological_sort()
    logging.info(topological_list)

    set1 = [op4, op5]
    self.assertTrue(topological_list[0] in set1)
    set1.remove(topological_list[0])

    set2 = [op1, op2]
    set2.extend(set1)
    self.assertTrue(topological_list[1] in set2)
    set2.remove(topological_list[1])

    self.assertTrue(topological_list[2] in set2)
    set2.remove(topological_list[2])

    self.assertTrue(topological_list[3] in set2)
    self.assertTrue(topological_list[4] == op3)

    dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})
    self.assertEqual(tuple(), dag.topological_sort())
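# For reference, a minimal sketch of the ordering the tests above assert.
# This is Kahn's algorithm on a plain adjacency dict, offered as an
# illustration only; it is not Airflow's actual topological_sort()
# implementation, and the function name and structure are assumptions.
from collections import deque


def topological_sort(edges):
    """edges maps node -> list of downstream nodes; returns one valid order."""
    nodes = set(edges) | {d for downs in edges.values() for d in downs}
    indegree = {n: 0 for n in nodes}
    for downs in edges.values():
        for d in downs:
            indegree[d] += 1
    queue = deque(n for n in nodes if indegree[n] == 0)  # seed with all roots
    order = []
    while queue:
        n = queue.popleft()
        order.append(n)
        for d in edges.get(n, []):
            indegree[d] -= 1
            if indegree[d] == 0:  # all upstreams emitted, safe to schedule
                queue.append(d)
    if len(order) != len(nodes):
        raise ValueError('graph has a cycle')  # leftover nodes sit on a cycle
    return order


# e.g. the first graph above, where B, C and D must all come before A:
assert topological_sort({'B': ['A'], 'C': ['A'], 'D': ['C']})[-1] == 'A'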
def test_get_states_count_upstream_ti(self):
    """
    Tests the helper function '_get_states_count_upstream_ti'
    as a unit and inside update_state
    """
    from airflow.ti_deps.dep_context import DepContext

    get_states_count_upstream_ti = TriggerRuleDep._get_states_count_upstream_ti
    session = settings.Session()
    now = timezone.utcnow()
    dag = DAG('test_dagrun_with_pre_tis',
              start_date=DEFAULT_DATE,
              default_args={'owner': 'owner1'})

    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op3 = DummyOperator(task_id='C')
        op4 = DummyOperator(task_id='D')
        op5 = DummyOperator(task_id='E', trigger_rule=TriggerRule.ONE_FAILED)

        op1.set_downstream([op2, op3])  # op1 >> op2, op3
        op4.set_upstream([op3, op2])  # op3, op2 >> op4
        op5.set_upstream([op2, op3, op4])  # (op2, op3, op4) >> op5

    dag.clear()
    dr = dag.create_dagrun(run_id='test_dagrun_with_pre_tis',
                           state=State.RUNNING,
                           execution_date=now,
                           start_date=now)

    ti_op1 = TaskInstance(task=dag.get_task(op1.task_id), execution_date=dr.execution_date)
    ti_op2 = TaskInstance(task=dag.get_task(op2.task_id), execution_date=dr.execution_date)
    ti_op3 = TaskInstance(task=dag.get_task(op3.task_id), execution_date=dr.execution_date)
    ti_op4 = TaskInstance(task=dag.get_task(op4.task_id), execution_date=dr.execution_date)
    ti_op5 = TaskInstance(task=dag.get_task(op5.task_id), execution_date=dr.execution_date)

    ti_op1.set_state(state=State.SUCCESS, session=session)
    ti_op2.set_state(state=State.FAILED, session=session)
    ti_op3.set_state(state=State.SUCCESS, session=session)
    ti_op4.set_state(state=State.SUCCESS, session=session)
    ti_op5.set_state(state=State.SUCCESS, session=session)

    # check handling with cases that tasks are triggered from backfill with no finished tasks
    finished_tasks = DepContext().ensure_finished_tasks(
        ti_op2.task.dag, ti_op2.execution_date, session)
    self.assertEqual(
        get_states_count_upstream_ti(finished_tasks=finished_tasks, ti=ti_op2),
        (1, 0, 0, 0, 1))

    finished_tasks = dr.get_task_instances(
        state=State.finished() + [State.UPSTREAM_FAILED], session=session)
    self.assertEqual(
        get_states_count_upstream_ti(finished_tasks=finished_tasks, ti=ti_op4),
        (1, 0, 1, 0, 2))
    self.assertEqual(
        get_states_count_upstream_ti(finished_tasks=finished_tasks, ti=ti_op5),
        (2, 0, 1, 0, 3))

    dr.update_state()
    self.assertEqual(State.SUCCESS, dr.state)
from __future__ import print_function

from builtins import range
import os
import sys

import airflow
from airflow.operators.dummy_operator import DummyOperator
from airflow.models import DAG

DAG_ID = os.path.basename(__file__).replace('.pyc', '').replace('.py', '')
args = {'owner': 'nehiljain', 'start_date': airflow.utils.dates.days_ago(2)}

dag = DAG(dag_id=DAG_ID, default_args=args, schedule_interval='*/5 * * * *')

a_task = DummyOperator(task_id='a', dag=dag)
b_task = DummyOperator(task_id='b', dag=dag)
c_task = DummyOperator(task_id='c', dag=dag)
d_task = DummyOperator(task_id='d', dag=dag)
e_task = DummyOperator(task_id='e', dag=dag)
f_task = DummyOperator(task_id='f', dag=dag)
g_task = DummyOperator(task_id='g', dag=dag)
h_task = DummyOperator(task_id='h', dag=dag)

a_task.set_downstream(b_task)
b_task.set_downstream([c_task, e_task, g_task])
c_task.set_downstream(d_task)
e_task.set_downstream(f_task)
g_task.set_downstream(h_task)
def make_campaign(name, id_malette, id_campaign, args):
    """
    Create MakePanorama operators in a new DAG

    The purpose of this function is to create a dag as follows:

                 #############################
            ---> # MakePanorama_Id-malette_1 # -----
            |    #############################     |
        #########                              #######
        # Start # ---                     ---> # End #
        #########    |                    |    #######
            |    #############################     |
            ---> # MakePanorama_Id-malette_2 # -----
                 #############################

    :param name: The name of the campaign
    :param id_malette: The id_malette to use
    :param id_campaign: The id_campaign to use
    :param args: Some args to use to create dags
    :return: The new dag
    """
    dag_name = "%s_%s_%s" % (name, id_malette, id_campaign)
    logging.debug("Creating the dag %s for id_malette=%s and id_campaign=%s" % (
        dag_name, id_malette, id_campaign
    ))
    dag = DAG(
        dag_id=dag_name,
        default_args=args,
        schedule_interval=None,
    )

    start = DummyOperator(
        task_id='%s_start' % dag_name,
        default_args=args,
        dag=dag,
    )
    end = DummyOperator(
        task_id='%s_end' % dag_name,
        default_args=args,
        dag=dag,
    )

    # Get all sorted lot
    db_client = RestClient("http://OPV_Master:5000")
    lots = db_client.make(Campaign, id_campaign, id_malette).lots
    lots = sorted(lots, key=attrgetter('id_lot'))

    priority = len(lots) + 1
    for lot in lots:
        # Create the operator to make the panorama and link it with the start
        # and end operator
        task = make_panorama(
            dag, lot.id_lot, lot.id_malette, args, priority_weight=priority
        )
        start.set_downstream(task)
        task.set_downstream(end)
        priority -= 1

    return dag
def create_subdag(dag_parent, label, team):
    dag_id_child = "%s.%s" % (dag_parent.dag_id, label)
    schema = team["schema"][label]

    dag = DAG(
        dag_id=dag_id_child,
        default_args=dag_parent.default_args,
        schedule_interval=dag_parent.schedule_interval,
    )

    # Find the corresponding operator and its parameters
    fn, operator_params = find_label_operator(schema["qos"])

    # Label is declared but there is no node in Neo4j
    count = team["labels"][label]
    if not count:
        DummyOperator(task_id="{}.notask".format(label), dag=dag)
        return dag, operator_params.get("dependencies")

    # Split the nodes into at most 100 chunks of roughly equal length
    if count < 100:
        length = count
    else:
        frac, length = math.modf(count / 100)
        if frac:
            length += 1

    chunks = {
        "{}.chunk.{}".format(label, i): i
        for i in range(0, count, int(length))
    }
    tasks = []
    for name, skip in chunks.items():
        # All custom operators share these parameters
        params = {
            "app": app,
            "team": team,
            "label": label,
            "skip": skip,
            "length": length,
            **operator_params,
        }
        tasks.append(fn(task_id=name, dag=dag, params=params))

    with dag:
        delete_redis_avg_op = PythonOperator(
            task_id="{}.del_redis_average".format(label),
            provide_context=True,
            python_callable=delete_redis_avg,
            params={"app": app, "team": team, "label": label},
        )
        before_subdag_task = BeforeSubdagOperator(
            task_id="{}.before_subdag".format(label),
            params={"app": app, "team": team, "label": label, "count": count},
        )
        after_subdag_task = AfterSubdagOperator(
            task_id="{}.after_subdag".format(label),
            params={"app": app, "team": team, "label": label},
        )
        after_chunks_task = DummyOperator(task_id="{}.dummy".format(label))
        average_op = AverageOperator(
            task_id="{}.average".format(label),
            params={"app": app, "team": team, "label": label},
        )
        daily_worst_op = DailyWorstOperator(
            task_id="{}.daily_worst".format(label),
            params={"app": app, "team": team, "label": label},
        )

        before_subdag_task.set_downstream(delete_redis_avg_op)
        delete_redis_avg_op.set_downstream(tasks)
        after_chunks_task.set_upstream(tasks)
        after_chunks_task.set_downstream([average_op, daily_worst_op])
        after_subdag_task.set_upstream([average_op, daily_worst_op])

    return dag, operator_params.get("dependencies")
}

dag = DAG(
    dag_id='example_branch_operator',
    default_args=args,
    schedule_interval="@daily")

cmd = 'ls -l'
run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

options = ['branch_a', 'branch_b', 'branch_c', 'branch_d']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(run_this_first)

join = DummyOperator(
    task_id='join',
    trigger_rule='one_success',
    dag=dag
)

for option in options:
    t = DummyOperator(task_id=option, dag=dag)
    t.set_upstream(branching)
    dummy_follow = DummyOperator(task_id='follow_' + option, dag=dag)
    t.set_downstream(dummy_follow)
    dummy_follow.set_downstream(join)
def nested_subdags():
    from airflow.models import DAG
    from airflow.operators.dummy_operator import DummyOperator
    from airflow.operators.subdag_operator import SubDagOperator
    import datetime

    DAG_NAME = 'master'
    DEFAULT_ARGS = {
        'owner': 'owner1',
        'start_date': datetime.datetime(2016, 1, 1)
    }
    dag = DAG(
        DAG_NAME,
        default_args=DEFAULT_ARGS)

    # master:
    #     A -> opSubdag_0
    #          master.opSubdag_0:
    #              -> opSubDag_A
    #                 master.opSubdag_0.opSubdag_A:
    #                     -> subdag_A.task
    #              -> opSubdag_B
    #                 master.opSubdag_0.opSubdag_B:
    #                     -> subdag_B.task
    #     A -> opSubdag_1
    #          master.opSubdag_1:
    #              -> opSubdag_C
    #                 master.opSubdag_1.opSubdag_C:
    #                     -> subdag_C.task
    #              -> opSubDag_D
    #                 master.opSubdag_1.opSubdag_D:
    #                     -> subdag_D.task

    with dag:
        def subdag_A():
            subdag_A = DAG(
                'master.opSubdag_0.opSubdag_A', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_A.task', dag=subdag_A)
            return subdag_A

        def subdag_B():
            subdag_B = DAG(
                'master.opSubdag_0.opSubdag_B', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_B.task', dag=subdag_B)
            return subdag_B

        def subdag_C():
            subdag_C = DAG(
                'master.opSubdag_1.opSubdag_C', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_C.task', dag=subdag_C)
            return subdag_C

        def subdag_D():
            subdag_D = DAG(
                'master.opSubdag_1.opSubdag_D', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_D.task', dag=subdag_D)
            return subdag_D

        def subdag_0():
            subdag_0 = DAG('master.opSubdag_0', default_args=DEFAULT_ARGS)
            SubDagOperator(task_id='opSubdag_A', dag=subdag_0, subdag=subdag_A())
            SubDagOperator(task_id='opSubdag_B', dag=subdag_0, subdag=subdag_B())
            return subdag_0

        def subdag_1():
            subdag_1 = DAG('master.opSubdag_1', default_args=DEFAULT_ARGS)
            SubDagOperator(task_id='opSubdag_C', dag=subdag_1, subdag=subdag_C())
            SubDagOperator(task_id='opSubdag_D', dag=subdag_1, subdag=subdag_D())
            return subdag_1

        opSubdag_0 = SubDagOperator(
            task_id='opSubdag_0', dag=dag, subdag=subdag_0())
        opSubdag_1 = SubDagOperator(
            task_id='opSubdag_1', dag=dag, subdag=subdag_1())

        opA = DummyOperator(task_id='A')
        opA.set_downstream(opSubdag_0)
        opA.set_downstream(opSubdag_1)

    return dag
def nested_subdag_cycle():
    from airflow.models import DAG
    from airflow.operators.dummy_operator import DummyOperator
    from airflow.operators.subdag_operator import SubDagOperator
    import datetime

    DAG_NAME = 'nested_cycle'
    DEFAULT_ARGS = {
        'owner': 'owner1',
        'start_date': datetime.datetime(2016, 1, 1)
    }
    dag = DAG(
        DAG_NAME,
        default_args=DEFAULT_ARGS)

    # cycle:
    #     A -> opSubdag_0
    #          cycle.opSubdag_0:
    #              -> opSubDag_A
    #                 cycle.opSubdag_0.opSubdag_A:
    #                     -> subdag_A.task
    #              -> opSubdag_B
    #                 cycle.opSubdag_0.opSubdag_B:
    #                     -> subdag_B.task
    #     A -> opSubdag_1
    #          cycle.opSubdag_1:
    #              -> opSubdag_C
    #                 cycle.opSubdag_1.opSubdag_C:
    #                     -> subdag_C.task -> subdag_C.task  >Invalid Loop<
    #              -> opSubDag_D
    #                 cycle.opSubdag_1.opSubdag_D:
    #                     -> subdag_D.task

    with dag:
        def subdag_A():
            subdag_A = DAG(
                'nested_cycle.opSubdag_0.opSubdag_A', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_A.task', dag=subdag_A)
            return subdag_A

        def subdag_B():
            subdag_B = DAG(
                'nested_cycle.opSubdag_0.opSubdag_B', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_B.task', dag=subdag_B)
            return subdag_B

        def subdag_C():
            subdag_C = DAG(
                'nested_cycle.opSubdag_1.opSubdag_C', default_args=DEFAULT_ARGS)
            opSubdag_C_task = DummyOperator(
                task_id='subdag_C.task', dag=subdag_C)
            # introduce a loop in opSubdag_C
            opSubdag_C_task.set_downstream(opSubdag_C_task)
            return subdag_C

        def subdag_D():
            subdag_D = DAG(
                'nested_cycle.opSubdag_1.opSubdag_D', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_D.task', dag=subdag_D)
            return subdag_D

        def subdag_0():
            subdag_0 = DAG('nested_cycle.opSubdag_0', default_args=DEFAULT_ARGS)
            SubDagOperator(task_id='opSubdag_A', dag=subdag_0, subdag=subdag_A())
            SubDagOperator(task_id='opSubdag_B', dag=subdag_0, subdag=subdag_B())
            return subdag_0

        def subdag_1():
            subdag_1 = DAG('nested_cycle.opSubdag_1', default_args=DEFAULT_ARGS)
            SubDagOperator(task_id='opSubdag_C', dag=subdag_1, subdag=subdag_C())
            SubDagOperator(task_id='opSubdag_D', dag=subdag_1, subdag=subdag_D())
            return subdag_1

        opSubdag_0 = SubDagOperator(
            task_id='opSubdag_0', dag=dag, subdag=subdag_0())
        opSubdag_1 = SubDagOperator(
            task_id='opSubdag_1', dag=dag, subdag=subdag_1())

        opA = DummyOperator(task_id='A')
        opA.set_downstream(opSubdag_0)
        opA.set_downstream(opSubdag_1)

    return dag
subdag7 = DAG(dag_id='test_subdag_deadlock.subdag', default_args=default_args)
subdag7_task1 = PythonOperator(
    task_id='test_subdag_fail',
    dag=subdag7,
    python_callable=fail)
subdag7_task2 = DummyOperator(
    task_id='test_subdag_dummy_1',
    dag=subdag7,
)
subdag7_task3 = DummyOperator(
    task_id='test_subdag_dummy_2',
    dag=subdag7)
dag7_subdag1 = SubDagOperator(
    task_id='subdag',
    dag=dag7,
    subdag=subdag7)
subdag7_task1.set_downstream(subdag7_task2)
subdag7_task2.set_downstream(subdag7_task3)

# DAG tests that a Dag run that doesn't complete but has a root failure is marked running
dag8 = DAG(dag_id='test_dagrun_states_root_fail_unfinished',
           default_args=default_args)
dag8_task1 = DummyOperator(
    task_id='test_dagrun_unfinished',  # The test will unset the task instance
                                       # state after running this test
    dag=dag8,
)
dag8_task2 = PythonOperator(
    task_id='test_dagrun_fail',
    dag=dag8,
    python_callable=fail,
)
section_1 = SubDagOperator(
    task_id='section-1',
    subdag=subdag(DAG_NAME, 'section-1', args),
    default_args=args,
    dag=dag,
)

some_other_task = DummyOperator(
    task_id='some-other-task',
    default_args=args,
    dag=dag,
)

section_2 = SubDagOperator(
    task_id='section-2',
    subdag=subdag(DAG_NAME, 'section-2', args),
    default_args=args,
    dag=dag,
)

end = DummyOperator(
    task_id='end',
    default_args=args,
    dag=dag,
)

start.set_downstream(section_1)
section_1.set_downstream(some_other_task)
some_other_task.set_downstream(section_2)
section_2.set_downstream(end)
    default_args=DEFAULT_ARGS,
    catchup=False,
) as dag:
    start_task = DummyOperator(task_id="start_task")

    get_routes_data_task = AWSAthenaOperator(
        task_id="get_routes_data_task",
        aws_conn_id="aws_default",
        query=GET_ROUTES_QUERY,
        database="db_logistics",
        output_location=f"s3://gln-airflow/commercial/athena-routes-data/{dt.datetime.now():%Y-%m-%d}",
    )

    load_routes_task = PythonOperator(
        task_id="load_routes_task",
        python_callable=load_athena_to_postgres,
        op_kwargs={
            "p_filename": ROUTE_FILENAME,
            "p_buckpref": f"commercial/athena-routes-data/{dt.datetime.now():%Y-%m-%d}",
            "p_staging_table": "sales.transportation_zones_staging",
            "p_target_table": "sales.transportation_zones",
            "p_target_sql": PG_LOAD_ROUTES_SQL,
        },
    )

    start_task.set_downstream(get_routes_data_task)
    get_routes_data_task.set_downstream(load_routes_task)
def nested_subdag_cycle():
    import datetime  # pylint: disable=redefined-outer-name,reimported

    from airflow.models import DAG
    from airflow.operators.dummy_operator import DummyOperator
    from airflow.operators.subdag_operator import SubDagOperator

    dag_name = 'nested_cycle'
    default_args = {
        'owner': 'owner1',
        'start_date': datetime.datetime(2016, 1, 1)
    }
    dag = DAG(dag_name, default_args=default_args)

    # cycle:
    #     A -> op_subdag_0
    #          cycle.op_subdag_0:
    #              -> opSubDag_A
    #                 cycle.op_subdag_0.opSubdag_A:
    #                     -> subdag_a.task
    #              -> opSubdag_B
    #                 cycle.op_subdag_0.opSubdag_B:
    #                     -> subdag_b.task
    #     A -> op_subdag_1
    #          cycle.op_subdag_1:
    #              -> opSubdag_C
    #                 cycle.op_subdag_1.opSubdag_C:
    #                     -> subdag_c.task -> subdag_c.task  >Invalid Loop<
    #              -> opSubDag_D
    #                 cycle.op_subdag_1.opSubdag_D:
    #                     -> subdag_d.task

    with dag:
        def subdag_a():
            subdag_a = DAG('nested_cycle.op_subdag_0.opSubdag_A',
                           default_args=default_args)
            DummyOperator(task_id='subdag_a.task', dag=subdag_a)
            return subdag_a

        def subdag_b():
            subdag_b = DAG('nested_cycle.op_subdag_0.opSubdag_B',
                           default_args=default_args)
            DummyOperator(task_id='subdag_b.task', dag=subdag_b)
            return subdag_b

        def subdag_c():
            subdag_c = DAG('nested_cycle.op_subdag_1.opSubdag_C',
                           default_args=default_args)
            op_subdag_c_task = DummyOperator(task_id='subdag_c.task',
                                             dag=subdag_c)
            # introduce a loop in opSubdag_C
            op_subdag_c_task.set_downstream(op_subdag_c_task)
            return subdag_c

        def subdag_d():
            subdag_d = DAG('nested_cycle.op_subdag_1.opSubdag_D',
                           default_args=default_args)
            DummyOperator(task_id='subdag_d.task', dag=subdag_d)
            return subdag_d

        def subdag_0():
            subdag_0 = DAG('nested_cycle.op_subdag_0', default_args=default_args)
            SubDagOperator(task_id='opSubdag_A', dag=subdag_0, subdag=subdag_a())
            SubDagOperator(task_id='opSubdag_B', dag=subdag_0, subdag=subdag_b())
            return subdag_0

        def subdag_1():
            subdag_1 = DAG('nested_cycle.op_subdag_1', default_args=default_args)
            SubDagOperator(task_id='opSubdag_C', dag=subdag_1, subdag=subdag_c())
            SubDagOperator(task_id='opSubdag_D', dag=subdag_1, subdag=subdag_d())
            return subdag_1

        op_subdag_0 = SubDagOperator(task_id='op_subdag_0', dag=dag,
                                     subdag=subdag_0())
        op_subdag_1 = SubDagOperator(task_id='op_subdag_1', dag=dag,
                                     subdag=subdag_1())

        op_a = DummyOperator(task_id='A')
        op_a.set_downstream(op_subdag_0)
        op_a.set_downstream(op_subdag_1)

    return dag
    python_callable=preprocess.preprocess,
    op_kwargs=config["preprocess_data"])

train_task = PythonOperator(
    task_id='train',
    dag=dag,
    provide_context=False,
    python_callable=preprocess.preprocess,
    op_kwargs=config["preprocess_data"])

model_task = PythonOperator(
    task_id='model',
    dag=dag,
    provide_context=False,
    python_callable=preprocess.preprocess,
    op_kwargs=config["preprocess_data"])

deploy_task = PythonOperator(
    task_id='deploy',
    dag=dag,
    provide_context=False,
    python_callable=preprocess.preprocess,
    op_kwargs=config["preprocess_data"])

# set the dependencies between tasks
init.set_downstream(process_task)
process_task.set_downstream(train_task)
train_task.set_downstream(model_task)
model_task.set_downstream(deploy_task)
seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_branch_operator',
          default_args=args,
          schedule_interval="@daily")

cmd = 'ls -l'
run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

options = ['branch_a', 'branch_b', 'branch_c', 'branch_d']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(run_this_first)

# 'one_success' lets the join run even though every branch but the chosen
# one is skipped
join = DummyOperator(task_id='join',
                     trigger_rule='one_success',
                     dag=dag)

for option in options:
    t = DummyOperator(task_id=option, dag=dag)
    t.set_upstream(branching)
    dummy_follow = DummyOperator(task_id='follow_' + option, dag=dag)
    t.set_downstream(dummy_follow)
    dummy_follow.set_downstream(join)
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from datetime import datetime

from airflow.models import DAG
from airflow.operators.python_operator import ShortCircuitOperator
from airflow.operators.dummy_operator import DummyOperator

# DAG that has its short circuit op fail and skip multiple downstream tasks
dag = DAG(dag_id='test_dagrun_short_circuit_false',
          start_date=datetime(2017, 1, 1))
dag_task1 = ShortCircuitOperator(
    task_id='test_short_circuit_false',
    dag=dag,
    python_callable=lambda: False)
dag_task2 = DummyOperator(task_id='test_state_skipped1', dag=dag)
dag_task3 = DummyOperator(task_id='test_state_skipped2', dag=dag)
dag_task1.set_downstream(dag_task2)
dag_task2.set_downstream(dag_task3)
from airflow.models import DAG

args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(12),
}

dag = DAG(dag_id='example_branch_operator_further_back',
          default_args=args,
          schedule_interval="@daily")

cmd = 'ls -l'
run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

options = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN']


def return_current_day(**context):
    # pick the branch matching the execution date's weekday
    return options[context["execution_date"].weekday()]


branching = BranchPythonOperator(task_id='branching',
                                 python_callable=return_current_day,
                                 provide_context=True,
                                 dag=dag)
branching.set_upstream(run_this_first)

join = DummyOperator(task_id='join',
                     trigger_rule='one_success',
                     dag=dag)

for option in options:
    t = DummyOperator(task_id=option, dag=dag)
    t.set_upstream(branching)
    t.set_downstream(join)
def test_lineage(self, _get_backend):
    backend = mock.Mock()
    send_mock = mock.Mock()
    backend.send_lineage = send_mock

    _get_backend.return_value = backend

    dag = DAG(
        dag_id='test_prepare_lineage',
        start_date=DEFAULT_DATE
    )

    f1 = File("/tmp/does_not_exist_1")
    f2 = File("/tmp/does_not_exist_2")
    f3 = File("/tmp/does_not_exist_3")

    with dag:
        op1 = DummyOperator(task_id='leave1',
                            inlets={"datasets": [f1, ]},
                            outlets={"datasets": [f2, ]})
        op2 = DummyOperator(task_id='leave2')
        op3 = DummyOperator(task_id='upstream_level_1',
                            inlets={"auto": True},
                            outlets={"datasets": [f3, ]})
        op4 = DummyOperator(task_id='upstream_level_2')
        op5 = DummyOperator(task_id='upstream_level_3',
                            inlets={"task_ids": ["leave1", "upstream_level_1"]})

        op1.set_downstream(op3)
        op2.set_downstream(op3)
        op3.set_downstream(op4)
        op4.set_downstream(op5)

    ctx1 = {"ti": TI(task=op1, execution_date=DEFAULT_DATE)}
    ctx2 = {"ti": TI(task=op2, execution_date=DEFAULT_DATE)}
    ctx3 = {"ti": TI(task=op3, execution_date=DEFAULT_DATE)}
    ctx5 = {"ti": TI(task=op5, execution_date=DEFAULT_DATE)}

    func = mock.Mock()
    func.__name__ = 'foo'

    # prepare with manual inlets and outlets
    prep = prepare_lineage(func)
    prep(op1, ctx1)

    self.assertEqual(len(op1.inlets), 1)
    self.assertEqual(op1.inlets[0], f1)

    self.assertEqual(len(op1.outlets), 1)
    self.assertEqual(op1.outlets[0], f2)

    # post process with no backend
    post = apply_lineage(func)
    post(op1, ctx1)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()

    prep(op2, ctx2)
    self.assertEqual(len(op2.inlets), 0)
    post(op2, ctx2)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()

    prep(op3, ctx3)
    self.assertEqual(len(op3.inlets), 1)
    self.assertEqual(op3.inlets[0].qualified_name, f2.qualified_name)
    post(op3, ctx3)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()

    # skip 4
    prep(op5, ctx5)
    self.assertEqual(len(op5.inlets), 2)
    post(op5, ctx5)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()
def test_cycle(self):
    # test empty
    dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    self.assertFalse(dag.test_cycle())

    # test single task
    dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    with dag:
        opA = DummyOperator(task_id='A')

    self.assertFalse(dag.test_cycle())

    # test no cycle
    dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    # A -> B -> C
    #      B -> D
    # E -> F
    with dag:
        opA = DummyOperator(task_id='A')
        opB = DummyOperator(task_id='B')
        opC = DummyOperator(task_id='C')
        opD = DummyOperator(task_id='D')
        opE = DummyOperator(task_id='E')
        opF = DummyOperator(task_id='F')

        opA.set_downstream(opB)
        opB.set_downstream(opC)
        opB.set_downstream(opD)
        opE.set_downstream(opF)

    self.assertFalse(dag.test_cycle())

    # test self loop
    dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    # A -> A
    with dag:
        opA = DummyOperator(task_id='A')
        opA.set_downstream(opA)

    with self.assertRaises(AirflowDagCycleException):
        dag.test_cycle()

    # test downstream self loop
    dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    # A -> B -> C -> D -> E -> E
    with dag:
        opA = DummyOperator(task_id='A')
        opB = DummyOperator(task_id='B')
        opC = DummyOperator(task_id='C')
        opD = DummyOperator(task_id='D')
        opE = DummyOperator(task_id='E')

        opA.set_downstream(opB)
        opB.set_downstream(opC)
        opC.set_downstream(opD)
        opD.set_downstream(opE)
        opE.set_downstream(opE)

    with self.assertRaises(AirflowDagCycleException):
        dag.test_cycle()

    # large loop
    dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    # A -> B -> C -> D -> E -> A
    with dag:
        opA = DummyOperator(task_id='A')
        opB = DummyOperator(task_id='B')
        opC = DummyOperator(task_id='C')
        opD = DummyOperator(task_id='D')
        opE = DummyOperator(task_id='E')

        opA.set_downstream(opB)
        opB.set_downstream(opC)
        opC.set_downstream(opD)
        opD.set_downstream(opE)
        opE.set_downstream(opA)

    with self.assertRaises(AirflowDagCycleException):
        dag.test_cycle()

    # test arbitrary loop
    dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    # E -> A -> B -> F -> A
    #      A -> C -> F
    with dag:
        opA = DummyOperator(task_id='A')
        opB = DummyOperator(task_id='B')
        opC = DummyOperator(task_id='C')
        opD = DummyOperator(task_id='D')
        opE = DummyOperator(task_id='E')
        opF = DummyOperator(task_id='F')

        opA.set_downstream(opB)
        opA.set_downstream(opC)
        opE.set_downstream(opA)
        opC.set_downstream(opF)
        opB.set_downstream(opF)
        opF.set_downstream(opA)

    with self.assertRaises(AirflowDagCycleException):
        dag.test_cycle()
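# A minimal sketch of the check the cycle tests above exercise: a recursive
# depth-first search that reports a back edge. This is illustrative only, not
# Airflow's actual DAG.test_cycle() implementation; the function name and the
# adjacency-dict representation are assumptions.
def has_cycle(edges):
    """edges maps node -> list of downstream nodes; True if a cycle exists."""
    WHITE, GRAY, BLACK = 0, 1, 2  # unvisited / on current path / fully explored
    nodes = set(edges) | {d for downs in edges.values() for d in downs}
    color = {n: WHITE for n in nodes}

    def visit(node):
        color[node] = GRAY
        for nxt in edges.get(node, []):
            if color[nxt] == GRAY:  # back edge: nxt is already on the current path
                return True
            if color[nxt] == WHITE and visit(nxt):
                return True
        color[node] = BLACK
        return False

    return any(color[n] == WHITE and visit(n) for n in nodes)


# mirrors the shapes tested above
assert not has_cycle({'A': ['B'], 'B': ['C', 'D'], 'E': ['F']})  # no cycle
assert has_cycle({'A': ['A']})                                   # self loop
assert has_cycle({'A': ['B'], 'B': ['C'], 'C': ['A']})           # larger loop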