def test_dagrun_success_when_all_skipped(self):
    """
    Tests that a DAG run succeeds when all tasks are skipped
    """
    dag = DAG(
        dag_id='test_dagrun_success_when_all_skipped',
        start_date=datetime.datetime(2017, 1, 1)
    )
    dag_task1 = ShortCircuitOperator(
        task_id='test_short_circuit_false',
        dag=dag,
        python_callable=lambda: False)
    dag_task2 = DummyOperator(
        task_id='test_state_skipped1',
        dag=dag)
    dag_task3 = DummyOperator(
        task_id='test_state_skipped2',
        dag=dag)
    dag_task1.set_downstream(dag_task2)
    dag_task2.set_downstream(dag_task3)

    initial_task_states = {
        'test_short_circuit_false': State.SUCCESS,
        'test_state_skipped1': State.SKIPPED,
        'test_state_skipped2': State.SKIPPED,
    }

    dag_run = self.create_dag_run(dag=dag,
                                  state=State.RUNNING,
                                  task_states=initial_task_states)
    updated_dag_state = dag_run.update_state()
    self.assertEqual(State.SUCCESS, updated_dag_state)
def test_check_task_dependencies(self, trigger_rule, successes, skipped,
                                 failed, upstream_failed, done,
                                 flag_upstream_failed,
                                 expect_state, expect_completed):
    start_date = datetime.datetime(2016, 2, 1, 0, 0, 0)
    dag = models.DAG('test-dag', start_date=start_date)
    downstream = DummyOperator(task_id='downstream',
                               dag=dag, owner='airflow',
                               trigger_rule=trigger_rule)
    for i in range(5):
        task = DummyOperator(task_id='runme_{}'.format(i),
                             dag=dag, owner='airflow')
        task.set_downstream(downstream)
    run_date = task.start_date + datetime.timedelta(days=5)

    ti = TI(downstream, run_date)
    dep_results = TriggerRuleDep()._evaluate_trigger_rule(
        ti=ti,
        successes=successes,
        skipped=skipped,
        failed=failed,
        upstream_failed=upstream_failed,
        done=done,
        flag_upstream_failed=flag_upstream_failed)
    completed = all([dep.passed for dep in dep_results])

    self.assertEqual(completed, expect_completed)
    self.assertEqual(ti.state, expect_state)
def test_infer_dag(self):
    dag = DAG('dag', start_date=DEFAULT_DATE)
    dag2 = DAG('dag2', start_date=DEFAULT_DATE)

    op1 = DummyOperator(task_id='test_op_1', owner='test')
    op2 = DummyOperator(task_id='test_op_2', owner='test')
    op3 = DummyOperator(task_id='test_op_3', owner='test', dag=dag)
    op4 = DummyOperator(task_id='test_op_4', owner='test', dag=dag2)

    # double check dags
    self.assertEqual(
        [i.has_dag() for i in [op1, op2, op3, op4]],
        [False, False, True, True])

    # can't combine operators with no dags
    self.assertRaises(AirflowException, op1.set_downstream, op2)

    # op2 should infer dag from op1
    op1.dag = dag
    op1.set_downstream(op2)
    self.assertIs(op2.dag, dag)

    # can't assign across multiple DAGs
    self.assertRaises(AirflowException, op1.set_downstream, op4)
    self.assertRaises(AirflowException, op1.set_downstream, [op3, op4])
def test_skipping(self):
    latest_task = LatestOnlyOperator(
        task_id='latest',
        dag=self.dag)
    downstream_task = DummyOperator(
        task_id='downstream',
        dag=self.dag)
    downstream_task.set_upstream(latest_task)

    latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)

    latest_instances = get_task_instances('latest')
    exec_date_to_latest_state = {
        ti.execution_date: ti.state for ti in latest_instances}
    assert exec_date_to_latest_state == {
        datetime.datetime(2016, 1, 1): 'success',
        datetime.datetime(2016, 1, 1, 12): 'success',
        datetime.datetime(2016, 1, 2): 'success',
    }

    downstream_instances = get_task_instances('downstream')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    assert exec_date_to_downstream_state == {
        datetime.datetime(2016, 1, 1): 'skipped',
        datetime.datetime(2016, 1, 1, 12): 'skipped',
        datetime.datetime(2016, 1, 2): 'success',
    }
def test_operator_clear(self):
    dag = DAG('test_operator_clear', start_date=DEFAULT_DATE,
              end_date=DEFAULT_DATE + datetime.timedelta(days=10))
    t1 = DummyOperator(task_id='bash_op', owner='test', dag=dag)
    t2 = DummyOperator(task_id='dummy_op', owner='test', dag=dag, retries=1)
    t2.set_upstream(t1)

    ti1 = TI(task=t1, execution_date=DEFAULT_DATE)
    ti2 = TI(task=t2, execution_date=DEFAULT_DATE)
    ti2.run()
    # Dependency not met
    self.assertEqual(ti2.try_number, 1)
    self.assertEqual(ti2.max_tries, 1)

    t2.clear(upstream=True)
    ti1.run()
    ti2.run()
    self.assertEqual(ti1.try_number, 2)
    # max_tries is 0 because there is no task instance in db for ti1
    # so clear won't change the max_tries.
    self.assertEqual(ti1.max_tries, 0)
    self.assertEqual(ti2.try_number, 2)
    # try_number (0) + retries(1)
    self.assertEqual(ti2.max_tries, 1)
def test_check_task_dependencies(
    self,
    trigger_rule,
    successes,
    skipped,
    failed,
    upstream_failed,
    done,
    flag_upstream_failed,
    expect_state,
    expect_completed,
):
    start_date = datetime.datetime(2016, 2, 1, 0, 0, 0)
    dag = models.DAG("test-dag", start_date=start_date)
    downstream = DummyOperator(task_id="downstream", dag=dag, owner="airflow", trigger_rule=trigger_rule)
    for i in range(5):
        task = DummyOperator(task_id="runme_{}".format(i), dag=dag, owner="airflow")
        task.set_downstream(downstream)
    run_date = task.start_date + datetime.timedelta(days=5)

    ti = TI(downstream, run_date)
    completed = ti.evaluate_trigger_rule(
        successes=successes,
        skipped=skipped,
        failed=failed,
        upstream_failed=upstream_failed,
        done=done,
        flag_upstream_failed=flag_upstream_failed,
    )

    self.assertEqual(completed, expect_completed)
    self.assertEqual(ti.state, expect_state)
class BranchOperatorTest(unittest.TestCase):
    def setUp(self):
        self.dag = DAG('branch_operator_test',
                       default_args={
                           'owner': 'airflow',
                           'start_date': DEFAULT_DATE},
                       schedule_interval=INTERVAL)
        self.branch_op = BranchPythonOperator(task_id='make_choice',
                                              dag=self.dag,
                                              python_callable=lambda: 'branch_1')

        self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
        self.branch_1.set_upstream(self.branch_op)
        self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag)
        self.branch_2.set_upstream(self.branch_op)
        self.dag.clear()

    def test_without_dag_run(self):
        """This checks the defensive against non existent tasks in a dag run"""
        self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        session = Session()
        tis = session.query(TI).filter(
            TI.dag_id == self.dag.dag_id,
            TI.execution_date == DEFAULT_DATE
        )
        session.close()

        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                # should exist with state None
                self.assertEqual(ti.state, State.NONE)
            elif ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                raise ValueError('Invalid task id {} found!'.format(ti.task_id))

    def test_with_dag_run(self):
        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=datetime.datetime.now(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                self.assertEqual(ti.state, State.NONE)
            elif ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                raise ValueError('Invalid task id {} found!'.format(ti.task_id))
def subdag_C():
    subdag_C = DAG(
        'nested_cycle.opSubdag_1.opSubdag_C', default_args=DEFAULT_ARGS)
    opSubdag_C_task = DummyOperator(
        task_id='subdag_C.task', dag=subdag_C)
    # introduce a loop in opSubdag_C
    opSubdag_C_task.set_downstream(opSubdag_C_task)
    return subdag_C
def setUp(self):
    self.dag = DAG('branch_operator_test',
                   default_args={
                       'owner': 'airflow',
                       'start_date': DEFAULT_DATE},
                   schedule_interval=INTERVAL)
    self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
    self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag)
def test_dag_as_context_manager(self):
    """
    Test DAG as a context manager.
    When used as a context manager, Operators are automatically added to
    the DAG (unless they specify a different DAG)
    """
    dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})
    dag2 = DAG(
        'dag2',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner2'})

    with dag:
        op1 = DummyOperator(task_id='op1')
        op2 = DummyOperator(task_id='op2', dag=dag2)

    self.assertIs(op1.dag, dag)
    self.assertEqual(op1.owner, 'owner1')
    self.assertIs(op2.dag, dag2)
    self.assertEqual(op2.owner, 'owner2')

    with dag2:
        op3 = DummyOperator(task_id='op3')

    self.assertIs(op3.dag, dag2)
    self.assertEqual(op3.owner, 'owner2')

    with dag:
        with dag2:
            op4 = DummyOperator(task_id='op4')
        op5 = DummyOperator(task_id='op5')

    self.assertIs(op4.dag, dag2)
    self.assertIs(op5.dag, dag)
    self.assertEqual(op4.owner, 'owner2')
    self.assertEqual(op5.owner, 'owner1')

    with DAG('creating_dag_in_cm', start_date=DEFAULT_DATE) as dag:
        DummyOperator(task_id='op6')

    self.assertEqual(dag.dag_id, 'creating_dag_in_cm')
    self.assertEqual(dag.tasks[0].task_id, 'op6')

    with dag:
        with dag:
            op7 = DummyOperator(task_id='op7')
        op8 = DummyOperator(task_id='op8')
    # op9 reuses task_id 'op8', which is fine because it is reassigned to dag2
    op9 = DummyOperator(task_id='op8')
    op9.dag = dag2

    self.assertEqual(op7.dag, dag)
    self.assertEqual(op8.dag, dag)
    self.assertEqual(op9.dag, dag2)
def test_render_template_field(self):
    """Tests if render_template from a field works"""

    dag = DAG('test-dag', start_date=DEFAULT_DATE)

    with dag:
        task = DummyOperator(task_id='op1')

    result = task.render_template('', '{{ foo }}', dict(foo='bar'))
    self.assertEqual(result, 'bar')
def test_render_template_field_undefined_strict(self):
    """Tests that render_template raises for an undefined variable
       when the DAG uses StrictUndefined"""

    dag = DAG('test-dag',
              start_date=DEFAULT_DATE,
              template_undefined=jinja2.StrictUndefined)

    with dag:
        task = DummyOperator(task_id='op1')

    with self.assertRaises(jinja2.UndefinedError):
        task.render_template('', '{{ foo }}', {})
def test_render_template_field_macro(self):
    """Tests if render_template from a field works,
       if a custom macro was defined"""

    dag = DAG('test-dag',
              start_date=DEFAULT_DATE,
              user_defined_macros=dict(foo='bar'))

    with dag:
        task = DummyOperator(task_id='op1')

    result = task.render_template('', '{{ foo }}', dict())
    self.assertEqual(result, 'bar')
def test_render_template_datetime_field(self):
    """Tests if render_template from a datetime field works"""

    dag = DAG('test-dag', start_date=DEFAULT_DATE)

    with dag:
        task = DummyOperator(task_id='op1')

    self.assertEqual(
        task.render_template('', datetime.datetime(2018, 12, 6, 10, 55), {'foo': 'bar'}),
        datetime.datetime(2018, 12, 6, 10, 55)
    )
def test_render_template_list_field(self):
    """Tests if render_template from a list field works"""

    dag = DAG('test-dag', start_date=DEFAULT_DATE)

    with dag:
        task = DummyOperator(task_id='op1')

    self.assertListEqual(
        task.render_template('', ['{{ foo }}_1', '{{ foo }}_2'], {'foo': 'bar'}),
        ['bar_1', 'bar_2']
    )
def test_render_template_dict_field(self):
    """Tests if render_template from a dict field works"""

    dag = DAG('test-dag', start_date=DEFAULT_DATE)

    with dag:
        task = DummyOperator(task_id='op1')

    self.assertDictEqual(
        task.render_template('', {'key1': '{{ foo }}_1', 'key2': '{{ foo }}_2'}, {'foo': 'bar'}),
        {'key1': 'bar_1', 'key2': 'bar_2'}
    )
def test_without_dag_run(self):
    """This checks the defensive against non existent tasks in a dag run"""
    value = False
    dag = DAG('shortcircuit_operator_test_without_dag_run',
              default_args={
                  'owner': 'airflow',
                  'start_date': DEFAULT_DATE
              },
              schedule_interval=INTERVAL)
    short_op = ShortCircuitOperator(task_id='make_choice',
                                    dag=dag,
                                    python_callable=lambda: value)
    branch_1 = DummyOperator(task_id='branch_1', dag=dag)
    branch_1.set_upstream(short_op)
    branch_2 = DummyOperator(task_id='branch_2', dag=dag)
    branch_2.set_upstream(branch_1)
    upstream = DummyOperator(task_id='upstream', dag=dag)
    upstream.set_downstream(short_op)
    dag.clear()

    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    session = Session()
    tis = session.query(TI).filter(
        TI.dag_id == dag.dag_id,
        TI.execution_date == DEFAULT_DATE
    )

    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            # should not exist
            raise ValueError('Unexpected task id {} found!'.format(ti.task_id))
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.SKIPPED)
        else:
            raise ValueError('Invalid task id {} found!'.format(ti.task_id))

    value = True
    dag.clear()

    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            # should not exist
            raise ValueError('Unexpected task id {} found!'.format(ti.task_id))
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.NONE)
        else:
            raise ValueError('Invalid task id {} found!'.format(ti.task_id))

    session.close()
def test_render_template_object_field(self):
    """Tests if render_template from an object field works"""

    dag = DAG('test-dag', start_date=DEFAULT_DATE)

    with dag:
        task = DummyOperator(task_id='op1')

    test_object = object()
    self.assertIs(
        task.render_template('', test_object, {'foo': 'bar'}),
        test_object
    )
def test_render_template_dict_field_with_templated_keys(self):
    """Tests if render_template from a dict field works as expected:
    dictionary keys are not templated"""

    dag = DAG('test-dag', start_date=DEFAULT_DATE)

    with dag:
        task = DummyOperator(task_id='op1')

    self.assertDictEqual(
        task.render_template('', {'key_{{ foo }}_1': 1, 'key_2': '{{ foo }}_2'}, {'foo': 'bar'}),
        {'key_{{ foo }}_1': 1, 'key_2': 'bar_2'}
    )
def test_render_template_UUID_field(self):
    """Tests if render_template from a UUID field works"""

    dag = DAG('test-dag', start_date=DEFAULT_DATE)

    with dag:
        task = DummyOperator(task_id='op1')

    random_uuid = uuid.uuid4()
    self.assertIs(
        task.render_template('', random_uuid, {'foo': 'bar'}),
        random_uuid
    )
def _create_events_branch(self, task_id):
    """Create the DAG branch with sensor and operator (to be called by each subclass)."""
    self.decrypt_connection()
    tables = self.get_events()
    tables_op = DummyOperator(task_id=task_id,
                              dag=self.dag,
                              resources=dict(organizationId='astronomer'))
    tables_op.set_upstream(self.upstream_task)
    for table in tables:
        sensor = self.create_key_sensor(table=table)
        sensor.set_upstream(tables_op)
        copy_task = self.create_copy_operator(table=table)
        if not copy_task:
            logger.info('Skipping table due to invalid config')
            continue
        copy_task.set_upstream(sensor)
def setUp(self):
    self.dag = DAG('branch_operator_test',
                   default_args={
                       'owner': 'airflow',
                       'start_date': DEFAULT_DATE},
                   schedule_interval=INTERVAL)
    self.branch_op = BranchPythonOperator(task_id='make_choice',
                                          dag=self.dag,
                                          python_callable=lambda: 'branch_1')

    self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
    self.branch_1.set_upstream(self.branch_op)
    self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag)
    self.branch_2.set_upstream(self.branch_op)
    self.dag.clear()
def test_branch_list_without_dag_run(self):
    """This checks if the BranchPythonOperator supports branching off to a list of tasks."""
    self.branch_op = BranchPythonOperator(task_id='make_choice',
                                          dag=self.dag,
                                          python_callable=lambda: ['branch_1', 'branch_2'])
    self.branch_1.set_upstream(self.branch_op)
    self.branch_2.set_upstream(self.branch_op)
    self.branch_3 = DummyOperator(task_id='branch_3', dag=self.dag)
    self.branch_3.set_upstream(self.branch_op)
    self.dag.clear()

    self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    with create_session() as session:
        tis = session.query(TI).filter(
            TI.dag_id == self.dag.dag_id,
            TI.execution_date == DEFAULT_DATE
        )

        expected = {
            "make_choice": State.SUCCESS,
            "branch_1": State.NONE,
            "branch_2": State.NONE,
            "branch_3": State.SKIPPED,
        }

        for ti in tis:
            if ti.task_id in expected:
                self.assertEqual(ti.state, expected[ti.task_id])
            else:
                raise Exception
def test_render_template_field_filter(self):
    """Tests if render_template from a field works,
       if a custom filter was defined"""

    def jinja_udf(name):
        return 'Hello %s' % name

    dag = DAG('test-dag',
              start_date=DEFAULT_DATE,
              user_defined_filters=dict(hello=jinja_udf))

    with dag:
        task = DummyOperator(task_id='op1')

    result = task.render_template('', "{{ 'world' | hello }}", dict())
    self.assertEqual(result, 'Hello world')
def generated_sub_dag(parent_dag_name, child_dag_name, start_date, schedule_interval):
    dag = DAG(
        '%s.%s' % (parent_dag_name, child_dag_name),
        schedule_interval=schedule_interval,
        default_args=default_args
    )

    task_count = 3
    previous_task = None
    for i in range(task_count):
        task = DummyOperator(
            task_id='generated_task_' + str(i),
            dag=dag,
        )
        if previous_task:
            task.set_upstream(previous_task)
        previous_task = task

    return dag
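# A factory like generated_sub_dag is normally wired into its parent DAG
# through a SubDagOperator. A minimal sketch under assumed names: the parent
# DAG id, schedule, and task_id below are illustrative, not taken from the
# snippet above.
from airflow.models import DAG
from airflow.operators.subdag_operator import SubDagOperator

parent_dag = DAG('parent_dag', schedule_interval='@daily',
                 default_args=default_args)

generated_child = SubDagOperator(
    # the task_id must match the child_dag_name suffix of the subdag's dag_id
    task_id='generated_child',
    subdag=generated_sub_dag('parent_dag', 'generated_child',
                             default_args['start_date'], '@daily'),
    dag=parent_dag,
)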
def setUp(self):
    self.dag = DAG('shortcircuit_operator_test',
                   default_args={
                       'owner': 'airflow',
                       'start_date': DEFAULT_DATE},
                   schedule_interval=INTERVAL)
    self.short_op = ShortCircuitOperator(task_id='make_choice',
                                         dag=self.dag,
                                         python_callable=lambda: self.value)

    self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
    self.branch_1.set_upstream(self.short_op)
    self.upstream = DummyOperator(task_id='upstream', dag=self.dag)
    self.upstream.set_downstream(self.short_op)
    self.dag.clear()

    self.value = True
def basic_cycle():
    from airflow.models import DAG
    from airflow.operators.dummy_operator import DummyOperator
    import datetime

    DAG_NAME = 'cycle_dag'
    DEFAULT_ARGS = {
        'owner': 'owner1',
        'start_date': datetime.datetime(2016, 1, 1)
    }
    dag = DAG(
        DAG_NAME,
        default_args=DEFAULT_ARGS)

    # A -> A
    with dag:
        opA = DummyOperator(task_id='A')
        opA.set_downstream(opA)

    return dag
def standard_subdag():
    from airflow.models import DAG
    from airflow.operators.dummy_operator import DummyOperator
    from airflow.operators.subdag_operator import SubDagOperator
    import datetime

    DAG_NAME = 'master'
    DEFAULT_ARGS = {
        'owner': 'owner1',
        'start_date': datetime.datetime(2016, 1, 1)
    }
    dag = DAG(
        DAG_NAME,
        default_args=DEFAULT_ARGS)

    # master:
    #   A -> opSubDag_0
    #          master.opsubdag_0:
    #            -> subdag_0.task
    #   A -> opSubDag_1
    #          master.opsubdag_1:
    #            -> subdag_1.task
    with dag:
        def subdag_0():
            subdag_0 = DAG('master.opSubdag_0', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_0.task', dag=subdag_0)
            return subdag_0

        def subdag_1():
            subdag_1 = DAG('master.opSubdag_1', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_1.task', dag=subdag_1)
            return subdag_1

        opSubdag_0 = SubDagOperator(
            task_id='opSubdag_0', dag=dag, subdag=subdag_0())
        opSubdag_1 = SubDagOperator(
            task_id='opSubdag_1', dag=dag, subdag=subdag_1())

        opA = DummyOperator(task_id='A')
        opA.set_downstream(opSubdag_0)
        opA.set_downstream(opSubdag_1)

    return dag
def create_test_pipeline(suffix, trigger_rule, dag):
    skip_operator = DummySkipOperator(task_id='skip_operator_{}'.format(suffix), dag=dag)
    always_true = DummyOperator(task_id='always_true_{}'.format(suffix), dag=dag)
    join = DummyOperator(task_id=trigger_rule, dag=dag, trigger_rule=trigger_rule)
    join.set_upstream(skip_operator)
    join.set_upstream(always_true)
    final = DummyOperator(task_id='final_{}'.format(suffix), dag=dag)
    final.set_upstream(join)
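# A helper like this is typically invoked once per trigger rule under test,
# fanning each small pipeline out of a shared DAG. A minimal sketch, assuming
# a DAG id and default_args defined elsewhere (both are assumptions here):
dag = DAG(dag_id='example_skip_dag', default_args=default_args)
create_test_pipeline('1', 'all_success', dag)
create_test_pipeline('2', 'one_success', dag)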
def _make_sensor(self, return_value, **kwargs):
    poke_interval = 'poke_interval'
    timeout = 'timeout'
    if poke_interval not in kwargs:
        kwargs[poke_interval] = 0
    if timeout not in kwargs:
        kwargs[timeout] = 0

    sensor = DummySensor(
        task_id=SENSOR_OP,
        return_value=return_value,
        dag=self.dag,
        **kwargs
    )

    dummy_op = DummyOperator(
        task_id=DUMMY_OP,
        dag=self.dag
    )
    dummy_op.set_upstream(sensor)
    return sensor
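# A test method would then build its fixture through this helper; the values
# below are hypothetical and only illustrate the kwargs fallback above:
# any poke_interval or timeout not supplied defaults to 0.
sensor = self._make_sensor(return_value=False, poke_interval=10, timeout=25)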
default_args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(2)
}

# Initializing our DAG
dag = DAG(dag_id='DAG_tmp',
          default_args=default_args,
          description='DAG for DSC, temporary',
          schedule_interval=timedelta(days=1))

# Defining Tasks ##############
# Reading data. Each task needs its own unique task_id (and task_ids may not
# contain spaces), so the read tasks are numbered.
read_data_1 = DummyOperator(task_id='Read_data_1', dag=dag)
read_data_2 = DummyOperator(task_id='Read_data_2', dag=dag)
read_data_3 = DummyOperator(task_id='Read_data_3', dag=dag)
read_data_4 = DummyOperator(task_id='Read_data_4', dag=dag)
read_data_5 = DummyOperator(task_id='Read_data_5', dag=dag)
read_data_6 = DummyOperator(task_id='Read_data_6', dag=dag)
read_data_7 = DummyOperator(task_id='Read_data_7', dag=dag)
read_data_8 = DummyOperator(task_id='Read_data_8', dag=dag)
def nested_subdags():
    from airflow.models import DAG
    from airflow.operators.dummy_operator import DummyOperator
    from airflow.operators.subdag_operator import SubDagOperator
    import datetime

    DAG_NAME = 'master'
    DEFAULT_ARGS = {
        'owner': 'owner1',
        'start_date': datetime.datetime(2016, 1, 1)
    }
    dag = DAG(DAG_NAME, default_args=DEFAULT_ARGS)

    # master:
    #   A -> opSubdag_0
    #          master.opSubdag_0:
    #            -> opSubDag_A
    #               master.opSubdag_0.opSubdag_A:
    #                 -> subdag_A.task
    #            -> opSubdag_B
    #               master.opSubdag_0.opSubdag_B:
    #                 -> subdag_B.task
    #   A -> opSubdag_1
    #          master.opSubdag_1:
    #            -> opSubdag_C
    #               master.opSubdag_1.opSubdag_C:
    #                 -> subdag_C.task
    #            -> opSubDag_D
    #               master.opSubdag_1.opSubdag_D:
    #                 -> subdag_D.task
    with dag:
        def subdag_A():
            subdag_A = DAG('master.opSubdag_0.opSubdag_A', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_A.task', dag=subdag_A)
            return subdag_A

        def subdag_B():
            subdag_B = DAG('master.opSubdag_0.opSubdag_B', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_B.task', dag=subdag_B)
            return subdag_B

        def subdag_C():
            subdag_C = DAG('master.opSubdag_1.opSubdag_C', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_C.task', dag=subdag_C)
            return subdag_C

        def subdag_D():
            subdag_D = DAG('master.opSubdag_1.opSubdag_D', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_D.task', dag=subdag_D)
            return subdag_D

        def subdag_0():
            subdag_0 = DAG('master.opSubdag_0', default_args=DEFAULT_ARGS)
            SubDagOperator(task_id='opSubdag_A', dag=subdag_0, subdag=subdag_A())
            SubDagOperator(task_id='opSubdag_B', dag=subdag_0, subdag=subdag_B())
            return subdag_0

        def subdag_1():
            subdag_1 = DAG('master.opSubdag_1', default_args=DEFAULT_ARGS)
            SubDagOperator(task_id='opSubdag_C', dag=subdag_1, subdag=subdag_C())
            SubDagOperator(task_id='opSubdag_D', dag=subdag_1, subdag=subdag_D())
            return subdag_1

        opSubdag_0 = SubDagOperator(task_id='opSubdag_0', dag=dag, subdag=subdag_0())
        opSubdag_1 = SubDagOperator(task_id='opSubdag_1', dag=dag, subdag=subdag_1())

        opA = DummyOperator(task_id='A')
        opA.set_downstream(opSubdag_0)
        opA.set_downstream(opSubdag_1)

    return dag
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator


def print_hello():
    return 'Hello world 2!'


dag = DAG('hello_world_3',
          description='Simple tutorial DAG',
          schedule_interval='* * * * *',
          start_date=datetime(2018, 4, 20),
          catchup=False)

dummy_operator = DummyOperator(task_id='dummy_task', retries=3, dag=dag)

hello_operator = PythonOperator(task_id='hello_task',
                                python_callable=print_hello,
                                dag=dag)

dummy_operator >> hello_operator
def subdag_B():
    subdag_B = DAG('nested_cycle.opSubdag_0.opSubdag_B', default_args=DEFAULT_ARGS)
    DummyOperator(task_id='subdag_B.task', dag=subdag_B)
    return subdag_B
def subdag_C():
    subdag_C = DAG('master.opSubdag_1.opSubdag_C', default_args=DEFAULT_ARGS)
    DummyOperator(task_id='subdag_C.task', dag=subdag_C)
    return subdag_C
import pandas as pd

from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.utils import dates
from airflow.hooks.postgres_hook import PostgresHook

default_args = {
    'owner': 'estevo_vazquez',
    'start_date': dates.days_ago(1)
}


def obtener_pandas():
    conn = PostgresHook('pg_local')
    df = conn.get_pandas_df('select * from emp')
    print(df)


with DAG('dag_leer_posgres',
         default_args=default_args,
         schedule_interval='@daily') as dag:
    start = DummyOperator(task_id='start')
    obtener_pandas_operator = PythonOperator(task_id='obtener_pandas_operator',
                                             python_callable=obtener_pandas)
    end = DummyOperator(task_id='end')

    start >> obtener_pandas_operator >> end
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.dataproc_operator import DataProcSparkOperator, \
    DataprocClusterCreateOperator, DataprocClusterDeleteOperator, DataProcPySparkOperator
from airflow.operators.dummy_operator import DummyOperator

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 12, 4),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=3)
}

dag = DAG('pipeline',
          default_args=default_args,
          schedule_interval=None,
          catchup=False)

start = DummyOperator(task_id='run_this_first', dag=dag)
end = DummyOperator(task_id='run_this_end', dag=dag)

start >> end
def subdag_1():
    subdag_1 = DAG('master.op_subdag_1', default_args=default_args)
    DummyOperator(task_id='subdag_1.task', dag=subdag_1)
    return subdag_1
def subdag_0():
    subdag_0 = DAG('master.op_subdag_0', default_args=default_args)
    DummyOperator(task_id='subdag_0.task', dag=subdag_0)
    return subdag_0
def nested_subdag_cycle():
    from airflow.models import DAG
    from airflow.operators.dummy_operator import DummyOperator
    from airflow.operators.subdag_operator import SubDagOperator
    import datetime  # pylint: disable=redefined-outer-name,reimported

    dag_name = 'nested_cycle'
    default_args = {
        'owner': 'owner1',
        'start_date': datetime.datetime(2016, 1, 1)
    }
    dag = DAG(
        dag_name,
        default_args=default_args)

    # cycle:
    #   A -> op_subdag_0
    #          cycle.op_subdag_0:
    #            -> opSubDag_A
    #               cycle.op_subdag_0.opSubdag_A:
    #                 -> subdag_a.task
    #            -> opSubdag_B
    #               cycle.op_subdag_0.opSubdag_B:
    #                 -> subdag_b.task
    #   A -> op_subdag_1
    #          cycle.op_subdag_1:
    #            -> opSubdag_C
    #               cycle.op_subdag_1.opSubdag_C:
    #                 -> subdag_c.task -> subdag_c.task  >Invalid Loop<
    #            -> opSubDag_D
    #               cycle.op_subdag_1.opSubdag_D:
    #                 -> subdag_d.task
    with dag:
        def subdag_a():
            subdag_a = DAG(
                'nested_cycle.op_subdag_0.opSubdag_A', default_args=default_args)
            DummyOperator(task_id='subdag_a.task', dag=subdag_a)
            return subdag_a

        def subdag_b():
            subdag_b = DAG(
                'nested_cycle.op_subdag_0.opSubdag_B', default_args=default_args)
            DummyOperator(task_id='subdag_b.task', dag=subdag_b)
            return subdag_b

        def subdag_c():
            subdag_c = DAG(
                'nested_cycle.op_subdag_1.opSubdag_C', default_args=default_args)
            op_subdag_c_task = DummyOperator(
                task_id='subdag_c.task', dag=subdag_c)
            # introduce a loop in opSubdag_C
            op_subdag_c_task.set_downstream(op_subdag_c_task)
            return subdag_c

        def subdag_d():
            subdag_d = DAG(
                'nested_cycle.op_subdag_1.opSubdag_D', default_args=default_args)
            DummyOperator(task_id='subdag_d.task', dag=subdag_d)
            return subdag_d

        def subdag_0():
            subdag_0 = DAG('nested_cycle.op_subdag_0', default_args=default_args)
            SubDagOperator(task_id='opSubdag_A', dag=subdag_0, subdag=subdag_a())
            SubDagOperator(task_id='opSubdag_B', dag=subdag_0, subdag=subdag_b())
            return subdag_0

        def subdag_1():
            subdag_1 = DAG('nested_cycle.op_subdag_1', default_args=default_args)
            SubDagOperator(task_id='opSubdag_C', dag=subdag_1, subdag=subdag_c())
            SubDagOperator(task_id='opSubdag_D', dag=subdag_1, subdag=subdag_d())
            return subdag_1

        op_subdag_0 = SubDagOperator(
            task_id='op_subdag_0', dag=dag, subdag=subdag_0())
        op_subdag_1 = SubDagOperator(
            task_id='op_subdag_1', dag=dag, subdag=subdag_1())

        op_a = DummyOperator(task_id='A')
        op_a.set_downstream(op_subdag_0)
        op_a.set_downstream(op_subdag_1)

    return dag
def subdag_c():
    subdag_c = DAG(
        'master.op_subdag_1.opSubdag_C', default_args=default_args)
    DummyOperator(task_id='subdag_c.task', dag=subdag_c)
    return subdag_c
def subdag_a():
    subdag_a = DAG(
        'nested_cycle.op_subdag_0.opSubdag_A', default_args=default_args)
    DummyOperator(task_id='subdag_a.task', dag=subdag_a)
    return subdag_a
def subdag_b():
    subdag_b = DAG(
        'nested_cycle.op_subdag_0.opSubdag_B', default_args=default_args)
    DummyOperator(task_id='subdag_b.task', dag=subdag_b)
    return subdag_b
def subdag_d():
    subdag_d = DAG(
        'nested_cycle.op_subdag_1.opSubdag_D', default_args=default_args)
    DummyOperator(task_id='subdag_d.task', dag=subdag_d)
    return subdag_d
import datetime

from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.sensors.external_task_sensor import ExternalTaskMarker, ExternalTaskSensor

start_date = datetime.datetime(2015, 1, 1)

with DAG("example_external_task_marker_parent",
         start_date=start_date,
         schedule_interval=None) as parent_dag:
    # [START howto_operator_external_task_marker]
    parent_task = ExternalTaskMarker(task_id="parent_task",
                                     external_dag_id="example_external_task_marker_child",
                                     external_task_id="child_task1")
    # [END howto_operator_external_task_marker]

with DAG("example_external_task_marker_child",
         start_date=start_date,
         schedule_interval=None) as child_dag:
    # [START howto_operator_external_task_sensor]
    child_task1 = ExternalTaskSensor(task_id="child_task1",
                                     external_dag_id=parent_dag.dag_id,
                                     external_task_id=parent_task.task_id,
                                     mode="reschedule")
    # [END howto_operator_external_task_sensor]
    child_task2 = DummyOperator(task_id="child_task2")
    child_task1 >> child_task2
from datetime import datetime

from airflow import DAG
from airflow.operators.http_operator import SimpleHttpOperator
from airflow.operators.dummy_operator import DummyOperator

dag = DAG('simple-api-dag-example',
          schedule_interval='30 * * * *',
          start_date=datetime(2018, 1, 1),
          catchup=False)

api_call = SimpleHttpOperator(
    task_id='simple_api_call',
    http_conn_id='flask_example_conn',
    endpoint='/hello-api',
    method='GET',
    dag=dag)

# dag_success has no explicit dag=dag; it is picked up through the
# `>>` relationship with api_call, which infers the DAG.
dag_success = DummyOperator(task_id='success')

api_call >> dag_success
"""
Example DAG demonstrating a workflow with nested branching. The join tasks are created with
``none_failed_or_skipped`` trigger rule such that they are skipped whenever their corresponding
``BranchPythonOperator`` are skipped.
"""

from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import BranchPythonOperator
from airflow.utils.dates import days_ago

with DAG(dag_id="example_nested_branch_dag", start_date=days_ago(2), schedule_interval="@daily") as dag:
    branch_1 = BranchPythonOperator(task_id="branch_1", python_callable=lambda: "true_1")
    join_1 = DummyOperator(task_id="join_1", trigger_rule="none_failed_or_skipped")
    true_1 = DummyOperator(task_id="true_1")
    false_1 = DummyOperator(task_id="false_1")
    branch_2 = BranchPythonOperator(task_id="branch_2", python_callable=lambda: "true_2")
    join_2 = DummyOperator(task_id="join_2", trigger_rule="none_failed_or_skipped")
    true_2 = DummyOperator(task_id="true_2")
    false_2 = DummyOperator(task_id="false_2")
    false_3 = DummyOperator(task_id="false_3")

    branch_1 >> true_1 >> join_1
    branch_1 >> false_1 >> branch_2 >> [true_2, false_2] >> join_2 >> false_3 >> join_1
import airflow
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator
from datetime import timedelta

args = {'owner': 'wu gang',
        'start_date': airflow.utils.dates.days_ago(2)}

dag = DAG(
    dag_id='example_bash_operator',
    default_args=args,
    schedule_interval='0 0 * * *',  # how often the DAG run is triggered
    dagrun_timeout=timedelta(minutes=60))

cmd = 'ls -l'
run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

run_this = BashOperator(task_id='run_after_loop', bash_command='echo 1', dag=dag)
run_this.set_downstream(run_this_last)

for i in range(3):
    i = str(i)
    task = BashOperator(
        task_id='runme_' + i,
        bash_command='echo "{{ task_instance_key_str }}" && sleep 1',
        dag=dag)
    task.set_downstream(run_this)

task = BashOperator(
from datetime import datetime

import airflow
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators import MeetingSchedulerOperator
from airflow.operators import UserRegisterOperator
from airflow.operators import EMailMessengerOperator

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(2),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1
}

dag = DAG('contract.onboard_dag',
          description='The OnBoard DAG for contract',
          schedule_interval='0 12 * * *',
          default_args=default_args,
          start_date=datetime(2017, 3, 20),
          catchup=False)

dummy_task = DummyOperator(task_id='dummy_task', dag=dag)

user_register_task = UserRegisterOperator(args_operator_param='This is part of a OnBoard Flux',
                                          task_id='user_register_task',
                                          dag=dag)

meeting_scheduler_task = MeetingSchedulerOperator(args_operator_param='This is part of a OnBoard Flux',
                                                  task_id='meeting_scheduler_task',
                                                  dag=dag)

email_messenger_task = EMailMessengerOperator(args_operator_param='This is part of a OnBoard Flux',
                                              task_id='email_messenger_task',
                                              dag=dag)

dummy_task >> user_register_task >> meeting_scheduler_task >> email_messenger_task
"start_date": datetime(2019, 1, 1), "depends_on_past": False, "email_on_failure": False, "email_on_retry": False, "email": "*****@*****.**", "retries": 1, "retry_delay": timedelta(minutes=5) } with DAG( dag_id="test_kerberos_conn", schedule_interval="0 1 * * *", default_args=default_args, ) as dag: start = DummyOperator(task_id='start_task', retries=3) start_Bash = BashOperator(task_id='start_Bash', bash_command="echo hello BashOperator", retries=3) test_kerberos = BashOperator(task_id="test_kerberos", bash_command=""" echo $HADOOP_HOME klist hdfs dfs -ls /user echo "$(AIRFLOW_HOME)" hdfs dfs -put $AIRFLOW_HOME/dags/files/forex_currencies.csv /tmp hdfs dfs -ls /tmp """, retries=3)
def subdag_D():
    subdag_D = DAG('nested_cycle.opSubdag_1.opSubdag_D', default_args=DEFAULT_ARGS)
    DummyOperator(task_id='subdag_D.task', dag=subdag_D)
    return subdag_D
def test_with_dag_run(self):
    value = False
    dag = DAG('shortcircuit_operator_test_with_dag_run',
              default_args={
                  'owner': 'airflow',
                  'start_date': DEFAULT_DATE
              },
              schedule_interval=INTERVAL)
    short_op = ShortCircuitOperator(task_id='make_choice',
                                    dag=dag,
                                    python_callable=lambda: value)
    branch_1 = DummyOperator(task_id='branch_1', dag=dag)
    branch_1.set_upstream(short_op)
    branch_2 = DummyOperator(task_id='branch_2', dag=dag)
    branch_2.set_upstream(branch_1)
    upstream = DummyOperator(task_id='upstream', dag=dag)
    upstream.set_downstream(short_op)
    dag.clear()

    logging.error("Tasks %s", dag.tasks)
    dr = dag.create_dagrun(run_type=DagRunType.MANUAL,
                           start_date=timezone.utcnow(),
                           execution_date=DEFAULT_DATE,
                           state=State.RUNNING)

    upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    tis = dr.get_task_instances()
    self.assertEqual(len(tis), 4)
    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.SKIPPED)
        else:
            raise ValueError(f'Invalid task id {ti.task_id} found!')

    value = True
    dag.clear()
    dr.verify_integrity()

    upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    tis = dr.get_task_instances()
    self.assertEqual(len(tis), 4)
    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.NONE)
        else:
            raise ValueError(f'Invalid task id {ti.task_id} found!')
def nested_subdag_cycle():
    from airflow.models import DAG
    from airflow.operators.dummy_operator import DummyOperator
    from airflow.operators.subdag_operator import SubDagOperator
    import datetime

    DAG_NAME = 'nested_cycle'
    DEFAULT_ARGS = {
        'owner': 'owner1',
        'start_date': datetime.datetime(2016, 1, 1)
    }
    dag = DAG(DAG_NAME, default_args=DEFAULT_ARGS)

    # cycle:
    #   A -> opSubdag_0
    #          cycle.opSubdag_0:
    #            -> opSubDag_A
    #               cycle.opSubdag_0.opSubdag_A:
    #                 -> subdag_A.task
    #            -> opSubdag_B
    #               cycle.opSubdag_0.opSubdag_B:
    #                 -> subdag_B.task
    #   A -> opSubdag_1
    #          cycle.opSubdag_1:
    #            -> opSubdag_C
    #               cycle.opSubdag_1.opSubdag_C:
    #                 -> subdag_C.task -> subdag_C.task  >Invalid Loop<
    #            -> opSubDag_D
    #               cycle.opSubdag_1.opSubdag_D:
    #                 -> subdag_D.task
    with dag:
        def subdag_A():
            subdag_A = DAG('nested_cycle.opSubdag_0.opSubdag_A', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_A.task', dag=subdag_A)
            return subdag_A

        def subdag_B():
            subdag_B = DAG('nested_cycle.opSubdag_0.opSubdag_B', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_B.task', dag=subdag_B)
            return subdag_B

        def subdag_C():
            subdag_C = DAG('nested_cycle.opSubdag_1.opSubdag_C', default_args=DEFAULT_ARGS)
            opSubdag_C_task = DummyOperator(task_id='subdag_C.task', dag=subdag_C)
            # introduce a loop in opSubdag_C
            opSubdag_C_task.set_downstream(opSubdag_C_task)
            return subdag_C

        def subdag_D():
            subdag_D = DAG('nested_cycle.opSubdag_1.opSubdag_D', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_D.task', dag=subdag_D)
            return subdag_D

        def subdag_0():
            subdag_0 = DAG('nested_cycle.opSubdag_0', default_args=DEFAULT_ARGS)
            SubDagOperator(task_id='opSubdag_A', dag=subdag_0, subdag=subdag_A())
            SubDagOperator(task_id='opSubdag_B', dag=subdag_0, subdag=subdag_B())
            return subdag_0

        def subdag_1():
            subdag_1 = DAG('nested_cycle.opSubdag_1', default_args=DEFAULT_ARGS)
            SubDagOperator(task_id='opSubdag_C', dag=subdag_1, subdag=subdag_C())
            SubDagOperator(task_id='opSubdag_D', dag=subdag_1, subdag=subdag_D())
            return subdag_1

        opSubdag_0 = SubDagOperator(task_id='opSubdag_0', dag=dag, subdag=subdag_0())
        opSubdag_1 = SubDagOperator(task_id='opSubdag_1', dag=dag, subdag=subdag_1())

        opA = DummyOperator(task_id='A')
        opA.set_downstream(opSubdag_0)
        opA.set_downstream(opSubdag_1)

    return dag
def test_without_dag_run(self):
    """This checks the defensive against non existent tasks in a dag run"""
    value = False
    dag = DAG('shortcircuit_operator_test_without_dag_run',
              default_args={
                  'owner': 'airflow',
                  'start_date': DEFAULT_DATE
              },
              schedule_interval=INTERVAL)
    short_op = ShortCircuitOperator(task_id='make_choice',
                                    dag=dag,
                                    python_callable=lambda: value)
    branch_1 = DummyOperator(task_id='branch_1', dag=dag)
    branch_1.set_upstream(short_op)
    branch_2 = DummyOperator(task_id='branch_2', dag=dag)
    branch_2.set_upstream(branch_1)
    upstream = DummyOperator(task_id='upstream', dag=dag)
    upstream.set_downstream(short_op)
    dag.clear()

    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    with create_session() as session:
        tis = session.query(TI).filter(TI.dag_id == dag.dag_id,
                                       TI.execution_date == DEFAULT_DATE)

        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                # should not exist
                raise ValueError(f'Invalid task id {ti.task_id} found!')
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                raise ValueError(f'Invalid task id {ti.task_id} found!')

        value = True
        dag.clear()

        short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                # should not exist
                raise ValueError(f'Invalid task id {ti.task_id} found!')
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.NONE)
            else:
                raise ValueError(f'Invalid task id {ti.task_id} found!')
def subdag_A():
    subdag_A = DAG('master.opSubdag_0.opSubdag_A', default_args=DEFAULT_ARGS)
    DummyOperator(task_id='subdag_A.task', dag=subdag_A)
    return subdag_A
class TestBranchOperator(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        super().setUpClass()

        with create_session() as session:
            session.query(DagRun).delete()
            session.query(TI).delete()

    def setUp(self):
        self.dag = DAG('branch_operator_test',
                       default_args={
                           'owner': 'airflow',
                           'start_date': DEFAULT_DATE
                       },
                       schedule_interval=INTERVAL)

        self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
        self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag)
        self.branch_3 = None

    def tearDown(self):
        super().tearDown()

        with create_session() as session:
            session.query(DagRun).delete()
            session.query(TI).delete()

    def test_without_dag_run(self):
        """This checks the defensive against non existent tasks in a dag run"""
        branch_op = BranchPythonOperator(task_id='make_choice',
                                         dag=self.dag,
                                         python_callable=lambda: 'branch_1')
        self.branch_1.set_upstream(branch_op)
        self.branch_2.set_upstream(branch_op)
        self.dag.clear()

        branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        with create_session() as session:
            tis = session.query(TI).filter(TI.dag_id == self.dag.dag_id,
                                           TI.execution_date == DEFAULT_DATE)

            for ti in tis:
                if ti.task_id == 'make_choice':
                    self.assertEqual(ti.state, State.SUCCESS)
                elif ti.task_id == 'branch_1':
                    # should exist with state None
                    self.assertEqual(ti.state, State.NONE)
                elif ti.task_id == 'branch_2':
                    self.assertEqual(ti.state, State.SKIPPED)
                else:
                    raise ValueError(f'Invalid task id {ti.task_id} found!')

    def test_branch_list_without_dag_run(self):
        """This checks if the BranchPythonOperator supports branching off to a list of tasks."""
        branch_op = BranchPythonOperator(
            task_id='make_choice',
            dag=self.dag,
            python_callable=lambda: ['branch_1', 'branch_2'])
        self.branch_1.set_upstream(branch_op)
        self.branch_2.set_upstream(branch_op)
        self.branch_3 = DummyOperator(task_id='branch_3', dag=self.dag)
        self.branch_3.set_upstream(branch_op)
        self.dag.clear()

        branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        with create_session() as session:
            tis = session.query(TI).filter(TI.dag_id == self.dag.dag_id,
                                           TI.execution_date == DEFAULT_DATE)

            expected = {
                "make_choice": State.SUCCESS,
                "branch_1": State.NONE,
                "branch_2": State.NONE,
                "branch_3": State.SKIPPED,
            }

            for ti in tis:
                if ti.task_id in expected:
                    self.assertEqual(ti.state, expected[ti.task_id])
                else:
                    raise ValueError(f'Invalid task id {ti.task_id} found!')

    def test_with_dag_run(self):
        branch_op = BranchPythonOperator(task_id='make_choice',
                                         dag=self.dag,
                                         python_callable=lambda: 'branch_1')
        self.branch_1.set_upstream(branch_op)
        self.branch_2.set_upstream(branch_op)
        self.dag.clear()

        dr = self.dag.create_dagrun(run_type=DagRunType.MANUAL,
                                    start_date=timezone.utcnow(),
                                    execution_date=DEFAULT_DATE,
                                    state=State.RUNNING)

        branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                self.assertEqual(ti.state, State.NONE)
            elif ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                raise ValueError(f'Invalid task id {ti.task_id} found!')

    def test_with_skip_in_branch_downstream_dependencies(self):
        branch_op = BranchPythonOperator(task_id='make_choice',
                                         dag=self.dag,
                                         python_callable=lambda: 'branch_1')
        branch_op >> self.branch_1 >> self.branch_2
        branch_op >> self.branch_2
        self.dag.clear()

        dr = self.dag.create_dagrun(run_type=DagRunType.MANUAL,
                                    start_date=timezone.utcnow(),
                                    execution_date=DEFAULT_DATE,
                                    state=State.RUNNING)

        branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                self.assertEqual(ti.state, State.NONE)
            elif ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.NONE)
            else:
                raise ValueError(f'Invalid task id {ti.task_id} found!')

    def test_with_skip_in_branch_downstream_dependencies2(self):
        branch_op = BranchPythonOperator(task_id='make_choice',
                                         dag=self.dag,
                                         python_callable=lambda: 'branch_2')
        branch_op >> self.branch_1 >> self.branch_2
        branch_op >> self.branch_2
        self.dag.clear()

        dr = self.dag.create_dagrun(run_type=DagRunType.MANUAL,
                                    start_date=timezone.utcnow(),
                                    execution_date=DEFAULT_DATE,
                                    state=State.RUNNING)

        branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                self.assertEqual(ti.state, State.SKIPPED)
            elif ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.NONE)
            else:
                raise ValueError(f'Invalid task id {ti.task_id} found!')

    def test_xcom_push(self):
        branch_op = BranchPythonOperator(task_id='make_choice',
                                         dag=self.dag,
                                         python_callable=lambda: 'branch_1')

        self.branch_1.set_upstream(branch_op)
        self.branch_2.set_upstream(branch_op)
        self.dag.clear()

        dr = self.dag.create_dagrun(run_type=DagRunType.MANUAL,
                                    start_date=timezone.utcnow(),
                                    execution_date=DEFAULT_DATE,
                                    state=State.RUNNING)

        branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.xcom_pull(task_ids='make_choice'), 'branch_1')

    def test_clear_skipped_downstream_task(self):
        """
        After a downstream task is skipped by BranchPythonOperator, clearing the skipped task
        should not cause it to be executed.
        """
        branch_op = BranchPythonOperator(task_id='make_choice',
                                         dag=self.dag,
                                         python_callable=lambda: 'branch_1')
        branches = [self.branch_1, self.branch_2]
        branch_op >> branches
        self.dag.clear()

        dr = self.dag.create_dagrun(run_type=DagRunType.MANUAL,
                                    start_date=timezone.utcnow(),
                                    execution_date=DEFAULT_DATE,
                                    state=State.RUNNING)

        branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        for task in branches:
            task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                raise ValueError(f'Invalid task id {ti.task_id} found!')

        children_tis = [
            ti for ti in tis
            if ti.task_id in branch_op.get_direct_relative_ids()
        ]

        # Clear the children tasks.
        with create_session() as session:
            clear_task_instances(children_tis, session=session, dag=self.dag)

        # Run the cleared tasks again.
        for task in branches:
            task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        # Check if the states are correct after children tasks are cleared.
        for ti in dr.get_task_instances():
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                raise ValueError(f'Invalid task id {ti.task_id} found!')
# BranchPython operator that depends on past
# and where tasks may run or be skipped on
# alternating runs
dag = DAG(
    dag_id='example_branch_dop_operator_v3',
    schedule_interval='*/1 * * * *',
    default_args=args,
)


def should_run(**kwargs):
    print('------------- exec dttm = {} and minute = {}'.format(
        kwargs['execution_date'], kwargs['execution_date'].minute))
    if kwargs['execution_date'].minute % 2 == 0:
        return "dummy_task_1"
    else:
        return "dummy_task_2"


cond = BranchPythonOperator(
    task_id='condition',
    provide_context=True,
    python_callable=should_run,
    dag=dag,
)

dummy_task_1 = DummyOperator(task_id='dummy_task_1', dag=dag)
dummy_task_2 = DummyOperator(task_id='dummy_task_2', dag=dag)
cond >> [dummy_task_1, dummy_task_2]
write_path = "{0}/attend_student_ytd.feather".format(save_path) feather.write_dataframe(attend_student_ytd, write_path) return write_path dag = DAG("idea_ops_attendance_dashboard_2019-08-26", default_args=default_args, schedule_interval='0 6/3 * * *', catchup=False) with dag: start_dag = DummyOperator(task_id="start_idea_attendance") get_students = BigQueryToFeatherOperator( task_id="get_students", sql=students_qry, destination_file="{0}/students.feather".format(SAVE_PATH)) get_attendance = BigQueryToFeatherOperator( task_id="get_attendance", sql=att_qry, destination_file="{0}/attendance.feather".format(SAVE_PATH)) get_att_code = BigQueryToFeatherOperator( task_id="get_att_code", sql=att_code_qry, destination_file="{0}/att_code.feather".format(SAVE_PATH))
def test_dagrun_update_state_end_date(self):
    session = settings.Session()

    dag = DAG(
        'test_dagrun_update_state_end_date',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    # A -> B
    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op1.set_upstream(op2)

    dag.clear()

    now = timezone.utcnow()
    dr = dag.create_dagrun(
        run_id='test_dagrun_update_state_end_date',
        state=State.RUNNING,
        execution_date=now,
        start_date=now,
    )

    # Initial end_date should be NULL
    # State.SUCCESS and State.FAILED are all ending state and should set end_date
    # State.RUNNING set end_date back to NULL
    session.merge(dr)
    session.commit()
    self.assertIsNone(dr.end_date)

    ti_op1 = dr.get_task_instance(task_id=op1.task_id)
    ti_op1.set_state(state=State.SUCCESS, session=session)
    ti_op2 = dr.get_task_instance(task_id=op2.task_id)
    ti_op2.set_state(state=State.SUCCESS, session=session)

    dr.update_state()

    dr_database = session.query(DagRun).filter(
        DagRun.run_id == 'test_dagrun_update_state_end_date'
    ).one()
    self.assertIsNotNone(dr_database.end_date)
    self.assertEqual(dr.end_date, dr_database.end_date)

    ti_op1.set_state(state=State.RUNNING, session=session)
    ti_op2.set_state(state=State.RUNNING, session=session)
    dr.update_state()

    dr_database = session.query(DagRun).filter(
        DagRun.run_id == 'test_dagrun_update_state_end_date'
    ).one()

    self.assertEqual(dr._state, State.RUNNING)
    self.assertIsNone(dr.end_date)
    self.assertIsNone(dr_database.end_date)

    ti_op1.set_state(state=State.FAILED, session=session)
    ti_op2.set_state(state=State.FAILED, session=session)
    dr.update_state()

    dr_database = session.query(DagRun).filter(
        DagRun.run_id == 'test_dagrun_update_state_end_date'
    ).one()

    self.assertIsNotNone(dr_database.end_date)
    self.assertEqual(dr.end_date, dr_database.end_date)
load_artist_dim_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    table='artists',
    sql_stmt=SqlQueries.artist_table_insert)

load_time_dim_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    table='time',
    sql_stmt=SqlQueries.time_table_insert)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift',
    tables=['users', 'songs', 'artists', 'time'])

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> stage_events >> load_songplays_table
start_operator >> stage_songs >> load_songplays_table

dim_tasks = [
    load_song_dim_table,
    load_user_dim_table,
    load_artist_dim_table,
    load_time_dim_table
]
for task in dim_tasks:
    load_songplays_table >> task >> run_quality_checks

run_quality_checks >> end_operator