def test_scheduler_add_new_task(self):
    """
    Test if a task instance will be added if the dag is updated
    """
    dag = DAG(dag_id='test_scheduler_add_new_task', start_date=DEFAULT_DATE)

    dag_task1 = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.schedule_dag(dag)
    self.assertIsNotNone(dr)

    tis = dr.get_task_instances()
    self.assertEquals(len(tis), 1)

    dag_task2 = DummyOperator(task_id='dummy2', dag=dag, owner='airflow')

    queue = mock.Mock()
    scheduler.process_dag(dag, queue=queue)

    tis = dr.get_task_instances()
    self.assertEquals(len(tis), 2)
def test_bitshift_compose_operators(self):
    dag = DAG('dag', start_date=DEFAULT_DATE)
    op1 = DummyOperator(task_id='test_op_1', owner='test')
    op2 = DummyOperator(task_id='test_op_2', owner='test')
    op3 = DummyOperator(task_id='test_op_3', owner='test')
    op4 = DummyOperator(task_id='test_op_4', owner='test')
    op5 = DummyOperator(task_id='test_op_5', owner='test')

    # can't compose operators without dags
    with self.assertRaises(AirflowException):
        op1 >> op2

    dag >> op1 >> op2 << op3

    # make sure dag assignment carries through using __rrshift__
    self.assertIs(op1.dag, dag)
    self.assertIs(op2.dag, dag)
    self.assertIs(op3.dag, dag)

    # op2 should be downstream of both
    self.assertIn(op2, op1.downstream_list)
    self.assertIn(op2, op3.downstream_list)

    # test dag assignment with __rlshift__
    dag << op4
    self.assertIs(op4.dag, dag)

    # dag assignment with __rrshift__
    dag >> op5
    self.assertIs(op5.dag, dag)
def build_dags():
    args = {
        "owner": "airflow",
        "start_date": airflow.utils.dates.days_ago(2),
    }

    with DAG(dag_id="dag1", default_args=args, schedule_interval="0 0 * * *") as dag1:
        run_this_last = DummyOperator(task_id="run_this_last")
        run_this_first = BashOperator(task_id="run_this_first", bash_command="echo 1")
        run_this_first >> run_this_last

    with DAG(dag_id="dag2", default_args=args, schedule_interval="0 0 * * *") as dag2:
        run_this_last = DummyOperator(task_id="run_this_last")
        run_this_first = BashOperator(task_id="run_this_first", bash_command="echo 1")
        run_this_first >> run_this_last

    return [dag1, dag2]
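# A minimal usage sketch for the factory above (the loop below is an
# illustration, not part of the original snippet): Airflow only discovers DAG
# objects reachable from a DAG file's module-level namespace, so the returned
# DAGs still need to be exposed there.
for generated_dag in build_dags():
    globals()[generated_dag.dag_id] = generated_dag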
def test_infer_dag(self):
    dag = DAG('dag', start_date=DEFAULT_DATE)
    dag2 = DAG('dag2', start_date=DEFAULT_DATE)

    op1 = DummyOperator(task_id='test_op_1', owner='test')
    op2 = DummyOperator(task_id='test_op_2', owner='test')
    op3 = DummyOperator(task_id='test_op_3', owner='test', dag=dag)
    op4 = DummyOperator(task_id='test_op_4', owner='test', dag=dag2)

    # double check dags
    self.assertEqual(
        [i.has_dag() for i in [op1, op2, op3, op4]],
        [False, False, True, True])

    # can't combine operators with no dags
    self.assertRaises(AirflowException, op1.set_downstream, op2)

    # op2 should infer dag from op1
    op1.dag = dag
    op1.set_downstream(op2)
    self.assertIs(op2.dag, dag)

    # can't assign across multiple DAGs
    self.assertRaises(AirflowException, op1.set_downstream, op4)
    self.assertRaises(AirflowException, op1.set_downstream, [op3, op4])
def test_check_task_dependencies(self, trigger_rule, successes, skipped,
                                 failed, upstream_failed, done,
                                 flag_upstream_failed,
                                 expect_state, expect_completed):
    start_date = datetime.datetime(2016, 2, 1, 0, 0, 0)
    dag = models.DAG('test-dag', start_date=start_date)
    downstream = DummyOperator(task_id='downstream',
                               dag=dag, owner='airflow',
                               trigger_rule=trigger_rule)
    for i in range(5):
        task = DummyOperator(task_id='runme_{}'.format(i),
                             dag=dag, owner='airflow')
        task.set_downstream(downstream)
    run_date = task.start_date + datetime.timedelta(days=5)

    ti = TI(downstream, run_date)
    completed = ti.evaluate_trigger_rule(
        successes=successes, skipped=skipped, failed=failed,
        upstream_failed=upstream_failed, done=done,
        flag_upstream_failed=flag_upstream_failed)

    self.assertEqual(completed, expect_completed)
    self.assertEqual(ti.state, expect_state)
def database_sub_dag(parent_dag_name, database_name, schedule_interval):  # '@once'
    one_dag = DAG(parent_dag_name + '.' + database_name,
                  default_args=default_args,
                  schedule_interval=schedule_interval)
    # in production, need to update this to run once daily (add various dags
    # and set variables in Airflow?)

    # start dummy task
    start_task = DummyOperator(
        task_id='start_task',
        dag=one_dag
    )

    # Creates the tasks dynamically. Each one will elaborate one chunk of data.
    def create_dynamic_task_tos3(table):
        return PythonOperator(
            # provide_context=True,
            task_id='upload_to_S3_task_' + table,
            pool='Pool_max_parallel_5',
            python_callable=upload_table_to_S3_with_hook,
            op_kwargs={
                'Source_System_Name': Source_System_Name,
                'database': mysql_database,
                'Task_id': 'upload_to_S3_task_',
                'bucket_name': s3_bucket_name,
                'table_name': table,
                # 'exclude_columns': False
            },
            dag=one_dag)

    def create_dynamic_task_tosf(table):
        return PythonOperator(
            # provide_context=True,
            task_id='upload_to_snowflake_task_' + table,
            pool='Pool_max_parallel_5',
            python_callable=upload_to_snowflake,
            op_kwargs={
                'database': mysql_database,
                'table_name': table,
                'Task_id': 'upload_to_snowflake_task_',
                'Prev_task_id': 'upload_to_S3_task_'
            },
            dag=one_dag)

    # end dummy task
    end = DummyOperator(
        task_id='end',
        dag=one_dag)

    # Collect all table names from the database, minus the excluded ones.
    tbl_list = get_table_list(mysql_database, exclude_tables=True,
                              exclude_tbls_list=excluded_tables)

    # Set dependencies: the loop below creates a parallel task pair for each
    # table that migrates the table from MySQL to S3, then from S3 to Snowflake.
    for t in tbl_list:
        dt_s3 = create_dynamic_task_tos3(t)
        dt_sf = create_dynamic_task_tosf(t)
        start_task >> dt_s3
        dt_s3 >> dt_sf
        dt_sf >> end

    return one_dag
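# Hypothetical sketch of the get_table_list helper referenced above (it is not
# shown in the snippet); the connection id, SQL, and exclusion handling are
# assumptions for illustration only, using Airflow's MySqlHook.
from airflow.hooks.mysql_hook import MySqlHook


def get_table_list(database, exclude_tables=False, exclude_tbls_list=None):
    # List every table in the given schema, optionally dropping excluded ones.
    hook = MySqlHook(mysql_conn_id='mysql_default')  # assumed connection id
    rows = hook.get_records(
        "SELECT table_name FROM information_schema.tables "
        "WHERE table_schema = %s", parameters=(database,))
    tables = [row[0] for row in rows]
    if exclude_tables and exclude_tbls_list:
        tables = [t for t in tables if t not in exclude_tbls_list]
    return tables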
def create_test_pipeline(suffix, trigger_rule, dag):
    skip_operator = DummySkipOperator(task_id='skip_operator_{}'.format(suffix), dag=dag)
    always_true = DummyOperator(task_id='always_true_{}'.format(suffix), dag=dag)

    join = DummyOperator(task_id=trigger_rule, dag=dag, trigger_rule=trigger_rule)
    join.set_upstream(skip_operator)
    join.set_upstream(always_true)

    final = DummyOperator(task_id='final_{}'.format(suffix), dag=dag)
    final.set_upstream(join)
def test_subdag_pools(self):
    """
    Subdags and subdag tasks can't both have a pool with 1 slot
    """
    dag = DAG('parent', default_args=default_args)
    subdag = DAG('parent.child', default_args=default_args)

    session = airflow.settings.Session()
    pool_1 = airflow.models.Pool(pool='test_pool_1', slots=1)
    pool_10 = airflow.models.Pool(pool='test_pool_10', slots=10)
    session.add(pool_1)
    session.add(pool_10)
    session.commit()

    dummy_1 = DummyOperator(task_id='dummy', dag=subdag, pool='test_pool_1')

    self.assertRaises(AirflowException,
                      SubDagOperator,
                      task_id='child', dag=dag, subdag=subdag, pool='test_pool_1')

    # recreate dag because failed subdagoperator was already added
    dag = DAG('parent', default_args=default_args)
    SubDagOperator(task_id='child', dag=dag, subdag=subdag, pool='test_pool_10')

    session.delete(pool_1)
    session.delete(pool_10)
    session.commit()
def test_scheduler_do_not_run_finished(self):
    dag = DAG(dag_id='test_scheduler_do_not_run_finished', start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()

    scheduler = SchedulerJob()
    dag.clear()
    dr = scheduler.schedule_dag(dag)
    self.assertIsNotNone(dr)

    tis = dr.get_task_instances(session=session)
    for ti in tis:
        ti.state = State.SUCCESS

    session.commit()
    session.close()

    queue = mock.Mock()
    scheduler.process_dag(dag, queue=queue)

    queue.put.assert_not_called()
def test_scheduler_fail_dagrun_timeout(self):
    """
    Test that a dagrun is set to failed when it exceeds dagrun_timeout
    """
    dag = DAG(dag_id='test_scheduler_fail_dagrun_timeout', start_date=DEFAULT_DATE)
    dag.dagrun_timeout = datetime.timedelta(seconds=60)
    dag_task1 = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.schedule_dag(dag)
    self.assertIsNotNone(dr)

    # Backdate the run so it exceeds the one-minute dagrun_timeout.
    print(dr.start_date)
    dr.start_date = datetime.datetime.now() - datetime.timedelta(days=1)
    print(dr.start_date)
    session.merge(dr)
    session.commit()

    dr2 = scheduler.schedule_dag(dag)
    self.assertIsNotNone(dr2)

    dr.refresh_from_db(session=session)
    self.assertEquals(dr.state, State.FAILED)
def test_scheduler_process_check_heartrate(self):
    """
    Test if process dag honors the heartrate
    """
    dag = DAG(dag_id='test_scheduler_process_check_heartrate', start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    orm_dag.last_scheduler_run = datetime.datetime.now()
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    scheduler.heartrate = 1000

    dag.clear()
    dr = scheduler.schedule_dag(dag)
    self.assertIsNotNone(dr)

    queue = mock.Mock()
    scheduler.process_dag(dag, queue=queue)

    queue.put.assert_not_called()
def test_set_dag(self):
    """
    Test assigning Operators to Dags, including deferred assignment
    """
    dag = DAG('dag', start_date=DEFAULT_DATE)
    dag2 = DAG('dag2', start_date=DEFAULT_DATE)
    op = DummyOperator(task_id='op_1', owner='test')

    # no dag assigned
    self.assertFalse(op.has_dag())
    self.assertRaises(AirflowException, getattr, op, 'dag')

    # no improper assignment
    with self.assertRaises(TypeError):
        op.dag = 1

    op.dag = dag

    # no reassignment
    with self.assertRaises(AirflowException):
        op.dag = dag2

    # but assigning the same dag is ok
    op.dag = dag

    self.assertIs(op.dag, dag)
    self.assertIn(op, dag.tasks)
def test_scheduler_process_execute_task(self):
    """
    Test if process dag sends a task to the executor
    """
    dag = DAG(dag_id='test_scheduler_process_execute_task', start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()
    dr = scheduler.schedule_dag(dag)
    self.assertIsNotNone(dr)

    queue = mock.Mock()
    scheduler.process_dag(dag, queue=queue)

    queue.put.assert_called_with(
        ((dag.dag_id, dag_task1.task_id, DEFAULT_DATE), None))

    tis = dr.get_task_instances(state=State.SCHEDULED)
    self.assertIsNotNone(tis)
def test_xcom_pull_different_execution_date(self):
    """
    tests xcom fetch behavior with different execution dates, using
    both xcom_pull with "include_prior_dates" and without
    """
    key = 'xcom_key'
    value = 'xcom_value'
    dag = models.DAG(dag_id='test_xcom', schedule_interval='@monthly')
    task = DummyOperator(
        task_id='test_xcom',
        dag=dag,
        pool='test_xcom',
        owner='airflow',
        start_date=datetime.datetime(2016, 6, 2, 0, 0, 0))
    exec_date = datetime.datetime.now()
    ti = TI(
        task=task, execution_date=exec_date)
    ti.run(mark_success=True)
    ti.xcom_push(key=key, value=value)
    self.assertEqual(ti.xcom_pull(task_ids='test_xcom', key=key), value)
    ti.run()
    exec_date = exec_date.replace(day=exec_date.day + 1)
    ti = TI(
        task=task, execution_date=exec_date)
    ti.run()
    # We have set a new execution date (and did not pass in
    # 'include_prior_dates'), which means this task should now have a
    # cleared xcom value.
    self.assertEqual(ti.xcom_pull(task_ids='test_xcom', key=key), None)
    # We *should* get a value using 'include_prior_dates'
    self.assertEqual(ti.xcom_pull(task_ids='test_xcom',
                                  key=key,
                                  include_prior_dates=True),
                     value)
def test_scheduler_auto_align(self):
    """
    Test if the schedule_interval will be auto aligned with the start_date
    such that if the start_date coincides with the schedule the first
    execution_date will be start_date, otherwise it will be start_date +
    interval.
    """
    dag = DAG(dag_id='test_scheduler_auto_align_1',
              start_date=datetime.datetime(2016, 1, 1, 10, 10, 0),
              schedule_interval="4 5 * * *")
    dag_task1 = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.schedule_dag(dag)
    self.assertIsNotNone(dr)
    self.assertEquals(dr.execution_date, datetime.datetime(2016, 1, 2, 5, 4))

    dag = DAG(dag_id='test_scheduler_auto_align_2',
              start_date=datetime.datetime(2016, 1, 1, 10, 10, 0),
              schedule_interval="10 10 * * *")
    dag_task1 = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.schedule_dag(dag)
    self.assertIsNotNone(dr)
    self.assertEquals(dr.execution_date, datetime.datetime(2016, 1, 1, 10, 10))
def database_sub_dag(parent_dag_name, database_name, schedule_interval):  # '@once'
    one_dag = DAG(parent_dag_name + '.' + database_name,
                  default_args=default_args,
                  schedule_interval=schedule_interval)
    # in production, need to update this to run once daily (add various dags
    # and set variables in Airflow?)

    # start dummy task
    start_task = DummyOperator(
        task_id='start_task',
        dag=one_dag
    )

    # Creates the tasks dynamically. Each one will elaborate one chunk of data.
    def create_dynamic_task_collect_table_counts(table):
        return PythonOperator(
            # provide_context=True,
            task_id='Get_mysql_table_counts_for_' + table,
            pool='Pool_max_parallel_5',
            python_callable=get_mysql_table_counts,
            op_kwargs={
                'database': database_name,
                'table': table,
            },
            dag=one_dag)

    # end dummy task
    end = DummyOperator(
        task_id='end',
        dag=one_dag)

    # Collect all table names from the database.
    tbl_list = get_table_list(database_name)

    # Set dependencies: the loop below creates a parallel task for each table
    # that collects that table's MySQL row count.
    for t in tbl_list:
        dt_create_tables = create_dynamic_task_collect_table_counts(t)
        start_task >> dt_create_tables
        dt_create_tables >> end

    return one_dag
def test_run_pooling_task(self):
    """
    test that running a task assigned to a pool puts it in the QUEUED
    state (waiting on a pool slot) instead of running it immediately.
    """
    dag = models.DAG(dag_id='test_run_pooling_task')
    task = DummyOperator(task_id='test_run_pooling_task_op', dag=dag,
                         pool='test_run_pooling_task_pool', owner='airflow',
                         start_date=datetime.datetime(2016, 2, 1, 0, 0, 0))
    ti = TI(
        task=task, execution_date=datetime.datetime.now())
    ti.run()
    self.assertEqual(ti.state, models.State.QUEUED)
def subdag(parent_dag_name, child_dag_name, args):
    dag_subdag = DAG(
        dag_id='%s.%s' % (parent_dag_name, child_dag_name),
        default_args=args,
        schedule_interval="@daily",
    )
    for i in range(5):
        DummyOperator(
            task_id='%s-task-%s' % (child_dag_name, i + 1),
            default_args=args,
            dag=dag_subdag,
        )
    return dag_subdag
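# A minimal sketch of how the subdag factory above is typically wired into a
# parent DAG with SubDagOperator; the parent dag_id, child task name, and
# import path (which varies across Airflow versions) are assumptions here.
from datetime import datetime

from airflow.models import DAG
from airflow.operators.subdag_operator import SubDagOperator

args = {'owner': 'airflow', 'start_date': datetime(2016, 1, 1)}

with DAG(dag_id='example_parent', default_args=args,
         schedule_interval='@daily') as parent_dag:
    # The subdag's dag_id must be '<parent>.<child>', which the factory
    # builds from the same two names passed here.
    load_section = SubDagOperator(
        task_id='load_section',
        subdag=subdag('example_parent', 'load_section', args),
    )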
def database_sub_dag(parent_dag_name, database_name, schedule_interval):  # '@once'
    one_dag = DAG(parent_dag_name + '.' + database_name,
                  default_args=default_args,
                  schedule_interval=schedule_interval,
                  concurrency=50,
                  catchup=False)

    # start dummy task
    start_task = DummyOperator(task_id='start_task', dag=one_dag)

    # Creates the tasks dynamically. Each one will elaborate one chunk of data.
    def create_dynamic_task_add_primary_key(table):
        return PythonOperator(
            # provide_context=True,
            task_id='Add_primary_key_for_' + database_name + '_' + table,
            pool='Pool_max_parallel_5',
            python_callable=add_index_to_tbl,
            op_kwargs={
                'database': database_name,
                'table': table,
            },
            dag=one_dag)

    # end dummy task
    end = DummyOperator(task_id='end', dag=one_dag)

    tbl_list = include_tables

    # Set dependencies: the loop below creates a parallel task for each table
    # that adds a primary key / index to that table.
    for t in tbl_list:
        dt_cts = create_dynamic_task_add_primary_key(t)
        start_task >> dt_cts
        dt_cts >> end

    return one_dag
def gen_dummy(task_name):
    # Flag task names containing characters that are not valid in a task_id,
    # then create a DummyOperator placeholder for the name regardless.
    error = False
    if " " in task_name:
        error = True
    if "(" in task_name:
        error = True
    if ")" in task_name:
        error = True
    if '"' in task_name:
        error = True
    if error:
        print(task_name)
    return DummyOperator(task_id=task_name)
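# Example use of the helper above inside a DAG context manager (the DAG id,
# dates, and task names are illustrative only): offending names get printed,
# and every name still yields a DummyOperator attached to the enclosing DAG.
from datetime import datetime

from airflow.models import DAG

with DAG(dag_id='placeholder_tasks', start_date=datetime(2016, 1, 1),
         schedule_interval='@once') as dag:
    placeholders = [gen_dummy(name) for name in ['extract_orders', 'load_users']]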
def test_scheduler_reschedule(self):
    """
    Checks if tasks that are not taken up by the executor
    get rescheduled
    """
    executor = TestExecutor()

    dagbag = DagBag(executor=executor)
    dagbag.dags.clear()
    dagbag.executor = executor

    dag = DAG(dag_id='test_scheduler_reschedule', start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    dag.clear()
    dag.is_subdag = False

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    orm_dag.is_paused = False
    session.merge(orm_dag)
    session.commit()

    dagbag.bag_dag(dag=dag, root_dag=dag, parent_dag=dag)

    @mock.patch('airflow.models.DagBag', return_value=dagbag)
    @mock.patch('airflow.models.DagBag.collect_dags')
    def do_schedule(function, function2):
        scheduler = SchedulerJob(
            num_runs=1,
            executor=executor,
        )
        scheduler.heartrate = 0
        scheduler.run()

    do_schedule()
    self.assertEquals(1, len(executor.queued_tasks))
    executor.queued_tasks.clear()

    do_schedule()
    self.assertEquals(2, len(executor.queued_tasks))
def test_scheduler_do_not_schedule_too_early(self):
    dag = DAG(dag_id='test_scheduler_do_not_schedule_too_early',
              start_date=datetime.datetime(2200, 1, 1))
    dag_task1 = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.schedule_dag(dag)
    self.assertIsNone(dr)

    queue = mock.Mock()
    scheduler.process_dag(dag, queue=queue)

    queue.put.assert_not_called()
def test_dag_as_context_manager(self):
    """
    Test DAG as a context manager.
    When used as a context manager, Operators are automatically added to
    the DAG (unless they specify a different DAG)
    """
    dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})
    dag2 = DAG(
        'dag2',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner2'})

    with dag:
        op1 = DummyOperator(task_id='op1')
        op2 = DummyOperator(task_id='op2', dag=dag2)

    self.assertIs(op1.dag, dag)
    self.assertEqual(op1.owner, 'owner1')
    self.assertIs(op2.dag, dag2)
    self.assertEqual(op2.owner, 'owner2')

    with dag2:
        op3 = DummyOperator(task_id='op3')

    self.assertIs(op3.dag, dag2)
    self.assertEqual(op3.owner, 'owner2')

    with dag:
        with dag2:
            op4 = DummyOperator(task_id='op4')
        op5 = DummyOperator(task_id='op5')

    self.assertIs(op4.dag, dag2)
    self.assertIs(op5.dag, dag)
    self.assertEqual(op4.owner, 'owner2')
    self.assertEqual(op5.owner, 'owner1')

    with DAG('creating_dag_in_cm', start_date=DEFAULT_DATE) as dag:
        DummyOperator(task_id='op6')

    self.assertEqual(dag.dag_id, 'creating_dag_in_cm')
    self.assertEqual(dag.tasks[0].task_id, 'op6')
def test_scheduler_verify_max_active_runs(self):
    """
    Test that a dagrun will not be scheduled if max_active_runs has been
    reached
    """
    dag = DAG(dag_id='test_scheduler_verify_max_active_runs', start_date=DEFAULT_DATE)
    dag.max_active_runs = 1

    dag_task1 = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.schedule_dag(dag)
    self.assertIsNotNone(dr)

    dr = scheduler.schedule_dag(dag)
    self.assertIsNone(dr)
def test_xcom_pull_after_success(self):
    """
    tests xcom set/clear relative to a task in a 'success' rerun scenario
    """
    key = 'xcom_key'
    value = 'xcom_value'
    dag = models.DAG(dag_id='test_xcom', schedule_interval='@monthly')
    task = DummyOperator(
        task_id='test_xcom',
        dag=dag,
        pool='test_xcom',
        owner='airflow',
        start_date=datetime.datetime(2016, 6, 2, 0, 0, 0))
    exec_date = datetime.datetime.now()
    ti = TI(
        task=task, execution_date=exec_date)
    ti.run(mark_success=True)
    ti.xcom_push(key=key, value=value)
    self.assertEqual(ti.xcom_pull(task_ids='test_xcom', key=key), value)
    ti.run()
    # The second run and assert is to handle AIRFLOW-131 (don't clear on
    # prior success)
    self.assertEqual(ti.xcom_pull(task_ids='test_xcom', key=key), value)
from datetime import datetime

from airflow.models import DAG
from airflow.operators import DummyOperator, PythonOperator, SubDagOperator
from airflow.utils.trigger_rule import TriggerRule

DEFAULT_DATE = datetime(2016, 1, 1)
default_args = dict(start_date=DEFAULT_DATE, owner='airflow')


def fail():
    raise ValueError('Expected failure.')


# DAG tests backfill with pooled tasks
# Previously backfill would queue the task but never run it
dag1 = DAG(dag_id='test_backfill_pooled_task_dag', default_args=default_args)
dag1_task1 = DummyOperator(
    task_id='test_backfill_pooled_task',
    dag=dag1,
    pool='test_backfill_pooled_task_pool',
)

# DAG tests depends_on_past dependencies
dag2 = DAG(dag_id='test_depends_on_past', default_args=default_args)
dag2_task1 = DummyOperator(
    task_id='test_dop_task',
    dag=dag2,
    depends_on_past=True,
)

# DAG tests that a Dag run that doesn't complete is marked failed
dag3 = DAG(dag_id='test_dagrun_states_fail', default_args=default_args)
dag3_task1 = PythonOperator(task_id='test_dagrun_fail',
                            dag=dag3,
                            python_callable=fail)
def _get_task_id(execution_date, **context):
    return 'email_' + weekday_person_to_email[execution_date.weekday()]


def _print_weekday(execution_date: datetime, **context):
    print(execution_date.strftime('%a'))


with dag:
    print_weekday = PythonOperator(
        task_id='print_weekday',
        python_callable=_print_weekday,
        provide_context=True,
    )

    branching = BranchPythonOperator(
        task_id='branching',
        python_callable=_get_task_id,
        provide_context=True,
    )

    users = ['bob', 'alice', 'joe']
    branches = [DummyOperator(task_id='email_' + user) for user in users]

    end = BashOperator(task_id='end',
                       bash_command='echo "That\'s it folks!"',
                       trigger_rule=TriggerRule.ONE_SUCCESS)

    print_weekday >> branching >> branches >> end
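# The snippet above relies on a weekday_person_to_email mapping and a `dag`
# object that are not shown. A minimal sketch of the mapping, assuming the
# weekdays (0 = Monday ... 6 = Sunday) are split across the three users
# branched to above; the exact assignment is an illustration only.
weekday_person_to_email = {
    0: 'bob',    # Monday
    1: 'alice',  # Tuesday
    2: 'joe',    # Wednesday
    3: 'bob',    # Thursday
    4: 'alice',  # Friday
    5: 'joe',    # Saturday
    6: 'bob',    # Sunday
}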
" file_format = (type = csv field_delimiter = ','" #" field_optionally_enclosed_by = '\"'" " skip_header = 0)" #" on_error = 'continue' ";" % (table_name, sfstage, file)) cs.execute(copy) cs.close() # Using the context manager alllows you not to duplicate the dag parameter in each operator with DAG('S3_dag_test_v3', default_args=default_args, schedule_interval='@once') as dag: start_task = DummyOperator(task_id='dummy_start') upload_to_S3_task = PythonOperator( task_id='upload_to_S3', python_callable=upload_file_to_S3_with_hook, op_kwargs={ 'filename': '/usr/local/file-to-watch-1.csv', 'key': 'test.csv', 'bucket_name': 'celltrak-test-arflow1', }, dag=dag) upload_file = PythonOperator( task_id='upload_to_snowflake_task', python_callable=upload_to_snowflake, #on_failure_callback = failure_slack_message,
    hook.retrieve_file(remote_path, local_path)
    hook.close_conn()


default_args = {'owner': 'airflow',
                'start_date': datetime(2017, 12, 19)}

# Schedule this DAG to run once.
dag = DAG('ah_ftp_hook',
          description='Manipulating FTPs with PythonOperators+Hooks',
          schedule_interval='@once',
          start_date=datetime(2017, 12, 18),
          default_args=default_args)

with dag:
    kick_off_dag = DummyOperator(task_id='kick_off_dag')

    upload_file = PythonOperator(
        task_id='upload_file',
        python_callable=upload_file,
        # This passes the params into the function.
        provide_context=True)

    download_file = PythonOperator(
        task_id='download_file',
        python_callable=download_file,
        # This passes the date into the function.
        provide_context=True)

    # Set dependencies.
    kick_off_dag >> download_file >> upload_file
from airflow.operators import BranchPythonOperator, DummyOperator
from airflow.models import DAG
from datetime import datetime, timedelta
import random

seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_branch_operator', default_args=args)

cmd = 'ls -l'
run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

options = ['branch_a', 'branch_b', 'branch_c', 'branch_d']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(run_this_first)

for option in options:
    t = DummyOperator(task_id=option, dag=dag)
    t.set_upstream(branching)
    dummy_follow = DummyOperator(task_id='follow_' + option, dag=dag)
    t.set_downstream(dummy_follow)