def dag_bag_multiple():
    """
    Create a DagBag containing two DAGs, linked by multiple ExternalTaskMarker.
    """
    dag_bag = DagBag(dag_folder=DEV_NULL, include_examples=False)
    daily_dag = DAG("daily_dag", start_date=DEFAULT_DATE, schedule_interval="@daily")
    agg_dag = DAG("agg_dag", start_date=DEFAULT_DATE, schedule_interval="@daily")
    dag_bag.bag_dag(daily_dag, None, daily_dag)
    dag_bag.bag_dag(agg_dag, None, agg_dag)

    daily_task = DummyOperator(task_id="daily_task", dag=daily_dag)
    start = DummyOperator(task_id="start", dag=agg_dag)

    for i in range(25):
        task = ExternalTaskMarker(
            task_id="{}_{}".format(daily_task.task_id, i),
            external_dag_id=daily_dag.dag_id,
            external_task_id=daily_task.task_id,
            execution_date="{{ macros.ds_add(ds, -1 * %s) }}" % i,
            dag=agg_dag,
        )
        start >> task

    yield dag_bag
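
# --- Hedged usage sketch (not in the original source) ---
# A minimal smoke test for the fixture above, assuming it is registered as a
# pytest fixture. The counts follow from the wiring: agg_dag holds "start"
# plus 25 ExternalTaskMarker tasks, and daily_dag holds the single daily_task.
def test_dag_bag_multiple_wiring(dag_bag_multiple):
    daily_dag = dag_bag_multiple.get_dag("daily_dag")
    agg_dag = dag_bag_multiple.get_dag("agg_dag")
    assert len(daily_dag.tasks) == 1
    assert len(agg_dag.tasks) == 1 + 25  # "start" plus 25 markers
    for task in agg_dag.tasks:
        if task.task_id != "start":
            # every marker points back at daily_dag's only task
            assert task.external_dag_id == daily_dag.dag_id
            assert task.external_task_id == "daily_task"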
def dag_bag_cyclic():
    """
    Create a DagBag with DAGs having cyclic dependencies set up by
    ExternalTaskMarker and ExternalTaskSensor.

    dag_0:   task_a_0 >> task_b_0
                  ^          |
                  |          |
    dag_1:        |          ---> task_a_1 >> task_b_1
                  |                               |
                  ---------------------------------
    """
    dag_bag = DagBag(dag_folder=DEV_NULL, include_examples=False)

    dag_0 = DAG("dag_0", start_date=DEFAULT_DATE, schedule_interval=None)
    task_a_0 = DummyOperator(task_id="task_a_0", dag=dag_0)
    task_b_0 = ExternalTaskMarker(
        task_id="task_b_0",
        external_dag_id="dag_1",
        external_task_id="task_a_1",
        recursion_depth=3,
        dag=dag_0,
    )
    task_a_0 >> task_b_0

    dag_1 = DAG("dag_1", start_date=DEFAULT_DATE, schedule_interval=None)
    task_a_1 = ExternalTaskSensor(
        task_id="task_a_1",
        external_dag_id=dag_0.dag_id,
        external_task_id=task_b_0.task_id,
        dag=dag_1,
    )
    task_b_1 = ExternalTaskMarker(
        task_id="task_b_1",
        external_dag_id="dag_0",
        external_task_id="task_a_0",
        recursion_depth=2,
        dag=dag_1,
    )
    task_a_1 >> task_b_1

    for dag in [dag_0, dag_1]:
        dag_bag.bag_dag(dag=dag, root_dag=dag)

    return dag_bag
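
# --- Hedged usage sketch (not in the original source) ---
# How the cyclic fixture would typically be exercised: clearing task_a_0 and
# its downstream tasks should chase the markers around the cycle until the
# recursion_depth guard trips. clear_tasks() is a hypothetical helper mirroring
# what the surrounding test module presumably defines; DAG.partial_subset and
# DAG.clear(dag_bag=...) are the Airflow 2.x APIs, and DEFAULT_DATE is the
# module-level constant used by the fixture.
import pytest
from airflow.exceptions import AirflowException

def clear_tasks(dag_bag, dag, task):
    # clear the task and everything downstream of it, following
    # ExternalTaskMarker links through the given DagBag
    subdag = dag.partial_subset(task_ids_or_regex=[task.task_id], include_downstream=True)
    return subdag.clear(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, dag_bag=dag_bag)

def test_external_task_marker_cyclic(dag_bag_cyclic):
    dag_0 = dag_bag_cyclic.get_dag("dag_0")
    task_a_0 = dag_0.get_task("task_a_0")
    with pytest.raises(AirflowException, match="Maximum recursion depth"):
        clear_tasks(dag_bag_cyclic, dag_0, task_a_0)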
def dag_bag_head_tail():
    """
    Create a DagBag containing one DAG, with task "head" depending on task "tail"
    of the previous execution_date.

    20200501     20200502                 20200510
    +------+     +------+                 +------+
    | head |    -->head |    -->        -->head |
    |  |   |   / |  |   |   /          / |  |   |
    |  v   |  /  |  v   |  /          /  |  v   |
    | body | /   | body | /    ...   /   | body |
    |  |   |/    |  |   |/          /    |  |   |
    |  v   /     |  v   /          /     |  v   |
    | tail/|     | tail/|         /      | tail |
    +------+     +------+                 +------+
    """
    dag_bag = DagBag(dag_folder=DEV_NULL, include_examples=False)

    with DAG("head_tail", start_date=DEFAULT_DATE, schedule_interval="@daily") as dag:
        head = ExternalTaskSensor(
            task_id='head',
            external_dag_id=dag.dag_id,
            external_task_id="tail",
            execution_delta=timedelta(days=1),
            mode="reschedule",
        )
        body = DummyOperator(task_id="body")
        tail = ExternalTaskMarker(
            task_id="tail",
            external_dag_id=dag.dag_id,
            external_task_id=head.task_id,
            execution_date="{{ tomorrow_ds_nodash }}",
        )
        head >> body >> tail

    dag_bag.bag_dag(dag=dag, root_dag=dag)

    yield dag_bag
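
# --- Hedged usage sketch (not in the original source) ---
# After one DagRun per day for ten days, clearing the whole date range should
# follow each day's "tail" marker into the next day's "head": 10 runs x 3
# tasks = 30 task instances. The DagRun/TaskInstance bookkeeping below assumes
# the Airflow 2.0/2.1-era execution_date-based API and a working metadata DB;
# DEFAULT_DATE and timedelta come from the fixture's module.
from airflow.models import DagRun, TaskInstance
from airflow.utils.session import create_session
from airflow.utils.state import State
from airflow.utils.types import DagRunType

def test_clear_head_tail_markers(dag_bag_head_tail):
    dag = dag_bag_head_tail.get_dag("head_tail")
    with create_session() as session:
        for delta in range(10):
            execution_date = DEFAULT_DATE + timedelta(days=delta)
            session.merge(
                DagRun(
                    dag_id=dag.dag_id,
                    run_id="manual_{}".format(delta),
                    execution_date=execution_date,
                    run_type=DagRunType.MANUAL,
                    state=State.SUCCESS,
                )
            )
            for task in dag.tasks:
                session.merge(
                    TaskInstance(task=task, execution_date=execution_date, state=State.SUCCESS)
                )
        # DAG.clear() returns the number of task instances it cleared
        assert (
            dag.clear(
                start_date=DEFAULT_DATE,
                end_date=execution_date,
                dag_bag=dag_bag_head_tail,
                session=session,
            )
            == 30
        )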
def dag_bag_ext():
    """
    Create a DagBag with DAGs looking like this. The dotted lines represent external
    dependencies set up using ExternalTaskMarker and ExternalTaskSensor.

    dag_0:   task_a_0 >> task_b_0
                             |
                             |
    dag_1:               ---> task_a_1 >> task_b_1
                                              |
                                              |
    dag_2:                                ---> task_a_2 >> task_b_2
                                                               |
                                                               |
    dag_3:                                                 ---> task_a_3 >> task_b_3
    """
    dag_bag = DagBag(dag_folder=DEV_NULL, include_examples=False)

    dag_0 = DAG("dag_0", start_date=DEFAULT_DATE, schedule_interval=None)
    task_a_0 = DummyOperator(task_id="task_a_0", dag=dag_0)
    task_b_0 = ExternalTaskMarker(
        task_id="task_b_0",
        external_dag_id="dag_1",
        external_task_id="task_a_1",
        recursion_depth=3,
        dag=dag_0,
    )
    task_a_0 >> task_b_0

    dag_1 = DAG("dag_1", start_date=DEFAULT_DATE, schedule_interval=None)
    task_a_1 = ExternalTaskSensor(
        task_id="task_a_1",
        external_dag_id=dag_0.dag_id,
        external_task_id=task_b_0.task_id,
        dag=dag_1,
    )
    task_b_1 = ExternalTaskMarker(
        task_id="task_b_1",
        external_dag_id="dag_2",
        external_task_id="task_a_2",
        recursion_depth=2,
        dag=dag_1,
    )
    task_a_1 >> task_b_1

    dag_2 = DAG("dag_2", start_date=DEFAULT_DATE, schedule_interval=None)
    task_a_2 = ExternalTaskSensor(
        task_id="task_a_2",
        external_dag_id=dag_1.dag_id,
        external_task_id=task_b_1.task_id,
        dag=dag_2,
    )
    task_b_2 = ExternalTaskMarker(
        task_id="task_b_2",
        external_dag_id="dag_3",
        external_task_id="task_a_3",
        recursion_depth=1,
        dag=dag_2,
    )
    task_a_2 >> task_b_2

    dag_3 = DAG("dag_3", start_date=DEFAULT_DATE, schedule_interval=None)
    task_a_3 = ExternalTaskSensor(
        task_id="task_a_3",
        external_dag_id=dag_2.dag_id,
        external_task_id=task_b_2.task_id,
        dag=dag_3,
    )
    task_b_3 = DummyOperator(task_id="task_b_3", dag=dag_3)
    task_a_3 >> task_b_3

    for dag in [dag_0, dag_1, dag_2, dag_3]:
        dag_bag.bag_dag(dag=dag, root_dag=dag)

    return dag_bag
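
# --- Hedged usage sketch (not in the original source) ---
# A pure in-memory check of the handshake the diagram describes: each
# ExternalTaskMarker should point at the ExternalTaskSensor heading the next
# DAG, and that sensor should wait on the marker. Assumes the fixture is
# registered with pytest; no database is needed.
def test_dag_bag_ext_wiring(dag_bag_ext):
    for i in range(3):
        marker = dag_bag_ext.get_dag("dag_%d" % i).get_task("task_b_%d" % i)
        sensor = dag_bag_ext.get_dag("dag_%d" % (i + 1)).get_task("task_a_%d" % (i + 1))
        # marker points forward to the next DAG's sensor
        assert marker.external_dag_id == sensor.dag_id
        assert marker.external_task_id == sensor.task_id
        # the sensor waits backward on the marker, closing the handshake
        assert sensor.external_dag_id == "dag_%d" % i
        assert sensor.external_task_id == marker.task_id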
def test_scheduler_reschedule(self):
    """
    Checks if tasks that are not taken up by the executor
    get rescheduled
    """
    executor = TestExecutor()

    dagbag = DagBag(executor=executor)
    dagbag.dags.clear()
    dagbag.executor = executor

    dag = DAG(
        dag_id='test_scheduler_reschedule',
        start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    dag.clear()
    dag.is_subdag = False

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    orm_dag.is_paused = False
    session.merge(orm_dag)
    session.commit()

    dagbag.bag_dag(dag=dag, root_dag=dag, parent_dag=dag)

    @mock.patch('airflow.models.DagBag', return_value=dagbag)
    @mock.patch('airflow.models.DagBag.collect_dags')
    def do_schedule(function, function2):
        # Use an empty file since the above mock will return the
        # expected DAGs. Also specify only a single file so that it doesn't
        # try to schedule the above DAG repeatedly.
        scheduler = SchedulerJob(num_runs=1,
                                 executor=executor,
                                 subdir=os.path.join(models.DAGS_FOLDER,
                                                     "no_dags.py"))
        scheduler.heartrate = 0
        scheduler.run()

    do_schedule()
    self.assertEqual(1, len(executor.queued_tasks))
    executor.queued_tasks.clear()

    do_schedule()
    self.assertEqual(2, len(executor.queued_tasks))
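
# --- Hedged sketch of the patching pattern above (not in the original source) ---
# The stacked @mock.patch decorators make any DagBag() constructed inside
# SchedulerJob.run() return the hand-built dagbag, and stub collect_dags() so
# nothing is re-read from disk. mock.patch injects its mocks bottom-up: the
# decorator nearest the function supplies the first argument. The SchedulerJob
# import path is the 1.10-era one these tests appear to target.
from unittest import mock
from airflow.jobs import SchedulerJob

def make_do_schedule(dagbag, executor):
    @mock.patch('airflow.models.DagBag', return_value=dagbag)
    @mock.patch('airflow.models.DagBag.collect_dags')
    def do_schedule(mock_collect_dags, mock_dagbag_cls):
        scheduler = SchedulerJob(num_runs=1, executor=executor)
        scheduler.heartrate = 0
        scheduler.run()
    return do_schedule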
def dags(log_app):
    dag = DAG(DAG_ID, start_date=DEFAULT_DATE)
    dag_removed = DAG(DAG_ID_REMOVED, start_date=DEFAULT_DATE)

    bag = DagBag(include_examples=False)
    bag.bag_dag(dag=dag, root_dag=dag)
    bag.bag_dag(dag=dag_removed, root_dag=dag_removed)

    # Patch STORE_DAG_CODE off, since we don't want to store the code
    # for the DAGs defined in this file
    with unittest.mock.patch.object(settings, "STORE_DAG_CODE", False):
        dag.sync_to_db()
        dag_removed.sync_to_db()
        bag.sync_to_db()

    log_app.dag_bag = bag
    return dag, dag_removed
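
# --- Hedged usage sketch (not in the original source) ---
# A minimal check that the fixture's sync_to_db() calls landed: both DAG ids
# should now have DagModel rows. Assumes the Airflow 2.x DagModel.get_dagmodel
# helper and a working metadata DB, as the fixture itself does.
from airflow.models import DagModel

def test_dags_synced_to_db(dags):
    dag, dag_removed = dags
    for dag_id in (dag.dag_id, dag_removed.dag_id):
        assert DagModel.get_dagmodel(dag_id) is not None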
def setUp(self):
    # Airflow relies on reading the DAG from disk when triggering it.
    # Therefore write a temp file holding the DAG to trigger.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as f:
        self._tmpfile = f.name
        f.write(DAG_SCRIPT)
        f.flush()

    with create_session() as session:
        session.add(DagModel(dag_id=TRIGGERED_DAG_ID, fileloc=self._tmpfile))
        session.commit()

    self.dag = DAG(TEST_DAG_ID, default_args={"owner": "airflow", "start_date": DEFAULT_DATE})
    dagbag = DagBag(f.name, read_dags_from_db=False, include_examples=False)
    dagbag.bag_dag(self.dag, root_dag=self.dag)
    dagbag.sync_to_db()
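
# --- Hedged companion sketch (not in the original source) ---
# setUp() leaves a temp file on disk (delete=False) and rows in the metadata
# DB, so a tearDown along these lines is assumed to pair with it; the exact
# set of models cleaned up is an assumption.
import pathlib
from airflow.models import DagModel, DagRun, TaskInstance
from airflow.utils.session import create_session

def tearDown(self):
    with create_session() as session:
        for model in (DagModel, DagRun, TaskInstance):
            session.query(model).filter(
                model.dag_id.in_([TRIGGERED_DAG_ID, TEST_DAG_ID])
            ).delete(synchronize_session=False)
    pathlib.Path(self._tmpfile).unlink()  # remove the temp DAG file from setUp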
def test_scheduler_reschedule(self):
    """
    Checks if tasks that are not taken up by the executor
    get rescheduled
    """
    executor = TestExecutor()

    dagbag = DagBag(executor=executor)
    dagbag.dags.clear()
    dagbag.executor = executor

    dag = DAG(
        dag_id='test_scheduler_reschedule',
        start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    dag.clear()
    dag.is_subdag = False

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    orm_dag.is_paused = False
    session.merge(orm_dag)
    session.commit()

    dagbag.bag_dag(dag=dag, root_dag=dag, parent_dag=dag)

    @mock.patch('airflow.models.DagBag', return_value=dagbag)
    @mock.patch('airflow.models.DagBag.collect_dags')
    def do_schedule(function, function2):
        scheduler = SchedulerJob(num_runs=1, executor=executor)
        scheduler.heartrate = 0
        scheduler.run()

    do_schedule()
    self.assertEqual(1, len(executor.queued_tasks))
    executor.queued_tasks.clear()

    do_schedule()
    self.assertEqual(2, len(executor.queued_tasks))
def test_retry_still_in_executor(self):
    """
    Checks if the scheduler does not put a task in limbo, when a task is retried
    but is still present in the executor.
    """
    executor = TestExecutor()
    dagbag = DagBag(executor=executor)
    dagbag.dags.clear()
    dagbag.executor = executor

    dag = DAG(
        dag_id='test_retry_still_in_executor',
        start_date=DEFAULT_DATE,
        schedule_interval="@once")
    dag_task1 = BashOperator(
        task_id='test_retry_handling_op',
        bash_command='exit 1',
        retries=1,
        dag=dag,
        owner='airflow')

    dag.clear()
    dag.is_subdag = False

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    orm_dag.is_paused = False
    session.merge(orm_dag)
    session.commit()

    dagbag.bag_dag(dag=dag, root_dag=dag, parent_dag=dag)

    @mock.patch('airflow.models.DagBag', return_value=dagbag)
    @mock.patch('airflow.models.DagBag.collect_dags')
    def do_schedule(function, function2):
        # Use an empty file since the above mock will return the
        # expected DAGs. Also specify only a single file so that it doesn't
        # try to schedule the above DAG repeatedly.
        scheduler = SchedulerJob(num_runs=1,
                                 executor=executor,
                                 subdir=os.path.join(settings.DAGS_FOLDER,
                                                     "no_dags.py"))
        scheduler.heartrate = 0
        scheduler.run()

    do_schedule()
    self.assertEqual(1, len(executor.queued_tasks))

    def run_with_error(task):
        try:
            task.run()
        except AirflowException:
            pass

    ti_tuple = six.next(six.itervalues(executor.queued_tasks))
    (command, priority, queue, ti) = ti_tuple
    ti.task = dag_task1

    # fail execution
    run_with_error(ti)
    self.assertEqual(ti.state, State.UP_FOR_RETRY)
    self.assertEqual(ti.try_number, 1)

    ti.refresh_from_db(lock_for_update=True, session=session)
    ti.state = State.SCHEDULED
    session.merge(ti)
    session.commit()

    # the task is still present in the executor, so rescheduling
    # should not queue it again
    do_schedule()
    self.assertTrue(executor.has_task(ti))
    ti.refresh_from_db()
    self.assertEqual(ti.state, State.SCHEDULED)

    # now that the executor has cleared, the task should be allowed to re-queue
    executor.queued_tasks.clear()
    do_schedule()
    ti.refresh_from_db()
    self.assertEqual(ti.state, State.QUEUED)
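
# --- Hedged sketch (not in the original source) ---
# The scheduler tests above depend on a TestExecutor that accepts tasks but
# never runs them, so work piles up in queued_tasks and has_task() can be
# probed. A minimal stand-in along those lines, assuming the 1.10-era
# BaseExecutor API; the real helper in the Airflow test suite carries more
# bookkeeping.
from airflow.executors.base_executor import BaseExecutor

class TestExecutor(BaseExecutor):
    """Executor that queues everything and executes nothing."""

    def execute_async(self, key, command, queue=None, executor_config=None):
        pass  # never hand the task to a worker

    def heartbeat(self):
        pass  # never drain self.queued_tasks

    def terminate(self):
        pass

    def end(self):
        pass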