def test_scheduler_add_new_task(self):
    """
    Test if a task instance will be added if the dag is updated
    """
    dag = DAG(dag_id='test_scheduler_add_new_task', start_date=DEFAULT_DATE)
    # Registers itself on the DAG via the ``dag=`` kwarg.
    dag_task1 = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)

    tis = dr.get_task_instances()
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(tis), 1)

    # Adding a second task to the already-running DAG should surface as a
    # new task instance on the existing dag run after the next pass.
    dag_task2 = DummyOperator(task_id='dummy2', dag=dag, owner='airflow')

    queue = mock.Mock()
    scheduler._process_task_instances(dag, queue=queue)

    tis = dr.get_task_instances()
    self.assertEqual(len(tis), 2)
def test_scheduler_verify_max_active_runs(self):
    """
    Verify that no additional dagrun is scheduled once max_active_runs
    has been reached.
    """
    dag = DAG(dag_id='test_scheduler_verify_max_active_runs', start_date=DEFAULT_DATE)
    dag.max_active_runs = 1
    # Registers on the DAG via the ``dag=`` kwarg.
    task = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    session = settings.Session()
    session.merge(DagModel(dag_id=dag.dag_id))
    session.commit()
    session.close()

    scheduler = SchedulerJob(dag.dag_id, run_duration=1)

    first_run = scheduler.create_dag_run(dag)
    self.assertIsNotNone(first_run)
    # max_active_runs == 1, so a second run must be refused.
    second_run = scheduler.create_dag_run(dag)
    self.assertIsNone(second_run)

    dag.clear()

    # With max_active_runs forced to 0 a full scheduler pass must not
    # create any task instances at all.
    dag.max_active_runs = 0
    scheduler.run()

    session = settings.Session()
    self.assertEqual(
        len(session.query(TI).filter(TI.dag_id == dag.dag_id).all()), 0)
def _create_xcom_entries(self, dag_id, dag_run_id, execution_date, task_id, session=None):
    """Populate the DB with two XCom rows plus their DagModel and DagRun."""
    for i in (1, 2):
        XCom.set(
            key=f'test-xcom-key-{i}',
            value="TEST",
            execution_date=execution_date,
            task_id=task_id,
            dag_id=dag_id,
        )

    dag_model = DagModel(dag_id=dag_id)
    session.add(dag_model)

    dag_run = DR(
        dag_id=dag_id,
        run_id=dag_run_id,
        execution_date=execution_date,
        start_date=execution_date,
        run_type=DagRunType.MANUAL,
    )
    session.add(dag_run)
def test_scheduler_do_not_run_finished(self):
    """Task instances already in SUCCESS state must not be queued again."""
    dag = DAG(dag_id='test_scheduler_do_not_run_finished', start_date=DEFAULT_DATE)
    task = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    session = settings.Session()
    session.merge(DagModel(dag_id=dag.dag_id))
    session.commit()

    scheduler = SchedulerJob()
    dag.clear()

    dag_run = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dag_run)

    # Mark every task instance of the run as finished before processing.
    for ti in dag_run.get_task_instances(session=session):
        ti.state = State.SUCCESS
    session.commit()
    session.close()

    queue = mock.Mock()
    scheduler._process_task_instances(dag, queue=queue)

    queue.put.assert_not_called()
def test_refresh_packaged_dag(self, mock_dagmodel):
    """
    Test that we can refresh a packaged DAG
    """
    dag_id = "test_zip_dag"
    fileloc = os.path.realpath(
        os.path.join(TEST_DAGS_FOLDER, "test_zip.zip/test_zip.py"))

    # last_expired in the far future makes the cached DAG always look stale,
    # so DagBag.get_dag is forced to refresh it from disk.
    mock_dagmodel.return_value = DagModel()
    mock_dagmodel.return_value.last_expired = datetime.max.replace(
        tzinfo=timezone.utc)
    mock_dagmodel.return_value.fileloc = fileloc

    class TestDagBag(DagBag):
        # Class-level counter: tracks parses of the zipped DAG across calls.
        process_file_calls = 0

        def process_file(self, filepath, only_if_updated=True, safe_mode=True):
            # Substring check (not equality): for packaged DAGs process_file
            # presumably receives the .zip archive path, which is a prefix of
            # ``fileloc`` — TODO confirm against DagBag internals.
            if filepath in fileloc:
                TestDagBag.process_file_calls += 1
            return super().process_file(filepath, only_if_updated, safe_mode)

    dagbag = TestDagBag(dag_folder=os.path.realpath(TEST_DAGS_FOLDER), include_examples=False)

    # Parsed once while the DagBag loads the dags folder ...
    self.assertEqual(1, dagbag.process_file_calls)
    dag = dagbag.get_dag(dag_id)
    self.assertIsNotNone(dag)
    self.assertEqual(dag_id, dag.dag_id)
    # ... and a second time for the refresh forced by get_dag.
    self.assertEqual(2, dagbag.process_file_calls)
def _create_dag_model(self, session=None):
    """Insert a single paused DagModel fixture and return it."""
    model = DagModel(
        dag_id="TEST_DAG_1",
        fileloc="/tmp/dag_1.py",
        schedule_interval="2 2 * * *",
        is_paused=True,
    )
    session.add(model)
    return model
def test_scheduler_process_execute_task(self):
    """
    Test if process dag sends a task to the executor
    """
    dag = DAG(dag_id='test_scheduler_process_execute_task', start_date=DEFAULT_DATE)
    task = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    session = settings.Session()
    session.merge(DagModel(dag_id=dag.dag_id))
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dag_run = scheduler.schedule_dag(dag)
    self.assertIsNotNone(dag_run)

    queue = mock.Mock()
    scheduler.process_dag(dag, queue=queue)

    # The run's single task must have been handed to the executor queue.
    queue.put.assert_called_with(
        ((dag.dag_id, task.task_id, DEFAULT_DATE), None))

    scheduled_tis = dag_run.get_task_instances(state=State.SCHEDULED)
    self.assertIsNotNone(scheduled_tis)
def test_get_dag_without_refresh(self, mock_dagmodel):
    """
    Test that, once a DAG is loaded, it doesn't get refreshed again if it
    hasn't been expired.
    """
    dag_id = 'example_bash_operator'

    # last_expired=None means the cached DAG is still fresh, so get_dag
    # must not re-parse the file.
    mock_dagmodel.return_value = DagModel()
    mock_dagmodel.return_value.last_expired = None
    mock_dagmodel.return_value.fileloc = 'foo'

    class TestDagBag(models.DagBag):
        # Class-level counter: parses of example_bash_operator.py.
        process_file_calls = 0

        def process_file(self, filepath, only_if_updated=True, safe_mode=True):
            if 'example_bash_operator.py' == os.path.basename(filepath):
                TestDagBag.process_file_calls += 1
            # Propagate the parsed DAGs like the base class (and the sibling
            # TestDagBag overrides in this file) do.
            return super().process_file(filepath, only_if_updated, safe_mode)

    dagbag = TestDagBag(include_examples=True)
    # (Removed a no-op bare ``dagbag.process_file_calls`` expression here.)
    # Loading the examples during __init__ processes the file exactly once.
    self.assertEqual(1, dagbag.process_file_calls)
    self.assertIsNotNone(dagbag.get_dag(dag_id))
    # Should not call process_file again, since it's already loaded during init.
    self.assertEqual(1, dagbag.process_file_calls)
def _create_invalid_xcom_entries(self, execution_date, session=None):
    """
    Invalid XCom entries to test join query
    """
    for i in [1, 2]:
        XCom.set(
            key=f'invalid-xcom-key-{i}',
            value="TEST",
            execution_date=execution_date,
            task_id="invalid_task",
            dag_id="invalid_dag",
        )
    dag = DagModel(dag_id="invalid_dag")
    session.add(dag)
    # Run for the right dag but the wrong execution date.
    dagrun = DR(
        dag_id="invalid_dag",
        run_id="invalid_run_id",
        execution_date=execution_date + timedelta(days=1),
        start_date=execution_date,
        run_type=DagRunType.MANUAL,
    )
    session.add(dagrun)
    # Run with the right execution date but for a different dag.
    dagrun = DR(
        dag_id="invalid_dag_1",
        run_id="invalid_run_id",
        execution_date=execution_date,
        start_date=execution_date,
        run_type=DagRunType.MANUAL,
    )
    # BUG FIX: this second run was constructed but never added to the
    # session, so it was silently dropped on commit.
    session.add(dagrun)
    session.commit()
def test_scheduler_max_active_runs_respected_after_clear(self):
    """
    Test if _process_task_instances only schedules ti's up to max_active_runs
    (related to issue AIRFLOW-137)
    """
    dag = DAG(
        dag_id='test_scheduler_max_active_runs_respected_after_clear',
        start_date=DEFAULT_DATE)
    dag.max_active_runs = 3
    task = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    session = settings.Session()
    session.merge(DagModel(dag_id=dag.dag_id))
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    # First create up to 3 dagruns in RUNNING state.
    scheduler.create_dag_run(dag)

    # Reduce max_active_runs to 1
    dag.max_active_runs = 1

    # and schedule them in, so we can check how many
    # tasks are put on the queue (should be one, not 3)
    queue = mock.Mock()
    scheduler._process_task_instances(dag, queue=queue)

    queue.append.assert_called_with(
        (dag.dag_id, task.task_id, DEFAULT_DATE))
def test_scheduler_do_not_schedule_too_early(self):
    """A DAG whose start_date is far in the future must not get a dagrun."""
    dag = DAG(
        dag_id='test_scheduler_do_not_schedule_too_early',
        start_date=datetime.datetime(2200, 1, 1))
    task = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    session = settings.Session()
    session.merge(DagModel(dag_id=dag.dag_id))
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    # start_date is in the year 2200 — no run should be created.
    self.assertIsNone(scheduler.create_dag_run(dag))

    queue = mock.Mock()
    scheduler._process_task_instances(dag, queue=queue)

    queue.put.assert_not_called()
def test_get_accessible_dag_ids(self):
    """A user whose role has read access to a DAG can see that dag_id."""
    role_name = 'MyRole1'
    permission_action = ['can_dag_read']
    dag_id = 'dag_id'
    username = "******"

    self.security_manager.init_role(role_name, [], [])
    self.security_manager.sync_perm_for_dag(  # type: ignore  # pylint: disable=no-member
        dag_id,
        access_control={role_name: permission_action})
    role = self.security_manager.find_role(role_name)
    user = self.security_manager.add_user(
        username=username,
        first_name=username,
        last_name=username,
        email=f"{username}@fab.org",
        role=role,
        password=username,
    )

    # Consistency: reuse the ``dag_id`` variable instead of repeating the
    # literal, matching the sibling tests in this file.
    dag_model = DagModel(dag_id=dag_id, fileloc="/tmp/dag_.py", schedule_interval="2 2 * * *")
    self.session.add(dag_model)
    self.session.commit()

    self.assertEqual(self.security_manager.get_accessible_dag_ids(user), {'dag_id'})
def test_emit_scheduling_delay(self, stats_mock):
    """
    Tests that dag scheduling delay stat is set properly once running
    scheduled dag.
    dag_run.update_state() invokes the _emit_true_scheduling_delay_stats_for_finished_state method.
    """
    dag = DAG(dag_id='test_emit_dag_stats', start_date=days_ago(1))
    dag_task = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(
        dag_id=dag.dag_id,
        has_task_concurrency_limits=False,
        next_dagrun=dag.start_date,
        next_dagrun_create_after=dag.following_schedule(dag.start_date),
        is_active=True,
    )
    session.add(orm_dag)
    # Flush (not commit) so the DagModel row is visible to create_dagrun
    # within this same session/transaction.
    session.flush()
    dag_run = dag.create_dagrun(
        run_type=DagRunType.SCHEDULED,
        state=State.SUCCESS,
        execution_date=dag.start_date,
        start_date=dag.start_date,
        session=session,
    )
    ti = dag_run.get_task_instance(dag_task.task_id)
    ti.set_state(State.SUCCESS, session)
    session.commit()
    session.close()

    # Triggers _emit_true_scheduling_delay_stats_for_finished_state.
    dag_run.update_state()

    # Expected delay: first task's start minus the moment this run could
    # first have been scheduled (the following schedule of its
    # execution_date).
    true_delay = (ti.start_date - dag.following_schedule(dag_run.execution_date)).total_seconds()

    stats_mock.assert_called()
    sched_delay_stat_call = call(f'dagrun.{dag.dag_id}.first_task_scheduling_delay', true_delay)
    self.assertIn(sched_delay_stat_call, stats_mock.mock_calls)
def test_should_respond_200_with_schedule_interval_none(
        self, session=None):
    """GET /dags/{id} serializes a null schedule_interval correctly."""
    session.add(DagModel(
        dag_id="TEST_DAG_1",
        fileloc="/tmp/dag_1.py",
        schedule_interval=None,
    ))
    session.commit()

    response = self.client.get("/api/v1/dags/TEST_DAG_1",
                               environ_overrides={'REMOTE_USER': "******"})

    assert response.status_code == 200
    expected = {
        "dag_id": "TEST_DAG_1",
        "description": None,
        "fileloc": "/tmp/dag_1.py",
        "file_token": 'Ii90bXAvZGFnXzEucHki.EnmIdPaUPo26lHQClbWMbDFD1Pk',
        "is_paused": False,
        "is_subdag": False,
        "owners": [],
        "root_dag_id": None,
        "schedule_interval": None,
        "tags": [],
    }
    self.assertEqual(expected, response.json)
def test_scheduler_process_check_heartrate(self):
    """
    Test if process dag honors the heartrate
    """
    dag = DAG(dag_id='test_scheduler_process_check_heartrate', start_date=DEFAULT_DATE)
    task = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    # Pretend the scheduler just ran this DAG so the heartrate gate applies.
    orm_dag.last_scheduler_run = datetime.datetime.now()
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    # Huge heartrate: the DAG was "just" scheduled, so it must be skipped.
    scheduler.heartrate = 1000

    dag.clear()

    dag_run = scheduler.schedule_dag(dag)
    self.assertIsNotNone(dag_run)

    queue = mock.Mock()
    scheduler.process_dag(dag, queue=queue)

    queue.put.assert_not_called()
def test_scheduler_verify_max_active_runs_and_dagrun_timeout(self):
    """
    Test that a dagrun will not be scheduled if max_dag_runs has been
    reached and dagrun_timeout is not reached

    Test that a dagrun will be scheduled if max_dag_runs has been reached
    but dagrun_timeout is also reached
    """
    dag = DAG(
        dag_id='test_scheduler_verify_max_active_runs_and_dagrun_timeout',
        start_date=DEFAULT_DATE)
    dag.max_active_runs = 1
    dag.dagrun_timeout = datetime.timedelta(seconds=60)
    dag_task1 = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)

    # Should not be scheduled as DagRun has not timed out and max_active_runs is reached
    new_dr = scheduler.create_dag_run(dag)
    self.assertIsNone(new_dr)

    # Should be scheduled as dagrun_timeout has passed
    # (backdate the running dagrun well past its 60-second timeout)
    dr.start_date = datetime.datetime.now() - datetime.timedelta(days=1)
    session.merge(dr)
    session.commit()
    new_dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(new_dr)
def test_dont_get_inaccessible_dag_ids_for_dag_resource_permission(self):
    """get_readable_dag_ids() must not return DAGs the user can only EDIT."""
    username = "******"
    role_name = "MyRole1"
    permission_action = [permissions.ACTION_CAN_EDIT]
    dag_id = "dag_id"

    user = fab_utils.create_user(
        self.app,
        username,
        role_name,
        permissions=[(permissions.ACTION_CAN_EDIT, permissions.RESOURCE_DAG)],
    )

    dag_model = DagModel(dag_id=dag_id, fileloc="/tmp/dag_.py", schedule_interval="2 2 * * *")
    self.session.add(dag_model)
    self.session.commit()

    self.security_manager.sync_perm_for_dag(  # type: ignore  # pylint: disable=no-member
        dag_id,
        access_control={role_name: permission_action})

    # CAN_EDIT alone does not grant read visibility.
    assert self.security_manager.get_readable_dag_ids(user) == set()
def test_refresh_py_dag(self, mock_dagmodel):
    """
    Test that we can refresh an ordinary .py DAG
    """
    EXAMPLE_DAGS_FOLDER = airflow.example_dags.__path__[0]
    dag_id = "example_bash_operator"
    fileloc = os.path.realpath(
        os.path.join(EXAMPLE_DAGS_FOLDER, "example_bash_operator.py"))

    # last_expired in the far future makes the cached DAG always look stale,
    # forcing DagBag.get_dag to re-parse the file.
    mock_dagmodel.return_value = DagModel()
    mock_dagmodel.return_value.last_expired = datetime.max.replace(
        tzinfo=timezone.utc)
    mock_dagmodel.return_value.fileloc = fileloc

    class TestDagBag(DagBag):
        # Class-level counter: parses of the example DAG file.
        process_file_calls = 0

        def process_file(self, filepath, only_if_updated=True, safe_mode=True):
            if filepath == fileloc:
                TestDagBag.process_file_calls += 1
            return super().process_file(filepath, only_if_updated, safe_mode)

    dagbag = TestDagBag(dag_folder=self.empty_dir, include_examples=True)

    # Parsed once while the DagBag loads the bundled examples ...
    self.assertEqual(1, dagbag.process_file_calls)
    dag = dagbag.get_dag(dag_id)
    self.assertIsNotNone(dag)
    self.assertEqual(dag_id, dag.dag_id)
    # ... and once more for the refresh triggered by get_dag.
    self.assertEqual(2, dagbag.process_file_calls)
def test_get_accessible_dag_ids(self):
    """A user with DAG read permission can see the dag_id they were granted."""
    role_name = 'MyRole1'
    permission_action = [permissions.ACTION_CAN_READ]
    dag_id = 'dag_id'
    username = "******"
    user = fab_utils.create_user(
        self.app,
        username,
        role_name,
        # The (CAN_READ, RESOURCE_DAG) tuple was duplicated; listing it once
        # is sufficient.
        permissions=[
            (permissions.ACTION_CAN_READ, permissions.RESOURCE_DAG),
        ],
    )

    dag_model = DagModel(dag_id=dag_id, fileloc="/tmp/dag_.py", schedule_interval="2 2 * * *")
    self.session.add(dag_model)
    self.session.commit()

    self.security_manager.sync_perm_for_dag(  # type: ignore  # pylint: disable=no-member
        dag_id,
        access_control={role_name: permission_action})

    self.assertEqual(self.security_manager.get_accessible_dag_ids(user), {'dag_id'})
def test_should_response_200_with_schedule_interval_none(
        self, session=None):
    """A null schedule_interval round-trips through GET /dags/{id}."""
    session.add(DagModel(
        dag_id="TEST_DAG_1",
        fileloc="/tmp/dag_1.py",
        schedule_interval=None,
    ))
    session.commit()

    response = self.client.get("/api/v1/dags/TEST_DAG_1",
                               environ_overrides={'REMOTE_USER': "******"})

    assert response.status_code == 200
    current_response = response.json
    # Overwrite fileloc so the comparison does not depend on it.
    current_response["fileloc"] = "/tmp/test-dag.py"
    expected = {
        "dag_id": "TEST_DAG_1",
        "description": None,
        "fileloc": "/tmp/test-dag.py",
        "is_paused": False,
        "is_subdag": False,
        "owners": [],
        "root_dag_id": None,
        "schedule_interval": None,
        "tags": [],
    }
    self.assertEqual(expected, current_response)
def test_scheduler_process_task_instances(self):
    """
    Test if _process_task_instances puts the right task instances into the
    queue.
    """
    # NOTE(review): the dag_id 'test_scheduler_process_execute_task' looks
    # copy-pasted from the sibling test of that name — confirm whether it
    # should match this test's own name for DB isolation.
    dag = DAG(dag_id='test_scheduler_process_execute_task', start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)

    queue = mock.Mock()
    scheduler._process_task_instances(dag, queue=queue)

    # The single task of the fresh run should have been queued exactly so.
    queue.append.assert_called_with(
        (dag.dag_id, dag_task1.task_id, DEFAULT_DATE))
def _create_dag_runs(self):
    """Insert ten successful manual dag runs (one per date) and return them."""
    dates = [
        '2020-06-10T18:00:00+00:00',
        '2020-06-11T18:00:00+00:00',
        '2020-06-12T18:00:00+00:00',
        '2020-06-13T18:00:00+00:00',
        '2020-06-14T18:00:00+00:00',
        '2020-06-15T18:00:00Z',
        '2020-06-16T18:00:00Z',
        '2020-06-17T18:00:00Z',
        '2020-06-18T18:00:00Z',
        '2020-06-19T18:00:00Z',
    ]
    dag = DagModel(dag_id="TEST_DAG_ID")
    # enumerate() instead of range(len(...)): idiomatic, and avoids the
    # repeated ``dates[i]`` index lookups.
    dag_runs = [
        DagRun(
            dag_id="TEST_DAG_ID",
            run_id="TEST_START_EXEC_DAY_1" + str(i),
            run_type=DagRunType.MANUAL,
            execution_date=timezone.parse(date),
            start_date=timezone.parse(date),
            external_trigger=True,
            state='success',
        )
        for i, date in enumerate(dates)
    ]
    with create_session() as session:
        session.add_all(dag_runs)
        session.add(dag)
    return dag_runs
def test_scheduler_fail_dagrun_timeout(self):
    """
    Test that a dagrun will be set to failed if it times out
    """
    dag = DAG(dag_id='test_scheduler_fail_dagrun_timeout', start_date=DEFAULT_DATE)
    dag.dagrun_timeout = datetime.timedelta(seconds=60)
    dag_task1 = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)
    # Backdate the run well past its 60-second timeout so the next
    # scheduling pass fails it.
    dr.start_date = datetime.datetime.now() - datetime.timedelta(days=1)
    session.merge(dr)
    session.commit()

    # Creating the next run triggers the timeout handling of the first one.
    dr2 = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr2)

    dr.refresh_from_db(session=session)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(dr.state, State.FAILED)
def test_serialize(self):
    """DAGSchema should dump every DagModel field, tags and cron included."""
    dag_model = DagModel(
        dag_id="test_dag_id",
        root_dag_id="test_root_dag_id",
        is_paused=True,
        is_subdag=False,
        fileloc="/root/airflow/dags/my_dag.py",
        owners="airflow1,airflow2",
        description="The description",
        schedule_interval="5 4 * * *",
        tags=[DagTag(name="tag-1"), DagTag(name="tag-2")],
    )

    expected = {
        "dag_id": "test_dag_id",
        "description": "The description",
        "fileloc": "/root/airflow/dags/my_dag.py",
        "file_token": SERIALIZER.dumps("/root/airflow/dags/my_dag.py"),
        "is_paused": True,
        "is_subdag": False,
        # The comma-separated owners string is split into a list.
        "owners": ["airflow1", "airflow2"],
        "root_dag_id": "test_root_dag_id",
        "schedule_interval": {"__type": "CronExpression", "value": "5 4 * * *"},
        "tags": [{"name": "tag-1"}, {"name": "tag-2"}],
    }
    assert DAGSchema().dump(dag_model) == expected
def _create_dag_models(self, count, session=None):
    """Add ``count`` DagModel rows named TEST_DAG_1 .. TEST_DAG_<count>."""
    for num in range(1, count + 1):
        session.add(DagModel(
            dag_id=f"TEST_DAG_{num}",
            fileloc=f"/tmp/dag_{num}.py",
            schedule_interval="2 2 * * *",
        ))
def test_response_400(self, name, url, request_json, expected_response, session):
    """Parametrized case: each malformed payload must yield HTTP 400."""
    del name  # only used for the parametrize test id

    session.add(DagModel(dag_id="TEST_DAG_ID"))
    session.commit()

    response = self.client.post(url, json=request_json,
                                environ_overrides={'REMOTE_USER': "******"})

    assert response.status_code == 400, response.data
    assert expected_response == response.json
def _add_dag_needing_dagrun():
    """Persist an active, unpaused DagModel whose next run is already due."""
    with create_session() as session:
        dag_model = DagModel(dag_id="test")
        dag_model.is_paused = False
        dag_model.is_active = True
        # A creation deadline of "now" makes the dag immediately eligible.
        dag_model.next_dagrun_create_after = pendulum.now()
        session.merge(dag_model)
        session.commit()
def test_response_400(self, name, url, request_json, expected_response, session):
    """Parametrized case: malformed payloads return 400 with the expected body."""
    del name  # only used for the parametrize test id

    session.add(DagModel(dag_id="TEST_DAG_ID"))
    session.commit()

    response = self.client.post(url, json=request_json)

    self.assertEqual(response.status_code, 400, response.data)
    self.assertEqual(expected_response, response.json)
def test_should_response_400_for_naive_datetime_and_bad_datetime(self, data, expected, session):
    """Naive or malformed execution dates are rejected with a 400 detail."""
    session.add(DagModel(dag_id="TEST_DAG_ID"))
    session.commit()

    response = self.client.post(
        "api/v1/dags/TEST_DAG_ID/dagRuns",
        json=data,
        environ_overrides={'REMOTE_USER': "******"},
    )

    assert response.status_code == 400
    assert response.json['detail'] == expected
def _create_test_dag_run(self, state='running', extra_dag=False, commit=True):
    """
    Build two DagRuns for TEST_DAG_ID (plus, optionally, one run each for
    two extra DAGs) and return the list of runs.

    :param state: state of the first run; the second keeps the model default
    :param extra_dag: also create TEST_DAG_ID_3 / TEST_DAG_ID_4 fixtures
    :param commit: persist the objects to the DB; if False they are
        returned unsaved
    """
    dag_runs = []
    dags = [DagModel(dag_id="TEST_DAG_ID")]

    dagrun_model_1 = DagRun(
        dag_id="TEST_DAG_ID",
        run_id="TEST_DAG_RUN_ID_1",
        run_type=DagRunType.MANUAL.value,
        execution_date=timezone.parse(self.default_time),
        start_date=timezone.parse(self.default_time),
        external_trigger=True,
        state=state,
    )
    dag_runs.append(dagrun_model_1)

    # Second run: different execution_date, no explicit state.
    dagrun_model_2 = DagRun(
        dag_id="TEST_DAG_ID",
        run_id="TEST_DAG_RUN_ID_2",
        run_type=DagRunType.MANUAL.value,
        execution_date=timezone.parse(self.default_time_2),
        start_date=timezone.parse(self.default_time),
        external_trigger=True,
    )
    dag_runs.append(dagrun_model_2)

    if extra_dag:
        # Extra DAGs with ids/run_ids suffixed 3 and 4, one run each.
        for i in range(3, 5):
            dags.append(DagModel(dag_id='TEST_DAG_ID_' + str(i)))
            dag_runs.append(
                DagRun(
                    dag_id='TEST_DAG_ID_' + str(i),
                    run_id='TEST_DAG_RUN_ID_' + str(i),
                    run_type=DagRunType.MANUAL.value,
                    execution_date=timezone.parse(self.default_time_2),
                    start_date=timezone.parse(self.default_time),
                    external_trigger=True,
                ))
    if commit:
        with create_session() as session:
            session.add_all(dag_runs)
            session.add_all(dags)
    return dag_runs