def test_dag_with_system_exit(self):
    """
    Test to check that a DAG with a sys.exit() doesn't break the scheduler.
    """
    dag_id = 'exit_test_dag'
    dag_ids = [dag_id]
    dag_directory = os.path.join(models.DAGS_FOLDER, "..",
                                 "dags_with_system_exit")
    dag_file = os.path.join(dag_directory, 'b_test_scheduler_dags.py')

    dagbag = DagBag(dag_folder=dag_file)
    for dag_id in dag_ids:
        dag = dagbag.get_dag(dag_id)
        dag.clear()

    scheduler = SchedulerJob(dag_ids=dag_ids,
                             subdir=dag_directory,
                             num_runs=1,
                             **self.default_scheduler_args)
    scheduler.run()
    session = settings.Session()
    self.assertEqual(
        len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)
def do_schedule(function, function2):
    scheduler = SchedulerJob(
        num_runs=1,
        executor=executor,
    )
    scheduler.heartrate = 0
    scheduler.run()
def test_scheduler_do_not_schedule_removed_task(self):
    dag = DAG(
        dag_id='test_scheduler_do_not_schedule_removed_task',
        start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.schedule_dag(dag)
    self.assertIsNotNone(dr)

    # Re-create the dag without the task, as if it had been removed
    dag = DAG(
        dag_id='test_scheduler_do_not_schedule_removed_task',
        start_date=DEFAULT_DATE)

    queue = mock.Mock()
    scheduler.process_dag(dag, queue=queue)

    queue.put.assert_not_called()
def test_scheduler_verify_max_active_runs(self):
    """
    Test that a dagrun will not be scheduled if max_dag_runs has been reached
    """
    dag = DAG(
        dag_id='test_scheduler_verify_max_active_runs',
        start_date=DEFAULT_DATE)
    dag.max_active_runs = 1

    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)

    dr = scheduler.create_dag_run(dag)
    self.assertIsNone(dr)
def test_scheduler_process_execute_task(self):
    """
    Test that process_dag sends a task to the executor
    """
    dag = DAG(
        dag_id='test_scheduler_process_execute_task',
        start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.schedule_dag(dag)
    self.assertIsNotNone(dr)

    queue = mock.Mock()
    scheduler.process_dag(dag, queue=queue)

    queue.put.assert_called_with(
        ((dag.dag_id, dag_task1.task_id, DEFAULT_DATE), None)
    )

    tis = dr.get_task_instances(state=State.SCHEDULED)
    self.assertIsNotNone(tis)
def test_scheduler_process_task_instances(self):
    """
    Test if _process_task_instances puts the right task instances into the
    queue.
    """
    dag = DAG(
        dag_id='test_scheduler_process_execute_task',
        start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)

    queue = mock.Mock()
    scheduler._process_task_instances(dag, queue=queue)

    queue.append.assert_called_with(
        (dag.dag_id, dag_task1.task_id, DEFAULT_DATE)
    )
def test_scheduler_do_not_schedule_too_early(self):
    dag = DAG(
        dag_id='test_scheduler_do_not_schedule_too_early',
        start_date=datetime.datetime(2200, 1, 1))
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNone(dr)

    queue = mock.Mock()
    scheduler._process_task_instances(dag, queue=queue)

    queue.put.assert_not_called()
def test_scheduler_does_not_run_excluded(self):
    dag = DAG(
        dag_id='test_scheduler_does_not_run_excluded',
        start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)

    tis = dr.get_task_instances(session=session)
    for ti in tis:
        ti.state = State.EXCLUDED

    session.commit()
    session.close()

    queue = mock.Mock()
    scheduler._process_task_instances(dag, queue=queue)

    queue.put.assert_not_called()
def test_scheduler_pooled_tasks(self):
    """
    Test that the scheduler handles queued tasks correctly
    See issue #1299
    """
    session = settings.Session()
    if not (session.query(Pool)
            .filter(Pool.pool == 'test_queued_pool')
            .first()):
        pool = Pool(pool='test_queued_pool', slots=5)
        session.merge(pool)
        session.commit()
    session.close()

    dag_id = 'test_scheduled_queued_tasks'
    dag = self.dagbag.get_dag(dag_id)
    dag.clear()

    scheduler = SchedulerJob(dag_id, num_runs=10)
    scheduler.run()

    task_1 = dag.tasks[0]
    ti = TI(task_1, dag.start_date)
    ti.refresh_from_db()
    self.assertEqual(ti.state, State.FAILED)

    dag.clear()
def test_scheduler_fail_dagrun_timeout(self):
    """
    Test that a dagrun is set to failed when it times out
    """
    dag = DAG(
        dag_id='test_scheduler_fail_dagrun_timeout',
        start_date=DEFAULT_DATE)
    dag.dagrun_timeout = datetime.timedelta(seconds=60)

    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)
    dr.start_date = datetime.datetime.now() - datetime.timedelta(days=1)
    session.merge(dr)
    session.commit()

    dr2 = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr2)

    dr.refresh_from_db(session=session)
    self.assertEqual(dr.state, State.FAILED)
def test_scheduler_process_check_heartrate(self):
    """
    Test that process_dag honors the heartrate
    """
    dag = DAG(
        dag_id='test_scheduler_process_check_heartrate',
        start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    orm_dag.last_scheduler_run = datetime.datetime.now()
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    scheduler.heartrate = 1000

    dag.clear()

    dr = scheduler.schedule_dag(dag)
    self.assertIsNotNone(dr)

    queue = mock.Mock()
    scheduler.process_dag(dag, queue=queue)

    queue.put.assert_not_called()
def test_scheduler_do_not_run_finished(self):
    dag = DAG(
        dag_id='test_scheduler_do_not_run_finished',
        start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)

    tis = dr.get_task_instances(session=session)
    for ti in tis:
        ti.state = State.SUCCESS

    session.commit()
    session.close()

    queue = mock.Mock()
    scheduler._process_task_instances(dag, queue=queue)

    queue.put.assert_not_called()
def test_scheduler_add_new_task(self):
    """
    Test if a task instance will be added if the dag is updated
    """
    dag = DAG(
        dag_id='test_scheduler_add_new_task',
        start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)

    tis = dr.get_task_instances()
    self.assertEqual(len(tis), 1)

    dag_task2 = DummyOperator(
        task_id='dummy2',
        dag=dag,
        owner='airflow')

    queue = mock.Mock()
    scheduler._process_task_instances(dag, queue=queue)

    tis = dr.get_task_instances()
    self.assertEqual(len(tis), 2)
def test_scheduler_verify_max_active_runs_and_dagrun_timeout(self):
    """
    Test that a dagrun will not be scheduled if max_dag_runs has been
    reached and dagrun_timeout is not reached.

    Test that a dagrun will be scheduled if max_dag_runs has been reached
    but dagrun_timeout is also reached.
    """
    dag = DAG(
        dag_id='test_scheduler_verify_max_active_runs_and_dagrun_timeout',
        start_date=DEFAULT_DATE)
    dag.max_active_runs = 1
    dag.dagrun_timeout = datetime.timedelta(seconds=60)

    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)

    # Should not be scheduled as the DagRun has not timed out yet and
    # max_active_runs is reached
    new_dr = scheduler.create_dag_run(dag)
    self.assertIsNone(new_dr)

    # Should be scheduled as dagrun_timeout has passed
    dr.start_date = datetime.datetime.now() - datetime.timedelta(days=1)
    session.merge(dr)
    session.commit()
    new_dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(new_dr)
def _test_schedule(self, interval, schedule_criteria, start_date=True,
                   default_start_date=-10):
    print("Test {} date of {} schedule job.".format(
        "start" if start_date else "end", interval))
    today = datetime.today()
    today_utc = datetime.now(timezone.utc)
    _start_date = today + timedelta(days=default_start_date)
    _end_date = None
    # catchup only applies when we are varying the start date
    catchup = start_date

    def fmt(date):
        # Renamed from `format` to avoid shadowing the builtin.
        if date is None:
            return "-"
        return date.strftime("%Y/%m/%d %H:%M:%S")

    deltas = (
        (-2, "day_before_yesterday"),
        (-1, "yesterday"),
        (0, "today"),
        (1, "tomorrow"),
        (2, "day_after_tomorrow"),
    )

    for delta, description in deltas:
        date = today + timedelta(days=delta)
        if start_date:
            _start_date = datetime.today() + timedelta(days=delta)
        else:
            _end_date = date

        dag = DAG(f"dag_starts_{description}",
                  start_date=_start_date,
                  end_date=_end_date,
                  schedule_interval=interval,
                  catchup=catchup)
        scheduler = SchedulerJob()
        dag.clear()
        dr = scheduler.create_dag_run(dag)
        execution_date = None if dr is None else dr.execution_date
        dates = (f"today={fmt(today)} (UTC={fmt(today_utc)}), ",
                 f"start_date={fmt(dag.start_date)}, ",
                 f"end_date={fmt(dag.end_date)}, ",
                 f"execution_date={fmt(execution_date)}")
        dates = "\n\t" + "\n\t".join(dates)
        if schedule_criteria(delta):
            self.assertIsNotNone(dr)
            print(f"{description} is scheduled {dates}.")
        else:
            self.assertIsNone(dr)
            print(f"{description} is not scheduled {dates}.")
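# A hypothetical invocation of the _test_schedule helper above, not taken
# from this suite: the interval and criteria are illustrative assumptions.
# If the scheduler only creates a run once a schedule period has fully
# elapsed, a daily DAG should get a run only for start dates strictly in
# the past, hence the `delta < 0` criteria.
def test_daily_schedule_start_date(self):
    self._test_schedule("@daily", lambda delta: delta < 0)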
def do_schedule(function, function2):
    # Use an empty file since the above mock will return the
    # expected DAGs. Also specify only a single file so that it doesn't
    # try to schedule the above DAG repeatedly.
    scheduler = SchedulerJob(num_runs=1,
                             executor=executor,
                             subdir=os.path.join(models.DAGS_FOLDER,
                                                 "no_dags.py"))
    scheduler.heartrate = 0
    scheduler.run()
def do_schedule(function, function2):
    # Use an empty file since the above mock will return the
    # expected DAGs. Also specify only a single file so that it doesn't
    # try to schedule the above DAG repeatedly.
    scheduler = SchedulerJob(num_runs=1,
                             executor=executor,
                             subdir=os.path.join(
                                 settings.DAGS_FOLDER, "no_dags.py"))
    scheduler.heartrate = 0
    scheduler.run()
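# Neither `executor` nor the `function`/`function2` arguments are defined in
# the do_schedule variants above; they appear to come from the enclosing
# test, e.g. an executor instance in scope and mocks injected by mock.patch
# decorators. A hedged sketch of how a call might look, assuming the
# TestExecutor used in test_scheduler_pooled_tasks below is available:
executor = TestExecutor()
do_schedule(mock.Mock(), mock.Mock())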
def evaluate_dagrun(
        self,
        dag_id,
        first_task_state,
        second_task_state,
        dagrun_state,
        run_kwargs=None,
        advance_execution_date=False,
        session=None):
    """
    Helper for testing DagRun states with simple two-task DAGs
    """
    if run_kwargs is None:
        run_kwargs = {}

    scheduler = SchedulerJob()
    dag = get_dag(dag_id, TEST_DAGS_FOLDER)
    dr = scheduler.schedule_dag(dag)
    if advance_execution_date:
        # run a second time to schedule a dagrun after the start_date
        dr = scheduler.schedule_dag(dag)
    ex_date = dr.execution_date

    try:
        dag.run(start_date=ex_date, end_date=ex_date, **run_kwargs)
    except AirflowException:
        pass

    # test tasks
    task_1, task_2 = dag.tasks
    ti = TI(task_1, ex_date)
    ti.refresh_from_db()
    self.assertEqual(ti.state, first_task_state)
    ti = TI(task_2, ex_date)
    ti.refresh_from_db()
    self.assertEqual(ti.state, second_task_state)

    # load dagrun
    dr = session.query(DagRun).filter(
        DagRun.dag_id == dag.dag_id,
        DagRun.execution_date == ex_date
    ).first()

    # dagrun is running
    self.assertEqual(dr.state, State.RUNNING)

    dag.get_active_runs()

    # dagrun reached the expected final state
    self.assertEqual(dr.state, dagrun_state)
def test_dag_get_active_runs(self):
    """
    Test to check that a DAG returns its active runs
    """
    now = datetime.datetime.now()
    six_hours_ago_to_the_hour = (now - datetime.timedelta(hours=6)).replace(
        minute=0, second=0, microsecond=0)

    START_DATE = six_hours_ago_to_the_hour
    DAG_NAME1 = 'get_active_runs_test'

    default_args = {
        'owner': 'airflow',
        'depends_on_past': False,
        'start_date': START_DATE
    }
    dag1 = DAG(DAG_NAME1,
               schedule_interval='* * * * *',
               max_active_runs=1,
               default_args=default_args)

    run_this_1 = DummyOperator(task_id='run_this_1', dag=dag1)
    run_this_2 = DummyOperator(task_id='run_this_2', dag=dag1)
    run_this_2.set_upstream(run_this_1)
    run_this_3 = DummyOperator(task_id='run_this_3', dag=dag1)
    run_this_3.set_upstream(run_this_2)

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag1.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag1.clear()

    dr = scheduler.create_dag_run(dag1)

    # We had better get a dag run
    self.assertIsNotNone(dr)

    execution_date = dr.execution_date

    running_dates = dag1.get_active_runs()

    try:
        running_date = running_dates[0]
    except IndexError:
        running_date = 'Except'

    self.assertEqual(execution_date, running_date,
                     'Running Date must match Execution Date')
def test_trigger_controller_dag(self):
    dag = self.dagbag.get_dag('example_trigger_controller_dag')
    target_dag = self.dagbag.get_dag('example_trigger_target_dag')
    dag.clear()
    target_dag.clear()

    scheduler = SchedulerJob()
    queue = mock.Mock()
    scheduler._process_task_instances(target_dag, queue=queue)
    self.assertFalse(queue.append.called)

    job = BackfillJob(
        dag=dag,
        start_date=DEFAULT_DATE,
        end_date=DEFAULT_DATE,
        ignore_first_depends_on_past=True
    )
    job.run()

    scheduler = SchedulerJob()
    queue = mock.Mock()
    scheduler._process_task_instances(target_dag, queue=queue)
    self.assertTrue(queue.append.called)

    target_dag.clear()
    dag.clear()
def evaluate_dagrun(
        self,
        dag_id,
        expected_task_states,  # dict of task_id: state
        dagrun_state,
        run_kwargs=None,
        advance_execution_date=False,
        session=None):
    """
    Helper for testing DagRun states with simple two-task DAGs.
    This is hackish: a dag run is created but its tasks are run
    by a backfill.
    """
    if run_kwargs is None:
        run_kwargs = {}

    scheduler = SchedulerJob()
    dag = self.dagbag.get_dag(dag_id)
    dag.clear()
    dr = scheduler.schedule_dag(dag)
    if advance_execution_date:
        # run a second time to schedule a dagrun after the start_date
        dr = scheduler.schedule_dag(dag)
    ex_date = dr.execution_date

    try:
        dag.run(start_date=ex_date, end_date=ex_date, **run_kwargs)
    except AirflowException:
        pass

    # test tasks
    for task_id, expected_state in expected_task_states.items():
        task = dag.get_task(task_id)
        ti = TI(task, ex_date)
        ti.refresh_from_db()
        self.assertEqual(ti.state, expected_state)

    # load dagrun
    dr = DagRun.find(dag_id=dag_id, execution_date=ex_date)
    dr = dr[0]
    dr.dag = dag

    # dagrun is running
    self.assertEqual(dr.state, State.RUNNING)

    dr.update_state()

    # dagrun reached the expected final state
    self.assertEqual(dr.state, dagrun_state)
def evaluate_dagrun(
        self,
        dag_id,
        expected_task_states,  # dict of task_id: state
        dagrun_state,
        run_kwargs=None,
        advance_execution_date=False,
        session=None):
    """
    Helper for testing DagRun states with simple two-task DAGs.
    This is hackish: a dag run is created but its tasks are run
    by a backfill.
    """
    if run_kwargs is None:
        run_kwargs = {}

    scheduler = SchedulerJob(**self.default_scheduler_args)
    dag = self.dagbag.get_dag(dag_id)
    dag.clear()
    dr = scheduler.create_dag_run(dag)
    if advance_execution_date:
        # run a second time to schedule a dagrun after the start_date
        dr = scheduler.create_dag_run(dag)
    ex_date = dr.execution_date

    try:
        dag.run(start_date=ex_date, end_date=ex_date, **run_kwargs)
    except AirflowException:
        pass

    # test tasks
    for task_id, expected_state in expected_task_states.items():
        task = dag.get_task(task_id)
        ti = TI(task, ex_date)
        ti.refresh_from_db()
        self.assertEqual(ti.state, expected_state)

    # load dagrun
    dr = DagRun.find(dag_id=dag_id, execution_date=ex_date)
    dr = dr[0]
    dr.dag = dag

    # dagrun is running
    self.assertEqual(dr.state, State.RUNNING)

    dr.update_state()

    # dagrun reached the expected final state
    self.assertEqual(dr.state, dagrun_state)
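# A hedged usage sketch for the dict-based evaluate_dagrun helper above. The
# dag_id appears elsewhere in this suite (see test_scheduler_multiprocessing);
# the task ids and expected states are illustrative assumptions, not verified
# against the test dagbag.
def test_dagrun_success(self):
    self.evaluate_dagrun(
        dag_id='test_dagrun_states_success',
        expected_task_states={
            'test_dagrun_fail': State.FAILED,
            'test_dagrun_succeed': State.SUCCESS,
        },
        dagrun_state=State.SUCCESS)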
def test_scheduler_auto_align(self):
    """
    Test if the schedule_interval will be auto aligned with the start_date
    such that if the start_date coincides with the schedule the first
    execution_date will be start_date, otherwise it will be start_date +
    interval.
    """
    dag = DAG(
        dag_id='test_scheduler_auto_align_1',
        start_date=datetime.datetime(2016, 1, 1, 10, 10, 0),
        schedule_interval="4 5 * * *"
    )
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)
    self.assertEqual(dr.execution_date,
                     datetime.datetime(2016, 1, 2, 5, 4))

    dag = DAG(
        dag_id='test_scheduler_auto_align_2',
        start_date=datetime.datetime(2016, 1, 1, 10, 10, 0),
        schedule_interval="10 10 * * *"
    )
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)
    self.assertEqual(dr.execution_date,
                     datetime.datetime(2016, 1, 1, 10, 10))
def evaluate_dagrun(self,
                    dag_id,
                    first_task_state,
                    second_task_state,
                    dagrun_state,
                    run_kwargs=None,
                    advance_execution_date=False,
                    session=None):
    """
    Helper for testing DagRun states with simple two-task DAGs
    """
    if run_kwargs is None:
        run_kwargs = {}

    scheduler = SchedulerJob()
    dag = self.dagbag.get_dag(dag_id)
    dag.clear()
    dr = scheduler.schedule_dag(dag)
    if advance_execution_date:
        # run a second time to schedule a dagrun after the start_date
        dr = scheduler.schedule_dag(dag)
    ex_date = dr.execution_date

    try:
        dag.run(start_date=ex_date, end_date=ex_date, **run_kwargs)
    except AirflowException:
        pass

    # test tasks
    task_1, task_2 = dag.tasks
    ti = TI(task_1, ex_date)
    ti.refresh_from_db()
    self.assertEqual(ti.state, first_task_state)
    ti = TI(task_2, ex_date)
    ti.refresh_from_db()
    self.assertEqual(ti.state, second_task_state)

    # load dagrun
    dr = session.query(DagRun).filter(
        DagRun.dag_id == dag.dag_id,
        DagRun.execution_date == ex_date).first()

    # dagrun is running
    self.assertEqual(dr.state, State.RUNNING)

    dag.get_active_runs()

    # dagrun reached the expected final state
    self.assertEqual(dr.state, dagrun_state)
def test_scheduler_dagrun_once(self):
    """
    Test if the scheduler does not create multiple dagruns
    if a dag is scheduled with @once and a start_date
    """
    dag = DAG(
        'test_scheduler_dagrun_once',
        start_date=datetime.datetime(2015, 1, 1),
        schedule_interval="@once")

    scheduler = SchedulerJob()
    dag.clear()
    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)
    dr = scheduler.create_dag_run(dag)
    self.assertIsNone(dr)
def evaluate_dagrun(
        self,
        dag_id,
        expected_task_states,  # dict of task_id: state
        dagrun_state,
        run_kwargs=None,
        advance_execution_date=False,
        session=None):
    """
    Helper for testing DagRun states with simple two-task DAGs
    """
    if run_kwargs is None:
        run_kwargs = {}

    scheduler = SchedulerJob()
    dag = self.dagbag.get_dag(dag_id)
    dag.clear()
    dr = scheduler.schedule_dag(dag)
    if advance_execution_date:
        # run a second time to schedule a dagrun after the start_date
        dr = scheduler.schedule_dag(dag)
    ex_date = dr.execution_date

    try:
        dag.run(start_date=ex_date, end_date=ex_date, **run_kwargs)
    except AirflowException:
        pass

    # test tasks
    for task_id, expected_state in expected_task_states.items():
        task = dag.get_task(task_id)
        ti = TI(task, ex_date)
        ti.refresh_from_db()
        self.assertEqual(ti.state, expected_state)

    # load dagrun
    dr = session.query(DagRun).filter(
        DagRun.dag_id == dag.dag_id,
        DagRun.execution_date == ex_date
    ).first()

    # dagrun is running
    self.assertEqual(dr.state, State.RUNNING)

    dag.get_active_runs()

    # dagrun reached the expected final state
    self.assertEqual(dr.state, dagrun_state)
def run_single_scheduler_loop_with_no_dags(dags_folder):
    """
    Utility function that runs a single scheduler loop without actually
    changing/scheduling any dags. This is useful to simulate the other side
    effects of running a scheduler loop, e.g. to see what parse errors there
    are in the dags_folder.

    :param dags_folder: the directory to traverse
    :type dags_folder: str
    """
    scheduler = SchedulerJob(
        dag_id='this_dag_doesnt_exist',  # We don't want to actually run anything
        num_runs=1,
        subdir=os.path.join(dags_folder))
    scheduler.heartrate = 0
    scheduler.run()
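# Illustrative call of the utility above; the path is a placeholder for
# whatever dags folder a surrounding test wants to check for parse errors.
run_single_scheduler_loop_with_no_dags(settings.DAGS_FOLDER)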
@classmethod
def setUpClass(cls):
    from tests.compat import MagicMock
    from airflow.jobs import SchedulerJob

    cls.dag = DAG(
        'test_dag',
        default_args={
            'owner': 'airflow',
            'start_date': DEFAULT_DATE},
        schedule_interval=INTERVAL)
    cls.dag.create_dagrun(
        run_id="manual__1",
        execution_date=DEFAULT_DATE,
        state=State.RUNNING
    )
    cls.dag.create_dagrun(
        run_id="manual__2",
        execution_date=timezone.datetime(2016, 1, 1, 12),
        state=State.RUNNING
    )
    cls.dag.create_dagrun(
        run_id="manual__3",
        execution_date=END_DATE,
        state=State.RUNNING
    )
    cls.dag_file_processor = SchedulerJob(dag_ids=[], log=MagicMock())
def test_scheduler_multiprocessing(self):
    """
    Test that the scheduler can successfully queue multiple dags in parallel
    """
    dag_ids = ['test_start_date_scheduling',
               'test_dagrun_states_success']
    for dag_id in dag_ids:
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

    scheduler = SchedulerJob(dag_ids=dag_ids, num_runs=2)
    scheduler.run()

    # zero tasks ran
    dag_id = 'test_start_date_scheduling'
    session = settings.Session()
    self.assertEqual(
        len(session.query(TI).filter(TI.dag_id == dag_id).all()), 0)
def test_scheduler_verify_pool_full(self, mock_pool_full):
    """
    Test task instances not queued when pool is full
    """
    # `mock_pool_full` is presumably injected by a mock.patch decorator on
    # this test; forcing it to False bypasses the pool-full short circuit so
    # the slot accounting below is what limits the queued tasks.
    mock_pool_full.return_value = False

    dag = DAG(
        dag_id='test_scheduler_verify_pool_full',
        start_date=DEFAULT_DATE)

    DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow',
        pool='test_scheduler_verify_pool_full')

    session = settings.Session()
    pool = Pool(pool='test_scheduler_verify_pool_full', slots=1)
    session.add(pool)
    orm_dag = DagModel(dag_id=dag.dag_id)
    orm_dag.is_paused = False
    session.merge(orm_dag)
    session.commit()

    scheduler = SchedulerJob()
    dag.clear()

    # Create 2 dagruns, which will create 2 task instances.
    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)
    self.assertEqual(dr.execution_date, DEFAULT_DATE)
    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)
    queue = []
    scheduler._process_task_instances(dag, queue=queue)
    self.assertEqual(len(queue), 2)
    dagbag = SimpleDagBag([dag])

    # Recreated part of the scheduler here, to kick off tasks -> executor
    for ti_key in queue:
        task = dag.get_task(ti_key[1])
        ti = models.TaskInstance(task, ti_key[2])
        # Task starts out in the scheduled state. All tasks in the
        # scheduled state will be sent to the executor
        ti.state = State.SCHEDULED

        # Also save this task instance to the DB.
        session.merge(ti)
        session.commit()

    scheduler._execute_task_instances(dagbag,
                                      (State.SCHEDULED,
                                       State.UP_FOR_RETRY))

    self.assertEqual(len(scheduler.executor.queued_tasks), 1)
def test_retry_handling_job(self):
    """
    Integration test of the scheduler not accidentally resetting
    the try_numbers for a task
    """
    dag = self.dagbag.get_dag('test_retry_handling_job')
    dag_task1 = dag.get_task("test_retry_handling_op")
    dag.clear()

    scheduler = SchedulerJob(dag_id=dag.dag_id, num_runs=1)
    scheduler.heartrate = 0
    scheduler.run()

    session = settings.Session()
    ti = session.query(TI).filter(TI.dag_id == dag.dag_id,
                                  TI.task_id == dag_task1.task_id).first()

    # make sure the counter has increased
    self.assertEqual(ti.try_number, 2)
    self.assertEqual(ti.state, State.UP_FOR_RETRY)
def test_scheduler_run_duration(self):
    """
    Verifies that the scheduler run duration limit is followed.
    """
    dag_id = 'test_start_date_scheduling'
    dag = self.dagbag.get_dag(dag_id)
    dag.clear()
    self.assertTrue(dag.start_date > DEFAULT_DATE)

    expected_run_duration = 5
    start_time = datetime.datetime.now()
    scheduler = SchedulerJob(dag_id,
                             run_duration=expected_run_duration,
                             **self.default_scheduler_args)
    scheduler.run()
    end_time = datetime.datetime.now()

    run_duration = (end_time - start_time).total_seconds()
    logging.info("Test ran in %.2fs, expected %.2fs",
                 run_duration, expected_run_duration)
    self.assertLess(run_duration - expected_run_duration, 5.0)
def test_scheduler_max_active_runs_respected_after_clear(self):
    """
    Test if _process_task_instances only schedules ti's up to max_active_runs
    (related to issue AIRFLOW-137)
    """
    dag = DAG(
        dag_id='test_scheduler_max_active_runs_respected_after_clear',
        start_date=DEFAULT_DATE)
    dag.max_active_runs = 3

    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag.clear()

    # First create up to 3 dagruns in RUNNING state.
    scheduler.create_dag_run(dag)

    # Reduce max_active_runs to 1
    dag.max_active_runs = 1

    queue = mock.Mock()
    # and schedule them in, so we can check how many
    # tasks are put on the queue (should be one, not 3)
    scheduler._process_task_instances(dag, queue=queue)

    queue.append.assert_called_with(
        (dag.dag_id, dag_task1.task_id, DEFAULT_DATE)
    )
def test_scheduler_run_duration(self):
    """
    Verifies that the scheduler run duration limit is followed.
    """
    dag_id = 'test_start_date_scheduling'
    dag = self.dagbag.get_dag(dag_id)
    dag.clear()
    self.assertTrue(dag.start_date > DEFAULT_DATE)

    expected_run_duration = 5
    start_time = datetime.datetime.now()
    scheduler = SchedulerJob(dag_id,
                             run_duration=expected_run_duration,
                             **self.default_scheduler_args)
    scheduler.run()
    end_time = datetime.datetime.now()

    run_duration = (end_time - start_time).total_seconds()
    _log.info("Test ran in %.2fs, expected %.2fs",
              run_duration, expected_run_duration)
    assert run_duration - expected_run_duration < 5.0
def test_dagrun_root_fail_unfinished(self):
    """
    DagRuns with one unfinished and one failed root task -> RUNNING
    """
    # Run both the failed and successful tasks
    scheduler = SchedulerJob(**self.default_scheduler_args)
    dag_id = 'test_dagrun_states_root_fail_unfinished'
    dag = self.dagbag.get_dag(dag_id)
    dag.clear()
    dr = scheduler.create_dag_run(dag)
    try:
        dag.run(start_date=dr.execution_date, end_date=dr.execution_date)
    except AirflowException:
        # Expect an exception since there is a failed task
        pass

    # Mark the successful task as never having run since we want to see if
    # the dagrun will be in a running state despite having an unfinished
    # task.
    session = settings.Session()
    ti = dr.get_task_instance('test_dagrun_unfinished', session=session)
    ti.state = State.NONE
    session.commit()

    dr_state = dr.update_state()
    self.assertEqual(dr_state, State.RUNNING)
def test_scheduler_start_date(self):
    """
    Test that the scheduler respects start_dates, even when DAGs have run
    """
    dag_id = 'test_start_date_scheduling'
    dag = self.dagbag.get_dag(dag_id)
    dag.clear()
    self.assertTrue(dag.start_date > DEFAULT_DATE)

    scheduler = SchedulerJob(dag_id,
                             num_runs=2,
                             **self.default_scheduler_args)
    scheduler.run()

    # zero tasks ran
    session = settings.Session()
    self.assertEqual(
        len(session.query(TI).filter(TI.dag_id == dag_id).all()), 0)

    # previously, running this backfill would kick off the Scheduler
    # because it would take the most recent run and start from there.
    # That behavior still exists, but now it will only do so if it is
    # after the start date.
    backfill = BackfillJob(
        dag=dag,
        start_date=DEFAULT_DATE,
        end_date=DEFAULT_DATE)
    backfill.run()

    # one task ran
    session = settings.Session()
    self.assertEqual(
        len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)

    scheduler = SchedulerJob(dag_id,
                             num_runs=2,
                             **self.default_scheduler_args)
    scheduler.run()

    # still one task
    session = settings.Session()
    self.assertEqual(
        len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)
def test_scheduler_pooled_tasks(self):
    """
    Test that the scheduler handles queued tasks correctly
    See issue #1299
    """
    session = settings.Session()
    if not (session.query(Pool)
            .filter(Pool.pool == 'test_queued_pool')
            .first()):
        pool = Pool(pool='test_queued_pool', slots=5)
        session.merge(pool)
        session.commit()
    session.close()

    dag_id = 'test_scheduled_queued_tasks'
    dag = self.dagbag.get_dag(dag_id)
    dag.clear()

    scheduler = SchedulerJob(dag_id,
                             num_runs=1,
                             executor=TestExecutor(),
                             **self.default_scheduler_args)
    scheduler.run()

    task_1 = dag.tasks[0]
    logging.info("Trying to find task {}".format(task_1))
    ti = TI(task_1, dag.start_date)
    ti.refresh_from_db()
    logging.error("TI is: {}".format(ti))
    self.assertEqual(ti.state, State.QUEUED)

    # now we use a DIFFERENT scheduler and executor
    # to simulate the num-runs CLI arg
    scheduler2 = SchedulerJob(
        dag_id,
        num_runs=5,
        executor=DEFAULT_EXECUTOR.__class__(),
        **self.default_scheduler_args)
    scheduler2.run()

    ti.refresh_from_db()
    self.assertEqual(ti.state, State.FAILED)

    dag.clear()
def do_schedule(function, function2):
    scheduler = SchedulerJob(num_runs=1, executor=executor)
    scheduler.run()