def test_subdag_deadlock(self):
    dagbag = DagBag()
    dag = dagbag.get_dag('test_subdag_deadlock')
    dag.clear()
    subdag = dagbag.get_dag('test_subdag_deadlock.subdag')
    subdag.clear()

    # first make sure subdag has failed
    self.assertRaises(
        AirflowException,
        subdag.run,
        start_date=DEFAULT_DATE,
        end_date=DEFAULT_DATE)

    # now make sure dag picks up the subdag error
    self.assertRaises(
        AirflowException,
        dag.run,
        start_date=DEFAULT_DATE,
        end_date=DEFAULT_DATE)
def downgrade():
    engine = settings.engine
    if engine.dialect.has_table(engine, 'task_instance'):
        connection = op.get_bind()
        sessionmaker = sa.orm.sessionmaker()
        session = sessionmaker(bind=connection)
        dagbag = DagBag(settings.DAGS_FOLDER)
        query = session.query(sa.func.count(TaskInstance.max_tries)).filter(
            TaskInstance.max_tries != -1
        )
        while query.scalar():
            tis = session.query(TaskInstance).filter(
                TaskInstance.max_tries != -1
            ).limit(BATCH_SIZE).all()
            for ti in tis:
                dag = dagbag.get_dag(ti.dag_id)
                if not dag or not dag.has_task(ti.task_id):
                    ti.try_number = 0
                else:
                    task = dag.get_task(ti.task_id)
                    # max_tries - try_number is the number of times the task
                    # instance still has left to retry by itself. So the
                    # current try_number should be the maximum number of
                    # self-retries (task.retries) minus the number of tries
                    # the task instance has left.
                    ti.try_number = max(0, task.retries - (ti.max_tries -
                                                           ti.try_number))
                ti.max_tries = -1
                session.merge(ti)
            session.commit()
        session.commit()
    op.drop_column('task_instance', 'max_tries')
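# A minimal worked example (not from the source) of the downgrade arithmetic
# above, using hypothetical values to show how the pre-upgrade try counter is
# restored:
task_retries = 3    # the task's own retry budget (task.retries)
ti_max_tries = 5    # ti.max_tries as stored after the upgrade
ti_try_number = 4   # one self-retry remains: 5 - 4 = 1

# Downgrade formula: the restored try_number is the task's retry budget minus
# however many tries the instance still had left.
restored_try_number = max(0, task_retries - (ti_max_tries - ti_try_number))
assert restored_try_number == 2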
def get_task_instance(dag_id, task_id, execution_date):
    """Return the task instance identified by the given dag_id, task_id and
    execution_date."""
    dagbag = DagBag()

    # Check DAG exists.
    if dag_id not in dagbag.dags:
        error_message = "Dag id {} not found".format(dag_id)
        raise AirflowException(error_message)

    # Get DAG object and check Task Exists
    dag = dagbag.get_dag(dag_id)
    if not dag.has_task(task_id):
        error_message = 'Task {} not found in dag {}'.format(task_id, dag_id)
        raise AirflowException(error_message)

    # Get DagRun object and check that it exists
    dagrun = dag.get_dagrun(execution_date=execution_date)
    if not dagrun:
        error_message = ('Dag Run for date {} not found in dag {}'
                         .format(execution_date, dag_id))
        raise AirflowException(error_message)

    # Get task instance object and check that it exists
    task_instance = dagrun.get_task_instance(task_id)
    if not task_instance:
        error_message = ('Task {} instance for date {} not found'
                         .format(task_id, execution_date))
        raise AirflowException(error_message)

    return task_instance
def trigger_dag(dag_id, run_id=None, conf=None, execution_date=None):
    dagbag = DagBag()

    if dag_id not in dagbag.dags:
        raise AirflowException("Dag id {} not found".format(dag_id))

    dag = dagbag.get_dag(dag_id)

    if not execution_date:
        execution_date = datetime.now()

    if not run_id:
        run_id = "manual__{0}".format(execution_date.isoformat())

    dr = DagRun.find(dag_id=dag_id, run_id=run_id)
    if dr:
        raise AirflowException("Run id {} already exists for dag id {}".format(
            run_id,
            dag_id
        ))

    run_conf = None
    if conf:
        run_conf = json.loads(conf)

    trigger = dag.create_dagrun(
        run_id=run_id,
        execution_date=execution_date,
        state=State.RUNNING,
        conf=run_conf,
        external_trigger=True
    )

    return trigger
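# A hedged usage sketch for the trigger_dag helper above; the dag_id and conf
# payload are illustrative, not from the source.
import json

dag_run = trigger_dag(
    dag_id="my_example_dag",              # assumed dag_id
    conf=json.dumps({"table": "users"}),  # conf arrives as a JSON string
)
# When run_id is omitted, the helper generates one of the form
# manual__<execution_date isoformat>.
print(dag_run.run_id)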
def test_dag_with_system_exit(self):
    """
    Test to check that a DAG with a sys.exit() doesn't break the scheduler.
    """
    dag_id = 'exit_test_dag'
    dag_ids = [dag_id]
    dag_directory = os.path.join(models.DAGS_FOLDER,
                                 "..",
                                 "dags_with_system_exit")
    dag_file = os.path.join(dag_directory,
                            'b_test_scheduler_dags.py')

    dagbag = DagBag(dag_folder=dag_file)
    for dag_id in dag_ids:
        dag = dagbag.get_dag(dag_id)
        dag.clear()

    scheduler = SchedulerJob(dag_ids=dag_ids,
                             subdir=dag_directory,
                             num_runs=1,
                             **self.default_scheduler_args)
    scheduler.run()
    session = settings.Session()
    self.assertEqual(
        len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)
def test_if_dag_contains_tasks():
    """
    Every DAG contains, at minimum, start and end tasks.
    """
    dag_bag = DagBag(include_examples=False)
    dag = dag_bag.get_dag('import_table_tbl')
    assert len(dag.tasks) > 1
def check_dag_exist(dag_id):
    logging.info("Executing custom 'check_dag_exist' function")

    dagbag = DagBag('dags')
    if dag_id not in dagbag.dags:
        return ApiResponse.bad_request("Dag id {} not found".format(dag_id))

    dag = dagbag.get_dag(dag_id)
    payload = os.path.exists(dag.full_filepath)
    return ApiResponse.success(payload)
class TestHelloWorldDAG(unittest.TestCase):
    """Check HelloWorldDAG expectation"""

    def setUp(self):
        self.dagbag = DagBag()

    def test_task_count(self):
        """Check task count of the hello_world dag"""
        dag_id = 'hello_world'
        dag = self.dagbag.get_dag(dag_id)
        self.assertEqual(len(dag.tasks), 3)

    def test_contain_tasks(self):
        """Check the tasks contained in the hello_world dag"""
        dag_id = 'hello_world'
        dag = self.dagbag.get_dag(dag_id)
        tasks = dag.tasks
        task_ids = list(map(lambda task: task.task_id, tasks))
        self.assertListEqual(task_ids,
                             ['dummy_task', 'multiplyby5_task', 'hello_task'])

    def test_dependencies_of_dummy_task(self):
        """Check the task dependencies of dummy_task in the hello_world dag"""
        dag_id = 'hello_world'
        dag = self.dagbag.get_dag(dag_id)
        dummy_task = dag.get_task('dummy_task')

        upstream_task_ids = list(
            map(lambda task: task.task_id, dummy_task.upstream_list))
        self.assertListEqual(upstream_task_ids, [])
        downstream_task_ids = list(
            map(lambda task: task.task_id, dummy_task.downstream_list))
        self.assertListEqual(downstream_task_ids,
                             ['hello_task', 'multiplyby5_task'])

    def test_dependencies_of_hello_task(self):
        """Check the task dependencies of hello_task in the hello_world dag"""
        dag_id = 'hello_world'
        dag = self.dagbag.get_dag(dag_id)
        hello_task = dag.get_task('hello_task')

        upstream_task_ids = list(
            map(lambda task: task.task_id, hello_task.upstream_list))
        self.assertListEqual(upstream_task_ids, ['dummy_task'])
        downstream_task_ids = list(
            map(lambda task: task.task_id, hello_task.downstream_list))
        self.assertListEqual(downstream_task_ids, [])
def check_dag_status(dag_id):
    logging.info("Executing custom 'check_dag_status' function")

    dagbag = DagBag('dags')
    if dag_id not in dagbag.dags:
        return ApiResponse.bad_request("Dag id {} not found".format(dag_id))

    dag = dagbag.get_dag(dag_id)
    payload = not dag.is_paused
    return ApiResponse.success(payload)
class TestSparkSubmitOperatorJava(unittest.TestCase):
    """Check SPARK_SUBMIT_TEST DAG expectation"""

    def setUp(self):
        self.dagbag = DagBag()

    def test_task_count(self):
        """Check task count of the SPARK_SUBMIT_TEST dag"""
        dag_id = 'SPARK_SUBMIT_TEST'
        dag = self.dagbag.get_dag(dag_id)
        self.assertEqual(len(dag.tasks), 1)

    def test_contain_tasks(self):
        """Check the tasks contained in the SPARK_SUBMIT_TEST dag"""
        dag_id = 'SPARK_SUBMIT_TEST'
        dag = self.dagbag.get_dag(dag_id)
        tasks = dag.tasks
        task_ids = list(map(lambda task: task.task_id, tasks))
        self.assertListEqual(task_ids, ['spark_submit_job'])
def test_if_dags_exists(load_dag_names, get_dags_import_table):
    dag_bag = DagBag(include_examples=False)
    list_dags = []
    for name in load_dag_names:
        dag = dag_bag.get_dag(f'import_table_{name}')
        list_dags.append(dag)
    assert len(list_dags) == len(get_dags_import_table)
def execute(self, context: Dict):
    if isinstance(self.execution_date, datetime.datetime):
        execution_date = self.execution_date
    elif isinstance(self.execution_date, str):
        execution_date = timezone.parse(self.execution_date)
        self.execution_date = execution_date
    else:
        execution_date = timezone.utcnow()

    if self.trigger_run_id:
        run_id = self.trigger_run_id
    else:
        run_id = DagRun.generate_run_id(DagRunType.MANUAL, execution_date)
    try:
        dag_run = trigger_dag(
            dag_id=self.trigger_dag_id,
            run_id=run_id,
            conf=self.conf,
            execution_date=self.execution_date,
            replace_microseconds=False,
        )
    except DagRunAlreadyExists as e:
        if self.reset_dag_run:
            self.log.info("Clearing %s on %s",
                          self.trigger_dag_id, self.execution_date)

            # Get target dag object and call clear()
            dag_model = DagModel.get_current(self.trigger_dag_id)
            if dag_model is None:
                raise DagNotFound(
                    f"Dag id {self.trigger_dag_id} not found in DagModel")

            dag_bag = DagBag(dag_folder=dag_model.fileloc,
                             read_dags_from_db=True)
            dag = dag_bag.get_dag(self.trigger_dag_id)

            dag.clear(start_date=self.execution_date,
                      end_date=self.execution_date)
            dag_run = DagRun.find(dag_id=dag.dag_id, run_id=run_id)[0]
        else:
            raise e

    if self.wait_for_completion:
        # wait for dag to complete
        while True:
            self.log.info(
                'Waiting for %s on %s to become allowed state %s ...',
                self.trigger_dag_id,
                dag_run.execution_date,
                self.allowed_states,
            )
            time.sleep(self.poke_interval)

            dag_run.refresh_from_db()
            state = dag_run.state
            if state in self.failed_states:
                raise AirflowException(
                    f"{self.trigger_dag_id} failed with failed states {state}")
            if state in self.allowed_states:
                self.log.info("%s finished with allowed state %s",
                              self.trigger_dag_id, state)
                return
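# A minimal sketch of wiring the operator above into a DAG, assuming Airflow
# 2.x import paths; the dag ids and poke_interval are illustrative.
from airflow import DAG
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from airflow.utils.dates import days_ago

with DAG("controller_dag", start_date=days_ago(1),
         schedule_interval=None) as dag:
    # Fire a downstream DAG and block until it reaches an allowed state.
    trigger = TriggerDagRunOperator(
        task_id="trigger_target",
        trigger_dag_id="target_dag",  # assumed target dag_id
        reset_dag_run=True,           # clear and rerun if the run exists
        wait_for_completion=True,
        poke_interval=30,
    )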
def test_get_dag_with_dag_serialization(self):
    """
    Test that a Serialized DAG is updated in DagBag when it is updated in the
    Serialized DAG table after 'min_serialized_dag_fetch_interval' seconds
    have passed.
    """
    with freeze_time(tz.datetime(2020, 1, 5, 0, 0, 0)):
        example_bash_op_dag = DagBag(
            include_examples=True).dags.get("example_bash_operator")
        SerializedDagModel.write_dag(dag=example_bash_op_dag)

        dag_bag = DagBag(read_dags_from_db=True)
        ser_dag_1 = dag_bag.get_dag("example_bash_operator")
        ser_dag_1_update_time = dag_bag.dags_last_fetched[
            "example_bash_operator"]
        self.assertEqual(example_bash_op_dag.tags, ser_dag_1.tags)
        self.assertEqual(ser_dag_1_update_time,
                         tz.datetime(2020, 1, 5, 0, 0, 0))

    # Check that if min_serialized_dag_fetch_interval has not passed we do
    # not fetch the DAG from DB
    with freeze_time(tz.datetime(2020, 1, 5, 0, 0, 4)):
        with assert_queries_count(0):
            self.assertEqual(
                dag_bag.get_dag("example_bash_operator").tags,
                ["example", "example2"])

    # Make a change in the DAG and write Serialized DAG to the DB
    with freeze_time(tz.datetime(2020, 1, 5, 0, 0, 6)):
        example_bash_op_dag.tags += ["new_tag"]
        SerializedDagModel.write_dag(dag=example_bash_op_dag)

    # Since min_serialized_dag_fetch_interval has passed, verify that calling
    # 'dag_bag.get_dag' fetches the Serialized DAG from DB
    with freeze_time(tz.datetime(2020, 1, 5, 0, 0, 8)):
        with assert_queries_count(2):
            updated_ser_dag_1 = dag_bag.get_dag("example_bash_operator")
            updated_ser_dag_1_update_time = dag_bag.dags_last_fetched[
                "example_bash_operator"]

    self.assertCountEqual(updated_ser_dag_1.tags,
                          ["example", "example2", "new_tag"])
    self.assertGreater(updated_ser_dag_1_update_time, ser_dag_1_update_time)
def test_handle_failure_callback_with_zombies_are_correctly_passed_to_dag_file_processor(self):
    """
    Check that the same set of failure callbacks with zombies are passed
    to the dag file processors until the next zombie detection logic is
    invoked.
    """
    test_dag_path = os.path.join(TEST_DAG_FOLDER,
                                 'test_example_bash_operator.py')
    with conf_vars({('scheduler', 'max_threads'): '1',
                    ('core', 'load_examples'): 'False'}):
        dagbag = DagBag(test_dag_path)
        with create_session() as session:
            session.query(LJ).delete()
            dag = dagbag.get_dag('test_example_bash_operator')
            dag.sync_to_db()
            task = dag.get_task(task_id='run_this_last')

            ti = TI(task, DEFAULT_DATE, State.RUNNING)
            local_job = LJ(ti)
            local_job.state = State.SHUTDOWN
            local_job.id = 1
            ti.job_id = local_job.id

            session.add(local_job)
            session.add(ti)
            session.commit()
            fake_failure_callback_requests = [
                FailureCallbackRequest(
                    full_filepath=dag.full_filepath,
                    simple_task_instance=SimpleTaskInstance(ti),
                    msg="Message"
                )
            ]

        async_mode = 'sqlite' not in conf.get('core', 'sql_alchemy_conn')
        processor_agent = DagFileProcessorAgent(
            test_dag_path,
            1,
            FakeDagFileProcessorRunner._fake_dag_processor_factory,
            timedelta.max,
            [],
            False,
            async_mode)
        processor_agent.start()
        parsing_result = []
        if not async_mode:
            processor_agent.run_single_parsing_loop()
        while not processor_agent.done:
            if not async_mode:
                processor_agent.wait_until_finished()
            parsing_result.extend(processor_agent.harvest_simple_dags())

        self.assertEqual(len(fake_failure_callback_requests),
                         len(parsing_result))
        self.assertEqual(
            set(zombie.simple_task_instance.key
                for zombie in fake_failure_callback_requests),
            set(result.simple_task_instance.key
                for result in parsing_result)
        )
class TestETLValidateTask(unittest.TestCase):

    def setUp(self):
        self.dagbag = DagBag()
        self.dag_id = 'etl_covid_data_dag'

    def test_task_count(self):
        dag = self.dagbag.get_dag(self.dag_id)
        self.assertEqual(len(dag.tasks), 65)

    def test_contain_tasks(self):
        dag = self.dagbag.get_dag(self.dag_id)
        tasks = dag.tasks
        task_ids = list(map(lambda task: task.task_id, tasks))
        self.assertIn("extract", task_ids)
        self.assertIn("transform", task_ids)
        self.assertIn("load", task_ids)

    def test_dependencies_of_transform_task(self):
        dag = self.dagbag.get_dag(self.dag_id)
        transform_task = dag.get_task('transform')

        upstream_task_ids = list(
            map(lambda task: task.task_id, transform_task.upstream_list))
        self.assertListEqual(upstream_task_ids, ["extract"])
        downstream_task_ids = list(
            map(lambda task: task.task_id, transform_task.downstream_list))
        self.assertIn("load", downstream_task_ids)

    def test_dependencies_of_extract_task(self):
        dag = self.dagbag.get_dag(self.dag_id)
        extract_task = dag.get_task('extract')

        upstream_task_ids = list(
            map(lambda task: task.task_id, extract_task.upstream_list))
        self.assertListEqual(upstream_task_ids, [])
        downstream_task_ids = list(
            map(lambda task: task.task_id, extract_task.downstream_list))
        self.assertIn("transform", downstream_task_ids)
        self.assertEqual([], upstream_task_ids)
class SyncCountryFromZendeskPipeline(TestCase):

    def setUp(self):
        self.dagbag = DagBag()
        self.dag = self.dagbag.get_dag('sync_country_from_zendesk_pipeline')
        self.dag_login_aws = self.dag.tasks[0]
        self.dag_sync_country_from_zendesk_pipeline = self.dag.tasks[1]

    def test_dag_login_aws_should_see_task_id(self):
        actual = self.dag_login_aws.task_id
        expected = 'login_aws'
        self.assertEqual(actual, expected)

    def test_dag_login_aws_should_see_bash_command(self):
        actual = self.dag_login_aws.bash_command
        expected = '$(aws ecr get-login --region eu-west-1 --no-include-email)'
        self.assertEqual(actual, expected)

    def test_dag_sync_country_from_zendesk_pipeline_should_see_task_id(self):
        actual = self.dag_sync_country_from_zendesk_pipeline.task_id
        expected = 'sync_country_from_zendesk_pipeline'
        self.assertEqual(actual, expected)

    @patch.dict('os.environ', {
        'DATABASE_HOST': '192.68.33.61',
        'ELASTICSEARCH_URL': 'http://192.68.33.61:9200',
        'DYNAMODB_HOST': 'http://192.68.33.61:4567',
    })
    def test_dag_sync_country_from_zendesk_pipeline_should_see_environment(self):
        expected = {
            'DATABASE_HOST': '192.68.33.61',
            'ELASTICSEARCH_URL': 'http://192.68.33.61:9200',
            'DYNAMODB_HOST': 'http://192.68.33.61:4567',
        }
        self.assertEqual(os.environ['DATABASE_HOST'],
                         expected.get('DATABASE_HOST'))
        self.assertEqual(os.environ['ELASTICSEARCH_URL'],
                         expected.get('ELASTICSEARCH_URL'))
        self.assertEqual(os.environ['DYNAMODB_HOST'],
                         expected.get('DYNAMODB_HOST'))

    def test_dag_sync_country_from_zendesk_pipeline_should_see_auto_remove(self):
        actual = self.dag_sync_country_from_zendesk_pipeline.auto_remove
        expected = True
        self.assertEqual(actual, expected)

    def test_dag_sync_country_from_zendesk_pipeline_should_see_image(self):
        actual = self.dag_sync_country_from_zendesk_pipeline.image
        expected = ('133506877714.dkr.ecr.eu-west-1.amazonaws.com/'
                    'pronto-dashboard')
        self.assertIn(expected, actual)

    def test_dag_sync_country_from_zendesk_pipeline_see_command(self):
        actual = self.dag_sync_country_from_zendesk_pipeline.command
        expected = ('python pronto_dashboard/manage.py '
                    'sync_country_from_zendesk '
                    '--settings=pronto_dashboard.settings')
        self.assertIn(expected, actual)
def check_and_get_dag(dag_id: str, task_id: Optional[str] = None) -> DagModel:
    """Check that the DAG exists and, if a task_id is given, that the task
    exists in it."""
    dagbag = DagBag()
    if dag_id not in dagbag.dags:
        error_message = "Dag id {} not found".format(dag_id)
        raise DagNotFound(error_message)

    dag = dagbag.get_dag(dag_id)
    if task_id and not dag.has_task(task_id):
        error_message = 'Task {} not found in dag {}'.format(task_id, dag_id)
        raise TaskNotFound(error_message)

    return dag
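# A short sketch (assumed ids) of how a caller might use check_and_get_dag,
# relying on the DagNotFound/TaskNotFound exceptions raised above.
from airflow.exceptions import DagNotFound, TaskNotFound

try:
    # Validate both the DAG and one of its tasks in a single call.
    dag = check_and_get_dag("my_example_dag", task_id="extract")
except (DagNotFound, TaskNotFound) as err:
    print(f"lookup failed: {err}")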
class TestImdbPull(unittest.TestCase):

    def setUp(self):
        self.dagbag = DagBag()
        self.dag = self.dagbag.get_dag('imdb_data_pull')

    def test_expected_endpoints(self):
        tasks = self.dag.tasks
        source_tasks = [x for x in tasks if len(x.upstream_list) == 0]
        self.assertEqual(len(source_tasks), 1)
        sink_tasks = [x for x in tasks if len(x.downstream_list) == 0]
        self.assertEqual(len(sink_tasks), 2)

    def test_db_to_file(self):
        sqlite_conn = sqlite3.connect(':memory:')
        sqlite_conn.execute('create table person(name text, age integer)')
        data = [('Alice', 5), ('Bob', 24), ('Carol', 33), ('Dave', 15)]
        sqlite_conn.executemany('insert into person values (?, ?)', data)

        class DBToFileHook(SqliteHook):
            def get_conn(self):
                return sqlite_conn

        hook = DBToFileHook()
        testfile = tempfile.NamedTemporaryFile()
        db_to_file(hook, "select name from person where age > 18",
                   testfile.name)
        lines = testfile.readlines()
        self.assertEqual(len(lines), 3)

    def test_csv_to_json(self):
        testfile = tempfile.NamedTemporaryFile(mode='wt', newline='',
                                               suffix='.csv')
        writer = csv.writer(testfile)
        writer.writerows(
            [['name', 'year', 'director_first_name', 'director_last_name'],
             ['Independence Day', 1996, 'Roland', 'Emmerich'],
             ['Dodgeball', 2004, 'Rawson', 'Thurber'],
             ['The Princess Bride', 1987, 'Rob', 'Reiner']])
        testfile.flush()
        csv_to_json(testfile.name)
        json_file_name = testfile.name.replace('.csv', '.json')
        self.assertTrue(path.exists(json_file_name))
        self.assertGreater(path.getsize(json_file_name), 0)
class TestHitlogDag(TestCase):

    def setUp(self):
        self.dagbag = DagBag()
        self.dag = self.dagbag.get_dag('hitlog-dag')

    def test_task_count(self):
        self.assertEqual(len(self.dag.tasks), 1)

    def test_tasks(self):
        self.assertListEqual([task.task_id for task in self.dag.tasks],
                             ['process-file'])
class TestMySampleDAG(unittest.TestCase):
    """Check MyDAG expectation"""

    def setUp(self):
        self.dagbag = DagBag()

    def test_task_count(self):
        """Check task count of MyDAG"""
        dag_id = 'simple-airflow'
        dag = self.dagbag.get_dag(dag_id)
        self.assertEqual(len(dag.tasks), 2)

    def test_contain_tasks(self):
        """Check the tasks contained in MyDAG"""
        dag_id = 'simple-airflow'
        dag = self.dagbag.get_dag(dag_id)
        tasks = dag.tasks
        task_ids = list(map(lambda task: task.task_id, tasks))
        self.assertListEqual(task_ids, ['hello_task1', 'dummy_task1'])

    def test_dependencies_of_dummy_task1(self):
        """Check the task dependencies of dummy_task1 in MyDAG"""
        dag_id = 'simple-airflow'
        dag = self.dagbag.get_dag(dag_id)
        dummy_task = dag.get_task('dummy_task1')

        upstream_task_ids = list(
            map(lambda task: task.task_id, dummy_task.upstream_list))
        self.assertListEqual(upstream_task_ids, [])
        downstream_task_ids = list(
            map(lambda task: task.task_id, dummy_task.downstream_list))
        self.assertListEqual(downstream_task_ids, ['hello_task1'])

    def test_dependencies_of_hello_task1(self):
        """Check the task dependencies of hello_task1 in MyDAG"""
        dag_id = 'simple-airflow'
        dag = self.dagbag.get_dag(dag_id)
        hello_task = dag.get_task('hello_task1')

        upstream_task_ids = list(
            map(lambda task: task.task_id, hello_task.upstream_list))
        self.assertListEqual(upstream_task_ids, ['dummy_task1'])
        downstream_task_ids = list(
            map(lambda task: task.task_id, hello_task.downstream_list))
        self.assertListEqual(downstream_task_ids, [])
class TestMySecondDAG(unittest.TestCase):
    """Check DAG expectation"""

    def setUp(self):
        self.dagbag = DagBag()

    def test_task_count(self):
        """Check task count of the my_second_dag dag"""
        dag_id = 'my_second_dag'
        dag = self.dagbag.get_dag(dag_id)
        self.assertEqual(len(dag.tasks), 3)

    def test_contain_tasks(self):
        """Check the tasks contained in the my_second_dag dag"""
        dag_id = 'my_second_dag'
        dag = self.dagbag.get_dag(dag_id)
        tasks = dag.tasks
        task_ids = list(map(lambda task: task.task_id, tasks))
        self.assertListEqual(task_ids,
                             ['bash_example', 'print_the_context',
                              'also_run_this'])

    def test_dependencies_of_bash_example_task(self):
        """Check the task dependencies of the `bash_example` task in the
        `my_second_dag` dag"""
        dag_id = 'my_second_dag'
        dag = self.dagbag.get_dag(dag_id)
        bash_task = dag.get_task('bash_example')

        upstream_task_ids = list(
            map(lambda task: task.task_id, bash_task.upstream_list))
        self.assertListEqual(upstream_task_ids, ['also_run_this'])
        downstream_task_ids = list(
            map(lambda task: task.task_id, bash_task.downstream_list))
        self.assertListEqual(downstream_task_ids, [])

    def test_dependencies_of_also_run_this_task(self):
        """Check the task dependencies of the `also_run_this` task in the
        `my_second_dag` dag"""
        dag_id = 'my_second_dag'
        dag = self.dagbag.get_dag(dag_id)
        also_run_this_task = dag.get_task('also_run_this')

        upstream_task_ids = list(
            map(lambda task: task.task_id, also_run_this_task.upstream_list))
        self.assertListEqual(upstream_task_ids, ['print_the_context'])
        downstream_task_ids = list(
            map(lambda task: task.task_id,
                also_run_this_task.downstream_list))
        self.assertListEqual(downstream_task_ids, ['bash_example'])
class TestMyFirstDAG(unittest.TestCase):

    def setUp(self):
        self.dagbag = DagBag()
        self.dagid = 'MyFirstDAG'

    def test_task_count(self):
        """Check the number of tasks in MyFirstDAG"""
        self.assertEqual(len(self.dagbag.get_dag(self.dagid).tasks), 5)

    def test_task_list(self):
        """Check the task list in MyFirstDAG"""
        task_ids = list(map(lambda task: task.task_id,
                            self.dagbag.get_dag(self.dagid).tasks))
        self.assertListEqual(task_ids,
                             ['Step_1', 'Step_2', 'Step_3', 'Step_4',
                              'LastStep'])

    def test_dependencies_last_step(self):
        """Check the dependencies for the last task in MyFirstDAG"""
        last_step_task = self.dagbag.get_dag(self.dagid).get_task('LastStep')
        upstream_task_ids = list(map(lambda task: task.task_id,
                                     last_step_task.upstream_list))
        downstream_task_ids = list(map(lambda task: task.task_id,
                                       last_step_task.downstream_list))
        self.assertListEqual(upstream_task_ids, ['Step_4', 'Step_3'])
        self.assertListEqual(downstream_task_ids, [])
class WorkerView(Resource, Init):
    """
    WorkerView shows the celery hosts and which host belongs to which queue.
    Also, shows available and taken worker slots.
    """

    def __init__(self):
        super().__init__()
        self.dagbag = DagBag()
        self.queues = set()
        self.broker = os.environ['AIRFLOW__CELERY__BROKER_URL']
        self.backend = os.environ['AIRFLOW__CELERY__RESULT_BACKEND']
        self.celery = Celery('airflow_api',
                             broker=self.broker,
                             backend=self.backend)
        self.owners = self.args['owners'].lower()

    def get(self):
        """
        :param owners: DAG owners
        """
        dags = self.session.query(
            self.DM).filter(self.DM.owners == self.owners).all()
        for each in dags:
            dag = self.dagbag.get_dag(each.dag_id)
            try:
                queue = dag.default_args.get("queue", "airflow")
                self.queues.add(queue)
            except Exception:
                pass

        all_hosts = self.celery.control.inspect().stats().keys()
        data_list = []
        for host in all_hosts:
            celery = self.celery.control.inspect([host])
            queue = set([
                value[0].get("routing_key")
                for value in celery.active_queues().values()
            ])
            if self.queues & queue:
                queue = next(iter(queue))
                running = [len(r) for r in celery.active().values()]
                available = [
                    value["pool"]["writes"]["inqueues"]['total']
                    for value in celery.stats().values()
                ]
                data_list.append(
                    OrderedDict([('queue', queue),
                                 ('host', host[7:]),
                                 ('running', running[0]),
                                 ('available', available[0] - running[0])]))
            else:
                continue
        return jsonify(data_list)
class TestDagCommunication(unittest.TestCase):
    LOAD_SECOND_THRESHOLD = 2

    def setUp(self):
        self.dagbag = DagBag()
        self.emails = '*****@*****.**'
        self.dag_id = 'hello_world_xcoms'
        self.from_task = 'push_to_xcoms'
        self.to_task1 = 'pull_from_xcoms'
        self.to_task2 = 'templated_xcoms_value'

    def test_xcoms(self):
        dag = self.dagbag.get_dag(self.dag_id)
        push_to_xcoms_task = dag.get_task(self.from_task)
        pull_from_xcoms_task = dag.get_task(self.to_task1)

        execution_date = datetime.now()
        push_to_xcoms_ti = TaskInstance(task=push_to_xcoms_task,
                                        execution_date=execution_date)
        context = push_to_xcoms_ti.get_template_context()
        push_to_xcoms_task.execute(context)

        pull_from_xcoms_ti = TaskInstance(task=pull_from_xcoms_task,
                                          execution_date=execution_date)
        result = pull_from_xcoms_ti.xcom_pull(key="dummyKey")
        self.assertEqual(result, 'dummyValue')

    def test_xcom_in_templated_field(self):
        dag = self.dagbag.get_dag(self.dag_id)
        push_to_xcoms_task = dag.get_task(self.from_task)

        execution_date = datetime.now()
        push_to_xcoms_ti = TaskInstance(task=push_to_xcoms_task,
                                        execution_date=execution_date)
        context = push_to_xcoms_ti.get_template_context()
        push_to_xcoms_task.execute(context)

        templated_xcoms_value_task = dag.get_task(self.to_task2)
        templated_xcoms_value_ti = TaskInstance(
            task=templated_xcoms_value_task,
            execution_date=execution_date)
        context = templated_xcoms_value_ti.get_template_context()

        bash_operator_templated_field = 'bash_command'
        rendered_template = templated_xcoms_value_task.render_template
        bash_command_value = getattr(templated_xcoms_value_task,
                                     bash_operator_templated_field)
        bash_command_rendered_value = rendered_template(bash_command_value,
                                                        context)
        self.assertEqual(bash_command_rendered_value, 'echo dummyValue')
class TestDag(unittest.TestCase):

    def setUp(self):
        self.dagbag = DagBag(
            dag_folder="/Users/oscar.barlow/Projects/work/infinityworks/"
                       "airflow-hacknight-coke/dags")

    def test_should_be_s3_sensor_task(self):
        dag_id = 's3_event'
        dag = self.dagbag.get_dag(dag_id)
        tasks = dag.tasks
        task_ids = list(map(lambda task: task.task_id, tasks))
        self.assertListEqual(task_ids, ['s3_sensor', 'print_key'])
        self.assertEqual(type(tasks[0]),
                         airflow.operators.sensors.S3KeySensor)
def remove_dag(dag_id):
    logging.info("Executing custom 'remove_dag' function")

    dagbag = DagBag('dags')
    dag = dagbag.get_dag(dag_id)

    # Get Dag File Path (fall back to the conventional location when the DAG
    # is no longer in the DagBag)
    dag_path = dag.full_filepath if dag is not None else os.path.join(
        airflow_dags_folder, dag_id + ".py")
    rm_dag_cmd_split = ["rm", "-rf", dag_path]
    cli_output = ApiUtil.execute_cli_command(rm_dag_cmd_split)
    logging.info("Remove Dag File[{}] Result: {}".format(dag_path,
                                                         str(cli_output)))

    dag_cache_path = os.path.join(airflow_dags_folder, '__pycache__',
                                  dag_id + ".cpython-36.pyc")
    rm_dag_cache_cmd_split = ["rm", "-rf", dag_cache_path]
    cli_output = ApiUtil.execute_cli_command(rm_dag_cache_cmd_split)
    logging.info("Remove Dag Cache File[{}] Result: {}".format(
        dag_cache_path, str(cli_output)))
def trigger_dag(dag_id):
    logging.info("Executing custom 'trigger_dag' function")

    # Check dag_id argument
    if dag_id is None:
        logging.warning("The dag_id argument wasn't provided")
        return ApiResponse.bad_request(
            "The dag_id argument should be provided")

    dagbag = DagBag('dags')
    if dag_id not in dagbag.dags:
        return ApiResponse.bad_request("Dag id {} not found".format(dag_id))

    try:
        # Get Dag From DagBag
        dag = dagbag.get_dag(dag_id)

        # Check Dag Status
        if dag.is_paused:
            logging.warning("The Dag[{}] is not active".format(dag_id))
            # UnPause Dag
            airflow_cmd_split = ["airflow", "unpause", dag_id]
            cli_output = ApiUtil.execute_cli_command(airflow_cmd_split)
            logging.info("UnPause Dag Result: " + str(cli_output))
            # Check Dag Status again
            if not ApiUtil.check_dag_active(dag_id):
                raise Exception("Dag is still not active")

        # Trigger Dag By Dag ID
        airflow_cmd_split = ["airflow", "trigger_dag"]

        run_id = request.args.get('run_id')
        if run_id is not None:
            logging.info("trigger dag run_id: " + str(run_id))
            airflow_cmd_split.extend(["-r", run_id])

        execution_date = request.args.get('execution_date')
        if execution_date is not None:
            logging.info("trigger dag execution_date: " + str(execution_date))
            airflow_cmd_split.extend(["-e", execution_date])

        airflow_cmd_split.append(dag_id)
        cli_output = ApiUtil.execute_cli_command(airflow_cmd_split)
        logging.info("Trigger Dag Result: " + str(cli_output))
    except Exception as e:
        error_message = ("An error occurred while trying to trigger the "
                         "DAG '" + str(dag_id) + "': " + str(e))
        logging.error(error_message)
        return ApiResponse.server_error(error_message)

    return ApiResponse.success("DAG [{}] has been triggered".format(dag_id))
def test_find_zombies(self):
    manager = DagFileProcessorManager(
        dag_directory='directory',
        max_runs=1,
        processor_factory=MagicMock().return_value,
        processor_timeout=timedelta.max,
        signal_conn=MagicMock(),
        dag_ids=[],
        pickle_dags=False,
        async_mode=True,
    )

    dagbag = DagBag(TEST_DAG_FOLDER, read_dags_from_db=False)
    with create_session() as session:
        session.query(LJ).delete()
        dag = dagbag.get_dag('example_branch_operator')
        dag.sync_to_db()
        task = dag.get_task(task_id='run_this_first')

        ti = TI(task, DEFAULT_DATE, State.RUNNING)
        local_job = LJ(ti)
        local_job.state = State.SHUTDOWN

        session.add(local_job)
        session.commit()

        ti.job_id = local_job.id
        session.add(ti)
        session.commit()

        manager._last_zombie_query_time = timezone.utcnow() - timedelta(
            seconds=manager._zombie_threshold_secs + 1)
        manager._find_zombies()  # pylint: disable=no-value-for-parameter
        requests = manager._callback_to_execute[dag.full_filepath]
        self.assertEqual(1, len(requests))
        self.assertEqual(requests[0].full_filepath, dag.full_filepath)
        self.assertEqual(requests[0].msg, "Detected as zombie")
        self.assertEqual(requests[0].is_failure_callback, True)
        self.assertIsInstance(requests[0].simple_task_instance,
                              SimpleTaskInstance)
        self.assertEqual(ti.dag_id, requests[0].simple_task_instance.dag_id)
        self.assertEqual(ti.task_id, requests[0].simple_task_instance.task_id)
        self.assertEqual(ti.execution_date,
                         requests[0].simple_task_instance.execution_date)

        session.query(TI).delete()
        session.query(LJ).delete()
def check_and_get_dag(dag_id: str, task_id: Optional[str] = None) -> DagModel:
    """Check that the DAG exists and, if a task_id is given, that the task
    exists in it."""
    dag_model = DagModel.get_current(dag_id)
    if dag_model is None:
        raise DagNotFound(f"Dag id {dag_id} not found in DagModel")

    dag_bag = DagBag(dag_folder=dag_model.fileloc, read_dags_from_db=True)
    dag = dag_bag.get_dag(dag_id)
    if not dag:
        error_message = f"Dag id {dag_id} not found"
        raise DagNotFound(error_message)
    if task_id and not dag.has_task(task_id):
        error_message = f'Task {task_id} not found in dag {dag_id}'
        raise TaskNotFound(error_message)
    return dag
def refresh_dagbag(dag_bag=None, dag_id=None, force_fetch=False):
    if not dag_bag:
        dag_bag = DagBag(store_serialized_dags=False)
    try:
        # Use bulk sync to refresh DAGs
        # (implemented by https://github.com/apache/airflow/pull/7477)
        dag_bag.sync_to_db()
    except AttributeError:
        # Bulk sync not possible
        if dag_id:
            dag = dag_bag.get_dag(dag_id)
            if dag:
                dag.sync_to_db()
            else:
                if force_fetch:
                    # TODO: not possible to update in-memory dagbag instance
                    pass
                if not force_fetch or not dag_bag.get_dag(dag_id):
                    print('++++++++Deactivating unknown DAGs+++++++')
                    # DAG is missing in the DagBag, probably since it's
                    # disabled because it's unapproved. In that case, the
                    # DB DAG also needs to be deactivated.
                    DAG.deactivate_unknown_dags(dag_bag.dag_ids)
        else:
            raise ValueError('Pass "dag_id" to refresh!')
class DAGDependency_For_Copy_Table_DAG_Test(unittest.TestCase):

    def setUp(self):
        os.environ['AIRFLOW_VAR_ENTERPRISE_PROJECT'] = 'test'
        self.dagbag = DagBag()
        self.dag_id = 'Copy_Table_BigQuery'

    def test_contain_tasks(self):
        """Check the tasks contained in the Copy_Table_BigQuery dag"""
        dag = self.dagbag.get_dag(self.dag_id)
        tasks = dag.tasks
        task_ids = list(map(lambda task: task.task_id, tasks))
        assert sorted(task_ids) == sorted(['start', 'copy_records', 'stop'])

    def test_dependencies_of_copy_records(self):
        """Check the task dependencies of copy_records in the
        Copy_Table_BigQuery dag"""
        dag = self.dagbag.get_dag(self.dag_id)
        copy_records_task = dag.get_task('copy_records')

        upstream_task_ids = list(
            map(lambda task: task.task_id, copy_records_task.upstream_list))
        downstream_task_ids = list(
            map(lambda task: task.task_id, copy_records_task.downstream_list))

        assert sorted(upstream_task_ids) == sorted(['start'])
        assert sorted(downstream_task_ids) == sorted(['stop'])
def get_dag(dag_id):
    # check dag_id
    if dag_id is None:
        logging.warning("The dag_id argument wasn't provided")
        raise Exception("The dag_id argument should be provided")
    try:
        dagbag = DagBag('dags')
        if dag_id not in dagbag.dags:
            raise Exception("Dag id {} not found".format(dag_id))
        return dagbag.get_dag(dag_id)
    except Exception as e:
        error_message = ("An error occurred while trying to get_dag "
                         "'{}': {}".format(dag_id, e))
        logging.error(error_message)
        return ApiResponse.server_error(error_message)
def execute(self, context):
    dro = DagRunOrder(run_id='trig__' + timezone.utcnow().isoformat())
    dro = self.python_callable(context, dro)
    if dro:
        with create_session() as session:
            dbag = DagBag(settings.DAGS_FOLDER)
            trigger_dag = dbag.get_dag(self.trigger_dag_id)
            dr = trigger_dag.create_dagrun(
                run_id=dro.run_id,
                state=State.RUNNING,
                conf=dro.payload,
                external_trigger=True)
            self.log.info("Creating DagRun %s", dr)
            session.add(dr)
            session.commit()
    else:
        self.log.info("Criteria not met, moving on")
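# A hedged sketch of the python_callable contract the operator above relies
# on: return the (possibly modified) DagRunOrder to trigger the target DAG,
# or None to skip. The conf key and payload are illustrative.
def conditionally_trigger(context, dag_run_obj):
    # Only trigger when the upstream run asked for it; attach a payload the
    # target DAG can read from its run conf.
    if (context["dag_run"].conf or {}).get("should_trigger"):  # assumed key
        dag_run_obj.payload = {"source_run_id": context["run_id"]}
        return dag_run_obj
    return None  # criteria not met: execute() logs and moves on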
def check_dag_file_exist(dag_id):
    logging.info("Executing custom 'check_dag_file_exist' function")

    # check dag_id
    if dag_id is None:
        logging.warning("The dag_id argument wasn't provided")
        return ApiResponse.bad_request(
            "The dag_id argument should be provided")

    dagbag = DagBag('dags')
    if dag_id not in dagbag.dags:
        return ApiResponse.bad_request("Dag id {} not found".format(dag_id))

    dag = dagbag.get_dag(dag_id)
    payload = os.path.exists(dag.full_filepath)
    return ApiResponse.success(payload)
def get_task(dag_id, task_id):
    """Return the task object identified by the given dag_id and task_id."""
    dagbag = DagBag()

    # Check DAG exists.
    if dag_id not in dagbag.dags:
        error_message = "Dag id {} not found".format(dag_id)
        raise AirflowException(error_message)

    # Get DAG object and check Task Exists
    dag = dagbag.get_dag(dag_id)
    if not dag.has_task(task_id):
        error_message = 'Task {} not found in dag {}'.format(task_id, dag_id)
        raise AirflowException(error_message)

    # Return the task.
    return dag.get_task(task_id)
def upgrade():
    op.add_column('task_instance',
                  sa.Column('max_tries', sa.Integer, server_default="-1"))
    # Check if the task_instance table exists before data migration. This
    # check is needed for databases that do not create tables until the
    # migration finishes. Checking that the task_instance table exists
    # prevents the error of querying a non-existing task_instance table.
    connection = op.get_bind()
    inspector = Inspector.from_engine(connection)
    tables = inspector.get_table_names()

    if 'task_instance' in tables:
        # Get current session
        sessionmaker = sa.orm.sessionmaker()
        session = sessionmaker(bind=connection)
        dagbag = DagBag(settings.DAGS_FOLDER)
        query = session.query(sa.func.count(TaskInstance.max_tries)).filter(
            TaskInstance.max_tries == -1
        )
        # Separate the db query into batches to prevent loading the entire
        # table into memory and causing an out-of-memory error.
        while query.scalar():
            tis = session.query(TaskInstance).filter(
                TaskInstance.max_tries == -1
            ).limit(BATCH_SIZE).all()
            for ti in tis:
                dag = dagbag.get_dag(ti.dag_id)
                if not dag or not dag.has_task(ti.task_id):
                    # The task_instance table might not have up-to-date
                    # information, i.e. the dag or task might have been
                    # modified or deleted in the dagbag but still be
                    # reflected in the task_instance table. In this case we
                    # do not retry the task that can't be parsed.
                    ti.max_tries = ti.try_number
                else:
                    task = dag.get_task(ti.task_id)
                    if task.retries:
                        ti.max_tries = task.retries
                    else:
                        ti.max_tries = ti.try_number
                session.merge(ti)
            session.commit()
        # Commit the current session.
        session.commit()
def execute(self, context):
    dro = DagRunOrder(run_id='trig__' + datetime.now().isoformat())
    dro = self.python_callable(context, dro)
    if dro:
        session = settings.Session()
        dbag = DagBag(os.path.expanduser(conf.get('core', 'DAGS_FOLDER')))
        trigger_dag = dbag.get_dag(self.trigger_dag_id)
        dr = trigger_dag.create_dagrun(
            run_id=dro.run_id,
            state=State.RUNNING,
            conf=dro.payload,
            external_trigger=True)
        logging.info("Creating DagRun {}".format(dr))
        session.add(dr)
        session.commit()
        session.close()
    else:
        logging.info("Criteria not met, moving on")
def test_trigger_dag_for_date(self):
    url_template = '/api/experimental/dags/{}/dag_runs'
    dag_id = 'example_bash_operator'
    hour_from_now = utcnow() + timedelta(hours=1)
    execution_date = datetime(hour_from_now.year,
                              hour_from_now.month,
                              hour_from_now.day,
                              hour_from_now.hour)
    datetime_string = execution_date.isoformat()

    # Test correct execution
    response = self.client.post(
        url_template.format(dag_id),
        data=json.dumps({'execution_date': datetime_string}),
        content_type="application/json"
    )
    self.assertEqual(200, response.status_code)

    dagbag = DagBag()
    dag = dagbag.get_dag(dag_id)
    dag_run = dag.get_dagrun(execution_date)
    self.assertTrue(dag_run,
                    'Dag Run not found for execution date {}'
                    .format(execution_date))

    # Test error for nonexistent dag
    response = self.client.post(
        url_template.format('does_not_exist_dag'),
        data=json.dumps({'execution_date': execution_date.isoformat()}),
        content_type="application/json"
    )
    self.assertEqual(404, response.status_code)

    # Test error for bad datetime format
    response = self.client.post(
        url_template.format(dag_id),
        data=json.dumps({'execution_date': 'not_a_datetime'}),
        content_type="application/json"
    )
    self.assertEqual(400, response.status_code)
def test_find_zombies(self):
    manager = DagFileProcessorManager(
        dag_directory='directory',
        file_paths=['abc.txt'],
        max_runs=1,
        processor_factory=MagicMock().return_value,
        signal_conn=MagicMock(),
        stat_queue=MagicMock(),
        result_queue=MagicMock(),
        async_mode=True)

    dagbag = DagBag(TEST_DAG_FOLDER)
    with create_session() as session:
        session.query(LJ).delete()
        dag = dagbag.get_dag('example_branch_operator')
        task = dag.get_task(task_id='run_this_first')

        ti = TI(task, DEFAULT_DATE, State.RUNNING)
        lj = LJ(ti)
        lj.state = State.SHUTDOWN
        lj.id = 1
        ti.job_id = lj.id

        session.add(lj)
        session.add(ti)
        session.commit()

        manager._last_zombie_query_time = timezone.utcnow() - timedelta(
            seconds=manager._zombie_threshold_secs + 1)
        zombies = manager._find_zombies()
        self.assertEqual(1, len(zombies))
        self.assertIsInstance(zombies[0], SimpleTaskInstance)
        self.assertEqual(ti.dag_id, zombies[0].dag_id)
        self.assertEqual(ti.task_id, zombies[0].task_id)
        self.assertEqual(ti.execution_date, zombies[0].execution_date)

        session.query(TI).delete()
        session.query(LJ).delete()
def test_trigger_dag_for_date(self):
    url_template = '/api/experimental/dags/{}/dag_runs/{}'
    dag_id = 'example_bash_operator'
    now = datetime.now()
    execution_date = datetime(now.year, now.month, now.day, now.hour + 1)
    datetime_string = execution_date.isoformat()

    # Test correct execution
    response = self.app.post(
        quote(url_template.format(dag_id, datetime_string)),
        data=json.dumps(dict(run_id='my_run')),
        content_type="application/json"
    )
    self.assertEqual(200, response.status_code)

    dagbag = DagBag()
    dag = dagbag.get_dag(dag_id)
    dag_run = dag.get_dagrun(execution_date)
    self.assertTrue(dag_run,
                    'Dag Run not found for execution date {}'
                    .format(execution_date))

    # Test error for nonexistent dag
    response = self.app.post(
        quote(url_template.format('does_not_exist_dag', datetime_string)),
        data=json.dumps(dict()),
        content_type="application/json"
    )
    self.assertEqual(404, response.status_code)

    # Test error for bad datetime format
    response = self.app.post(
        quote(url_template.format(dag_id, 'not_a_datetime')),
        data=json.dumps(dict(run_id='my_run')),
        content_type="application/json"
    )
    self.assertEqual(400, response.status_code)
class BackfillJobTest(unittest.TestCase):

    def setUp(self):
        self.parser = cli.CLIFactory.get_parser()
        self.dagbag = DagBag(include_examples=True)

    @unittest.skipIf('sqlite' in configuration.get('core', 'sql_alchemy_conn'),
                     "concurrent access not supported in sqlite")
    def test_trigger_controller_dag(self):
        dag = self.dagbag.get_dag('example_trigger_controller_dag')
        target_dag = self.dagbag.get_dag('example_trigger_target_dag')
        dag.clear()
        target_dag.clear()

        scheduler = SchedulerJob()
        queue = mock.Mock()
        scheduler._process_task_instances(target_dag, queue=queue)
        self.assertFalse(queue.append.called)

        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            ignore_first_depends_on_past=True
        )
        job.run()

        scheduler = SchedulerJob()
        queue = mock.Mock()
        scheduler._process_task_instances(target_dag, queue=queue)
        self.assertTrue(queue.append.called)

        target_dag.clear()
        dag.clear()

    @unittest.skipIf('sqlite' in configuration.get('core', 'sql_alchemy_conn'),
                     "concurrent access not supported in sqlite")
    def test_backfill_multi_dates(self):
        dag = self.dagbag.get_dag('example_bash_operator')
        dag.clear()

        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + datetime.timedelta(days=1),
            ignore_first_depends_on_past=True
        )
        job.run()

        session = settings.Session()
        drs = session.query(DagRun).filter(
            DagRun.dag_id == 'example_bash_operator'
        ).order_by(DagRun.execution_date).all()

        self.assertTrue(drs[0].execution_date == DEFAULT_DATE)
        self.assertTrue(drs[0].state == State.SUCCESS)
        self.assertTrue(drs[1].execution_date ==
                        DEFAULT_DATE + datetime.timedelta(days=1))
        self.assertTrue(drs[1].state == State.SUCCESS)

        dag.clear()
        session.close()

    @unittest.skipIf('sqlite' in configuration.get('core', 'sql_alchemy_conn'),
                     "concurrent access not supported in sqlite")
    def test_backfill_examples(self):
        """
        Test backfilling example dags
        """
        # some DAGs really are just examples... but try to make them work!
        skip_dags = [
            'example_http_operator',
            'example_twitter_dag',
            'example_trigger_target_dag',
            'example_trigger_controller_dag',  # tested above
            'test_utils',  # sleeps forever
        ]

        logger = logging.getLogger('BackfillJobTest.test_backfill_examples')
        dags = [
            dag for dag in self.dagbag.dags.values()
            if 'example_dags' in dag.full_filepath and
            dag.dag_id not in skip_dags
        ]

        for dag in dags:
            dag.clear(
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            logger.info('*** Running example DAG #{}: {}'.format(
                i, dag.dag_id))
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True)
            job.run()

    def test_backfill_pooled_tasks(self):
        """
        Test that queued tasks are executed by BackfillJob

        Test for https://github.com/airbnb/airflow/pull/1225
        """
        session = settings.Session()
        pool = Pool(pool='test_backfill_pooled_task_pool', slots=1)
        session.add(pool)
        session.commit()

        dag = self.dagbag.get_dag('test_backfill_pooled_task_dag')
        dag.clear()

        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE)

        # run with timeout because this creates an infinite loop if not
        # caught
        with timeout(seconds=30):
            job.run()

        ti = TI(
            task=dag.get_task('test_backfill_pooled_task'),
            execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)

    def test_backfill_depends_on_past(self):
        """
        Test that backfill respects ignore_depends_on_past
        """
        dag = self.dagbag.get_dag('test_depends_on_past')
        dag.clear()
        run_date = DEFAULT_DATE + datetime.timedelta(days=5)

        # backfill should deadlock
        self.assertRaisesRegexp(
            AirflowException,
            'BackfillJob is deadlocked',
            BackfillJob(dag=dag, start_date=run_date, end_date=run_date).run)

        BackfillJob(
            dag=dag,
            start_date=run_date,
            end_date=run_date,
            ignore_first_depends_on_past=True).run()

        # ti should have succeeded
        ti = TI(dag.tasks[0], run_date)
        ti.refresh_from_db()
        self.assertEquals(ti.state, State.SUCCESS)

    def test_cli_backfill_depends_on_past(self):
        """
        Test that CLI respects -I argument
        """
        dag_id = 'test_dagrun_states_deadlock'
        run_date = DEFAULT_DATE + datetime.timedelta(days=1)
        args = [
            'backfill',
            dag_id,
            '-l',
            '-s',
            run_date.isoformat(),
        ]
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        self.assertRaisesRegexp(
            AirflowException,
            'BackfillJob is deadlocked',
            cli.backfill,
            self.parser.parse_args(args))

        cli.backfill(self.parser.parse_args(args + ['-I']))
        ti = TI(dag.get_task('test_depends_on_past'), run_date)
        ti.refresh_from_db()
        # task ran
        self.assertEqual(ti.state, State.SUCCESS)
        dag.clear()
class SchedulerJobTest(unittest.TestCase):
    # These defaults make the test faster to run
    default_scheduler_args = {"file_process_interval": 0,
                              "processor_poll_interval": 0.5}

    def setUp(self):
        self.dagbag = DagBag()

    @provide_session
    def evaluate_dagrun(
            self,
            dag_id,
            expected_task_states,  # dict of task_id: state
            dagrun_state,
            run_kwargs=None,
            advance_execution_date=False,
            session=None):
        """
        Helper for testing DagRun states with simple two-task DAGs.
        This is hackish: a dag run is created but its tasks are
        run by a backfill.
        """
        if run_kwargs is None:
            run_kwargs = {}

        scheduler = SchedulerJob(**self.default_scheduler_args)
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        dr = scheduler.create_dag_run(dag)

        if advance_execution_date:
            # run a second time to schedule a dagrun after the start_date
            dr = scheduler.create_dag_run(dag)
        ex_date = dr.execution_date

        try:
            dag.run(start_date=ex_date, end_date=ex_date, **run_kwargs)
        except AirflowException:
            pass

        # test tasks
        for task_id, expected_state in expected_task_states.items():
            task = dag.get_task(task_id)
            ti = TI(task, ex_date)
            ti.refresh_from_db()
            self.assertEqual(ti.state, expected_state)

        # load dagrun
        dr = DagRun.find(dag_id=dag_id, execution_date=ex_date)
        dr = dr[0]
        dr.dag = dag

        self.assertEqual(dr.state, dagrun_state)

    def test_dagrun_fail(self):
        """
        DagRuns with one failed and one incomplete root task -> FAILED
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_fail',
            expected_task_states={
                'test_dagrun_fail': State.FAILED,
                'test_dagrun_succeed': State.UPSTREAM_FAILED,
            },
            dagrun_state=State.FAILED)

    def test_dagrun_success(self):
        """
        DagRuns with one failed and one successful root task -> SUCCESS
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_success',
            expected_task_states={
                'test_dagrun_fail': State.FAILED,
                'test_dagrun_succeed': State.SUCCESS,
            },
            dagrun_state=State.SUCCESS)

    def test_dagrun_root_fail(self):
        """
        DagRuns with one successful and one failed root task -> FAILED
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_root_fail',
            expected_task_states={
                'test_dagrun_succeed': State.SUCCESS,
                'test_dagrun_fail': State.FAILED,
            },
            dagrun_state=State.FAILED)

    def test_dagrun_deadlock_ignore_depends_on_past_advance_ex_date(self):
        """
        DagRun is marked a success if ignore_first_depends_on_past=True

        Test that an otherwise-deadlocked dagrun is marked as a success
        if ignore_first_depends_on_past=True and the dagrun execution_date
        is after the start_date.
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_deadlock',
            expected_task_states={
                'test_depends_on_past': State.SUCCESS,
                'test_depends_on_past_2': State.SUCCESS,
            },
            dagrun_state=State.SUCCESS,
            advance_execution_date=True,
            run_kwargs=dict(ignore_first_depends_on_past=True))

    def test_dagrun_deadlock_ignore_depends_on_past(self):
        """
        Test that ignore_first_depends_on_past doesn't affect results
        (this is the same test as
        test_dagrun_deadlock_ignore_depends_on_past_advance_ex_date except
        that start_date == execution_date so depends_on_past is irrelevant).
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_deadlock',
            expected_task_states={
                'test_depends_on_past': State.SUCCESS,
                'test_depends_on_past_2': State.SUCCESS,
            },
            dagrun_state=State.SUCCESS,
            run_kwargs=dict(ignore_first_depends_on_past=True))

    def test_scheduler_start_date(self):
        """
        Test that the scheduler respects start_dates, even when DAGs have run
        """
        dag_id = 'test_start_date_scheduling'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        self.assertTrue(dag.start_date > DEFAULT_DATE)

        scheduler = SchedulerJob(dag_id,
                                 num_runs=2,
                                 **self.default_scheduler_args)
        scheduler.run()

        # zero tasks ran
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 0)

        # previously, running this backfill would kick off the Scheduler
        # because it would take the most recent run and start from there.
        # That behavior still exists, but now it will only do so if after the
        # start date
        backfill = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE)
        backfill.run()

        # one task ran
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)

        scheduler = SchedulerJob(dag_id,
                                 num_runs=2,
                                 **self.default_scheduler_args)
        scheduler.run()

        # still one task
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)

    def test_scheduler_multiprocessing(self):
        """
        Test that the scheduler can successfully queue multiple dags in
        parallel
        """
        dag_ids = ['test_start_date_scheduling', 'test_dagrun_states_success']
        for dag_id in dag_ids:
            dag = self.dagbag.get_dag(dag_id)
            dag.clear()

        scheduler = SchedulerJob(dag_ids=dag_ids,
                                 file_process_interval=0,
                                 processor_poll_interval=0.5,
                                 num_runs=2)
        scheduler.run()

        # zero tasks ran
        dag_id = 'test_start_date_scheduling'
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 0)

    def test_scheduler_dagrun_once(self):
        """
        Test if the scheduler does not create multiple dagruns
        if a dag is scheduled with @once and a start_date
        """
        dag = DAG(
            'test_scheduler_dagrun_once',
            start_date=datetime.datetime(2015, 1, 1),
            schedule_interval="@once")

        scheduler = SchedulerJob()
        dag.clear()
        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)
        dr = scheduler.create_dag_run(dag)
        self.assertIsNone(dr)

    def test_scheduler_process_task_instances(self):
        """
        Test if _process_task_instances puts the right task instances into
        the queue.
        """
        dag = DAG(
            dag_id='test_scheduler_process_execute_task',
            start_date=DEFAULT_DATE)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()
        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        queue.append.assert_called_with(
            (dag.dag_id, dag_task1.task_id, DEFAULT_DATE)
        )

    def test_scheduler_do_not_schedule_removed_task(self):
        dag = DAG(
            dag_id='test_scheduler_do_not_schedule_removed_task',
            start_date=DEFAULT_DATE)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()
        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)

        dag = DAG(
            dag_id='test_scheduler_do_not_schedule_removed_task',
            start_date=DEFAULT_DATE)

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        queue.put.assert_not_called()

    def test_scheduler_do_not_schedule_too_early(self):
        dag = DAG(
            dag_id='test_scheduler_do_not_schedule_too_early',
            start_date=datetime.datetime(2200, 1, 1))
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()
        dr = scheduler.create_dag_run(dag)
        self.assertIsNone(dr)

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        queue.put.assert_not_called()

    def test_scheduler_do_not_run_finished(self):
        dag = DAG(
            dag_id='test_scheduler_do_not_run_finished',
            start_date=DEFAULT_DATE)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()

        scheduler = SchedulerJob()
        dag.clear()
        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)

        tis = dr.get_task_instances(session=session)
        for ti in tis:
            ti.state = State.SUCCESS

        session.commit()
        session.close()

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        queue.put.assert_not_called()

    def test_scheduler_add_new_task(self):
        """
        Test if a task instance will be added if the dag is updated
        """
        dag = DAG(
            dag_id='test_scheduler_add_new_task',
            start_date=DEFAULT_DATE)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)

        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 1)

        dag_task2 = DummyOperator(
            task_id='dummy2',
            dag=dag,
            owner='airflow')

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)

    def test_scheduler_does_not_run_excluded(self):
        dag = DAG(
            dag_id='test_scheduler_does_not_run_excluded',
            start_date=DEFAULT_DATE)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()

        scheduler = SchedulerJob()
        dag.clear()
        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)

        tis = dr.get_task_instances(session=session)
        for ti in tis:
            ti.state = State.EXCLUDED

        session.commit()
        session.close()

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        queue.put.assert_not_called()

    def test_scheduler_verify_max_active_runs(self):
        """
        Test if a dagrun will not be scheduled if max_active_runs has been
        reached
        """
        dag = DAG(
            dag_id='test_scheduler_verify_max_active_runs',
            start_date=DEFAULT_DATE)
        dag.max_active_runs = 1
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)

        dr = scheduler.create_dag_run(dag)
        self.assertIsNone(dr)

    def test_scheduler_fail_dagrun_timeout(self):
        """
        Test if a dagrun will be set to failed if it times out
        """
        dag = DAG(
            dag_id='test_scheduler_fail_dagrun_timeout',
            start_date=DEFAULT_DATE)
        dag.dagrun_timeout = datetime.timedelta(seconds=60)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)
        dr.start_date = datetime.datetime.now() - datetime.timedelta(days=1)
        session.merge(dr)
        session.commit()

        dr2 = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr2)

        dr.refresh_from_db(session=session)
        self.assertEquals(dr.state, State.FAILED)

    def test_scheduler_verify_max_active_runs_and_dagrun_timeout(self):
        """
        Test if a dagrun will not be scheduled if max_active_runs has been
        reached and dagrun_timeout is not reached

        Test if a dagrun will be scheduled if max_active_runs has been
        reached but dagrun_timeout is also reached
        """
        dag = DAG(
            dag_id='test_scheduler_verify_max_active_runs_and_dagrun_timeout',
            start_date=DEFAULT_DATE)
        dag.max_active_runs = 1
        dag.dagrun_timeout = datetime.timedelta(seconds=60)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)

        # Should not be scheduled as DagRun has not timed out and
        # max_active_runs is reached
        new_dr = scheduler.create_dag_run(dag)
        self.assertIsNone(new_dr)

        # Should be scheduled as dagrun_timeout has passed
        dr.start_date = datetime.datetime.now() - datetime.timedelta(days=1)
        session.merge(dr)
        session.commit()

        new_dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(new_dr)

    def test_scheduler_max_active_runs_respected_after_clear(self):
        """
        Test if _process_task_instances only schedules ti's up to
        max_active_runs (related to issue AIRFLOW-137)
        """
        dag = DAG(
            dag_id='test_scheduler_max_active_runs_respected_after_clear',
            start_date=DEFAULT_DATE)
        dag.max_active_runs = 3
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()

        # First create up to 3 dagruns in RUNNING state.
        scheduler.create_dag_run(dag)

        # Reduce max_active_runs to 1
        dag.max_active_runs = 1

        queue = mock.Mock()
        # and schedule them in, so we can check how many
        # tasks are put on the queue (should be one, not 3)
        scheduler._process_task_instances(dag, queue=queue)

        queue.append.assert_called_with(
            (dag.dag_id, dag_task1.task_id, DEFAULT_DATE)
        )

    @patch.object(TI, 'pool_full')
    def test_scheduler_verify_pool_full(self, mock_pool_full):
        """
        Test task instances not queued when pool is full
        """
        mock_pool_full.return_value = False

        dag = DAG(
            dag_id='test_scheduler_verify_pool_full',
            start_date=DEFAULT_DATE)

        DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow',
            pool='test_scheduler_verify_pool_full')

        session = settings.Session()
        pool = Pool(pool='test_scheduler_verify_pool_full', slots=1)
        session.add(pool)
        orm_dag = DagModel(dag_id=dag.dag_id)
        orm_dag.is_paused = False
        session.merge(orm_dag)
        session.commit()

        scheduler = SchedulerJob()
        dag.clear()

        # Create 2 dagruns, which will create 2 task instances.
        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)
        self.assertEquals(dr.execution_date, DEFAULT_DATE)
        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)
        queue = []
        scheduler._process_task_instances(dag, queue=queue)
        self.assertEquals(len(queue), 2)
        dagbag = SimpleDagBag([dag])

        # Recreated part of the scheduler here, to kick off tasks -> executor
        for ti_key in queue:
            task = dag.get_task(ti_key[1])
            ti = models.TaskInstance(task, ti_key[2])
            # Task starts out in the scheduled state. All tasks in the
            # scheduled state will be sent to the executor
            ti.state = State.SCHEDULED

            # Also save this task instance to the DB.
            session.merge(ti)
            session.commit()

        scheduler._execute_task_instances(dagbag,
                                          (State.SCHEDULED,
                                           State.UP_FOR_RETRY))

        self.assertEquals(len(scheduler.executor.queued_tasks), 1)

    def test_scheduler_auto_align(self):
        """
        Test if the schedule_interval will be auto aligned with the
        start_date such that if the start_date coincides with the schedule
        the first execution_date will be start_date, otherwise it will be
        start_date + interval.
        """
        dag = DAG(
            dag_id='test_scheduler_auto_align_1',
            start_date=datetime.datetime(2016, 1, 1, 10, 10, 0),
            schedule_interval="4 5 * * *"
        )
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)
        self.assertEquals(dr.execution_date,
                          datetime.datetime(2016, 1, 2, 5, 4))

        dag = DAG(
            dag_id='test_scheduler_auto_align_2',
            start_date=datetime.datetime(2016, 1, 1, 10, 10, 0),
            schedule_interval="10 10 * * *"
        )
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)
        self.assertEquals(dr.execution_date,
                          datetime.datetime(2016, 1, 1, 10, 10))

    def test_scheduler_reschedule(self):
        """
        Checks if tasks that are not taken up by the executor
        get rescheduled
        """
        executor = TestExecutor()

        dagbag = DagBag(executor=executor)
        dagbag.dags.clear()
        dagbag.executor = executor

        dag = DAG(
            dag_id='test_scheduler_reschedule',
            start_date=DEFAULT_DATE)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        dag.clear()
        dag.is_subdag = False

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        orm_dag.is_paused = False
        session.merge(orm_dag)
        session.commit()

        dagbag.bag_dag(dag=dag, root_dag=dag, parent_dag=dag)

        @mock.patch('airflow.models.DagBag', return_value=dagbag)
        @mock.patch('airflow.models.DagBag.collect_dags')
        def do_schedule(function, function2):
            # Use an empty file since the above mock will return the
            # expected DAGs. Also specify only a single file so that it
            # doesn't try to schedule the above DAG repeatedly.
            scheduler = SchedulerJob(num_runs=1,
                                     executor=executor,
                                     subdir=os.path.join(models.DAGS_FOLDER,
                                                         "no_dags.py"))
            scheduler.heartrate = 0
            scheduler.run()

        do_schedule()
        self.assertEquals(1, len(executor.queued_tasks))
        executor.queued_tasks.clear()

        do_schedule()
        self.assertEquals(2, len(executor.queued_tasks))

    def test_scheduler_run_duration(self):
        """
        Verifies that the scheduler run duration limit is followed.
        """
        dag_id = 'test_start_date_scheduling'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        self.assertTrue(dag.start_date > DEFAULT_DATE)

        expected_run_duration = 5
        start_time = datetime.datetime.now()
        scheduler = SchedulerJob(dag_id,
                                 run_duration=expected_run_duration,
                                 **self.default_scheduler_args)
        scheduler.run()
        end_time = datetime.datetime.now()

        run_duration = (end_time - start_time).total_seconds()
        _log.info("Test ran in %.2fs, expected %.2fs",
                  run_duration,
                  expected_run_duration)
        assert run_duration - expected_run_duration < 5.0

    def test_dag_with_system_exit(self):
        """
        Test to check that a DAG with a sys.exit() doesn't break the
        scheduler.
        """
        dag_id = 'exit_test_dag'
        dag_ids = [dag_id]
        dag_directory = os.path.join(models.DAGS_FOLDER,
                                     "..",
                                     "dags_with_system_exit")
        dag_file = os.path.join(dag_directory,
                                'b_test_scheduler_dags.py')

        dagbag = DagBag(dag_folder=dag_file)
        for dag_id in dag_ids:
            dag = dagbag.get_dag(dag_id)
            dag.clear()

        scheduler = SchedulerJob(dag_ids=dag_ids,
                                 subdir=dag_directory,
                                 num_runs=1,
                                 **self.default_scheduler_args)
        scheduler.run()
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)

    def test_dag_get_active_runs(self):
        """
        Test to check that a DAG returns its active runs
        """
        now = datetime.datetime.now()
        six_hours_ago_to_the_hour = (now - datetime.timedelta(hours=6)
                                     ).replace(minute=0, second=0,
                                               microsecond=0)

        START_DATE = six_hours_ago_to_the_hour
        DAG_NAME1 = 'get_active_runs_test'

        default_args = {
            'owner': 'airflow',
            'depends_on_past': False,
            'start_date': START_DATE
        }
        dag1 = DAG(DAG_NAME1,
                   schedule_interval='* * * * *',
                   max_active_runs=1,
                   default_args=default_args
                   )

        run_this_1 = DummyOperator(task_id='run_this_1', dag=dag1)
        run_this_2 = DummyOperator(task_id='run_this_2', dag=dag1)
        run_this_2.set_upstream(run_this_1)
        run_this_3 = DummyOperator(task_id='run_this_3', dag=dag1)
        run_this_3.set_upstream(run_this_2)

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag1.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag1.clear()

        dr = scheduler.create_dag_run(dag1)

        # We had better get a dag run
        self.assertIsNotNone(dr)

        execution_date = dr.execution_date

        running_dates = dag1.get_active_runs()

        try:
            running_date = running_dates[0]
        except IndexError:
            running_date = 'Except'

        self.assertEqual(execution_date, running_date,
                         'Running Date must match Execution Date')
class BackfillJobTest(unittest.TestCase):

    def setUp(self):
        self.parser = cli.CLIFactory.get_parser()
        self.dagbag = DagBag(include_examples=True)

    def test_backfill_examples(self):
        """
        Test backfilling example dags
        """
        # some DAGs really are just examples... but try to make them work!
        skip_dags = [
            'example_http_operator',
        ]

        logger = logging.getLogger('BackfillJobTest.test_backfill_examples')
        dags = [
            dag for dag in self.dagbag.dags.values()
            if 'example_dags' in dag.full_filepath
            and dag.dag_id not in skip_dags
        ]

        for dag in dags:
            dag.clear(
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            logger.info('*** Running example DAG #{}: {}'.format(
                i, dag.dag_id))
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True)
            job.run()

    def test_trap_executor_error(self):
        """
        Test that errors setting up tasks (before tasks run) are caught.

        Executors run tasks with the `airflow run` command. If a task runs,
        its state (success, failure, or other) is stored in the database and
        `airflow run` exits without error. However, if an error is raised
        before the task runs, then the task won't be able to update its
        status. This can put the executor into an infinite loop of trying
        to run the task. To counteract that, the executor traps errors
        coming from `airflow run` (essentially looking for returncode != 0).
        This unit test creates such an error by trying to run a subdag whose
        dag_id has changed and therefore can't be found. If the trap is
        working properly, the error will be caught and the Backfill will
        report failures. If the trap is not working, the job will run
        infinitely (the unit test uses a timeout to protect against that
        case).

        Test for https://github.com/airbnb/airflow/pull/1220
        """
        dag = self.dagbag.get_dag('test_raise_executor_error')
        dag.clear()
        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE)

        # run with timeout because this creates an infinite loop if not
        # caught
        def run_with_timeout():
            with timeout(seconds=30):
                job.run()

        self.assertRaises(AirflowException, run_with_timeout)

    def test_backfill_pooled_tasks(self):
        """
        Test that queued tasks are executed by BackfillJob

        Test for https://github.com/airbnb/airflow/pull/1225
        """
        session = settings.Session()
        pool = Pool(pool='test_backfill_pooled_task_pool', slots=1)
        session.add(pool)
        session.commit()

        dag = self.dagbag.get_dag('test_backfill_pooled_task_dag')
        dag.clear()

        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE)

        # run with timeout because this creates an infinite loop if not
        # caught
        with timeout(seconds=30):
            job.run()

        ti = TI(
            task=dag.get_task('test_backfill_pooled_task'),
            execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)

    def test_backfill_depends_on_past(self):
        """
        Test that backfill respects ignore_depends_on_past
        """
        dag = self.dagbag.get_dag('test_depends_on_past')
        dag.clear()
        run_date = DEFAULT_DATE + datetime.timedelta(days=5)

        # backfill should deadlock
        self.assertRaisesRegexp(
            AirflowException,
            'BackfillJob is deadlocked',
            BackfillJob(dag=dag, start_date=run_date, end_date=run_date).run)

        BackfillJob(
            dag=dag,
            start_date=run_date,
            end_date=run_date,
            ignore_first_depends_on_past=True).run()

        # ti should have succeeded
        ti = TI(dag.tasks[0], run_date)
        ti.refresh_from_db()
        self.assertEquals(ti.state, State.SUCCESS)

    def test_cli_backfill_depends_on_past(self):
        """
        Test that CLI respects -I argument
        """
        dag_id = 'test_dagrun_states_deadlock'
        run_date = DEFAULT_DATE + datetime.timedelta(days=1)
        args = [
            'backfill',
            dag_id,
            '-l',
            '-s',
            run_date.isoformat(),
        ]
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        self.assertRaisesRegexp(
            AirflowException,
            'BackfillJob is deadlocked',
            cli.backfill,
            self.parser.parse_args(args))

        cli.backfill(self.parser.parse_args(args + ['-I']))
        ti = TI(dag.get_task('test_depends_on_past'), run_date)
        ti.refresh_from_db()
        # task ran
        self.assertEqual(ti.state, State.SUCCESS)
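# The backfill tests above guard against infinite loops with a
# `timeout(seconds=30)` context manager imported from the test helpers.
# A minimal POSIX-only sketch of such a helper, assuming SIGALRM is
# available (i.e. not Windows); the real helper may raise a different
# exception type.
import signal
from contextlib import contextmanager


@contextmanager
def timeout_sketch(seconds):
    def _handler(signum, frame):
        raise AirflowException(
            "Timed out after {} seconds".format(seconds))

    old_handler = signal.signal(signal.SIGALRM, _handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)  # cancel any pending alarm
        signal.signal(signal.SIGALRM, old_handler)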
class BackfillJobTest(unittest.TestCase):

    def setUp(self):
        self.parser = cli.CLIFactory.get_parser()
        self.dagbag = DagBag(include_examples=True)

    def test_backfill_examples(self):
        """
        Test backfilling example dags
        """
        # some DAGs really are just examples... but try to make them work!
        skip_dags = [
            'example_http_operator',
            'example_twitter_dag',
        ]

        logger = logging.getLogger('BackfillJobTest.test_backfill_examples')
        dags = [
            dag for dag in self.dagbag.dags.values()
            if 'example_dags' in dag.full_filepath
            and dag.dag_id not in skip_dags
        ]

        for dag in dags:
            dag.clear(
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            logger.info('*** Running example DAG #{}: {}'.format(
                i, dag.dag_id))
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True)
            job.run()

    def test_backfill_pooled_tasks(self):
        """
        Test that queued tasks are executed by BackfillJob

        Test for https://github.com/airbnb/airflow/pull/1225
        """
        session = settings.Session()
        pool = Pool(pool='test_backfill_pooled_task_pool', slots=1)
        session.add(pool)
        session.commit()

        dag = self.dagbag.get_dag('test_backfill_pooled_task_dag')
        dag.clear()

        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE)

        # run with timeout because this creates an infinite loop if not
        # caught
        with timeout(seconds=30):
            job.run()

        ti = TI(
            task=dag.get_task('test_backfill_pooled_task'),
            execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)

    def test_backfill_depends_on_past(self):
        """
        Test that backfill respects ignore_depends_on_past
        """
        dag = self.dagbag.get_dag('test_depends_on_past')
        dag.clear()
        run_date = DEFAULT_DATE + datetime.timedelta(days=5)

        # backfill should deadlock
        self.assertRaisesRegexp(
            AirflowException,
            'BackfillJob is deadlocked',
            BackfillJob(dag=dag, start_date=run_date, end_date=run_date).run)

        BackfillJob(
            dag=dag,
            start_date=run_date,
            end_date=run_date,
            ignore_first_depends_on_past=True).run()

        # ti should have succeeded
        ti = TI(dag.tasks[0], run_date)
        ti.refresh_from_db()
        self.assertEquals(ti.state, State.SUCCESS)

    def test_cli_backfill_depends_on_past(self):
        """
        Test that CLI respects -I argument
        """
        dag_id = 'test_dagrun_states_deadlock'
        run_date = DEFAULT_DATE + datetime.timedelta(days=1)
        args = [
            'backfill',
            dag_id,
            '-l',
            '-s',
            run_date.isoformat(),
        ]
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        self.assertRaisesRegexp(
            AirflowException,
            'BackfillJob is deadlocked',
            cli.backfill,
            self.parser.parse_args(args))

        cli.backfill(self.parser.parse_args(args + ['-I']))
        ti = TI(dag.get_task('test_depends_on_past'), run_date)
        ti.refresh_from_db()
        # task ran
        self.assertEqual(ti.state, State.SUCCESS)
        dag.clear()
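# Why test_backfill_depends_on_past deadlocks without
# ignore_first_depends_on_past: with depends_on_past=True, the task for
# run_date cannot start until the same task succeeded for the previous
# schedule date, and a single-date backfill never runs that earlier date.
# A hedged sketch of a minimal DAG reproducing this; the dag_id, task_id,
# and helper name are hypothetical, not part of the test fixtures.
def _build_depends_on_past_dag():
    dag = DAG(
        dag_id='sketch_depends_on_past',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'airflow', 'depends_on_past': True})
    DummyOperator(task_id='blocked_without_ignore_first', dag=dag)
    # Backfilling only DEFAULT_DATE + 5 days on this DAG would raise
    # "BackfillJob is deadlocked" unless ignore_first_depends_on_past=True
    # is passed, exactly as asserted above.
    return dag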
class SchedulerJobTest(unittest.TestCase):

    # These defaults make the test faster to run
    default_scheduler_args = {"file_process_interval": 0,
                              "processor_poll_interval": 0.5}

    def setUp(self):
        self.dagbag = DagBag()

    @provide_session
    def evaluate_dagrun(
            self,
            dag_id,
            expected_task_states,  # dict of task_id: state
            dagrun_state,
            run_kwargs=None,
            advance_execution_date=False,
            session=None):
        """
        Helper for testing DagRun states with simple two-task DAGs.
        This is hackish: a dag run is created but its tasks are
        run by a backfill.
        """
        if run_kwargs is None:
            run_kwargs = {}

        scheduler = SchedulerJob(**self.default_scheduler_args)
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        dr = scheduler.create_dag_run(dag)

        if advance_execution_date:
            # run a second time to schedule a dagrun after the start_date
            dr = scheduler.create_dag_run(dag)
        ex_date = dr.execution_date

        try:
            dag.run(start_date=ex_date, end_date=ex_date, **run_kwargs)
        except AirflowException:
            pass

        # test tasks
        for task_id, expected_state in expected_task_states.items():
            task = dag.get_task(task_id)
            ti = TI(task, ex_date)
            ti.refresh_from_db()
            self.assertEqual(ti.state, expected_state)

        # load dagrun
        dr = DagRun.find(dag_id=dag_id, execution_date=ex_date)
        dr = dr[0]
        dr.dag = dag

        # dagrun is running
        self.assertEqual(dr.state, State.RUNNING)

        dr.update_state()

        # dagrun is now in the expected terminal state
        self.assertEqual(dr.state, dagrun_state)

    def test_dagrun_fail(self):
        """
        DagRuns with one failed and one incomplete root task -> FAILED
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_fail',
            expected_task_states={
                'test_dagrun_fail': State.FAILED,
                'test_dagrun_succeed': State.UPSTREAM_FAILED,
            },
            dagrun_state=State.FAILED)

    def test_dagrun_success(self):
        """
        DagRuns with one failed and one successful root task -> SUCCESS
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_success',
            expected_task_states={
                'test_dagrun_fail': State.FAILED,
                'test_dagrun_succeed': State.SUCCESS,
            },
            dagrun_state=State.SUCCESS)

    def test_dagrun_root_fail(self):
        """
        DagRuns with one successful and one failed root task -> FAILED
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_root_fail',
            expected_task_states={
                'test_dagrun_succeed': State.SUCCESS,
                'test_dagrun_fail': State.FAILED,
            },
            dagrun_state=State.FAILED)

    def test_scheduler_pooled_tasks(self):
        """
        Test that the scheduler handles queued tasks correctly
        See issue #1299
        """
        session = settings.Session()
        if not (
                session.query(Pool)
                .filter(Pool.pool == 'test_queued_pool')
                .first()):
            pool = Pool(pool='test_queued_pool', slots=5)
            session.merge(pool)
            session.commit()
        session.close()

        dag_id = 'test_scheduled_queued_tasks'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        scheduler = SchedulerJob(dag_id,
                                 num_runs=1,
                                 executor=TestExecutor(),
                                 **self.default_scheduler_args)
        scheduler.run()

        task_1 = dag.tasks[0]
        logging.info("Trying to find task {}".format(task_1))
        ti = TI(task_1, dag.start_date)
        ti.refresh_from_db()
        logging.error("TI is: {}".format(ti))
        self.assertEqual(ti.state, State.QUEUED)

        # now we use a DIFFERENT scheduler and executor
        # to simulate the num-runs CLI arg
        scheduler2 = SchedulerJob(
            dag_id,
            num_runs=5,
            executor=DEFAULT_EXECUTOR.__class__(),
            **self.default_scheduler_args)
        scheduler2.run()

        ti.refresh_from_db()
        self.assertEqual(ti.state, State.FAILED)
        dag.clear()

    def test_dagrun_deadlock_ignore_depends_on_past_advance_ex_date(self):
        """
        DagRun is marked a success if ignore_first_depends_on_past=True

        Test that an otherwise-deadlocked dagrun is marked as a success
        if ignore_first_depends_on_past=True and the dagrun execution_date
        is after the start_date.
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_deadlock',
            expected_task_states={
                'test_depends_on_past': State.SUCCESS,
                'test_depends_on_past_2': State.SUCCESS,
            },
            dagrun_state=State.SUCCESS,
            advance_execution_date=True,
            run_kwargs=dict(ignore_first_depends_on_past=True))

    def test_dagrun_deadlock_ignore_depends_on_past(self):
        """
        Test that ignore_first_depends_on_past doesn't affect results
        (this is the same test as
        test_dagrun_deadlock_ignore_depends_on_past_advance_ex_date except
        that start_date == execution_date so depends_on_past is irrelevant).
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_deadlock',
            expected_task_states={
                'test_depends_on_past': State.SUCCESS,
                'test_depends_on_past_2': State.SUCCESS,
            },
            dagrun_state=State.SUCCESS,
            run_kwargs=dict(ignore_first_depends_on_past=True))

    def test_scheduler_start_date(self):
        """
        Test that the scheduler respects start_dates, even when DAGs have run
        """
        dag_id = 'test_start_date_scheduling'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        self.assertTrue(dag.start_date > DEFAULT_DATE)

        scheduler = SchedulerJob(dag_id,
                                 num_runs=2,
                                 **self.default_scheduler_args)
        scheduler.run()

        # zero tasks ran
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 0)

        # previously, running this backfill would kick off the Scheduler
        # because it would take the most recent run and start from there.
        # That behavior still exists, but now it will only do so if the run
        # is after the start date
        backfill = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE)
        backfill.run()

        # one task ran
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)

        scheduler = SchedulerJob(dag_id,
                                 num_runs=2,
                                 **self.default_scheduler_args)
        scheduler.run()

        # still one task
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)

    def test_scheduler_multiprocessing(self):
        """
        Test that the scheduler can successfully queue multiple dags in
        parallel
        """
        dag_ids = ['test_start_date_scheduling',
                   'test_dagrun_states_success']
        for dag_id in dag_ids:
            dag = self.dagbag.get_dag(dag_id)
            dag.clear()

        scheduler = SchedulerJob(dag_ids=dag_ids,
                                 file_process_interval=0,
                                 processor_poll_interval=0.5,
                                 num_runs=2)
        scheduler.run()

        # zero tasks ran
        dag_id = 'test_start_date_scheduling'
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 0)

    def test_scheduler_dagrun_once(self):
        """
        Test that the scheduler does not create multiple dagruns
        if a dag is scheduled with @once and a start_date
        """
        dag = DAG(
            'test_scheduler_dagrun_once',
            start_date=datetime.datetime(2015, 1, 1),
            schedule_interval="@once")

        scheduler = SchedulerJob()
        dag.clear()
        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)
        dr = scheduler.create_dag_run(dag)
        self.assertIsNone(dr)

    def test_scheduler_process_task_instances(self):
        """
        Test if _process_task_instances puts the right task instances into
        the queue.
        """
        dag = DAG(
            dag_id='test_scheduler_process_execute_task',
            start_date=DEFAULT_DATE)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()
        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        queue.append.assert_called_with(
            (dag.dag_id, dag_task1.task_id, DEFAULT_DATE)
        )

    def test_scheduler_do_not_schedule_removed_task(self):
        dag = DAG(
            dag_id='test_scheduler_do_not_schedule_removed_task',
            start_date=DEFAULT_DATE)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)

        dag = DAG(
            dag_id='test_scheduler_do_not_schedule_removed_task',
            start_date=DEFAULT_DATE)

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        queue.put.assert_not_called()

    def test_scheduler_do_not_schedule_too_early(self):
        dag = DAG(
            dag_id='test_scheduler_do_not_schedule_too_early',
            start_date=datetime.datetime(2200, 1, 1))
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNone(dr)

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        queue.put.assert_not_called()

    def test_scheduler_do_not_run_finished(self):
        dag = DAG(
            dag_id='test_scheduler_do_not_run_finished',
            start_date=DEFAULT_DATE)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)

        tis = dr.get_task_instances(session=session)
        for ti in tis:
            ti.state = State.SUCCESS

        session.commit()
        session.close()

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        queue.put.assert_not_called()

    def test_scheduler_add_new_task(self):
        """
        Test if a task instance will be added if the dag is updated
        """
        dag = DAG(
            dag_id='test_scheduler_add_new_task',
            start_date=DEFAULT_DATE)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)

        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 1)

        dag_task2 = DummyOperator(
            task_id='dummy2',
            dag=dag,
            owner='airflow')

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)

    def test_scheduler_verify_max_active_runs(self):
        """
        Test that a dagrun will not be scheduled if max_active_runs
        has been reached
        """
        dag = DAG(
            dag_id='test_scheduler_verify_max_active_runs',
            start_date=DEFAULT_DATE)
        dag.max_active_runs = 1

        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)

        dr = scheduler.create_dag_run(dag)
        self.assertIsNone(dr)

    def test_scheduler_fail_dagrun_timeout(self):
        """
        Test that a dagrun will be marked failed if it times out
        """
        dag = DAG(
            dag_id='test_scheduler_fail_dagrun_timeout',
            start_date=DEFAULT_DATE)
        dag.dagrun_timeout = datetime.timedelta(seconds=60)

        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)
        dr.start_date = datetime.datetime.now() - datetime.timedelta(days=1)
        session.merge(dr)
        session.commit()

        dr2 = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr2)

        dr.refresh_from_db(session=session)
        self.assertEquals(dr.state, State.FAILED)

    def test_scheduler_verify_max_active_runs_and_dagrun_timeout(self):
        """
        Test that a dagrun will not be scheduled if max_active_runs has been
        reached and dagrun_timeout has not been reached, and that it will be
        scheduled if max_active_runs has been reached but dagrun_timeout has
        also passed
        """
        dag = DAG(
            dag_id='test_scheduler_verify_max_active_runs_and_dagrun_timeout',
            start_date=DEFAULT_DATE)
        dag.max_active_runs = 1
        dag.dagrun_timeout = datetime.timedelta(seconds=60)

        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)

        # Should not be scheduled as the DagRun has not timed out
        # and max_active_runs is reached
        new_dr = scheduler.create_dag_run(dag)
        self.assertIsNone(new_dr)

        # Should be scheduled as dagrun_timeout has passed
        dr.start_date = datetime.datetime.now() - datetime.timedelta(days=1)
        session.merge(dr)
        session.commit()
        new_dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(new_dr)

    def test_scheduler_auto_align(self):
        """
        Test if the schedule_interval will be auto aligned with the
        start_date such that if the start_date coincides with the schedule,
        the first execution_date will be start_date; otherwise it will be
        start_date + interval.
        """
        dag = DAG(
            dag_id='test_scheduler_auto_align_1',
            start_date=datetime.datetime(2016, 1, 1, 10, 10, 0),
            schedule_interval="4 5 * * *"
        )
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)
        self.assertEquals(dr.execution_date,
                          datetime.datetime(2016, 1, 2, 5, 4))

        dag = DAG(
            dag_id='test_scheduler_auto_align_2',
            start_date=datetime.datetime(2016, 1, 1, 10, 10, 0),
            schedule_interval="10 10 * * *"
        )
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)
        self.assertEquals(dr.execution_date,
                          datetime.datetime(2016, 1, 1, 10, 10))

    def test_scheduler_reschedule(self):
        """
        Checks if tasks that are not taken up by the executor
        get rescheduled
        """
        executor = TestExecutor()

        dagbag = DagBag(executor=executor)
        dagbag.dags.clear()
        dagbag.executor = executor

        dag = DAG(
            dag_id='test_scheduler_reschedule',
            start_date=DEFAULT_DATE)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        dag.clear()
        dag.is_subdag = False

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        orm_dag.is_paused = False
        session.merge(orm_dag)
        session.commit()

        dagbag.bag_dag(dag=dag, root_dag=dag, parent_dag=dag)

        @mock.patch('airflow.models.DagBag', return_value=dagbag)
        @mock.patch('airflow.models.DagBag.collect_dags')
        def do_schedule(function, function2):
            # Use an empty file since the above mock will return the
            # expected DAGs. Also specify only a single file so that it
            # doesn't try to schedule the above DAG repeatedly.
            scheduler = SchedulerJob(num_runs=1,
                                     executor=executor,
                                     subdir=os.path.join(models.DAGS_FOLDER,
                                                         "no_dags.py"))
            scheduler.heartrate = 0
            scheduler.run()

        do_schedule()
        self.assertEquals(1, len(executor.queued_tasks))
        executor.queued_tasks.clear()

        do_schedule()
        self.assertEquals(2, len(executor.queued_tasks))

    def test_scheduler_run_duration(self):
        """
        Verifies that the scheduler run duration limit is followed.
        """
        dag_id = 'test_start_date_scheduling'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        self.assertTrue(dag.start_date > DEFAULT_DATE)

        expected_run_duration = 5
        start_time = datetime.datetime.now()
        scheduler = SchedulerJob(dag_id,
                                 run_duration=expected_run_duration,
                                 **self.default_scheduler_args)
        scheduler.run()
        end_time = datetime.datetime.now()

        run_duration = (end_time - start_time).total_seconds()
        logging.info("Test ran in %.2fs, expected %.2fs",
                     run_duration,
                     expected_run_duration)
        assert run_duration - expected_run_duration < 5.0

    def test_dag_with_system_exit(self):
        """
        Test to check that a DAG with a sys.exit() doesn't break the
        scheduler.
        """
        dag_id = 'exit_test_dag'
        dag_ids = [dag_id]
        dag_directory = os.path.join(models.DAGS_FOLDER,
                                     "..",
                                     "dags_with_system_exit")
        dag_file = os.path.join(dag_directory,
                                'b_test_scheduler_dags.py')

        dagbag = DagBag(dag_folder=dag_file)
        for dag_id in dag_ids:
            dag = dagbag.get_dag(dag_id)
            dag.clear()

        scheduler = SchedulerJob(dag_ids=dag_ids,
                                 subdir=dag_directory,
                                 num_runs=1,
                                 **self.default_scheduler_args)
        scheduler.run()
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)
class BackfillJobTest(unittest.TestCase):

    def setUp(self):
        self.parser = cli.CLIFactory.get_parser()
        self.dagbag = DagBag()

    def test_backfill_examples(self):
        """
        Test backfilling example dags
        """
        dags = [
            dag for dag in self.dagbag.dags.values()
            if dag.dag_id in ('example_bash_operator',)]

        for dag in dags:
            dag.clear(
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE)

        for dag in dags:
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE)
            job.run()

    def test_trap_executor_error(self):
        """
        Test that errors setting up tasks (before tasks run) are caught

        Test for https://github.com/airbnb/airflow/pull/1220
        """
        dag = self.dagbag.get_dag('test_raise_executor_error')
        dag.clear()
        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE)

        # run with timeout because this creates an infinite loop if not
        # caught
        def run_with_timeout():
            with timeout(seconds=30):
                job.run()

        self.assertRaises(AirflowException, run_with_timeout)

    def test_backfill_pooled_tasks(self):
        """
        Test that queued tasks are executed by BackfillJob

        Test for https://github.com/airbnb/airflow/pull/1225
        """
        session = settings.Session()
        pool = Pool(pool='test_backfill_pooled_task_pool', slots=1)
        session.add(pool)
        session.commit()

        dag = self.dagbag.get_dag('test_backfill_pooled_task_dag')
        dag.clear()

        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE)

        # run with timeout because this creates an infinite loop if not
        # caught
        with timeout(seconds=30):
            job.run()

        ti = TI(
            task=dag.get_task('test_backfill_pooled_task'),
            execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)

    def test_backfill_depends_on_past(self):
        """
        Test that backfill respects ignore_depends_on_past
        """
        dag = self.dagbag.get_dag('test_depends_on_past')
        dag.clear()
        run_date = DEFAULT_DATE + datetime.timedelta(days=5)

        # backfill should deadlock
        self.assertRaisesRegexp(
            AirflowException,
            'BackfillJob is deadlocked',
            BackfillJob(dag=dag, start_date=run_date, end_date=run_date).run)

        BackfillJob(
            dag=dag,
            start_date=run_date,
            end_date=run_date,
            ignore_first_depends_on_past=True).run()

        # ti should have succeeded
        ti = TI(dag.tasks[0], run_date)
        ti.refresh_from_db()
        self.assertEquals(ti.state, State.SUCCESS)

    def test_cli_backfill_depends_on_past(self):
        """
        Test that CLI respects -I argument
        """
        dag_id = 'test_dagrun_states_deadlock'
        run_date = DEFAULT_DATE + datetime.timedelta(days=1)
        args = [
            'backfill',
            dag_id,
            '-l',
            '-s',
            run_date.isoformat(),
        ]
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        self.assertRaisesRegexp(
            AirflowException,
            'BackfillJob is deadlocked',
            cli.backfill,
            self.parser.parse_args(args))

        cli.backfill(self.parser.parse_args(args + ['-I']))
        ti = TI(dag.get_task('test_depends_on_past'), run_date)
        ti.refresh_from_db()
        # task ran
        self.assertEqual(ti.state, State.SUCCESS)
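# The same three-step pattern recurs throughout these tests: build a TI for
# a (task, execution_date) pair, refresh it from the database, and assert on
# its state. A hedged helper sketch; the name `assert_ti_state` is
# hypothetical and not used by the tests above.
def assert_ti_state(test_case, dag, task_id, execution_date, expected_state):
    ti = TI(dag.get_task(task_id), execution_date)
    ti.refresh_from_db()  # read the persisted state, not the in-memory one
    test_case.assertEqual(ti.state, expected_state)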
class SchedulerJobTest(unittest.TestCase):

    def setUp(self):
        self.dagbag = DagBag()

    @provide_session
    def evaluate_dagrun(
            self,
            dag_id,
            first_task_state,
            second_task_state,
            dagrun_state,
            run_kwargs=None,
            advance_execution_date=False,
            session=None):
        """
        Helper for testing DagRun states with simple two-task DAGs
        """
        if run_kwargs is None:
            run_kwargs = {}

        scheduler = SchedulerJob()
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        dr = scheduler.schedule_dag(dag)
        if advance_execution_date:
            # run a second time to schedule a dagrun after the start_date
            dr = scheduler.schedule_dag(dag)
        ex_date = dr.execution_date

        try:
            dag.run(start_date=ex_date, end_date=ex_date, **run_kwargs)
        except AirflowException:
            pass

        # test tasks
        task_1, task_2 = dag.tasks
        ti = TI(task_1, ex_date)
        ti.refresh_from_db()
        self.assertEqual(ti.state, first_task_state)
        ti = TI(task_2, ex_date)
        ti.refresh_from_db()
        self.assertEqual(ti.state, second_task_state)

        # load dagrun
        dr = session.query(DagRun).filter(
            DagRun.dag_id == dag.dag_id,
            DagRun.execution_date == ex_date
        ).first()

        # dagrun is running
        self.assertEqual(dr.state, State.RUNNING)

        dag.get_active_runs()

        # dagrun is now in the expected terminal state
        self.assertEqual(dr.state, dagrun_state)

    def test_dagrun_fail(self):
        """
        DagRuns with one failed and one incomplete root task -> FAILED
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_fail',
            first_task_state=State.FAILED,
            second_task_state=State.UPSTREAM_FAILED,
            dagrun_state=State.FAILED)

    def test_dagrun_success(self):
        """
        DagRuns with one failed and one successful root task -> SUCCESS
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_success',
            first_task_state=State.FAILED,
            second_task_state=State.SUCCESS,
            dagrun_state=State.SUCCESS)

    def test_dagrun_root_fail(self):
        """
        DagRuns with one successful and one failed root task -> FAILED
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_root_fail',
            first_task_state=State.SUCCESS,
            second_task_state=State.FAILED,
            dagrun_state=State.FAILED)

    def test_dagrun_deadlock(self):
        """
        Deadlocked DagRun is marked a failure

        Test that a deadlocked dagrun is marked as a failure by having
        depends_on_past and an execution_date after the start_date
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_deadlock',
            first_task_state=None,
            second_task_state=None,
            dagrun_state=State.FAILED,
            advance_execution_date=True)

    def test_scheduler_pooled_tasks(self):
        """
        Test that the scheduler handles queued tasks correctly
        See issue #1299
        """
        session = settings.Session()
        if not (
                session.query(Pool)
                .filter(Pool.pool == 'test_queued_pool')
                .first()):
            pool = Pool(pool='test_queued_pool', slots=5)
            session.merge(pool)
            session.commit()
        session.close()

        dag_id = 'test_scheduled_queued_tasks'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        scheduler = SchedulerJob(dag_id, num_runs=10)
        scheduler.run()

        task_1 = dag.tasks[0]
        ti = TI(task_1, dag.start_date)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.FAILED)

        dag.clear()

    def test_dagrun_deadlock_ignore_depends_on_past_advance_ex_date(self):
        """
        DagRun is marked a success if ignore_first_depends_on_past=True

        Test that an otherwise-deadlocked dagrun is marked as a success
        if ignore_first_depends_on_past=True and the dagrun execution_date
        is after the start_date.
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_deadlock',
            first_task_state=State.SUCCESS,
            second_task_state=State.SUCCESS,
            dagrun_state=State.SUCCESS,
            advance_execution_date=True,
            run_kwargs=dict(ignore_first_depends_on_past=True))

    def test_dagrun_deadlock_ignore_depends_on_past(self):
        """
        Test that ignore_first_depends_on_past doesn't affect results
        (this is the same test as
        test_dagrun_deadlock_ignore_depends_on_past_advance_ex_date except
        that start_date == execution_date so depends_on_past is irrelevant).
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_deadlock',
            first_task_state=State.SUCCESS,
            second_task_state=State.SUCCESS,
            dagrun_state=State.SUCCESS,
            run_kwargs=dict(ignore_first_depends_on_past=True))
class SchedulerJobTest(unittest.TestCase):

    def setUp(self):
        self.dagbag = DagBag()

    @provide_session
    def evaluate_dagrun(
            self,
            dag_id,
            expected_task_states,  # dict of task_id: state
            dagrun_state,
            run_kwargs=None,
            advance_execution_date=False,
            session=None):
        """
        Helper for testing DagRun states with simple two-task DAGs.
        This is hackish: a dag run is created but its tasks are
        run by a backfill.
        """
        if run_kwargs is None:
            run_kwargs = {}

        scheduler = SchedulerJob()
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        dr = scheduler.schedule_dag(dag)

        if advance_execution_date:
            # run a second time to schedule a dagrun after the start_date
            dr = scheduler.schedule_dag(dag)
        ex_date = dr.execution_date

        try:
            dag.run(start_date=ex_date, end_date=ex_date, **run_kwargs)
        except AirflowException:
            pass

        # test tasks
        for task_id, expected_state in expected_task_states.items():
            task = dag.get_task(task_id)
            ti = TI(task, ex_date)
            ti.refresh_from_db()
            self.assertEqual(ti.state, expected_state)

        # load dagrun
        dr = DagRun.find(dag_id=dag_id, execution_date=ex_date)
        dr = dr[0]
        dr.dag = dag

        # dagrun is running
        self.assertEqual(dr.state, State.RUNNING)

        dr.update_state()

        # dagrun is now in the expected terminal state
        self.assertEqual(dr.state, dagrun_state)

    def test_dagrun_fail(self):
        """
        DagRuns with one failed and one incomplete root task -> FAILED
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_fail',
            expected_task_states={
                'test_dagrun_fail': State.FAILED,
                'test_dagrun_succeed': State.UPSTREAM_FAILED,
            },
            dagrun_state=State.FAILED)

    def test_dagrun_success(self):
        """
        DagRuns with one failed and one successful root task -> SUCCESS
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_success',
            expected_task_states={
                'test_dagrun_fail': State.FAILED,
                'test_dagrun_succeed': State.SUCCESS,
            },
            dagrun_state=State.SUCCESS)

    def test_dagrun_root_fail(self):
        """
        DagRuns with one successful and one failed root task -> FAILED
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_root_fail',
            expected_task_states={
                'test_dagrun_succeed': State.SUCCESS,
                'test_dagrun_fail': State.FAILED,
            },
            dagrun_state=State.FAILED)

    def test_dagrun_deadlock(self):
        """
        Deadlocked DagRun is marked a failure

        Test that a deadlocked dagrun is marked as a failure by having
        depends_on_past and an execution_date after the start_date
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_deadlock',
            expected_task_states={
                'test_depends_on_past': None,
                'test_depends_on_past_2': None,
            },
            dagrun_state=State.FAILED,
            advance_execution_date=True)

    def test_scheduler_pooled_tasks(self):
        """
        Test that the scheduler handles queued tasks correctly
        See issue #1299
        """
        session = settings.Session()
        if not (
                session.query(Pool)
                .filter(Pool.pool == 'test_queued_pool')
                .first()):
            pool = Pool(pool='test_queued_pool', slots=5)
            session.merge(pool)
            session.commit()
        session.close()

        dag_id = 'test_scheduled_queued_tasks'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        scheduler = SchedulerJob(dag_id, num_runs=1)
        scheduler.run()

        task_1 = dag.tasks[0]
        logging.info("Trying to find task {}".format(task_1))
        ti = TI(task_1, dag.start_date)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.QUEUED)

        # now we use a DIFFERENT scheduler and executor
        # to simulate the num-runs CLI arg
        scheduler2 = SchedulerJob(
            dag_id,
            num_runs=5,
            executor=DEFAULT_EXECUTOR.__class__())
        scheduler2.run()

        ti.refresh_from_db()
        self.assertEqual(ti.state, State.FAILED)
        dag.clear()

    def test_dagrun_deadlock_ignore_depends_on_past_advance_ex_date(self):
        """
        DagRun is marked a success if ignore_first_depends_on_past=True

        Test that an otherwise-deadlocked dagrun is marked as a success
        if ignore_first_depends_on_past=True and the dagrun execution_date
        is after the start_date.
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_deadlock',
            expected_task_states={
                'test_depends_on_past': State.SUCCESS,
                'test_depends_on_past_2': State.SUCCESS,
            },
            dagrun_state=State.SUCCESS,
            advance_execution_date=True,
            run_kwargs=dict(ignore_first_depends_on_past=True))

    def test_dagrun_deadlock_ignore_depends_on_past(self):
        """
        Test that ignore_first_depends_on_past doesn't affect results
        (this is the same test as
        test_dagrun_deadlock_ignore_depends_on_past_advance_ex_date except
        that start_date == execution_date so depends_on_past is irrelevant).
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_deadlock',
            expected_task_states={
                'test_depends_on_past': State.SUCCESS,
                'test_depends_on_past_2': State.SUCCESS,
            },
            dagrun_state=State.SUCCESS,
            run_kwargs=dict(ignore_first_depends_on_past=True))

    def test_scheduler_start_date(self):
        """
        Test that the scheduler respects start_dates, even when DAGs have run
        """
        dag_id = 'test_start_date_scheduling'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        self.assertTrue(dag.start_date > DEFAULT_DATE)

        scheduler = SchedulerJob(dag_id, num_runs=2)
        scheduler.run()

        # zero tasks ran
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 0)

        # previously, running this backfill would kick off the Scheduler
        # because it would take the most recent run and start from there.
        # That behavior still exists, but now it will only do so if the run
        # is after the start date
        backfill = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE)
        backfill.run()

        # one task ran
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)

        scheduler = SchedulerJob(dag_id, num_runs=2)
        scheduler.run()

        # still one task
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)

    def test_scheduler_multiprocessing(self):
        """
        Test that the scheduler can successfully queue multiple dags in
        parallel
        """
        dag_ids = ['test_start_date_scheduling',
                   'test_dagrun_states_success']
        for dag_id in dag_ids:
            dag = self.dagbag.get_dag(dag_id)
            dag.clear()

        scheduler = SchedulerJob(dag_ids=dag_ids, num_runs=2)
        scheduler.run()

        # zero tasks ran
        dag_id = 'test_start_date_scheduling'
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 0)

    def test_scheduler_dagrun_once(self):
        """
        Test that the scheduler does not create multiple dagruns
        if a dag is scheduled with @once and a start_date
        """
        dag = DAG(
            'test_scheduler_dagrun_once',
            start_date=datetime.datetime(2015, 1, 1),
            schedule_interval="@once")

        scheduler = SchedulerJob()
        dag.clear()
        dr = scheduler.schedule_dag(dag)
        self.assertIsNotNone(dr)
        dr = scheduler.schedule_dag(dag)
        self.assertIsNone(dr)

    def test_scheduler_process_execute_task(self):
        """
        Test if process dag sends a task to the executor
        """
        dag = DAG(
            dag_id='test_scheduler_process_execute_task',
            start_date=DEFAULT_DATE)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()
        dr = scheduler.schedule_dag(dag)
        self.assertIsNotNone(dr)

        queue = mock.Mock()
        scheduler.process_dag(dag, queue=queue)

        queue.put.assert_called_with(
            ((dag.dag_id, dag_task1.task_id, DEFAULT_DATE), None)
        )

        tis = dr.get_task_instances(state=State.SCHEDULED)
        self.assertIsNotNone(tis)

    def test_scheduler_process_check_heartrate(self):
        """
        Test if process dag honors the heartrate
        """
        dag = DAG(
            dag_id='test_scheduler_process_check_heartrate',
            start_date=DEFAULT_DATE)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        orm_dag.last_scheduler_run = datetime.datetime.now()
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        scheduler.heartrate = 1000

        dag.clear()

        dr = scheduler.schedule_dag(dag)
        self.assertIsNotNone(dr)

        queue = mock.Mock()
        scheduler.process_dag(dag, queue=queue)

        queue.put.assert_not_called()
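# test_scheduler_process_check_heartrate above asserts that a DAG whose
# last_scheduler_run is recent gets skipped. A minimal sketch of that gate;
# the function name is hypothetical and the real check lives inside the
# scheduler's process_dag.
def is_due_for_processing(last_scheduler_run, heartrate, now=None):
    now = now or datetime.datetime.now()
    if last_scheduler_run is None:
        return True  # never processed before, so always due
    return (now - last_scheduler_run).total_seconds() > heartrate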
class SchedulerJobTest(unittest.TestCase):

    def setUp(self):
        self.dagbag = DagBag()

    @provide_session
    def evaluate_dagrun(
            self,
            dag_id,
            expected_task_states,  # dict of task_id: state
            dagrun_state,
            run_kwargs=None,
            advance_execution_date=False,
            session=None):
        """
        Helper for testing DagRun states with simple two-task DAGs
        """
        if run_kwargs is None:
            run_kwargs = {}

        scheduler = SchedulerJob()
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        dr = scheduler.schedule_dag(dag)

        if advance_execution_date:
            # run a second time to schedule a dagrun after the start_date
            dr = scheduler.schedule_dag(dag)
        ex_date = dr.execution_date

        try:
            dag.run(start_date=ex_date, end_date=ex_date, **run_kwargs)
        except AirflowException:
            pass

        # test tasks
        for task_id, expected_state in expected_task_states.items():
            task = dag.get_task(task_id)
            ti = TI(task, ex_date)
            ti.refresh_from_db()
            self.assertEqual(ti.state, expected_state)

        # load dagrun
        dr = session.query(DagRun).filter(
            DagRun.dag_id == dag.dag_id,
            DagRun.execution_date == ex_date
        ).first()

        # dagrun is running
        self.assertEqual(dr.state, State.RUNNING)

        dag.get_active_runs()

        # dagrun is now in the expected terminal state
        self.assertEqual(dr.state, dagrun_state)

    def test_dagrun_fail(self):
        """
        DagRuns with one failed and one incomplete root task -> FAILED
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_fail',
            expected_task_states={
                'test_dagrun_fail': State.FAILED,
                'test_dagrun_succeed': State.UPSTREAM_FAILED,
            },
            dagrun_state=State.FAILED)

    def test_dagrun_success(self):
        """
        DagRuns with one failed and one successful root task -> SUCCESS
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_success',
            expected_task_states={
                'test_dagrun_fail': State.FAILED,
                'test_dagrun_succeed': State.SUCCESS,
            },
            dagrun_state=State.SUCCESS)

    def test_dagrun_root_fail(self):
        """
        DagRuns with one successful and one failed root task -> FAILED
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_root_fail',
            expected_task_states={
                'test_dagrun_succeed': State.SUCCESS,
                'test_dagrun_fail': State.FAILED,
            },
            dagrun_state=State.FAILED)

    def test_dagrun_deadlock(self):
        """
        Deadlocked DagRun is marked a failure

        Test that a deadlocked dagrun is marked as a failure by having
        depends_on_past and an execution_date after the start_date
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_deadlock',
            expected_task_states={
                'test_depends_on_past': None,
                'test_depends_on_past_2': None,
            },
            dagrun_state=State.FAILED,
            advance_execution_date=True)

    def test_scheduler_pooled_tasks(self):
        """
        Test that the scheduler handles queued tasks correctly
        See issue #1299
        """
        session = settings.Session()
        if not (
                session.query(Pool)
                .filter(Pool.pool == 'test_queued_pool')
                .first()):
            pool = Pool(pool='test_queued_pool', slots=5)
            session.merge(pool)
            session.commit()
        session.close()

        dag_id = 'test_scheduled_queued_tasks'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        scheduler = SchedulerJob(dag_id, num_runs=10)
        scheduler.run()

        task_1 = dag.tasks[0]
        logging.info("Trying to find task {}".format(task_1))
        ti = TI(task_1, dag.start_date)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.FAILED)

        dag.clear()

    def test_dagrun_deadlock_ignore_depends_on_past_advance_ex_date(self):
        """
        DagRun is marked a success if ignore_first_depends_on_past=True

        Test that an otherwise-deadlocked dagrun is marked as a success
        if ignore_first_depends_on_past=True and the dagrun execution_date
        is after the start_date.
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_deadlock',
            expected_task_states={
                'test_depends_on_past': State.SUCCESS,
                'test_depends_on_past_2': State.SUCCESS,
            },
            dagrun_state=State.SUCCESS,
            advance_execution_date=True,
            run_kwargs=dict(ignore_first_depends_on_past=True))

    def test_dagrun_deadlock_ignore_depends_on_past(self):
        """
        Test that ignore_first_depends_on_past doesn't affect results
        (this is the same test as
        test_dagrun_deadlock_ignore_depends_on_past_advance_ex_date except
        that start_date == execution_date so depends_on_past is irrelevant).
        """
        self.evaluate_dagrun(
            dag_id='test_dagrun_states_deadlock',
            expected_task_states={
                'test_depends_on_past': State.SUCCESS,
                'test_depends_on_past_2': State.SUCCESS,
            },
            dagrun_state=State.SUCCESS,
            run_kwargs=dict(ignore_first_depends_on_past=True))

    def test_scheduler_start_date(self):
        """
        Test that the scheduler respects start_dates, even when DAGs have run
        """
        dag_id = 'test_start_date_scheduling'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        self.assertTrue(dag.start_date > DEFAULT_DATE)

        scheduler = SchedulerJob(dag_id, num_runs=2)
        scheduler.run()

        # zero tasks ran
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 0)

        # previously, running this backfill would kick off the Scheduler
        # because it would take the most recent run and start from there.
        # That behavior still exists, but now it will only do so if the run
        # is after the start date
        backfill = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE)
        backfill.run()

        # one task ran
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)

        scheduler = SchedulerJob(dag_id, num_runs=2)
        scheduler.run()

        # still one task
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 1)

    def test_scheduler_multiprocessing(self):
        """
        Test that the scheduler can successfully queue multiple dags in
        parallel
        """
        dag_ids = ['test_start_date_scheduling',
                   'test_dagrun_states_success']
        for dag_id in dag_ids:
            dag = self.dagbag.get_dag(dag_id)
            dag.clear()

        scheduler = SchedulerJob(dag_ids=dag_ids, num_runs=2)
        scheduler.run()

        # zero tasks ran
        dag_id = 'test_start_date_scheduling'
        session = settings.Session()
        self.assertEqual(
            len(session.query(TI).filter(TI.dag_id == dag_id).all()), 0)