def test_subdag_deadlock(self): dagbag = DagBag() dag = dagbag.get_dag('test_subdag_deadlock') dag.clear() subdag = dagbag.get_dag('test_subdag_deadlock.subdag') subdag.clear() # first make sure subdag has failed self.assertRaises(AirflowException, subdag.run, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) # now make sure dag picks up the subdag error self.assertRaises(AirflowException, dag.run, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
def downgrade(): engine = settings.engine if engine.dialect.has_table(engine, 'task_instance'): connection = op.get_bind() sessionmaker = sa.orm.sessionmaker() session = sessionmaker(bind=connection) dagbag = DagBag(settings.DAGS_FOLDER) query = session.query(sa.func.count(TaskInstance.max_tries)).filter( TaskInstance.max_tries != -1 ) while query.scalar(): tis = session.query(TaskInstance).filter( TaskInstance.max_tries != -1 ).limit(BATCH_SIZE).all() for ti in tis: dag = dagbag.get_dag(ti.dag_id) if not dag or not dag.has_task(ti.task_id): ti.try_number = 0 else: task = dag.get_task(ti.task_id) # max_tries - try_number is number of times a task instance # left to retry by itself. So the current try_number should be # max number of self retry (task.retries) minus number of # times left for task instance to try the task. ti.try_number = max(0, task.retries - (ti.max_tries - ti.try_number)) ti.max_tries = -1 session.merge(ti) session.commit() session.commit() op.drop_column('task_instance', 'max_tries')
def get_task_instance(dag_id, task_id, execution_date): """Return the task object identified by the given dag_id and task_id.""" dagbag = DagBag() # Check DAG exists. if dag_id not in dagbag.dags: error_message = "Dag id {} not found".format(dag_id) raise DagNotFound(error_message) # Get DAG object and check Task Exists dag = dagbag.get_dag(dag_id) if not dag.has_task(task_id): error_message = 'Task {} not found in dag {}'.format(task_id, dag_id) raise TaskNotFound(error_message) # Get DagRun object and check that it exists dagrun = dag.get_dagrun(execution_date=execution_date) if not dagrun: error_message = ('Dag Run for date {} not found in dag {}' .format(execution_date, dag_id)) raise DagRunNotFound(error_message) # Get task instance object and check that it exists task_instance = dagrun.get_task_instance(task_id) if not task_instance: error_message = ('Task {} instance for date {} not found' .format(task_id, execution_date)) raise TaskInstanceNotFound(error_message) return task_instance
def test_trigger_dag_for_date(self): url_template = '/api/experimental/dags/{}/dag_runs' dag_id = 'example_bash_operator' hour_from_now = utcnow() + timedelta(hours=1) execution_date = datetime(hour_from_now.year, hour_from_now.month, hour_from_now.day, hour_from_now.hour) datetime_string = execution_date.isoformat() # Test Correct execution response = self.client.post(url_template.format(dag_id), data=json.dumps( {'execution_date': datetime_string}), content_type="application/json") self.assertEqual(200, response.status_code) dagbag = DagBag() dag = dagbag.get_dag(dag_id) dag_run = dag.get_dagrun(execution_date) self.assertTrue( dag_run, 'Dag Run not found for execution date {}'.format(execution_date)) # Test error for nonexistent dag response = self.client.post( url_template.format('does_not_exist_dag'), data=json.dumps({'execution_date': execution_date.isoformat()}), content_type="application/json") self.assertEqual(404, response.status_code) # Test error for bad datetime format response = self.client.post(url_template.format(dag_id), data=json.dumps( {'execution_date': 'not_a_datetime'}), content_type="application/json") self.assertEqual(400, response.status_code)
def get_task(dag_id, task_id): """Return the task object identified by the given dag_id and task_id.""" dagbag = DagBag() # Check DAG exists. if dag_id not in dagbag.dags: error_message = "Dag id {} not found".format(dag_id) raise DagNotFound(error_message) # Get DAG object and check Task Exists dag = dagbag.get_dag(dag_id) if not dag.has_task(task_id): error_message = 'Task {} not found in dag {}'.format(task_id, dag_id) raise TaskNotFound(error_message) # Return the task. return dag.get_task(task_id)
def upgrade(): op.add_column('task_instance', sa.Column('max_tries', sa.Integer, server_default="-1")) # Check if table task_instance exist before data migration. This check is # needed for database that does not create table until migration finishes. # Checking task_instance table exists prevent the error of querying # non-existing task_instance table. connection = op.get_bind() inspector = Inspector.from_engine(connection) tables = inspector.get_table_names() if 'task_instance' in tables: # Get current session sessionmaker = sa.orm.sessionmaker() session = sessionmaker(bind=connection) dagbag = DagBag(settings.DAGS_FOLDER) query = session.query(sa.func.count(TaskInstance.max_tries)).filter( TaskInstance.max_tries == -1 ) # Separate db query in batch to prevent loading entire table # into memory and cause out of memory error. while query.scalar(): tis = session.query(TaskInstance).filter( TaskInstance.max_tries == -1 ).limit(BATCH_SIZE).all() for ti in tis: dag = dagbag.get_dag(ti.dag_id) if not dag or not dag.has_task(ti.task_id): # task_instance table might not have the up-to-date # information, i.e dag or task might be modified or # deleted in dagbag but is reflected in task instance # table. In this case we do not retry the task that can't # be parsed. ti.max_tries = ti.try_number else: task = dag.get_task(ti.task_id) if task.retries: ti.max_tries = task.retries else: ti.max_tries = ti.try_number session.merge(ti) session.commit() # Commit the current session. session.commit()
def get_dag_run_state(dag_id, execution_date): """Return the task object identified by the given dag_id and task_id.""" dagbag = DagBag() # Check DAG exists. if dag_id not in dagbag.dags: error_message = "Dag id {} not found".format(dag_id) raise DagNotFound(error_message) # Get DAG object and check Task Exists dag = dagbag.get_dag(dag_id) # Get DagRun object and check that it exists dagrun = dag.get_dagrun(execution_date=execution_date) if not dagrun: error_message = ('Dag Run for date {} not found in dag {}'.format( execution_date, dag_id)) raise DagRunNotFound(error_message) return {'state': dagrun.get_state()}
def test_find_zombies(self): manager = DagFileProcessorManager( dag_directory='directory', file_paths=['abc.txt'], max_runs=1, processor_factory=MagicMock().return_value, signal_conn=MagicMock(), stat_queue=MagicMock(), result_queue=MagicMock, async_mode=True) dagbag = DagBag(TEST_DAG_FOLDER) with create_session() as session: session.query(LJ).delete() dag = dagbag.get_dag('example_branch_operator') task = dag.get_task(task_id='run_this_first') ti = TI(task, DEFAULT_DATE, State.RUNNING) lj = LJ(ti) lj.state = State.SHUTDOWN lj.id = 1 ti.job_id = lj.id session.add(lj) session.add(ti) session.commit() manager._last_zombie_query_time = timezone.utcnow() - timedelta( seconds=manager._zombie_threshold_secs + 1) zombies = manager._find_zombies() self.assertEquals(1, len(zombies)) self.assertIsInstance(zombies[0], SimpleTaskInstance) self.assertEquals(ti.dag_id, zombies[0].dag_id) self.assertEquals(ti.task_id, zombies[0].task_id) self.assertEquals(ti.execution_date, zombies[0].execution_date) session.query(TI).delete() session.query(LJ).delete()
class ImpersonationTest(unittest.TestCase): def setUp(self): self.dagbag = DagBag( dag_folder=TEST_DAG_FOLDER, include_examples=False, ) logger.info('Loaded DAGS:') logger.info(self.dagbag.dagbag_report()) try: subprocess.check_output( ['sudo', 'useradd', '-m', TEST_USER, '-g', str(os.getegid())]) except OSError as e: if e.errno == errno.ENOENT: raise unittest.SkipTest( "The 'useradd' command did not exist so unable to test " "impersonation; Skipping Test. These tests can only be run on a " "linux host that supports 'useradd'.") else: raise unittest.SkipTest( "The 'useradd' command exited non-zero; Skipping tests. Does the " "current user have permission to run 'useradd' without a password " "prompt (check sudoers file)?") def tearDown(self): subprocess.check_output(['sudo', 'userdel', '-r', TEST_USER]) def run_backfill(self, dag_id, task_id): dag = self.dagbag.get_dag(dag_id) dag.clear() jobs.BackfillJob(dag=dag, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE).run() ti = models.TaskInstance(task=dag.get_task(task_id), execution_date=DEFAULT_DATE) ti.refresh_from_db() self.assertEqual(ti.state, State.SUCCESS) def test_impersonation(self): """ Tests that impersonating a unix user works """ self.run_backfill('test_impersonation', 'test_impersonated_user') def test_no_impersonation(self): """ If default_impersonation=None, tests that the job is run as the current user (which will be a sudoer) """ self.run_backfill( 'test_no_impersonation', 'test_superuser', ) def test_default_impersonation(self): """ If default_impersonation=TEST_USER, tests that the job defaults to running as TEST_USER for a test without run_as_user set """ os.environ['AIRFLOW__CORE__DEFAULT_IMPERSONATION'] = TEST_USER try: self.run_backfill('test_default_impersonation', 'test_deelevated_user') finally: del os.environ['AIRFLOW__CORE__DEFAULT_IMPERSONATION'] def test_impersonation_custom(self): """ Tests that impersonation using a unix user works with custom packages in PYTHONPATH """ # PYTHONPATH is already set in script triggering tests assert 'PYTHONPATH' in os.environ self.run_backfill('impersonation_with_custom_pkg', 'exec_python_fn') def test_impersonation_subdag(self): """ Tests that impersonation using a subdag correctly passes the right configuration :return: """ self.run_backfill('impersonation_subdag', 'test_subdag_operation')
def _run_dag(self): dag_bag = DagBag(dag_folder=TESTS_DAG_FOLDER, include_examples=False) self.args = {'owner': 'airflow', 'start_date': DEFAULT_DATE} dag = dag_bag.get_dag(self.dag_id) dag.clear(reset_dag_runs=True) dag.run(ignore_first_depends_on_past=True, verbose=True)