def _find_zombies(self, session):
    """
    Find zombie task instances, which are tasks that haven't had a
    heartbeat for too long.

    :return: Zombie task instances in SimpleTaskInstance format.
    """
    now = timezone.utcnow()
    zombies = []
    if (now - self._last_zombie_query_time).total_seconds() \
            > self._zombie_query_interval:
        # to avoid circular imports
        from airflow.jobs import LocalTaskJob as LJ
        self.log.info("Finding 'running' jobs without a recent heartbeat")
        TI = airflow.models.TaskInstance
        limit_dttm = timezone.utcnow() - timedelta(
            seconds=self._zombie_threshold_secs)
        self.log.info("Failing jobs without heartbeat after %s", limit_dttm)

        tis = (
            session.query(TI)
            .join(LJ, TI.job_id == LJ.id)
            .filter(TI.state == State.RUNNING)
            .filter(
                or_(
                    LJ.state != State.RUNNING,
                    LJ.latest_heartbeat < limit_dttm,
                )
            ).all()
        )
        self._last_zombie_query_time = timezone.utcnow()
        for ti in tis:
            zombies.append(SimpleTaskInstance(ti))

    return zombies
def assert_tasks_on_executor(self, executor):
    # start the executor
    executor.start()

    success_command = ['true', 'some_parameter']
    fail_command = ['false', 'some_parameter']

    executor.execute_async(key='success', command=success_command)
    executor.execute_async(key='fail', command=fail_command)

    success_future = next(
        k for k, v in executor.futures.items() if v == 'success')
    fail_future = next(
        k for k, v in executor.futures.items() if v == 'fail')

    # wait for the futures to execute, with a timeout
    timeout = timezone.utcnow() + timedelta(seconds=30)
    while not (success_future.done() and fail_future.done()):
        if timezone.utcnow() > timeout:
            raise ValueError(
                'The futures should have finished; there is probably '
                'an error communicating with the Dask cluster.')

    # both tasks should have finished
    self.assertTrue(success_future.done())
    self.assertTrue(fail_future.done())

    # check task exceptions
    self.assertTrue(success_future.exception() is None)
    self.assertTrue(fail_future.exception() is not None)
def test_file_transfer_no_intermediate_dir_error_get(self):
    configuration.conf.set("core", "enable_xcom_pickling", "True")
    test_remote_file_content = \
        "This is remote file content \n which is also multiline " \
        "another line here \n this is last line. EOF"

    # create a test file remotely
    create_file_task = SSHOperator(
        task_id="test_create_file",
        ssh_hook=self.hook,
        command="echo '{0}' > {1}".format(test_remote_file_content,
                                          self.test_remote_filepath),
        do_xcom_push=True,
        dag=self.dag
    )
    self.assertIsNotNone(create_file_task)
    ti1 = TaskInstance(task=create_file_task, execution_date=timezone.utcnow())
    ti1.run()

    # Try to GET test file from remote.
    # This should raise an error with "No such file" as the directory
    # does not exist.
    with self.assertRaises(Exception) as error:
        get_test_task = SFTPOperator(
            task_id="test_sftp",
            ssh_hook=self.hook,
            local_filepath=self.test_local_filepath_int_dir,
            remote_filepath=self.test_remote_filepath,
            operation=SFTPOperation.GET,
            dag=self.dag
        )
        self.assertIsNotNone(get_test_task)
        ti2 = TaskInstance(task=get_test_task, execution_date=timezone.utcnow())
        ti2.run()
    self.assertIn('No such file', str(error.exception))
def heartbeat(self):
    """
    Override the scheduler heartbeat to determine when the test is complete.
    """
    super(SchedulerMetricsJob, self).heartbeat()
    session = settings.Session()
    # Get all the relevant task instances
    TI = TaskInstance
    successful_tis = (
        session
        .query(TI)
        .filter(TI.dag_id.in_(DAG_IDS))
        .filter(TI.state.in_([State.SUCCESS]))
        .all()
    )
    session.commit()

    dagbag = DagBag(SUBDIR)
    dags = [dagbag.dags[dag_id] for dag_id in DAG_IDS]
    # the tasks in perf_dag_1 and perf_dag_2 have a daily schedule interval.
    num_task_instances = sum([(timezone.utcnow() - task.start_date).days
                              for dag in dags for task in dag.tasks])

    if (len(successful_tis) == num_task_instances or
            (timezone.utcnow() - self.start_date).total_seconds() >
            MAX_RUNTIME_SECS):
        if len(successful_tis) == num_task_instances:
            self.log.info("All tasks processed! Printing stats.")
        else:
            self.log.info("Test timeout reached. "
                          "Printing available stats.")
        self.print_stats()
        set_dags_paused_state(True)
        sys.exit()
def test_dask_executor_functions(self):
    executor = DaskExecutor(cluster_address=self.cluster.scheduler_address)

    # start the executor
    executor.start()

    success_command = 'echo 1'
    fail_command = 'exit 1'

    executor.execute_async(key='success', command=success_command)
    executor.execute_async(key='fail', command=fail_command)

    success_future = next(
        k for k, v in executor.futures.items() if v == 'success')
    fail_future = next(
        k for k, v in executor.futures.items() if v == 'fail')

    # wait for the futures to execute, with a timeout
    timeout = timezone.utcnow() + timedelta(seconds=30)
    while not (success_future.done() and fail_future.done()):
        if timezone.utcnow() > timeout:
            raise ValueError(
                'The futures should have finished; there is probably '
                'an error communicating with the Dask cluster.')

    # both tasks should have finished
    self.assertTrue(success_future.done())
    self.assertTrue(fail_future.done())

    # check task exceptions
    self.assertTrue(success_future.exception() is None)
    self.assertTrue(fail_future.exception() is not None)
def test_symlink_latest_log_directory(self):
    handler = FileProcessorHandler(base_log_folder=self.base_log_folder,
                                   filename_template=self.filename)
    handler.dag_dir = self.dag_dir

    date1 = (timezone.utcnow() + timedelta(days=1)).strftime("%Y-%m-%d")
    date2 = (timezone.utcnow() + timedelta(days=2)).strftime("%Y-%m-%d")

    p1 = os.path.join(self.base_log_folder, date1, "log1")
    p2 = os.path.join(self.base_log_folder, date1, "log2")

    if os.path.exists(p1):
        os.remove(p1)
    if os.path.exists(p2):
        os.remove(p2)

    link = os.path.join(self.base_log_folder, "latest")

    with freeze_time(date1):
        handler.set_context(filename=os.path.join(self.dag_dir, "log1"))
        self.assertTrue(os.path.islink(link))
        self.assertEqual(os.path.basename(os.readlink(link)), date1)
        self.assertTrue(os.path.exists(os.path.join(link, "log1")))

    with freeze_time(date2):
        handler.set_context(filename=os.path.join(self.dag_dir, "log2"))
        self.assertTrue(os.path.islink(link))
        self.assertEqual(os.path.basename(os.readlink(link)), date2)
        self.assertTrue(os.path.exists(os.path.join(link, "log2")))
def test_should_fail_before_reschedule_date_multiple(self, find_for_task_instance):
    find_for_task_instance.return_value = [
        self._get_task_reschedule(utcnow() - timedelta(minutes=19)),
        self._get_task_reschedule(utcnow() - timedelta(minutes=9)),
        self._get_task_reschedule(utcnow() + timedelta(minutes=1)),
    ]
    ti = self._get_task_instance(State.UP_FOR_RESCHEDULE)

    self.assertFalse(ReadyToRescheduleDep().is_met(ti=ti))
def poke(self, context):
    self.log.info('Poking until weekday is in %s, Today is %s',
                  self.week_day,
                  WeekDay(timezone.utcnow().isoweekday()).name)
    if self.use_task_execution_day:
        return context['execution_date'].isoweekday() in self._week_day_num
    else:
        return timezone.utcnow().isoweekday() in self._week_day_num
def _print_stat(self):
    """
    Occasionally print out stats about how fast the files are getting processed
    """
    if ((timezone.utcnow() - self.last_stat_print_time).total_seconds() >
            self.print_stats_interval):
        if len(self._file_paths) > 0:
            self._log_file_processing_stats(self._file_paths)
        self.last_stat_print_time = timezone.utcnow()
def execute(self, context):
    started_at = timezone.utcnow()
    while not self.poke(context):
        if (timezone.utcnow() - started_at).total_seconds() > self.timeout:
            if self.soft_fail:
                raise AirflowSkipException('Snap. Time is OUT.')
            else:
                raise AirflowSensorTimeout('Snap. Time is OUT.')
        sleep(self.poke_interval)
    self.log.info("Success criteria met. Exiting.")
def execute(self, context):
    started_at = timezone.utcnow()
    while not self.poke(context):
        if (timezone.utcnow() - started_at).total_seconds() > self.timeout:
            # If sensor is in soft fail mode but will be retried then
            # give it a chance and fail with timeout.
            # This gives the ability to set up non-blocking AND soft-fail sensors.
            if self.soft_fail and not context['ti'].is_eligible_to_retry():
                self._do_skip_downstream_tasks(context)
                raise AirflowSkipException('Snap. Time is OUT.')
            else:
                raise AirflowSensorTimeout('Snap. Time is OUT.')
        sleep(self.poke_interval)
    self.log.info("Success criteria met. Exiting.")
def test_post_execute_hook(self):
    """
    Test that post_execute hook is called with the Operator's result.
    The result ('error') will cause an error to be raised and trapped.
    """

    class TestError(Exception):
        pass

    class TestOperator(PythonOperator):
        def post_execute(self, context, result):
            if result == 'error':
                raise TestError('expected error.')

    dag = models.DAG(dag_id='test_post_execute_dag')
    task = TestOperator(
        task_id='test_operator',
        dag=dag,
        python_callable=lambda: 'error',
        owner='airflow',
        start_date=timezone.datetime(2017, 2, 1))
    ti = TI(task=task, execution_date=timezone.utcnow())

    with self.assertRaises(TestError):
        ti.run()
def test_xcom_pull_different_execution_date(self):
    """
    tests xcom fetch behavior with different execution dates, using
    both xcom_pull with "include_prior_dates" and without
    """
    key = 'xcom_key'
    value = 'xcom_value'

    dag = models.DAG(dag_id='test_xcom', schedule_interval='@monthly')
    task = DummyOperator(
        task_id='test_xcom',
        dag=dag,
        pool='test_xcom',
        owner='airflow',
        start_date=timezone.datetime(2016, 6, 2, 0, 0, 0))
    exec_date = timezone.utcnow()
    ti = TI(
        task=task, execution_date=exec_date)
    ti.run(mark_success=True)
    ti.xcom_push(key=key, value=value)
    self.assertEqual(ti.xcom_pull(task_ids='test_xcom', key=key), value)
    ti.run()
    exec_date += datetime.timedelta(days=1)
    ti = TI(
        task=task, execution_date=exec_date)
    ti.run()
    # We have set a new execution date (and did not pass in
    # 'include_prior_dates'), which means this task should now have a cleared
    # xcom value
    self.assertEqual(ti.xcom_pull(task_ids='test_xcom', key=key), None)
    # We *should* get a value using 'include_prior_dates'
    self.assertEqual(ti.xcom_pull(task_ids='test_xcom',
                                  key=key,
                                  include_prior_dates=True),
                     value)
def health():
    """
    An endpoint helping check the health status of the Airflow instance,
    including metadatabase and scheduler.
    """
    session = settings.Session()
    BJ = jobs.BaseJob
    payload = {}
    scheduler_health_check_threshold = timedelta(
        seconds=conf.getint('scheduler', 'scheduler_health_check_threshold'))

    latest_scheduler_heartbeat = None
    payload['metadatabase'] = {'status': 'healthy'}
    try:
        latest_scheduler_heartbeat = session.query(func.max(BJ.latest_heartbeat)). \
            filter(BJ.state == 'running',
                   BJ.job_type == 'SchedulerJob'). \
            scalar()
    except Exception:
        payload['metadatabase']['status'] = 'unhealthy'

    if not latest_scheduler_heartbeat:
        scheduler_status = 'unhealthy'
    else:
        if timezone.utcnow() - latest_scheduler_heartbeat <= scheduler_health_check_threshold:
            scheduler_status = 'healthy'
        else:
            scheduler_status = 'unhealthy'

    payload['scheduler'] = {'status': scheduler_status,
                            'latest_scheduler_heartbeat': str(latest_scheduler_heartbeat)}

    return wwwutils.json_response(payload)
def _log_file_processing_stats(self, known_file_paths):
    """
    Print out stats about how files are getting processed.

    :param known_file_paths: a list of file paths that may contain Airflow
        DAG definitions
    :type known_file_paths: list[unicode]
    :return: None
    """

    # File Path: Path to the file containing the DAG definition
    # PID: PID associated with the process that's processing the file. May
    # be empty.
    # Runtime: If the process is currently running, how long it's been
    # running for in seconds.
    # Last Runtime: If the process ran before, how long did it take to
    # finish in seconds
    # Last Run: When the file finished processing in the previous run.
    headers = ["File Path",
               "PID",
               "Runtime",
               "Last Runtime",
               "Last Run"]

    rows = []
    for file_path in known_file_paths:
        last_runtime = self.get_last_runtime(file_path)
        processor_pid = self.get_pid(file_path)
        processor_start_time = self.get_start_time(file_path)
        runtime = ((timezone.utcnow() - processor_start_time).total_seconds()
                   if processor_start_time else None)
        last_run = self.get_last_finish_time(file_path)

        rows.append((file_path,
                     processor_pid,
                     runtime,
                     last_runtime,
                     last_run))

    # Sort by longest last runtime. (Can't sort None values in python3)
    rows = sorted(rows, key=lambda x: x[3] or 0.0)

    formatted_rows = []
    for file_path, pid, runtime, last_runtime, last_run in rows:
        formatted_rows.append((file_path,
                               pid,
                               "{:.2f}s".format(runtime)
                               if runtime else None,
                               "{:.2f}s".format(last_runtime)
                               if last_runtime else None,
                               last_run.strftime("%Y-%m-%dT%H:%M:%S")
                               if last_run else None))
    log_str = ("\n" +
               "=" * 80 +
               "\n" +
               "DAG File Processing Stats\n\n" +
               tabulate(formatted_rows, headers=headers) +
               "\n" +
               "=" * 80)

    self.log.info(log_str)
def test_with_skip_in_branch_downstream_dependencies2(self):
    self.branch_op = BranchPythonOperator(task_id='make_choice',
                                          dag=self.dag,
                                          python_callable=lambda: 'branch_2')
    self.branch_op >> self.branch_1 >> self.branch_2
    self.branch_op >> self.branch_2
    self.dag.clear()

    dr = self.dag.create_dagrun(
        run_id="manual__",
        start_date=timezone.utcnow(),
        execution_date=DEFAULT_DATE,
        state=State.RUNNING
    )

    self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    tis = dr.get_task_instances()
    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'branch_1':
            self.assertEqual(ti.state, State.SKIPPED)
        elif ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.NONE)
        else:
            raise Exception
def test_xcom_pull_after_success(self):
    """
    tests xcom set/clear relative to a task in a 'success' rerun scenario
    """
    key = 'xcom_key'
    value = 'xcom_value'

    dag = models.DAG(dag_id='test_xcom', schedule_interval='@monthly')
    task = DummyOperator(
        task_id='test_xcom',
        dag=dag,
        pool='test_xcom',
        owner='airflow',
        start_date=timezone.datetime(2016, 6, 2, 0, 0, 0))
    exec_date = timezone.utcnow()
    ti = TI(
        task=task, execution_date=exec_date)
    ti.run(mark_success=True)
    ti.xcom_push(key=key, value=value)
    self.assertEqual(ti.xcom_pull(task_ids='test_xcom', key=key), value)
    ti.run()
    # The second run and assert is to handle AIRFLOW-131 (don't clear on
    # prior success)
    self.assertEqual(ti.xcom_pull(task_ids='test_xcom', key=key), value)

    # Test AIRFLOW-703: Xcom shouldn't be cleared if the task doesn't
    # execute, even if dependencies are ignored
    ti.run(ignore_all_deps=True, mark_success=True)
    self.assertEqual(ti.xcom_pull(task_ids='test_xcom', key=key), value)
    # Xcom IS finally cleared once task has executed
    ti.run(ignore_all_deps=True)
    self.assertEqual(ti.xcom_pull(task_ids='test_xcom', key=key), None)
def _make_dag_run(self):
    return self.dag.create_dagrun(
        run_id='manual__',
        start_date=timezone.utcnow(),
        execution_date=DEFAULT_DATE,
        state=State.RUNNING
    )
def test_file_transfer_no_intermediate_dir_error_put(self):
    configuration.conf.set("core", "enable_xcom_pickling", "True")
    test_local_file_content = \
        b"This is local file content \n which is multiline " \
        b"continuing....with other character\nanother line here \n this is last line"
    # create a test file locally
    with open(self.test_local_filepath, 'wb') as f:
        f.write(test_local_file_content)

    # Try to put test file to remote.
    # This should raise an error with "No such file" as the directory
    # does not exist.
    with self.assertRaises(Exception) as error:
        put_test_task = SFTPOperator(
            task_id="test_sftp",
            ssh_hook=self.hook,
            local_filepath=self.test_local_filepath,
            remote_filepath=self.test_remote_filepath_int_dir,
            operation=SFTPOperation.PUT,
            create_intermediate_dirs=False,
            dag=self.dag
        )
        self.assertIsNotNone(put_test_task)
        ti2 = TaskInstance(task=put_test_task, execution_date=timezone.utcnow())
        ti2.run()
    self.assertIn('No such file', str(error.exception))
def test_xcom_enable_pickle_type(self):
    json_obj = {"key": "value"}
    execution_date = timezone.utcnow()
    key = "xcom_test2"
    dag_id = "test_dag2"
    task_id = "test_task2"

    configuration.set("core", "enable_xcom_pickling", "True")

    XCom.set(key=key,
             value=json_obj,
             dag_id=dag_id,
             task_id=task_id,
             execution_date=execution_date)

    ret_value = XCom.get_one(key=key,
                             dag_id=dag_id,
                             task_id=task_id,
                             execution_date=execution_date)

    self.assertEqual(ret_value, json_obj)

    session = settings.Session()
    ret_value = session.query(XCom).filter(XCom.key == key, XCom.dag_id == dag_id,
                                           XCom.task_id == task_id,
                                           XCom.execution_date == execution_date
                                           ).first().value

    self.assertEqual(ret_value, json_obj)
def test_reschedule_with_test_mode(self):
    sensor = self._make_sensor(
        return_value=None,
        poke_interval=10,
        timeout=25,
        mode='reschedule')
    sensor.poke = Mock(side_effect=[False])
    dr = self._make_dag_run()

    # poke returns False and AirflowRescheduleException is raised
    date1 = timezone.utcnow()
    with freeze_time(date1):
        for dt in self.dag.date_range(DEFAULT_DATE, end_date=DEFAULT_DATE):
            TaskInstance(sensor, dt).run(
                ignore_ti_state=True,
                test_mode=True)
    tis = dr.get_task_instances()
    self.assertEquals(len(tis), 2)
    for ti in tis:
        if ti.task_id == SENSOR_OP:
            # in test mode state is not modified
            self.assertEquals(ti.state, State.NONE)
            # in test mode no reschedule request is recorded
            task_reschedules = TaskReschedule.find_for_task_instance(ti)
            self.assertEquals(len(task_reschedules), 0)
        if ti.task_id == DUMMY_OP:
            self.assertEquals(ti.state, State.NONE)
def test_xcom_get_many(self):
    json_obj = {"key": "value"}
    execution_date = timezone.utcnow()
    key = "xcom_test4"
    dag_id1 = "test_dag4"
    task_id1 = "test_task4"
    dag_id2 = "test_dag5"
    task_id2 = "test_task5"

    configuration.set("core", "xcom_enable_pickling", "True")

    XCom.set(key=key,
             value=json_obj,
             dag_id=dag_id1,
             task_id=task_id1,
             execution_date=execution_date)

    XCom.set(key=key,
             value=json_obj,
             dag_id=dag_id2,
             task_id=task_id2,
             execution_date=execution_date)

    results = XCom.get_many(key=key,
                            execution_date=execution_date)

    for result in results:
        self.assertEqual(result.value, json_obj)
def execute(self, context):
    # If the DAG Run is externally triggered, then return without
    # skipping downstream tasks
    if context['dag_run'] and context['dag_run'].external_trigger:
        self.log.info("Externally triggered DAG_Run: allowing execution to proceed.")
        return

    now = timezone.utcnow()
    left_window = context['dag'].following_schedule(
        context['execution_date'])
    right_window = context['dag'].following_schedule(left_window)
    self.log.info(
        'Checking latest only with left_window: %s right_window: %s now: %s',
        left_window, right_window, now
    )

    if not left_window < now <= right_window:
        self.log.info('Not latest execution, skipping downstream.')

        downstream_tasks = context['task'].get_flat_relatives(upstream=False)
        self.log.debug("Downstream task_ids %s", downstream_tasks)

        if downstream_tasks:
            self.skip(context['dag_run'],
                      context['ti'].execution_date,
                      downstream_tasks)

        self.log.info('Done.')
    else:
        self.log.info('Latest, allowing execution to proceed.')
def test_soft_fail_with_reschedule(self):
    sensor = self._make_sensor(
        return_value=False,
        poke_interval=10,
        timeout=5,
        soft_fail=True,
        mode='reschedule')
    dr = self._make_dag_run()

    # first poke returns False and task is re-scheduled
    date1 = timezone.utcnow()
    with freeze_time(date1):
        self._run(sensor)
    tis = dr.get_task_instances()
    self.assertEquals(len(tis), 2)
    for ti in tis:
        if ti.task_id == SENSOR_OP:
            self.assertEquals(ti.state, State.NONE)
        if ti.task_id == DUMMY_OP:
            self.assertEquals(ti.state, State.NONE)

    # second poke returns False, timeout occurs
    date2 = date1 + timedelta(seconds=sensor.poke_interval)
    with freeze_time(date2):
        self._run(sensor)
    tis = dr.get_task_instances()
    self.assertEquals(len(tis), 2)
    for ti in tis:
        self.assertEquals(ti.state, State.SKIPPED)
def test_with_dag_run(self):
    value = False
    dag = DAG('shortcircuit_operator_test_with_dag_run',
              default_args={
                  'owner': 'airflow',
                  'start_date': DEFAULT_DATE
              },
              schedule_interval=INTERVAL)
    short_op = ShortCircuitOperator(task_id='make_choice',
                                    dag=dag,
                                    python_callable=lambda: value)
    branch_1 = DummyOperator(task_id='branch_1', dag=dag)
    branch_1.set_upstream(short_op)
    branch_2 = DummyOperator(task_id='branch_2', dag=dag)
    branch_2.set_upstream(branch_1)
    upstream = DummyOperator(task_id='upstream', dag=dag)
    upstream.set_downstream(short_op)
    dag.clear()

    logging.error("Tasks {}".format(dag.tasks))
    dr = dag.create_dagrun(
        run_id="manual__",
        start_date=timezone.utcnow(),
        execution_date=DEFAULT_DATE,
        state=State.RUNNING
    )

    upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    tis = dr.get_task_instances()
    self.assertEqual(len(tis), 4)
    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEquals(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            self.assertEquals(ti.state, State.SUCCESS)
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEquals(ti.state, State.SKIPPED)
        else:
            raise

    value = True
    dag.clear()
    dr.verify_integrity()
    upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    tis = dr.get_task_instances()
    self.assertEqual(len(tis), 4)
    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEquals(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            self.assertEquals(ti.state, State.SUCCESS)
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEquals(ti.state, State.NONE)
        else:
            raise
def test_check_and_change_state_before_execution_dep_not_met(self):
    dag = models.DAG(dag_id='test_check_and_change_state_before_execution')
    task = DummyOperator(task_id='task', dag=dag, start_date=DEFAULT_DATE)
    task2 = DummyOperator(task_id='task2', dag=dag, start_date=DEFAULT_DATE)
    task >> task2
    ti = TI(
        task=task2, execution_date=timezone.utcnow())
    self.assertFalse(ti._check_and_change_state_before_execution())
def test_ok_with_custom_reschedule_exception(self):
    sensor = self._make_sensor(
        return_value=None,
        mode='reschedule')
    date1 = timezone.utcnow()
    date2 = date1 + timedelta(seconds=60)
    date3 = date1 + timedelta(seconds=120)
    sensor.poke = Mock(side_effect=[
        AirflowRescheduleException(date2),
        AirflowRescheduleException(date3),
        True,
    ])
    dr = self._make_dag_run()

    # first poke returns False and task is re-scheduled
    with freeze_time(date1):
        self._run(sensor)
    tis = dr.get_task_instances()
    self.assertEquals(len(tis), 2)
    for ti in tis:
        if ti.task_id == SENSOR_OP:
            # verify task is re-scheduled, i.e. state set to NONE
            self.assertEquals(ti.state, State.NONE)
            # verify one row in task_reschedule table
            task_reschedules = TaskReschedule.find_for_task_instance(ti)
            self.assertEquals(len(task_reschedules), 1)
            self.assertEquals(task_reschedules[0].start_date, date1)
            self.assertEquals(task_reschedules[0].reschedule_date, date2)
        if ti.task_id == DUMMY_OP:
            self.assertEquals(ti.state, State.NONE)

    # second poke returns False and task is re-scheduled
    with freeze_time(date2):
        self._run(sensor)
    tis = dr.get_task_instances()
    self.assertEquals(len(tis), 2)
    for ti in tis:
        if ti.task_id == SENSOR_OP:
            # verify task is re-scheduled, i.e. state set to NONE
            self.assertEquals(ti.state, State.NONE)
            # verify two rows in task_reschedule table
            task_reschedules = TaskReschedule.find_for_task_instance(ti)
            self.assertEquals(len(task_reschedules), 2)
            self.assertEquals(task_reschedules[1].start_date, date2)
            self.assertEquals(task_reschedules[1].reschedule_date, date3)
        if ti.task_id == DUMMY_OP:
            self.assertEquals(ti.state, State.NONE)

    # third poke returns True and task succeeds
    with freeze_time(date3):
        self._run(sensor)
    tis = dr.get_task_instances()
    self.assertEquals(len(tis), 2)
    for ti in tis:
        if ti.task_id == SENSOR_OP:
            self.assertEquals(ti.state, State.SUCCESS)
        if ti.task_id == DUMMY_OP:
            self.assertEquals(ti.state, State.NONE)
def test_s3_to_sftp_operation(self):
    # Setting
    configuration.conf.set("core", "enable_xcom_pickling", "True")
    test_remote_file_content = \
        "This is remote file content \n which is also multiline " \
        "another line here \n this is last line. EOF"

    # Test for creation of s3 bucket
    conn = boto3.client('s3')
    conn.create_bucket(Bucket=self.s3_bucket)
    self.assertTrue((self.s3_hook.check_for_bucket(self.s3_bucket)))

    with open(LOCAL_FILE_PATH, 'w') as f:
        f.write(test_remote_file_content)
    self.s3_hook.load_file(LOCAL_FILE_PATH, self.s3_key, bucket_name=BUCKET)

    # Check if object was created in s3
    objects_in_dest_bucket = conn.list_objects(Bucket=self.s3_bucket,
                                               Prefix=self.s3_key)
    # there should be object found, and there should only be one object found
    self.assertEqual(len(objects_in_dest_bucket['Contents']), 1)

    # the object found should be consistent with dest_key specified earlier
    self.assertEqual(objects_in_dest_bucket['Contents'][0]['Key'], self.s3_key)

    # get remote file to local
    run_task = S3ToSFTPOperator(
        s3_bucket=BUCKET,
        s3_key=S3_KEY,
        sftp_path=SFTP_PATH,
        sftp_conn_id=SFTP_CONN_ID,
        s3_conn_id=S3_CONN_ID,
        task_id=TASK_ID,
        dag=self.dag
    )
    self.assertIsNotNone(run_task)

    run_task.execute(None)

    # Check that the file is created remotely
    check_file_task = SSHOperator(
        task_id="test_check_file",
        ssh_hook=self.hook,
        command="cat {0}".format(self.sftp_path),
        do_xcom_push=True,
        dag=self.dag
    )
    self.assertIsNotNone(check_file_task)
    ti3 = TaskInstance(task=check_file_task, execution_date=timezone.utcnow())
    ti3.run()
    self.assertEqual(
        ti3.xcom_pull(task_ids='test_check_file', key='return_value').strip(),
        test_remote_file_content.encode('utf-8'))

    # Clean up after finishing with test
    conn.delete_object(Bucket=self.s3_bucket, Key=self.s3_key)
    conn.delete_bucket(Bucket=self.s3_bucket)
    self.assertFalse((self.s3_hook.check_for_bucket(self.s3_bucket)))
def test_ok_with_reschedule(self):
    sensor = self._make_sensor(
        return_value=None,
        poke_interval=10,
        timeout=25,
        mode='reschedule')
    sensor.poke = Mock(side_effect=[False, False, True])
    dr = self._make_dag_run()

    # first poke returns False and task is re-scheduled
    date1 = timezone.utcnow()
    with freeze_time(date1):
        self._run(sensor)
    tis = dr.get_task_instances()
    self.assertEquals(len(tis), 2)
    for ti in tis:
        if ti.task_id == SENSOR_OP:
            # verify task is re-scheduled, i.e. state set to UP_FOR_RESCHEDULE
            self.assertEquals(ti.state, State.UP_FOR_RESCHEDULE)
            # verify one row in task_reschedule table
            task_reschedules = TaskReschedule.find_for_task_instance(ti)
            self.assertEquals(len(task_reschedules), 1)
            self.assertEquals(task_reschedules[0].start_date, date1)
            self.assertEquals(task_reschedules[0].reschedule_date,
                              date1 + timedelta(seconds=sensor.poke_interval))
        if ti.task_id == DUMMY_OP:
            self.assertEquals(ti.state, State.NONE)

    # second poke returns False and task is re-scheduled
    date2 = date1 + timedelta(seconds=sensor.poke_interval)
    with freeze_time(date2):
        self._run(sensor)
    tis = dr.get_task_instances()
    self.assertEquals(len(tis), 2)
    for ti in tis:
        if ti.task_id == SENSOR_OP:
            # verify task is re-scheduled, i.e. state set to UP_FOR_RESCHEDULE
            self.assertEquals(ti.state, State.UP_FOR_RESCHEDULE)
            # verify two rows in task_reschedule table
            task_reschedules = TaskReschedule.find_for_task_instance(ti)
            self.assertEquals(len(task_reschedules), 2)
            self.assertEquals(task_reschedules[1].start_date, date2)
            self.assertEquals(task_reschedules[1].reschedule_date,
                              date2 + timedelta(seconds=sensor.poke_interval))
        if ti.task_id == DUMMY_OP:
            self.assertEquals(ti.state, State.NONE)

    # third poke returns True and task succeeds
    date3 = date2 + timedelta(seconds=sensor.poke_interval)
    with freeze_time(date3):
        self._run(sensor)
    tis = dr.get_task_instances()
    self.assertEquals(len(tis), 2)
    for ti in tis:
        if ti.task_id == SENSOR_OP:
            self.assertEquals(ti.state, State.SUCCESS)
        if ti.task_id == DUMMY_OP:
            self.assertEquals(ti.state, State.NONE)
def _refresh_dag_dir(self):
    """
    Refresh file paths from dag dir if we haven't done it for too long.
    """
    elapsed_time_since_refresh = (timezone.utcnow() -
                                  self.last_dag_dir_refresh_time).total_seconds()
    if elapsed_time_since_refresh > self.dag_dir_list_interval:
        # Build up a list of Python files that could contain DAGs
        self.log.info("Searching for files in %s", self._dag_directory)
        self._file_paths = list_py_file_paths(self._dag_directory)
        self.last_dag_dir_refresh_time = timezone.utcnow()
        self.log.info("There are %s files in %s",
                      len(self._file_paths), self._dag_directory)
        self.set_file_paths(self._file_paths)

        try:
            self.log.debug("Removing old import errors")
            self.clear_nonexistent_import_errors()
        except Exception:
            self.log.exception("Error removing old import errors")
def update_state(self, session: Session = None) -> List[TI]:
    """
    Determines the overall state of the DagRun based on the state
    of its TaskInstances.

    :param session: Sqlalchemy ORM Session
    :type session: Session
    :return: ready_tis: the tis that can be scheduled in the current loop
    :rtype ready_tis: list[airflow.models.TaskInstance]
    """
    dag = self.get_dag()
    ready_tis: List[TI] = []
    tis = list(self.get_task_instances(session=session,
                                       state=State.task_states + (State.SHUTDOWN,)))
    self.log.debug("number of tis tasks for %s: %s task(s)", self, len(tis))
    for ti in tis:
        ti.task = dag.get_task(ti.task_id)

    start_dttm = timezone.utcnow()
    unfinished_tasks = [t for t in tis if t.state in State.unfinished()]
    finished_tasks = [
        t for t in tis if t.state in State.finished() + [State.UPSTREAM_FAILED]
    ]
    none_depends_on_past = all(not t.task.depends_on_past for t in unfinished_tasks)
    none_task_concurrency = all(t.task.task_concurrency is None
                                for t in unfinished_tasks)
    if unfinished_tasks:
        scheduleable_tasks = [
            ut for ut in unfinished_tasks if ut.state in SCHEDULEABLE_STATES
        ]
        self.log.debug("number of scheduleable tasks for %s: %s task(s)",
                       self, len(scheduleable_tasks))
        ready_tis, changed_tis = self._get_ready_tis(
            scheduleable_tasks, finished_tasks, session)
        self.log.debug("ready tis length for %s: %s task(s)", self, len(ready_tis))
        if none_depends_on_past and none_task_concurrency:
            # small speed up
            are_runnable_tasks = ready_tis or self._are_premature_tis(
                unfinished_tasks, finished_tasks, session) or changed_tis

    duration = (timezone.utcnow() - start_dttm)
    Stats.timing("dagrun.dependency-check.{}".format(self.dag_id), duration)

    leaf_task_ids = {t.task_id for t in dag.leaves}
    leaf_tis = [ti for ti in tis if ti.task_id in leaf_task_ids]

    # if all roots finished and at least one failed, the run failed
    if not unfinished_tasks and any(
        leaf_ti.state in {State.FAILED, State.UPSTREAM_FAILED} for leaf_ti in leaf_tis
    ):
        self.log.error('Marking run %s failed', self)
        self.set_state(State.FAILED)
        dag.handle_callback(self, success=False, reason='task_failure',
                            session=session)

    # if all leafs succeeded and no unfinished tasks, the run succeeded
    elif not unfinished_tasks and all(
        leaf_ti.state in {State.SUCCESS, State.SKIPPED} for leaf_ti in leaf_tis
    ):
        self.log.info('Marking run %s successful', self)
        self.set_state(State.SUCCESS)
        dag.handle_callback(self, success=True, reason='success', session=session)

    # if *all tasks* are deadlocked, the run failed
    elif (unfinished_tasks and none_depends_on_past and
            none_task_concurrency and not are_runnable_tasks):
        self.log.error('Deadlock; marking run %s failed', self)
        self.set_state(State.FAILED)
        dag.handle_callback(self, success=False, reason='all_tasks_deadlocked',
                            session=session)

    # finally, if the roots aren't done, the dag is still running
    else:
        self.set_state(State.RUNNING)

    self._emit_duration_stats_for_finished_state()

    # todo: determine we want to use with_for_update to make sure to lock the run
    session.merge(self)
    session.commit()

    return ready_tis
def task_instance_link(attr):
    """Generates a URL to the Graph view for a TaskInstance."""
    dag_id = attr.get('dag_id')
    task_id = attr.get('task_id')
    execution_date = attr.get('dag_run.execution_date') or attr.get('execution_date') or timezone.utcnow()
    url = url_for('Airflow.task', dag_id=dag_id, task_id=task_id, execution_date=execution_date.isoformat())
    url_root = url_for(
        'Airflow.graph', dag_id=dag_id, root=task_id, execution_date=execution_date.isoformat()
    )
    return Markup(
        """
        <span style="white-space: nowrap;">
            <a href="{url}">{task_id}</a>
            <a href="{url_root}" title="Filter on this task and upstream">
                <span class="material-icons" style="margin-left:0;" aria-hidden="true">filter_alt</span>
            </a>
        </span>
        """
    ).format(url=url, task_id=task_id, url_root=url_root)
def _execute(self):
    self._enable_task_listeners()
    self.task_runner = get_task_runner(self)

    def signal_handler(signum, frame):
        """Setting kill signal handler"""
        self.log.error("Received SIGTERM. Terminating subprocesses")
        self.task_runner.terminate()
        self.handle_task_exit(128 + signum)

    signal.signal(signal.SIGTERM, signal_handler)

    if not self.task_instance.check_and_change_state_before_execution(
        mark_success=self.mark_success,
        ignore_all_deps=self.ignore_all_deps,
        ignore_depends_on_past=self.ignore_depends_on_past,
        ignore_task_deps=self.ignore_task_deps,
        ignore_ti_state=self.ignore_ti_state,
        job_id=self.id,
        pool=self.pool,
        external_executor_id=self.external_executor_id,
    ):
        self.log.info("Task is not able to be run")
        return

    try:
        self.task_runner.start()

        heartbeat_time_limit = conf.getint(
            'scheduler', 'scheduler_zombie_task_threshold')

        # LocalTaskJob should not run callbacks, which are handled by TaskInstance._run_raw_task
        # 1, LocalTaskJob does not parse DAG, thus cannot run callbacks
        # 2, The run_as_user of LocalTaskJob is likely not same as the TaskInstance._run_raw_task.
        #    When run_as_user is specified, the process owner of the LocalTaskJob must be sudoable.
        #    It is not secure to run callbacks with sudoable users.

        # If _run_raw_task receives SIGKILL, scheduler will mark it as zombie and invoke callbacks
        # If LocalTaskJob receives SIGTERM, LocalTaskJob passes SIGTERM to _run_raw_task
        # If the state of task_instance is changed, LocalTaskJob sends SIGTERM to _run_raw_task
        while not self.terminating:
            # Monitor the task to see if it's done. Wait in a syscall
            # (`os.wait`) for as long as possible so we notice the
            # subprocess finishing as quick as we can
            max_wait_time = max(
                0,  # Make sure this value is never negative,
                min(
                    (heartbeat_time_limit -
                     (timezone.utcnow() - self.latest_heartbeat).total_seconds() * 0.75),
                    self.heartrate,
                ),
            )

            return_code = self.task_runner.return_code(timeout=max_wait_time)
            if return_code is not None:
                self.handle_task_exit(return_code)
                return

            self.heartbeat()

            # If it's been too long since we've heartbeat, then it's possible that
            # the scheduler rescheduled this task, so kill launched processes.
            # This can only really happen if the worker can't read the DB for a long time
            time_since_last_heartbeat = (
                timezone.utcnow() - self.latest_heartbeat).total_seconds()
            if time_since_last_heartbeat > heartbeat_time_limit:
                Stats.incr('local_task_job_prolonged_heartbeat_failure', 1, 1)
                self.log.error("Heartbeat time limit exceeded!")
                raise AirflowException(
                    f"Time since last heartbeat({time_since_last_heartbeat:.2f}s) exceeded limit "
                    f"({heartbeat_time_limit}s).")
    finally:
        self.on_kill()
def update_state(
    self, session: Session = NEW_SESSION, execute_callbacks: bool = True
) -> Tuple[List[TI], Optional[DagCallbackRequest]]:
    """
    Determines the overall state of the DagRun based on the state
    of its TaskInstances.

    :param session: Sqlalchemy ORM Session
    :param execute_callbacks: Should dag callbacks (success/failure, SLA etc) be invoked
        directly (default: true) or recorded as a pending request in the ``callback`` property
    :return: Tuple containing tis that can be scheduled in the current loop & `callback` that
        needs to be executed
    """
    # Callback to execute in case of Task Failures
    callback: Optional[DagCallbackRequest] = None

    start_dttm = timezone.utcnow()
    self.last_scheduling_decision = start_dttm
    with Stats.timer(f"dagrun.dependency-check.{self.dag_id}"):
        dag = self.get_dag()
        info = self.task_instance_scheduling_decisions(session)

        tis = info.tis
        schedulable_tis = info.schedulable_tis
        changed_tis = info.changed_tis
        finished_tis = info.finished_tis
        unfinished_tis = info.unfinished_tis

        none_depends_on_past = all(not t.task.depends_on_past for t in unfinished_tis)
        none_task_concurrency = all(t.task.max_active_tis_per_dag is None for t in unfinished_tis)
        none_deferred = all(t.state != State.DEFERRED for t in unfinished_tis)

        if unfinished_tis and none_depends_on_past and none_task_concurrency and none_deferred:
            # small speed up
            are_runnable_tasks = (
                schedulable_tis
                or self._are_premature_tis(unfinished_tis, finished_tis, session)
                or changed_tis
            )

        leaf_task_ids = {t.task_id for t in dag.leaves}
        leaf_tis = [
            ti for ti in tis if ti.task_id in leaf_task_ids if ti.state != TaskInstanceState.REMOVED
        ]

        # if all roots finished and at least one failed, the run failed
        if not unfinished_tis and any(leaf_ti.state in State.failed_states for leaf_ti in leaf_tis):
            self.log.error('Marking run %s failed', self)
            self.set_state(DagRunState.FAILED)
            if execute_callbacks:
                dag.handle_callback(self, success=False, reason='task_failure', session=session)
            elif dag.has_on_failure_callback:
                callback = DagCallbackRequest(
                    full_filepath=dag.fileloc,
                    dag_id=self.dag_id,
                    run_id=self.run_id,
                    is_failure_callback=True,
                    msg='task_failure',
                )

        # if all leaves succeeded and no unfinished tasks, the run succeeded
        elif not unfinished_tis and all(leaf_ti.state in State.success_states for leaf_ti in leaf_tis):
            self.log.info('Marking run %s successful', self)
            self.set_state(DagRunState.SUCCESS)
            if execute_callbacks:
                dag.handle_callback(self, success=True, reason='success', session=session)
            elif dag.has_on_success_callback:
                callback = DagCallbackRequest(
                    full_filepath=dag.fileloc,
                    dag_id=self.dag_id,
                    run_id=self.run_id,
                    is_failure_callback=False,
                    msg='success',
                )

        # if *all tasks* are deadlocked, the run failed
        elif (unfinished_tis and none_depends_on_past and none_task_concurrency
              and none_deferred and not are_runnable_tasks):
            self.log.error('Deadlock; marking run %s failed', self)
            self.set_state(DagRunState.FAILED)
            if execute_callbacks:
                dag.handle_callback(self, success=False, reason='all_tasks_deadlocked', session=session)
            elif dag.has_on_failure_callback:
                callback = DagCallbackRequest(
                    full_filepath=dag.fileloc,
                    dag_id=self.dag_id,
                    run_id=self.run_id,
                    is_failure_callback=True,
                    msg='all_tasks_deadlocked',
                )

        # finally, if the roots aren't done, the dag is still running
        else:
            self.set_state(DagRunState.RUNNING)

    if self._state == DagRunState.FAILED or self._state == DagRunState.SUCCESS:
        msg = (
            "DagRun Finished: dag_id=%s, execution_date=%s, run_id=%s, "
            "run_start_date=%s, run_end_date=%s, run_duration=%s, "
            "state=%s, external_trigger=%s, run_type=%s, "
            "data_interval_start=%s, data_interval_end=%s, dag_hash=%s"
        )
        self.log.info(
            msg,
            self.dag_id,
            self.execution_date,
            self.run_id,
            self.start_date,
            self.end_date,
            (self.end_date - self.start_date).total_seconds()
            if self.start_date and self.end_date
            else None,
            self._state,
            self.external_trigger,
            self.run_type,
            self.data_interval_start,
            self.data_interval_end,
            self.dag_hash,
        )

    session.flush()

    self._emit_true_scheduling_delay_stats_for_finished_state(finished_tis)
    self._emit_duration_stats_for_finished_state()

    session.merge(self)
    # We do not flush here for performance reasons (It increases queries count by +20)

    return schedulable_tis, callback
def collect_dags(
        self,
        dag_folder=None,
        only_if_updated=True,
        include_examples=configuration.conf.getboolean('core', 'LOAD_EXAMPLES'),
        safe_mode=configuration.conf.getboolean('core', 'DAG_DISCOVERY_SAFE_MODE')):
    """
    Given a file path or a folder, this method looks for python modules,
    imports them and adds them to the dagbag collection.

    Note that if a ``.airflowignore`` file is found while processing
    the directory, it will behave much like a ``.gitignore``,
    ignoring files that match any of the regex patterns specified
    in the file.

    **Note**: The patterns in .airflowignore are treated as
    un-anchored regexes, not shell-like glob patterns.
    """
    start_dttm = timezone.utcnow()
    dag_folder = dag_folder or self.dag_folder

    # Used to store stats around DagBag processing
    stats = []
    FileLoadStat = namedtuple(
        'FileLoadStat', "file duration dag_num task_num dags")

    dag_folder = correct_maybe_zipped(dag_folder)

    dags_by_name = {}

    for filepath in list_py_file_paths(dag_folder, safe_mode=safe_mode,
                                       include_examples=include_examples):
        try:
            ts = timezone.utcnow()
            found_dags = self.process_file(
                filepath, only_if_updated=only_if_updated,
                safe_mode=safe_mode)
            dag_ids = [dag.dag_id for dag in found_dags]
            dag_id_names = str(dag_ids)

            td = timezone.utcnow() - ts
            td = td.total_seconds() + (float(td.microseconds) / 1000000)
            dags_by_name[dag_id_names] = dag_ids
            stats.append(FileLoadStat(
                filepath.replace(dag_folder, ''),
                td,
                len(found_dags),
                sum([len(dag.tasks) for dag in found_dags]),
                dag_id_names,
            ))
        except Exception as e:
            self.log.exception(e)
    Stats.gauge(
        'collect_dags', (timezone.utcnow() - start_dttm).total_seconds(), 1)
    Stats.gauge(
        'dagbag_size', len(self.dags), 1)
    Stats.gauge(
        'dagbag_import_errors', len(self.import_errors), 1)
    self.dagbag_stats = sorted(
        stats, key=lambda x: x.duration, reverse=True)
    for file_stat in self.dagbag_stats:
        dag_ids = dags_by_name[file_stat.dags]
        if file_stat.dag_num >= 1:
            # if we found multiple dags per file, the stat is 'dag_id1 _ dag_id2'
            dag_names = '_'.join(dag_ids)
            Stats.timing('dag.loading-duration.{}'.format(dag_names),
                         file_stat.duration)
def start(self):
    """
    Use multiple processes to parse and generate tasks for the
    DAGs in parallel. By processing them in separate processes,
    we can get parallelism and isolation from potentially harmful
    user code.
    """
    self.register_exit_signals()

    # Start a new process group
    os.setpgid(0, 0)

    self.log.info("Processing files using up to %s processes at a time ",
                  self._parallelism)
    self.log.info("Process each file at most once every %s seconds",
                  self._file_process_interval)
    self.log.info("Checking for new files in %s every %s seconds",
                  self._dag_directory, self.dag_dir_list_interval)

    # In sync mode we want timeout=None -- wait forever until a message is received
    if self._async_mode:
        poll_time = 0.0
    else:
        poll_time = None

    # Used to track how long it takes us to get once around every file in the DAG folder.
    self._parsing_start_time = timezone.utcnow()
    while True:
        loop_start_time = time.time()

        # pylint: disable=no-else-break
        if self._signal_conn.poll(poll_time):
            agent_signal = self._signal_conn.recv()
            self.log.debug("Received %s signal from DagFileProcessorAgent", agent_signal)
            if agent_signal == DagParsingSignal.TERMINATE_MANAGER:
                self.terminate()
                break
            elif agent_signal == DagParsingSignal.END_MANAGER:
                self.end()
                sys.exit(os.EX_OK)
            elif agent_signal == DagParsingSignal.AGENT_HEARTBEAT:
                # continue the loop to parse dags
                pass
            elif isinstance(agent_signal, FailureCallbackRequest):
                self._add_callback_to_queue(agent_signal)
            else:
                raise AirflowException("Invalid message")
        elif not self._async_mode:
            # In "sync" mode we don't want to parse the DAGs until we
            # are told to (as that would open another connection to the
            # SQLite DB which isn't a good practice)
            continue
        # pylint: enable=no-else-break

        self._refresh_dag_dir()
        self._find_zombies()  # pylint: disable=no-value-for-parameter

        self._kill_timed_out_processors()

        # Generate more file paths to process if we processed all the files
        # already.
        if not self._file_path_queue:
            self.emit_metrics()
            self.prepare_file_path_queue()

        self.start_new_processes()

        # Update number of loop iteration.
        self._num_run += 1

        simple_dags = self.collect_results()
        for simple_dag in simple_dags:
            self._signal_conn.send(simple_dag)

        if not self._async_mode:
            self.log.debug(
                "Waiting for processors to finish since we're using sqlite")
            # Wait until the running DAG processors are finished before
            # sending a DagParsingStat message back. This means the Agent
            # can tell we've got to the end of this iteration when it sees
            # this type of message
            self.wait_until_finished()

            # Collect anything else that has finished, but don't kick off any more processors
            simple_dags = self.collect_results()
            for simple_dag in simple_dags:
                self._signal_conn.send(simple_dag)

        self._print_stat()

        all_files_processed = all(
            self.get_last_finish_time(x) is not None for x in self.file_paths)
        max_runs_reached = self.max_runs_reached()

        dag_parsing_stat = DagParsingStat(
            self._file_paths,
            max_runs_reached,
            all_files_processed,
        )
        self._signal_conn.send(dag_parsing_stat)

        if max_runs_reached:
            self.log.info(
                "Exiting dag parsing loop as all files "
                "have been processed %s times", self._max_runs)
            break

        if self._async_mode:
            loop_duration = time.time() - loop_start_time
            if loop_duration < 1:
                poll_time = 1 - loop_duration
            else:
                poll_time = 0.0
def update_state(self, session=None):
    """
    Determines the overall state of the DagRun based on the state
    of its TaskInstances.

    :return: State
    """
    dag = self.get_dag()

    tis = self.get_task_instances(session=session)
    self.log.debug("Updating state for %s considering %s task(s)", self, len(tis))

    for ti in list(tis):
        # skip in db?
        if ti.state == State.REMOVED:
            tis.remove(ti)
        else:
            ti.task = dag.get_task(ti.task_id)

    # pre-calculate
    # db is faster
    start_dttm = timezone.utcnow()
    unfinished_tasks = self.get_task_instances(
        state=State.unfinished(),
        session=session
    )
    none_depends_on_past = all(not t.task.depends_on_past for t in unfinished_tasks)
    none_task_concurrency = all(t.task.task_concurrency is None
                                for t in unfinished_tasks)
    # small speed up
    if unfinished_tasks and none_depends_on_past and none_task_concurrency:
        # todo: this can actually get pretty slow: one task costs between 0.01-0.15s
        no_dependencies_met = True
        for ut in unfinished_tasks:
            # We need to flag upstream and check for changes because upstream
            # failures/re-schedules can result in deadlock false positives
            old_state = ut.state
            deps_met = ut.are_dependencies_met(
                dep_context=DepContext(
                    flag_upstream_failed=True,
                    ignore_in_retry_period=True,
                    ignore_in_reschedule_period=True),
                session=session)
            if deps_met or old_state != ut.current_state(session=session):
                no_dependencies_met = False
                break

    duration = (timezone.utcnow() - start_dttm).total_seconds() * 1000
    Stats.timing("dagrun.dependency-check.{}".format(self.dag_id), duration)

    root_ids = [t.task_id for t in dag.roots]
    roots = [t for t in tis if t.task_id in root_ids]

    # if all roots finished and at least one failed, the run failed
    if (not unfinished_tasks and
            any(r.state in (State.FAILED, State.UPSTREAM_FAILED) for r in roots)):
        self.log.info('Marking run %s failed', self)
        self.set_state(State.FAILED)
        dag.handle_callback(self, success=False, reason='task_failure',
                            session=session)

    # if all roots succeeded and no unfinished tasks, the run succeeded
    elif not unfinished_tasks and all(r.state in (State.SUCCESS, State.SKIPPED)
                                      for r in roots):
        self.log.info('Marking run %s successful', self)
        self.set_state(State.SUCCESS)
        dag.handle_callback(self, success=True, reason='success', session=session)

    # if *all tasks* are deadlocked, the run failed
    elif (unfinished_tasks and none_depends_on_past and
            none_task_concurrency and no_dependencies_met):
        self.log.info('Deadlock; marking run %s failed', self)
        self.set_state(State.FAILED)
        dag.handle_callback(self, success=False, reason='all_tasks_deadlocked',
                            session=session)

    # finally, if the roots aren't done, the dag is still running
    else:
        self.set_state(State.RUNNING)

    self._emit_duration_stats_for_finished_state()

    # todo: determine we want to use with_for_update to make sure to lock the run
    session.merge(self)
    session.commit()

    return self.state
def __init__(self,
             dag_directory,
             file_paths,
             max_runs,
             processor_factory,
             signal_conn,
             stat_queue,
             result_queue,
             async_mode=True):
    """
    :param dag_directory: Directory where DAG definitions are kept. All
        files in file_paths should be under this directory
    :type dag_directory: unicode
    :param file_paths: list of file paths that contain DAG definitions
    :type file_paths: list[unicode]
    :param max_runs: The number of times to parse and schedule each file. -1
        for unlimited.
    :type max_runs: int
    :param processor_factory: function that creates processors for DAG
        definition files. Arguments are (dag_definition_path)
    :type processor_factory: (unicode, unicode, list) -> (AbstractDagFileProcessor)
    :param signal_conn: connection to communicate signal with processor agent.
    :type signal_conn: Connection
    :param stat_queue: the queue to use for passing back parsing stat to agent.
    :type stat_queue: multiprocessing.Queue
    :param result_queue: the queue to use for passing back the result to agent.
    :type result_queue: multiprocessing.Queue
    :param async_mode: whether to start the manager in async mode
    :type async_mode: bool
    """
    self._file_paths = file_paths
    self._file_path_queue = []
    self._dag_directory = dag_directory
    self._max_runs = max_runs
    self._processor_factory = processor_factory
    self._signal_conn = signal_conn
    self._stat_queue = stat_queue
    self._result_queue = result_queue
    self._async_mode = async_mode

    self._parallelism = conf.getint('scheduler', 'max_threads')
    if 'sqlite' in conf.get('core', 'sql_alchemy_conn') and self._parallelism > 1:
        self.log.error("Cannot use more than 1 thread when using sqlite. "
                       "Setting parallelism to 1")
        self._parallelism = 1

    # Parse and schedule each file no faster than this interval.
    self._file_process_interval = conf.getint('scheduler',
                                              'min_file_process_interval')
    # How often to print out DAG file processing stats to the log. Default to
    # 30 seconds.
    self.print_stats_interval = conf.getint('scheduler',
                                            'print_stats_interval')
    # How many seconds do we wait for tasks to heartbeat before mark them as zombies.
    self._zombie_threshold_secs = (
        conf.getint('scheduler', 'scheduler_zombie_task_threshold'))
    # Map from file path to the processor
    self._processors = {}
    # Map from file path to the last runtime
    self._last_runtime = {}
    # Map from file path to the last finish time
    self._last_finish_time = {}
    self._last_zombie_query_time = timezone.utcnow()
    # Last time that the DAG dir was traversed to look for files
    self.last_dag_dir_refresh_time = timezone.utcnow()
    # Last time stats were printed
    self.last_stat_print_time = timezone.datetime(2000, 1, 1)
    # TODO: Remove magic number
    self._zombie_query_interval = 10
    # Map from file path to the number of runs
    self._run_count = defaultdict(int)
    # Manager heartbeat key.
    self._heart_beat_key = 'heart-beat'
    # How often to scan the DAGs directory for new files. Default to 5 minutes.
    self.dag_dir_list_interval = conf.getint('scheduler', 'dag_dir_list_interval')

    self._log = logging.getLogger('airflow.processor_manager')

    signal.signal(signal.SIGINT, self._exit_gracefully)
    signal.signal(signal.SIGTERM, self._exit_gracefully)
import airflow
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils import timezone

from datetime import datetime, timedelta

args = {
    'owner': 'airflow',
    'start_date': timezone.utcnow() - timedelta(hours=1)
}

dag = DAG(
    dag_id='lab_01',
    default_args=args,
    schedule_interval='0 * * * *',
    dagrun_timeout=timedelta(minutes=60),
)

start = DummyOperator(
    task_id='start',
    dag=dag,
)

step0 = DummyOperator(
    task_id='from_kafka_to_logstash',
    dag=dag,
)

step1 = DummyOperator(
    task_id='from_logstash_to_clickhouse',
def set_state(self, state):
    if self._state != state:
        self._state = state
        self.end_date = timezone.utcnow() if self._state in State.finished() else None
def __init__(self, delta: datetime.timedelta):
    super().__init__(moment=timezone.utcnow() + delta)
def test_xcom_push_and_pull(self, mock_conn, mock_run_query):
    ti = TaskInstance(task=self.operator, execution_date=timezone.utcnow())
    ti.run()

    self.assertEqual(ti.xcom_pull(task_ids=MOCK_DATA['task_id']),
                     str(MOCK_RESULT))
def prepare_file_path_queue(self):
    """Generate more file paths to process. Result are saved in _file_path_queue."""
    self._parsing_start_time = time.perf_counter()
    # If the file path is already being processed, or if a file was
    # processed recently, wait until the next batch
    file_paths_in_progress = self._processors.keys()
    now = timezone.utcnow()

    # Sort the file paths by the parsing order mode
    list_mode = conf.get("scheduler", "file_parsing_sort_mode")

    files_with_mtime = {}
    file_paths = []
    is_mtime_mode = list_mode == "modified_time"

    file_paths_recently_processed = []
    for file_path in self._file_paths:

        if is_mtime_mode:
            try:
                files_with_mtime[file_path] = os.path.getmtime(file_path)
            except FileNotFoundError:
                self.log.warning("Skipping processing of missing file: %s", file_path)
                continue
            file_modified_time = timezone.make_aware(datetime.fromtimestamp(files_with_mtime[file_path]))
        else:
            file_paths.append(file_path)
            file_modified_time = None

        # Find file paths that were recently processed to exclude them
        # from being added to file_path_queue
        # unless they were modified recently and parsing mode is "modified_time"
        # in which case we don't honor "self._file_process_interval" (min_file_process_interval)
        last_finish_time = self.get_last_finish_time(file_path)
        if (
            last_finish_time is not None
            and (now - last_finish_time).total_seconds() < self._file_process_interval
            and not (is_mtime_mode and file_modified_time and (file_modified_time > last_finish_time))
        ):
            file_paths_recently_processed.append(file_path)

    # Sort file paths via last modified time
    if is_mtime_mode:
        file_paths = sorted(files_with_mtime, key=files_with_mtime.get, reverse=True)
    elif list_mode == "alphabetical":
        file_paths = sorted(file_paths)
    elif list_mode == "random_seeded_by_host":
        # Shuffle the list seeded by hostname so multiple schedulers can work on different
        # set of files. Since we set the seed, the sort order will remain same per host
        random.Random(get_hostname()).shuffle(file_paths)

    files_paths_at_run_limit = [
        file_path for file_path, stat in self._file_stats.items() if stat.run_count == self._max_runs
    ]

    file_paths_to_exclude = set(file_paths_in_progress).union(
        file_paths_recently_processed, files_paths_at_run_limit
    )

    # Do not convert the following list to set as set does not preserve the order
    # and we need to maintain the order of file_paths for `[scheduler] file_parsing_sort_mode`
    files_paths_to_queue = [
        file_path for file_path in file_paths if file_path not in file_paths_to_exclude
    ]

    for file_path, processor in self._processors.items():
        self.log.debug(
            "File path %s is still being processed (started: %s)",
            processor.file_path,
            processor.start_time.isoformat(),
        )

    self.log.debug(
        "Queuing the following files for processing:\n\t%s", "\n\t".join(files_paths_to_queue)
    )

    for file_path in files_paths_to_queue:
        if file_path not in self._file_stats:
            self._file_stats[file_path] = DagFileStat(
                num_dags=0, import_errors=0, last_finish_time=None, last_duration=None, run_count=0
            )

    self._file_path_queue.extend(files_paths_to_queue)
def _log_file_processing_stats(self, known_file_paths):
    """
    Print out stats about how files are getting processed.

    :param known_file_paths: a list of file paths that may contain Airflow
        DAG definitions
    :return: None
    """
    # File Path: Path to the file containing the DAG definition
    # PID: PID associated with the process that's processing the file. May
    # be empty.
    # Runtime: If the process is currently running, how long it's been
    # running for in seconds.
    # Last Runtime: If the process ran before, how long did it take to
    # finish in seconds
    # Last Run: When the file finished processing in the previous run.
    headers = ["File Path", "PID", "Runtime", "# DAGs", "# Errors", "Last Runtime", "Last Run"]

    rows = []
    now = timezone.utcnow()
    for file_path in known_file_paths:
        last_runtime = self.get_last_runtime(file_path)
        num_dags = self.get_last_dag_count(file_path)
        num_errors = self.get_last_error_count(file_path)
        file_name = os.path.basename(file_path)
        file_name = os.path.splitext(file_name)[0].replace(os.sep, '.')

        processor_pid = self.get_pid(file_path)
        processor_start_time = self.get_start_time(file_path)
        runtime = (now - processor_start_time) if processor_start_time else None
        last_run = self.get_last_finish_time(file_path)
        if last_run:
            seconds_ago = (now - last_run).total_seconds()
            Stats.gauge(f'dag_processing.last_run.seconds_ago.{file_name}', seconds_ago)

        rows.append((file_path, processor_pid, runtime, num_dags, num_errors, last_runtime, last_run))

    # Sort by longest last runtime. (Can't sort None values in python3)
    rows = sorted(rows, key=lambda x: x[5] or 0.0)

    formatted_rows = []
    for file_path, pid, runtime, num_dags, num_errors, last_runtime, last_run in rows:
        formatted_rows.append(
            (
                file_path,
                pid,
                f"{runtime.total_seconds():.2f}s" if runtime else None,
                num_dags,
                num_errors,
                f"{last_runtime:.2f}s" if last_runtime else None,
                last_run.strftime("%Y-%m-%dT%H:%M:%S") if last_run else None,
            )
        )
    log_str = (
        "\n"
        + "=" * 80
        + "\n"
        + "DAG File Processing Stats\n\n"
        + tabulate(formatted_rows, headers=headers)
        + "\n"
        + "=" * 80
    )

    self.log.info(log_str)
def collect_dags(self,
                 dag_folder=None,
                 only_if_updated=True,
                 include_examples=conf.getboolean('core', 'LOAD_EXAMPLES'),
                 safe_mode=conf.getboolean('core', 'DAG_DISCOVERY_SAFE_MODE')):
    """
    Given a file path or a folder, this method looks for python modules,
    imports them and adds them to the dagbag collection.

    Note that if a ``.airflowignore`` file is found while processing
    the directory, it will behave much like a ``.gitignore``,
    ignoring files that match any of the regex patterns specified
    in the file.

    **Note**: The patterns in .airflowignore are treated as
    un-anchored regexes, not shell-like glob patterns.
    """
    if self.store_serialized_dags:
        return

    self.log.info("Filling up the DagBag from %s", dag_folder)
    start_dttm = timezone.utcnow()
    dag_folder = dag_folder or self.dag_folder
    # Used to store stats around DagBag processing
    stats = []
    FileLoadStat = namedtuple(
        'FileLoadStat', "file duration dag_num task_num dags")

    dag_folder = correct_maybe_zipped(dag_folder)

    for filepath in list_py_file_paths(dag_folder, safe_mode=safe_mode,
                                       include_examples=include_examples):
        try:
            ts = timezone.utcnow()
            found_dags = self.process_file(
                filepath, only_if_updated=only_if_updated,
                safe_mode=safe_mode)
            dag_ids = [dag.dag_id for dag in found_dags]
            dag_id_names = str(dag_ids)

            td = timezone.utcnow() - ts
            td = td.total_seconds() + (float(td.microseconds) / 1000000)
            stats.append(FileLoadStat(
                filepath.replace(settings.DAGS_FOLDER, ''),
                td,
                len(found_dags),
                sum([len(dag.tasks) for dag in found_dags]),
                dag_id_names,
            ))
        except Exception as e:
            self.log.exception(e)
    Stats.gauge(
        'collect_dags', (timezone.utcnow() - start_dttm).total_seconds(), 1)
    Stats.gauge(
        'dagbag_size', len(self.dags), 1)
    Stats.gauge(
        'dagbag_import_errors', len(self.import_errors), 1)
    self.dagbag_stats = sorted(
        stats, key=lambda x: x.duration, reverse=True)
    for file_stat in self.dagbag_stats:
        # file_stat.file similar format: /subdir/dag_name.py
        # TODO: Remove for Airflow 2.0
        filename = file_stat.file.split('/')[-1].replace('.py', '')
        Stats.timing('dag.loading-duration.{}'.format(filename),
                     file_stat.duration)
def test_next_execution(self):
    # A scaffolding function
    def reset_dr_db(dag_id):
        session = Session()
        dr = session.query(models.DagRun).filter_by(dag_id=dag_id)
        dr.delete()
        session.commit()
        session.close()

    EXAMPLE_DAGS_FOLDER = os.path.join(
        os.path.dirname(
            os.path.dirname(
                os.path.dirname(os.path.realpath(__file__))
            )
        ),
        "airflow/example_dags"
    )

    dagbag = models.DagBag(dag_folder=EXAMPLE_DAGS_FOLDER, include_examples=False)
    dag_ids = ['example_bash_operator',  # schedule_interval is '0 0 * * *'
               'latest_only',  # schedule_interval is timedelta(hours=4)
               'example_python_operator',  # schedule_interval=None
               'example_xcom']  # schedule_interval="@once"

    # The details below are determined by the schedule_interval of the example DAGs
    now = timezone.utcnow()
    next_execution_time_for_dag1 = pytz.utc.localize(
        datetime.combine(
            now.date() + timedelta(days=1),
            time(0)
        )
    )
    next_execution_time_for_dag2 = now + timedelta(hours=4)
    expected_output = [str(next_execution_time_for_dag1),
                       str(next_execution_time_for_dag2),
                       "None",
                       "None"]

    for i in range(len(dag_ids)):
        dag_id = dag_ids[i]

        # Clear dag runs so there is no execution history for each DAG
        reset_dr_db(dag_id)

        p = subprocess.Popen(["airflow", "next_execution", dag_id,
                              "--subdir", EXAMPLE_DAGS_FOLDER],
                             stdout=subprocess.PIPE)
        p.wait()
        stdout = []
        for line in p.stdout:
            stdout.append(str(line.decode("utf-8").rstrip()))

        # `next_execution` is inapplicable if no execution record is found.
        # It prints `None` in such cases.
        self.assertEqual(stdout[-1], "None")

        dag = dagbag.dags[dag_id]
        # Create a DagRun for each DAG, to prepare for the next step
        dag.create_dagrun(
            run_id='manual__' + now.isoformat(),
            execution_date=now,
            start_date=now,
            state=State.FAILED
        )

        p = subprocess.Popen(["airflow", "next_execution", dag_id,
                              "--subdir", EXAMPLE_DAGS_FOLDER],
                             stdout=subprocess.PIPE)
        p.wait()
        stdout = []
        for line in p.stdout:
            stdout.append(str(line.decode("utf-8").rstrip()))
        self.assertEqual(stdout[-1], expected_output[i])

        reset_dr_db(dag_id)
def execute(self, context: Context): if isinstance(self.execution_date, datetime.datetime): parsed_execution_date = self.execution_date elif isinstance(self.execution_date, str): parsed_execution_date = timezone.parse(self.execution_date) else: parsed_execution_date = timezone.utcnow() if self.trigger_run_id: run_id = self.trigger_run_id else: run_id = DagRun.generate_run_id(DagRunType.MANUAL, parsed_execution_date) try: dag_run = trigger_dag( dag_id=self.trigger_dag_id, run_id=run_id, conf=self.conf, execution_date=parsed_execution_date, replace_microseconds=False, ) except DagRunAlreadyExists as e: if self.reset_dag_run: self.log.info("Clearing %s on %s", self.trigger_dag_id, parsed_execution_date) # Get target dag object and call clear() dag_model = DagModel.get_current(self.trigger_dag_id) if dag_model is None: raise DagNotFound( f"Dag id {self.trigger_dag_id} not found in DagModel") dag_bag = DagBag(dag_folder=dag_model.fileloc, read_dags_from_db=True) dag = dag_bag.get_dag(self.trigger_dag_id) dag.clear(start_date=parsed_execution_date, end_date=parsed_execution_date) dag_run = DagRun.find(dag_id=dag.dag_id, run_id=run_id)[0] else: raise e if dag_run is None: raise RuntimeError("The dag_run should be set here!") # Store the execution date from the dag run (either created or found above) to # be used when creating the extra link on the webserver. ti = context['task_instance'] ti.xcom_push(key=XCOM_EXECUTION_DATE_ISO, value=dag_run.execution_date.isoformat()) ti.xcom_push(key=XCOM_RUN_ID, value=dag_run.run_id) if self.wait_for_completion: # wait for dag to complete while True: self.log.info( 'Waiting for %s on %s to become allowed state %s ...', self.trigger_dag_id, dag_run.execution_date, self.allowed_states, ) time.sleep(self.poke_interval) dag_run.refresh_from_db() state = dag_run.state if state in self.failed_states: raise AirflowException( f"{self.trigger_dag_id} failed with failed states {state}" ) if state in self.allowed_states: self.log.info("%s finished with allowed state %s", self.trigger_dag_id, state) return
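# A minimal usage sketch for the execute() method above, assuming the standard
# Airflow 2.x import path for TriggerDagRunOperator. The keyword arguments
# mirror the attributes read in execute(): trigger_dag_id, conf, reset_dag_run,
# wait_for_completion, poke_interval, allowed_states and failed_states. The DAG
# and task ids are illustrative only.
from datetime import datetime

from airflow.models.dag import DAG
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from airflow.utils.state import State

with DAG(dag_id="parent_dag", start_date=datetime(2021, 1, 1),
         schedule_interval=None) as dag:
    trigger = TriggerDagRunOperator(
        task_id="trigger_child",
        trigger_dag_id="child_dag",        # DAG to trigger
        conf={"source": "parent_dag"},     # passed to the child DagRun
        reset_dag_run=True,                # clear an existing run instead of failing
        wait_for_completion=True,          # block until the child run reaches a terminal state
        poke_interval=30,                  # seconds between dag_run.refresh_from_db() checks
        allowed_states=[State.SUCCESS],
        failed_states=[State.FAILED],
    )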
def _mark_multi_state(self, operator, poke_hash, encoded_poke_context, state, session=None): """ Mark state for multiple tasks in the task_instance table to a new state if they have the same signature as the poke_hash. :param operator: The sensor's operator class name. :param poke_hash: The hash code generated from sensor's poke context. :param encoded_poke_context: The raw encoded poke_context. :param state: Set multiple sensor tasks to this state. :param session: The sqlalchemy session. """ def mark_state(ti, sensor_instance): ti.state = state sensor_instance.state = state if state in State.finished: ti.end_date = end_date ti.set_duration() SI = SensorInstance TI = TaskInstance count_marked = 0 query_result = [] try: query_result = (session.query(TI, SI).join( TI, and_( TI.dag_id == SI.dag_id, TI.task_id == SI.task_id, TI.execution_date == SI.execution_date, ), ).filter(SI.state == State.SENSING).filter( SI.hashcode == poke_hash).filter( SI.operator == operator).with_for_update().all()) end_date = timezone.utcnow() for ti, sensor_instance in query_result: if sensor_instance.poke_context != encoded_poke_context: continue ti.hostname = self.hostname if ti.state == State.SENSING: mark_state(ti=ti, sensor_instance=sensor_instance) count_marked += 1 else: # ti.state != State.SENSING sensor_instance.state = ti.state session.commit() except Exception: # pylint: disable=broad-except self.log.warning( "Exception _mark_multi_state in smart sensor for hashcode %s", str(poke_hash), # cast to str in advance for highlighting exc_info=True, ) self.log.info("Marked %s tasks out of %s to state %s", count_marked, len(query_result), state)
def date_range(start_date, end_date=None, num=None, delta=None):
    """
    Get a set of dates as a list based on a start, end and delta. The delta
    can be anything that can be added to ``datetime.datetime``
    or a cron expression as a ``str``

    :param start_date: anchor date to start the series from
    :type start_date: datetime.datetime
    :param end_date: right boundary for the date range
    :type end_date: datetime.datetime
    :param num: alternatively to end_date, you can specify the number of
        entries you want in the range. This number can be negative,
        output will always be sorted regardless
    :type num: int
    :param delta: step to advance by; either a ``datetime.timedelta`` or a
        cron expression as a ``str``
    :type delta: datetime.timedelta or str

    >>> date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta=timedelta(1))
    [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0), datetime.datetime(2016, 1, 3, 0, 0)]
    >>> date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta='0 0 * * *')
    [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0), datetime.datetime(2016, 1, 3, 0, 0)]
    >>> date_range(datetime(2016, 1, 1), datetime(2016, 3, 3), delta="0 0 0 * *")
    [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0), datetime.datetime(2016, 3, 1, 0, 0)]
    """
    if not delta:
        return []
    if end_date and start_date > end_date:
        raise Exception("Wait. start_date needs to be before end_date")
    if end_date and num:
        raise Exception("Wait. Either specify end_date OR num")
    if not end_date and not num:
        end_date = timezone.utcnow()

    delta_iscron = False
    tz = start_date.tzinfo
    if isinstance(delta, six.string_types):
        delta_iscron = True
        start_date = timezone.make_naive(start_date, tz)
        cron = croniter(delta, start_date)
    elif isinstance(delta, timedelta):
        delta = abs(delta)

    dates = []
    if end_date:
        if timezone.is_naive(start_date):
            end_date = timezone.make_naive(end_date, tz)
        while start_date <= end_date:
            if timezone.is_naive(start_date):
                dates.append(timezone.make_aware(start_date, tz))
            else:
                dates.append(start_date)

            if delta_iscron:
                start_date = cron.get_next(datetime)
            else:
                start_date += delta
    else:
        for _ in range(abs(num)):
            if timezone.is_naive(start_date):
                dates.append(timezone.make_aware(start_date, tz))
            else:
                dates.append(start_date)

            if delta_iscron:
                if num > 0:
                    start_date = cron.get_next(datetime)
                else:
                    start_date = cron.get_prev(datetime)
            else:
                if num > 0:
                    start_date += delta
                else:
                    start_date -= delta
    return sorted(dates)
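# A minimal sketch exercising the `num` branch of date_range() above: with a
# negative `num` the series walks backwards from the anchor date, but the
# result is still returned in ascending order. When the anchor is naive, the
# returned datetimes are made aware with the configured default timezone.
from datetime import datetime, timedelta

dates = date_range(datetime(2016, 1, 3), num=-3, delta=timedelta(days=1))

# Three entries ending at the anchor date: Jan 1, Jan 2, Jan 3.
assert len(dates) == 3
assert dates[0] < dates[1] < dates[2]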
# limitations under the License. import json import unittest from freezegun import freeze_time from mock import patch from airflow import AirflowException from airflow.api.client.local_client import Client from airflow import models from airflow import settings from airflow.utils import timezone from airflow.utils.state import State EXECDATE = timezone.utcnow() EXECDATE_NOFRACTIONS = EXECDATE.replace(microsecond=0) EXECDATE_ISO = EXECDATE_NOFRACTIONS.isoformat() class TestLocalClient(unittest.TestCase): @classmethod def setUpClass(cls): super(TestLocalClient, cls).setUpClass() session = settings.Session() session.query(models.Pool).delete() session.commit() session.close() def setUp(self): super(TestLocalClient, self).setUp()
def test_clear(self): self.dag.clear( start_date=DEFAULT_DATE, end_date=timezone.utcnow())
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from airflow.models.dag import DAG
from airflow.utils import timezone
from airflow.ti_deps.met_handlers.aiflow_met_handler import AIFlowMetHandler
from airflow.models.event import Event
from airflow.operators.send_event_operator import SendEventOperator
from airflow.operators.bash_operator import BashOperator

dag = DAG(dag_id='event_scheduler_example_bash_dag',
          start_date=timezone.utcnow(),
          schedule_interval="@once")
op_0 = BashOperator(task_id='0_job', dag=dag, bash_command='echo "0 hello world!"')
op_1 = BashOperator(task_id='1_job', dag=dag, bash_command='echo "1 hello world!"')
op_2 = SendEventOperator(task_id='2_job',
                         dag=dag,
                         uri='localhost:50051',
                         event=Event(key='key_1', value='value_1', event_type='UNDEFINED'))
op_3 = SendEventOperator(task_id='3_job', dag=dag,
def get_dag(self, dag_id, session: Session = None): """ Gets the DAG out of the dictionary, and refreshes it if expired :param dag_id: DAG Id :type dag_id: str """ # Avoid circular import from airflow.models.dag import DagModel if self.read_dags_from_db: # Import here so that serialized dag is only imported when serialization is enabled from airflow.models.serialized_dag import SerializedDagModel if dag_id not in self.dags: # Load from DB if not (yet) in the bag self._add_dag_from_db(dag_id=dag_id, session=session) return self.dags.get(dag_id) # If DAG is in the DagBag, check the following # 1. if time has come to check if DAG is updated (controlled by min_serialized_dag_fetch_secs) # 2. check the last_updated column in SerializedDag table to see if Serialized DAG is updated # 3. if (2) is yes, fetch the Serialized DAG. min_serialized_dag_fetch_secs = timedelta( seconds=settings.MIN_SERIALIZED_DAG_FETCH_INTERVAL) if (dag_id in self.dags_last_fetched and timezone.utcnow() > self.dags_last_fetched[dag_id] + min_serialized_dag_fetch_secs): sd_last_updated_datetime = SerializedDagModel.get_last_updated_datetime( dag_id=dag_id, session=session, ) if sd_last_updated_datetime and sd_last_updated_datetime > self.dags_last_fetched[ dag_id]: self._add_dag_from_db(dag_id=dag_id, session=session) return self.dags.get(dag_id) # If asking for a known subdag, we want to refresh the parent dag = None root_dag_id = dag_id if dag_id in self.dags: dag = self.dags[dag_id] if dag.is_subdag: root_dag_id = dag.parent_dag.dag_id # type: ignore # If DAG Model is absent, we can't check last_expired property. Is the DAG not yet synchronized? orm_dag = DagModel.get_current(root_dag_id, session=session) if not orm_dag: return self.dags.get(dag_id) # If the dag corresponding to root_dag_id is absent or expired is_missing = root_dag_id not in self.dags is_expired = orm_dag.last_expired and dag and dag.last_loaded < orm_dag.last_expired if is_missing or is_expired: # Reprocess source file found_dags = self.process_file(filepath=correct_maybe_zipped( orm_dag.fileloc), only_if_updated=False) # If the source file no longer exports `dag_id`, delete it from self.dags if found_dags and dag_id in [ found_dag.dag_id for found_dag in found_dags ]: return self.dags[dag_id] elif dag_id in self.dags: del self.dags[dag_id] return self.dags.get(dag_id)
def poke(self, context): self.log.info('Checking if the time (%s) has come', self.target_time) return timezone.make_naive(timezone.utcnow(), self.dag.timezone).time() > self.target_time
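# Hedged usage sketch: assuming the poke() above belongs to a sensor equivalent
# to Airflow's TimeSensor (import path as in Airflow 2.x), the sensor succeeds
# once the current wall-clock time in the DAG's timezone passes `target_time`.
# The DAG and task ids here are illustrative only.
from datetime import datetime, time

from airflow.models.dag import DAG
from airflow.sensors.time_sensor import TimeSensor

with DAG(dag_id="wait_until_evening", start_date=datetime(2021, 1, 1),
         schedule_interval="@daily") as dag:
    wait = TimeSensor(
        task_id="wait_until_18_00",
        target_time=time(18, 0),  # poke() returns True after 18:00 DAG-local time
    )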
def _per_task_process(key, ti, session=None):  # pylint: disable=too-many-return-statements
    ti.refresh_from_db(lock_for_update=True, session=session)

    task = self.dag.get_task(ti.task_id, include_subdags=True)
    ti.task = task

    ignore_depends_on_past = self.ignore_first_depends_on_past and ti.execution_date == (
        start_date or ti.start_date)
    self.log.debug("Task instance to run %s state %s", ti, ti.state)

    # The task was already marked successful or skipped by a
    # different Job. Don't rerun it.
    if ti.state == State.SUCCESS:
        ti_status.succeeded.add(key)
        self.log.debug("Task instance %s succeeded. Don't rerun.", ti)
        ti_status.to_run.pop(key)
        if key in ti_status.running:
            ti_status.running.pop(key)
        return
    elif ti.state == State.SKIPPED:
        ti_status.skipped.add(key)
        self.log.debug("Task instance %s skipped. Don't rerun.", ti)
        ti_status.to_run.pop(key)
        if key in ti_status.running:
            ti_status.running.pop(key)
        return

    # guard against externally modified task instances or
    # in case max concurrency has been reached at task runtime
    elif ti.state == State.NONE:
        self.log.warning(
            "FIXME: task instance %s state was set to None "
            "externally. This should not happen", ti)
        ti.set_state(State.SCHEDULED, session=session)
    if self.rerun_failed_tasks:
        # Rerun failed tasks or upstream failed tasks
        if ti.state in (State.FAILED, State.UPSTREAM_FAILED):
            self.log.error("Task instance %s with state %s", ti, ti.state)
            if key in ti_status.running:
                ti_status.running.pop(key)
            # Reset the failed task in backfill to scheduled state
            ti.set_state(State.SCHEDULED, session=session)
    else:
        # Default behaviour which works for subdag.
        if ti.state in (State.FAILED, State.UPSTREAM_FAILED):
            self.log.error("Task instance %s with state %s", ti, ti.state)
            ti_status.failed.add(key)
            ti_status.to_run.pop(key)
            if key in ti_status.running:
                ti_status.running.pop(key)
            return

    backfill_context = DepContext(
        deps=BACKFILL_QUEUED_DEPS,
        ignore_depends_on_past=ignore_depends_on_past,
        ignore_task_deps=self.ignore_task_deps,
        flag_upstream_failed=True,
    )

    # Is the task runnable? -- then run it
    # the dependency checker can change states of tis
    if ti.are_dependencies_met(dep_context=backfill_context,
                               session=session,
                               verbose=self.verbose):
        if executor.has_task(ti):
            self.log.debug(
                "Task Instance %s already in executor "
                "waiting for queue to clear", ti)
        else:
            self.log.debug('Sending %s to executor', ti)
            # Skip scheduled state, we are executing immediately
            ti.state = State.QUEUED
            ti.queued_by_job_id = self.id
            ti.queued_dttm = timezone.utcnow()
            session.merge(ti)

            cfg_path = None
            if self.executor_class in (
                executor_constants.LOCAL_EXECUTOR,
                executor_constants.SEQUENTIAL_EXECUTOR,
            ):
                cfg_path = tmp_configuration_copy()

            executor.queue_task_instance(
                ti,
                mark_success=self.mark_success,
                pickle_id=pickle_id,
                ignore_task_deps=self.ignore_task_deps,
                ignore_depends_on_past=ignore_depends_on_past,
                pool=self.pool,
                cfg_path=cfg_path,
            )
            ti_status.running[key] = ti
            ti_status.to_run.pop(key)
        session.commit()
        return

    if ti.state == State.UPSTREAM_FAILED:
        self.log.error("Task instance %s upstream failed", ti)
        ti_status.failed.add(key)
        ti_status.to_run.pop(key)
        if key in ti_status.running:
            ti_status.running.pop(key)
        return

    # special case
    if ti.state == State.UP_FOR_RETRY:
        self.log.debug("Task instance %s retry period not "
                       "expired yet", ti)
        if key in ti_status.running:
            ti_status.running.pop(key)
        ti_status.to_run[key] = ti
        return

    # special case
    if ti.state == State.UP_FOR_RESCHEDULE:
        self.log.debug("Task instance %s reschedule period not "
                       "expired yet", ti)
        if key in ti_status.running:
            ti_status.running.pop(key)
        ti_status.to_run[key] = ti
        return

    # all remaining tasks
    self.log.debug('Adding %s to not_ready', ti)
    ti_status.not_ready.add(key)
def _retry_or_fail_task(self, sensor_work, error, session=None):
    """
    Change the state of a single sensor task. For a final state, also set the
    end_date. Since the smart sensor handles all retries in one process, a
    failed sensor task has logically experienced all of its retries, so its
    try_number should be set to max_tries.

    :param sensor_work: The sensor_work with exception.
    :type sensor_work: SensorWork
    :param error: The error message for this sensor_work.
    :type error: str.
    :param session: The sqlalchemy session.
    """

    def email_alert(task_instance, error_info):
        try:
            subject, html_content, _ = task_instance.get_email_subject_content(error_info)
            email = sensor_work.execution_context.get('email')

            send_email(email, subject, html_content)
        except Exception:  # pylint: disable=broad-except
            sensor_work.log.warning("Exception while sending the alert email.", exc_info=True)

    def handle_failure(sensor_work, ti):
        if sensor_work.execution_context.get('retries') and ti.try_number <= ti.max_tries:
            # retry
            ti.state = State.UP_FOR_RETRY
            if sensor_work.execution_context.get('email_on_retry') and \
                    sensor_work.execution_context.get('email'):
                sensor_work.log.info("%s sending email alert for retry", sensor_work.ti_key)
                email_alert(ti, error)
        else:
            ti.state = State.FAILED
            if sensor_work.execution_context.get('email_on_failure') and \
                    sensor_work.execution_context.get('email'):
                sensor_work.log.info("%s sending email alert for failure", sensor_work.ti_key)
                email_alert(ti, error)

    try:
        dag_id, task_id, execution_date = sensor_work.ti_key
        TI = TaskInstance
        SI = SensorInstance
        sensor_instance = (session.query(SI).filter(
            SI.dag_id == dag_id,
            SI.task_id == task_id,
            SI.execution_date == execution_date).with_for_update().first())

        if sensor_instance.hashcode != sensor_work.hashcode:
            # Return without setting state
            return

        ti = (session.query(TI).filter(
            TI.dag_id == dag_id,
            TI.task_id == task_id,
            TI.execution_date == execution_date).with_for_update().first())

        if ti:
            if ti.state == State.SENSING:
                ti.hostname = self.hostname
                handle_failure(sensor_work, ti)

                sensor_instance.state = State.FAILED
                ti.end_date = timezone.utcnow()
                ti.set_duration()
            else:
                sensor_instance.state = ti.state
            session.merge(sensor_instance)
            session.merge(ti)
        session.commit()

        sensor_work.log.info(
            "Task %s got an error: %s. Set the state to failed. Exit.",
            str(sensor_work.ti_key), error)
        sensor_work.close_sensor_logger()
    except AirflowException:
        sensor_work.log.warning("Exception on failing %s", sensor_work.ti_key, exc_info=True)
def dt(attr): f = attr.get(attr_name) f = f.isoformat() if f else '' if timezone.utcnow().isoformat()[:4] == f[:4]: f = f[5:] return Markup("<nobr>{}</nobr>").format(f)
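# dt() above closes over `attr_name`, so it is presumably produced by a small
# factory along these lines (the factory name and enclosing structure here are
# illustrative, not taken from the snippet): the factory fixes the column name
# and returns the formatter used to render a table cell.
from markupsafe import Markup

from airflow.utils import timezone


def datetime_formatter(attr_name):
    """Return a formatter that renders model[attr_name] as a compact <nobr> cell."""

    def dt(attr):
        f = attr.get(attr_name)
        f = f.isoformat() if f else ''
        # Drop the year prefix when the value falls in the current year.
        if timezone.utcnow().isoformat()[:4] == f[:4]:
            f = f[5:]
        return Markup("<nobr>{}</nobr>").format(f)

    return dt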
def manage_slas(self, dag: DAG, session: Session = None) -> None:
    """
    Find all tasks that have SLAs defined and send alert emails when needed.
    New SLA misses are also recorded in the database.

    We are assuming that the scheduler runs often, so we only check for
    tasks that should have succeeded in the past hour.
    """
    self.log.info("Running SLA Checks for %s", dag.dag_id)
    if not any(isinstance(ti.sla, timedelta) for ti in dag.tasks):
        self.log.info("Skipping SLA check for %s because no tasks in DAG have SLAs", dag)
        return

    qry = (
        session.query(TI.task_id, func.max(DR.execution_date).label('max_ti'))
        .join(TI.dag_run)
        .with_hint(TI, 'USE INDEX (PRIMARY)', dialect_name='mysql')
        .filter(TI.dag_id == dag.dag_id)
        .filter(or_(TI.state == State.SUCCESS, TI.state == State.SKIPPED))
        .filter(TI.task_id.in_(dag.task_ids))
        .group_by(TI.task_id)
        .subquery('sq')
    )
    # get recorded SlaMiss
    recorded_slas_query = set(
        session.query(SlaMiss.dag_id, SlaMiss.task_id, SlaMiss.execution_date).filter(
            SlaMiss.dag_id == dag.dag_id, SlaMiss.task_id.in_(dag.task_ids)
        )
    )

    max_tis: Iterator[TI] = (
        session.query(TI)
        .join(TI.dag_run)
        .filter(
            TI.dag_id == dag.dag_id,
            TI.task_id == qry.c.task_id,
            DR.execution_date == qry.c.max_ti,
        )
    )

    ts = timezone.utcnow()

    for ti in max_tis:
        task = dag.get_task(ti.task_id)
        if not task.sla:
            continue

        if not isinstance(task.sla, timedelta):
            raise TypeError(
                f"SLA is expected to be timedelta object, got "
                f"{type(task.sla)} in {task.dag_id}:{task.task_id}"
            )

        sla_misses = []
        next_info = dag.next_dagrun_info(dag.get_run_data_interval(ti.dag_run), restricted=False)
        if next_info is None:
            self.log.info("Skipping SLA check for %s because task does not have scheduled date", ti)
        else:
            while next_info.logical_date < ts:
                next_info = dag.next_dagrun_info(next_info.data_interval, restricted=False)

                if next_info is None:
                    break
                if (ti.dag_id, ti.task_id, next_info.logical_date) in recorded_slas_query:
                    break
                if next_info.logical_date + task.sla < ts:
                    sla_miss = SlaMiss(
                        task_id=ti.task_id,
                        dag_id=ti.dag_id,
                        execution_date=next_info.logical_date,
                        timestamp=ts,
                    )
                    sla_misses.append(sla_miss)
        if sla_misses:
            session.add_all(sla_misses)
    session.commit()

    slas: List[SlaMiss] = (
        session.query(SlaMiss)
        .filter(SlaMiss.notification_sent == False, SlaMiss.dag_id == dag.dag_id)  # noqa
        .all()
    )
    if slas:
        sla_dates: List[datetime.datetime] = [sla.execution_date for sla in slas]
        fetched_tis: List[TI] = (
            session.query(TI)
            .filter(
                TI.state != State.SUCCESS,
                TI.execution_date.in_(sla_dates),
                TI.dag_id == dag.dag_id,
            )
            .all()
        )
        blocking_tis: List[TI] = []
        for ti in fetched_tis:
            if ti.task_id in dag.task_ids:
                ti.task = dag.get_task(ti.task_id)
                blocking_tis.append(ti)
            else:
                session.delete(ti)
                session.commit()

        task_list = "\n".join(
            sla.task_id + ' on ' + sla.execution_date.isoformat() for sla in slas)
        blocking_task_list = "\n".join(
            ti.task_id + ' on ' + ti.execution_date.isoformat() for ti in blocking_tis)
        # Track whether email or any alert notification sent
        # We consider email or the alert callback as notifications
        email_sent = False
        notification_sent = False
        if dag.sla_miss_callback:
            # Execute the alert callback
            self.log.info('Calling SLA miss callback')
            try:
                dag.sla_miss_callback(dag, task_list, blocking_task_list, slas, blocking_tis)
                notification_sent = True
            except Exception:
                self.log.exception("Could not call sla_miss_callback for DAG %s", dag.dag_id)
        email_content = f"""\
Here's a list of tasks that missed their SLAs:
<pre><code>{task_list}\n</code></pre>
Blocking tasks:
<pre><code>{blocking_task_list}</code></pre>
Airflow
Webserver URL: {conf.get(section='webserver', key='base_url')} """ tasks_missed_sla = [] for sla in slas: try: task = dag.get_task(sla.task_id) except TaskNotFound: # task already deleted from DAG, skip it self.log.warning( "Task %s doesn't exist in DAG anymore, skipping SLA miss notification.", sla.task_id) continue tasks_missed_sla.append(task) emails: Set[str] = set() for task in tasks_missed_sla: if task.email: if isinstance(task.email, str): emails |= set(get_email_address_list(task.email)) elif isinstance(task.email, (list, tuple)): emails |= set(task.email) if emails: try: send_email(emails, f"[airflow] SLA miss on DAG={dag.dag_id}", email_content) email_sent = True notification_sent = True except Exception: Stats.incr('sla_email_notification_failure') self.log.exception( "Could not send SLA Miss email notification for DAG %s", dag.dag_id) # If we sent any notification, update the sla_miss table if notification_sent: for sla in slas: sla.email_sent = email_sent sla.notification_sent = True session.merge(sla) session.commit()
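# A minimal sketch of the DAG-side configuration that manage_slas() above
# consumes: a task-level `sla` and a DAG-level `sla_miss_callback` whose
# signature matches the five-argument call in the scheduler code. The DAG id,
# task id and import paths are illustrative (Airflow 2.x style).
from datetime import datetime, timedelta

from airflow.models.dag import DAG
from airflow.operators.bash import BashOperator


def notify_sla_miss(dag, task_list, blocking_task_list, slas, blocking_tis):
    # Invoked at most once per batch of newly recorded SLA misses.
    print(f"SLA missed in {dag.dag_id}:\n{task_list}")


with DAG(
    dag_id="sla_example",
    start_date=datetime(2021, 1, 1),
    schedule_interval="@hourly",
    sla_miss_callback=notify_sla_miss,
) as dag:
    BashOperator(
        task_id="quick_task",
        bash_command="sleep 5",
        sla=timedelta(minutes=10),  # expected to finish within 10 minutes
    )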
def heartbeat(self): """ This should be periodically called by the scheduler. This method will kick off new processes to process DAG definition files and read the results from the finished processors. :return: a list of SimpleDags that were produced by processors that have finished since the last time this was called :rtype: list[SimpleDag] """ finished_processors = {} """:type : dict[unicode, AbstractDagFileProcessor]""" running_processors = {} """:type : dict[unicode, AbstractDagFileProcessor]""" for file_path, processor in self._processors.items(): if processor.done: self.log.debug("Processor for %s finished", file_path) now = timezone.utcnow() finished_processors[file_path] = processor self._last_runtime[file_path] = ( now - processor.start_time).total_seconds() self._last_finish_time[file_path] = now self._run_count[file_path] += 1 else: running_processors[file_path] = processor self._processors = running_processors self.log.debug("%s/%s scheduler processes running", len(self._processors), self._parallelism) self.log.debug("%s file paths queued for processing", len(self._file_path_queue)) # Collect all the DAGs that were found in the processed files simple_dags = [] for file_path, processor in finished_processors.items(): if processor.result is None: self.log.warning( "Processor for %s exited with return code %s.", processor.file_path, processor.exit_code) else: for simple_dag in processor.result: simple_dags.append(simple_dag) # Generate more file paths to process if we processed all the files # already. if len(self._file_path_queue) == 0: # If the file path is already being processed, or if a file was # processed recently, wait until the next batch file_paths_in_progress = self._processors.keys() now = timezone.utcnow() file_paths_recently_processed = [] for file_path in self._file_paths: last_finish_time = self.get_last_finish_time(file_path) if (last_finish_time is not None and (now - last_finish_time).total_seconds() < self._process_file_interval): file_paths_recently_processed.append(file_path) files_paths_at_run_limit = [ file_path for file_path, num_runs in self._run_count.items() if num_runs == self._max_runs ] files_paths_to_queue = list( set(self._file_paths) - set(file_paths_in_progress) - set(file_paths_recently_processed) - set(files_paths_at_run_limit)) for file_path, processor in self._processors.items(): self.log.debug( "File path %s is still being processed (started: %s)", processor.file_path, processor.start_time.isoformat()) self.log.debug("Queuing the following files for processing:\n\t%s", "\n\t".join(files_paths_to_queue)) self._file_path_queue.extend(files_paths_to_queue) # Start more processors if we have enough slots and files to process while (self._parallelism - len(self._processors) > 0 and len(self._file_path_queue) > 0): file_path = self._file_path_queue.pop(0) processor = self._processor_factory(file_path) processor.start() self.log.debug( "Started a process (PID: %s) to generate tasks for %s", processor.pid, file_path) self._processors[file_path] = processor # Update scheduler heartbeat count. self._run_count[self._heart_beat_key] += 1 return simple_dags
def test_utcnow(self): now = timezone.utcnow() self.assertTrue(timezone.is_localized(now)) self.assertEqual(now.replace(tzinfo=None), now.astimezone(UTC).replace(tzinfo=None))
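# Illustrative contrast (not part of the test suite): airflow.utils.timezone.utcnow()
# returns an aware datetime pinned to UTC, unlike the naive stdlib utcnow().
from datetime import datetime

from airflow.utils import timezone

aware = timezone.utcnow()
naive = datetime.utcnow()

assert aware.tzinfo is not None   # aware: safe to compare across timezones
assert naive.tzinfo is None       # naive: comparing it with `aware` raises TypeError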