def execute(self, context):
    started_at = timezone.utcnow()
    self.hostname = get_hostname()
    while True:
        poke_start_time = timezone.utcnow()

        self.flush_cached_sensor_poke_results()

        self._load_sensor_works()
        self.log.info("Loaded %s sensor_works", len(self.sensor_works))
        Stats.gauge("smart_sensor_operator.loaded_tasks", len(self.sensor_works))

        for sensor_work in self.sensor_works:
            self._execute_sensor_work(sensor_work)

        duration = (timezone.utcnow() - poke_start_time).total_seconds()

        self.log.info("Taking %s to execute %s tasks.", duration, len(self.sensor_works))

        Stats.timing("smart_sensor_operator.loop_duration", duration)
        Stats.gauge("smart_sensor_operator.executed_tasks", len(self.sensor_works))
        self._emit_loop_stats()

        if duration < self.poke_interval:
            sleep(self.poke_interval - duration)
        if (timezone.utcnow() - started_at).total_seconds() > self.timeout:
            self.log.info("Time is out for smart sensor.")
            return

def _emit_loop_stats(self):
    try:
        count_poke = 0
        count_poke_success = 0
        count_poke_exception = 0
        count_exception_failures = 0
        count_infra_failure = 0
        for cached_work in self.cached_dedup_works.values():
            if cached_work.state is None:
                continue
            count_poke += 1
            if cached_work.state == PokeState.LANDED:
                count_poke_success += 1
            elif cached_work.state == PokeState.POKE_EXCEPTION:
                count_poke_exception += 1
        for cached_exception in self.cached_sensor_exceptions.values():
            if cached_exception.is_infra_failure and cached_exception.fail_current_run:
                count_infra_failure += 1
            if cached_exception.fail_current_run:
                count_exception_failures += 1

        Stats.gauge("smart_sensor_operator.poked_tasks", count_poke)
        Stats.gauge("smart_sensor_operator.poked_success", count_poke_success)
        Stats.gauge("smart_sensor_operator.poked_exception", count_poke_exception)
        Stats.gauge("smart_sensor_operator.exception_failures", count_exception_failures)
        Stats.gauge("smart_sensor_operator.infra_failures", count_infra_failure)
    except Exception as e:  # pylint: disable=broad-except
        # log.exception already records the traceback, so a single call suffices
        self.log.exception("Exception while emitting loop stats: %s", e)

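# Context for the snippets in this file: a minimal, hypothetical stand-in for
# the Stats facade they all call. This is only a sketch, assuming the python
# `statsd` client and a daemon on localhost:8125; Airflow's real Stats class
# additionally validates metric names and falls back to a no-op logger when
# metrics are disabled.
import statsd


class Stats:
    client = statsd.StatsClient(host="localhost", port=8125, prefix="airflow")

    @classmethod
    def gauge(cls, stat, value, rate=1, delta=False):
        # Gauges report a point-in-time value, e.g. queue depth or open slots.
        cls.client.gauge(stat, value, rate=rate, delta=delta)

    @classmethod
    def timing(cls, stat, dt):
        # Timers accept a number of milliseconds or a datetime.timedelta.
        cls.client.timing(stat, dt)
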
def _log_file_processing_stats(self, known_file_paths):
    """
    Print out stats about how files are getting processed.

    :param known_file_paths: a list of file paths that may contain Airflow
        DAG definitions
    :type known_file_paths: list[unicode]
    :return: None
    """
    # File Path: Path to the file containing the DAG definition
    # PID: PID associated with the process that's processing the file. May
    # be empty.
    # Runtime: If the process is currently running, how long it's been
    # running for in seconds.
    # Last Runtime: If the process ran before, how long did it take to
    # finish in seconds
    # Last Run: When the file finished processing in the previous run.
    headers = ["File Path", "PID", "Runtime", "Last Runtime", "Last Run"]

    rows = []
    for file_path in known_file_paths:
        last_runtime = self.get_last_runtime(file_path)
        file_name = os.path.basename(file_path)
        file_name = os.path.splitext(file_name)[0].replace(os.sep, '.')
        if last_runtime:
            Stats.gauge('dag_processing.last_runtime.{}'.format(file_name), last_runtime)

        processor_pid = self.get_pid(file_path)
        processor_start_time = self.get_start_time(file_path)
        runtime = ((timezone.utcnow() - processor_start_time).total_seconds()
                   if processor_start_time else None)
        last_run = self.get_last_finish_time(file_path)
        if last_run:
            seconds_ago = (timezone.utcnow() - last_run).total_seconds()
            Stats.gauge('dag_processing.last_run.seconds_ago.{}'.format(file_name),
                        seconds_ago)

        rows.append((file_path, processor_pid, runtime, last_runtime, last_run))

    # Sort by longest last runtime. (Can't sort None values in python3)
    rows = sorted(rows, key=lambda x: x[3] or 0.0)

    formatted_rows = []
    for file_path, pid, runtime, last_runtime, last_run in rows:
        formatted_rows.append((file_path,
                               pid,
                               "{:.2f}s".format(runtime) if runtime else None,
                               "{:.2f}s".format(last_runtime) if last_runtime else None,
                               last_run.strftime("%Y-%m-%dT%H:%M:%S") if last_run else None))
    log_str = ("\n" +
               "=" * 80 +
               "\n" +
               "DAG File Processing Stats\n\n" +
               tabulate(formatted_rows, headers=headers) +
               "\n" +
               "=" * 80)

    self.log.info(log_str)

def emit_metrics(self):
    """
    Emit metrics about dag parsing summary

    This is called once every time around the parsing "loop" - i.e. after
    all files have been parsed.
    """
    parse_time = (timezone.utcnow() - self._parsing_start_time).total_seconds()
    Stats.gauge('dag_processing.total_parse_time', parse_time)
    Stats.gauge('dagbag_size', sum(stat.num_dags for stat in self._file_stats.values()))
    Stats.gauge('dag_processing.import_errors',
                sum(stat.import_errors for stat in self._file_stats.values()))

    # TODO: Remove before Airflow 2.0
    Stats.gauge('collect_dags', parse_time)
    Stats.gauge('dagbag_import_errors',
                sum(stat.import_errors for stat in self._file_stats.values()))

def heartbeat(self) -> None:
    """
    Heartbeat sent to trigger new jobs.
    """
    if not self.parallelism:
        open_slots = len(self.queued_tasks)
    else:
        open_slots = self.parallelism - len(self.running)

    num_running_tasks = len(self.running)
    num_queued_tasks = len(self.queued_tasks)

    self.log.debug("%s running task instances", num_running_tasks)
    self.log.debug("%s in queue", num_queued_tasks)
    self.log.debug("%s open slots", open_slots)

    Stats.gauge('executor.open_slots', open_slots)
    Stats.gauge('executor.queued_tasks', num_queued_tasks)
    Stats.gauge('executor.running_tasks', num_running_tasks)

    self.trigger_tasks(open_slots)

    # Calling child class sync method
    self.log.debug("Calling the %s sync method", self.__class__)
    self.sync()

def heartbeat(self):
    # Triggering new jobs
    if not self.parallelism:
        open_slots = len(self.queued_tasks)
    else:
        open_slots = self.parallelism - len(self.running)

    num_running_tasks = len(self.running)
    num_queued_tasks = len(self.queued_tasks)

    self.log.debug("%s running task instances", num_running_tasks)
    self.log.debug("%s in queue", num_queued_tasks)
    self.log.debug("%s open slots", open_slots)

    Stats.gauge('executor.open_slots', open_slots)
    Stats.gauge('executor.queued_tasks', num_queued_tasks)
    Stats.gauge('executor.running_tasks', num_running_tasks)

    sorted_queue = sorted(
        [(k, v) for k, v in self.queued_tasks.items()],
        key=lambda x: x[1][1],
        reverse=True)

    for i in range(min((open_slots, len(self.queued_tasks)))):
        key, (command, _, queue, simple_ti) = sorted_queue.pop(0)
        self.queued_tasks.pop(key)
        self.running[key] = command
        self.execute_async(key=key,
                           command=command,
                           queue=queue,
                           executor_config=simple_ti.executor_config)

    # Calling child class sync method
    self.log.debug("Calling the %s sync method", self.__class__)
    self.sync()

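# A standalone sketch (hypothetical data, not Airflow code) of the ordering
# used in heartbeat() above. Each queued_tasks value is assumed to be a tuple
# whose second element is the priority weight, so key=lambda x: x[1][1] with
# reverse=True pops the highest-priority entries first.
queued_tasks = {
    "task_a": ("cmd_a", 1, "default", None),
    "task_b": ("cmd_b", 10, "default", None),
    "task_c": ("cmd_c", 5, "default", None),
}

sorted_queue = sorted(queued_tasks.items(), key=lambda x: x[1][1], reverse=True)
print([key for key, _ in sorted_queue])  # ['task_b', 'task_c', 'task_a']
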
def _emit_pool_metrics(self, session: Session = None) -> None:
    pools = models.Pool.slots_stats(session=session)
    for pool_name, slot_stats in pools.items():
        Stats.gauge(f'pool.open_slots.{pool_name}', slot_stats["open"])
        Stats.gauge(f'pool.queued_slots.{pool_name}', slot_stats[State.QUEUED])
        Stats.gauge(f'pool.running_slots.{pool_name}', slot_stats[State.RUNNING])

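# A hypothetical slots_stats() return value, to show the keys the loop above
# indexes and the metric names it emits: one open/queued/running triple per
# pool, with the pool name embedded in the metric key. In Airflow,
# State.QUEUED and State.RUNNING are the string constants "queued" and
# "running", which is why plain strings work in this sketch.
pools = {
    "default_pool": {"open": 120, "queued": 5, "running": 3},
    "backfill": {"open": 0, "queued": 2, "running": 8},
}
for pool_name, slot_stats in pools.items():
    print(f'pool.open_slots.{pool_name} -> {slot_stats["open"]}')
    print(f'pool.queued_slots.{pool_name} -> {slot_stats["queued"]}')
    print(f'pool.running_slots.{pool_name} -> {slot_stats["running"]}')
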
def collect_dags(
        self,
        dag_folder=None,
        only_if_updated=True,
        include_examples=configuration.conf.getboolean('core', 'LOAD_EXAMPLES'),
        safe_mode=configuration.conf.getboolean('core', 'DAG_DISCOVERY_SAFE_MODE')):
    """
    Given a file path or a folder, this method looks for python modules,
    imports them and adds them to the dagbag collection.

    Note that if a ``.airflowignore`` file is found while processing
    the directory, it will behave much like a ``.gitignore``,
    ignoring files that match any of the regex patterns specified
    in the file.

    **Note**: The patterns in .airflowignore are treated as
    un-anchored regexes, not shell-like glob patterns.
    """
    start_dttm = timezone.utcnow()
    dag_folder = dag_folder or self.dag_folder

    # Used to store stats around DagBag processing
    stats = []
    FileLoadStat = namedtuple('FileLoadStat', "file duration dag_num task_num dags")

    dag_folder = correct_maybe_zipped(dag_folder)

    for filepath in list_py_file_paths(dag_folder, safe_mode=safe_mode,
                                       include_examples=include_examples):
        try:
            ts = timezone.utcnow()
            found_dags = self.process_file(filepath, only_if_updated=only_if_updated,
                                           safe_mode=safe_mode)

            # total_seconds() already includes the sub-second component, so
            # adding microseconds on top of it would double-count them
            td = (timezone.utcnow() - ts).total_seconds()
            stats.append(
                FileLoadStat(
                    filepath.replace(dag_folder, ''),
                    td,
                    len(found_dags),
                    sum([len(dag.tasks) for dag in found_dags]),
                    str([dag.dag_id for dag in found_dags]),
                ))
        except Exception as e:
            self.log.exception(e)

    Stats.gauge('collect_dags', (timezone.utcnow() - start_dttm).total_seconds(), 1)
    Stats.gauge('dagbag_size', len(self.dags), 1)
    Stats.gauge('dagbag_import_errors', len(self.import_errors), 1)
    self.dagbag_stats = sorted(stats, key=lambda x: x.duration, reverse=True)

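# A minimal sketch (not part of the DagBag code) illustrating the docstring's
# point that .airflowignore patterns are un-anchored regexes rather than
# shell globs: a pattern matches if re.search finds it anywhere in the path.
# The patterns and file names here are hypothetical.
import re

ignore_patterns = ["vendor", r"_test\.py$"]
candidate_paths = ["dags/etl.py", "dags/vendor/lib.py", "dags/etl_test.py"]

for path in candidate_paths:
    ignored = any(re.search(pattern, path) for pattern in ignore_patterns)
    print(path, "ignored" if ignored else "kept")
# dags/etl.py kept
# dags/vendor/lib.py ignored  ("vendor" matches anywhere, no anchors needed)
# dags/etl_test.py ignored
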
def emit_metrics(self):
    """
    Emit metrics about dag parsing summary

    This is called once every time around the parsing "loop" - i.e. after
    all files have been parsed.
    """
    parse_time = time.perf_counter() - self._parsing_start_time
    Stats.gauge('dag_processing.total_parse_time', parse_time)
    Stats.gauge('dagbag_size', sum(stat.num_dags for stat in self._file_stats.values()))
    Stats.gauge(
        'dag_processing.import_errors',
        sum(stat.import_errors for stat in self._file_stats.values())
    )

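# A small illustration (hypothetical) of the timing pattern in the version
# above: time.perf_counter() is a monotonic clock, so the difference is
# immune to system clock adjustments, unlike subtracting two utcnow()
# readings as the earlier emit_metrics variant does.
import time

parsing_start_time = time.perf_counter()
time.sleep(0.1)  # stand-in for a parsing loop
parse_time = time.perf_counter() - parsing_start_time
print(f"total_parse_time={parse_time:.3f}s")
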
def collect_dags(self,
                 dag_folder=None,
                 only_if_updated=True,
                 include_examples=conf.getboolean('core', 'LOAD_EXAMPLES'),
                 safe_mode=conf.getboolean('core', 'DAG_DISCOVERY_SAFE_MODE')):
    """
    Given a file path or a folder, this method looks for python modules,
    imports them and adds them to the dagbag collection.

    Note that if a ``.airflowignore`` file is found while processing
    the directory, it will behave much like a ``.gitignore``,
    ignoring files that match any of the regex patterns specified
    in the file.

    **Note**: The patterns in .airflowignore are treated as
    un-anchored regexes, not shell-like glob patterns.
    """
    if self.store_serialized_dags:
        return

    self.log.info("Filling up the DagBag from %s", dag_folder)
    start_dttm = timezone.utcnow()
    dag_folder = dag_folder or self.dag_folder

    # Used to store stats around DagBag processing
    stats = []

    dag_folder = correct_maybe_zipped(dag_folder)

    for filepath in list_py_file_paths(dag_folder, safe_mode=safe_mode,
                                       include_examples=include_examples):
        try:
            file_parse_start_dttm = timezone.utcnow()
            found_dags = self.process_file(filepath, only_if_updated=only_if_updated,
                                           safe_mode=safe_mode)

            file_parse_end_dttm = timezone.utcnow()
            stats.append(FileLoadStat(
                file=filepath.replace(settings.DAGS_FOLDER, ''),
                duration=file_parse_end_dttm - file_parse_start_dttm,
                dag_num=len(found_dags),
                task_num=sum([len(dag.tasks) for dag in found_dags]),
                dags=str([dag.dag_id for dag in found_dags]),
            ))
        except Exception as e:  # pylint: disable=broad-except
            self.log.exception(e)

    end_dttm = timezone.utcnow()
    durations = (end_dttm - start_dttm).total_seconds()
    Stats.gauge('collect_dags', durations, 1)
    Stats.gauge('dagbag_size', len(self.dags), 1)
    Stats.gauge('dagbag_import_errors', len(self.import_errors), 1)
    self.dagbag_stats = sorted(stats, key=lambda x: x.duration, reverse=True)
    for file_stat in self.dagbag_stats:
        # file_stat.file similar format: /subdir/dag_name.py
        # TODO: Remove for Airflow 2.0
        filename = file_stat.file.split('/')[-1].replace('.py', '')
        Stats.timing('dag.loading-duration.{}'.format(filename), file_stat.duration)

def emit_metrics(self):
    Stats.gauge('triggers.running', len(self.runner.triggers))

def _executable_task_instances_to_queued(self, max_tis: int, session: Session = None) -> List[TI]:
    """
    Finds TIs that are ready for execution with respect to pool limits,
    dag max_active_tasks, executor state, and priority.

    :param max_tis: Maximum number of TIs to queue in this loop.
    :type max_tis: int
    :return: list[airflow.models.TaskInstance]
    """
    executable_tis: List[TI] = []

    # Get the pool settings. We get a lock on the pool rows, treating this as a "critical section"
    # Throws an exception if lock cannot be obtained, rather than blocking
    pools = models.Pool.slots_stats(lock_rows=True, session=session)

    # If the pools are full, there is no point doing anything!
    # If _somehow_ the pool is overfull, don't let the limit go negative - it breaks SQL
    pool_slots_free = max(0, sum(pool['open'] for pool in pools.values()))

    if pool_slots_free == 0:
        self.log.debug("All pools are full!")
        return executable_tis

    max_tis = min(max_tis, pool_slots_free)

    # Get all task instances associated with scheduled
    # DagRuns which are not backfilled, in the given states,
    # and the dag is not paused
    query = (
        session.query(TI)
        .join(TI.dag_run)
        .options(eagerload(TI.dag_run))
        .filter(DR.run_type != DagRunType.BACKFILL_JOB, DR.state != DagRunState.QUEUED)
        .join(TI.dag_model)
        .filter(not_(DM.is_paused))
        .filter(TI.state == State.SCHEDULED)
        .options(selectinload('dag_model'))
        .order_by(-TI.priority_weight, DR.execution_date)
    )
    starved_pools = [pool_name for pool_name, stats in pools.items() if stats['open'] <= 0]
    if starved_pools:
        query = query.filter(not_(TI.pool.in_(starved_pools)))

    query = query.limit(max_tis)

    task_instances_to_examine: List[TI] = with_row_locks(
        query,
        of=TI,
        session=session,
        **skip_locked(session=session),
    ).all()
    # TODO[HA]: This was wrong before anyway, as it only looked at a sub-set of dags, not everything.
    # Stats.gauge('scheduler.tasks.pending', len(task_instances_to_examine))

    if len(task_instances_to_examine) == 0:
        self.log.debug("No tasks to consider for execution.")
        return executable_tis

    # Put one task instance on each line
    task_instance_str = "\n\t".join(repr(x) for x in task_instances_to_examine)
    self.log.info("%s tasks up for execution:\n\t%s", len(task_instances_to_examine), task_instance_str)

    pool_to_task_instances: DefaultDict[str, List[TI]] = defaultdict(list)
    for task_instance in task_instances_to_examine:
        pool_to_task_instances[task_instance.pool].append(task_instance)

    # dag_id to # of running tasks and (dag_id, task_id) to # of running tasks.
    dag_max_active_tasks_map: DefaultDict[str, int]
    task_concurrency_map: DefaultDict[Tuple[str, str], int]
    dag_max_active_tasks_map, task_concurrency_map = self.__get_concurrency_maps(
        states=list(EXECUTION_STATES), session=session
    )

    num_tasks_in_executor = 0
    # Number of tasks that cannot be scheduled because of no open slot in pool
    num_starving_tasks_total = 0

    # Go through each pool, and queue up a task for execution if there are
    # any open slots in the pool.
    for pool, task_instances in pool_to_task_instances.items():
        pool_name = pool
        if pool not in pools:
            self.log.warning("Tasks using non-existent pool '%s' will not be scheduled", pool)
            continue

        open_slots = pools[pool]["open"]

        num_ready = len(task_instances)
        self.log.info(
            "Figuring out tasks to run in Pool(name=%s) with %s open slots "
            "and %s task instances ready to be queued",
            pool,
            open_slots,
            num_ready,
        )

        priority_sorted_task_instances = sorted(
            task_instances, key=lambda ti: (-ti.priority_weight, ti.execution_date)
        )

        num_starving_tasks = 0
        for current_index, task_instance in enumerate(priority_sorted_task_instances):
            if open_slots <= 0:
                self.log.info("Not scheduling since there are %s open slots in pool %s", open_slots, pool)
                # Can't schedule any more since there are no more open slots.
                num_unhandled = len(priority_sorted_task_instances) - current_index
                num_starving_tasks += num_unhandled
                num_starving_tasks_total += num_unhandled
                break

            # Check to make sure that the task max_active_tasks of the DAG hasn't been
            # reached.
            dag_id = task_instance.dag_id

            current_max_active_tasks_per_dag = dag_max_active_tasks_map[dag_id]
            max_active_tasks_per_dag_limit = task_instance.dag_model.max_active_tasks
            self.log.info(
                "DAG %s has %s/%s running and queued tasks",
                dag_id,
                current_max_active_tasks_per_dag,
                max_active_tasks_per_dag_limit,
            )
            if current_max_active_tasks_per_dag >= max_active_tasks_per_dag_limit:
                self.log.info(
                    "Not executing %s since the number of tasks running or queued "
                    "from DAG %s is >= to the DAG's max_active_tasks limit of %s",
                    task_instance,
                    dag_id,
                    max_active_tasks_per_dag_limit,
                )
                continue

            task_concurrency_limit: Optional[int] = None
            if task_instance.dag_model.has_task_concurrency_limits:
                # Many dags don't have a task_concurrency, so where we can avoid loading
                # the full serialized DAG, the better.
                serialized_dag = self.dagbag.get_dag(dag_id, session=session)
                if serialized_dag.has_task(task_instance.task_id):
                    task_concurrency_limit = serialized_dag.get_task(
                        task_instance.task_id
                    ).max_active_tis_per_dag

                if task_concurrency_limit is not None:
                    current_task_concurrency = task_concurrency_map[
                        (task_instance.dag_id, task_instance.task_id)
                    ]

                    if current_task_concurrency >= task_concurrency_limit:
                        self.log.info(
                            "Not executing %s since the task concurrency for"
                            " this task has been reached.",
                            task_instance,
                        )
                        continue

            if task_instance.pool_slots > open_slots:
                self.log.info(
                    "Not executing %s since it requires %s slots "
                    "but there are %s open slots in the pool %s.",
                    task_instance,
                    task_instance.pool_slots,
                    open_slots,
                    pool,
                )
                num_starving_tasks += 1
                num_starving_tasks_total += 1
                # Though we can execute tasks with lower priority if there's enough room
                continue

            executable_tis.append(task_instance)
            open_slots -= task_instance.pool_slots
            dag_max_active_tasks_map[dag_id] += 1
            task_concurrency_map[(task_instance.dag_id, task_instance.task_id)] += 1

        Stats.gauge(f'pool.starving_tasks.{pool_name}', num_starving_tasks)

    Stats.gauge('scheduler.tasks.starving', num_starving_tasks_total)
    Stats.gauge('scheduler.tasks.running', num_tasks_in_executor)
    Stats.gauge('scheduler.tasks.executable', len(executable_tis))

    task_instance_str = "\n\t".join(repr(x) for x in executable_tis)
    self.log.info("Setting the following tasks to queued state:\n\t%s", task_instance_str)
    if len(executable_tis) > 0:
        # set TIs to queued state
        filter_for_tis = TI.filter_for_tis(executable_tis)
        session.query(TI).filter(filter_for_tis).update(
            # TODO[ha]: should we use func.now()? How does that work with DB timezone
            # on mysql when it's not UTC?
            {TI.state: State.QUEUED, TI.queued_dttm: timezone.utcnow(), TI.queued_by_job_id: self.id},
            synchronize_session=False,
        )

        for ti in executable_tis:
            make_transient(ti)
    return executable_tis

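# A standalone sketch (with a hypothetical TaskInstance stand-in) of the
# per-pool ordering used above: sorting by (-priority_weight, execution_date)
# puts the highest-priority work first and breaks ties by the oldest
# execution date.
from collections import namedtuple
from datetime import datetime

TaskInstance = namedtuple("TaskInstance", "task_id priority_weight execution_date")
tis = [
    TaskInstance("a", 1, datetime(2021, 1, 2)),
    TaskInstance("b", 5, datetime(2021, 1, 3)),
    TaskInstance("c", 5, datetime(2021, 1, 1)),
]
ordered = sorted(tis, key=lambda ti: (-ti.priority_weight, ti.execution_date))
print([ti.task_id for ti in ordered])  # ['c', 'b', 'a']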