Code Example #1
    def execute(self, context):
        started_at = timezone.utcnow()

        self.hostname = get_hostname()
        while True:
            poke_start_time = timezone.utcnow()

            self.flush_cached_sensor_poke_results()

            self._load_sensor_works()
            self.log.info("Loaded %s sensor_works", len(self.sensor_works))
            Stats.gauge("smart_sensor_operator.loaded_tasks", len(self.sensor_works))

            for sensor_work in self.sensor_works:
                self._execute_sensor_work(sensor_work)

            duration = (timezone.utcnow() - poke_start_time).total_seconds()

            self.log.info("Taking %s to execute %s tasks.", duration, len(self.sensor_works))

            Stats.timing("smart_sensor_operator.loop_duration", duration)
            Stats.gauge("smart_sensor_operator.executed_tasks", len(self.sensor_works))
            self._emit_loop_stats()

            if duration < self.poke_interval:
                sleep(self.poke_interval - duration)
            if (timezone.utcnow() - started_at).total_seconds() > self.timeout:
                self.log.info("Time is out for smart senosr.")
                return
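
The loop above follows a fixed-cadence pattern: time one pass, emit the duration, sleep only for the remainder of the interval, and stop after an overall timeout. A minimal self-contained sketch of that pattern (the StatsStub class and do_work callable below are placeholders, not Airflow APIs):

import time
from datetime import datetime, timezone


class StatsStub:
    """Placeholder for a StatsD-style client; Airflow's Stats exposes gauge/timing similarly."""

    def gauge(self, name, value):
        print(f"gauge  {name}={value}")

    def timing(self, name, value):
        print(f"timing {name}={value}")


def run_fixed_cadence_loop(do_work, poke_interval=5.0, timeout=30.0, stats=StatsStub()):
    started_at = datetime.now(timezone.utc)
    while True:
        loop_start = datetime.now(timezone.utc)
        do_work()  # one pass over all pending work
        duration = (datetime.now(timezone.utc) - loop_start).total_seconds()
        stats.timing("loop_duration", duration)
        if duration < poke_interval:
            time.sleep(poke_interval - duration)  # sleep only the remaining part of the interval
        if (datetime.now(timezone.utc) - started_at).total_seconds() > timeout:
            return  # overall timeout reached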
Code Example #2
    def _emit_loop_stats(self):
        try:
            count_poke = 0
            count_poke_success = 0
            count_poke_exception = 0
            count_exception_failures = 0
            count_infra_failure = 0
            for cached_work in self.cached_dedup_works.values():
                if cached_work.state is None:
                    continue
                count_poke += 1
                if cached_work.state == PokeState.LANDED:
                    count_poke_success += 1
                elif cached_work.state == PokeState.POKE_EXCEPTION:
                    count_poke_exception += 1
            for cached_exception in self.cached_sensor_exceptions.values():
                if cached_exception.is_infra_failure and cached_exception.fail_current_run:
                    count_infra_failure += 1
                if cached_exception.fail_current_run:
                    count_exception_failures += 1

            Stats.gauge("smart_sensor_operator.poked_tasks", count_poke)
            Stats.gauge("smart_sensor_operator.poked_success", count_poke_success)
            Stats.gauge("smart_sensor_operator.poked_exception", count_poke_exception)
            Stats.gauge("smart_sensor_operator.exception_failures", count_exception_failures)
            Stats.gauge("smart_sensor_operator.infra_failures", count_infra_failure)
        except Exception as e:  # pylint: disable=broad-except
            self.log.exception("Exception at getting loop stats %s")
            self.log.exception(e, exc_info=True)
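
The counting logic above boils down to tallying the non-None states in a cache and emitting one gauge per tally. A compact sketch of the same idea, assuming a stats object with a gauge(name, value) method:

from collections import Counter


def emit_state_gauges(states, stats, prefix="smart_sensor_operator"):
    """Count non-None states and emit one gauge per state name."""
    counts = Counter(state for state in states if state is not None)
    stats.gauge(f"{prefix}.poked_tasks", sum(counts.values()))
    for state, count in counts.items():
        stats.gauge(f"{prefix}.poked_{state}", count)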
Code Example #3
    def _log_file_processing_stats(self, known_file_paths):
        """
        Print out stats about how files are getting processed.

        :param known_file_paths: a list of file paths that may contain Airflow
            DAG definitions
        :type known_file_paths: list[unicode]
        :return: None
        """

        # File Path: Path to the file containing the DAG definition
        # PID: PID associated with the process that's processing the file. May
        # be empty.
        # Runtime: If the process is currently running, how long it's been
        # running for in seconds.
        # Last Runtime: If the process ran before, how long did it take to
        # finish in seconds
        # Last Run: When the file finished processing in the previous run.
        headers = ["File Path", "PID", "Runtime", "Last Runtime", "Last Run"]

        rows = []
        for file_path in known_file_paths:
            last_runtime = self.get_last_runtime(file_path)
            file_name = os.path.basename(file_path)
            file_name = os.path.splitext(file_name)[0].replace(os.sep, '.')
            if last_runtime:
                Stats.gauge('dag_processing.last_runtime.{}'.format(file_name),
                            last_runtime)

            processor_pid = self.get_pid(file_path)
            processor_start_time = self.get_start_time(file_path)
            runtime = ((timezone.utcnow() -
                        processor_start_time).total_seconds()
                       if processor_start_time else None)
            last_run = self.get_last_finish_time(file_path)
            if last_run:
                seconds_ago = (timezone.utcnow() - last_run).total_seconds()
                Stats.gauge(
                    'dag_processing.last_run.seconds_ago.{}'.format(file_name),
                    seconds_ago)

            rows.append(
                (file_path, processor_pid, runtime, last_runtime, last_run))

        # Sort by longest last runtime. (Can't sort None values in python3)
        rows = sorted(rows, key=lambda x: x[3] or 0.0)

        formatted_rows = []
        for file_path, pid, runtime, last_runtime, last_run in rows:
            formatted_rows.append(
                (file_path, pid,
                 "{:.2f}s".format(runtime) if runtime else None,
                 "{:.2f}s".format(last_runtime) if last_runtime else None,
                 last_run.strftime("%Y-%m-%dT%H:%M:%S") if last_run else None))
        log_str = ("\n" + "=" * 80 + "\n" + "DAG File Processing Stats\n\n" +
                   tabulate(formatted_rows, headers=headers) + "\n" + "=" * 80)

        self.log.info(log_str)
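
Two details in the method above are easy to miss: rows with a None last runtime are sorted as 0.0 so Python 3's sorted() does not fail on mixed types, and tabulate renders the final table. A small standalone illustration (the file paths and runtimes are made up):

from tabulate import tabulate

headers = ["File Path", "Last Runtime"]
rows = [("dags/a.py", 1.2345), ("dags/b.py", None)]   # None = file has not run yet
rows = sorted(rows, key=lambda r: r[1] or 0.0)        # None sorts as 0.0
formatted = [(path, f"{rt:.2f}s" if rt else None) for path, rt in rows]
print(tabulate(formatted, headers=headers))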
Code Example #4
    def emit_metrics(self):
        """
        Emit metrics about dag parsing summary

        This is called once every time around the parsing "loop" - i.e. after
        all files have been parsed.
        """
        parse_time = (timezone.utcnow() - self._parsing_start_time).total_seconds()
        Stats.gauge('dag_processing.total_parse_time', parse_time)
        Stats.gauge('dagbag_size', sum(stat.num_dags for stat in self._file_stats.values()))
        Stats.gauge('dag_processing.import_errors',
                    sum(stat.import_errors for stat in self._file_stats.values()))

        # TODO: Remove before Airflow 2.0
        Stats.gauge('collect_dags', parse_time)
        Stats.gauge('dagbag_import_errors', sum(stat.import_errors for stat in self._file_stats.values()))
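
The gauges above are plain sums over per-file statistics records. A tiny sketch of that aggregation, using a hypothetical FileStat record in place of Airflow's internal stat object:

from typing import Dict, NamedTuple


class FileStat(NamedTuple):   # stand-in for the per-file stat record
    num_dags: int
    import_errors: int


file_stats: Dict[str, FileStat] = {
    "dags/a.py": FileStat(num_dags=3, import_errors=0),
    "dags/b.py": FileStat(num_dags=0, import_errors=1),
}
dagbag_size = sum(stat.num_dags for stat in file_stats.values())          # 3
import_errors = sum(stat.import_errors for stat in file_stats.values())   # 1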
Code Example #5
File: base_executor.py  Project: zorseti/airflow
    def heartbeat(self) -> None:
        """
        Heartbeat sent to trigger new jobs.
        """
        if not self.parallelism:
            open_slots = len(self.queued_tasks)
        else:
            open_slots = self.parallelism - len(self.running)

        num_running_tasks = len(self.running)
        num_queued_tasks = len(self.queued_tasks)

        self.log.debug("%s running task instances", num_running_tasks)
        self.log.debug("%s in queue", num_queued_tasks)
        self.log.debug("%s open slots", open_slots)

        Stats.gauge('executor.open_slots', open_slots)
        Stats.gauge('executor.queued_tasks', num_queued_tasks)
        Stats.gauge('executor.running_tasks', num_running_tasks)

        self.trigger_tasks(open_slots)

        # Calling child class sync method
        self.log.debug("Calling the %s sync method", self.__class__)
        self.sync()
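
The open-slot arithmetic is the only branching logic here: a parallelism of 0 means "unlimited", in which case the executor simply reports the queue length as its open slots. A one-function sketch:

def open_slots(parallelism: int, num_running: int, num_queued: int) -> int:
    """parallelism == 0 means unlimited: report the queue length instead of a difference."""
    return num_queued if not parallelism else parallelism - num_running


assert open_slots(parallelism=0, num_running=4, num_queued=7) == 7
assert open_slots(parallelism=32, num_running=4, num_queued=7) == 28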
Code Example #6
    def heartbeat(self):
        # Triggering new jobs
        if not self.parallelism:
            open_slots = len(self.queued_tasks)
        else:
            open_slots = self.parallelism - len(self.running)

        num_running_tasks = len(self.running)
        num_queued_tasks = len(self.queued_tasks)

        self.log.debug("%s running task instances", num_running_tasks)
        self.log.debug("%s in queue", num_queued_tasks)
        self.log.debug("%s open slots", open_slots)

        Stats.gauge('executor.open_slots', open_slots)
        Stats.gauge('executor.queued_tasks', num_queued_tasks)
        Stats.gauge('executor.running_tasks', num_running_tasks)

        sorted_queue = sorted(
            [(k, v) for k, v in self.queued_tasks.items()],
            key=lambda x: x[1][1],
            reverse=True)
        for i in range(min((open_slots, len(self.queued_tasks)))):
            key, (command, _, queue, simple_ti) = sorted_queue.pop(0)
            self.queued_tasks.pop(key)
            self.running[key] = command
            self.execute_async(key=key,
                               command=command,
                               queue=queue,
                               executor_config=simple_ti.executor_config)

        # Calling child class sync method
        self.log.debug("Calling the %s sync method", self.__class__)
        self.sync()
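
The queue-draining part sorts queued entries by the priority field stored in the value tuple and pops at most open_slots of them. A simplified sketch with made-up task keys and a two-field value of (command, priority):

queued = {
    "task_a": ("cmd_a", 1),
    "task_b": ("cmd_b", 5),
    "task_c": ("cmd_c", 3),
}
open_slots = 2

by_priority = sorted(queued.items(), key=lambda kv: kv[1][1], reverse=True)
to_run = [key for key, _ in by_priority[: min(open_slots, len(queued))]]
assert to_run == ["task_b", "task_c"]   # highest priority first, bounded by open slots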
Code Example #7
File: base_executor.py  Project: zyh1690/airflow
    def heartbeat(self):
        # Triggering new jobs
        if not self.parallelism:
            open_slots = len(self.queued_tasks)
        else:
            open_slots = self.parallelism - len(self.running)

        num_running_tasks = len(self.running)
        num_queued_tasks = len(self.queued_tasks)

        self.log.debug("%s running task instances", num_running_tasks)
        self.log.debug("%s in queue", num_queued_tasks)
        self.log.debug("%s open slots", open_slots)

        Stats.gauge('executor.open_slots', open_slots)
        Stats.gauge('executor.queued_tasks', num_queued_tasks)
        Stats.gauge('executor.running_tasks', num_running_tasks)

        sorted_queue = sorted([(k, v) for k, v in self.queued_tasks.items()],
                              key=lambda x: x[1][1],
                              reverse=True)
        for i in range(min((open_slots, len(self.queued_tasks)))):
            key, (command, _, queue, simple_ti) = sorted_queue.pop(0)
            self.queued_tasks.pop(key)
            self.running[key] = command
            self.execute_async(key=key,
                               command=command,
                               queue=queue,
                               executor_config=simple_ti.executor_config)

        # Calling child class sync method
        self.log.debug("Calling the %s sync method", self.__class__)
        self.sync()
Code Example #8
    def _emit_pool_metrics(self, session: Session = None) -> None:
        pools = models.Pool.slots_stats(session=session)
        for pool_name, slot_stats in pools.items():
            Stats.gauge(f'pool.open_slots.{pool_name}', slot_stats["open"])
            Stats.gauge(f'pool.queued_slots.{pool_name}', slot_stats[State.QUEUED])
            Stats.gauge(f'pool.running_slots.{pool_name}', slot_stats[State.RUNNING])
Code Example #9
File: dagbag.py  Project: shubhparekh/airflow
    def collect_dags(
        self,
        dag_folder=None,
        only_if_updated=True,
        include_examples=configuration.conf.getboolean('core',
                                                       'LOAD_EXAMPLES'),
        safe_mode=configuration.conf.getboolean('core',
                                                'DAG_DISCOVERY_SAFE_MODE')):
        """
        Given a file path or a folder, this method looks for python modules,
        imports them and adds them to the dagbag collection.

        Note that if a ``.airflowignore`` file is found while processing
        the directory, it will behave much like a ``.gitignore``,
        ignoring files that match any of the regex patterns specified
        in the file.

        **Note**: The patterns in .airflowignore are treated as
        un-anchored regexes, not shell-like glob patterns.
        """
        start_dttm = timezone.utcnow()
        dag_folder = dag_folder or self.dag_folder

        # Used to store stats around DagBag processing
        stats = []
        FileLoadStat = namedtuple('FileLoadStat',
                                  "file duration dag_num task_num dags")

        dag_folder = correct_maybe_zipped(dag_folder)

        for filepath in list_py_file_paths(dag_folder,
                                           safe_mode=safe_mode,
                                           include_examples=include_examples):
            try:
                ts = timezone.utcnow()
                found_dags = self.process_file(filepath,
                                               only_if_updated=only_if_updated,
                                               safe_mode=safe_mode)

                td = timezone.utcnow() - ts
                td = td.total_seconds()  # total_seconds() already includes the microsecond part
                stats.append(
                    FileLoadStat(
                        filepath.replace(dag_folder, ''),
                        td,
                        len(found_dags),
                        sum([len(dag.tasks) for dag in found_dags]),
                        str([dag.dag_id for dag in found_dags]),
                    ))
            except Exception as e:
                self.log.exception(e)
        Stats.gauge('collect_dags',
                    (timezone.utcnow() - start_dttm).total_seconds(), 1)
        Stats.gauge('dagbag_size', len(self.dags), 1)
        Stats.gauge('dagbag_import_errors', len(self.import_errors), 1)
        self.dagbag_stats = sorted(stats,
                                   key=lambda x: x.duration,
                                   reverse=True)
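
Note that timedelta.total_seconds() already includes the fractional (microsecond) part, which is why the per-file duration above needs no extra microsecond term; a quick check:

from datetime import timedelta

td = timedelta(seconds=2, microseconds=500_000)
assert td.total_seconds() == 2.5   # the fractional part is already included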
Code Example #10
File: dag_processing.py  Project: ysktir/airflow-1
    def emit_metrics(self):
        """
        Emit metrics about dag parsing summary

        This is called once every time around the parsing "loop" - i.e. after
        all files have been parsed.
        """
        parse_time = time.perf_counter() - self._parsing_start_time
        Stats.gauge('dag_processing.total_parse_time', parse_time)
        Stats.gauge('dagbag_size', sum(stat.num_dags for stat in self._file_stats.values()))
        Stats.gauge(
            'dag_processing.import_errors', sum(stat.import_errors for stat in self._file_stats.values())
        )
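
Unlike the earlier variant, this one measures parse time with time.perf_counter(), a monotonic clock that cannot jump backwards when the system clock is adjusted, which makes it the safer choice for durations. A side-by-side sketch of the two approaches:

import time
from datetime import datetime, timezone

# Monotonic timing (as in this example): immune to system clock changes.
start = time.perf_counter()
time.sleep(0.01)
elapsed = time.perf_counter() - start

# Wall-clock timing (as in the utcnow()-based examples): can jump if the clock is adjusted.
start_dt = datetime.now(timezone.utc)
time.sleep(0.01)
elapsed_wall = (datetime.now(timezone.utc) - start_dt).total_seconds()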
Code Example #11
    def collect_dags(self,
                     dag_folder=None,
                     only_if_updated=True,
                     include_examples=conf.getboolean('core', 'LOAD_EXAMPLES'),
                     safe_mode=conf.getboolean('core',
                                               'DAG_DISCOVERY_SAFE_MODE')):
        """
        Given a file path or a folder, this method looks for python modules,
        imports them and adds them to the dagbag collection.

        Note that if a ``.airflowignore`` file is found while processing
        the directory, it will behave much like a ``.gitignore``,
        ignoring files that match any of the regex patterns specified
        in the file.

        **Note**: The patterns in .airflowignore are treated as
        un-anchored regexes, not shell-like glob patterns.
        """
        if self.store_serialized_dags:
            return

        self.log.info("Filling up the DagBag from %s", dag_folder)
        start_dttm = timezone.utcnow()
        dag_folder = dag_folder or self.dag_folder
        # Used to store stats around DagBag processing
        stats = []

        dag_folder = correct_maybe_zipped(dag_folder)
        for filepath in list_py_file_paths(dag_folder,
                                           safe_mode=safe_mode,
                                           include_examples=include_examples):
            try:
                file_parse_start_dttm = timezone.utcnow()
                found_dags = self.process_file(filepath,
                                               only_if_updated=only_if_updated,
                                               safe_mode=safe_mode)

                file_parse_end_dttm = timezone.utcnow()
                stats.append(
                    FileLoadStat(
                        file=filepath.replace(settings.DAGS_FOLDER, ''),
                        duration=file_parse_end_dttm - file_parse_start_dttm,
                        dag_num=len(found_dags),
                        task_num=sum([len(dag.tasks) for dag in found_dags]),
                        dags=str([dag.dag_id for dag in found_dags]),
                    ))
            except Exception as e:  # pylint: disable=broad-except
                self.log.exception(e)

        end_dttm = timezone.utcnow()
        durations = (end_dttm - start_dttm).total_seconds()
        Stats.gauge('collect_dags', durations, 1)
        Stats.gauge('dagbag_size', len(self.dags), 1)
        Stats.gauge('dagbag_import_errors', len(self.import_errors), 1)
        self.dagbag_stats = sorted(stats,
                                   key=lambda x: x.duration,
                                   reverse=True)
        for file_stat in self.dagbag_stats:
            # file_stat.file similar format: /subdir/dag_name.py
            # TODO: Remove for Airflow 2.0
            filename = file_stat.file.split('/')[-1].replace('.py', '')
            Stats.timing('dag.loading-duration.{}'.format(filename),
                         file_stat.duration)
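
The final loop derives a metric name from the relative file path, e.g. /subdir/dag_name.py becomes dag_name. A quick illustration of that transformation (the path is made up):

import os

relative_path = "/subdir/dag_name.py"
metric_name = relative_path.split('/')[-1].replace('.py', '')        # 'dag_name'
# Equivalent, and a bit more portable:
assert os.path.splitext(os.path.basename(relative_path))[0] == metric_name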
Code Example #12
    def _log_file_processing_stats(self, known_file_paths):
        """
        Print out stats about how files are getting processed.

        :param known_file_paths: a list of file paths that may contain Airflow
            DAG definitions
        :type known_file_paths: list[unicode]
        :return: None
        """

        # File Path: Path to the file containing the DAG definition
        # PID: PID associated with the process that's processing the file. May
        # be empty.
        # Runtime: If the process is currently running, how long it's been
        # running for in seconds.
        # Last Runtime: If the process ran before, how long did it take to
        # finish in seconds
        # Last Run: When the file finished processing in the previous run.
        headers = ["File Path",
                   "PID",
                   "Runtime",
                   "Last Runtime",
                   "Last Run"]

        rows = []
        for file_path in known_file_paths:
            last_runtime = self.get_last_runtime(file_path)
            file_name = os.path.basename(file_path)
            file_name = os.path.splitext(file_name)[0].replace(os.sep, '.')
            if last_runtime:
                Stats.gauge(
                    'dag_processing.last_runtime.{}'.format(file_name),
                    last_runtime
                )

            processor_pid = self.get_pid(file_path)
            processor_start_time = self.get_start_time(file_path)
            runtime = ((timezone.utcnow() - processor_start_time).total_seconds()
                       if processor_start_time else None)
            last_run = self.get_last_finish_time(file_path)
            if last_run:
                seconds_ago = (timezone.utcnow() - last_run).total_seconds()
                Stats.gauge(
                    'dag_processing.last_run.seconds_ago.{}'.format(file_name),
                    seconds_ago
                )

            rows.append((file_path,
                         processor_pid,
                         runtime,
                         last_runtime,
                         last_run))

        # Sort by longest last runtime. (Can't sort None values in python3)
        rows = sorted(rows, key=lambda x: x[3] or 0.0)

        formatted_rows = []
        for file_path, pid, runtime, last_runtime, last_run in rows:
            formatted_rows.append((file_path,
                                   pid,
                                   "{:.2f}s".format(runtime)
                                   if runtime else None,
                                   "{:.2f}s".format(last_runtime)
                                   if last_runtime else None,
                                   last_run.strftime("%Y-%m-%dT%H:%M:%S")
                                   if last_run else None))
        log_str = ("\n" +
                   "=" * 80 +
                   "\n" +
                   "DAG File Processing Stats\n\n" +
                   tabulate(formatted_rows, headers=headers) +
                   "\n" +
                   "=" * 80)

        self.log.info(log_str)
Code Example #13
File: triggerer_job.py  Project: karankale/airflow
    def emit_metrics(self):
        Stats.gauge('triggers.running', len(self.runner.triggers))
Code Example #14
    def _executable_task_instances_to_queued(self, max_tis: int, session: Session = None) -> List[TI]:
        """
        Finds TIs that are ready for execution with respect to pool limits,
        dag max_active_tasks, executor state, and priority.

        :param max_tis: Maximum number of TIs to queue in this loop.
        :type max_tis: int
        :return: list[airflow.models.TaskInstance]
        """
        executable_tis: List[TI] = []

        # Get the pool settings. We get a lock on the pool rows, treating this as a "critical section"
        # Throws an exception if lock cannot be obtained, rather than blocking
        pools = models.Pool.slots_stats(lock_rows=True, session=session)

        # If the pools are full, there is no point doing anything!
        # If _somehow_ the pool is overfull, don't let the limit go negative - it breaks SQL
        pool_slots_free = max(0, sum(pool['open'] for pool in pools.values()))

        if pool_slots_free == 0:
            self.log.debug("All pools are full!")
            return executable_tis

        max_tis = min(max_tis, pool_slots_free)

        # Get all task instances associated with scheduled
        # DagRuns which are not backfilled, in the given states,
        # and the dag is not paused
        query = (
            session.query(TI)
            .join(TI.dag_run)
            .options(eagerload(TI.dag_run))
            .filter(DR.run_type != DagRunType.BACKFILL_JOB, DR.state != DagRunState.QUEUED)
            .join(TI.dag_model)
            .filter(not_(DM.is_paused))
            .filter(TI.state == State.SCHEDULED)
            .options(selectinload('dag_model'))
            .order_by(-TI.priority_weight, DR.execution_date)
        )
        starved_pools = [pool_name for pool_name, stats in pools.items() if stats['open'] <= 0]
        if starved_pools:
            query = query.filter(not_(TI.pool.in_(starved_pools)))

        query = query.limit(max_tis)

        task_instances_to_examine: List[TI] = with_row_locks(
            query,
            of=TI,
            session=session,
            **skip_locked(session=session),
        ).all()
        # TODO[HA]: This was wrong before anyway, as it only looked at a sub-set of dags, not everything.
        # Stats.gauge('scheduler.tasks.pending', len(task_instances_to_examine))

        if len(task_instances_to_examine) == 0:
            self.log.debug("No tasks to consider for execution.")
            return executable_tis

        # Put one task instance on each line
        task_instance_str = "\n\t".join(repr(x) for x in task_instances_to_examine)
        self.log.info("%s tasks up for execution:\n\t%s", len(task_instances_to_examine), task_instance_str)

        pool_to_task_instances: DefaultDict[str, List[TI]] = defaultdict(list)
        for task_instance in task_instances_to_examine:
            pool_to_task_instances[task_instance.pool].append(task_instance)

        # dag_id to # of running tasks and (dag_id, task_id) to # of running tasks.
        dag_max_active_tasks_map: DefaultDict[str, int]
        task_concurrency_map: DefaultDict[Tuple[str, str], int]
        dag_max_active_tasks_map, task_concurrency_map = self.__get_concurrency_maps(
            states=list(EXECUTION_STATES), session=session
        )

        num_tasks_in_executor = 0
        # Number of tasks that cannot be scheduled because of no open slot in pool
        num_starving_tasks_total = 0

        # Go through each pool, and queue up a task for execution if there are
        # any open slots in the pool.

        for pool, task_instances in pool_to_task_instances.items():
            pool_name = pool
            if pool not in pools:
                self.log.warning("Tasks using non-existent pool '%s' will not be scheduled", pool)
                continue

            open_slots = pools[pool]["open"]

            num_ready = len(task_instances)
            self.log.info(
                "Figuring out tasks to run in Pool(name=%s) with %s open slots "
                "and %s task instances ready to be queued",
                pool,
                open_slots,
                num_ready,
            )

            priority_sorted_task_instances = sorted(
                task_instances, key=lambda ti: (-ti.priority_weight, ti.execution_date)
            )

            num_starving_tasks = 0
            for current_index, task_instance in enumerate(priority_sorted_task_instances):
                if open_slots <= 0:
                    self.log.info("Not scheduling since there are %s open slots in pool %s", open_slots, pool)
                    # Can't schedule any more since there are no more open slots.
                    num_unhandled = len(priority_sorted_task_instances) - current_index
                    num_starving_tasks += num_unhandled
                    num_starving_tasks_total += num_unhandled
                    break

                # Check to make sure that the task max_active_tasks of the DAG hasn't been
                # reached.
                dag_id = task_instance.dag_id

                current_max_active_tasks_per_dag = dag_max_active_tasks_map[dag_id]
                max_active_tasks_per_dag_limit = task_instance.dag_model.max_active_tasks
                self.log.info(
                    "DAG %s has %s/%s running and queued tasks",
                    dag_id,
                    current_max_active_tasks_per_dag,
                    max_active_tasks_per_dag_limit,
                )
                if current_max_active_tasks_per_dag >= max_active_tasks_per_dag_limit:
                    self.log.info(
                        "Not executing %s since the number of tasks running or queued "
                        "from DAG %s is >= to the DAG's max_active_tasks limit of %s",
                        task_instance,
                        dag_id,
                        max_active_tasks_per_dag_limit,
                    )
                    continue

                task_concurrency_limit: Optional[int] = None
                if task_instance.dag_model.has_task_concurrency_limits:
                    # Many dags don't have a task_concurrency, so where we can avoid loading the full
                    # serialized DAG the better.
                    serialized_dag = self.dagbag.get_dag(dag_id, session=session)
                    if serialized_dag.has_task(task_instance.task_id):
                        task_concurrency_limit = serialized_dag.get_task(
                            task_instance.task_id
                        ).max_active_tis_per_dag

                    if task_concurrency_limit is not None:
                        current_task_concurrency = task_concurrency_map[
                            (task_instance.dag_id, task_instance.task_id)
                        ]

                        if current_task_concurrency >= task_concurrency_limit:
                            self.log.info(
                                "Not executing %s since the task concurrency for"
                                " this task has been reached.",
                                task_instance,
                            )
                            continue

                if task_instance.pool_slots > open_slots:
                    self.log.info(
                        "Not executing %s since it requires %s slots "
                        "but there are %s open slots in the pool %s.",
                        task_instance,
                        task_instance.pool_slots,
                        open_slots,
                        pool,
                    )
                    num_starving_tasks += 1
                    num_starving_tasks_total += 1
                    # Though we can execute tasks with lower priority if there's enough room
                    continue

                executable_tis.append(task_instance)
                open_slots -= task_instance.pool_slots
                dag_max_active_tasks_map[dag_id] += 1
                task_concurrency_map[(task_instance.dag_id, task_instance.task_id)] += 1

            Stats.gauge(f'pool.starving_tasks.{pool_name}', num_starving_tasks)

        Stats.gauge('scheduler.tasks.starving', num_starving_tasks_total)
        Stats.gauge('scheduler.tasks.running', num_tasks_in_executor)
        Stats.gauge('scheduler.tasks.executable', len(executable_tis))

        task_instance_str = "\n\t".join(repr(x) for x in executable_tis)
        self.log.info("Setting the following tasks to queued state:\n\t%s", task_instance_str)
        if len(executable_tis) > 0:
            # set TIs to queued state
            filter_for_tis = TI.filter_for_tis(executable_tis)
            session.query(TI).filter(filter_for_tis).update(
                # TODO[ha]: should we use func.now()? How does that work with DB timezone
                # on mysql when it's not UTC?
                {TI.state: State.QUEUED, TI.queued_dttm: timezone.utcnow(), TI.queued_by_job_id: self.id},
                synchronize_session=False,
            )

        for ti in executable_tis:
            make_transient(ti)
        return executable_tis
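
The per-pool section of the method above reduces to a simple accounting loop: walk candidates in priority order, skip (and count as starving) anything that does not fit in the remaining slots, and shrink the slot budget as tasks are queued. A self-contained sketch of that accounting, with (name, priority_weight, pool_slots) tuples standing in for task instances:

def queue_within_pool(candidates, open_slots):
    """Pick tasks for one pool, highest priority first; return (queued, starving_count)."""
    queued, starving = [], 0
    for name, priority, slots_needed in sorted(candidates, key=lambda t: -t[1]):
        if open_slots <= 0 or slots_needed > open_slots:
            starving += 1          # no room for this task; lower-priority tasks may still fit
            continue
        queued.append(name)
        open_slots -= slots_needed
    return queued, starving


assert queue_within_pool([("a", 10, 2), ("b", 5, 3), ("c", 1, 1)], open_slots=3) == (["a", "c"], 1)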