Example #1
    def _collect_results_from_processor(self, processor) -> None:
        self.log.debug("Processor for %s finished", processor.file_path)
        Stats.decr('dag_processing.processes')
        last_finish_time = timezone.utcnow()

        if processor.result is not None:
            num_dags, count_import_errors = processor.result
        else:
            self.log.error("Processor for %s exited with return code %s.",
                           processor.file_path, processor.exit_code)
            count_import_errors = -1
            num_dags = 0

        last_duration = last_finish_time - processor.start_time
        stat = DagFileStat(
            num_dags=num_dags,
            import_errors=count_import_errors,
            last_finish_time=last_finish_time,
            last_duration=last_duration,
            run_count=self.get_run_count(processor.file_path) + 1,
        )
        self._file_stats[processor.file_path] = stat

        file_name = os.path.splitext(os.path.basename(
            processor.file_path))[0].replace(os.sep, '.')
        Stats.timing(f'dag_processing.last_duration.{file_name}',
                     last_duration)
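For context, the DagFileStat record these examples fill in is a plain NamedTuple in recent Airflow releases; a minimal sketch of the shape Example #1 assumes (field types inferred from the call sites above, so treat them as an approximation):

    from datetime import datetime, timedelta
    from typing import NamedTuple, Optional, Union

    class DagFileStat(NamedTuple):
        """Parsing stats for one DAG file, as used in the examples above."""
        num_dags: int                        # DAGs found in the file (0 if the processor died)
        import_errors: int                   # -1 is the sentinel for a crashed processor
        last_finish_time: Optional[datetime]
        last_duration: Optional[Union[timedelta, float]]  # timedelta in Example #1, seconds elsewhere
        run_count: int                       # times this file has been parsed so far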
Example #2
    def _kill_timed_out_processors(self):
        """Kill any file processors that timeout to defend against process hangs."""
        now = timezone.utcnow()
        processors_to_remove = []
        for file_path, processor in self._processors.items():
            duration = now - processor.start_time
            if duration > self._processor_timeout:
                self.log.error(
                    "Processor for %s with PID %s started at %s has timed out, killing it.",
                    file_path,
                    processor.pid,
                    processor.start_time.isoformat(),
                )
                Stats.decr('dag_processing.processes')
                Stats.incr('dag_processing.processor_timeouts')
                # TODO: Remove after Airflow 2.0
                Stats.incr('dag_file_processor_timeouts')
                processor.kill()

                # Clean up processor references
                self.waitables.pop(processor.waitable_handle)
                processors_to_remove.append(file_path)

        # Clean up `self._processors` after iterating over it
        for proc in processors_to_remove:
            self._processors.pop(proc)
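Note the deferred cleanup: the loop records processors_to_remove and pops them only after iterating, because mutating a dict while iterating over it raises a RuntimeError. A standalone illustration:

    procs = {'dags/a.py': 'proc_a', 'dags/b.py': 'proc_b'}

    # del procs[path] inside the loop below would raise
    # "RuntimeError: dictionary changed size during iteration"
    to_remove = [path for path in procs if path.endswith('b.py')]
    for path in to_remove:
        procs.pop(path)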
Example #3
    def terminate(self):
        """
        Stops all running processors

        :return: None
        """
        for processor in self._processors.values():
            Stats.decr('dag_processing.processes')
            processor.terminate()
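terminate() here asks each child process to stop, while the timeout path in Example #2 escalates to kill(). The distinction mirrors the stdlib multiprocessing API (SIGTERM vs SIGKILL on POSIX); a minimal sketch of that pattern, independent of Airflow's own processor classes:

    import multiprocessing
    import time

    def worker():
        time.sleep(60)

    if __name__ == '__main__':
        p = multiprocessing.Process(target=worker)
        p.start()
        p.terminate()       # polite stop (SIGTERM), analogous to processor.terminate()
        p.join(timeout=5)
        if p.is_alive():
            p.kill()        # forced stop (SIGKILL), analogous to processor.kill()
            p.join()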
Example #4
    def collect_results(self):
        """
        Collect the result from any finished DAG processors

        :return: a list of SimpleDags that were produced by processors that
            have finished since the last time this was called
        :rtype: list[airflow.utils.dag_processing.SimpleDag]
        """
        self._kill_timed_out_processors()

        finished_processors = {}
        """:type: dict[unicode, AbstractDagFileProcessor]"""
        running_processors = {}
        """:type: dict[unicode, AbstractDagFileProcessor]"""

        for file_path, processor in self._processors.items():
            if processor.done:
                self.log.debug("Processor for %s finished", file_path)
                Stats.decr('dag_processing.processes')
                now = timezone.utcnow()
                finished_processors[file_path] = processor

                stat = DagFileStat(
                    len(processor.result[0])
                    if processor.result is not None else 0,
                    processor.result[1]
                    if processor.result is not None else -1,
                    now,
                    (now - processor.start_time).total_seconds(),
                    self.get_run_count(file_path) + 1,
                )
                self._file_stats[file_path] = stat
            else:
                running_processors[file_path] = processor
        self._processors = running_processors

        self.log.debug("%s/%s DAG parsing processes running",
                       len(self._processors), self._parallelism)

        self.log.debug("%s file paths queued for processing",
                       len(self._file_path_queue))

        # Collect all the DAGs that were found in the processed files
        simple_dags = []
        for file_path, processor in finished_processors.items():
            if processor.result is None:
                self.log.warning(
                    "Processor for %s exited with return code %s.",
                    processor.file_path, processor.exit_code)
            else:
                for simple_dag in processor.result[0]:
                    simple_dags.append(simple_dag)

        return simple_dags
Example #5
    def collect_results(self):
        """
        Collect the result from any finished DAG processors

        :return: a list of SimpleDags that were produced by processors that
            have finished since the last time this was called
        :rtype: list[airflow.utils.dag_processing.SimpleDag]
        """
        finished_processors: Dict[str, AbstractDagFileProcessorProcess] = {}
        running_processors: Dict[str, AbstractDagFileProcessorProcess] = {}

        for file_path, processor in self._processors.items():
            if processor.done:
                self.log.debug("Processor for %s finished", file_path)
                Stats.decr('dag_processing.processes')
                last_finish_time = timezone.utcnow()
                finished_processors[file_path] = processor

                if processor.result is not None:
                    dags, count_import_errors = processor.result
                else:
                    dags, count_import_errors = [], -1

                stat = DagFileStat(
                    num_dags=len(dags),
                    import_errors=count_import_errors,
                    last_finish_time=last_finish_time,
                    last_duration=(last_finish_time - processor.start_time).total_seconds(),
                    run_count=self.get_run_count(file_path) + 1,
                )
                self._file_stats[file_path] = stat
            else:
                running_processors[file_path] = processor
        self._processors = running_processors

        self.log.debug("%s/%s DAG parsing processes running",
                       len(self._processors), self._parallelism)

        self.log.debug("%s file paths queued for processing",
                       len(self._file_path_queue))

        # Collect all the DAGs that were found in the processed files
        simple_dags = []
        for file_path, processor in finished_processors.items():
            if processor.result is None:
                self.log.error(
                    "Processor for %s exited with return code %s.",
                    processor.file_path, processor.exit_code
                )
            else:
                for simple_dag in processor.result[0]:
                    simple_dags.append(simple_dag)

        return simple_dags
Example #6
    def _kill_timed_out_processors(self):
        """Kill any file processors that timeout to defend against process hangs."""
        now = timezone.utcnow()
        for file_path, processor in self._processors.items():
            duration = now - processor.start_time
            if duration > self._processor_timeout:
                self.log.error(
                    "Processor for %s with PID %s started at %s has timed out, "
                    "killing it.", file_path, processor.pid,
                    processor.start_time.isoformat())
                Stats.decr('dag_processing.processes')
                Stats.incr('dag_processing.processor_timeouts')
                # TODO: Remove after Airflow 2.0
                Stats.incr('dag_file_processor_timeouts')
                processor.kill()
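This earlier variant of _kill_timed_out_processors never removes the killed entries from self._processors; the two-pass cleanup shown in Example #2 came later. The timeout test itself is plain datetime arithmetic, assuming self._processor_timeout is configured as a timedelta:

    from datetime import datetime, timedelta, timezone

    processor_timeout = timedelta(seconds=50)   # e.g. from dag_file_processor_timeout
    start_time = datetime.now(timezone.utc) - timedelta(minutes=2)

    duration = datetime.now(timezone.utc) - start_time
    if duration > processor_timeout:            # timedelta vs timedelta comparison
        print(f"timed out after {duration.total_seconds():.0f}s")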
Example #7
    def set_file_paths(self, new_file_paths):
        """
        Update this with a new set of paths to DAG definition files.

        :param new_file_paths: list of paths to DAG definition files
        :return: None
        """
        self._file_paths = new_file_paths
        self._file_path_queue = [x for x in self._file_path_queue if x in new_file_paths]
        # Stop processors that are working on deleted files
        filtered_processors = {}
        for file_path, processor in self._processors.items():
            if file_path in new_file_paths:
                filtered_processors[file_path] = processor
            else:
                self.log.warning("Stopping processor for %s", file_path)
                Stats.decr('dag_processing.processes')
                processor.terminate()
                self._file_stats.pop(file_path)
        self._processors = filtered_processors
Example #8
    def _collect_results_from_processor(self, processor) -> None:
        self.log.debug("Processor for %s finished", processor.file_path)
        Stats.decr('dag_processing.processes')
        last_finish_time = timezone.utcnow()

        if processor.result is not None:
            num_dags, count_import_errors = processor.result
        else:
            self.log.error(
                "Processor for %s exited with return code %s.", processor.file_path, processor.exit_code
            )
            count_import_errors = -1
            num_dags = 0

        stat = DagFileStat(
            num_dags=num_dags,
            import_errors=count_import_errors,
            last_finish_time=last_finish_time,
            last_duration=(last_finish_time - processor.start_time).total_seconds(),
            run_count=self.get_run_count(processor.file_path) + 1,
        )
        self._file_stats[processor.file_path] = stat
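Taken together, the snapshots show processor.result changing shape across versions: Examples #4 and #5 receive (simple_dags, import_error_count) and derive the DAG count with len(result[0]), while Examples #1 and #8 receive (num_dags, count_import_errors) directly. A small, purely illustrative helper (not part of Airflow's API) that normalizes both shapes:

    def dags_and_errors(result):
        """Return (num_dags, import_errors) for either result shape."""
        if result is None:
            return 0, -1                       # crashed processor: sentinel values
        first, import_errors = result
        num_dags = len(first) if isinstance(first, list) else first
        return num_dags, import_errors

    assert dags_and_errors(None) == (0, -1)
    assert dags_and_errors((3, 2)) == (3, 2)                     # Examples #1/#8 shape
    assert dags_and_errors((['dag_a', 'dag_b'], 0)) == (2, 0)    # Examples #4/#5 shape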