Example #1
0
    def _emit_duration_stats_for_finished_state(self):
        if self.state == State.RUNNING:
            return

        duration = (self.end_date - self.start_date)
        if self.state is State.SUCCESS:
            Stats.timing('dagrun.duration.success.{}'.format(self.dag_id), duration)
        elif self.state == State.FAILED:
            Stats.timing('dagrun.duration.failed.{}'.format(self.dag_id), duration)
Example #2
0
    def collect_dags(
            self,
            dag_folder=None,
            only_if_updated=True,
            include_examples=conf.getboolean('core', 'LOAD_EXAMPLES'),
            include_smart_sensor=conf.getboolean('smart_sensor',
                                                 'USE_SMART_SENSOR'),
            safe_mode=conf.getboolean('core', 'DAG_DISCOVERY_SAFE_MODE'),
    ):
        """
        Given a file path or a folder, this method looks for python modules,
        imports them and adds them to the dagbag collection.

        Note that if a ``.airflowignore`` file is found while processing
        the directory, it will behave much like a ``.gitignore``,
        ignoring files that match any of the regex patterns specified
        in the file.

        **Note**: The patterns in .airflowignore are treated as
        un-anchored regexes, not shell-like glob patterns.
        """
        if self.read_dags_from_db:
            return

        self.log.info("Filling up the DagBag from %s", dag_folder)
        start_dttm = timezone.utcnow()
        dag_folder = dag_folder or self.dag_folder
        # Used to store stats around DagBag processing
        stats = []

        dag_folder = correct_maybe_zipped(dag_folder)
        for filepath in list_py_file_paths(
                dag_folder,
                safe_mode=safe_mode,
                include_examples=include_examples,
                include_smart_sensor=include_smart_sensor,
        ):
            try:
                file_parse_start_dttm = timezone.utcnow()
                found_dags = self.process_file(filepath,
                                               only_if_updated=only_if_updated,
                                               safe_mode=safe_mode)

                file_parse_end_dttm = timezone.utcnow()
                stats.append(
                    FileLoadStat(
                        file=filepath.replace(settings.DAGS_FOLDER, ''),
                        duration=file_parse_end_dttm - file_parse_start_dttm,
                        dag_num=len(found_dags),
                        task_num=sum([len(dag.tasks) for dag in found_dags]),
                        dags=str([dag.dag_id for dag in found_dags]),
                    ))
            except Exception as e:  # pylint: disable=broad-except
                self.log.exception(e)

        end_dttm = timezone.utcnow()
        durations = (end_dttm - start_dttm).total_seconds()
        Stats.gauge('collect_dags', durations, 1)
        Stats.gauge('dagbag_size', len(self.dags), 1)
        Stats.gauge('dagbag_import_errors', len(self.import_errors), 1)
        self.dagbag_stats = sorted(stats,
                                   key=lambda x: x.duration,
                                   reverse=True)
        for file_stat in self.dagbag_stats:
            # file_stat.file similar format: /subdir/dag_name.py
            # TODO: Remove for Airflow 2.0
            filename = file_stat.file.split('/')[-1].replace('.py', '')
            Stats.timing(f'dag.loading-duration.{filename}',
                         file_stat.duration)
Example #3
0
    def update_state(self, session=None):
        """
        Determines the overall state of the DagRun based on the state
        of its TaskInstances.

        :return: State
        """

        dag = self.get_dag()

        tis = self.get_task_instances(session=session)
        self.log.debug("Updating state for %s considering %s task(s)", self, len(tis))

        for ti in list(tis):
            # skip in db?
            if ti.state == State.REMOVED:
                tis.remove(ti)
            else:
                ti.task = dag.get_task(ti.task_id)

        # pre-calculate
        # db is faster
        start_dttm = timezone.utcnow()
        unfinished_tasks = self.get_task_instances(
            state=State.unfinished(),
            session=session
        )
        none_depends_on_past = all(not t.task.depends_on_past for t in unfinished_tasks)
        none_task_concurrency = all(t.task.task_concurrency is None
                                    for t in unfinished_tasks)
        # small speed up
        if unfinished_tasks and none_depends_on_past and none_task_concurrency:
            # todo: this can actually get pretty slow: one task costs between 0.01-015s
            no_dependencies_met = True
            for ut in unfinished_tasks:
                # We need to flag upstream and check for changes because upstream
                # failures/re-schedules can result in deadlock false positives
                old_state = ut.state
                deps_met = ut.are_dependencies_met(
                    dep_context=DepContext(
                        flag_upstream_failed=True,
                        ignore_in_retry_period=True,
                        ignore_in_reschedule_period=True),
                    session=session)
                if deps_met or old_state != ut.current_state(session=session):
                    no_dependencies_met = False
                    break

        duration = (timezone.utcnow() - start_dttm).total_seconds() * 1000
        Stats.timing("dagrun.dependency-check.{}".format(self.dag_id), duration)

        root_ids = [t.task_id for t in dag.roots]
        roots = [t for t in tis if t.task_id in root_ids]

        # if all roots finished and at least one failed, the run failed
        if (not unfinished_tasks and
                any(r.state in (State.FAILED, State.UPSTREAM_FAILED) for r in roots)):
            self.log.info('Marking run %s failed', self)
            self.set_state(State.FAILED)
            dag.handle_callback(self, success=False, reason='task_failure',
                                session=session)

        # if all roots succeeded and no unfinished tasks, the run succeeded
        elif not unfinished_tasks and all(r.state in (State.SUCCESS, State.SKIPPED)
                                          for r in roots):
            self.log.info('Marking run %s successful', self)
            self.set_state(State.SUCCESS)
            dag.handle_callback(self, success=True, reason='success', session=session)

        # if *all tasks* are deadlocked, the run failed
        elif (unfinished_tasks and none_depends_on_past and
              none_task_concurrency and no_dependencies_met):
            self.log.info('Deadlock; marking run %s failed', self)
            self.set_state(State.FAILED)
            dag.handle_callback(self, success=False, reason='all_tasks_deadlocked',
                                session=session)

        # finally, if the roots aren't done, the dag is still running
        else:
            self.set_state(State.RUNNING)

        self._emit_duration_stats_for_finished_state()

        # todo: determine we want to use with_for_update to make sure to lock the run
        session.merge(self)
        session.commit()

        return self.state
Example #4
0
    def update_state(self, session=None) -> List[TI]:
        """
        Determines the overall state of the DagRun based on the state
        of its TaskInstances.

        :return: ready_tis: the tis that can be scheduled in the current loop
        :rtype ready_tis: list[airflow.models.TaskInstance]
        """

        dag = self.get_dag()
        ready_tis: List[TI] = []
        tis = [
            ti for ti in self.get_task_instances(
                session=session, state=State.task_states + (State.SHUTDOWN, ))
        ]
        self.log.debug("number of tis tasks for %s: %s task(s)", self,
                       len(tis))
        for ti in tis:
            ti.task = dag.get_task(ti.task_id)

        start_dttm = timezone.utcnow()
        unfinished_tasks = [t for t in tis if t.state in State.unfinished()]
        finished_tasks = [
            t for t in tis
            if t.state in State.finished() + [State.UPSTREAM_FAILED]
        ]
        none_depends_on_past = all(not t.task.depends_on_past
                                   for t in unfinished_tasks)
        none_task_concurrency = all(t.task.task_concurrency is None
                                    for t in unfinished_tasks)
        if unfinished_tasks:
            scheduleable_tasks = [
                ut for ut in unfinished_tasks
                if ut.state in SCHEDULEABLE_STATES
            ]
            self.log.debug("number of scheduleable tasks for %s: %s task(s)",
                           self, len(scheduleable_tasks))
            ready_tis, changed_tis = self._get_ready_tis(
                scheduleable_tasks, finished_tasks, session)
            self.log.debug("ready tis length for %s: %s task(s)", self,
                           len(ready_tis))
            if none_depends_on_past and none_task_concurrency:
                # small speed up
                are_runnable_tasks = ready_tis or self._are_premature_tis(
                    unfinished_tasks, finished_tasks, session) or changed_tis

        duration = (timezone.utcnow() - start_dttm)
        Stats.timing("dagrun.dependency-check.{}".format(self.dag_id),
                     duration)

        leaf_task_ids = {t.task_id for t in dag.leaves}
        leaf_tis = [ti for ti in tis if ti.task_id in leaf_task_ids]

        # if all roots finished and at least one failed, the run failed
        if not unfinished_tasks and any(
                leaf_ti.state in {State.FAILED, State.UPSTREAM_FAILED}
                for leaf_ti in leaf_tis):
            self.log.error('Marking run %s failed', self)
            self.set_state(State.FAILED)
            dag.handle_callback(self,
                                success=False,
                                reason='task_failure',
                                session=session)

        # if all leafs succeeded and no unfinished tasks, the run succeeded
        elif not unfinished_tasks and all(
                leaf_ti.state in {State.SUCCESS, State.SKIPPED}
                for leaf_ti in leaf_tis):
            self.log.info('Marking run %s successful', self)
            self.set_state(State.SUCCESS)
            dag.handle_callback(self,
                                success=True,
                                reason='success',
                                session=session)

        # if *all tasks* are deadlocked, the run failed
        elif (unfinished_tasks and none_depends_on_past
              and none_task_concurrency and not are_runnable_tasks):
            self.log.error('Deadlock; marking run %s failed', self)
            self.set_state(State.FAILED)
            dag.handle_callback(self,
                                success=False,
                                reason='all_tasks_deadlocked',
                                session=session)

        # finally, if the roots aren't done, the dag is still running
        else:
            self.set_state(State.RUNNING)

        self._emit_duration_stats_for_finished_state()

        # todo: determine we want to use with_for_update to make sure to lock the run
        session.merge(self)
        session.commit()

        return ready_tis