Example #1
    def _execute(self):
        self.task_runner = get_task_runner(self)

        def signal_handler(signum, frame):
            """Setting kill signal handler"""
            self.log.error("Received SIGTERM. Terminating subprocesses")
            self.on_kill()
            raise AirflowException("LocalTaskJob received SIGTERM signal")
        signal.signal(signal.SIGTERM, signal_handler)

        if not self.task_instance._check_and_change_state_before_execution(
                mark_success=self.mark_success,
                ignore_all_deps=self.ignore_all_deps,
                ignore_depends_on_past=self.ignore_depends_on_past,
                ignore_task_deps=self.ignore_task_deps,
                ignore_ti_state=self.ignore_ti_state,
                job_id=self.id,
                pool=self.pool):
            self.log.info("Task is not able to be run")
            return

        try:
            self.task_runner.start()

            heartbeat_time_limit = conf.getint('scheduler',
                                               'scheduler_zombie_task_threshold')
            while True:
                # Monitor the task to see if it's done
                return_code = self.task_runner.return_code()
                if return_code is not None:
                    self.log.info("Task exited with return code %s", return_code)
                    return

                self.heartbeat()

                # If it's been too long since we've heartbeat, then it's possible that
                # the scheduler rescheduled this task, so kill launched processes.
                # This can only really happen if the worker can't read the DB for a long time
                time_since_last_heartbeat = (timezone.utcnow() - self.latest_heartbeat).total_seconds()
                if time_since_last_heartbeat > heartbeat_time_limit:
                    Stats.incr('local_task_job_prolonged_heartbeat_failure', 1, 1)
                    self.log.error("Heartbeat time limited exceeded!")
                    raise AirflowException("Time since last heartbeat({:.2f}s) "
                                           "exceeded limit ({}s)."
                                           .format(time_since_last_heartbeat,
                                                   heartbeat_time_limit))

                if time_since_last_heartbeat < self.heartrate:
                    sleep_for = self.heartrate - time_since_last_heartbeat
                    self.log.debug("Time since last heartbeat(%.2f s) < heartrate(%s s)"
                                   ", sleeping for %s s", time_since_last_heartbeat,
                                   self.heartrate, sleep_for)
                    time.sleep(sleep_for)
        finally:
            self.on_kill()
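All of these snippets funnel metrics through a module-level Stats facade with statsd-style incr, timing, and gauge methods. A minimal sketch of such a facade, assuming the statsd package (the host, port, and prefix here are illustrative, not Airflow's actual configuration):

# Minimal sketch of a statsd-backed Stats facade; connection values are illustrative.
from statsd import StatsClient

Stats = StatsClient(host='localhost', port=8125, prefix='airflow')

# The three call shapes used throughout these examples:
Stats.incr('local_task_job_prolonged_heartbeat_failure', 1, 1)  # counter: (stat, count, rate)
Stats.timing('scheduler_loop', 812.5)                           # timer, in milliseconds
Stats.gauge('executor.open_slots', 4)                           # point-in-time value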
Example #2
    def _emit_duration_stats_for_finished_state(self):
        if self.state == State.RUNNING:
            return

        duration = (self.end_date - self.start_date)
        if self.state == State.SUCCESS:
            Stats.timing('dagrun.duration.success.{}'.format(self.dag_id),
                         duration)
        elif self.state == State.FAILED:
            Stats.timing('dagrun.duration.failed.{}'.format(self.dag_id),
                         duration)
Example #3
def stop_time(ctx, *args, **kwargs):
    start = ctx['start_time']
    stop = datetime.now()
    duration = (stop - start).total_seconds() * 1000

    tags = {
        'dag': ctx['operator'].dag_id,
        'task': ctx['operator'].task_id,
        'operator': ctx['operator'].__class__.__name__,
        'domain': ctx['domain'],
    }
    Stats.timing('request.duration', duration, tags=tags)
Example #4
def bq_duration(ctx, self, *args, **kwargs):
    stats = ctx['job']['statistics']
    creation = int(stats['creationTime'])
    start = int(stats['startTime'])
    end = int(stats['endTime'])

    tags = {
        'dag': self.dag_id,
        'task': self.task_id,
        'operator': self.__class__.__name__,
    }
    Stats.timing('task.delay.gcs_to_bq', start - creation, tags=tags)
    Stats.timing('task.duration.gcs_to_bq', end - start, tags=tags)
Example #5
    def _kill_timed_out_processors(self):
        """
        Kill any file processors that time out, to defend against process hangs.
        """
        now = timezone.utcnow()
        for file_path, processor in self._processors.items():
            duration = now - processor.start_time
            if duration > self._processor_timeout:
                self.log.info(
                    "Processor for %s with PID %s started at %s has timed out, "
                    "killing it.", processor.file_path, processor.pid,
                    processor.start_time.isoformat())
                Stats.incr('dag_file_processor_timeouts', 1, 1)
                processor.kill()
Example #6
def task_states(_since, session=None):
    states = (
        session.query(TaskInstance.state, sqlalchemy.func.count())
        .group_by(TaskInstance.state)
    )

    for state, count in states:
        if state is None:
            continue

        tags = {
            'state': state
        }
        Stats.gauge('task.state', count, tags=tags)
Example #7
def http_status(ctx, *args, **kwargs):
    response = ctx['return']
    status = response.status_code

    tags = {
        'dag': ctx['operator'].dag_id,
        'task': ctx['operator'].task_id,
        'operator': ctx['operator'].__class__.__name__,
        'domain': ctx['domain'],
        'status': status
    }
    if status < 400:
        Stats.incr('request.status.success', tags=tags)
    else:
        Stats.incr('request.status.failure', tags=tags)
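Examples #3 and #7 both receive a ctx dict carrying the operator, a domain, and the timing/response data, which suggests they run as post-request hooks. A hypothetical driver showing how such a ctx could be assembled (the timed_call wrapper and its wiring are assumptions, not part of the source):

from datetime import datetime

def timed_call(operator, domain, func, *args, **kwargs):
    # Build the ctx dict the hooks above expect, invoke the HTTP call, then
    # emit both the duration (Example #3) and status (Example #7) metrics.
    ctx = {'start_time': datetime.now(), 'operator': operator, 'domain': domain}
    ctx['return'] = func(*args, **kwargs)
    stop_time(ctx)    # request.duration
    http_status(ctx)  # request.status.success / request.status.failure
    return ctx['return']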
Example #8
def bq_task_states(since, session=None):
    states = (
        session.query(TaskInstance.state, sqlalchemy.func.count())
        .filter(TaskInstance.operator == 'BigQueryOperator')
        .filter(TaskInstance.end_date > since)
        .group_by(TaskInstance.state)
    )

    for state, count in states:
        if state is None:
            continue

        tags = {
            'state': state
        }
        Stats.incr('task.state.bq', count, tags=tags)
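The tags= keyword used in Examples #3, #4, and #6-#8 is not accepted by the plain statsd client, so these projects presumably route Stats through a Datadog-style client or a wrapper. A sketch assuming the datadog package; note that DogStatsd wants tags as a list of 'key:value' strings, so the dicts above would need converting:

from datadog.dogstatsd import DogStatsd

Stats = DogStatsd(host='localhost', port=8125, namespace='airflow')

tags = {'state': 'running'}
# Convert the dict form used in the examples to DogStatsd's list form.
Stats.gauge('task.state', 42, tags=['{}:{}'.format(k, v) for k, v in tags.items()])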
Example #9
    def _process_dags_and_create_dagruns(self, dagbag, dags, dagrun_out):
        """
        Iterates over the dags and processes them. Processing includes:

        1. Create appropriate DagRun(s) in the DB.
        2. Create appropriate TaskInstance(s) in the DB.
        3. Send emails for tasks that have missed SLAs.

        :param dagbag: a collection of DAGs to process
        :type dagbag: airflow.models.DagBag
        :param dags: the DAGs from the DagBag to process
        :type dags: list[airflow.models.DAG]
        :param dagrun_out: a list to which created DagRun objects are appended
        :type dagrun_out: list[DagRun]
        :rtype: None
        """
        for dag in dags:
            dag_id = dag.dag_id
            dag = dagbag.get_dag(dag_id)
            if not dag:
                self.log.error("DAG ID %s was not found in the DagBag",
                               dag_id)
                continue

            if dag.is_paused:
                self.log.info("Not processing DAG %s since it's paused",
                              dag.dag_id)
                continue

            self.log.info("Processing %s", dag.dag_id)

            dag_run = self.create_dag_run(dag)
            if dag_run:
                dag_run.dag = dag
                expected_start_date = dag.following_schedule(
                    dag_run.execution_date)
                if expected_start_date:
                    schedule_delay = dag_run.start_date - expected_start_date
                    Stats.timing(
                        'dagrun.schedule_delay.{dag_id}'.format(
                            dag_id=dag.dag_id), schedule_delay)
                self.log.info("Created %s", dag_run)
                self._create_task_instances(dag_run)
                self.log.info("Created tasks instances %s", dag_run)
                dagrun_out.append(dag_run)
            if conf.getboolean('core', 'CHECK_SLAS', fallback=True):
                self.manage_slas(dag)
Example #10
    def collect_dags_from_db(self):
        """Collects DAGs from database."""
        from airflow.models.serialized_dag import SerializedDagModel
        start_dttm = timezone.utcnow()
        self.log.info("Filling up the DagBag from database")

        # The dagbag contains all rows in serialized_dag table. Deleted DAGs are deleted
        # from the table by the scheduler job.
        self.dags = SerializedDagModel.read_all_dags()

        # Adds subdags.
        # DAG post-processing steps such as self.bag_dag and croniter are not
        # needed, as the scheduler performs them before serialization.
        subdags = {}
        for dag in self.dags.values():
            for subdag in dag.subdags:
                subdags[subdag.dag_id] = subdag
        self.dags.update(subdags)

        Stats.timing('collect_db_dags', timezone.utcnow() - start_dttm)
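Here Stats.timing receives a raw timedelta rather than a millisecond count (Examples #2 and #9 do the same). The common statsd client accepts either form; a tiny check, reusing the illustrative facade from above:

from datetime import timedelta
from statsd import StatsClient

Stats = StatsClient(host='localhost', port=8125, prefix='airflow')
Stats.timing('collect_db_dags', timedelta(seconds=1.5))  # recorded as 1500.0 ms
Stats.timing('collect_db_dags', 1500.0)                  # equivalent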
Example #11
    def heartbeat(self):
        # Triggering new jobs
        if not self.parallelism:
            open_slots = len(self.queued_tasks)
        else:
            open_slots = self.parallelism - len(self.running)

        num_running_tasks = len(self.running)
        num_queued_tasks = len(self.queued_tasks)

        self.log.debug("%s running task instances", num_running_tasks)
        self.log.debug("%s in queue", num_queued_tasks)
        self.log.debug("%s open slots", open_slots)

        Stats.gauge('executor.open_slots', open_slots)
        Stats.gauge('executor.queued_tasks', num_queued_tasks)
        Stats.gauge('executor.running_tasks', num_running_tasks)

        sorted_queue = sorted(self.queued_tasks.items(),
                              key=lambda x: x[1][1],
                              reverse=True)
        for i in range(min((open_slots, len(self.queued_tasks)))):
            key, (command, _, queue, simple_ti) = sorted_queue.pop(0)
            self.queued_tasks.pop(key)
            self.running[key] = command
            self.execute_async(key=key,
                               command=command,
                               queue=queue,
                               executor_config=simple_ti.executor_config)

        # Calling child class sync method
        self.log.debug("Calling the %s sync method", self.__class__)
        self.sync()
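The sort key x[1][1] above relies on each queued_tasks value being a (command, priority, queue, simple_ti) tuple, so reverse=True pops the highest-priority task first. A toy illustration with made-up values:

queued_tasks = {
    'task_a': ('airflow run ...', 1, 'default', None),   # (command, priority, queue, simple_ti)
    'task_b': ('airflow run ...', 10, 'default', None),
}
sorted_queue = sorted(queued_tasks.items(), key=lambda x: x[1][1], reverse=True)
print([key for key, _ in sorted_queue])  # ['task_b', 'task_a'], highest priority first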
Example #12
    def verify_integrity(self, session=None):
        """
        Verifies the DagRun by checking for removed tasks or tasks that are not in the
        database yet. It will set state to removed or add the task if required.
        """
        from airflow.models.taskinstance import TaskInstance  # Avoid circular import

        dag = self.get_dag()
        tis = self.get_task_instances(session=session)

        # check for removed or restored tasks
        task_ids = set()
        for ti in tis:
            task_instance_mutation_hook(ti)
            task_ids.add(ti.task_id)
            task = None
            try:
                task = dag.get_task(ti.task_id)
            except AirflowException:
                if ti.state == State.REMOVED:
                    pass  # ti has already been removed, just ignore it
                elif self.state != State.RUNNING and not dag.partial:
                    self.log.warning("Failed to get task '{}' for dag '{}'. "
                                     "Marking it as removed.".format(ti, dag))
                    Stats.incr(
                        "task_removed_from_dag.{}".format(dag.dag_id), 1, 1)
                    ti.state = State.REMOVED

            is_task_in_dag = task is not None
            should_restore_task = is_task_in_dag and ti.state == State.REMOVED
            if should_restore_task:
                self.log.info("Restoring task '{}' which was previously "
                              "removed from DAG '{}'".format(ti, dag))
                Stats.incr("task_restored_to_dag.{}".format(dag.dag_id), 1, 1)
                ti.state = State.NONE
            session.merge(ti)

        # check for missing tasks
        for task in six.itervalues(dag.task_dict):
            if task.start_date > self.execution_date and not self.is_backfill:
                continue

            if task.task_id not in task_ids:
                Stats.incr(
                    "task_instance_created-{}".format(task.__class__.__name__),
                    1, 1)
                ti = TaskInstance(task, self.execution_date)
                task_instance_mutation_hook(ti)
                session.add(ti)

        try:
            session.commit()
        except IntegrityError as err:
            self.log.info(str(err))
            self.log.info(
                'Hit IntegrityError while creating the TIs for %s - %s',
                dag.dag_id, self.execution_date
            )
            self.log.info('Doing session rollback.')
            session.rollback()
Example #13
    def _emit_true_scheduling_delay_stats_for_finished_state(self, finished_tis):
        """
        This is a helper method to emit the true scheduling delay stats, which is defined as
        the time when the first task in DAG starts minus the expected DAG run datetime.
        This method will be used in the update_state method when the state of the DagRun
        is updated to a completed status (either success or failure). The method will find the first
        started task within the DAG and calculate the expected DagRun start time (based on
        dag.execution_date & dag.schedule_interval), and minus these two values to get the delay.
        The emitted data may contains outlier (e.g. when the first task was cleared, so
        the second task's start_date will be used), but we can get rid of the the outliers
        on the stats side through the dashboards tooling built.
        Note, the stat will only be emitted if the DagRun is a scheduler triggered one
        (i.e. external_trigger is False).
        """
        if self.state == State.RUNNING:
            return
        if self.external_trigger:
            return
        if not finished_tis:
            return

        try:
            dag = self.get_dag()

            if not dag.schedule_interval or dag.schedule_interval == "@once":
                # We can't emit this metric if there is no following schedule to calculate from!
                return

            ordered_tis_by_start_date = [ti for ti in finished_tis if ti.start_date]
            ordered_tis_by_start_date.sort(key=lambda ti: ti.start_date, reverse=False)
            first_start_date = ordered_tis_by_start_date[0].start_date
            if first_start_date:
                # dag.following_schedule calculates the expected start datetime for a scheduled dagrun
                # i.e. a daily flow for execution date 1/1/20 actually runs on 1/2/20 hh:mm:ss,
                # and ti.start_date will be 1/2/20 hh:mm:ss, so the following schedule is the comparison point
                true_delay = first_start_date - dag.following_schedule(self.execution_date)
                if true_delay.total_seconds() > 0:
                    Stats.timing('dagrun.{}.first_task_scheduling_delay'.format(dag.dag_id), true_delay)
        except Exception as e:
            self.log.warning('Failed to record first_task_scheduling_delay metric: %s', e)
Example #14
    def run(self):
        Stats.incr(self.__class__.__name__.lower()+'_start', 1, 1)
        # Adding an entry in the DB
        session = settings.Session()
        self.state = State.RUNNING
        session.add(self)
        session.commit()
        id_ = self.id
        make_transient(self)
        self.id = id_

        # Run
        self._execute()

        # Marking the success in the DB
        self.end_date = datetime.now()
        self.state = State.SUCCESS
        session.merge(self)
        session.commit()
        session.close()

        Stats.incr(self.__class__.__name__.lower()+'_end', 1, 1)
Example #15
    def run(self):
        Stats.incr(self.__class__.__name__.lower() + '_start', 1, 1)
        # Adding an entry in the DB
        session = settings.Session()
        self.state = State.RUNNING
        session.add(self)
        session.commit()
        id_ = self.id
        make_transient(self)
        self.id = id_

        # Run
        self._execute()

        # Marking the success in the DB
        self.end_date = datetime.now()
        self.state = State.SUCCESS
        session.merge(self)
        session.commit()
        session.close()

        Stats.incr(self.__class__.__name__.lower() + '_end', 1, 1)
Example #16
    def collect_dags(
        self,
        dag_folder=None,
        only_if_updated=True,
        include_examples=configuration.conf.getboolean('core',
                                                       'LOAD_EXAMPLES'),
        safe_mode=configuration.conf.getboolean('core',
                                                'DAG_DISCOVERY_SAFE_MODE')):
        """
        Given a file path or a folder, this method looks for python modules,
        imports them and adds them to the dagbag collection.

        Note that if a ``.airflowignore`` file is found while processing
        the directory, it will behave much like a ``.gitignore``,
        ignoring files that match any of the regex patterns specified
        in the file.

        **Note**: The patterns in .airflowignore are treated as
        un-anchored regexes, not shell-like glob patterns.
        """
        start_dttm = timezone.utcnow()
        dag_folder = dag_folder or self.dag_folder
        # Used to store stats around DagBag processing
        stats = []
        FileLoadStat = namedtuple('FileLoadStat',
                                  "file duration dag_num task_num dags")

        dag_folder = correct_maybe_zipped(dag_folder)

        for filepath in list_py_file_paths(dag_folder,
                                           safe_mode=safe_mode,
                                           include_examples=include_examples):
            try:
                ts = timezone.utcnow()
                found_dags = self.process_file(filepath,
                                               only_if_updated=only_if_updated,
                                               safe_mode=safe_mode)

                td = timezone.utcnow() - ts
                # total_seconds() already includes the fractional (microsecond) part
                td = td.total_seconds()
                stats.append(
                    FileLoadStat(
                        filepath.replace(dag_folder, ''),
                        td,
                        len(found_dags),
                        sum([len(dag.tasks) for dag in found_dags]),
                        str([dag.dag_id for dag in found_dags]),
                    ))
            except Exception as e:
                self.log.exception(e)
        Stats.gauge('collect_dags',
                    (timezone.utcnow() - start_dttm).total_seconds(), 1)
        Stats.gauge('dagbag_size', len(self.dags), 1)
        Stats.gauge('dagbag_import_errors', len(self.import_errors), 1)
        self.dagbag_stats = sorted(stats,
                                   key=lambda x: x.duration,
                                   reverse=True)
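total_seconds() already includes the microsecond component, so adding td.microseconds / 1000000 on top of it would double-count the fractional part; a quick check:

from datetime import timedelta

td = timedelta(seconds=2, microseconds=500000)
assert td.total_seconds() == 2.5                           # fractional part included
assert td.total_seconds() + td.microseconds / 1e6 == 3.0   # the double count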
Example #17
File: dagbag.py  Project: jxiao0/airflow
    def kill_zombies(self, session=None):
        """
        Fail zombie tasks, which are tasks that haven't
        had a heartbeat for too long, in the current DagBag.

        :param session: DB session.
        :type session: sqlalchemy.orm.session.Session
        """
        # Avoid circular import
        from airflow.models.taskinstance import TaskInstance as TI
        from airflow.jobs import LocalTaskJob as LJ

        # How many seconds do we wait for tasks to heartbeat before marking them as zombies.
        zombie_threshold_secs = (configuration.getint(
            'scheduler', 'scheduler_zombie_task_threshold'))
        limit_dttm = timezone.utcnow() - timedelta(
            seconds=zombie_threshold_secs)
        self.log.debug("Failing jobs without heartbeat after %s", limit_dttm)

        tis = (
            session.query(TI)
            .join(LJ, TI.job_id == LJ.id)
            .filter(TI.state == State.RUNNING)
            .filter(TI.dag_id.in_(self.dags))
            .filter(or_(
                LJ.state != State.RUNNING,
                LJ.latest_heartbeat < limit_dttm,
            ))
            .all()
        )
        for ti in tis:
            self.log.info(
                "Detected zombie job with dag_id %s, task_id %s, and execution date %s",
                ti.dag_id, ti.task_id, ti.execution_date.isoformat())
            ti.test_mode = configuration.getboolean('core', 'unit_test_mode')
            ti.task = self.dags[ti.dag_id].get_task(ti.task_id)
            ti.handle_failure("{} detected as zombie".format(ti), ti.test_mode,
                              ti.get_template_context())
            self.log.info('Marked zombie job %s as %s', ti, ti.state)
            Stats.incr('zombies_killed')
        session.commit()
Example #18
    def heartbeat(self):
        # Triggering new jobs
        if not self.parallelism:
            open_slots = len(self.queued_tasks)
        else:
            open_slots = self.parallelism - len(self.running)

        num_running_tasks = len(self.running)
        num_queued_tasks = len(self.queued_tasks)

        self.log.debug("%s running task instances", num_running_tasks)
        self.log.debug("%s in queue", num_queued_tasks)
        self.log.debug("%s open slots", open_slots)

        Stats.gauge('executor.open_slots', open_slots)
        Stats.gauge('executor.queued_tasks', num_queued_tasks)
        Stats.gauge('executor.running_tasks', num_running_tasks)

        self.trigger_tasks(open_slots)

        # Calling child class sync method
        self.log.debug("Calling the %s sync method", self.__class__)
        self.sync()
Example #19
    def update_state(self, session=None):
        """
        Determines the overall state of the DagRun based on the state
        of its TaskInstances.

        :return: ready_tis: the tis that can be scheduled in the current loop
        :rtype: list[airflow.models.TaskInstance]
        """

        dag = self.get_dag()
        ready_tis = []
        tis = [ti for ti in self.get_task_instances(session=session,
                                                    state=State.task_states + (State.SHUTDOWN,))]
        self.log.debug("number of tis tasks for %s: %s task(s)", self, len(tis))
        for ti in list(tis):
            ti.task = dag.get_task(ti.task_id)

        start_dttm = timezone.utcnow()
        unfinished_tasks = [t for t in tis if t.state in State.unfinished()]
        finished_tasks = [t for t in tis if t.state in State.finished() + [State.UPSTREAM_FAILED]]
        none_depends_on_past = all(not t.task.depends_on_past for t in unfinished_tasks)
        none_task_concurrency = all(t.task.task_concurrency is None
                                    for t in unfinished_tasks)
        # small speed up
        if unfinished_tasks:
            schedulable_tasks = [ut for ut in unfinished_tasks if ut.state in SCHEDULEABLE_STATES]
            self.log.debug(
                "number of schedulable tasks for %s: %s task(s)",
                self, len(schedulable_tasks))
            ready_tis, changed_tis = self._get_ready_tis(schedulable_tasks, finished_tasks, session)
            self.log.debug("ready tis length for %s: %s task(s)", self, len(ready_tis))
            if none_depends_on_past and none_task_concurrency:
                # small speed up
                are_runnable_tasks = ready_tis or self._are_premature_tis(
                    unfinished_tasks, finished_tasks, session) or changed_tis

        duration = (timezone.utcnow() - start_dttm).total_seconds() * 1000
        Stats.timing("dagrun.dependency-check.{}".format(self.dag_id), duration)

        leaf_task_ids = {t.task_id for t in dag.leaves}
        leaf_tis = [ti for ti in tis if ti.task_id in leaf_task_ids]

        # if all leaf tasks finished and at least one failed, the run failed
        if not unfinished_tasks and any(
            leaf_ti.state in {State.FAILED, State.UPSTREAM_FAILED} for leaf_ti in leaf_tis
        ):
            self.log.info('Marking run %s failed', self)
            self.set_state(State.FAILED)
            dag.handle_callback(self, success=False, reason='task_failure',
                                session=session)

        # if all leaves succeeded and no unfinished tasks, the run succeeded
        elif not unfinished_tasks and all(
            leaf_ti.state in {State.SUCCESS, State.SKIPPED} for leaf_ti in leaf_tis
        ):
            self.log.info('Marking run %s successful', self)
            self.set_state(State.SUCCESS)
            dag.handle_callback(self, success=True, reason='success', session=session)

        # if *all tasks* are deadlocked, the run failed
        elif (unfinished_tasks and none_depends_on_past and
              none_task_concurrency and not are_runnable_tasks):
            self.log.info('Deadlock; marking run %s failed', self)
            self.set_state(State.FAILED)
            dag.handle_callback(self, success=False, reason='all_tasks_deadlocked',
                                session=session)

        # finally, if the leaves aren't done, the dag is still running
        else:
            self.set_state(State.RUNNING)

        self._emit_true_scheduling_delay_stats_for_finished_state(finished_tasks)
        self._emit_duration_stats_for_finished_state()

        # TODO: determine whether we want to use with_for_update to make sure to lock the run
        session.merge(self)
        session.commit()

        return ready_tis
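Unlike the timedelta-passing examples, the dependency-check timer above converts to milliseconds explicitly before calling Stats.timing (Example #32 later does the same). A small helper sketch that keeps the unit handling in one place (the helper name is an assumption):

from airflow.utils import timezone

def timing_ms(stats, stat, start_dttm):
    # Emit the elapsed wall-clock time in milliseconds, the unit statsd timers use.
    elapsed_ms = (timezone.utcnow() - start_dttm).total_seconds() * 1000
    stats.timing(stat, elapsed_ms)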
Example #20
    def process_file(self,
                     file_path,
                     zombies,
                     pickle_dags=False,
                     session=None):
        """
        Process a Python file containing Airflow DAGs.

        This includes:

        1. Execute the file and look for DAG objects in the namespace.
        2. Pickle the DAG and save it to the DB (if necessary).
        3. For each DAG, see what tasks should run and create appropriate task
        instances in the DB.
        4. Record any errors importing the file into ORM
        5. Kill (in ORM) any task instances belonging to the DAGs that haven't
        issued a heartbeat in a while.

        Returns the SimpleDagRuns made from the DAGs found in the file,
        along with the number of import errors.

        :param file_path: the path to the Python file that should be executed
        :type file_path: unicode
        :param zombies: zombie task instances to kill.
        :type zombies: list[airflow.utils.dag_processing.SimpleTaskInstance]
        :param pickle_dags: whether to serialize the DAGs found in the file and
            save them to the db
        :type pickle_dags: bool
        :return: a list of SimpleDagRuns made from the DAGs found in the file,
            plus the number of import errors
        :rtype: tuple[list[airflow.utils.dag_processing.SimpleDagRun], int]
        """
        self.log.info("Processing file %s for tasks to queue", file_path)
        if session is None:
            session = settings.Session()
        # As DAGs are parsed from this file, they will be converted into SimpleDags

        try:
            dagbag = models.DagBag(file_path, include_examples=False)
        except Exception:
            self.log.exception("Failed at reloading the DAG file %s",
                               file_path)
            Stats.incr('dag_file_refresh_error', 1, 1)
            return [], 0

        if len(dagbag.dags) > 0:
            self.log.info("DAG(s) %s retrieved from %s", dagbag.dags.keys(),
                          file_path)
        else:
            self.log.warning("No viable dags retrieved from %s", file_path)
            self.update_import_errors(session, dagbag)
            return [], len(dagbag.import_errors)

        # Save individual DAGs in the ORM and update DagModel.last_scheduled_time
        for dag in dagbag.dags.values():
            dag.sync_to_db()

        paused_dag_ids = [
            dag.dag_id for dag in dagbag.dags.values() if dag.is_paused
        ]
        self.log.info("paused_dag_ids %s", paused_dag_ids)
        self.log.info("self %s", self.dag_ids)

        dag_to_pickle = {}
        # Pickle the DAGs (if necessary) and put them into a SimpleDag
        for dag_id in dagbag.dags:
            # Only return DAGs that are not paused
            if dag_id not in paused_dag_ids:
                dag = dagbag.get_dag(dag_id)
                pickle_id = None
                if pickle_dags:
                    pickle_id = dag.pickle(session).id
                dag_to_pickle[dag.dag_id] = pickle_id

        if len(self.dag_ids) > 0:
            dags = [
                dag for dag in dagbag.dags.values() if
                dag.dag_id in self.dag_ids and dag.dag_id not in paused_dag_ids
            ]
        else:
            dags = [
                dag for dag in dagbag.dags.values()
                if not dag.parent_dag and dag.dag_id not in paused_dag_ids
            ]

        # Not using multiprocessing.Queue() since it's no longer a separate
        # process and due to some unusual behavior. (empty() incorrectly
        # returns true as described in https://bugs.python.org/issue23582 )
        self.log.info("dags %s", dags)
        dag_run_out = []
        self._process_dags_and_create_dagruns(dagbag, dags, dag_run_out)
        self.log.info("dag run out %s", len(dag_run_out))
        simple_dag_runs = []
        for dag_run in dag_run_out:
            simple_dag_runs.append(
                SimpleDagRun(dag_run.id, SimpleDag(dag_run.dag)))
        # commit batch
        session.commit()

        # Record import errors into the ORM
        try:
            self.update_import_errors(session, dagbag)
        except Exception:
            self.log.exception("Error logging import errors!")
        try:
            dagbag.kill_zombies(zombies)
        except Exception:
            self.log.exception("Error killing zombies!")

        return simple_dag_runs, len(dagbag.import_errors)
Example #21
    def heartbeat(self):
        """
        Heartbeats update the job's entry in the database with a timestamp
        for the latest_heartbeat and allows for the job to be killed
        externally. This allows at the system level to monitor what is
        actually active.

        For instance, an old heartbeat for SchedulerJob would mean something
        is wrong.

        This also allows for any job to be killed externally, regardless
        of who is running it or on which machine it is running.

        Note that if your heartbeat is set to 60 seconds and you call this
        method after 10 seconds of processing since the last heartbeat, it
        will sleep 50 seconds to complete the 60 seconds and keep a steady
        heart rate. If you go over 60 seconds before calling it, it won't
        sleep at all.
        """
        previous_heartbeat = self.latest_heartbeat

        try:
            with create_session() as session:
                # This will cause it to load from the db
                session.merge(self)
                previous_heartbeat = self.latest_heartbeat

            if self.state == State.SHUTDOWN:
                self.kill()

            is_unit_test = conf.getboolean('core', 'unit_test_mode')
            if not is_unit_test:
                # Figure out how long to sleep for
                sleep_for = 0
                if self.latest_heartbeat:
                    seconds_remaining = self.heartrate - \
                        (timezone.utcnow() - self.latest_heartbeat)\
                        .total_seconds()
                    sleep_for = max(0, seconds_remaining)

                sleep(sleep_for)

            # Update last heartbeat time
            with create_session() as session:
                # Make the session aware of this object
                session.merge(self)
                self.latest_heartbeat = timezone.utcnow()
                session.commit()
                # At this point, the DB has updated.
                previous_heartbeat = self.latest_heartbeat

                self.heartbeat_callback(session=session)
                self.log.debug('[heartbeat]')
        except OperationalError:
            Stats.incr(
                convert_camel_to_snake(self.__class__.__name__) +
                '_heartbeat_failure', 1, 1)
            self.log.exception("%s heartbeat got an exception",
                               self.__class__.__name__)
            # We didn't manage to heartbeat, so make sure that the timestamp isn't updated
            self.latest_heartbeat = previous_heartbeat
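The sleep logic described in the docstring reduces to one line of arithmetic. A standalone sketch with the docstring's own numbers:

def seconds_to_sleep(heartrate, seconds_since_last_heartbeat):
    # Sleep just long enough to complete one heartrate period; never negative.
    return max(0, heartrate - seconds_since_last_heartbeat)

assert seconds_to_sleep(60, 10) == 50  # heartbeat every 60s, 10s of work: sleep 50s
assert seconds_to_sleep(60, 75) == 0   # already overdue: don't sleep at all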
Example #22
    def _set_task_instance_state(self,
                                 dag_run,
                                 dag_id,
                                 task_id,
                                 execution_date,
                                 state,
                                 try_number,
                                 session=None):
        """
        Set the task state to db and maybe set the dagrun object finished to db.
        :param dag_run: DagRun object
        :param dag_id: Dag identify
        :param task_id: task identify
        :param execution_date: the dag run execution date
        :param state: the task state should be set.
        :param try_number: the task try_number.
        :param session:
        :return:
        """
        TI = models.TaskInstance
        qry = session.query(TI).filter(TI.dag_id == dag_id,
                                       TI.task_id == task_id,
                                       TI.execution_date == execution_date)
        ti = qry.first()
        if not ti:
            self.log.warning(
                "TaskInstance for dag_id %s, task_id %s, execution_date %s "
                "went missing from the database", dag_id, task_id, execution_date)
            return
        ts = TaskState.query_task_state(ti, session)
        self.log.debug(
            "set task state dag_id {0} task_id {1} execution_date {2} try_number {3} "
            "current try_number {4} state {5} ack_id {6} action {7}.".format(
                dag_id, task_id, execution_date, try_number, ti.try_number,
                state, ts.ack_id, ts.action))
        is_restart = False
        if state == State.FAILED or state == State.SUCCESS or state == State.SHUTDOWN:
            if ti.try_number == try_number and ti.state == State.QUEUED:
                msg = ("Executor reports task instance {} finished ({}) "
                       "although the task says its {}. Was the task "
                       "killed externally?".format(ti, state, ti.state))
                Stats.incr('scheduler.tasks.killed_externally')
                self.log.error(msg)
                try:
                    dag = self.task_route.find_dagrun(dag_id, execution_date)
                    ti.task = dag.get_task(task_id)
                    ti.handle_failure(msg)
                except Exception:
                    self.log.error(
                        "Cannot load the dag bag to handle failure for %s"
                        ". Setting task to FAILED without callbacks or "
                        "retries. Do you have enough resources?", ti)
                ti.state = State.FAILED
                session.merge(ti)
            else:
                if ts.action is None:
                    self.log.debug(
                        "task dag_id {0} task_id {1} execution_date {2} action is None."
                        .format(dag_id, task_id, execution_date))
                elif TaskAction(ts.action) == TaskAction.RESTART:
                    # if ts.stop_flag is not None and ts.stop_flag == try_number:
                    ti.state = State.SCHEDULED
                    ts.action = None
                    ts.stop_flag = None
                    ts.ack_id = 0
                    session.merge(ti)
                    session.merge(ts)
                    self.log.debug(
                        "task dag_id {0} task_id {1} execution_date {2} try_number {3} restart action."
                        .format(dag_id, task_id, execution_date,
                                str(try_number)))
                    is_restart = True
                elif TaskAction(ts.action) == TaskAction.STOP:
                    # if ts.stop_flag is not None and ts.stop_flag == try_number:
                    ts.action = None
                    ts.stop_flag = None
                    ts.ack_id = 0
                    session.merge(ts)
                    self.log.debug(
                        "task dag_id {0} task_id {1} execution_date {2} try_number {3} stop action."
                        .format(dag_id, task_id, execution_date,
                                str(try_number)))
                else:
                    self.log.debug(
                        "task dag_id {0} task_id {1} execution_date {2} action {3}."
                        .format(dag_id, task_id, execution_date, ts.action))
            session.commit()

        if not is_restart and ti.state == State.RUNNING:
            self.log.debug(
                "set task dag_id {0} task_id {1} execution_date {2} state {3}".
                format(dag_id, task_id, execution_date, state))
            ti.state = state
            session.merge(ti)
        session.commit()
        # update dagrun state
        sync_dag_run = session.query(DagRun).filter(
            DagRun.id == dag_run.id).first()
        if sync_dag_run.state not in FINISHED_STATES:
            if self.dagrun_route.find_dagrun_by_id(sync_dag_run.id) is None:
                self.log.error(
                    "DagRun lost dag_id {0} task_id {1} execution_date {2}".
                    format(dag_id, task_id, execution_date))
            else:
                run_process_func(target=dag_run_update_state,
                                 args=(
                                     dag_run,
                                     self.dagrun_route.find_simple_dag(
                                         dag_run.id),
                                 ))
Example #23
    def heartbeat_callback(self):
        Stats.gauge('scheduler_heartbeat', 1, 1)
Example #24
    def heartbeat_callback(self, session=None):
        Stats.gauge('scheduler_heartbeat', 1, 1)
Example #25
    def heartbeat_callback(self, session=None):
        Stats.gauge('scheduler_heartbeat', 1, 1)
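Examples #23-#25 (and #27) are the scheduler's liveness beacon: each heartbeat sets the same gauge to 1, and an external monitor alerts when the metric goes stale. For reference, the statsd datagram such a call produces looks like this (the 'airflow' prefix is assumed):

import socket

# What Stats.gauge('scheduler_heartbeat', 1, 1) puts on the wire, assuming an
# 'airflow' prefix: one UDP datagram in statsd's gauge format.
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
sock.sendto(b'airflow.scheduler_heartbeat:1|g', ('localhost', 8125))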
Example #26
    def _log_file_processing_stats(self, known_file_paths):
        """
        Print out stats about how files are getting processed.

        :param known_file_paths: a list of file paths that may contain Airflow
            DAG definitions
        :type known_file_paths: list[unicode]
        :return: None
        """

        # File Path: Path to the file containing the DAG definition
        # PID: PID associated with the process that's processing the file. May
        # be empty.
        # Runtime: If the process is currently running, how long it's been
        # running for in seconds.
        # Last Runtime: If the process ran before, how long did it take to
        # finish in seconds
        # Last Run: When the file finished processing in the previous run.
        headers = ["File Path",
                   "PID",
                   "Runtime",
                   "Last Runtime",
                   "Last Run"]

        rows = []
        for file_path in known_file_paths:
            last_runtime = self.get_last_runtime(file_path)
            file_name = os.path.basename(file_path)
            file_name = os.path.splitext(file_name)[0].replace(os.sep, '.')
            if last_runtime:
                Stats.gauge(
                    'dag_processing.last_runtime.{}'.format(file_name),
                    last_runtime
                )

            processor_pid = self.get_pid(file_path)
            processor_start_time = self.get_start_time(file_path)
            runtime = ((timezone.utcnow() - processor_start_time).total_seconds()
                       if processor_start_time else None)
            last_run = self.get_last_finish_time(file_path)
            if last_run:
                seconds_ago = (timezone.utcnow() - last_run).total_seconds()
                Stats.gauge(
                    'dag_processing.last_run.seconds_ago.{}'.format(file_name),
                    seconds_ago
                )

            rows.append((file_path,
                         processor_pid,
                         runtime,
                         last_runtime,
                         last_run))

        # Sort by longest last runtime. (Can't sort None values in python3)
        rows = sorted(rows, key=lambda x: x[3] or 0.0)

        formatted_rows = []
        for file_path, pid, runtime, last_runtime, last_run in rows:
            formatted_rows.append((file_path,
                                   pid,
                                   "{:.2f}s".format(runtime)
                                   if runtime else None,
                                   "{:.2f}s".format(last_runtime)
                                   if last_runtime else None,
                                   last_run.strftime("%Y-%m-%dT%H:%M:%S")
                                   if last_run else None))
        log_str = ("\n" +
                   "=" * 80 +
                   "\n" +
                   "DAG File Processing Stats\n\n" +
                   tabulate(formatted_rows, headers=headers) +
                   "\n" +
                   "=" * 80)

        self.log.info(log_str)
Example #27
    def heartbeat_callback(self):
        Stats.gauge('scheduler_heartbeat', 1, 1)
Example #28
    def _execute(self):
        TI = models.TaskInstance

        pessimistic_connection_handling()

        logging.basicConfig(level=logging.DEBUG)
        self.logger.info("Starting the scheduler")

        dagbag = models.DagBag(self.subdir, sync_to_db=True)
        executor = self.executor = dagbag.executor
        executor.start()
        self.runs = 0
        while not self.num_runs or self.num_runs > self.runs:
            try:
                loop_start_dttm = datetime.now()
                try:
                    self.prioritize_queued(executor=executor, dagbag=dagbag)
                except Exception as e:
                    self.logger.exception(e)

                self.runs += 1
                try:
                    if self.runs % self.refresh_dags_every == 0:
                        dagbag = models.DagBag(self.subdir, sync_to_db=True)
                    else:
                        dagbag.collect_dags(only_if_updated=True)
                except Exception as e:
                    self.logger.error("Failed at reloading the dagbag. {}".format(e))
                    Stats.incr('dag_refresh_error', 1, 1)
                    sleep(5)

                if len(self.dag_ids) > 0:
                    dags = [dag for dag in dagbag.dags.values() if dag.dag_id in self.dag_ids]
                else:
                    dags = [
                        dag for dag in dagbag.dags.values()
                        if not dag.parent_dag]

                paused_dag_ids = dagbag.paused_dags()
                dags = [x for x in dags if x.dag_id not in paused_dag_ids]
                # dags = filter(lambda x: x.dag_id not in paused_dag_ids, dags)

                self.logger.debug("Total Cores: {} Max Threads: {} DAGs:{}".
                                  format(multiprocessing.cpu_count(),
                                         self.max_threads,
                                         len(dags)))
                dags = self._split(dags, math.ceil(len(dags) / self.max_threads))
                tis_q = multiprocessing.Queue()
                jobs = [multiprocessing.Process(target=self._do_dags,
                                                args=(dagbag, dags[i], tis_q))
                        for i in range(len(dags))]

                self.logger.info("Starting {} scheduler jobs".format(len(jobs)))
                for j in jobs:
                    j.start()

                while any(j.is_alive() for j in jobs):
                    while not tis_q.empty():
                        ti_key, pickle_id = tis_q.get()
                        dag = dagbag.dags[ti_key[0]]
                        task = dag.get_task(ti_key[1])
                        ti = TI(task, ti_key[2])
                        self.executor.queue_task_instance(ti, pickle_id=pickle_id)

                for j in jobs:
                    j.join()

                self.logger.info("Done queuing tasks, calling the executor's "
                              "heartbeat")
                duration_sec = (datetime.now() - loop_start_dttm).total_seconds()
                self.logger.info("Loop took: {} seconds".format(duration_sec))
                Stats.timing("scheduler_loop", duration_sec * 1000)
                try:
                    self.import_errors(dagbag)
                except Exception as e:
                    self.logger.exception(e)
                try:
                    dagbag.kill_zombies()
                except Exception as e:
                    self.logger.exception(e)
                try:
                    # We really just want the scheduler to never ever stop.
                    executor.heartbeat()
                    self.heartbeat()
                except Exception as e:
                    self.logger.exception(e)
                    self.logger.error("Tachycardia!")
            except Exception as deep_e:
                self.logger.exception(deep_e)
                raise
            finally:
                settings.Session.remove()
        executor.end()
Example #29
    def _execute(self):
        self.task_runner = get_task_runner(self)

        def signal_handler(signum, frame):
            """Setting kill signal handler"""
            self.log.error("Received SIGTERM. Terminating subprocesses")
            self.on_kill()
            raise AirflowException("LocalTaskJob received SIGTERM signal")

        signal.signal(signal.SIGTERM, signal_handler)

        if not self.task_instance._check_and_change_state_before_execution(
                mark_success=self.mark_success,
                ignore_all_deps=self.ignore_all_deps,
                ignore_depends_on_past=self.ignore_depends_on_past,
                ignore_task_deps=self.ignore_task_deps,
                ignore_ti_state=self.ignore_ti_state,
                job_id=self.id,
                pool=self.pool):
            self.log.info("Task is not able to be run")
            return

        try:
            self.task_runner.start()

            last_heartbeat_time = time.time()
            heartbeat_time_limit = conf.getint(
                'scheduler', 'scheduler_zombie_task_threshold')
            while True:
                # Monitor the task to see if it's done
                return_code = self.task_runner.return_code()
                if return_code is not None:
                    self.log.info("Task exited with return code %s",
                                  return_code)
                    return

                # Periodically heartbeat so that the scheduler doesn't think this
                # is a zombie
                try:
                    self.heartbeat()
                    last_heartbeat_time = time.time()
                except OperationalError:
                    Stats.incr('local_task_job_heartbeat_failure', 1, 1)
                    self.log.exception(
                        "Exception while trying to heartbeat! Sleeping for %s seconds",
                        self.heartrate)
                    time.sleep(self.heartrate)

                # If it's been too long since we've heartbeat, then it's possible that
                # the scheduler rescheduled this task, so kill launched processes.
                time_since_last_heartbeat = time.time() - last_heartbeat_time
                if time_since_last_heartbeat > heartbeat_time_limit:
                    Stats.incr('local_task_job_prolonged_heartbeat_failure', 1,
                               1)
                    self.log.error("Heartbeat time limited exceeded!")
                    raise AirflowException(
                        "Time since last heartbeat({:.2f}s) "
                        "exceeded limit ({}s).".format(
                            time_since_last_heartbeat, heartbeat_time_limit))
        finally:
            self.on_kill()
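Compared with Example #1, this variant measures heartbeat age with local time.time() instead of the DB-backed latest_heartbeat column, and a failed heartbeat (OperationalError) is counted as local_task_job_heartbeat_failure and retried after sleeping one heartrate, rather than aborting the monitor loop.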
Example #30
    def collect_dags(self,
                     dag_folder=None,
                     only_if_updated=True,
                     include_examples=conf.getboolean('core', 'LOAD_EXAMPLES'),
                     safe_mode=conf.getboolean('core',
                                               'DAG_DISCOVERY_SAFE_MODE')):
        """
        Given a file path or a folder, this method looks for python modules,
        imports them and adds them to the dagbag collection.

        Note that if a ``.airflowignore`` file is found while processing
        the directory, it will behave much like a ``.gitignore``,
        ignoring files that match any of the regex patterns specified
        in the file.

        **Note**: The patterns in .airflowignore are treated as
        un-anchored regexes, not shell-like glob patterns.
        """
        if self.store_serialized_dags:
            return

        self.log.info("Filling up the DagBag from %s", dag_folder)
        dag_folder = dag_folder or self.dag_folder
        # Used to store stats around DagBag processing
        stats = []
        FileLoadStat = namedtuple('FileLoadStat',
                                  "file duration dag_num task_num dags")

        dag_folder = correct_maybe_zipped(dag_folder)

        dags_by_name = {}

        for filepath in list_py_file_paths(dag_folder,
                                           safe_mode=safe_mode,
                                           include_examples=include_examples):
            try:
                ts = timezone.utcnow()
                found_dags = self.process_file(filepath,
                                               only_if_updated=only_if_updated,
                                               safe_mode=safe_mode)
                dag_ids = [dag.dag_id for dag in found_dags]
                dag_id_names = str(dag_ids)

                td = timezone.utcnow() - ts
                # total_seconds() already includes the fractional (microsecond) part
                td = td.total_seconds()
                dags_by_name[dag_id_names] = dag_ids
                stats.append(
                    FileLoadStat(
                        filepath.replace(settings.DAGS_FOLDER, ''),
                        td,
                        len(found_dags),
                        sum([len(dag.tasks) for dag in found_dags]),
                        dag_id_names,
                    ))
            except Exception as e:
                self.log.exception(e)
        self.dagbag_stats = sorted(stats,
                                   key=lambda x: x.duration,
                                   reverse=True)
        for file_stat in self.dagbag_stats:
            dag_ids = dags_by_name[file_stat.dags]
            if file_stat.dag_num >= 1:
                # if we found multiple dags per file, the stat is 'dag_id1 _ dag_id2'
                dag_names = '_'.join(dag_ids)
                Stats.timing('dag.loading-duration.{}'.format(dag_names),
                             file_stat.duration)
Example #31
    def _execute(self):
        session = settings.Session()
        TI = models.TaskInstance

        pessimistic_connection_handling()

        logging.basicConfig(level=logging.DEBUG)
        self.logger.info("Starting the scheduler")

        dagbag = models.DagBag(self.subdir, sync_to_db=True)
        executor = self.executor = dagbag.executor
        executor.start()

        # grab orphaned tasks and make sure to reset their state
        active_runs = DagRun.find(state=State.RUNNING,
                                  external_trigger=False,
                                  session=session)
        for dr in active_runs:
            self._reset_state_for_orphaned_tasks(dr, session=session)

        self.runs = 0
        while not self.num_runs or self.num_runs > self.runs:
            try:
                loop_start_dttm = datetime.now()
                try:
                    self.prioritize_queued(executor=executor, dagbag=dagbag)
                except Exception as e:
                    self.logger.exception(e)

                self.runs += 1
                try:
                    if self.runs % self.refresh_dags_every == 0:
                        dagbag = models.DagBag(self.subdir, sync_to_db=True)
                    else:
                        dagbag.collect_dags(only_if_updated=True)
                except Exception as e:
                    self.logger.error(
                        "Failed at reloading the dagbag. {}".format(e))
                    Stats.incr('dag_refresh_error', 1, 1)
                    sleep(5)

                if len(self.dag_ids) > 0:
                    dags = [
                        dag for dag in dagbag.dags.values()
                        if dag.dag_id in self.dag_ids
                    ]
                else:
                    dags = [
                        dag for dag in dagbag.dags.values()
                        if not dag.parent_dag
                    ]

                paused_dag_ids = dagbag.paused_dags()
                dags = [x for x in dags if x.dag_id not in paused_dag_ids]
                # dags = filter(lambda x: x.dag_id not in paused_dag_ids, dags)

                self.logger.debug(
                    "Total Cores: {} Max Threads: {} DAGs:{}".format(
                        multiprocessing.cpu_count(), self.max_threads,
                        len(dags)))
                dags = self._split(dags,
                                   math.ceil(len(dags) / self.max_threads))
                tis_q = multiprocessing.Queue()
                jobs = [
                    multiprocessing.Process(target=self._do_dags,
                                            args=(dagbag, dags[i], tis_q))
                    for i in range(len(dags))
                ]

                self.logger.info("Starting {} scheduler jobs".format(
                    len(jobs)))
                for j in jobs:
                    j.start()

                while any(j.is_alive() for j in jobs):
                    while not tis_q.empty():
                        ti_key, pickle_id = tis_q.get()
                        dag = dagbag.dags[ti_key[0]]
                        task = dag.get_task(ti_key[1])
                        ti = TI(task, ti_key[2])
                        ti.refresh_from_db(session=session,
                                           lock_for_update=True)
                        if ti.state == State.SCHEDULED:
                            session.commit()
                            self.logger.debug(
                                "Task {} was picked up by another scheduler".
                                format(ti))
                            continue
                        elif ti.state is State.NONE:
                            ti.state = State.SCHEDULED

                        self.executor.queue_task_instance(ti,
                                                          pickle_id=pickle_id)

                        session.merge(ti)
                        session.commit()

                for j in jobs:
                    j.join()

                self.logger.info("Done queuing tasks, calling the executor's "
                                 "heartbeat")
                duration_sec = (datetime.now() -
                                loop_start_dttm).total_seconds()
                self.logger.info("Loop took: {} seconds".format(duration_sec))
                Stats.timing("scheduler_loop", duration_sec * 1000)
                try:
                    self.import_errors(dagbag)
                except Exception as e:
                    self.logger.exception(e)
                try:
                    dagbag.kill_zombies()
                except Exception as e:
                    self.logger.exception(e)
                try:
                    # We really just want the scheduler to never ever stop.
                    executor.heartbeat()
                    self.heartbeat()
                except Exception as e:
                    self.logger.exception(e)
                    self.logger.error("Tachycardia!")
            except Exception as deep_e:
                self.logger.exception(deep_e)
                raise
            finally:
                settings.Session.remove()
        executor.end()
        session.close()
Example #32
    def update_state(self, session=None):
        """
        Determines the overall state of the DagRun based on the state
        of its TaskInstances.

        :return: State
        """

        dag = self.get_dag()

        tis = self.get_task_instances(session=session)
        self.log.debug("Updating state for %s considering %s task(s)", self,
                       len(tis))

        for ti in list(tis):
            # skip in db?
            if ti.state == State.REMOVED:
                tis.remove(ti)
            else:
                ti.task = dag.get_task(ti.task_id)

        # pre-calculate
        # db is faster
        start_dttm = timezone.utcnow()
        unfinished_tasks = self.get_task_instances(state=State.unfinished(),
                                                   session=session)
        none_depends_on_past = all(not t.task.depends_on_past
                                   for t in unfinished_tasks)
        none_task_concurrency = all(t.task.task_concurrency is None
                                    for t in unfinished_tasks)
        # small speed up
        if unfinished_tasks and none_depends_on_past and none_task_concurrency:
            # TODO: this can actually get pretty slow: one task costs between 0.01-0.15s
            no_dependencies_met = True
            for ut in unfinished_tasks:
                # We need to flag upstream and check for changes because upstream
                # failures/re-schedules can result in deadlock false positives
                old_state = ut.state
                deps_met = ut.are_dependencies_met(dep_context=DepContext(
                    flag_upstream_failed=True,
                    ignore_in_retry_period=True,
                    ignore_in_reschedule_period=True),
                                                   session=session)
                if deps_met or old_state != ut.current_state(session=session):
                    no_dependencies_met = False
                    break

        duration = (timezone.utcnow() - start_dttm).total_seconds() * 1000
        Stats.timing("dagrun.dependency-check.{}".format(self.dag_id),
                     duration)

        root_ids = [t.task_id for t in dag.roots]
        roots = [t for t in tis if t.task_id in root_ids]

        # if all roots finished and at least one failed, the run failed
        if (not unfinished_tasks
                and any(r.state in (State.FAILED, State.UPSTREAM_FAILED)
                        for r in roots)):
            self.log.info('Marking run %s failed', self)
            self.set_state(State.FAILED)
            dag.handle_callback(self,
                                success=False,
                                reason='task_failure',
                                session=session)

        # if all roots succeeded and no unfinished tasks, the run succeeded
        elif not unfinished_tasks and all(
                r.state in (State.SUCCESS, State.SKIPPED) for r in roots):
            self.log.info('Marking run %s successful', self)
            self.set_state(State.SUCCESS)
            dag.handle_callback(self,
                                success=True,
                                reason='success',
                                session=session)

        # if *all tasks* are deadlocked, the run failed
        elif (unfinished_tasks and none_depends_on_past
              and none_task_concurrency and no_dependencies_met):
            self.log.info('Deadlock; marking run %s failed', self)
            self.set_state(State.FAILED)
            dag.handle_callback(self,
                                success=False,
                                reason='all_tasks_deadlocked',
                                session=session)

        # finally, if the roots aren't done, the dag is still running
        else:
            self.set_state(State.RUNNING)

        self._emit_duration_stats_for_finished_state()

        # TODO: determine whether we want to use with_for_update to make sure to lock the run
        session.merge(self)
        session.commit()

        return self.state
Example #33
    def _log_file_processing_stats(self, known_file_paths):
        """
        Print out stats about how files are getting processed.

        :param known_file_paths: a list of file paths that may contain Airflow
            DAG definitions
        :type known_file_paths: list[unicode]
        :return: None
        """

        # File Path: Path to the file containing the DAG definition
        # PID: PID associated with the process that's processing the file. May
        # be empty.
        # Runtime: If the process is currently running, how long it's been
        # running for in seconds.
        # Last Runtime: If the process ran before, how long did it take to
        # finish in seconds
        # Last Run: When the file finished processing in the previous run.
        headers = ["File Path",
                   "PID",
                   "Runtime",
                   "Last Runtime",
                   "Last Run"]

        rows = []
        for file_path in known_file_paths:
            last_runtime = self.get_last_runtime(file_path)
            file_name = os.path.basename(file_path)
            file_name = os.path.splitext(file_name)[0].replace(os.sep, '.')
            if last_runtime:
                Stats.gauge(
                    'dag_processing.last_runtime.{}'.format(file_name),
                    last_runtime
                )

            processor_pid = self.get_pid(file_path)
            processor_start_time = self.get_start_time(file_path)
            runtime = ((timezone.utcnow() - processor_start_time).total_seconds()
                       if processor_start_time else None)
            last_run = self.get_last_finish_time(file_path)
            if last_run:
                seconds_ago = (timezone.utcnow() - last_run).total_seconds()
                Stats.gauge(
                    'dag_processing.last_run.seconds_ago.{}'.format(file_name),
                    seconds_ago
                )

            rows.append((file_path,
                         processor_pid,
                         runtime,
                         last_runtime,
                         last_run))

        # Sort by longest last runtime. (Can't sort None values in python3)
        rows = sorted(rows, key=lambda x: x[3] or 0.0)

        formatted_rows = []
        for file_path, pid, runtime, last_runtime, last_run in rows:
            formatted_rows.append((file_path,
                                   pid,
                                   "{:.2f}s".format(runtime)
                                   if runtime else None,
                                   "{:.2f}s".format(last_runtime)
                                   if last_runtime else None,
                                   last_run.strftime("%Y-%m-%dT%H:%M:%S")
                                   if last_run else None))
        log_str = ("\n" +
                   "=" * 80 +
                   "\n" +
                   "DAG File Processing Stats\n\n" +
                   tabulate(formatted_rows, headers=headers) +
                   "\n" +
                   "=" * 80)

        self.log.info(log_str)