def _execute(self):
    self.task_runner = get_task_runner(self)

    def signal_handler(signum, frame):
        """Setting kill signal handler"""
        self.log.error("Received SIGTERM. Terminating subprocesses")
        self.on_kill()
        raise AirflowException("LocalTaskJob received SIGTERM signal")
    signal.signal(signal.SIGTERM, signal_handler)

    if not self.task_instance._check_and_change_state_before_execution(
            mark_success=self.mark_success,
            ignore_all_deps=self.ignore_all_deps,
            ignore_depends_on_past=self.ignore_depends_on_past,
            ignore_task_deps=self.ignore_task_deps,
            ignore_ti_state=self.ignore_ti_state,
            job_id=self.id,
            pool=self.pool):
        self.log.info("Task is not able to be run")
        return

    try:
        self.task_runner.start()

        heartbeat_time_limit = conf.getint('scheduler',
                                           'scheduler_zombie_task_threshold')
        while True:
            # Monitor the task to see if it's done
            return_code = self.task_runner.return_code()
            if return_code is not None:
                self.log.info("Task exited with return code %s", return_code)
                return

            self.heartbeat()

            # If it's been too long since we've heartbeat, then it's possible that
            # the scheduler rescheduled this task, so kill launched processes.
            # This can only really happen if the worker can't read the DB for a long time
            time_since_last_heartbeat = (timezone.utcnow() - self.latest_heartbeat).total_seconds()
            if time_since_last_heartbeat > heartbeat_time_limit:
                Stats.incr('local_task_job_prolonged_heartbeat_failure', 1, 1)
                self.log.error("Heartbeat time limit exceeded!")
                raise AirflowException("Time since last heartbeat ({:.2f}s) "
                                       "exceeded limit ({}s)."
                                       .format(time_since_last_heartbeat,
                                               heartbeat_time_limit))

            if time_since_last_heartbeat < self.heartrate:
                sleep_for = self.heartrate - time_since_last_heartbeat
                self.log.debug("Time since last heartbeat (%.2f s) < heartrate (%s s)"
                               ", sleeping for %s s",
                               time_since_last_heartbeat, self.heartrate, sleep_for)
                time.sleep(sleep_for)
    finally:
        self.on_kill()

def _emit_duration_stats_for_finished_state(self):
    if self.state == State.RUNNING:
        return

    duration = (self.end_date - self.start_date)
    if self.state is State.SUCCESS:
        Stats.timing('dagrun.duration.success.{}'.format(self.dag_id), duration)
    elif self.state == State.FAILED:
        Stats.timing('dagrun.duration.failed.{}'.format(self.dag_id), duration)

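# The snippets in this file assume a statsd-style ``Stats`` client exposing
# ``incr(stat, count=1, rate=1)``, ``gauge(stat, value, rate=1)`` and
# ``timing(stat, dt)`` (where ``dt`` may be a number of milliseconds or a
# ``datetime.timedelta``). A minimal no-op sketch of that assumed interface,
# useful only for reading the call sites below (a hypothetical stand-in, not
# Airflow's actual implementation):
class DummyStats(object):
    """No-op stats client with the interface the snippets below rely on."""

    @classmethod
    def incr(cls, stat, count=1, rate=1, tags=None):
        # e.g. Stats.incr('zombies_killed') or Stats.incr('dag_refresh_error', 1, 1)
        pass

    @classmethod
    def gauge(cls, stat, value, rate=1, delta=False, tags=None):
        # e.g. Stats.gauge('executor.open_slots', open_slots)
        pass

    @classmethod
    def timing(cls, stat, dt, tags=None):
        # e.g. Stats.timing('scheduler_loop', duration_sec * 1000)
        pass
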
def stop_time(ctx, *args, **kwargs):
    start = ctx['start_time']
    stop = datetime.now()
    duration = (stop - start).total_seconds() * 1000
    tags = {
        'dag': ctx['operator'].dag_id,
        'task': ctx['operator'].task_id,
        'operator': ctx['operator'].__class__.__name__,
        'domain': ctx['domain'],
    }
    Stats.timing('request.duration', duration, tags=tags)

def bq_duration(ctx, self, *args, **kwargs):
    stats = ctx['job']['statistics']
    creation = int(stats['creationTime'])
    start = int(stats['startTime'])
    end = int(stats['endTime'])
    tags = {
        'dag': self.dag_id,
        'task': self.task_id,
        'operator': self.__class__.__name__,
    }
    Stats.timing('task.delay.gcs_to_bq', start - creation, tags=tags)
    Stats.timing('task.duration.gcs_to_bq', end - start, tags=tags)

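# For reference, the BigQuery job ``statistics`` block used above reports
# ``creationTime``/``startTime``/``endTime`` as epoch timestamps in
# milliseconds (returned as strings), so the differences are already in the
# millisecond units that statsd timing metrics expect. A small illustrative
# check with made-up sample values:
sample_stats = {'creationTime': '1546300800000',
                'startTime': '1546300802500',
                'endTime': '1546300810000'}
delay_ms = int(sample_stats['startTime']) - int(sample_stats['creationTime'])  # 2500
duration_ms = int(sample_stats['endTime']) - int(sample_stats['startTime'])    # 7500
assert (delay_ms, duration_ms) == (2500, 7500)
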
def _kill_timed_out_processors(self):
    """
    Kill any file processors that have timed out, to defend against process hangs.
    """
    now = timezone.utcnow()
    for file_path, processor in self._processors.items():
        duration = now - processor.start_time
        if duration > self._processor_timeout:
            self.log.info(
                "Processor for %s with PID %s started at %s has timed out, "
                "killing it.",
                processor.file_path, processor.pid,
                processor.start_time.isoformat())
            Stats.incr('dag_file_processor_timeouts', 1, 1)
            processor.kill()

def task_states(_since, session=None):
    states = (
        session.query(TaskInstance.state, sqlalchemy.func.count())
        .group_by(TaskInstance.state)
    )
    for state, count in states:
        if state is None:
            continue
        tags = {
            'state': state,
        }
        Stats.gauge('task.state', count, tags=tags)

def http_status(ctx, *args, **kwargs):
    response = ctx['return']
    status = response.status_code
    tags = {
        'dag': ctx['operator'].dag_id,
        'task': ctx['operator'].task_id,
        'operator': ctx['operator'].__class__.__name__,
        'domain': ctx['domain'],
        'status': status,
    }
    if status < 400:
        Stats.incr('request.status.success', tags=tags)
    else:
        Stats.incr('request.status.failure', tags=tags)

def bq_task_states(since, session=None):
    states = (
        session.query(TaskInstance.state, sqlalchemy.func.count())
        .filter(TaskInstance.operator == 'BigQueryOperator')
        .filter(TaskInstance.end_date > since)
        .group_by(TaskInstance.state)
    )
    for state, count in states:
        if state is None:
            continue
        tags = {
            'state': state,
        }
        Stats.incr('task.state.bq', count, tags=tags)

def _process_dags_and_create_dagruns(self, dagbag, dags, dagrun_out):
    """
    Iterates over the dags and processes them. Processing includes:

    1. Create appropriate DagRun(s) in the DB.
    2. Create appropriate TaskInstance(s) in the DB.
    3. Send emails for tasks that have missed SLAs.

    :param dagbag: a collection of DAGs to process
    :type dagbag: airflow.models.DagBag
    :param dags: the DAGs from the DagBag to process
    :type dags: list[airflow.models.DAG]
    :param dagrun_out: a list to which created DagRun objects are appended
    :type dagrun_out: list[DagRun]
    :rtype: None
    """
    for dag in dags:
        dag_id = dag.dag_id
        dag = dagbag.get_dag(dag_id)
        if not dag:
            self.log.error("DAG ID %s was not found in the DagBag", dag_id)
            continue
        if dag.is_paused:
            self.log.info("Not processing DAG %s since it's paused", dag.dag_id)
            continue

        self.log.info("Processing %s", dag.dag_id)

        dag_run = self.create_dag_run(dag)
        if dag_run:
            dag_run.dag = dag
            expected_start_date = dag.following_schedule(dag_run.execution_date)
            if expected_start_date:
                schedule_delay = dag_run.start_date - expected_start_date
                Stats.timing(
                    'dagrun.schedule_delay.{dag_id}'.format(dag_id=dag.dag_id),
                    schedule_delay)
            self.log.info("Created %s", dag_run)
            self._create_task_instances(dag_run)
            self.log.info("Created task instances for %s", dag_run)
            dagrun_out.append(dag_run)
        if conf.getboolean('core', 'CHECK_SLAS', fallback=True):
            self.manage_slas(dag)

def collect_dags_from_db(self):
    """Collects DAGs from database."""
    from airflow.models.serialized_dag import SerializedDagModel
    start_dttm = timezone.utcnow()
    self.log.info("Filling up the DagBag from database")

    # The dagbag contains all rows in the serialized_dag table. Deleted DAGs are
    # removed from the table by the scheduler job.
    self.dags = SerializedDagModel.read_all_dags()

    # Add subdags.
    # DAG post-processing steps such as self.bag_dag and croniter are not needed as
    # they are done by the scheduler before serialization.
    subdags = {}
    for dag in self.dags.values():
        for subdag in dag.subdags:
            subdags[subdag.dag_id] = subdag
    self.dags.update(subdags)

    Stats.timing('collect_db_dags', timezone.utcnow() - start_dttm)

def heartbeat(self):
    # Triggering new jobs
    if not self.parallelism:
        open_slots = len(self.queued_tasks)
    else:
        open_slots = self.parallelism - len(self.running)

    num_running_tasks = len(self.running)
    num_queued_tasks = len(self.queued_tasks)

    self.log.debug("%s running task instances", num_running_tasks)
    self.log.debug("%s in queue", num_queued_tasks)
    self.log.debug("%s open slots", open_slots)

    Stats.gauge('executor.open_slots', open_slots)
    Stats.gauge('executor.queued_tasks', num_queued_tasks)
    Stats.gauge('executor.running_tasks', num_running_tasks)

    sorted_queue = sorted(
        [(k, v) for k, v in self.queued_tasks.items()],
        key=lambda x: x[1][1],
        reverse=True)
    for i in range(min((open_slots, len(self.queued_tasks)))):
        key, (command, _, queue, simple_ti) = sorted_queue.pop(0)
        self.queued_tasks.pop(key)
        self.running[key] = command
        self.execute_async(key=key,
                           command=command,
                           queue=queue,
                           executor_config=simple_ti.executor_config)

    # Calling child class sync method
    self.log.debug("Calling the %s sync method", self.__class__)
    self.sync()

def verify_integrity(self, session=None):
    """
    Verifies the DagRun by checking for removed tasks or tasks that are not in the
    database yet. It will set state to removed or add the task if required.
    """
    from airflow.models.taskinstance import TaskInstance  # Avoid circular import

    dag = self.get_dag()
    tis = self.get_task_instances(session=session)

    # check for removed or restored tasks
    task_ids = set()
    for ti in tis:
        task_instance_mutation_hook(ti)
        task_ids.add(ti.task_id)
        task = None
        try:
            task = dag.get_task(ti.task_id)
        except AirflowException:
            if ti.state == State.REMOVED:
                pass  # ti has already been removed, just ignore it
            elif self.state is not State.RUNNING and not dag.partial:
                self.log.warning("Failed to get task '{}' for dag '{}'. "
                                 "Marking it as removed.".format(ti, dag))
                Stats.incr(
                    "task_removed_from_dag.{}".format(dag.dag_id), 1, 1)
                ti.state = State.REMOVED

        is_task_in_dag = task is not None
        should_restore_task = is_task_in_dag and ti.state == State.REMOVED
        if should_restore_task:
            self.log.info("Restoring task '{}' which was previously "
                          "removed from DAG '{}'".format(ti, dag))
            Stats.incr("task_restored_to_dag.{}".format(dag.dag_id), 1, 1)
            ti.state = State.NONE
        session.merge(ti)

    # check for missing tasks
    for task in six.itervalues(dag.task_dict):
        if task.start_date > self.execution_date and not self.is_backfill:
            continue

        if task.task_id not in task_ids:
            Stats.incr(
                "task_instance_created-{}".format(task.__class__.__name__),
                1, 1)
            ti = TaskInstance(task, self.execution_date)
            task_instance_mutation_hook(ti)
            session.add(ti)

    try:
        session.commit()
    except IntegrityError as err:
        self.log.info(str(err))
        self.log.info(
            'Hit IntegrityError while creating the TIs for %s - %s',
            dag.dag_id, self.execution_date
        )
        self.log.info('Doing session rollback.')
        session.rollback()

def _emit_true_scheduling_delay_stats_for_finished_state(self, finished_tis):
    """
    This is a helper method to emit the true scheduling delay stats, which is defined as
    the time when the first task in the DAG starts minus the expected DAG run datetime.
    This method will be used in the update_state method when the state of the DagRun
    is updated to a completed status (either success or failure). It finds the first
    started task within the DAG, calculates the expected DagRun start time (based on
    dag.execution_date & dag.schedule_interval), and subtracts the two to get the delay.
    The emitted data may contain outliers (e.g. when the first task was cleared, so
    the second task's start_date will be used), but we can get rid of the outliers
    on the stats side through the dashboard tooling.
    Note: the stat will only be emitted if the DagRun is a scheduler-triggered one
    (i.e. external_trigger is False).
    """
    if self.state == State.RUNNING:
        return
    if self.external_trigger:
        return
    if not finished_tis:
        return

    try:
        dag = self.get_dag()

        if not self.dag.schedule_interval or self.dag.schedule_interval == "@once":
            # We can't emit this metric if there is no following schedule to calculate from!
            return

        ordered_tis_by_start_date = [ti for ti in finished_tis if ti.start_date]
        ordered_tis_by_start_date.sort(key=lambda ti: ti.start_date, reverse=False)
        first_start_date = ordered_tis_by_start_date[0].start_date
        if first_start_date:
            # dag.following_schedule calculates the expected start datetime for a scheduled dagrun
            # i.e. a daily flow for execution date 1/1/20 actually runs on 1/2/20 hh:mm:ss,
            # and ti.start_date will be 1/2/20 hh:mm:ss, so the following schedule is the
            # right point of comparison
            true_delay = first_start_date - dag.following_schedule(self.execution_date)
            if true_delay.total_seconds() > 0:
                Stats.timing('dagrun.{}.first_task_scheduling_delay'.format(dag.dag_id),
                             true_delay)
    except Exception as e:
        self.log.warning('Failed to record first_task_scheduling_delay metric: %s', e)

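# A worked example of the delay computed above (illustrative values only):
# for a daily DAG, execution_date 2020-01-01 is expected to start at the
# *following* schedule, 2020-01-02 00:00:00. If the first task instance
# actually started at 2020-01-02 00:03:20, the emitted delay is 200 seconds.
from datetime import datetime

expected_start = datetime(2020, 1, 2, 0, 0, 0)     # dag.following_schedule(execution_date)
first_task_start = datetime(2020, 1, 2, 0, 3, 20)  # earliest ti.start_date
true_delay = first_task_start - expected_start
assert true_delay.total_seconds() == 200.0
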
def run(self):
    Stats.incr(self.__class__.__name__.lower() + '_start', 1, 1)
    # Adding an entry in the DB
    session = settings.Session()
    self.state = State.RUNNING
    session.add(self)
    session.commit()
    id_ = self.id
    make_transient(self)
    self.id = id_

    # Run
    self._execute()

    # Marking the success in the DB
    self.end_date = datetime.now()
    self.state = State.SUCCESS
    session.merge(self)
    session.commit()
    session.close()

    Stats.incr(self.__class__.__name__.lower() + '_end', 1, 1)

def collect_dags(
        self,
        dag_folder=None,
        only_if_updated=True,
        include_examples=configuration.conf.getboolean('core', 'LOAD_EXAMPLES'),
        safe_mode=configuration.conf.getboolean('core', 'DAG_DISCOVERY_SAFE_MODE')):
    """
    Given a file path or a folder, this method looks for python modules,
    imports them and adds them to the dagbag collection.

    Note that if a ``.airflowignore`` file is found while processing
    the directory, it will behave much like a ``.gitignore``,
    ignoring files that match any of the regex patterns specified
    in the file.

    **Note**: The patterns in .airflowignore are treated as
    un-anchored regexes, not shell-like glob patterns.
    """
    start_dttm = timezone.utcnow()
    dag_folder = dag_folder or self.dag_folder

    # Used to store stats around DagBag processing
    stats = []
    FileLoadStat = namedtuple(
        'FileLoadStat', "file duration dag_num task_num dags")

    dag_folder = correct_maybe_zipped(dag_folder)

    for filepath in list_py_file_paths(dag_folder, safe_mode=safe_mode,
                                       include_examples=include_examples):
        try:
            ts = timezone.utcnow()
            found_dags = self.process_file(
                filepath, only_if_updated=only_if_updated,
                safe_mode=safe_mode)

            td = timezone.utcnow() - ts
            td = td.total_seconds() + (float(td.microseconds) / 1000000)
            stats.append(FileLoadStat(
                filepath.replace(dag_folder, ''),
                td,
                len(found_dags),
                sum([len(dag.tasks) for dag in found_dags]),
                str([dag.dag_id for dag in found_dags]),
            ))
        except Exception as e:
            self.log.exception(e)
    Stats.gauge(
        'collect_dags', (timezone.utcnow() - start_dttm).total_seconds(), 1)
    Stats.gauge('dagbag_size', len(self.dags), 1)
    Stats.gauge('dagbag_import_errors', len(self.import_errors), 1)
    self.dagbag_stats = sorted(
        stats, key=lambda x: x.duration, reverse=True)

def kill_zombies(self, session=None):
    """
    Fail zombie tasks, which are tasks that haven't had a heartbeat
    for too long, in the current DagBag.

    :param session: DB session.
    :type session: sqlalchemy.orm.session.Session
    """
    # Avoid circular import
    from airflow.models.taskinstance import TaskInstance as TI
    from airflow.jobs import LocalTaskJob as LJ

    # How many seconds we wait for tasks to heartbeat before marking them as zombies.
    zombie_threshold_secs = (configuration.getint(
        'scheduler', 'scheduler_zombie_task_threshold'))
    limit_dttm = timezone.utcnow() - timedelta(seconds=zombie_threshold_secs)
    self.log.debug("Failing jobs without heartbeat after %s", limit_dttm)

    tis = (
        session.query(TI)
        .join(LJ, TI.job_id == LJ.id)
        .filter(TI.state == State.RUNNING)
        .filter(TI.dag_id.in_(self.dags))
        .filter(
            or_(
                LJ.state != State.RUNNING,
                LJ.latest_heartbeat < limit_dttm,
            )
        ).all()
    )

    for ti in tis:
        self.log.info(
            "Detected zombie job with dag_id %s, task_id %s, and execution date %s",
            ti.dag_id, ti.task_id, ti.execution_date.isoformat())
        ti.test_mode = configuration.getboolean('core', 'unit_test_mode')
        ti.task = self.dags[ti.dag_id].get_task(ti.task_id)
        ti.handle_failure("{} detected as zombie".format(ti),
                          ti.test_mode, ti.get_template_context())
        self.log.info('Marked zombie job %s as %s', ti, ti.state)
        Stats.incr('zombies_killed')
    session.commit()

def heartbeat(self):
    # Triggering new jobs
    if not self.parallelism:
        open_slots = len(self.queued_tasks)
    else:
        open_slots = self.parallelism - len(self.running)

    num_running_tasks = len(self.running)
    num_queued_tasks = len(self.queued_tasks)

    self.log.debug("%s running task instances", num_running_tasks)
    self.log.debug("%s in queue", num_queued_tasks)
    self.log.debug("%s open slots", open_slots)

    Stats.gauge('executor.open_slots', open_slots)
    Stats.gauge('executor.queued_tasks', num_queued_tasks)
    Stats.gauge('executor.running_tasks', num_running_tasks)

    self.trigger_tasks(open_slots)

    # Calling child class sync method
    self.log.debug("Calling the %s sync method", self.__class__)
    self.sync()

def update_state(self, session=None):
    """
    Determines the overall state of the DagRun based on the state
    of its TaskInstances.

    :return: ready_tis: the tis that can be scheduled in the current loop
    :rtype ready_tis: list[airflow.models.TaskInstance]
    """
    dag = self.get_dag()
    ready_tis = []
    tis = [ti for ti in self.get_task_instances(
        session=session,
        state=State.task_states + (State.SHUTDOWN,))]
    self.log.debug("number of tis tasks for %s: %s task(s)", self, len(tis))
    for ti in list(tis):
        ti.task = dag.get_task(ti.task_id)

    start_dttm = timezone.utcnow()
    unfinished_tasks = [t for t in tis if t.state in State.unfinished()]
    finished_tasks = [t for t in tis
                      if t.state in State.finished() + [State.UPSTREAM_FAILED]]
    none_depends_on_past = all(not t.task.depends_on_past for t in unfinished_tasks)
    none_task_concurrency = all(t.task.task_concurrency is None
                                for t in unfinished_tasks)
    # small speed up
    if unfinished_tasks:
        scheduleable_tasks = [ut for ut in unfinished_tasks
                              if ut.state in SCHEDULEABLE_STATES]
        self.log.debug(
            "number of scheduleable tasks for %s: %s task(s)",
            self, len(scheduleable_tasks))
        ready_tis, changed_tis = self._get_ready_tis(
            scheduleable_tasks, finished_tasks, session)
        self.log.debug("ready tis length for %s: %s task(s)", self, len(ready_tis))
        if none_depends_on_past and none_task_concurrency:
            # small speed up
            are_runnable_tasks = ready_tis or self._are_premature_tis(
                unfinished_tasks, finished_tasks, session) or changed_tis

    duration = (timezone.utcnow() - start_dttm).total_seconds() * 1000
    Stats.timing("dagrun.dependency-check.{}".format(self.dag_id), duration)

    leaf_task_ids = {t.task_id for t in dag.leaves}
    leaf_tis = [ti for ti in tis if ti.task_id in leaf_task_ids]

    # if all leaf tasks finished and at least one failed, the run failed
    if not unfinished_tasks and any(
        leaf_ti.state in {State.FAILED, State.UPSTREAM_FAILED} for leaf_ti in leaf_tis
    ):
        self.log.info('Marking run %s failed', self)
        self.set_state(State.FAILED)
        dag.handle_callback(self, success=False, reason='task_failure',
                            session=session)

    # if all leaf tasks succeeded and no unfinished tasks, the run succeeded
    elif not unfinished_tasks and all(
        leaf_ti.state in {State.SUCCESS, State.SKIPPED} for leaf_ti in leaf_tis
    ):
        self.log.info('Marking run %s successful', self)
        self.set_state(State.SUCCESS)
        dag.handle_callback(self, success=True, reason='success', session=session)

    # if *all tasks* are deadlocked, the run failed
    elif (unfinished_tasks and none_depends_on_past and
          none_task_concurrency and not are_runnable_tasks):
        self.log.info('Deadlock; marking run %s failed', self)
        self.set_state(State.FAILED)
        dag.handle_callback(self, success=False, reason='all_tasks_deadlocked',
                            session=session)

    # finally, if the leaf tasks aren't done, the dag is still running
    else:
        self.set_state(State.RUNNING)

    self._emit_true_scheduling_delay_stats_for_finished_state(finished_tasks)
    self._emit_duration_stats_for_finished_state()

    # todo: determine whether we want to use with_for_update to make sure to lock the run
    session.merge(self)
    session.commit()

    return ready_tis

def process_file(self, file_path, zombies, pickle_dags=False, session=None):
    """
    Process a Python file containing Airflow DAGs.

    This includes:

    1. Execute the file and look for DAG objects in the namespace.
    2. Pickle the DAG and save it to the DB (if necessary).
    3. For each DAG, see what tasks should run and create appropriate task
       instances in the DB.
    4. Record any errors importing the file into ORM.
    5. Kill (in ORM) any task instances belonging to the DAGs that haven't
       issued a heartbeat in a while.

    Returns a list of SimpleDag objects that represent the DAGs found in
    the file.

    :param file_path: the path to the Python file that should be executed
    :type file_path: unicode
    :param zombies: zombie task instances to kill.
    :type zombies: list[airflow.utils.dag_processing.SimpleTaskInstance]
    :param pickle_dags: whether to serialize the DAGs found in the file and
        save them to the db
    :type pickle_dags: bool
    :return: a list of SimpleDagRuns made from the Dags found in the file
    :rtype: list[airflow.utils.dag_processing.SimpleDagBag]
    """
    self.log.info("Processing file %s for tasks to queue", file_path)

    if session is None:
        session = settings.Session()

    # As DAGs are parsed from this file, they will be converted into SimpleDags
    try:
        dagbag = models.DagBag(file_path, include_examples=False)
    except Exception:
        self.log.exception("Failed at reloading the DAG file %s", file_path)
        Stats.incr('dag_file_refresh_error', 1, 1)
        return [], []

    if len(dagbag.dags) > 0:
        self.log.info("DAG(s) %s retrieved from %s", dagbag.dags.keys(), file_path)
    else:
        self.log.warning("No viable dags retrieved from %s", file_path)
        self.update_import_errors(session, dagbag)
        return [], len(dagbag.import_errors)

    # Save individual DAGs in the ORM and update DagModel.last_scheduled_time
    for dag in dagbag.dags.values():
        dag.sync_to_db()

    paused_dag_ids = [dag.dag_id for dag in dagbag.dags.values() if dag.is_paused]
    self.log.info("paused_dag_ids %s", paused_dag_ids)
    self.log.info("self %s", self.dag_ids)

    dag_to_pickle = {}
    # Pickle the DAGs (if necessary) and put them into a SimpleDag
    for dag_id in dagbag.dags:
        # Only return DAGs that are not paused
        if dag_id not in paused_dag_ids:
            dag = dagbag.get_dag(dag_id)
            pickle_id = None
            if pickle_dags:
                pickle_id = dag.pickle(session).id
            dag_to_pickle[dag.dag_id] = pickle_id

    if len(self.dag_ids) > 0:
        dags = [dag for dag in dagbag.dags.values()
                if dag.dag_id in self.dag_ids and
                dag.dag_id not in paused_dag_ids]
    else:
        dags = [dag for dag in dagbag.dags.values()
                if not dag.parent_dag and
                dag.dag_id not in paused_dag_ids]

    # Not using multiprocessing.Queue() since it's no longer a separate
    # process and due to some unusual behavior. (empty() incorrectly
    # returns true as described in https://bugs.python.org/issue23582 )
    self.log.info("dags %s", dags)
    dag_run_out = []
    self._process_dags_and_create_dagruns(dagbag, dags, dag_run_out)
    self.log.info("dag run out %s", len(dag_run_out))
    simple_dag_runs = []
    for dag_run in dag_run_out:
        simple_dag_runs.append(SimpleDagRun(dag_run.id, SimpleDag(dag_run.dag)))

    # commit batch
    session.commit()

    # Record import errors into the ORM
    try:
        self.update_import_errors(session, dagbag)
    except Exception:
        self.log.exception("Error logging import errors!")
    try:
        dagbag.kill_zombies(zombies)
    except Exception:
        self.log.exception("Error killing zombies!")

    return simple_dag_runs, len(dagbag.import_errors)

def heartbeat(self):
    """
    Heartbeats update the job's entry in the database with a timestamp
    for the latest_heartbeat column and allow the job to be killed
    externally. This makes it possible to monitor, at the system level,
    what is actually active: for instance, an old heartbeat for a
    SchedulerJob would mean something is wrong. It also allows any job
    to be killed externally, regardless of who is running it or on
    which machine it is running.

    Note that if your heart rate is set to 60 seconds and you call this
    method after 10 seconds of processing since the last heartbeat, it
    will sleep 50 seconds to complete the 60 seconds and keep a steady
    heart rate. If you go over 60 seconds before calling it, it won't
    sleep at all.
    """
    previous_heartbeat = self.latest_heartbeat

    try:
        with create_session() as session:
            # This will cause it to load from the db
            session.merge(self)
            previous_heartbeat = self.latest_heartbeat

        if self.state == State.SHUTDOWN:
            self.kill()

        is_unit_test = conf.getboolean('core', 'unit_test_mode')
        if not is_unit_test:
            # Figure out how long to sleep for
            sleep_for = 0
            if self.latest_heartbeat:
                seconds_remaining = self.heartrate - \
                    (timezone.utcnow() - self.latest_heartbeat)\
                    .total_seconds()
                sleep_for = max(0, seconds_remaining)
            sleep(sleep_for)

        # Update last heartbeat time
        with create_session() as session:
            # Make the session aware of this object
            session.merge(self)
            self.latest_heartbeat = timezone.utcnow()
            session.commit()
            # At this point, the DB has updated.
            previous_heartbeat = self.latest_heartbeat

            self.heartbeat_callback(session=session)
            self.log.debug('[heartbeat]')
    except OperationalError:
        Stats.incr(
            convert_camel_to_snake(self.__class__.__name__) + '_heartbeat_failure', 1, 1)
        self.log.exception("%s heartbeat got an exception", self.__class__.__name__)
        # We didn't manage to heartbeat, so make sure that the timestamp isn't updated
        self.latest_heartbeat = previous_heartbeat

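# To illustrate the sleep behaviour described in the docstring above
# (numbers are only an example): with a 60 s heartrate and 10 s spent
# processing since the last heartbeat, the job sleeps the remaining 50 s;
# once processing exceeds the heartrate, it does not sleep at all.
def _sleep_for(heartrate, seconds_since_last_heartbeat):
    return max(0, heartrate - seconds_since_last_heartbeat)

assert _sleep_for(60, 10) == 50
assert _sleep_for(60, 75) == 0
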
def _set_task_instance_state(self, dag_run, dag_id, task_id, execution_date,
                             state, try_number, session=None):
    """
    Set the task state in the db and, if applicable, mark the dagrun as finished in the db.

    :param dag_run: DagRun object
    :param dag_id: DAG identifier
    :param task_id: task identifier
    :param execution_date: the dag run execution date
    :param state: the task state that should be set
    :param try_number: the task try_number
    :param session:
    :return:
    """
    TI = models.TaskInstance
    qry = session.query(TI).filter(TI.dag_id == dag_id,
                                   TI.task_id == task_id,
                                   TI.execution_date == execution_date)
    ti = qry.first()
    if not ti:
        self.log.warning("TaskInstance %s went missing from the database", ti)
        return
    ts = TaskState.query_task_state(ti, session)
    self.log.debug(
        "set task state dag_id {0} task_id {1} execution_date {2} try_number {3} "
        "current try_number {4} state {5} ack_id {6} action {7}.".format(
            dag_id, task_id, execution_date, try_number, ti.try_number,
            state, ts.ack_id, ts.action))
    is_restart = False
    if state == State.FAILED or state == State.SUCCESS or state == State.SHUTDOWN:
        if ti.try_number == try_number and ti.state == State.QUEUED:
            msg = ("Executor reports task instance {} finished ({}) "
                   "although the task says it's {}. Was the task "
                   "killed externally?".format(ti, state, ti.state))
            Stats.incr('scheduler.tasks.killed_externally')
            self.log.error(msg)
            try:
                dag = self.task_route.find_dagrun(dag_id, execution_date)
                ti.task = dag.get_task(task_id)
                ti.handle_failure(msg)
            except Exception:
                self.log.error("Cannot load the dag bag to handle failure for %s"
                               ". Setting task to FAILED without callbacks or "
                               "retries. Do you have enough resources?", ti)
                ti.state = State.FAILED
                session.merge(ti)
    else:
        if ts.action is None:
            self.log.debug(
                "task dag_id {0} task_id {1} execution_date {2} action is None."
                .format(dag_id, task_id, execution_date))
        elif TaskAction(ts.action) == TaskAction.RESTART:
            # if ts.stop_flag is not None and ts.stop_flag == try_number:
            ti.state = State.SCHEDULED
            ts.action = None
            ts.stop_flag = None
            ts.ack_id = 0
            session.merge(ti)
            session.merge(ts)
            self.log.debug(
                "task dag_id {0} task_id {1} execution_date {2} try_number {3} restart action."
                .format(dag_id, task_id, execution_date, str(try_number)))
            is_restart = True
        elif TaskAction(ts.action) == TaskAction.STOP:
            # if ts.stop_flag is not None and ts.stop_flag == try_number:
            ts.action = None
            ts.stop_flag = None
            ts.ack_id = 0
            session.merge(ts)
            self.log.debug(
                "task dag_id {0} task_id {1} execution_date {2} try_number {3} stop action."
                .format(dag_id, task_id, execution_date, str(try_number)))
        else:
            self.log.debug(
                "task dag_id {0} task_id {1} execution_date {2} action {3}."
                .format(dag_id, task_id, execution_date, ts.action))
    session.commit()
    if not is_restart and ti.state == State.RUNNING:
        self.log.debug("set task dag_id {0} task_id {1} execution_date {2} state {3}"
                       .format(dag_id, task_id, execution_date, state))
        ti.state = state
        session.merge(ti)
    session.commit()
    # update dagrun state
    sync_dag_run = session.query(DagRun).filter(DagRun.id == dag_run.id).first()
    if sync_dag_run.state not in FINISHED_STATES:
        if self.dagrun_route.find_dagrun_by_id(sync_dag_run.id) is None:
            self.log.error("DagRun lost dag_id {0} task_id {1} execution_date {2}"
                           .format(dag_id, task_id, execution_date))
        else:
            run_process_func(target=dag_run_update_state,
                             args=(dag_run,
                                   self.dagrun_route.find_simple_dag(dag_run.id),))

def heartbeat_callback(self):
    Stats.gauge('scheduler_heartbeat', 1, 1)

def heartbeat_callback(self, session=None):
    Stats.gauge('scheduler_heartbeat', 1, 1)

def _log_file_processing_stats(self, known_file_paths):
    """
    Print out stats about how files are getting processed.

    :param known_file_paths: a list of file paths that may contain Airflow
        DAG definitions
    :type known_file_paths: list[unicode]
    :return: None
    """
    # File Path: Path to the file containing the DAG definition
    # PID: PID associated with the process that's processing the file. May
    # be empty.
    # Runtime: If the process is currently running, how long it's been
    # running for in seconds.
    # Last Runtime: If the process ran before, how long did it take to
    # finish in seconds
    # Last Run: When the file finished processing in the previous run.
    headers = ["File Path", "PID", "Runtime", "Last Runtime", "Last Run"]

    rows = []
    for file_path in known_file_paths:
        last_runtime = self.get_last_runtime(file_path)
        file_name = os.path.basename(file_path)
        file_name = os.path.splitext(file_name)[0].replace(os.sep, '.')
        if last_runtime:
            Stats.gauge(
                'dag_processing.last_runtime.{}'.format(file_name),
                last_runtime
            )

        processor_pid = self.get_pid(file_path)
        processor_start_time = self.get_start_time(file_path)
        runtime = ((timezone.utcnow() - processor_start_time).total_seconds()
                   if processor_start_time else None)
        last_run = self.get_last_finish_time(file_path)
        if last_run:
            seconds_ago = (timezone.utcnow() - last_run).total_seconds()
            Stats.gauge(
                'dag_processing.last_run.seconds_ago.{}'.format(file_name),
                seconds_ago
            )

        rows.append((file_path,
                     processor_pid,
                     runtime,
                     last_runtime,
                     last_run))

    # Sort by longest last runtime. (Can't sort None values in python3)
    rows = sorted(rows, key=lambda x: x[3] or 0.0)

    formatted_rows = []
    for file_path, pid, runtime, last_runtime, last_run in rows:
        formatted_rows.append((file_path,
                               pid,
                               "{:.2f}s".format(runtime) if runtime else None,
                               "{:.2f}s".format(last_runtime) if last_runtime else None,
                               last_run.strftime("%Y-%m-%dT%H:%M:%S") if last_run else None))
    log_str = ("\n" +
               "=" * 80 +
               "\n" +
               "DAG File Processing Stats\n\n" +
               tabulate(formatted_rows, headers=headers) +
               "\n" +
               "=" * 80)

    self.log.info(log_str)

def _execute(self):
    TI = models.TaskInstance

    pessimistic_connection_handling()

    logging.basicConfig(level=logging.DEBUG)
    self.logger.info("Starting the scheduler")

    dagbag = models.DagBag(self.subdir, sync_to_db=True)
    executor = self.executor = dagbag.executor
    executor.start()
    self.runs = 0
    while not self.num_runs or self.num_runs > self.runs:
        try:
            loop_start_dttm = datetime.now()
            try:
                self.prioritize_queued(executor=executor, dagbag=dagbag)
            except Exception as e:
                self.logger.exception(e)

            self.runs += 1
            try:
                if self.runs % self.refresh_dags_every == 0:
                    dagbag = models.DagBag(self.subdir, sync_to_db=True)
                else:
                    dagbag.collect_dags(only_if_updated=True)
            except Exception as e:
                self.logger.error("Failed at reloading the dagbag. {}".format(e))
                Stats.incr('dag_refresh_error', 1, 1)
                sleep(5)

            if len(self.dag_ids) > 0:
                dags = [dag for dag in dagbag.dags.values()
                        if dag.dag_id in self.dag_ids]
            else:
                dags = [
                    dag for dag in dagbag.dags.values()
                    if not dag.parent_dag]

            paused_dag_ids = dagbag.paused_dags()
            dags = [x for x in dags if x.dag_id not in paused_dag_ids]
            # dags = filter(lambda x: x.dag_id not in paused_dag_ids, dags)

            self.logger.debug("Total Cores: {} Max Threads: {} DAGs: {}".format(
                multiprocessing.cpu_count(), self.max_threads, len(dags)))
            dags = self._split(dags, math.ceil(len(dags) / self.max_threads))
            tis_q = multiprocessing.Queue()
            jobs = [multiprocessing.Process(target=self._do_dags,
                                            args=(dagbag, dags[i], tis_q))
                    for i in range(len(dags))]

            self.logger.info("Starting {} scheduler jobs".format(len(jobs)))
            for j in jobs:
                j.start()

            while any(j.is_alive() for j in jobs):
                while not tis_q.empty():
                    ti_key, pickle_id = tis_q.get()
                    dag = dagbag.dags[ti_key[0]]
                    task = dag.get_task(ti_key[1])
                    ti = TI(task, ti_key[2])
                    self.executor.queue_task_instance(ti, pickle_id=pickle_id)

            for j in jobs:
                j.join()

            self.logger.info("Done queuing tasks, calling the executor's "
                             "heartbeat")
            duration_sec = (datetime.now() - loop_start_dttm).total_seconds()
            self.logger.info("Loop took: {} seconds".format(duration_sec))
            Stats.timing("scheduler_loop", duration_sec * 1000)
            try:
                self.import_errors(dagbag)
            except Exception as e:
                self.logger.exception(e)
            try:
                dagbag.kill_zombies()
            except Exception as e:
                self.logger.exception(e)
            try:
                # We really just want the scheduler to never ever stop.
                executor.heartbeat()
                self.heartbeat()
            except Exception as e:
                self.logger.exception(e)
                self.logger.error("Tachycardia!")
        except Exception as deep_e:
            self.logger.exception(deep_e)
            raise
        finally:
            settings.Session.remove()
    executor.end()

def _execute(self):
    self.task_runner = get_task_runner(self)

    def signal_handler(signum, frame):
        """Setting kill signal handler"""
        self.log.error("Received SIGTERM. Terminating subprocesses")
        self.on_kill()
        raise AirflowException("LocalTaskJob received SIGTERM signal")
    signal.signal(signal.SIGTERM, signal_handler)

    if not self.task_instance._check_and_change_state_before_execution(
            mark_success=self.mark_success,
            ignore_all_deps=self.ignore_all_deps,
            ignore_depends_on_past=self.ignore_depends_on_past,
            ignore_task_deps=self.ignore_task_deps,
            ignore_ti_state=self.ignore_ti_state,
            job_id=self.id,
            pool=self.pool):
        self.log.info("Task is not able to be run")
        return

    try:
        self.task_runner.start()

        last_heartbeat_time = time.time()
        heartbeat_time_limit = conf.getint('scheduler',
                                           'scheduler_zombie_task_threshold')

        while True:
            # Monitor the task to see if it's done
            return_code = self.task_runner.return_code()
            if return_code is not None:
                self.log.info("Task exited with return code %s", return_code)
                return

            # Periodically heartbeat so that the scheduler doesn't think this
            # is a zombie
            try:
                self.heartbeat()
                last_heartbeat_time = time.time()
            except OperationalError:
                Stats.incr('local_task_job_heartbeat_failure', 1, 1)
                self.log.exception(
                    "Exception while trying to heartbeat! Sleeping for %s seconds",
                    self.heartrate)
                time.sleep(self.heartrate)

            # If it's been too long since we've heartbeat, then it's possible that
            # the scheduler rescheduled this task, so kill launched processes.
            time_since_last_heartbeat = time.time() - last_heartbeat_time
            if time_since_last_heartbeat > heartbeat_time_limit:
                Stats.incr('local_task_job_prolonged_heartbeat_failure', 1, 1)
                self.log.error("Heartbeat time limit exceeded!")
                raise AirflowException("Time since last heartbeat ({:.2f}s) "
                                       "exceeded limit ({}s)."
                                       .format(time_since_last_heartbeat,
                                               heartbeat_time_limit))
    finally:
        self.on_kill()

def collect_dags(self,
                 dag_folder=None,
                 only_if_updated=True,
                 include_examples=conf.getboolean('core', 'LOAD_EXAMPLES'),
                 safe_mode=conf.getboolean('core', 'DAG_DISCOVERY_SAFE_MODE')):
    """
    Given a file path or a folder, this method looks for python modules,
    imports them and adds them to the dagbag collection.

    Note that if a ``.airflowignore`` file is found while processing
    the directory, it will behave much like a ``.gitignore``,
    ignoring files that match any of the regex patterns specified
    in the file.

    **Note**: The patterns in .airflowignore are treated as
    un-anchored regexes, not shell-like glob patterns.
    """
    if self.store_serialized_dags:
        return

    self.log.info("Filling up the DagBag from %s", dag_folder)
    dag_folder = dag_folder or self.dag_folder
    # Used to store stats around DagBag processing
    stats = []
    FileLoadStat = namedtuple(
        'FileLoadStat', "file duration dag_num task_num dags")

    dag_folder = correct_maybe_zipped(dag_folder)
    dags_by_name = {}

    for filepath in list_py_file_paths(dag_folder, safe_mode=safe_mode,
                                       include_examples=include_examples):
        try:
            ts = timezone.utcnow()
            found_dags = self.process_file(
                filepath, only_if_updated=only_if_updated,
                safe_mode=safe_mode)
            dag_ids = [dag.dag_id for dag in found_dags]
            dag_id_names = str(dag_ids)

            td = timezone.utcnow() - ts
            td = td.total_seconds() + (float(td.microseconds) / 1000000)
            dags_by_name[dag_id_names] = dag_ids
            stats.append(FileLoadStat(
                filepath.replace(settings.DAGS_FOLDER, ''),
                td,
                len(found_dags),
                sum([len(dag.tasks) for dag in found_dags]),
                dag_id_names,
            ))
        except Exception as e:
            self.log.exception(e)

    self.dagbag_stats = sorted(
        stats, key=lambda x: x.duration, reverse=True)
    for file_stat in self.dagbag_stats:
        dag_ids = dags_by_name[file_stat.dags]
        if file_stat.dag_num >= 1:
            # if we found multiple dags per file, the stat is 'dag_id1 _ dag_id2'
            dag_names = '_'.join(dag_ids)
            Stats.timing('dag.loading-duration.{}'.format(dag_names),
                         file_stat.duration)

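# For example (hypothetical dag ids): a file defining dags 'etl_a' and 'etl_b'
# would have its parse time emitted under a single combined metric name:
dag_ids = ['etl_a', 'etl_b']
metric = 'dag.loading-duration.{}'.format('_'.join(dag_ids))
assert metric == 'dag.loading-duration.etl_a_etl_b'
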
def _execute(self):
    session = settings.Session()
    TI = models.TaskInstance

    pessimistic_connection_handling()

    logging.basicConfig(level=logging.DEBUG)
    self.logger.info("Starting the scheduler")

    dagbag = models.DagBag(self.subdir, sync_to_db=True)
    executor = self.executor = dagbag.executor
    executor.start()

    # grab orphaned tasks and make sure to reset their state
    active_runs = DagRun.find(
        state=State.RUNNING,
        external_trigger=False,
        session=session
    )
    for dr in active_runs:
        self._reset_state_for_orphaned_tasks(dr, session=session)

    self.runs = 0
    while not self.num_runs or self.num_runs > self.runs:
        try:
            loop_start_dttm = datetime.now()
            try:
                self.prioritize_queued(executor=executor, dagbag=dagbag)
            except Exception as e:
                self.logger.exception(e)

            self.runs += 1
            try:
                if self.runs % self.refresh_dags_every == 0:
                    dagbag = models.DagBag(self.subdir, sync_to_db=True)
                else:
                    dagbag.collect_dags(only_if_updated=True)
            except Exception as e:
                self.logger.error("Failed at reloading the dagbag. {}".format(e))
                Stats.incr('dag_refresh_error', 1, 1)
                sleep(5)

            if len(self.dag_ids) > 0:
                dags = [dag for dag in dagbag.dags.values()
                        if dag.dag_id in self.dag_ids]
            else:
                dags = [dag for dag in dagbag.dags.values()
                        if not dag.parent_dag]

            paused_dag_ids = dagbag.paused_dags()
            dags = [x for x in dags if x.dag_id not in paused_dag_ids]
            # dags = filter(lambda x: x.dag_id not in paused_dag_ids, dags)

            self.logger.debug("Total Cores: {} Max Threads: {} DAGs: {}".format(
                multiprocessing.cpu_count(), self.max_threads, len(dags)))
            dags = self._split(dags, math.ceil(len(dags) / self.max_threads))
            tis_q = multiprocessing.Queue()
            jobs = [
                multiprocessing.Process(target=self._do_dags,
                                        args=(dagbag, dags[i], tis_q))
                for i in range(len(dags))
            ]

            self.logger.info("Starting {} scheduler jobs".format(len(jobs)))
            for j in jobs:
                j.start()

            while any(j.is_alive() for j in jobs):
                while not tis_q.empty():
                    ti_key, pickle_id = tis_q.get()
                    dag = dagbag.dags[ti_key[0]]
                    task = dag.get_task(ti_key[1])
                    ti = TI(task, ti_key[2])
                    ti.refresh_from_db(session=session, lock_for_update=True)
                    if ti.state == State.SCHEDULED:
                        session.commit()
                        self.logger.debug(
                            "Task {} was picked up by another scheduler".format(ti))
                        continue
                    elif ti.state is State.NONE:
                        ti.state = State.SCHEDULED
                    self.executor.queue_task_instance(ti, pickle_id=pickle_id)
                    session.merge(ti)
                    session.commit()

            for j in jobs:
                j.join()

            self.logger.info("Done queuing tasks, calling the executor's "
                             "heartbeat")
            duration_sec = (datetime.now() - loop_start_dttm).total_seconds()
            self.logger.info("Loop took: {} seconds".format(duration_sec))
            Stats.timing("scheduler_loop", duration_sec * 1000)
            try:
                self.import_errors(dagbag)
            except Exception as e:
                self.logger.exception(e)
            try:
                dagbag.kill_zombies()
            except Exception as e:
                self.logger.exception(e)
            try:
                # We really just want the scheduler to never ever stop.
                executor.heartbeat()
                self.heartbeat()
            except Exception as e:
                self.logger.exception(e)
                self.logger.error("Tachycardia!")
        except Exception as deep_e:
            self.logger.exception(deep_e)
            raise
        finally:
            settings.Session.remove()
    executor.end()
    session.close()

def update_state(self, session=None):
    """
    Determines the overall state of the DagRun based on the state
    of its TaskInstances.

    :return: State
    """
    dag = self.get_dag()

    tis = self.get_task_instances(session=session)
    self.log.debug("Updating state for %s considering %s task(s)", self, len(tis))

    for ti in list(tis):
        # skip in db?
        if ti.state == State.REMOVED:
            tis.remove(ti)
        else:
            ti.task = dag.get_task(ti.task_id)

    # pre-calculate
    # db is faster
    start_dttm = timezone.utcnow()
    unfinished_tasks = self.get_task_instances(
        state=State.unfinished(),
        session=session
    )
    none_depends_on_past = all(not t.task.depends_on_past for t in unfinished_tasks)
    none_task_concurrency = all(t.task.task_concurrency is None
                                for t in unfinished_tasks)
    # small speed up
    if unfinished_tasks and none_depends_on_past and none_task_concurrency:
        # todo: this can actually get pretty slow: one task costs between 0.01-0.15s
        no_dependencies_met = True
        for ut in unfinished_tasks:
            # We need to flag upstream and check for changes because upstream
            # failures/re-schedules can result in deadlock false positives
            old_state = ut.state
            deps_met = ut.are_dependencies_met(
                dep_context=DepContext(
                    flag_upstream_failed=True,
                    ignore_in_retry_period=True,
                    ignore_in_reschedule_period=True),
                session=session)
            if deps_met or old_state != ut.current_state(session=session):
                no_dependencies_met = False
                break

    duration = (timezone.utcnow() - start_dttm).total_seconds() * 1000
    Stats.timing("dagrun.dependency-check.{}".format(self.dag_id), duration)

    root_ids = [t.task_id for t in dag.roots]
    roots = [t for t in tis if t.task_id in root_ids]

    # if all roots finished and at least one failed, the run failed
    if (not unfinished_tasks and
            any(r.state in (State.FAILED, State.UPSTREAM_FAILED) for r in roots)):
        self.log.info('Marking run %s failed', self)
        self.set_state(State.FAILED)
        dag.handle_callback(self, success=False, reason='task_failure',
                            session=session)

    # if all roots succeeded and no unfinished tasks, the run succeeded
    elif not unfinished_tasks and all(r.state in (State.SUCCESS, State.SKIPPED)
                                      for r in roots):
        self.log.info('Marking run %s successful', self)
        self.set_state(State.SUCCESS)
        dag.handle_callback(self, success=True, reason='success', session=session)

    # if *all tasks* are deadlocked, the run failed
    elif (unfinished_tasks and none_depends_on_past and
          none_task_concurrency and no_dependencies_met):
        self.log.info('Deadlock; marking run %s failed', self)
        self.set_state(State.FAILED)
        dag.handle_callback(self, success=False, reason='all_tasks_deadlocked',
                            session=session)

    # finally, if the roots aren't done, the dag is still running
    else:
        self.set_state(State.RUNNING)

    self._emit_duration_stats_for_finished_state()
    # todo: determine whether we want to use with_for_update to make sure to lock the run
    session.merge(self)
    session.commit()

    return self.state