from collections import OrderedDict

from twitter.common.collections import OrderedSet


def execution_order(phases):
  """
    Yields goals in execution order for the given phases.  Does not account for goals run
    multiple times due to grouping.
  """
  dependencies_by_goal = OrderedDict()

  def populate_dependencies(phases):
    for phase in phases:
      for goal in phase.goals():
        if goal not in dependencies_by_goal:
          populate_dependencies(goal.dependencies)
          deps = OrderedSet()
          for phasedep in goal.dependencies:
            deps.update(phasedep.goals())
          dependencies_by_goal[goal] = deps

  populate_dependencies(phases)

  while dependencies_by_goal:
    # Iterate over a snapshot: ready goals are popped from dependencies_by_goal below,
    # and mutating a dict during a live .items() iteration raises RuntimeError on Python 3.
    for goal, deps in list(dependencies_by_goal.items()):
      if not deps:
        dependencies_by_goal.pop(goal)
        # This goal has no outstanding dependencies: remove it from every other
        # goal's dependency set, then yield it.
        for other_deps in dependencies_by_goal.values():
          other_deps.discard(goal)
        yield goal
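# A minimal usage sketch for execution_order(), using hypothetical FakePhase/FakeGoal
# stand-ins (not part of this codebase): the function only relies on phases exposing
# goals() and goals exposing .dependencies (a list of phases).
def _execution_order_example():
  class FakeGoal(object):
    def __init__(self, name, dependencies=()):
      self.name = name
      self.dependencies = list(dependencies)  # phases this goal depends on

    def __repr__(self):
      return self.name

  class FakePhase(object):
    def __init__(self, goals):
      self._goals = list(goals)

    def goals(self):
      return self._goals

  compile_goal = FakeGoal('compile')
  compile_phase = FakePhase([compile_goal])
  test_goal = FakeGoal('test', dependencies=[compile_phase])
  test_phase = FakePhase([test_goal])

  # Dependencies are yielded before their dependents: [compile, test]
  return list(execution_order([test_phase]))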
class ThermosGCExecutor(ExecutorBase, ExceptionalThread, Observable):
  """
  Thermos GC Executor, responsible for:
    - garbage collecting old tasks to make sure they don't clutter up the system
    - state reconciliation with the scheduler (in case it thinks we're running
      something we're not or vice versa.)
  """
  MAX_PID_TIME_DRIFT = Amount(10, Time.SECONDS)
  MAX_CHECKPOINT_TIME_DRIFT = Amount(1, Time.HOURS)  # maximum runner disconnection time

  # how old a task must be before we're willing to kill it, assuming that there could be
  # slight races in the following scenario:
  #    launch gc with retained_tasks={t1, t2, t3}
  #    launch task t4
  MINIMUM_KILL_AGE = Amount(10, Time.MINUTES)

  # wait time between checking for new GC events from the slave and/or cleaning orphaned tasks
  POLL_WAIT = Amount(5, Time.MINUTES)

  # maximum amount of time the executor will wait with no tasks before it exits.
  MAXIMUM_EXECUTOR_WAIT = Amount(15, Time.MINUTES)

  # maximum lifetime of this executor.  this is to prevent older GC executor binaries from
  # running forever
  MAXIMUM_EXECUTOR_LIFETIME = Amount(1, Time.DAYS)

  PERSISTENCE_WAIT = Amount(5, Time.SECONDS)

  def __init__(self,
               checkpoint_root,
               verbose=True,
               task_killer=TaskKiller,
               executor_detector=ExecutorDetector,
               task_garbage_collector=TaskGarbageCollector,
               clock=time):
    ExecutorBase.__init__(self)
    ExceptionalThread.__init__(self)
    self.daemon = True
    self._stop_event = threading.Event()
    # mapping of task_id => (TaskInfo, AdjustRetainedTasks), in the order in
    # which they were received via a launchTask.
    self._gc_task_queue = OrderedDict()
    # cache the ExecutorDriver provided by the slave, so we can use it out
    # of band from slave-initiated callbacks.  This should be supplied by
    # ExecutorBase.registered() when the executor first registers with the
    # slave.
    self._driver = None
    self._slave_id = None  # cache the slave ID provided by the slave
    self._task_id = None  # the task_id currently being executed by the ThermosGCExecutor, if any
    self._start_time = None  # the start time of a task currently being executed, if any
    self._detector = executor_detector()
    self._collector = task_garbage_collector(root=checkpoint_root)
    self._clock = clock
    self._task_killer = task_killer
    self._checkpoint_root = checkpoint_root
    self._dropped_tasks = AtomicGauge('dropped_tasks')
    self.metrics.register(self._dropped_tasks)

  def _runner_ckpt(self, task_id):
    """Return the runner checkpoint file for a given task_id."""
    return TaskPath(root=self._checkpoint_root, task_id=task_id).getpath('runner_checkpoint')

  def _terminate_task(self, task_id, kill=True):
    """Terminate a task using the associated task killer. Returns a boolean indicating success."""
    killer = self._task_killer(task_id, self._checkpoint_root)
    self.log('Terminating %s...' % task_id)
    runner_terminate = killer.kill if kill else killer.lose
    try:
      runner_terminate(force=True)
      return True
    except Exception as e:
      self.log('Could not terminate: %s' % e)
      return False

  def partition_tasks(self):
    """Return active/finished tasks as discovered from the checkpoint root."""
    detector = TaskDetector(root=self._checkpoint_root)
    active_tasks = set(t_id for _, t_id in detector.get_task_ids(state='active'))
    finished_tasks = set(t_id for _, t_id in detector.get_task_ids(state='finished'))
    return active_tasks, finished_tasks

  def get_states(self, task_id):
    """Returns the (timestamp, status) tuples of the task, or [] if the stream could
       not be replayed."""
    statuses = CheckpointDispatcher.iter_statuses(self._runner_ckpt(task_id))
    try:
      return [(state.timestamp_ms / 1000.0, state.state) for state in statuses]
    except CheckpointDispatcher.ErrorRecoveringState:
      return []

  def get_sandbox(self, task_id):
    """Returns the sandbox of the task, or None if it has not yet been initialized."""
    try:
      for update in CheckpointDispatcher.iter_updates(self._runner_ckpt(task_id)):
        if update.runner_header and update.runner_header.sandbox:
          return update.runner_header.sandbox
    except CheckpointDispatcher.ErrorRecoveringState:
      return None

  def maybe_terminate_unknown_task(self, task_id):
    """Terminate a task if we believe the scheduler doesn't know about it.

       It's possible for the scheduler to queue a GC and launch a task afterwards, in
       which case we may see actively running tasks that the scheduler did not report
       in the AdjustRetainedTasks message.

       Returns:
         boolean indicating whether the task was terminated
    """
    states = self.get_states(task_id)
    if states:
      task_start_time, _ = states[0]
      if self._start_time - task_start_time > self.MINIMUM_KILL_AGE.as_(Time.SECONDS):
        return self._terminate_task(task_id)
    return False

  def should_gc_task(self, task_id):
    """Check if a possibly-corrupt task should be locally GCed.

       A task should be GCed if its checkpoint stream appears to be corrupted and the
       kill age threshold is exceeded.

       Returns:
         set, containing the task_id if it should be marked for local GC, or empty otherwise
    """
    runner_ckpt = self._runner_ckpt(task_id)
    if not os.path.exists(runner_ckpt):
      return set()
    latest_update = os.path.getmtime(runner_ckpt)
    if self._start_time - latest_update > self.MINIMUM_KILL_AGE.as_(Time.SECONDS):
      self.log('Got corrupt checkpoint file for %s - marking for local GC' % task_id)
      return set([task_id])
    else:
      self.log('Checkpoint file unreadable, but not yet beyond MINIMUM_KILL_AGE threshold')
      return set()

  def reconcile_states(self, driver, retained_tasks):
    """Reconcile states that the scheduler thinks tasks are in vs what they really are in.

        Local    vs   Scheduler  => Action
       ===================================
        ACTIVE         ACTIVE    => no-op
        ACTIVE         STARTING  => no-op
        ACTIVE         TERMINAL  => maybe kill task*
        ACTIVE         !EXISTS   => maybe kill task*
        TERMINAL       ACTIVE    => send actual status**
        TERMINAL       STARTING  => send actual status**
        TERMINAL       TERMINAL  => no-op
        TERMINAL       !EXISTS   => gc locally
        !EXISTS        ACTIVE    => send LOST**
        !EXISTS        STARTING  => no-op
        !EXISTS        TERMINAL  => gc remotely

       * - Only kill if this does not appear to be a race condition.
       ** - These appear to have no effect

       Side effecting operations:
          ACTIVE   | (TERMINAL / !EXISTS) => maybe kill
          TERMINAL | !EXISTS              => delete
          !EXISTS  | TERMINAL             => delete

       Returns tuple of (local_gc, remote_gc, updates), where:
          local_gc - set of task_ids to be GCed locally
          remote_gc - set of task_ids to be deleted on the scheduler
          updates - dictionary of updates sent to the scheduler (task_id: ScheduleStatus)
    """
    def partition(rt):
      active, starting, finished = set(), set(), set()
      for task_id, schedule_status in rt.items():
        if schedule_status in TERMINAL_STATES:
          finished.add(task_id)
        elif (schedule_status == ScheduleStatus.STARTING or
              schedule_status == ScheduleStatus.ASSIGNED):
          starting.add(task_id)
        else:
          active.add(task_id)
      return active, starting, finished

    local_active, local_finished = self.partition_tasks()
    sched_active, sched_starting, sched_finished = partition(retained_tasks)
    local_task_ids = local_active | local_finished
    sched_task_ids = sched_active | sched_starting | sched_finished
    all_task_ids = local_task_ids | sched_task_ids

    self.log('Told to retain the following task ids:')
    for task_id, schedule_status in retained_tasks.items():
      self.log('  => %s as %s' % (
          task_id, ScheduleStatus._VALUES_TO_NAMES.get(schedule_status, 'UNKNOWN')))

    self.log('Local active tasks:')
    for task_id in local_active:
      self.log('  => %s' % task_id)

    self.log('Local finished tasks:')
    for task_id in local_finished:
      self.log('  => %s' % task_id)

    local_gc, remote_gc = set(), set()
    updates = {}

    for task_id in all_task_ids:
      if task_id in local_active and task_id not in (sched_active | sched_starting):
        self.log('Inspecting task %s for termination.' % task_id)
        if not self.maybe_terminate_unknown_task(task_id):
          local_gc.update(self.should_gc_task(task_id))
      if task_id in local_finished and task_id not in sched_task_ids:
        self.log('Queueing task %s for local deletion.' % task_id)
        local_gc.add(task_id)
      if task_id in local_finished and task_id in (sched_active | sched_starting):
        self.log('Task %s finished but scheduler thinks active/starting.' % task_id)
        states = self.get_states(task_id)
        if states:
          _, last_state = states[-1]
          updates[task_id] = THERMOS_TO_TWITTER_STATES.get(last_state,
                                                           ScheduleStatus.SANDBOX_DELETED)
          self.send_update(
              driver,
              task_id,
              THERMOS_TO_MESOS_STATES.get(last_state, mesos_pb2.TASK_LOST),
              'Task finish detected by GC executor.')
        else:
          local_gc.update(self.should_gc_task(task_id))
      if task_id in sched_finished and task_id not in local_task_ids:
        self.log('Queueing task %s for remote deletion.' % task_id)
        remote_gc.add(task_id)
      if task_id not in local_task_ids and task_id in sched_active:
        self.log('Know nothing about task %s, telling scheduler of LOSS.' % task_id)
        updates[task_id] = ScheduleStatus.LOST
        self.send_update(driver, task_id, mesos_pb2.TASK_LOST,
                         'GC executor found no trace of task.')
      if task_id not in local_task_ids and task_id in sched_starting:
        self.log('Know nothing about task %s, but scheduler says STARTING - passing' % task_id)

    return local_gc, remote_gc, updates

  def clean_orphans(self, driver):
    """Inspect checkpoints for trees that have been kill -9'ed but not properly cleaned up."""
    self.log('Checking for orphaned tasks')
    active_tasks, _ = self.partition_tasks()
    updates = {}

    inspector = CheckpointInspector(self._checkpoint_root)

    def is_our_process(process, uid, timestamp):
      # psutil >= 2.0 exposes uids() and create_time() as methods, not attributes.
      if process.uids().real != uid:
        return False
      estimated_start_time = self._clock.time() - process.create_time()
      return abs(timestamp - estimated_start_time) < self.MAX_PID_TIME_DRIFT.as_(Time.SECONDS)

    for task_id in active_tasks:
      self.log('Inspecting running task: %s' % task_id)
      inspection = inspector.inspect(task_id)
      if not inspection:
        self.log('  - Error inspecting task runner')
        continue
      latest_runner = inspection.runner_processes[-1]
      # Assume that it has not yet started?
      if not latest_runner:
        self.log('  - Task has no registered runners.')
        continue
      runner_pid, runner_uid, timestamp_ms = latest_runner
      try:
        runner_process = psutil.Process(runner_pid)
        if is_our_process(runner_process, runner_uid, timestamp_ms / 1000.0):
          self.log('  - Runner appears healthy.')
          continue
      except psutil.NoSuchProcess:
        # Runner is dead
        pass
      except psutil.Error as err:
        self.log('  - Error sampling runner process [pid=%s]: %s' % (runner_pid, err))
        continue
      try:
        latest_update = os.path.getmtime(self._runner_ckpt(task_id))
      except (IOError, OSError) as err:
        self.log('  - Error accessing runner ckpt: %s' % err)
        continue
      if self._clock.time() - latest_update < self.MAX_CHECKPOINT_TIME_DRIFT.as_(Time.SECONDS):
        self.log('  - Runner is dead but under LOST threshold.')
        continue
      self.log('  - Runner is dead but beyond LOST threshold: %.1fs' % (
          self._clock.time() - latest_update))
      if self._terminate_task(task_id, kill=False):
        updates[task_id] = ScheduleStatus.LOST
        self.send_update(driver, task_id, mesos_pb2.TASK_LOST,
                         'GC executor detected failed task runner.')

    return updates

  def _erase_sandbox(self, task_id):
    # TODO(wickman) Only mesos should be in the business of garbage collecting sandboxes.
    header_sandbox = self.get_sandbox(task_id)
    directory_sandbox = DirectorySandbox(header_sandbox) if header_sandbox else None
    if directory_sandbox and directory_sandbox.exists():
      self.log('Destroying DirectorySandbox for %s' % task_id)
      try:
        directory_sandbox.destroy()
      except DirectorySandbox.Error as e:
        self.log('Failed to destroy DirectorySandbox: %s' % e)
    else:
      self.log('Found no sandboxes for %s' % task_id)

  def _gc(self, task_id):
    """Erase the sandbox, logs and metadata of the given task."""
    self.log('Erasing sandbox for %s' % task_id)
    self._erase_sandbox(task_id)
    self.log('Erasing logs for %s' % task_id)
    self._collector.erase_logs(task_id)
    self.log('Erasing metadata for %s' % task_id)
    self._collector.erase_metadata(task_id)

  def garbage_collect(self, force_delete=frozenset()):
    """Garbage collect tasks on the system no longer active or in the supplied force_delete.

       Returns a set of task_ids representing the tasks that were garbage collected.
    """
""" active_tasks, finished_tasks = self.partition_tasks() retained_executors = set(iter(self.linked_executors)) self.log('Executor sandboxes retained by Mesos:') if retained_executors: for r_e in sorted(retained_executors): self.log(' %s' % r_e) else: self.log(' None') for task_id in (active_tasks - retained_executors): self.log('ERROR: Active task %s had its executor sandbox pulled.' % task_id) gc_tasks = (finished_tasks - retained_executors) | force_delete for gc_task in gc_tasks: self._gc(gc_task) return gc_tasks @property def linked_executors(self): """Generator yielding the executor sandboxes detected on the system.""" thermos_executor_prefix = 'thermos-' for executor in self._detector: # It's possible for just the 'latest' symlink to be present but no run directories. # This indicates that the task has been fully garbage collected. if executor.executor_id.startswith(thermos_executor_prefix) and executor.run != 'latest': yield executor.executor_id[len(thermos_executor_prefix):] def _run_gc(self, task, retain_tasks, retain_start): """ Reconcile the set of tasks to retain (provided by the scheduler) with the current state of executors on this system. Garbage collect tasks/executors as appropriate. Not re-entrant! Previous executions must complete (and clear self._task_id) before this can be invoked. Potentially blocking (e.g. on I/O) in self.garbage_collect() Args: task: TaskInfo provided by the slave retain_tasks: mapping of task_id => ScheduleStatus, describing what the scheduler thinks is running on this system retain_start: the time at which the retain_tasks message is effective -- this means that tasks started after the retain_tasks message is effective are skipped until future GC runs. """ task_id = task.task_id.value if self._task_id is not None: raise RuntimeError('_run_gc() called [task_id=%s], but already running [task_id=%s]' % (task_id, self._task_id)) self._task_id = task_id self.log('Launching garbage collection [task_id=%s]' % task_id) self._start_time = retain_start local_gc, remote_gc, _ = self.reconcile_states(self._driver, retain_tasks) deleted_tasks = set(retain_tasks).intersection(self.garbage_collect(local_gc)) | remote_gc if deleted_tasks: self._driver.sendFrameworkMessage(thrift_serialize( SchedulerMessage(deletedTasks=DeletedTasks(taskIds=deleted_tasks)))) self.send_update( self._driver, task.task_id.value, mesos_pb2.TASK_FINISHED, 'Garbage collection finished.') self.log('Garbage collection complete [task_id=%s]' % task_id) self._task_id = self._start_time = None def run(self): """Main GC executor event loop. Periodically perform state reconciliation with the set of tasks provided by the slave, and garbage collect orphaned tasks on the system. 
""" run_start = self._clock.time() last_gc_run = self._clock.time() def should_terminate(): now = self._clock.time() if now > run_start + self.MAXIMUM_EXECUTOR_LIFETIME.as_(Time.SECONDS): return True if now > last_gc_run + self.MAXIMUM_EXECUTOR_WAIT.as_(Time.SECONDS): return True return self._stop_event.is_set() while not should_terminate(): try: _, (task, retain_tasks, retain_start) = self._gc_task_queue.popitem(0) last_gc_run = retain_start self._run_gc(task, retain_tasks, retain_start) except KeyError: # no enqueued GC tasks pass if self._driver is not None: self.clean_orphans(self._driver) self._stop_event.wait(self.POLL_WAIT.as_(Time.SECONDS)) # shutdown if self._driver is not None: try: prev_task_id, _ = self._gc_task_queue.popitem(0) except KeyError: # no enqueued GC tasks pass else: self.send_update(self._driver, prev_task_id, mesos_pb2.TASK_FINISHED, 'Garbage collection skipped - GC executor shutting down') # TODO(jon) Remove this once external MESOS-243 is resolved. self.log('Sleeping briefly to mitigate https://issues.apache.org/jira/browse/MESOS-243') self._clock.sleep(self.PERSISTENCE_WAIT.as_(Time.SECONDS)) self._driver.stop() """ Mesos Executor API methods follow """ def launchTask(self, driver, task): """Queue a new garbage collection run, and drop any currently-enqueued runs.""" if self._slave_id is None: self._slave_id = task.slave_id.value task_id = task.task_id.value self.log('launchTask() got task_id: %s' % task_id) if task_id == self._task_id: self.log('=> GC with task_id %s currently running - ignoring' % task_id) return elif task_id in self._gc_task_queue: self.log('=> Already have task_id %s queued - ignoring' % task_id) return try: art = thrift_deserialize(AdjustRetainedTasks(), task.data) except Exception as err: self.log('Error deserializing task: %s' % err) self.send_update( self._driver, task_id, mesos_pb2.TASK_FAILED, 'Deserialization of GC task failed') return try: prev_task_id, _ = self._gc_task_queue.popitem(0) except KeyError: # no enqueued GC tasks - reset counter self._dropped_tasks.write(0) else: self.log('=> Dropping previously queued GC with task_id %s' % prev_task_id) self._dropped_tasks.increment() self.log('=> Updating scheduler') self.send_update(self._driver, prev_task_id, mesos_pb2.TASK_FINISHED, 'Garbage collection skipped - GC executor received another task') self.log('=> Adding %s to GC queue' % task_id) self._gc_task_queue[task_id] = (task, art.retainedTasks, self._clock.time()) def killTask(self, driver, task_id): """Remove the specified task from the queue, if it's not yet run. Otherwise, no-op.""" self.log('killTask() got task_id: %s' % task_id) task = self._gc_task_queue.pop(task_id, None) if task is not None: self.log('=> Removed %s from queued GC tasks' % task_id) elif task_id == self._task_id: self.log('=> GC with task_id %s currently running - ignoring' % task_id) else: self.log('=> Unknown task_id %s - ignoring' % task_id) def shutdown(self, driver): """Trigger the Executor to shut down as soon as the current GC run is finished.""" self.log('shutdown() called - setting stop event') self._stop_event.set()