def tail(args, options):
  """Tail the logs of a task process.

  Usage: thermos tail task_name [process_name]

  args[0] is the task id to tail; args[1] optionally names a process within
  the task. Exactly one process must match or the command exits with status 1.
  options.root is the checkpoint root; options.use_stderr selects stderr over
  stdout.
  """
  # Argument validation: one task id, optionally one process name.
  if len(args) == 0:
    app.error("Expected a task to tail, got nothing!")
  if len(args) not in (1, 2):
    app.error("Expected at most two arguments (task and optional process), got %d" % len(args))

  task_id = args[0]
  detector = TaskDetector(root=options.root)
  # The checkpoint header records the log directory for the task's processes.
  checkpoint = CheckpointDispatcher.from_file(detector.get_checkpoint(task_id))
  log_dir = checkpoint.header.log_dir
  process_runs = [(process, run) for (process, run) in detector.get_process_runs(task_id, log_dir)]
  if len(args) == 2:
    # Narrow to the explicitly requested process name.
    process_runs = [(process, run) for (process, run) in process_runs if process == args[1]]
  if len(process_runs) == 0:
    print("ERROR: No processes found.", file=sys.stderr)
    sys.exit(1)
  processes = set([process for process, _ in process_runs])
  if len(processes) != 1:
    # Ambiguous: the task has multiple processes and none was selected.
    print("ERROR: More than one process matches query.", file=sys.stderr)
    sys.exit(1)
  process = processes.pop()
  # Tail the most recent run of the matched process.
  run = max([run for _, run in process_runs])
  logdir = TaskPath(root=options.root, task_id=args[0], process=process, run=run,
                    log_dir=log_dir).getpath("process_logdir")
  logfile = os.path.join(logdir, "stderr" if options.use_stderr else "stdout")

  monitor = TaskMonitor(TaskPath(root=options.root), args[0])

  def log_is_active():
    # True iff the exact (process, run) pair being tailed is still active.
    active_processes = monitor.get_active_processes()
    for process_status, process_run in active_processes:
      if process_status.process == process and process_run == run:
        return True
    return False

  if not log_is_active():
    # Terminal log: dump the whole file once and return.
    print("Tail of terminal log %s" % logfile)
    for line in tail_closed(logfile):
      print(line.rstrip())
    return

  # Active log: follow the file, rechecking liveness at most every 5 seconds
  # so tailing stops shortly after the process terminates.
  now = time.time()
  next_check = now + 5.0
  print("Tail of active log %s" % logfile)
  for line in tail_f(logfile, include_last=True, forever=False):
    print(line.rstrip())
    if time.time() > next_check:
      if not log_is_active():
        break
      else:
        next_check = time.time() + 5.0
def partition_tasks(self):
  """Discover task ids under the checkpoint root, split by lifecycle state.

  Returns:
    A (active, finished) pair of sets of task id strings.
  """
  detector = TaskDetector(root=self._checkpoint_root)

  def ids_in(state):
    # get_task_ids yields (state, task_id) tuples; keep only the id.
    return {task_id for _, task_id in detector.get_task_ids(state=state)}

  return ids_in('active'), ids_in('finished')
def tail(args, options):
  """Tail the logs of a task process.

  Usage: thermos tail task_name [process_name]

  args[0] is the task id; args[1] optionally selects a process within the
  task. Exits with status 1 unless exactly one process matches. options.root
  is the checkpoint root and options.use_stderr picks stderr over stdout.
  """
  # Validate arity: one task id, plus at most one process name.
  if len(args) == 0:
    app.error('Expected a task to tail, got nothing!')
  if len(args) not in (1, 2):
    app.error('Expected at most two arguments (task and optional process), got %d' % len(args))

  task_id = args[0]
  detector = TaskDetector(root=options.root)
  # The checkpoint header tells us where process logs are rooted.
  checkpoint = CheckpointDispatcher.from_file(detector.get_checkpoint(task_id))
  log_dir = checkpoint.header.log_dir
  process_runs = [(process, run) for (process, run) in detector.get_process_runs(task_id, log_dir)]
  if len(args) == 2:
    # Keep only runs of the requested process.
    process_runs = [(process, run) for (process, run) in process_runs if process == args[1]]
  if len(process_runs) == 0:
    print('ERROR: No processes found.', file=sys.stderr)
    sys.exit(1)
  processes = set([process for process, _ in process_runs])
  if len(processes) != 1:
    # More than one distinct process matched; caller must disambiguate.
    print('ERROR: More than one process matches query.', file=sys.stderr)
    sys.exit(1)
  process = processes.pop()
  # Latest run wins.
  run = max([run for _, run in process_runs])
  logdir = TaskPath(root=options.root, task_id=args[0], process=process, run=run,
                    log_dir=log_dir).getpath('process_logdir')
  logfile = os.path.join(logdir, 'stderr' if options.use_stderr else 'stdout')

  monitor = TaskMonitor(TaskPath(root=options.root), args[0])

  def log_is_active():
    # True iff the tailed (process, run) pair is still among active processes.
    active_processes = monitor.get_active_processes()
    for process_status, process_run in active_processes:
      if process_status.process == process and process_run == run:
        return True
    return False

  if not log_is_active():
    # The run has terminated: print the file once, then stop.
    print('Tail of terminal log %s' % logfile)
    for line in tail_closed(logfile):
      print(line.rstrip())
    return

  # Follow the live log; re-verify liveness at most every 5 seconds so the
  # loop exits soon after the process finishes.
  now = time.time()
  next_check = now + 5.0
  print('Tail of active log %s' % logfile)
  for line in tail_f(logfile, include_last=True, forever=False):
    print(line.rstrip())
    if time.time() > next_check:
      if not log_is_active():
        break
      else:
        next_check = time.time() + 5.0
def __init__(self, root, resource_monitor_class=TaskResourceMonitor):
  """Initialize the observer over the checkpoint root `root`.

  :param root: checkpoint root directory to watch.
  :param resource_monitor_class: ResourceMonitorBase subclass instantiated per
      active task to sample its resource usage.
  :raises ValueError: if resource_monitor_class is not a ResourceMonitorBase.
  """
  self._pathspec = TaskPath(root=root)
  self._detector = TaskDetector(root)
  # Fail fast on a monitor class that does not honor the expected interface.
  if not issubclass(resource_monitor_class, ResourceMonitorBase):
    raise ValueError("resource monitor class must implement ResourceMonitorBase!")
  self._resource_monitor = resource_monitor_class
  # task_id => ActiveObservedTask / FinishedObservedTask respectively.
  self._active_tasks, self._finished_tasks = {}, {}
  self._stop_event = threading.Event()
  ExceptionalThread.__init__(self)
  Lockable.__init__(self)
  # Daemonize so the observer thread never blocks interpreter shutdown.
  self.daemon = True
def partition_tasks(self):
  """Discover tasks across every checkpoint root known to the path detector.

  Returns:
    A (active, finished) pair of sets of RootedTask instances.
  """
  active, finished = set(), set()
  for root in self._path_detector.get_paths():
    detector = TaskDetector(root=root)
    # Fill each bucket from the detector's (state, task_id) stream.
    for state, bucket in (('active', active), ('finished', finished)):
      bucket.update(
          RootedTask(root, task_id)
          for _, task_id in detector.get_task_ids(state=state))
  return active, finished
def tasks_from_re(expressions, root, state=None):
  """Return the set of task ids under `root` matching any of `expressions`.

  :param expressions: iterable of regex pattern strings, applied with
      re.match semantics (anchored at the start of the task id).
  :param root: checkpoint root to scan.
  :param state: optional state filter forwarded to the TaskDetector.
  """
  detector = TaskDetector(root=root)
  task_ids = [task_id for _, task_id in detector.get_task_ids(state=state)]
  patterns = [re.compile(expression) for expression in expressions]
  return set(
      task_id for task_id in task_ids
      if any(pattern.match(task_id) for pattern in patterns))
def _refresh_detectors(self):
  """Synchronize the detector map with the current set of checkpoint paths.

  Detectors for vanished paths are dropped; newly appeared paths get a fresh
  TaskDetector. Detectors for unchanged paths are left untouched.
  """
  current = set(self._path_detector.get_paths())
  known = set(self._detectors)
  # Evict stale entries first, then create detectors for unseen paths.
  for stale in known - current:
    del self._detectors[stale]
  for fresh in current - known:
    self._detectors[fresh] = TaskDetector(root=fresh)
def tasks_from_re(expressions, state=None):
  """Match task ids across all checkpoint roots against the given regexes.

  :param expressions: iterable of regex pattern strings; matching uses
      re.match semantics (anchored at the start of the task id).
  :param state: optional state filter (e.g. 'active'/'finished') forwarded to
      each TaskDetector.
  :returns: set of (checkpoint_root, task_id) tuples for every task id that
      matches at least one expression.
  """
  # Compile once, up front. The previous version re-ran map(re.compile, ...)
  # inside the per-root loop, re-consuming `expressions` each iteration —
  # wasted work for lists, and silently zero matches for roots after the
  # first whenever `expressions` was a one-shot iterator.
  patterns = [re.compile(expression) for expression in expressions]
  path_detector = get_path_detector()
  matched_tasks = set()
  for root in path_detector.get_paths():
    for _, task_id in TaskDetector(root).get_task_ids(state=state):
      if any(pattern.match(task_id) for pattern in patterns):
        matched_tasks.add((root, task_id))
  return matched_tasks
class TaskObserver(ExceptionalThread, Lockable):
  """The TaskObserver monitors the thermos checkpoint root for active/finished tasks.

  It is used to be the oracle of the state of all thermos tasks on a machine.

  It currently returns JSON, but really should just return objects. We should
  then build an object->json translator.
  """

  class UnexpectedError(Exception): pass
  class UnexpectedState(Exception): pass

  # How often the background thread re-scans the checkpoint root.
  POLLING_INTERVAL = Amount(1, Time.SECONDS)

  def __init__(self, root, resource_monitor_class=TaskResourceMonitor):
    """Create an observer over checkpoint root `root`.

    :param root: checkpoint root directory to watch.
    :param resource_monitor_class: ResourceMonitorBase subclass instantiated
        per active task to sample resource usage.
    :raises ValueError: if resource_monitor_class is not a ResourceMonitorBase.
    """
    self._pathspec = TaskPath(root=root)
    self._detector = TaskDetector(root)
    if not issubclass(resource_monitor_class, ResourceMonitorBase):
      raise ValueError("resource monitor class must implement ResourceMonitorBase!")
    self._resource_monitor = resource_monitor_class
    self._active_tasks = {}    # task_id => ActiveObservedTask
    self._finished_tasks = {}  # task_id => FinishedObservedTask
    self._stop_event = threading.Event()
    ExceptionalThread.__init__(self)
    Lockable.__init__(self)
    # Daemon thread: never blocks interpreter shutdown.
    self.daemon = True

  @property
  def active_tasks(self):
    """Return a dictionary of active Tasks"""
    return self._active_tasks

  @property
  def finished_tasks(self):
    """Return a dictionary of finished Tasks"""
    return self._finished_tasks

  @property
  def all_tasks(self):
    """Return a dictionary of all Tasks known by the TaskObserver"""
    # NOTE(review): concatenating dict.items() with `+` only works on
    # Python 2 (list-returning items()); under Python 3 this raises
    # TypeError — confirm this module targets Python 2.
    return dict(self.active_tasks.items() + self.finished_tasks.items())

  def stop(self):
    # Signal the run() loop to exit after its current polling cycle.
    self._stop_event.set()

  def start(self):
    ExceptionalThread.start(self)

  @Lockable.sync
  def add_active_task(self, task_id):
    """Begin observing an active task: start a resource monitor for it."""
    if task_id in self.finished_tasks:
      log.error('Found an active task (%s) in finished tasks?' % task_id)
      return
    task_monitor = TaskMonitor(self._pathspec, task_id)
    # A missing header means the checkpoint stream could not be read yet.
    if not task_monitor.get_state().header:
      log.info('Unable to load task "%s"' % task_id)
      return
    sandbox = task_monitor.get_state().header.sandbox
    resource_monitor = self._resource_monitor(task_monitor, sandbox)
    resource_monitor.start()
    self._active_tasks[task_id] = ActiveObservedTask(
      task_id=task_id, pathspec=self._pathspec,
      task_monitor=task_monitor, resource_monitor=resource_monitor
    )

  @Lockable.sync
  def add_finished_task(self, task_id):
    """Record a finished task (no resource monitoring needed)."""
    self._finished_tasks[task_id] = FinishedObservedTask(
      task_id=task_id, pathspec=self._pathspec
    )

  @Lockable.sync
  def active_to_finished(self, task_id):
    """Transition a task from the active map to the finished map."""
    self.remove_active_task(task_id)
    self.add_finished_task(task_id)

  @Lockable.sync
  def remove_active_task(self, task_id):
    # Killing the resource monitor stops its sampling thread.
    task = self.active_tasks.pop(task_id)
    task.resource_monitor.kill()

  @Lockable.sync
  def remove_finished_task(self, task_id):
    self.finished_tasks.pop(task_id)

  def run(self):
    """
    The internal thread for the observer. This periodically polls the
    checkpoint root for new tasks, or transitions of tasks from active to
    finished state.
    """
    while not self._stop_event.is_set():
      time.sleep(self.POLLING_INTERVAL.as_(Time.SECONDS))
      active_tasks = [task_id for _, task_id in self._detector.get_task_ids(state='active')]
      finished_tasks = [task_id for _, task_id in self._detector.get_task_ids(state='finished')]
      with self.lock:
        # Ensure all tasks currently detected on the system are observed appropriately
        for active in active_tasks:
          if active not in self.active_tasks:
            log.debug('task_id %s (unknown) -> active' % active)
            self.add_active_task(active)
        for finished in finished_tasks:
          if finished in self.active_tasks:
            log.debug('task_id %s active -> finished' % finished)
            self.active_to_finished(finished)
          elif finished not in self.finished_tasks:
            log.debug('task_id %s (unknown) -> finished' % finished)
            self.add_finished_task(finished)
        # Remove ObservedTasks for tasks no longer detected on the system
        for unknown in set(self.active_tasks) - set(active_tasks + finished_tasks):
          log.debug('task_id %s active -> (unknown)' % unknown)
          self.remove_active_task(unknown)
        for unknown in set(self.finished_tasks) - set(active_tasks + finished_tasks):
          log.debug('task_id %s finished -> (unknown)' % unknown)
          self.remove_finished_task(unknown)

  @Lockable.sync
  def process_from_name(self, task_id, process_id):
    """Return the Process config object named `process_id` within the task,
    or None if the task or process is unknown."""
    if task_id in self.all_tasks:
      task = self.all_tasks[task_id].task
      if task:
        for process in task.processes():
          if process.name().get() == process_id:
            return process

  @Lockable.sync
  def task_count(self):
    """
      Return the count of tasks that could be read properly from disk.
      This may be <= self.task_id_count()
    """
    return dict(
      active=len(self.active_tasks),
      finished=len(self.finished_tasks),
      all=len(self.all_tasks),
    )

  @Lockable.sync
  def task_id_count(self):
    """
      Return the raw count of active and finished task_ids from the
      TaskDetector.
    """
    num_active = len(list(self._detector.get_task_ids(state='active')))
    num_finished = len(list(self._detector.get_task_ids(state='finished')))
    return dict(active=num_active, finished=num_finished, all=num_active + num_finished)

  def _get_tasks_of_type(self, type):
    """Convenience function to return all tasks of a given type"""
    tasks = {
      'active': self.active_tasks,
      'finished': self.finished_tasks,
      'all': self.all_tasks,
    }.get(type, None)
    if tasks is None:
      log.error('Unknown task type %s' % type)
      return {}
    return tasks

  @Lockable.sync
  def state(self, task_id):
    """Return a dict containing mapped information about a task's state"""
    real_state = self.raw_state(task_id)
    if real_state is None or real_state.header is None:
      return {}
    else:
      return dict(
        task_id=real_state.header.task_id,
        launch_time=real_state.header.launch_time_ms / 1000.0,
        sandbox=real_state.header.sandbox,
        hostname=real_state.header.hostname,
        user=real_state.header.user
      )

  @Lockable.sync
  def raw_state(self, task_id):
    """
      Return the current runner state (thrift blob:
      gen.apache.thermos.ttypes.RunnerState) of a given task id
    """
    if task_id not in self.all_tasks:
      return None
    return self.all_tasks[task_id].state

  @Lockable.sync
  def _task_processes(self, task_id):
    """
      Return the processes of a task given its task_id.

      Returns a map from state to processes in that state, where possible
      states are: waiting, running, success, failed.
    """
    if task_id not in self.all_tasks:
      return {}
    state = self.raw_state(task_id)
    if state is None or state.header is None:
      return {}
    # Bucket each process by the state of its most recent run.
    waiting, running, success, failed, killed = [], [], [], [], []
    for process, runs in state.processes.items():
      # No runs ==> nothing started.
      if len(runs) == 0:
        waiting.append(process)
      else:
        if runs[-1].state in (None, ProcessState.WAITING, ProcessState.LOST):
          waiting.append(process)
        elif runs[-1].state in (ProcessState.FORKED, ProcessState.RUNNING):
          running.append(process)
        elif runs[-1].state == ProcessState.SUCCESS:
          success.append(process)
        elif runs[-1].state == ProcessState.FAILED:
          failed.append(process)
        elif runs[-1].state == ProcessState.KILLED:
          killed.append(process)
        else:
          # TODO(wickman) Consider log.error instead of raising.
          raise self.UnexpectedState(
            "Unexpected ProcessHistoryState: %s" % state.processes[process].state)
    return dict(waiting=waiting, running=running, success=success, failed=failed, killed=killed)

  @Lockable.sync
  def main(self, type=None, offset=None, num=None):
    """Return a set of information about tasks, optionally filtered

      Args:
        type = (all|active|finished|None) [default: all]
        offset = offset into the list of task_ids [default: 0]
        num = number of results to return [default: 20]

      Tasks are sorted by interest:
        - active tasks are sorted by start time
        - finished tasks are sorted by completion time

      Returns:
        {
          tasks: [task_id_1, ..., task_id_N],
          type: query type,
          offset: next offset,
          num: next num
        }
    """
    type = type or 'all'
    offset = offset or 0
    num = num or 20

    # Get a list of all ObservedTasks of requested type
    tasks = sorted((task for task in self._get_tasks_of_type(type).values()),
                   key=attrgetter('mtime'), reverse=True)

    # Filter by requested offset + number of results
    end = num
    if offset < 0:
      # Negative offsets wrap around, but only when they fit within the list.
      offset = offset % len(tasks) if len(tasks) > abs(offset) else 0
    end += offset
    tasks = tasks[offset:end]

    def task_row(observed_task):
      """Generate an output row for a Task"""
      task = self._task(observed_task.task_id)
      # tasks include those which could not be found properly and are hence empty {}
      if task:
        return dict(
            task_id=observed_task.task_id,
            name=task['name'],
            role=task['user'],
            launch_timestamp=task['launch_timestamp'],
            state=task['state'],
            state_timestamp=task['state_timestamp'],
            ports=task['ports'],
            **task['resource_consumption'])

    # NOTE(review): filter() returns a lazy iterator on Python 3; this code
    # appears to target Python 2 where it returns a list — confirm.
    return dict(
      tasks=filter(None, map(task_row, tasks)),
      type=type,
      offset=offset,
      num=num,
      task_count=self.task_count()[type],
    )

  def _sample(self, task_id):
    """Return the latest resource sample dict (cpu/ram/disk) for a task,
    or an empty sample when the task is not active."""
    if task_id not in self.active_tasks:
      log.debug("Task %s not found in active tasks" % task_id)
      sample = ProcessSample.empty().to_dict()
      sample['disk'] = 0
    else:
      resource_sample = self.active_tasks[task_id].resource_monitor.sample()[1]
      sample = resource_sample.process_sample.to_dict()
      sample['disk'] = resource_sample.disk_usage
      log.debug("Got sample for task %s: %s" % (task_id, sample))
    return sample

  @Lockable.sync
  def task_statuses(self, task_id):
    """
      Return the sequence of task states.

      [(task_state [string], timestamp), ...]
    """
    # Unknown task_id.
    if task_id not in self.all_tasks:
      return []
    task = self.all_tasks[task_id]
    if task is None:
      return []
    state = self.raw_state(task_id)
    if state is None or state.header is None:
      return []
    # Get the timestamp of the transition into the current state.
    return [
      (TaskState._VALUES_TO_NAMES.get(st.state, 'UNKNOWN'), st.timestamp_ms / 1000)
      for st in state.statuses]

  @Lockable.sync
  def _task(self, task_id):
    """
      Return composite information about a particular task task_id, given the below
      schema.

      {
         task_id: string,
         name: string,
         user: string,
         launch_timestamp: seconds,
         state: string [ACTIVE, SUCCESS, FAILED]
         ports: { name1: 'url', name2: 'url2' }
         resource_consumption: { cpu:, ram:, disk: }
         processes: { -> names only
            waiting: [],
            running: [],
            success: [],
            failed:  []
         }
      }
    """
    # Unknown task_id.
    if task_id not in self.all_tasks:
      return {}
    task = self.all_tasks[task_id].task
    if task is None:
      # TODO(wickman) Can this happen?
      log.error('Could not find task: %s' % task_id)
      return {}
    state = self.raw_state(task_id)
    if state is None or state.header is None:
      # TODO(wickman) Can this happen?
      return {}
    # Get the timestamp of the transition into the current state.
    current_state = state.statuses[-1].state
    last_state = state.statuses[0]
    state_timestamp = 0
    for status in state.statuses:
      if status.state == current_state and last_state != current_state:
        state_timestamp = status.timestamp_ms / 1000
      last_state = status.state
    return dict(
       task_id=task_id,
       name=task.name().get(),
       launch_timestamp=state.statuses[0].timestamp_ms / 1000,
       state=TaskState._VALUES_TO_NAMES[state.statuses[-1].state],
       state_timestamp=state_timestamp,
       user=state.header.user,
       resource_consumption=self._sample(task_id),
       ports=state.header.ports,
       processes=self._task_processes(task_id),
       task_struct=task,
    )

  @Lockable.sync
  def _get_process_resource_consumption(self, task_id, process_name):
    """Return the latest resource sample dict for one process of an active
    task, or an empty sample when the task is not active."""
    if task_id not in self.active_tasks:
      log.debug("Task %s not found in active tasks" % task_id)
      return ProcessSample.empty().to_dict()
    sample = self.active_tasks[task_id].resource_monitor.sample_by_process(process_name).to_dict()
    log.debug('Resource consumption (%s, %s) => %s' % (task_id, process_name, sample))
    return sample

  @Lockable.sync
  def _get_process_tuple(self, history, run):
    """
      Return the basic description of a process run if it exists, otherwise
      an empty dictionary.

      {
        process_name: string
        process_run: int
        state: string [WAITING, FORKED, RUNNING, SUCCESS, KILLED, FAILED, LOST]
        (optional) start_time: seconds from epoch
        (optional) stop_time: seconds from epoch
      }
    """
    if len(history) == 0:
      return {}
    if run >= len(history):
      return {}
    else:
      # Negative runs index from the end (e.g. -1 == latest run); normalize
      # the reported run number into [0, len(history)).
      process_run = history[run]
      run = run % len(history)
      d = dict(
        process_name=process_run.process,
        process_run=run,
        state=ProcessState._VALUES_TO_NAMES[process_run.state],
      )
      if process_run.start_time:
        d.update(start_time=process_run.start_time)
      if process_run.stop_time:
        d.update(stop_time=process_run.stop_time)
      return d

  @Lockable.sync
  def process(self, task_id, process, run=None):
    """
      Returns a process run, where the schema is given below:

      {
        process_name: string
        process_run: int
        used: { cpu: float, ram: int bytes, disk: int bytes }
        start_time: (time since epoch in millis (utc))
        stop_time: (time since epoch in millis (utc))
        state: string [WAITING, FORKED, RUNNING, SUCCESS, KILLED, FAILED, LOST]
      }

      If run is None, return the latest run.
    """
    state = self.raw_state(task_id)
    if state is None or state.header is None:
      return {}
    if process not in state.processes:
      return {}
    history = state.processes[process]
    run = int(run) if run is not None else -1
    tup = self._get_process_tuple(history, run)
    if not tup:
      return {}
    # Only a currently-running process has live resource usage to report.
    if tup.get('state') == 'RUNNING':
      tup.update(used=self._get_process_resource_consumption(task_id, process))
    return tup

  @Lockable.sync
  def _processes(self, task_id):
    """
      Return
        {
          process1: { ... }
          process2: { ... }
          ...
          processN: { ... }
        }

      where processK is the latest run of processK and in the schema as
      defined by process().
    """
    if task_id not in self.all_tasks:
      return {}
    state = self.raw_state(task_id)
    if state is None or state.header is None:
      return {}

    processes = self._task_processes(task_id)
    d = dict()
    for process_type in processes:
      for process_name in processes[process_type]:
        d[process_name] = self.process(task_id, process_name)
    return d

  @Lockable.sync
  def processes(self, task_ids):
    """
      Given a list of task_ids, returns a map of task_id => processes, where
      processes is defined by the schema in _processes.
    """
    if not isinstance(task_ids, (list, tuple)):
      return {}
    return dict((task_id, self._processes(task_id)) for task_id in task_ids)

  @Lockable.sync
  def get_run_number(self, runner_state, process, run=None):
    """Normalize a (possibly negative or None) run index into a concrete run
    number for `process`, or None if it cannot be resolved."""
    if runner_state is not None and runner_state.processes is not None:
      run = run if run is not None else -1
      if run < len(runner_state.processes[process]):
        if len(runner_state.processes[process]) > 0:
          return run % len(runner_state.processes[process])

  @Lockable.sync
  def logs(self, task_id, process, run=None):
    """
      Given a task_id and a process and (optional) run number, return a dict:
      {
        stderr: [dir, filename]
        stdout: [dir, filename]
      }

      If the run number is unspecified, uses the latest run.

      TODO(wickman) Just return the filenames directly?
    """
    runner_state = self.raw_state(task_id)
    if runner_state is None or runner_state.header is None:
      return {}
    run = self.get_run_number(runner_state, process, run)
    if run is None:
      return {}
    log_path = self._pathspec.given(task_id=task_id, process=process, run=run,
                                    log_dir=runner_state.header.log_dir).getpath('process_logdir')
    return dict(
      stdout=[log_path, 'stdout'],
      stderr=[log_path, 'stderr']
    )

  @staticmethod
  def _sanitize_path(base_path, relpath):
    """
      Attempts to sanitize a path through path normalization, also making sure
      that the relative path is contained inside of base_path.
    """
    if relpath is None:
      relpath = "."
    # realpath resolves symlinks and '..' so the containment check below
    # cannot be bypassed by traversal sequences.
    normalized_base = os.path.realpath(base_path)
    normalized = os.path.realpath(os.path.join(base_path, relpath))
    if normalized.startswith(normalized_base):
      return (normalized_base, os.path.relpath(normalized, normalized_base))
    return (None, None)

  @Lockable.sync
  def valid_file(self, task_id, path):
    """
      Like valid_path, but also verify the given path is a file
    """
    chroot, path = self.valid_path(task_id, path)
    if chroot and path and os.path.isfile(os.path.join(chroot, path)):
      return chroot, path
    return None, None

  @Lockable.sync
  def valid_path(self, task_id, path):
    """
      Given a task_id and a path within that task_id's sandbox, verify:
        (1) it's actually in the sandbox and not outside
        (2) it's a valid, existing path
      Returns chroot and the pathname relative to that chroot.
    """
    runner_state = self.raw_state(task_id)
    if runner_state is None or runner_state.header is None:
      return None, None
    try:
      chroot = runner_state.header.sandbox
    except AttributeError:
      return None, None
    chroot, path = self._sanitize_path(chroot, path)
    if chroot and path:
      return chroot, path
    return None, None

  @Lockable.sync
  def files(self, task_id, path=None):
    """
      Returns dictionary
      {
        task_id: task_id
        chroot: absolute directory on machine
        path: sanitized relative path w.r.t. chroot
        dirs: list of directories
        files: list of files
      }
    """
    # TODO(jon): DEPRECATED: most of the necessary logic is handled directly
    # in the templates. Also, global s/chroot/sandbox/?
    empty = dict(task_id=task_id, chroot=None, path=None, dirs=None, files=None)
    path = path if path is not None else '.'
    runner_state = self.raw_state(task_id)
    if runner_state is None:
      return empty
    try:
      chroot = runner_state.header.sandbox
    except AttributeError:
      return empty
    if chroot is None:  # chroot-less job
      return empty
    chroot, path = self._sanitize_path(chroot, path)
    if (chroot is None or path is None
        or not os.path.isdir(os.path.join(chroot, path))):
      return empty
    names = os.listdir(os.path.join(chroot, path))
    dirs, files = [], []
    for name in names:
      if os.path.isdir(os.path.join(chroot, path, name)):
        dirs.append(name)
      else:
        files.append(name)
    return dict(
      task_id=task_id,
      chroot=chroot,
      path=path,
      dirs=dirs,
      files=files
    )
def partition_tasks(self):
  """Split the task ids found under the checkpoint root by lifecycle state.

  Returns:
    A (active, finished) tuple of task-id sets.
  """
  detector = TaskDetector(root=self._checkpoint_root)
  # get_task_ids yields (state, task_id) pairs; collect ids per state.
  by_state = {}
  for state in ('active', 'finished'):
    by_state[state] = {t_id for _, t_id in detector.get_task_ids(state=state)}
  return by_state['active'], by_state['finished']
def status(args, options):
  """Get the status of task(s).

    Usage: thermos status [options] [task_name(s) or task_regexp(s)]

    Options:
      --verbosity=LEVEL     Verbosity level for logging. [default: 0]
      --only=TYPE           Only print tasks of TYPE (options: active finished)
  """
  detector = TaskDetector(root=options.root)

  def format_task(task_id):
    # Print one task line; higher options.verbose levels append state, ports,
    # sandbox and per-process run details read from the checkpoint stream.
    checkpoint_filename = detector.get_checkpoint(task_id)
    checkpoint_stat = os.stat(checkpoint_filename)
    try:
      checkpoint_owner = pwd.getpwuid(checkpoint_stat.st_uid).pw_name
    except KeyError:
      # No passwd entry for the uid; fall back to the raw uid.
      checkpoint_owner = 'uid:%s' % checkpoint_stat.st_uid
    print(' %-20s [owner: %8s]' % (task_id, checkpoint_owner), end='')
    if options.verbose == 0:
      print()
    if options.verbose > 0:
      state = CheckpointDispatcher.from_file(checkpoint_filename)
      if state is None or state.header is None:
        print(' - checkpoint stream CORRUPT or outdated format')
        return
      print(' state: %8s' % TaskState._VALUES_TO_NAMES.get(state.statuses[-1].state, 'Unknown'),
            end='')
      print(' start: %25s' % time.asctime(time.localtime(state.header.launch_time_ms / 1000.0)))
    if options.verbose > 1:
      print(' user: %s' % state.header.user, end='')
      if state.header.ports:
        print(' ports: %s' % ' '.join('%s -> %s' % (key, val)
                                      for key, val in state.header.ports.items()))
      else:
        print(' ports: None')
      print(' sandbox: %s' % state.header.sandbox)
    if options.verbose > 2:
      print(' process table:')
      for process, process_history in state.processes.items():
        print(' - %s runs: %s' % (process, len(process_history)), end='')
        last_run = process_history[-1]
        print(' last: pid=%s, rc=%s, finish:%s, state:%s' % (
          last_run.pid or 'None',
          last_run.return_code if last_run.return_code is not None else '',
          time.asctime(time.localtime(last_run.stop_time)) if last_run.stop_time else 'None',
          ProcessState._VALUES_TO_NAMES.get(last_run.state, 'Unknown')))
    print()

  # Default to matching everything when no patterns are given.
  # NOTE(review): on Python 3 map() is a one-shot iterator, so `finished`
  # below would see an exhausted `matchers`; this code appears to target
  # Python 2 where map() returns a list — confirm interpreter target.
  matchers = map(re.compile, args or ['.*'])
  active = [t_id for _, t_id in detector.get_task_ids(state='active')
            if any(pattern.match(t_id) for pattern in matchers)]
  finished = [t_id for _, t_id in detector.get_task_ids(state='finished')
              if any(pattern.match(t_id) for pattern in matchers)]

  found = False
  if options.only is None or options.only == 'active':
    if active:
      print('Active tasks:')
      found = True
      for task_id in active:
        format_task(task_id)
      print()

  if options.only is None or options.only == 'finished':
    if finished:
      print('Finished tasks:')
      found = True
      for task_id in finished:
        format_task(task_id)
      print()

  if not found:
    print('No tasks found in root [%s]' % options.root)
    sys.exit(1)
def test_task_detector():
  """End-to-end check of TaskDetector against an on-disk checkpoint layout:
  an empty root yields nothing; after touching the expected task/checkpoint/
  log files, the detector reports the active and finished tasks, their
  process checkpoints, runs and log files."""
  with temporary_dir() as root:
    active_log_dir = os.path.join(root, 'active_log')
    finished_log_dir = os.path.join(root, 'finished_log')

    path = TaskPath(root=root)
    detector = TaskDetector(root)

    # test empty paths
    assert list(detector.get_task_ids(state='active')) == []
    assert list(detector.get_task_ids(state='finished')) == []
    assert set(detector.get_task_ids()) == set()

    # Checkpoint locations are derivable even before any files exist.
    assert detector.get_checkpoint(task_id='active_task') == path.given(
        task_id='active_task').getpath('runner_checkpoint')
    assert detector.get_checkpoint(task_id='finished_task') == path.given(
        task_id='finished_task').getpath('runner_checkpoint')

    assert set(detector.get_process_checkpoints('active_task')) == set()
    assert set(detector.get_process_checkpoints('finished_task')) == set()
    assert set(detector.get_process_runs('active_task', active_log_dir)) == set()
    assert set(detector.get_process_runs('finished_task', finished_log_dir)) == set()
    assert set(detector.get_process_logs('active_task', active_log_dir)) == set()
    assert set(detector.get_process_logs('finished_task', finished_log_dir)) == set()

    # create paths
    paths = [
      path.given(state='active', task_id='active_task').getpath('task_path'),
      path.given(state='finished', task_id='finished_task').getpath('task_path'),
      path.given(task_id='active_task').getpath('runner_checkpoint'),
      path.given(task_id='finished_task').getpath('runner_checkpoint'),
      path.given(
          task_id='active_task',
          process='hello_world',
          run='0',
          log_dir=active_log_dir
      ).with_filename('stdout').getpath('process_logdir'),
      path.given(
          task_id='finished_task',
          process='goodbye_world',
          run='1',
          log_dir=finished_log_dir
      ).with_filename('stderr').getpath('process_logdir'),
      path.given(task_id='active_task', process='hello_world').getpath('process_checkpoint'),
      path.given(task_id='finished_task', process='goodbye_world').getpath('process_checkpoint'),
    ]
    for p in paths:
      touch(p)

    # A fresh detector over the populated root now sees both tasks.
    detector = TaskDetector(root)
    assert list(detector.get_task_ids(state='active')) == list([('active', 'active_task')])
    assert list(detector.get_task_ids(state='finished')) == list([('finished', 'finished_task')])
    assert set(detector.get_task_ids()) == set(
        [('active', 'active_task'), ('finished', 'finished_task')])

    assert list(detector.get_process_checkpoints('active_task')) == [
        path.given(task_id='active_task', process='hello_world').getpath('process_checkpoint')]
    assert list(detector.get_process_checkpoints('finished_task')) == [
        path.given(task_id='finished_task', process='goodbye_world').getpath('process_checkpoint')]

    assert list(detector.get_process_runs('active_task', active_log_dir)) == [
        ('hello_world', 0)]
    assert list(detector.get_process_runs('finished_task', finished_log_dir)) == [
        ('goodbye_world', 1)]

    assert list(detector.get_process_logs('active_task', active_log_dir)) == [
        path.given(
            task_id='active_task',
            process='hello_world',
            run='0',
            log_dir=active_log_dir
        ).with_filename('stdout').getpath('process_logdir')]
    assert list(detector.get_process_logs('finished_task', finished_log_dir)) == [
        path.given(
            task_id='finished_task',
            process='goodbye_world',
            run='1',
            log_dir=finished_log_dir
        ).with_filename('stderr').getpath('process_logdir')]
class TaskObserver(ExceptionalThread, Lockable):
  """
  The TaskObserver monitors the thermos checkpoint root for active/finished
  tasks. It is used to be the oracle of the state of all thermos tasks on
  a machine.

  It currently returns JSON, but really should just return objects. We
  should then build an object->json translator.
  """
  class UnexpectedError(Exception): pass
  class UnexpectedState(Exception): pass

  # How often the background thread re-scans the checkpoint root.
  POLLING_INTERVAL = Amount(1, Time.SECONDS)

  def __init__(self, root, resource_monitor_class=TaskResourceMonitor):
    """
    Args:
      root: thermos checkpoint root to observe.
      resource_monitor_class: ResourceMonitorBase subclass instantiated per
        active task to sample its resource consumption.

    Raises:
      ValueError: if resource_monitor_class does not subclass ResourceMonitorBase.
    """
    self._pathspec = TaskPath(root=root)
    self._detector = TaskDetector(root)
    if not issubclass(resource_monitor_class, ResourceMonitorBase):
      raise ValueError("resource monitor class must implement ResourceMonitorBase!")
    self._resource_monitor = resource_monitor_class
    self._active_tasks = {}    # task_id => ActiveObservedTask
    self._finished_tasks = {}  # task_id => FinishedObservedTask
    self._stop_event = threading.Event()
    ExceptionalThread.__init__(self)
    Lockable.__init__(self)
    self.daemon = True

  @property
  def active_tasks(self):
    """Return a dictionary of active Tasks"""
    return self._active_tasks

  @property
  def finished_tasks(self):
    """Return a dictionary of finished Tasks"""
    return self._finished_tasks

  @property
  def all_tasks(self):
    """Return a dictionary of all Tasks known by the TaskObserver"""
    # BUGFIX: dict(a.items() + b.items()) is Python-2-only (dict views do not
    # support '+' on Python 3).  Copy-then-update is equivalent on both.
    tasks = dict(self.active_tasks)
    tasks.update(self.finished_tasks)
    return tasks

  def stop(self):
    # Signal the polling loop in run() to exit.
    self._stop_event.set()

  def start(self):
    ExceptionalThread.start(self)

  @Lockable.sync
  def add_active_task(self, task_id):
    """Register task_id as active and start a resource monitor for it.

    No-op (with a log message) if the task cannot be loaded or is already
    known to be finished.
    """
    if task_id in self.finished_tasks:
      log.error('Found an active task (%s) in finished tasks?' % task_id)
      return
    task_monitor = TaskMonitor(self._pathspec, task_id)
    if not task_monitor.get_state().header:
      log.info('Unable to load task "%s"' % task_id)
      return
    sandbox = task_monitor.get_state().header.sandbox
    resource_monitor = self._resource_monitor(task_monitor, sandbox)
    resource_monitor.start()
    self._active_tasks[task_id] = ActiveObservedTask(
      task_id=task_id,
      pathspec=self._pathspec,
      task_monitor=task_monitor,
      resource_monitor=resource_monitor
    )

  @Lockable.sync
  def add_finished_task(self, task_id):
    """Register task_id as finished."""
    self._finished_tasks[task_id] = FinishedObservedTask(
      task_id=task_id,
      pathspec=self._pathspec
    )

  @Lockable.sync
  def active_to_finished(self, task_id):
    """Move a task from the active set to the finished set."""
    self.remove_active_task(task_id)
    self.add_finished_task(task_id)

  @Lockable.sync
  def remove_active_task(self, task_id):
    """Forget an active task and kill its resource monitor."""
    task = self.active_tasks.pop(task_id)
    task.resource_monitor.kill()

  @Lockable.sync
  def remove_finished_task(self, task_id):
    """Forget a finished task."""
    self.finished_tasks.pop(task_id)

  def run(self):
    """
    The internal thread for the observer. This periodically polls the
    checkpoint root for new tasks, or transitions of tasks from active to
    finished state.
    """
    while not self._stop_event.is_set():
      time.sleep(self.POLLING_INTERVAL.as_(Time.SECONDS))
      active_tasks = [task_id for _, task_id in self._detector.get_task_ids(state='active')]
      finished_tasks = [task_id for _, task_id in self._detector.get_task_ids(state='finished')]

      with self.lock:
        # Ensure all tasks currently detected on the system are observed appropriately
        for active in active_tasks:
          if active not in self.active_tasks:
            log.debug('task_id %s (unknown) -> active' % active)
            self.add_active_task(active)
        for finished in finished_tasks:
          if finished in self.active_tasks:
            log.debug('task_id %s active -> finished' % finished)
            self.active_to_finished(finished)
          elif finished not in self.finished_tasks:
            log.debug('task_id %s (unknown) -> finished' % finished)
            self.add_finished_task(finished)

        # Remove ObservedTasks for tasks no longer detected on the system.
        # (Hoisted: build the detected set once instead of twice.)
        detected = set(active_tasks) | set(finished_tasks)
        for unknown in set(self.active_tasks) - detected:
          log.debug('task_id %s active -> (unknown)' % unknown)
          self.remove_active_task(unknown)
        for unknown in set(self.finished_tasks) - detected:
          log.debug('task_id %s finished -> (unknown)' % unknown)
          self.remove_finished_task(unknown)

  @Lockable.sync
  def process_from_name(self, task_id, process_id):
    """Return the process config named process_id within task_id, or None."""
    if task_id in self.all_tasks:
      task = self.all_tasks[task_id].task
      if task:
        for process in task.processes():
          if process.name().get() == process_id:
            return process

  @Lockable.sync
  def task_count(self):
    """
    Return the count of tasks that could be read properly from disk.
    This may be <= self.task_id_count()
    """
    return dict(
      active=len(self.active_tasks),
      finished=len(self.finished_tasks),
      all=len(self.all_tasks),
    )

  @Lockable.sync
  def task_id_count(self):
    """
    Return the raw count of active and finished task_ids from the
    TaskDetector.
    """
    num_active = len(list(self._detector.get_task_ids(state='active')))
    num_finished = len(list(self._detector.get_task_ids(state='finished')))
    return dict(active=num_active, finished=num_finished, all=num_active + num_finished)

  def _get_tasks_of_type(self, type):
    """Convenience function to return all tasks of a given type"""
    tasks = {
      'active': self.active_tasks,
      'finished': self.finished_tasks,
      'all': self.all_tasks,
    }.get(type, None)
    if tasks is None:
      log.error('Unknown task type %s' % type)
      return {}
    return tasks

  @Lockable.sync
  def state(self, task_id):
    """Return a dict containing mapped information about a task's state"""
    real_state = self.raw_state(task_id)
    if real_state is None or real_state.header is None:
      return {}
    else:
      return dict(
        task_id=real_state.header.task_id,
        launch_time=real_state.header.launch_time_ms / 1000.0,
        sandbox=real_state.header.sandbox,
        hostname=real_state.header.hostname,
        user=real_state.header.user
      )

  @Lockable.sync
  def raw_state(self, task_id):
    """
    Return the current runner state (thrift blob:
    gen.apache.thermos.ttypes.RunnerState) of a given task id
    """
    if task_id not in self.all_tasks:
      return None
    return self.all_tasks[task_id].state

  @Lockable.sync
  def _task_processes(self, task_id):
    """
    Return the processes of a task given its task_id.

    Returns a map from state to processes in that state, where possible
    states are: waiting, running, success, failed, killed.
    """
    if task_id not in self.all_tasks:
      return {}
    state = self.raw_state(task_id)
    if state is None or state.header is None:
      return {}

    waiting, running, success, failed, killed = [], [], [], [], []
    for process, runs in state.processes.items():
      # No runs ==> nothing started.
      if len(runs) == 0:
        waiting.append(process)
      else:
        # Classify by the state of the most recent run.
        if runs[-1].state in (None, ProcessState.WAITING, ProcessState.LOST):
          waiting.append(process)
        elif runs[-1].state in (ProcessState.FORKED, ProcessState.RUNNING):
          running.append(process)
        elif runs[-1].state == ProcessState.SUCCESS:
          success.append(process)
        elif runs[-1].state == ProcessState.FAILED:
          failed.append(process)
        elif runs[-1].state == ProcessState.KILLED:
          killed.append(process)
        else:
          # TODO(wickman) Consider log.error instead of raising.
          raise self.UnexpectedState(
            "Unexpected ProcessHistoryState: %s" % state.processes[process].state)

    return dict(waiting=waiting, running=running, success=success, failed=failed, killed=killed)

  @Lockable.sync
  def main(self, type=None, offset=None, num=None):
    """Return a set of information about tasks, optionally filtered

      Args:
        type = (all|active|finished|None) [default: all]
        offset = offset into the list of task_ids [default: 0]
        num = number of results to return [default: 20]

      Tasks are sorted by interest:
        - active tasks are sorted by start time
        - finished tasks are sorted by completion time

      Returns:
        {
          tasks: [task_id_1, ..., task_id_N],
          type: query type,
          offset: next offset,
          num: next num
        }
    """
    type = type or 'all'
    offset = offset or 0
    num = num or 20

    # Get a list of all ObservedTasks of requested type, most recent first
    tasks = sorted(self._get_tasks_of_type(type).values(),
                   key=attrgetter('mtime'), reverse=True)

    # Filter by requested offset + number of results
    end = num
    if offset < 0:
      # Wrap a negative offset around the task list when possible.
      offset = offset % len(tasks) if len(tasks) > abs(offset) else 0
    end += offset
    tasks = tasks[offset:end]

    def task_row(observed_task):
      """Generate an output row for a Task"""
      task = self._task(observed_task.task_id)
      # tasks include those which could not be found properly and are hence empty {}
      if task:
        return dict(
            task_id=observed_task.task_id,
            name=task['name'],
            role=task['user'],
            launch_timestamp=task['launch_timestamp'],
            state=task['state'],
            state_timestamp=task['state_timestamp'],
            ports=task['ports'],
            **task['resource_consumption'])

    return dict(
      # BUGFIX: filter(None, map(...)) is a lazy iterator on Python 3 and
      # would not serialize as a JSON array; materialize a real list.
      tasks=[row for row in map(task_row, tasks) if row],
      type=type,
      offset=offset,
      num=num,
      task_count=self.task_count()[type],
    )

  def _sample(self, task_id):
    """Return the latest resource sample (cpu/ram + disk) for an active task,
    or an empty sample if the task is not active."""
    if task_id not in self.active_tasks:
      log.debug("Task %s not found in active tasks" % task_id)
      sample = ProcessSample.empty().to_dict()
      sample['disk'] = 0
    else:
      resource_sample = self.active_tasks[task_id].resource_monitor.sample()[1]
      sample = resource_sample.process_sample.to_dict()
      sample['disk'] = resource_sample.disk_usage
      log.debug("Got sample for task %s: %s" % (task_id, sample))
    return sample

  @Lockable.sync
  def task_statuses(self, task_id):
    """
    Return the sequence of task states.

    [(task_state [string], timestamp), ...]
    """
    # Unknown task_id.
    if task_id not in self.all_tasks:
      return []

    task = self.all_tasks[task_id]
    if task is None:
      return []

    state = self.raw_state(task_id)
    if state is None or state.header is None:
      return []

    # Get the timestamp of the transition into the current state.
    return [
      (TaskState._VALUES_TO_NAMES.get(st.state, 'UNKNOWN'), st.timestamp_ms / 1000)
      for st in state.statuses]

  @Lockable.sync
  def tasks(self, task_ids):
    """
    Return information about an iterable of tasks [task_id1, task_id2, ...]
    in the following form.

    {
      task_id1 : self._task(task_id1),
      task_id2 : self._task(task_id2),
      ...
    }
    """
    res = {}
    for task_id in task_ids:
      d = self._task(task_id)
      # BUGFIX: _task() returns {} for unknown/unreadable tasks; a bare
      # d.pop('task_struct') would raise KeyError.  Pop with a default and
      # only dereference the struct when present.
      task_struct = d.pop('task_struct', None)
      if task_struct is not None:
        d['task'] = task_struct.get()
      res[task_id] = d
    return res

  @Lockable.sync
  def _task(self, task_id):
    """
    Return composite information about a particular task task_id, given the
    below schema.

    {
       task_id: string,
       name: string,
       user: string,
       launch_timestamp: seconds,
       state: string [ACTIVE, SUCCESS, FAILED]
       ports: { name1: 'url', name2: 'url2' }
       resource_consumption: { cpu:, ram:, disk: }
       processes: { -> names only
          waiting: [],
          running: [],
          success: [],
          failed:  []
       }
    }
    """
    # Unknown task_id.
    if task_id not in self.all_tasks:
      return {}

    task = self.all_tasks[task_id].task
    if task is None:
      # TODO(wickman) Can this happen?
      log.error('Could not find task: %s' % task_id)
      return {}

    state = self.raw_state(task_id)
    if state is None or state.header is None:
      # TODO(wickman) Can this happen?
      return {}

    # Get the timestamp of the transition into the current state.
    # NOTE: last_state is deliberately initialized to a status *object* (not a
    # state value) so the first iteration always compares unequal — with a
    # single status this still records its timestamp.
    current_state = state.statuses[-1].state
    last_state = state.statuses[0]
    state_timestamp = 0
    for status in state.statuses:
      if status.state == current_state and last_state != current_state:
        state_timestamp = status.timestamp_ms / 1000
      last_state = status.state

    return dict(
       task_id=task_id,
       name=task.name().get(),
       launch_timestamp=state.statuses[0].timestamp_ms / 1000,
       state=TaskState._VALUES_TO_NAMES[current_state],
       state_timestamp=state_timestamp,
       user=state.header.user,
       resource_consumption=self._sample(task_id),
       ports=state.header.ports,
       processes=self._task_processes(task_id),
       task_struct=task,
    )

  @Lockable.sync
  def _get_process_resource_consumption(self, task_id, process_name):
    """Return the latest resource sample for one process of an active task,
    or an empty sample if the task is not active."""
    if task_id not in self.active_tasks:
      log.debug("Task %s not found in active tasks" % task_id)
      return ProcessSample.empty().to_dict()
    sample = self.active_tasks[task_id].resource_monitor.sample_by_process(process_name).to_dict()
    log.debug('Resource consumption (%s, %s) => %s' % (task_id, process_name, sample))
    return sample

  @Lockable.sync
  def _get_process_tuple(self, history, run):
    """
    Return the basic description of a process run if it exists, otherwise
    an empty dictionary.

      {
        process_name: string
        process_run: int
        (optional) return_code: int
        state: string [WAITING, FORKED, RUNNING, SUCCESS, KILLED, FAILED, LOST]
        (optional) start_time: seconds from epoch
        (optional) stop_time: seconds from epoch
      }
    """
    if len(history) == 0:
      return {}
    # BUGFIX: honor the documented contract for *any* out-of-range run.
    # Previously a run below -len(history) raised IndexError.
    if run >= len(history) or run < -len(history):
      return {}
    process_run = history[run]
    run = run % len(history)  # normalize negative indices to the canonical run number
    d = dict(
      process_name=process_run.process,
      process_run=run,
      state=ProcessState._VALUES_TO_NAMES[process_run.state],
    )
    if process_run.start_time:
      d.update(start_time=process_run.start_time)
    if process_run.stop_time:
      d.update(stop_time=process_run.stop_time)
    if process_run.return_code:
      d.update(return_code=process_run.return_code)
    return d

  @Lockable.sync
  def process(self, task_id, process, run=None):
    """
    Returns a process run, where the schema is given below:

      {
        process_name: string
        process_run: int
        used: { cpu: float, ram: int bytes, disk: int bytes }
        start_time: (time since epoch in millis (utc))
        stop_time: (time since epoch in millis (utc))
        state: string [WAITING, FORKED, RUNNING, SUCCESS, KILLED, FAILED, LOST]
      }

    If run is None, return the latest run.
    """
    state = self.raw_state(task_id)
    if state is None or state.header is None:
      return {}
    if process not in state.processes:
      return {}
    history = state.processes[process]
    run = int(run) if run is not None else -1
    tup = self._get_process_tuple(history, run)
    if not tup:
      return {}
    if tup.get('state') == 'RUNNING':
      # Only running processes have live resource consumption to report.
      tup.update(used=self._get_process_resource_consumption(task_id, process))
    return tup

  @Lockable.sync
  def _processes(self, task_id):
    """
    Return
      {
        process1: { ... }
        process2: { ... }
        ...
        processN: { ... }
      }

    where processK is the latest run of processK and in the schema as
    defined by process().
    """
    if task_id not in self.all_tasks:
      return {}
    state = self.raw_state(task_id)
    if state is None or state.header is None:
      return {}

    processes = self._task_processes(task_id)
    d = dict()
    for process_names in processes.values():
      for process_name in process_names:
        d[process_name] = self.process(task_id, process_name)
    return d

  @Lockable.sync
  def processes(self, task_ids):
    """
    Given a list of task_ids, returns a map of task_id => processes, where
    processes is defined by the schema in _processes.
    """
    if not isinstance(task_ids, (list, tuple)):
      return {}
    return dict((task_id, self._processes(task_id)) for task_id in task_ids)

  @Lockable.sync
  def get_run_number(self, runner_state, process, run=None):
    """Normalize an (optional, possibly negative) run number against the runs
    recorded for process; None means latest.  Returns None when the process
    has no such run."""
    if runner_state is not None and runner_state.processes is not None:
      run = run if run is not None else -1
      # BUGFIX: guard against an unknown process name; previously this raised
      # KeyError instead of returning None as callers (e.g. logs()) expect.
      runs = runner_state.processes.get(process)
      if runs and run < len(runs):
        return run % len(runs)

  @Lockable.sync
  def logs(self, task_id, process, run=None):
    """
    Given a task_id and a process and (optional) run number, return a dict:
    {
      stderr: [dir, filename]
      stdout: [dir, filename]
    }
    If the run number is unspecified, uses the latest run.

    TODO(wickman) Just return the filenames directly?
    """
    runner_state = self.raw_state(task_id)
    if runner_state is None or runner_state.header is None:
      return {}
    run = self.get_run_number(runner_state, process, run)
    if run is None:
      return {}
    log_path = self._pathspec.given(task_id=task_id, process=process, run=run,
                                    log_dir=runner_state.header.log_dir).getpath('process_logdir')
    return dict(
      stdout=[log_path, 'stdout'],
      stderr=[log_path, 'stderr']
    )

  @staticmethod
  def _sanitize_path(base_path, relpath):
    """
    Attempts to sanitize a path through path normalization, also making
    sure that the relative path is contained inside of base_path.

    Returns (normalized base, path relative to it), or (None, None) if the
    resolved path escapes base_path.
    """
    if relpath is None:
      relpath = "."
    normalized_base = os.path.realpath(base_path)
    normalized = os.path.realpath(os.path.join(base_path, relpath))
    if normalized.startswith(normalized_base):
      return (normalized_base, os.path.relpath(normalized, normalized_base))
    return (None, None)

  @Lockable.sync
  def valid_file(self, task_id, path):
    """
    Like valid_path, but also verify the given path is a file
    """
    chroot, path = self.valid_path(task_id, path)
    if chroot and path and os.path.isfile(os.path.join(chroot, path)):
      return chroot, path
    return None, None

  @Lockable.sync
  def valid_path(self, task_id, path):
    """
    Given a task_id and a path within that task_id's sandbox, verify:
      (1) it's actually in the sandbox and not outside
      (2) it's a valid, existing path
    Returns chroot and the pathname relative to that chroot.
    """
    runner_state = self.raw_state(task_id)
    if runner_state is None or runner_state.header is None:
      return None, None
    try:
      chroot = runner_state.header.sandbox
    except AttributeError:
      return None, None
    chroot, path = self._sanitize_path(chroot, path)
    if chroot and path:
      return chroot, path
    return None, None

  @Lockable.sync
  def files(self, task_id, path=None):
    """
    Returns dictionary
    {
      task_id: task_id
      chroot: absolute directory on machine
      path: sanitized relative path w.r.t. chroot
      dirs: list of directories
      files: list of files
    }
    """
    # TODO(jon): DEPRECATED: most of the necessary logic is handled directly
    # in the templates. Also, global s/chroot/sandbox/?
    empty = dict(task_id=task_id, chroot=None, path=None, dirs=None, files=None)
    path = path if path is not None else '.'
    runner_state = self.raw_state(task_id)
    if runner_state is None:
      return empty
    try:
      chroot = runner_state.header.sandbox
    except AttributeError:
      return empty
    if chroot is None:  # chroot-less job
      return empty
    chroot, path = self._sanitize_path(chroot, path)
    if (chroot is None or path is None
        or not os.path.isdir(os.path.join(chroot, path))):
      return empty
    names = os.listdir(os.path.join(chroot, path))
    dirs, files = [], []
    for name in names:
      if os.path.isdir(os.path.join(chroot, path, name)):
        dirs.append(name)
      else:
        files.append(name)
    return dict(
      task_id=task_id,
      chroot=chroot,
      path=path,
      dirs=dirs,
      files=files
    )