def tail(args, options):
  """Tail the logs of a task process.

    Usage: thermos tail task_name [process_name]
  """
  if len(args) == 0:
    app.error('Expected a task to tail, got nothing!')
  if len(args) not in (1, 2):
    app.error('Expected at most two arguments (task and optional process), got %d' % len(args))

  task_id = args[0]
  detector = TaskDetector(root=options.root)
  checkpoint = CheckpointDispatcher.from_file(detector.get_checkpoint(task_id))
  log_dir = checkpoint.header.log_dir
  process_runs = [(process, run) for (process, run) in detector.get_process_runs(task_id, log_dir)]
  if len(args) == 2:
    process_runs = [(process, run) for (process, run) in process_runs if process == args[1]]

  if len(process_runs) == 0:
    print('ERROR: No processes found.', file=sys.stderr)
    sys.exit(1)

  processes = set(process for process, _ in process_runs)
  if len(processes) != 1:
    print('ERROR: More than one process matches query.', file=sys.stderr)
    sys.exit(1)

  process = processes.pop()
  run = max(run for _, run in process_runs)

  logdir = TaskPath(
      root=options.root,
      task_id=args[0],
      process=process,
      run=run,
      log_dir=log_dir).getpath('process_logdir')
  logfile = os.path.join(logdir, 'stderr' if options.use_stderr else 'stdout')

  monitor = TaskMonitor(TaskPath(root=options.root), args[0])

  def log_is_active():
    active_processes = monitor.get_active_processes()
    for process_status, process_run in active_processes:
      if process_status.process == process and process_run == run:
        return True
    return False

  if not log_is_active():
    print('Tail of terminal log %s' % logfile)
    for line in tail_closed(logfile):
      print(line.rstrip())
    return

  now = time.time()
  next_check = now + 5.0
  print('Tail of active log %s' % logfile)
  for line in tail_f(logfile, include_last=True, forever=False):
    print(line.rstrip())
    if time.time() > next_check:
      if not log_is_active():
        break
      else:
        next_check = time.time() + 5.0
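# A hedged usage sketch of the command above. Task and process names are
# illustrative placeholders, and the --root/--stderr flag spellings are
# assumptions about how options.root and options.use_stderr are wired up:
#
#   thermos tail --root=/var/run/thermos my_task           # single-process task, tail stdout
#   thermos tail --root=/var/run/thermos my_task hello     # tail a specific process
#   thermos tail --root=/var/run/thermos --stderr my_task  # tail stderr instead of stdout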
def make_taskpath(td):
  return TaskPath(
      root=td,
      task_id='task',
      process='process',
      run=0,
      log_dir=os.path.join(td, '.logs'))
def get(cls, task_id, checkpoint_root):
  """Get a TaskRunner bound to the task_id in checkpoint_root."""
  path = TaskPath(root=checkpoint_root, task_id=task_id, state='active')
  task_json = path.getpath('task_path')
  task_checkpoint = path.getpath('runner_checkpoint')
  if not os.path.exists(task_json):
    return None
  task = ThermosConfigLoader.load_json(task_json)
  if task is None:
    return None
  if len(task.tasks()) == 0:
    return None
  try:
    checkpoint = CheckpointDispatcher.from_file(task_checkpoint)
    if checkpoint is None or checkpoint.header is None:
      return None
    return cls(
        task.tasks()[0].task(),
        checkpoint_root,
        checkpoint.header.sandbox,
        log_dir=checkpoint.header.log_dir,
        task_id=task_id,
        portmap=checkpoint.header.ports,
        hostname=checkpoint.header.hostname)
  except Exception as e:
    log.error('Failed to reconstitute checkpoint in TaskRunner.get: %s' % e, exc_info=True)
    return None
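# A minimal usage sketch of the classmethod above (task id and checkpoint root
# are illustrative placeholders). Every failure path returns None, so callers
# only need a single check:
#
#   runner = TaskRunner.get('my_task-001', '/var/run/thermos')
#   if runner is None:
#     log.warning('No reconstitutable checkpoint for task')  # missing/corrupt state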
def test_basic_as_job(self):
  proxy_driver = ProxyDriver()

  with temporary_dir() as tempdir:
    te = AuroraExecutor(
        runner_provider=make_provider(tempdir),
        sandbox_provider=DefaultTestSandboxProvider())
    te.launchTask(proxy_driver, make_task(MESOS_JOB(task=HELLO_WORLD), instanceId=0))
    te.runner_started.wait()
    while te._status_manager is None:
      time.sleep(0.1)
    te.terminated.wait()

    tm = TaskMonitor(TaskPath(root=tempdir), task_id=HELLO_WORLD_TASK_ID)
    runner_state = tm.get_state()

  assert 'hello_world_hello_world-001' in runner_state.processes, (
      'Could not find processes, got: %s' % ' '.join(runner_state.processes))

  updates = proxy_driver.method_calls['sendStatusUpdate']
  assert len(updates) == 3
  status_updates = [arg_tuple[0][0] for arg_tuple in updates]
  assert status_updates[0].state == mesos_pb2.TASK_STARTING
  assert status_updates[1].state == mesos_pb2.TASK_RUNNING
  assert status_updates[2].state == mesos_pb2.TASK_FINISHED
def logs(self, task_id, process, run=None):
  """Given a task_id, a process and an (optional) run number, return a dict:

      {
        'stderr': [dir, filename],
        'stdout': [dir, filename]
      }

    If the run number is unspecified, uses the latest run.

    TODO(wickman)  Just return the filenames directly?
  """
  runner_state = self.raw_state(task_id)
  if runner_state is None or runner_state.header is None:
    return {}
  run = self.get_run_number(runner_state, process, run)
  if run is None:
    return {}
  observed_task = self.all_tasks.get(task_id, None)
  if not observed_task:
    return {}
  log_path = TaskPath(
      root=observed_task.root,
      task_id=task_id,
      process=process,
      run=run,
      log_dir=runner_state.header.log_dir).getpath('process_logdir')
  return dict(stdout=[log_path, 'stdout'], stderr=[log_path, 'stderr'])
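# A minimal sketch of consuming logs() (hypothetical observer instance and
# task/process names). The [dir, filename] pairs must be joined to get full
# log paths:
#
#   log_map = observer.logs('my_task-001', 'hello_world')  # latest run
#   if log_map:
#     stdout_dir, stdout_file = log_map['stdout']
#     print(os.path.join(stdout_dir, stdout_file))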
def write_header(root, sandbox, task_id):
  log_dir = os.path.join(sandbox, '.logs')
  path = TaskPath(root=root, task_id=task_id, log_dir=log_dir)
  header = RunnerHeader(task_id=task_id, sandbox=sandbox, log_dir=log_dir)
  ckpt = TaskRunnerHelper.open_checkpoint(path.getpath('runner_checkpoint'))
  ckpt.write(RunnerCkpt(runner_header=header))
  ckpt.close()
def _runner_ckpt(self, task):
  """Return the runner checkpoint file for a given task.

  :param task: The task whose checkpoint path should be retrieved.
  :type task: :class:`RootedTask` instance
  """
  return TaskPath(root=task.checkpoint_root, task_id=task.task_id).getpath('runner_checkpoint')
def erase_logs(self, task_id):
  for fn in self.get_logs(task_id, with_size=False):
    safe_delete(fn)
  state = self.state(task_id)
  if state and state.header:
    safe_rmtree(TaskPath(
        root=self._root,
        task_id=task_id,
        log_dir=state.header.log_dir).getpath('process_logbase'))
def __init__(self, checkpoint_root, task_id):
  """
  :param checkpoint_root: The checkpoint root in which to find the given task.
  :param task_id: The task_id of the task whose state we wish to manage.
  """
  self._detector = TaskDetector(checkpoint_root)
  self._task_id = task_id
  self._pathspec = TaskPath(root=checkpoint_root, task_id=task_id)
  self._state = CheckpointDispatcher.from_file(self._detector.get_checkpoint(task_id))
def __init__(self, root, resource_monitor_class=TaskResourceMonitor):
  self._pathspec = TaskPath(root=root)
  self._detector = TaskDetector(root)
  if not issubclass(resource_monitor_class, ResourceMonitorBase):
    raise ValueError('resource monitor class must implement ResourceMonitorBase!')
  self._resource_monitor = resource_monitor_class
  self._active_tasks = {}    # task_id => ActiveObservedTask
  self._finished_tasks = {}  # task_id => FinishedObservedTask
  self._stop_event = threading.Event()
  ExceptionalThread.__init__(self)
  Lockable.__init__(self)
  self.daemon = True
def from_assigned_task(self, assigned_task, sandbox):
  task_id = assigned_task.taskId
  resources = mesos_task_instance_from_assigned_task(assigned_task).task().resources()
  task_path = TaskPath(root=self._checkpoint_root, task_id=task_id)
  task_monitor = TaskMonitor(task_path, task_id)
  resource_monitor = TaskResourceMonitor(
      task_monitor,
      sandbox.root,
      disk_collector=self._disk_collector,
      disk_collection_interval=self._disk_collection_interval)
  return ResourceManager(resources, resource_monitor)
def test_other_user_fails_nonroot():
  with temporary_dir() as td:
    taskpath = TaskPath(root=td, task_id='task', process='process', run=0)
    sandbox = setup_sandbox(td, taskpath)
    process = TestProcess(
        'process', 'echo hello world', 0, taskpath, sandbox,
        user=get_other_nonroot_user().pw_name)
    with pytest.raises(Process.PermissionError):
      process.start()
def kill(cls, task_id, checkpoint_root, force=False, terminal_status=TaskState.KILLED, clock=time):
  """An implementation of Task killing that doesn't require a fully hydrated TaskRunner
  object.  Terminal status must be either the KILLED or LOST state.
  """
  if terminal_status not in (TaskState.KILLED, TaskState.LOST):
    # parenthesize so the 'or' fallback applies to the name lookup, not the formatted string
    raise cls.Error('terminal_status must be KILLED or LOST (got %s)' %
                    (TaskState._VALUES_TO_NAMES.get(terminal_status) or terminal_status))
  pathspec = TaskPath(root=checkpoint_root, task_id=task_id)
  checkpoint = pathspec.getpath('runner_checkpoint')
  state = CheckpointDispatcher.from_file(checkpoint)

  if state is None or state.header is None or state.statuses is None:
    if force:
      log.error('Task has uninitialized TaskState - forcibly finalizing')
      cls.finalize_task(pathspec)
      return
    else:
      log.error('Cannot update states in uninitialized TaskState!')
      return

  ckpt = cls.open_checkpoint(checkpoint, force=force, state=state)

  def write_task_state(state):
    update = TaskStatus(
        state=state,
        timestamp_ms=int(clock.time() * 1000),
        runner_pid=os.getpid(),
        runner_uid=os.getuid())
    ckpt.write(RunnerCkpt(task_status=update))

  def write_process_status(status):
    ckpt.write(RunnerCkpt(process_status=status))

  if cls.is_task_terminal(state.statuses[-1].state):
    log.info('Task is already in terminal state!  Finalizing.')
    cls.finalize_task(pathspec)
    return

  with closing(ckpt):
    write_task_state(TaskState.ACTIVE)
    for process, history in state.processes.items():
      process_status = history[-1]
      if not cls.is_process_terminal(process_status.state):
        if cls.kill_process(state, process):
          write_process_status(ProcessStatus(
              process=process,
              state=ProcessState.KILLED,
              seq=process_status.seq + 1,
              return_code=-9,
              stop_time=clock.time()))
        else:
          if process_status.state is not ProcessState.WAITING:
            write_process_status(ProcessStatus(
                process=process,
                state=ProcessState.LOST,
                seq=process_status.seq + 1))
    write_task_state(terminal_status)
  cls.finalize_task(pathspec)
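# A hedged invocation sketch for kill() above (assuming it is bound on
# TaskRunnerHelper, as the surrounding snippets suggest; task id and root are
# placeholders). force=True finalizes even an uninitialized checkpoint state:
#
#   TaskRunnerHelper.kill('my_task-001', '/var/run/thermos',
#                         force=True, terminal_status=TaskState.LOST)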
def get_metadata(self, task_id, with_size=True):
  runner_ckpt = self._detector.get_checkpoint(task_id)
  process_ckpts = [ckpt for ckpt in self._detector.get_process_checkpoints(task_id)]
  json_spec = TaskPath(root=self._root, task_id=task_id, state='finished').getpath('task_path')
  for path in [json_spec, runner_ckpt] + process_ckpts:
    if with_size:
      yield path, safe_bsize(path)
    else:
      yield path
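# get_metadata is a generator; a short sketch of consuming it both ways
# (assumes `gc` is an instance of the enclosing collector class; the task id
# is a placeholder). With with_size=True it yields (path, size) tuples:
#
#   total_bytes = sum(size for _, size in gc.get_metadata('my_task-001'))
#   paths = list(gc.get_metadata('my_task-001', with_size=False))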
def test_garbage_collector(safe_rmtree, safe_delete):
  with temporary_dir() as sandbox, temporary_dir() as checkpoint_root, temporary_dir() as log_dir:
    path = TaskPath(root=checkpoint_root, task_id='test', log_dir=log_dir)

    touch(os.path.join(sandbox, 'test_file1'))
    touch(os.path.join(sandbox, 'test_file2'))

    safe_mkdir(os.path.dirname(path.given(state='finished').getpath('task_path')))
    safe_mkdir(os.path.dirname(path.getpath('runner_checkpoint')))
    touch(path.given(state='finished').getpath('task_path'))

    header = RunnerHeader(task_id='test', sandbox=sandbox, log_dir=log_dir)
    ckpt = TaskRunnerHelper.open_checkpoint(path.getpath('runner_checkpoint'))
    ckpt.write(RunnerCkpt(runner_header=header))
    ckpt.close()

    gc = TaskGarbageCollector(checkpoint_root, task_id='test')
    assert gc._state.header.log_dir == log_dir
    assert gc._state.header.sandbox == sandbox

    # erase metadata
    gc.erase_metadata()
    safe_delete.assert_has_calls([
        call(path.given(state='finished').getpath('task_path')),
        call(path.getpath('runner_checkpoint'))], any_order=True)
    safe_rmtree.assert_has_calls([call(path.getpath('checkpoint_path'))])

    safe_delete.reset_mock()
    safe_rmtree.reset_mock()

    # erase logs
    gc.erase_logs()
    safe_rmtree.assert_has_calls([call(log_dir)])

    safe_delete.reset_mock()
    safe_rmtree.reset_mock()

    # erase sandbox
    gc.erase_data()
    safe_delete.assert_has_calls([
        call(os.path.join(sandbox, 'test_file1')),
        call(os.path.join(sandbox, 'test_file2'))], any_order=True)
    safe_rmtree.assert_has_calls([call(sandbox)])
def test_sample_by_process_no_process(self, mock_get_active_processes):
  task_path = TaskPath(root='.')
  task_monitor = TaskMonitor(task_path, 'fake-task-id')
  mock_get_active_processes.return_value = []

  task_resource_monitor = TaskResourceMonitor(task_monitor, '.')

  with self.assertRaises(ValueError):
    task_resource_monitor.sample_by_process('fake-process-name')

  assert mock_get_active_processes.mock_calls == [mock.call(task_monitor)]
def test_simple_process():
  with temporary_dir() as td:
    taskpath = TaskPath(root=td, task_id='task', process='process', run=0)
    sandbox = setup_sandbox(td, taskpath)

    p = TestProcess('process', 'echo hello world', 0, taskpath, sandbox)
    p.start()
    rc = wait_for_rc(taskpath.getpath('process_checkpoint'))

    assert rc == 0
    stdout = taskpath.with_filename('stdout').getpath('process_logdir')
    assert os.path.exists(stdout)
    with open(stdout, 'r') as fp:
      assert fp.read() == 'hello world\n'
def run_with_class(process_class):
  with temporary_dir() as td:
    taskpath = TaskPath(root=td, task_id='task', process='process', run=0)
    sandbox = setup_sandbox(td, taskpath)
    with open(os.path.join(sandbox, 'silly_pants'), 'w') as silly_pants:
      p = process_class('process', 'echo test >&%s' % silly_pants.fileno(), 0, taskpath, sandbox)
      p.start()
      return wait_for_rc(taskpath.getpath('process_checkpoint'))
def test_empty_garbage_collector():
  with temporary_dir() as checkpoint_root:
    path = TaskPath(root=checkpoint_root, task_id='does_not_exist')
    gc = TaskGarbageCollector(checkpoint_root, 'does_not_exist')

    assert gc.get_age() == 0

    # assume runner, finished task
    assert set(gc.get_metadata(with_size=False)) == set([
        path.getpath('runner_checkpoint'),
        path.given(state='finished').getpath('task_path'),
    ])

    assert list(gc.get_logs()) == []
    assert list(gc.get_data()) == []
def __init__(self, root, task_id):
  """Construct a TaskMonitor.

  :param root: The checkpoint root of the task.
  :param task_id: The task id of the task.
  """
  pathspec = TaskPath(root=root, task_id=task_id)
  self._dispatcher = CheckpointDispatcher()
  self._runnerstate = RunnerState(processes={})
  self._runner_ckpt = pathspec.getpath('runner_checkpoint')
  self._active_file, self._finished_file = (
      pathspec.given(state=state).getpath('task_path') for state in ('active', 'finished'))
  self._ckpt_head = 0
  self._apply_states()
  self._lock = threading.Lock()
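# A minimal polling sketch against the monitor above (checkpoint root and task
# id are illustrative placeholders):
#
#   monitor = TaskMonitor('/var/run/thermos', 'my_task-001')
#   for process_status, run in monitor.get_active_processes():
#     print(process_status.process, run)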
def test_log_permissions():
  with temporary_dir() as td:
    taskpath = TaskPath(root=td, task_id='task', process='process', run=0)
    sandbox = setup_sandbox(td, taskpath)

    p = TestProcess('process', 'echo hello world', 0, taskpath, sandbox)
    p.start()
    wait_for_rc(taskpath.getpath('process_checkpoint'))

    stdout = taskpath.with_filename('stdout').getpath('process_logdir')
    stderr = taskpath.with_filename('stderr').getpath('process_logdir')
    assert os.path.exists(stdout)
    assert os.path.exists(stderr)
    assert os.stat(stdout).st_uid == os.getuid()
    assert os.stat(stderr).st_uid == os.getuid()
def start(self, timeout=MAX_WAIT):
  """Fork the task runner and return once the underlying task is running, up to timeout."""
  self.forking.set()

  self._monitor = TaskMonitor(TaskPath(root=self._checkpoint_root), self._task_id)

  cmdline_args = self._cmdline()
  log.info('Forking off runner with cmdline: %s' % ' '.join(cmdline_args))

  cwd = os.environ.get('MESOS_DIRECTORY')
  try:
    self._popen = subprocess.Popen(cmdline_args, cwd=cwd)
  except OSError as e:
    raise TaskError(e)

  self.forked.set()
  self.wait_start(timeout=timeout)
def setup_task(self, task, root, finished=False, corrupt=False):
  """Set up the checkpoint stream for the given task in the given checkpoint root, optionally
  finished and/or with a corrupt stream."""
  class FastTaskRunner(TaskRunner):
    COORDINATOR_INTERVAL_SLEEP = Amount(1, Time.MILLISECONDS)
  tr = FastTaskRunner(
      task=task,
      checkpoint_root=root,
      sandbox=os.path.join(root, 'sandbox', task.name().get()),
      clock=ThreadedClock(time.time()))
  with tr.control():
    # initialize checkpoint stream
    pass
  if finished:
    tr.kill()
  if corrupt:
    # TaskPath takes the task id via the task_id keyword
    ckpt_file = TaskPath(root=root, task_id=tr.task_id).getpath('runner_checkpoint')
    with open(ckpt_file, 'w') as f:
      f.write("definitely not a valid checkpoint stream")
  return tr.task_id
def test_simple_process_other_user(*args):
  with temporary_dir() as td:
    some_user = get_other_nonroot_user()
    taskpath = TaskPath(root=td, task_id='task', process='process', run=0)
    sandbox = setup_sandbox(td, taskpath)

    p = TestProcess('process', 'echo hello world', 0, taskpath, sandbox, user=some_user.pw_name)
    p.start()
    wait_for_rc(taskpath.getpath('process_checkpoint'))

    # since we're not actually root, the best we can do is check the right things were attempted
    assert os.setgroups.calledwith([g.gr_gid for g in grp.getgrall() if some_user.pw_name in g])
    assert os.setgid.calledwith(some_user.pw_gid)
    assert os.setuid.calledwith(some_user.pw_uid)
def test_sample_by_process(self, mock_get_active_processes, mock_sample):
  fake_process_name = 'fake-process-name'
  task_path = TaskPath(root='.')
  task_monitor = TaskMonitor(task_path, 'fake-task-id')
  fake_process_status = ProcessStatus(process=fake_process_name)
  mock_get_active_processes.return_value = [(fake_process_status, 1)]
  fake_process_sample = ProcessSample.empty()
  mock_sample.return_value = fake_process_sample

  task_resource_monitor = TaskResourceMonitor(task_monitor, '.')

  assert fake_process_sample == task_resource_monitor.sample_by_process(fake_process_name)
  assert mock_get_active_processes.mock_calls == [mock.call(task_monitor)]
  assert mock_sample.mock_calls == [
      mock.call(task_resource_monitor._process_collectors[fake_process_status])]
def test_log_permissions_other_user(*mocks):
  with temporary_dir() as td:
    some_user = get_other_nonroot_user()
    taskpath = TaskPath(root=td, task_id='task', process='process', run=0)
    sandbox = setup_sandbox(td, taskpath)

    p = TestProcess('process', 'echo hello world', 0, taskpath, sandbox, user=some_user.pw_name)
    p.start()
    wait_for_rc(taskpath.getpath('process_checkpoint'))

    # since we're not actually root, the best we can do is check the right things were attempted
    stdout = taskpath.with_filename('stdout').getpath('process_logdir')
    stderr = taskpath.with_filename('stderr').getpath('process_logdir')
    assert os.path.exists(stdout)
    assert os.path.exists(stderr)
    assert os.chown.calledwith(stdout, some_user.pw_uid, some_user.pw_gid)
    assert os.chown.calledwith(stderr, some_user.pw_uid, some_user.pw_gid)
def __init__(self, task, success_rate=100, random_seed=31337, **extra_task_runner_args):
  """
    task = Thermos task
    success_rate = success rate of writing checkpoint to disk
    random_seed = seed for simulating checkpoint-write failures
  """
  self.task = task

  with temporary_file(cleanup=False) as fp:
    self.job_filename = fp.name
    fp.write(ThermosTaskWrapper(task).to_json())

  self.state_filename = tempfile.mktemp()
  self.tempdir = tempfile.mkdtemp()
  self.task_id = '%s-runner-base' % int(time.time() * 1000000)
  self.sandbox = os.path.join(self.tempdir, 'sandbox')
  self.extra_task_runner_args = extra_task_runner_args
  self.cleaned = False
  self.pathspec = TaskPath(root=self.tempdir, task_id=self.task_id)
  self.script_filename = None
  self.success_rate = success_rate
  self.random_seed = random_seed
  self._run_count = 0
def start(self, timeout=MAX_WAIT):
  """Fork the task runner and return once the underlying task is running, up to timeout."""
  self.forking.set()

  try:
    chmod_plus_x(self._runner_pex)
  except OSError as e:
    if e.errno != errno.EPERM:
      raise TaskError('Failed to chmod +x runner: %s' % e)

  self._monitor = TaskMonitor(TaskPath(root=self._checkpoint_root), self._task_id)

  cmdline_args = self._cmdline()
  log.info('Forking off runner with cmdline: %s' % ' '.join(cmdline_args))

  try:
    self._popen = subprocess.Popen(cmdline_args)
  except OSError as e:
    raise TaskError(e)

  self.forked.set()

  log.debug('Waiting for task to start.')

  def is_started():
    return self._monitor and (self._monitor.active or self._monitor.finished)

  waited = Amount(0, Time.SECONDS)
  while not is_started() and waited < timeout:
    log.debug('  - sleeping...')
    self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
    waited += self.POLL_INTERVAL

  if not is_started():
    log.error('Task did not start within deadline, forcing loss.')
    self.lose()
    raise TaskError('Task did not start within deadline.')
def _runner_ckpt(self, task_id):
  """Return the runner checkpoint file for a given task_id."""
  return TaskPath(root=self._checkpoint_root, task_id=task_id).getpath('runner_checkpoint')
def __init__(self, task, checkpoint_root, sandbox, log_dir=None,
             task_id=None, portmap=None, user=None, chroot=False, clock=time,
             universal_handler=None, planner_class=TaskPlanner, hostname=None,
             process_logger_destination=None, process_logger_mode=None,
             rotate_log_size_mb=None, rotate_log_backups=None, preserve_env=False):
  """
    required:
      task (config.Task) = the task to run
      checkpoint_root (path) = the checkpoint root
      sandbox (path) = the sandbox in which the path will be run
                       [if None, cwd will be assumed, but garbage collection will be
                        disabled for this task.]

    optional:
      log_dir (string) = directory to house stdout/stderr logs.  If not specified, logs will be
                         written into the sandbox directory under .logs/
      task_id (string) = bind to this task id.  if not specified, will synthesize an id based
                         upon task.name()
      portmap (dict) = a map (string => integer) from name to port, e.g. { 'http': 80 }
      user (string) = the user to run the task as.  if not current user, requires setuid
                      privileges.
      chroot (boolean) = whether or not to chroot into the sandbox prior to exec.
      clock (time interface) = the clock to use throughout
      universal_handler = checkpoint record handler (only used for testing)
      planner_class (TaskPlanner class) = TaskPlanner class to use for constructing the task
                      planning policy.
      process_logger_destination (string) = The destination of logger to use for all processes.
      process_logger_mode (string) = The mode of logger to use for all processes.
      rotate_log_size_mb (integer) = The maximum size of the rotated stdout/stderr logs in MiB.
      rotate_log_backups (integer) = The maximum number of rotated stdout/stderr log backups.
      preserve_env (boolean) = whether or not env variables for the runner should be in the
                      env for the task being run
  """
  if not issubclass(planner_class, TaskPlanner):
    raise TypeError('planner_class must be a TaskPlanner.')
  self._clock = clock
  launch_time = self._clock.time()
  launch_time_ms = '%06d' % int((launch_time - int(launch_time)) * 10 ** 6)
  if not task_id:
    self._task_id = '%s-%s.%s' % (
        task.name(),
        time.strftime('%Y%m%d-%H%M%S', time.localtime(launch_time)),
        launch_time_ms)
  else:
    self._task_id = task_id
  current_user = TaskRunnerHelper.get_actual_user()
  self._user = user or current_user
  # TODO(wickman) This should be delegated to the ProcessPlatform / Helper
  if self._user != current_user:
    if os.geteuid() != 0:
      raise ValueError('task specifies user as %s, but %s does not have setuid permission!' % (
          self._user, current_user))
  self._portmap = portmap or {}
  self._launch_time = launch_time
  self._log_dir = log_dir or os.path.join(sandbox, '.logs')
  self._process_logger_destination = process_logger_destination
  self._process_logger_mode = process_logger_mode
  self._rotate_log_size_mb = rotate_log_size_mb
  self._rotate_log_backups = rotate_log_backups
  self._pathspec = TaskPath(root=checkpoint_root, task_id=self._task_id, log_dir=self._log_dir)
  self._hostname = hostname or socket.gethostname()
  try:
    ThermosTaskValidator.assert_valid_task(task)
    ThermosTaskValidator.assert_valid_ports(task, self._portmap)
  except ThermosTaskValidator.InvalidTaskError as e:
    raise self.InvalidTask('Invalid task: %s' % e)
  context = ThermosContext(
      task_id=self._task_id,
      ports=self._portmap,
      user=self._user)
  self._task, uninterp = (task % Environment(thermos=context)).interpolate()
  if len(uninterp) > 0:
    raise self.InvalidTask('Failed to interpolate task, missing: %s' %
                           ', '.join(str(ref) for ref in uninterp))
  try:
    ThermosTaskValidator.assert_same_task(self._pathspec, self._task)
  except ThermosTaskValidator.InvalidTaskError as e:
    raise self.InvalidTask('Invalid task: %s' % e)
  self._plan = None  # plan currently being executed (updated by Handlers)
  self._regular_plan = planner_class(
      self._task, clock=clock,
      process_filter=lambda proc: proc.final().get() is False)
  self._finalizing_plan = planner_class(
      self._task, clock=clock,
      process_filter=lambda proc: proc.final().get() is True)
  self._chroot = chroot
  self._sandbox = sandbox
  self._terminal_state = None
  self._ckpt = None
  self._process_map = dict((p.name().get(), p) for p in self._task.processes())
  self._task_processes = {}
  self._stages = dict((state, stage(self)) for state, stage in self.STAGES.items())
  self._finalization_start = None
  self._preemption_deadline = None
  self._watcher = ProcessMuxer(self._pathspec)
  self._state = RunnerState(processes={})
  self._preserve_env = preserve_env

  # create runner state
  universal_handler = universal_handler or TaskRunnerUniversalHandler
  self._dispatcher = CheckpointDispatcher()
  self._dispatcher.register_handler(universal_handler(self))
  self._dispatcher.register_handler(TaskRunnerProcessHandler(self))
  self._dispatcher.register_handler(TaskRunnerTaskHandler(self))

  # recover checkpointed runner state and update plan
  self._recovery = True
  self._replay_runner_ckpt()
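# A hedged construction sketch for the runner above. The task config, paths
# and portmap are illustrative placeholders, and the run() entry point is an
# assumption about the runner's public interface:
#
#   runner = TaskRunner(
#       task=my_task_config,                  # a config.Task instance
#       checkpoint_root='/var/run/thermos',
#       sandbox='/var/sandbox/my_task',
#       portmap={'http': 31337})
#   runner.run()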