def test_process_kill(self):
    """SIGKILL the first run of 'process' and check how the task recovers.

    After the runner has published its final state, the task must have ended
    in SUCCESS with exactly two recorded runs of 'process': the first KILLED
    with return code ``-signal.SIGKILL`` and the second SUCCESS.
    """
    runner = self.start_runner()
    monitor = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(monitor)

    first_active = monitor.get_active_processes()[0]
    process_state, run_number = first_active
    assert process_state.process == 'process'
    assert run_number == 0

    os.kill(process_state.pid, signal.SIGKILL)

    # Spin until the runner object has grown a `state` attribute.
    while not hasattr(runner, 'state'):
        time.sleep(0.1)

    assert runner.state.statuses[-1].state == TaskState.SUCCESS
    assert 'process' in runner.state.processes

    runs = runner.state.processes['process']
    assert len(runs) == 2
    killed_run, successful_run = runs[0], runs[1]
    assert killed_run.state == ProcessState.KILLED
    assert killed_run.return_code == -signal.SIGKILL
    assert successful_run.state == ProcessState.SUCCESS
def test_basic_as_job(self):
    """Launch HELLO_WORLD wrapped in MESOS_JOB and check checkpoint and updates.

    The checkpointed runner state must contain the
    'hello_world_hello_world-001' process, and the driver must have received
    exactly three status updates: STARTING, RUNNING, FINISHED (in that order).
    """
    driver = ProxyDriver()

    with temporary_dir() as checkpoint_root:
        executor = ThermosExecutor(
            runner_provider=make_provider(checkpoint_root),
            sandbox_provider=DefaultTestSandboxProvider)
        task = make_task(MESOS_JOB(task=HELLO_WORLD), instanceId=0)
        executor.launchTask(driver, task)
        executor.runner_started.wait()

        # Spin until the executor has a status manager before waiting on
        # termination.
        while executor._status_manager is None:
            time.sleep(0.1)
        executor.terminated.wait()

        # The state must be read while the temporary directory still exists.
        monitor = TaskMonitor(TaskPath(root=checkpoint_root), task_id=HELLO_WORLD_TASK_ID)
        runner_state = monitor.get_state()

    assert 'hello_world_hello_world-001' in runner_state.processes, (
        'Could not find processes, got: %s' % ' '.join(runner_state.processes))

    updates = driver.method_calls['sendStatusUpdate']
    assert len(updates) == 3

    # Each recorded call is indexed as call[0][0] to reach the status object.
    status_updates = [call[0][0] for call in updates]
    expected_states = (mesos_pb.TASK_STARTING, mesos_pb.TASK_RUNNING, mesos_pb.TASK_FINISHED)
    for status, expected in zip(status_updates, expected_states):
        assert status.state == expected
def add_active_task(self, task_id):
    """Start observing ``task_id`` as an active task.

    Logs and returns without registering anything when the task is already
    present in ``self.finished_tasks`` or when its monitored state has no
    header. Otherwise a resource monitor is built for the task's sandbox and
    started, and an ``ActiveObservedTask`` is stored under ``task_id`` in
    ``self._active_tasks``.
    """
    if task_id in self.finished_tasks:
        log.error('Found an active task (%s) in finished tasks?' % task_id)
        return

    task_monitor = TaskMonitor(self._pathspec, task_id)

    # Fetch the state a single time so that the header which is validated is
    # the very same one the sandbox is read from, and the state is not
    # loaded twice for one registration.
    header = task_monitor.get_state().header
    if not header:
        log.info('Unable to load task "%s"' % task_id)
        return

    resource_monitor = self._resource_monitor(task_monitor, header.sandbox)
    resource_monitor.start()
    self._active_tasks[task_id] = ActiveObservedTask(
        task_id=task_id,
        pathspec=self._pathspec,
        task_monitor=task_monitor,
        resource_monitor=resource_monitor,
    )
def test_coordinator_dead_kill(self):
    """Kill runner, coordinator and process, then force-kill via a new TaskRunner.

    After the forced kill, the state must hold exactly one run of
    "ignorant_process" and that run must be LOST.
    """
    runner = self.start_runner()
    monitor = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(monitor)

    first_active = monitor.get_active_processes()[0]
    process_state, run_number = first_active
    assert process_state.process == "ignorant_process"
    assert run_number == 0

    # Kill order matters: runner first, then the coordinator, then the process.
    doomed_pids = (runner.po.pid, process_state.coordinator_pid, process_state.pid)
    for pid in doomed_pids:
        os.kill(pid, signal.SIGKILL)

    killer = TaskRunner.get(runner.task_id, runner.root)
    assert killer is not None
    killer.kill(force=True)

    runs = monitor.get_state().processes["ignorant_process"]
    assert len(runs) == 1
    assert runs[0].state == ProcessState.LOST
def test_coordinator_dead_kill(self):
    """Kill runner, coordinator and process, then force-kill via a new TaskRunner.

    After the forced kill, the state must hold exactly one run of
    'ignorant_process' and that run must be LOST.
    """
    runner = self.start_runner()
    monitor = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(monitor)

    first_active = monitor.get_active_processes()[0]
    process_state, run_number = first_active
    assert process_state.process == 'ignorant_process'
    assert run_number == 0

    # Kill order matters: runner first, then the coordinator, then the process.
    doomed_pids = (runner.po.pid, process_state.coordinator_pid, process_state.pid)
    for pid in doomed_pids:
        os.kill(pid, signal.SIGKILL)

    killer = TaskRunner.get(runner.task_id, runner.root)
    assert killer is not None
    killer.kill(force=True)

    runs = monitor.get_state().processes['ignorant_process']
    assert len(runs) == 1
    assert runs[0].state == ProcessState.LOST
def test_preemption_wait(self):
    """Force-kill with a one second preemption wait and time the call.

    The kill must take within half a second of the requested one second, and
    afterwards both the task and the last run of "ignorant_process" must be
    KILLED.
    """
    runner = self.start_runner()
    monitor = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(monitor)

    first_active = monitor.get_active_processes()[0]
    process_state, run_number = first_active
    assert process_state.process == "ignorant_process"
    assert run_number == 0

    preempter = TaskRunner.get(runner.task_id, runner.root)
    assert preempter is not None

    started_at = time.time()
    preempter.kill(force=True, preemption_wait=Amount(1, Time.SECONDS))
    duration = time.time() - started_at

    # This is arbitrary, but make sure we finish within half a second of
    # requested preemption wait.
    assert abs(duration - 1.0) < 0.5

    assert preempter.state.statuses[-1].state == TaskState.KILLED
    last_run = preempter.state.processes["ignorant_process"][-1]
    assert last_run.state == ProcessState.KILLED
def test_basic(self):
    """Launch HELLO_WORLD_MTI and check the checkpoint and the status updates.

    The checkpointed runner state must contain the
    'hello_world_hello_world-001' process, and the driver must have received
    exactly three status updates: STARTING, RUNNING, FINISHED (in that order).
    """
    driver = ProxyDriver()

    with temporary_dir() as checkpoint_root:
        executor = ThermosExecutor(
            runner_provider=make_provider(checkpoint_root),
            sandbox_provider=DefaultTestSandboxProvider)
        task = make_task(HELLO_WORLD_MTI)
        executor.launchTask(driver, task)
        executor.terminated.wait()

        # The state must be read while the temporary directory still exists.
        monitor = TaskMonitor(TaskPath(root=checkpoint_root), task_id=HELLO_WORLD_TASK_ID)
        runner_state = monitor.get_state()

    assert 'hello_world_hello_world-001' in runner_state.processes, (
        'Could not find processes, got: %s' % ' '.join(runner_state.processes))

    updates = driver.method_calls['sendStatusUpdate']
    assert len(updates) == 3

    # Each recorded call is indexed as call[0][0] to reach the status object.
    status_updates = [call[0][0] for call in updates]
    expected_states = (mesos_pb.TASK_STARTING, mesos_pb.TASK_RUNNING, mesos_pb.TASK_FINISHED)
    for status, expected in zip(status_updates, expected_states):
        assert status.state == expected
def test_preemption_wait(self):
    """Force-kill with a one second preemption wait and time the call.

    The kill must take within half a second of the requested one second, and
    afterwards both the task and the last run of 'ignorant_process' must be
    KILLED.
    """
    runner = self.start_runner()
    monitor = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(monitor)

    first_active = monitor.get_active_processes()[0]
    process_state, run_number = first_active
    assert process_state.process == 'ignorant_process'
    assert run_number == 0

    preempter = TaskRunner.get(runner.task_id, runner.root)
    assert preempter is not None

    started_at = time.time()
    preempter.kill(force=True, preemption_wait=Amount(1, Time.SECONDS))
    duration = time.time() - started_at

    # This is arbitrary, but make sure we finish within half a second of
    # requested preemption wait.
    assert abs(duration - 1.0) < 0.5

    assert preempter.state.statuses[-1].state == TaskState.KILLED
    last_run = preempter.state.processes['ignorant_process'][-1]
    assert last_run.state == ProcessState.KILLED
def test_coordinator_kill(self):
    """SIGKILL the coordinator of "process" and check how the task recovers.

    After the runner has published its final state, the task must have ended
    in SUCCESS with exactly two recorded runs of "process": the first LOST
    and the second SUCCESS.
    """
    runner = self.start_runner()
    monitor = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(monitor)

    first_active = monitor.get_active_processes()[0]
    process_state, run_number = first_active
    assert process_state.process == "process"
    assert run_number == 0

    os.kill(process_state.coordinator_pid, signal.SIGKILL)

    # Spin until the runner object has grown a `state` attribute.
    while not hasattr(runner, "state"):
        time.sleep(0.1)

    assert runner.state.statuses[-1].state == TaskState.SUCCESS
    assert "process" in runner.state.processes

    runs = runner.state.processes["process"]
    assert len(runs) == 2
    lost_run, successful_run = runs[0], runs[1]
    assert lost_run.state == ProcessState.LOST
    assert successful_run.state == ProcessState.SUCCESS
def test_pg_is_killed(self):
    """Check that the parent and child pids are gone once the task succeeds.

    The task is expected to write its parent and child pids to 'parent.txt'
    and 'child.txt' under ``<sandbox>/<task_id>``. Once both are seen alive
    (and the child is reported as a child of the parent), 'exit.txt' is
    written, the test waits for the task to reach SUCCESS, and then asserts
    that neither pid is present any more.
    """
    runner = self.start_runner()
    tm = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(tm)

    process_state, run_number = tm.get_active_processes()[0]
    assert process_state.process == 'process'
    assert run_number == 0

    task_dir = os.path.join(runner.sandbox, runner.task_id)

    # Wait for both pid files to show up before reading either of them.
    child_pidfile = os.path.join(task_dir, 'child.txt')
    while not os.path.exists(child_pidfile):
        time.sleep(0.1)
    parent_pidfile = os.path.join(task_dir, 'parent.txt')
    while not os.path.exists(parent_pidfile):
        time.sleep(0.1)

    with open(child_pidfile) as fp:
        child_pid = int(fp.read().rstrip())
    with open(parent_pidfile) as fp:
        parent_pid = int(fp.read().rstrip())

    ps = ProcessProviderFactory.get()
    ps.collect_all()
    assert parent_pid in ps.pids()
    assert child_pid in ps.pids()
    assert child_pid in ps.children_of(parent_pid)

    # Creating exit.txt is the signal after which the task is expected to
    # finish.
    with open(os.path.join(task_dir, 'exit.txt'), 'w') as fp:
        fp.write('go away!')

    # Compare by value rather than identity: `is not` would only end this
    # loop if task_state() returned the very same object as
    # TaskState.SUCCESS, which equality does not depend on.
    while tm.task_state() != TaskState.SUCCESS:
        time.sleep(0.1)

    state = tm.get_state()
    assert state.processes['process'][0].state == ProcessState.SUCCESS

    # Refresh the process table before checking that both pids are gone.
    ps.collect_all()
    assert parent_pid not in ps.pids()
    assert child_pid not in ps.pids()
def test_pg_is_killed(self):
    """Check that the parent and child pids are gone once the task succeeds.

    The task is expected to write its parent and child pids to "parent.txt"
    and "child.txt" under ``<sandbox>/<task_id>``. Once both are seen alive
    (and the child is reported as a child of the parent), "exit.txt" is
    written, the test waits for the task to reach SUCCESS, and then asserts
    that neither pid is present any more.
    """
    runner = self.start_runner()
    tm = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(tm)

    process_state, run_number = tm.get_active_processes()[0]
    assert process_state.process == "process"
    assert run_number == 0

    task_dir = os.path.join(runner.sandbox, runner.task_id)

    # Wait for both pid files to show up before reading either of them.
    child_pidfile = os.path.join(task_dir, "child.txt")
    while not os.path.exists(child_pidfile):
        time.sleep(0.1)
    parent_pidfile = os.path.join(task_dir, "parent.txt")
    while not os.path.exists(parent_pidfile):
        time.sleep(0.1)

    with open(child_pidfile) as fp:
        child_pid = int(fp.read().rstrip())
    with open(parent_pidfile) as fp:
        parent_pid = int(fp.read().rstrip())

    ps = ProcessProviderFactory.get()
    ps.collect_all()
    assert parent_pid in ps.pids()
    assert child_pid in ps.pids()
    assert child_pid in ps.children_of(parent_pid)

    # Creating exit.txt is the signal after which the task is expected to
    # finish.
    with open(os.path.join(task_dir, "exit.txt"), "w") as fp:
        fp.write("go away!")

    # Compare by value rather than identity: `is not` would only end this
    # loop if task_state() returned the very same object as
    # TaskState.SUCCESS, which equality does not depend on.
    while tm.task_state() != TaskState.SUCCESS:
        time.sleep(0.1)

    state = tm.get_state()
    assert state.processes["process"][0].state == ProcessState.SUCCESS

    # Refresh the process table before checking that both pids are gone.
    ps.collect_all()
    assert parent_pid not in ps.pids()
    assert child_pid not in ps.pids()
def start(self, timeout=MAX_WAIT):
    """Fork the task runner and block until the task has started or timeout passes.

    ``self.forking`` is set before any work is done and ``self.forked`` once
    the runner subprocess has been spawned. The task monitor is then polled
    every ``POLL_INTERVAL`` until it reports the task as active or finished,
    or until the accumulated wait reaches ``timeout``.

    Raises:
      TaskError: if making the runner executable fails with anything other
        than EPERM, if spawning the subprocess raises OSError, or if the task
        has still not started when the wait ends (``self.lose()`` is called
        first in that case).
    """
    self.forking.set()

    try:
        chmod_plus_x(self._runner_pex)
    except OSError as chmod_error:
        # EPERM is tolerated here; every other errno is fatal.
        if chmod_error.errno != errno.EPERM:
            raise TaskError('Failed to chmod +x runner: %s' % chmod_error)

    checkpoint_path = TaskPath(root=self._checkpoint_root)
    self._monitor = TaskMonitor(checkpoint_path, self._task_id)

    argv = self._cmdline()
    log.info('Forking off runner with cmdline: %s' % ' '.join(argv))

    try:
        self._popen = subprocess.Popen(argv)
    except OSError as spawn_error:
        raise TaskError(spawn_error)

    self.forked.set()
    log.debug('Waiting for task to start.')

    def task_has_started():
        # Truthy once a monitor exists and it reports active or finished.
        return self._monitor and (self._monitor.active or self._monitor.finished)

    elapsed = Amount(0, Time.SECONDS)
    while True:
        if task_has_started() or not (elapsed < timeout):
            break
        log.debug(' - sleeping...')
        self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
        elapsed += self.POLL_INTERVAL

    # Re-check after the loop: it may have ended because the time ran out.
    if task_has_started():
        return

    log.error('Task did not start with in deadline, forcing loss.')
    self.lose()
    raise TaskError('Task did not start within deadline.')
def test_coordinator_kill(self):
    """Kill the coordinator, then the process, then the runner, and check runs.

    Run 0 of 'ignorant_process' loses its coordinator (expected LOST), run 1
    has its process killed (expected KILLED), and run 2 is left RUNNING when
    the runner itself is killed. The leftover coordinator and process of run
    2 are killed on the way out regardless of the assertion results.
    """
    runner = self.start_runner()
    tm = TaskMonitor(runner.pathspec, runner.task_id)

    def wait_for_run_after(previous_run):
        # Poll until the first active process has a run number above
        # `previous_run`, then wait for the task to be running again.
        while True:
            active_procs = tm.get_active_processes()
            if active_procs and active_procs[0][1] > previous_run:
                break
            time.sleep(0.2)
        self.wait_until_running(tm)

    self.wait_until_running(tm)
    process_state, run_number = tm.get_active_processes()[0]
    assert process_state.process == 'ignorant_process'
    assert run_number == 0
    os.kill(process_state.coordinator_pid, signal.SIGKILL)

    wait_for_run_after(0)
    process_state, run_number = tm.get_active_processes()[0]
    assert process_state.process == 'ignorant_process'
    assert run_number == 1
    os.kill(process_state.pid, signal.SIGKILL)

    wait_for_run_after(1)
    os.kill(runner.po.pid, signal.SIGKILL)

    # Read the state before entering `try`: the cleanup in `finally` needs
    # it, so a failure here must surface as itself instead of being replaced
    # by an unbound-name error raised from the cleanup.
    state = tm.get_state()
    try:
        assert state.processes['ignorant_process'][0].state == ProcessState.LOST
        assert state.processes['ignorant_process'][1].state == ProcessState.KILLED
        assert state.processes['ignorant_process'][2].state == ProcessState.RUNNING
    finally:
        # The runner is dead, so the third run has to be reaped by hand.
        os.kill(state.processes['ignorant_process'][2].coordinator_pid, signal.SIGKILL)
        os.kill(state.processes['ignorant_process'][2].pid, signal.SIGKILL)
def test_coordinator_kill(self):
    """Kill the coordinator, then the process, then the runner, and check runs.

    Run 0 of "ignorant_process" loses its coordinator (expected LOST), run 1
    has its process killed (expected KILLED), and run 2 is left RUNNING when
    the runner itself is killed. The leftover coordinator and process of run
    2 are killed on the way out regardless of the assertion results.
    """
    runner = self.start_runner()
    tm = TaskMonitor(runner.pathspec, runner.task_id)

    def wait_for_run_after(previous_run):
        # Poll until the first active process has a run number above
        # `previous_run`, then wait for the task to be running again.
        while True:
            active_procs = tm.get_active_processes()
            if active_procs and active_procs[0][1] > previous_run:
                break
            time.sleep(0.2)
        self.wait_until_running(tm)

    self.wait_until_running(tm)
    process_state, run_number = tm.get_active_processes()[0]
    assert process_state.process == "ignorant_process"
    assert run_number == 0
    os.kill(process_state.coordinator_pid, signal.SIGKILL)

    wait_for_run_after(0)
    process_state, run_number = tm.get_active_processes()[0]
    assert process_state.process == "ignorant_process"
    assert run_number == 1
    os.kill(process_state.pid, signal.SIGKILL)

    wait_for_run_after(1)
    os.kill(runner.po.pid, signal.SIGKILL)

    # Read the state before entering `try`: the cleanup in `finally` needs
    # it, so a failure here must surface as itself instead of being replaced
    # by an unbound-name error raised from the cleanup.
    state = tm.get_state()
    try:
        assert state.processes["ignorant_process"][0].state == ProcessState.LOST
        assert state.processes["ignorant_process"][1].state == ProcessState.KILLED
        assert state.processes["ignorant_process"][2].state == ProcessState.RUNNING
    finally:
        # The runner is dead, so the third run has to be reaped by hand.
        os.kill(state.processes["ignorant_process"][2].coordinator_pid, signal.SIGKILL)
        os.kill(state.processes["ignorant_process"][2].pid, signal.SIGKILL)
def tail(args, options):
    """Tail the logs of a task process.

    Usage: thermos tail task_name [process_name]
    """
    arg_count = len(args)
    if arg_count == 0:
        app.error('Expected a task to tail, got nothing!')
    if arg_count not in (1, 2):
        app.error(
            'Expected at most two arguments (task and optional process), got %d' % arg_count)

    task_id = args[0]
    detector = TaskDetector(root=options.root)
    checkpoint_file = detector.get_checkpoint(task_id)
    checkpoint = CheckpointDispatcher.from_file(checkpoint_file)
    log_dir = checkpoint.header.log_dir

    # Collect every (process, run) pair, optionally narrowed to one process.
    process_runs = [
        (name, number) for name, number in detector.get_process_runs(task_id, log_dir)]
    if arg_count == 2:
        wanted = args[1]
        process_runs = [(name, number) for name, number in process_runs if name == wanted]

    if not process_runs:
        print('ERROR: No processes found.', file=sys.stderr)
        sys.exit(1)

    # The query has to resolve to exactly one process name.
    processes = set(name for name, _ in process_runs)
    if len(processes) != 1:
        print('ERROR: More than one process matches query.', file=sys.stderr)
        sys.exit(1)

    # Tail the highest run number of that single process.
    process = processes.pop()
    run = max(number for _, number in process_runs)

    process_path = TaskPath(
        root=options.root,
        task_id=task_id,
        process=process,
        run=run,
        log_dir=log_dir)
    logdir = process_path.getpath('process_logdir')
    stream_name = 'stderr' if options.use_stderr else 'stdout'
    logfile = os.path.join(logdir, stream_name)

    monitor = TaskMonitor(TaskPath(root=options.root), task_id)

    def log_is_active():
        # True while the selected process/run pair is among the active ones.
        return any(
            status.process == process and active_run == run
            for status, active_run in monitor.get_active_processes())

    if not log_is_active():
        print('Tail of terminal log %s' % logfile)
        for line in tail_closed(logfile):
            print(line.rstrip())
        return

    next_check = time.time() + 5.0
    print('Tail of active log %s' % logfile)
    for line in tail_f(logfile, include_last=True, forever=False):
        print(line.rstrip())
        if time.time() <= next_check:
            continue
        # At most every five seconds, stop once the run is no longer active.
        if not log_is_active():
            break
        next_check = time.time() + 5.0