def test_basic_as_job(self):
    """Launch HELLO_WORLD wrapped in a MESOS_JOB and check the resulting updates.

    Asserts that the 'hello_world_hello_world-001' process is present in the
    state read back through TaskMonitor, and that the driver received exactly
    three status updates: TASK_STARTING, TASK_RUNNING, TASK_FINISHED.
    """
    driver = ProxyDriver()

    with temporary_dir() as checkpoint_root:
        executor = AuroraExecutor(
            runner_provider=make_provider(checkpoint_root),
            sandbox_provider=DefaultTestSandboxProvider())
        executor.launchTask(driver, make_task(MESOS_JOB(task=HELLO_WORLD), instanceId=0))
        executor.runner_started.wait()
        # Poll until the executor has populated its status manager.
        while executor._status_manager is None:
            time.sleep(0.1)
        executor.terminated.wait()
        # Read the state while the temporary directory still exists.
        monitor = TaskMonitor(TaskPath(root=checkpoint_root), task_id=HELLO_WORLD_TASK_ID)
        final_state = monitor.get_state()

    assert 'hello_world_hello_world-001' in final_state.processes, (
        'Could not find processes, got: %s' % ' '.join(final_state.processes))

    sent = driver.method_calls['sendStatusUpdate']
    assert len(sent) == 3
    # Each recorded call is indexed as call[0][0] to reach the status update.
    first, second, third = [call[0][0] for call in sent]
    assert first.state == mesos_pb2.TASK_STARTING
    assert second.state == mesos_pb2.TASK_RUNNING
    assert third.state == mesos_pb2.TASK_FINISHED
def test_process_kill(self):
    """SIGKILL the active process and check the recorded process history.

    Expects two runs of 'process': the first KILLED with return code
    -SIGKILL, the second SUCCESS, and a final task status of SUCCESS.
    """
    runner = self.start_runner()
    monitor = TaskMonitor(runner.tempdir, runner.task_id)
    self.wait_until_running(monitor)

    active, run_number = monitor.get_active_processes()[0]
    assert active.process == 'process'
    assert run_number == 0
    os.kill(active.pid, signal.SIGKILL)

    # Poll until the runner object has a `state` attribute.
    while not hasattr(runner, 'state'):
        time.sleep(0.1)

    assert runner.state.statuses[-1].state == TaskState.SUCCESS
    assert 'process' in runner.state.processes
    runs = runner.state.processes['process']
    assert len(runs) == 2
    assert runs[0].state == ProcessState.KILLED
    assert runs[0].return_code == -signal.SIGKILL
    assert runs[1].state == ProcessState.SUCCESS
def add_active_task(self, task_id):
    """Start observing an active task and register it in ``self._active_tasks``.

    Nothing is registered when ``task_id`` is already in
    ``self.finished_tasks`` (logged as an error), or when the state returned
    by the task's TaskMonitor has no header (logged at info level).

    :param task_id: identifier of the task to observe.
    """
    if task_id in self.finished_tasks:
        log.error("Found an active task (%s) in finished tasks?" % task_id)
        return

    task_monitor = TaskMonitor(self._pathspec, task_id)
    # Take one snapshot of the state: the header that is validated is then the
    # same header the sandbox is read from, and the state is loaded only once.
    header = task_monitor.get_state().header
    if not header:
        log.info('Unable to load task "%s"' % task_id)
        return

    resource_monitor = self._resource_monitor(task_monitor, header.sandbox)
    resource_monitor.start()
    self._active_tasks[task_id] = ActiveObservedTask(
        task_id=task_id,
        pathspec=self._pathspec,
        task_monitor=task_monitor,
        resource_monitor=resource_monitor,
    )
def add_active_task(self, task_id):
    """Start observing an active task and register it in ``self._active_tasks``.

    Nothing is registered when ``task_id`` is already in
    ``self.finished_tasks`` (logged as an error), or when the state returned
    by the task's TaskMonitor has no header (logged at info level).

    :param task_id: identifier of the task to observe.
    """
    if task_id in self.finished_tasks:
        log.error('Found an active task (%s) in finished tasks?' % task_id)
        return

    task_monitor = TaskMonitor(self._pathspec, task_id)
    # Take one snapshot of the state: the header that is validated is then the
    # same header the sandbox is read from, and the state is loaded only once.
    header = task_monitor.get_state().header
    if not header:
        log.info('Unable to load task "%s"' % task_id)
        return

    resource_monitor = self._resource_monitor(task_monitor, header.sandbox)
    resource_monitor.start()
    self._active_tasks[task_id] = ActiveObservedTask(
        task_id=task_id,
        pathspec=self._pathspec,
        task_monitor=task_monitor,
        resource_monitor=resource_monitor,
    )
def __on_active(self, root, task_id):
    """Begin observing a task that has become active.

    Creates a TaskMonitor for the task and a resource monitor: a
    NullTaskResourceMonitor when ``self._disable_task_resource_collection`` is
    set, otherwise a TaskResourceMonitor built from the disk-collector and
    interval settings held on ``self``. The resource monitor is started and
    the task is recorded in ``self._active_tasks``. A task already present in
    ``self.finished_tasks`` is logged as an error and ignored.
    """
    log.debug('on_active(%r, %r)', root, task_id)
    if task_id in self.finished_tasks:
        log.error('Found an active task (%s) in finished tasks?', task_id)
        return

    monitor = TaskMonitor(root, task_id)

    if not self._disable_task_resource_collection:
        collector_provider = DiskCollectorProvider(
            self._enable_mesos_disk_collector,
            self._disk_collector_settings)
        usage_monitor = TaskResourceMonitor(
            task_id,
            monitor,
            disk_collector_provider=collector_provider,
            process_collection_interval=self._task_process_collection_interval,
            disk_collection_interval=self._disk_collector_settings.disk_collection_interval)
    else:
        usage_monitor = NullTaskResourceMonitor()

    usage_monitor.start()
    self._active_tasks[task_id] = ActiveObservedTask(root, task_id, monitor, usage_monitor)
def test_sample_by_process_from_history(self, mock_get_active_processes):
    """Exercise TaskResourceMonitor with a history provider wrapping a pre-filled history.

    Two fake processes are reported as active; the history holds one entry
    with an empty ProcessSample for each and a disk usage of 10.
    """
    first_name = 'fake-process-name-1'
    second_name = 'fake-process-name-2'
    task_monitor = TaskMonitor('.', 'fake-task-id')
    first_status = ProcessStatus(process=first_name)
    second_status = ProcessStatus(process=second_name)
    mock_get_active_processes.return_value = [(first_status, 1), (second_status, 2)]

    history = ResourceHistory(2)
    recorded_at = time()
    per_process = {
        first_status: ResourceMonitorBase.ProcResourceResult(ProcessSample.empty(), 1),
        second_status: ResourceMonitorBase.ProcResourceResult(ProcessSample.empty(), 2),
    }
    history.add(recorded_at, ResourceMonitorBase.FullResourceResult(per_process, 10))

    resource_monitor = TaskResourceMonitor(
        'fake-task-id',
        task_monitor,
        history_provider=self.FakeResourceHistoryProvider(history))

    assert resource_monitor.name == 'TaskResourceMonitor[fake-task-id]'
    assert resource_monitor.sample_by_process(first_name) == ProcessSample.empty()
    assert resource_monitor.sample_by_process(second_name) == ProcessSample.empty()

    _, sample = resource_monitor.sample()
    # 1 pid in first_status and 2 in second_status
    assert sample.num_procs == 3
    assert sample.process_sample == ProcessSample.empty()
    assert sample.disk_usage == 10
    assert mock_get_active_processes.mock_calls == [
        mock.call(task_monitor),
        mock.call(task_monitor),
    ]
def from_assigned_task(self, assigned_task, sandbox):
    """Build a ResourceManager for ``assigned_task``.

    The task's resources are paired with a TaskResourceMonitor that is
    configured from ``self._resource_monitor_options``. ``sandbox`` is
    accepted but not used by this implementation.
    """
    task_id = assigned_task.taskId
    mesos_task = mesos_task_instance_from_assigned_task(assigned_task)
    resources = mesos_task.task().resources()
    monitor = TaskMonitor(self._checkpoint_root, task_id)
    usage_monitor = TaskResourceMonitor(task_id, monitor, **self._resource_monitor_options)
    return ResourceManager(resources, usage_monitor)
def tail(args, options):
    """Tail the logs of a task process.

    Usage: thermos tail task_name [process_name]
    """
    # NOTE(review): app.error presumably terminates the command -- confirm.
    argc = len(args)
    if argc == 0:
        app.error('Expected a task to tail, got nothing!')
    if argc not in (1, 2):
        app.error('Expected at most two arguments (task and optional process), got %d' % argc)

    task_id = args[0]
    detector = TaskDetector(root=options.root)
    checkpoint = CheckpointDispatcher.from_file(detector.get_checkpoint(task_id))
    log_dir = checkpoint.header.log_dir

    # All (process, run) pairs for the task, optionally narrowed to one process name.
    candidates = [(name, number)
                  for (name, number) in detector.get_process_runs(task_id, log_dir)]
    if argc == 2:
        candidates = [(name, number) for (name, number) in candidates if name == args[1]]

    if not candidates:
        print('ERROR: No processes found.', file=sys.stderr)
        sys.exit(1)

    names = set(name for name, _ in candidates)
    if len(names) != 1:
        print('ERROR: More than one process matches query.', file=sys.stderr)
        sys.exit(1)

    # Exactly one process matched; tail its highest-numbered run.
    process = names.pop()
    run = max(number for _, number in candidates)

    process_path = TaskPath(
        root=options.root,
        task_id=args[0],
        process=process,
        run=run,
        log_dir=log_dir)
    logdir = process_path.getpath('process_logdir')
    logfile = os.path.join(logdir, 'stderr' if options.use_stderr else 'stdout')

    monitor = TaskMonitor(TaskPath(root=options.root), args[0])

    def log_is_active():
        # True while the selected process/run pair is among the active processes.
        return any(
            status.process == process and active_run == run
            for status, active_run in monitor.get_active_processes())

    if not log_is_active():
        print('Tail of terminal log %s' % logfile)
        for line in tail_closed(logfile):
            print(line.rstrip())
        return

    next_check = time.time() + 5.0
    print('Tail of active log %s' % logfile)
    for line in tail_f(logfile, include_last=True, forever=False):
        print(line.rstrip())
        # After a printed line, at most once per 5 seconds, stop if the run is no longer active.
        if time.time() > next_check:
            if not log_is_active():
                break
            next_check = time.time() + 5.0
def test_pg_is_killed(self):
    """Check that parent and child pids are gone once the task reaches SUCCESS.

    The test waits for 'child.txt' and 'parent.txt' to appear under
    <sandbox>/<task_id>, verifies both pids are live and related, writes
    'exit.txt', then waits for TaskState.SUCCESS and asserts that neither
    pid is listed any more.
    """
    runner = self.start_runner()
    tm = TaskMonitor(runner.tempdir, runner.task_id)
    self.wait_until_running(tm)
    process_state, run_number = tm.get_active_processes()[0]
    assert process_state.process == 'process'
    assert run_number == 0

    child_pidfile = os.path.join(runner.sandbox, runner.task_id, 'child.txt')
    while not os.path.exists(child_pidfile):
        time.sleep(0.1)
    parent_pidfile = os.path.join(runner.sandbox, runner.task_id, 'parent.txt')
    while not os.path.exists(parent_pidfile):
        time.sleep(0.1)
    with open(child_pidfile) as fp:
        child_pid = int(fp.read().rstrip())
    with open(parent_pidfile) as fp:
        parent_pid = int(fp.read().rstrip())

    ps = ProcessProviderFactory.get()
    ps.collect_all()
    assert parent_pid in ps.pids()
    assert child_pid in ps.pids()
    assert child_pid in ps.children_of(parent_pid)

    with open(os.path.join(runner.sandbox, runner.task_id, 'exit.txt'), 'w') as fp:
        fp.write('go away!')

    # Compare by value rather than identity: an `is not` test only works if
    # the returned state happens to be the very same object as the constant.
    while tm.task_state() != TaskState.SUCCESS:
        time.sleep(0.1)

    state = tm.get_state()
    assert state.processes['process'][0].state == ProcessState.SUCCESS

    # Another test case may have called setup_child_subreaping(). We therefore have to reap any
    # terminated re-parented child processes to ensure that we don't list already terminated
    # processes (i.e. zombies) in ps.pids() below.
    TaskRunnerHelper.reap_children()

    ps.collect_all()
    assert parent_pid not in ps.pids()
    assert child_pid not in ps.pids()
def test_coordinator_dead_kill(self):
    """SIGKILL runner, coordinator and process, then force-kill through TaskRunner.get.

    Afterwards the single recorded run of 'ignorant_process' must be LOST.
    """
    runner = self.start_runner()
    monitor = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(monitor)

    active, run_number = monitor.get_active_processes()[0]
    assert active.process == 'ignorant_process'
    assert run_number == 0

    # Kill order matters to the scenario: runner.po first, then the
    # coordinator, then the process itself.
    for pid in (runner.po.pid, active.coordinator_pid, active.pid):
        os.kill(pid, signal.SIGKILL)

    killer = TaskRunner.get(runner.task_id, runner.root)
    assert killer is not None
    killer.kill(force=True)

    runs = monitor.get_state().processes['ignorant_process']
    assert len(runs) == 1
    assert runs[0].state == ProcessState.LOST
def test_preemption_wait(self):
    """Force-kill with a one second preemption_wait and time the kill() call.

    The call must take 1.0s +/- 0.5s, and both the task and the last run of
    'ignorant_process' must end up KILLED.
    """
    runner = self.start_runner()
    monitor = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(monitor)

    active, run_number = monitor.get_active_processes()[0]
    assert active.process == 'ignorant_process'
    assert run_number == 0

    preempter = TaskRunner.get(runner.task_id, runner.root)
    assert preempter is not None

    started = time.time()
    preempter.kill(force=True, preemption_wait=Amount(1, Time.SECONDS))
    elapsed = time.time() - started

    # This is arbitrary, but make sure we finish within half a second of
    # requested preemption wait.
    assert abs(elapsed - 1.0) < 0.5

    assert preempter.state.statuses[-1].state == TaskState.KILLED
    last_run = preempter.state.processes['ignorant_process'][-1]
    assert last_run.state == ProcessState.KILLED
def test_coordinator_dead_kill(self):
    """SIGKILL runner, coordinator and process, then force-kill through TaskRunner.get.

    Afterwards the single recorded run of 'ignorant_process' must be LOST.
    """
    runner = self.start_runner()
    monitor = TaskMonitor(runner.tempdir, runner.task_id)
    self.wait_until_running(monitor)

    active, run_number = monitor.get_active_processes()[0]
    assert active.process == 'ignorant_process'
    assert run_number == 0

    # Kill order matters to the scenario: runner.po first, then the
    # coordinator, then the process itself.
    for pid in (runner.po.pid, active.coordinator_pid, active.pid):
        os.kill(pid, signal.SIGKILL)

    killer = TaskRunner.get(runner.task_id, runner.root)
    assert killer is not None
    killer.kill(force=True)

    runs = monitor.get_state().processes['ignorant_process']
    assert len(runs) == 1
    assert runs[0].state == ProcessState.LOST
def test_basic(self):
    """Launch HELLO_WORLD_MTI through AuroraExecutor and check the resulting updates.

    Asserts that the "hello_world_hello_world-001" process is present in the
    state read back through TaskMonitor, and that the driver received exactly
    three status updates: TASK_STARTING, TASK_RUNNING, TASK_FINISHED.
    """
    driver = ProxyDriver()

    with temporary_dir() as checkpoint_root:
        executor = AuroraExecutor(
            runner_provider=make_provider(checkpoint_root),
            sandbox_provider=DefaultTestSandboxProvider(),
        )
        executor.launchTask(driver, make_task(HELLO_WORLD_MTI))
        executor.terminated.wait()
        # Read the state while the temporary directory still exists.
        monitor = TaskMonitor(checkpoint_root, task_id=HELLO_WORLD_TASK_ID)
        final_state = monitor.get_state()

    assert "hello_world_hello_world-001" in final_state.processes, (
        "Could not find processes, got: %s" % " ".join(final_state.processes)
    )

    sent = driver.method_calls["sendStatusUpdate"]
    assert len(sent) == 3
    # Each recorded call is indexed as call[0][0] to reach the status update.
    first, second, third = [call[0][0] for call in sent]
    assert first.state == mesos_pb2.TASK_STARTING
    assert second.state == mesos_pb2.TASK_RUNNING
    assert third.state == mesos_pb2.TASK_FINISHED
def test_coordinator_kill(self):
    """SIGKILL the coordinator of the active process and check the recorded history.

    Expects two runs of 'process': the first LOST, the second SUCCESS, and a
    final task status of SUCCESS.
    """
    runner = self.start_runner()
    monitor = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(monitor)

    active, run_number = monitor.get_active_processes()[0]
    assert active.process == 'process'
    assert run_number == 0
    os.kill(active.coordinator_pid, signal.SIGKILL)

    # Poll until the runner object has a `state` attribute.
    while not hasattr(runner, 'state'):
        time.sleep(0.1)

    assert runner.state.statuses[-1].state == TaskState.SUCCESS
    assert 'process' in runner.state.processes
    runs = runner.state.processes['process']
    assert len(runs) == 2
    assert runs[0].state == ProcessState.LOST
    assert runs[1].state == ProcessState.SUCCESS
def from_assigned_task(self, assigned_task, sandbox):
    """Build a ResourceManager for ``assigned_task``.

    The task's resources are paired with a TaskResourceMonitor that watches
    ``sandbox.root`` using the disk collector and collection interval
    configured on ``self``.
    """
    task_id = assigned_task.taskId
    mesos_task = mesos_task_instance_from_assigned_task(assigned_task)
    resources = mesos_task.task().resources()

    checkpoint_path = TaskPath(root=self._checkpoint_root, task_id=task_id)
    monitor = TaskMonitor(checkpoint_path, task_id)
    usage_monitor = TaskResourceMonitor(
        monitor,
        sandbox.root,
        disk_collector=self._disk_collector,
        disk_collection_interval=self._disk_collection_interval)
    return ResourceManager(resources, usage_monitor)
def test_basic(self):
    """Launch HELLO_WORLD_MTI through AuroraExecutor and check the resulting updates.

    Asserts that the 'hello_world_hello_world-001' process is present in the
    state read back through TaskMonitor, and that the driver received exactly
    three status updates: TASK_STARTING, TASK_RUNNING, TASK_FINISHED.
    """
    driver = ProxyDriver()

    with temporary_dir() as checkpoint_root:
        executor = AuroraExecutor(
            runner_provider=make_provider(checkpoint_root),
            sandbox_provider=DefaultTestSandboxProvider())
        executor.launchTask(driver, make_task(HELLO_WORLD_MTI))
        executor.terminated.wait()
        # Read the state while the temporary directory still exists.
        monitor = TaskMonitor(checkpoint_root, task_id=HELLO_WORLD_TASK_ID)
        final_state = monitor.get_state()

    assert 'hello_world_hello_world-001' in final_state.processes, (
        'Could not find processes, got: %s' % ' '.join(final_state.processes))

    sent = driver.method_calls['sendStatusUpdate']
    assert len(sent) == 3
    # Each recorded call is indexed as call[0][0] to reach the status update.
    first, second, third = [call[0][0] for call in sent]
    assert first.state == mesos_pb2.TASK_STARTING
    assert second.state == mesos_pb2.TASK_RUNNING
    assert third.state == mesos_pb2.TASK_FINISHED
def test_preemption_wait(self):
    """Force-kill with a one second preemption_wait and time the kill() call.

    The call must take 1.0s +/- 0.5s, and both the task and the last run of
    'ignorant_process' must end up KILLED.
    """
    runner = self.start_runner()
    monitor = TaskMonitor(runner.tempdir, runner.task_id)
    self.wait_until_running(monitor)

    active, run_number = monitor.get_active_processes()[0]
    assert active.process == 'ignorant_process'
    assert run_number == 0

    preempter = TaskRunner.get(runner.task_id, runner.root)
    assert preempter is not None

    started = time.time()
    preempter.kill(force=True, preemption_wait=Amount(1, Time.SECONDS))
    elapsed = time.time() - started

    # This is arbitrary, but make sure we finish within half a second of
    # requested preemption wait.
    assert abs(elapsed - 1.0) < 0.5

    assert preempter.state.statuses[-1].state == TaskState.KILLED
    last_run = preempter.state.processes['ignorant_process'][-1]
    assert last_run.state == ProcessState.KILLED
def test_sample_by_process_no_process(self, mock_get_active_processes):
    """sample_by_process must raise ValueError when no active process is reported."""
    task_monitor = TaskMonitor('.', 'fake-task-id')
    mock_get_active_processes.return_value = []
    resource_monitor = TaskResourceMonitor('fake-task-id', task_monitor)

    with self.assertRaises(ValueError):
        resource_monitor.sample_by_process('fake-process-name')

    # The active-process lookup is expected to have been made exactly once.
    expected_calls = [mock.call(task_monitor)]
    assert mock_get_active_processes.mock_calls == expected_calls
def test_pg_is_killed(self):
    """Check that parent and child pids are gone once the task reaches SUCCESS.

    The test waits for 'child.txt' and 'parent.txt' to appear under
    <sandbox>/<task_id>, verifies both pids are live and related, writes
    'exit.txt', then waits for TaskState.SUCCESS and asserts that neither
    pid is listed any more.
    """
    runner = self.start_runner()
    tm = TaskMonitor(runner.tempdir, runner.task_id)
    self.wait_until_running(tm)
    process_state, run_number = tm.get_active_processes()[0]
    assert process_state.process == 'process'
    assert run_number == 0

    child_pidfile = os.path.join(runner.sandbox, runner.task_id, 'child.txt')
    while not os.path.exists(child_pidfile):
        time.sleep(0.1)
    parent_pidfile = os.path.join(runner.sandbox, runner.task_id, 'parent.txt')
    while not os.path.exists(parent_pidfile):
        time.sleep(0.1)
    with open(child_pidfile) as fp:
        child_pid = int(fp.read().rstrip())
    with open(parent_pidfile) as fp:
        parent_pid = int(fp.read().rstrip())

    ps = ProcessProviderFactory.get()
    ps.collect_all()
    assert parent_pid in ps.pids()
    assert child_pid in ps.pids()
    assert child_pid in ps.children_of(parent_pid)

    with open(os.path.join(runner.sandbox, runner.task_id, 'exit.txt'), 'w') as fp:
        fp.write('go away!')

    # Compare by value rather than identity: an `is not` test only works if
    # the returned state happens to be the very same object as the constant.
    while tm.task_state() != TaskState.SUCCESS:
        time.sleep(0.1)

    state = tm.get_state()
    assert state.processes['process'][0].state == ProcessState.SUCCESS

    ps.collect_all()
    assert parent_pid not in ps.pids()
    assert child_pid not in ps.pids()
def test_process_kill(self):
    """SIGKILL the active process and check the recorded process history.

    Expects two runs of "process": the first KILLED with return code
    -SIGKILL, the second SUCCESS, and a final task status of SUCCESS.
    """
    runner = self.start_runner()
    monitor = TaskMonitor(runner.tempdir, runner.task_id)
    self.wait_until_running(monitor)

    active, run_number = monitor.get_active_processes()[0]
    assert active.process == "process"
    assert run_number == 0
    os.kill(active.pid, signal.SIGKILL)

    # Poll until the runner object has a `state` attribute.
    while not hasattr(runner, "state"):
        time.sleep(0.1)

    assert runner.state.statuses[-1].state == TaskState.SUCCESS
    assert "process" in runner.state.processes
    runs = runner.state.processes["process"]
    assert len(runs) == 2
    assert runs[0].state == ProcessState.KILLED
    assert runs[0].return_code == -signal.SIGKILL
    assert runs[1].state == ProcessState.SUCCESS
def test_basic_as_job(self):
    """Launch HELLO_WORLD wrapped in a MESOS_JOB and check the resulting updates.

    Asserts that the 'hello_world_hello_world-001' process is present in the
    state read back through TaskMonitor, and that the driver received exactly
    three status updates: TASK_STARTING, TASK_RUNNING, TASK_FINISHED.
    """
    driver = ProxyDriver()

    with temporary_dir() as checkpoint_root:
        executor = AuroraExecutor(
            runner_provider=make_provider(checkpoint_root),
            sandbox_provider=DefaultTestSandboxProvider())
        executor.launchTask(driver, make_task(MESOS_JOB(task=HELLO_WORLD), instanceId=0))
        executor.runner_started.wait()
        # Poll until the executor has populated its status manager.
        while executor._status_manager is None:
            time.sleep(0.1)
        executor.terminated.wait()
        # Read the state while the temporary directory still exists.
        monitor = TaskMonitor(checkpoint_root, task_id=HELLO_WORLD_TASK_ID)
        final_state = monitor.get_state()

    assert 'hello_world_hello_world-001' in final_state.processes, (
        'Could not find processes, got: %s' % ' '.join(final_state.processes))

    sent = driver.method_calls['sendStatusUpdate']
    assert len(sent) == 3
    # Each recorded call is indexed as call[0][0] to reach the status update.
    first, second, third = [call[0][0] for call in sent]
    assert first.state == mesos_pb2.TASK_STARTING
    assert second.state == mesos_pb2.TASK_RUNNING
    assert third.state == mesos_pb2.TASK_FINISHED
def __on_active(self, root, task_id):
    """Begin observing a task that has become active.

    Creates a TaskMonitor and a resource monitor (via
    ``self._resource_monitor_class``), starts the resource monitor and
    records the task in ``self._active_tasks``. A task already present in
    ``self.finished_tasks`` is logged as an error and ignored.
    """
    log.debug('on_active(%r, %r)' % (root, task_id))

    if task_id in self.finished_tasks:
        log.error('Found an active task (%s) in finished tasks?' % task_id)
        return

    monitor = TaskMonitor(root, task_id)
    usage_monitor = self._resource_monitor_class(task_id, monitor)
    usage_monitor.start()

    observed = ActiveObservedTask(root, task_id, monitor, usage_monitor)
    self._active_tasks[task_id] = observed
def test_pg_is_killed(self):
    """Check that parent and child pids are gone once the task reaches SUCCESS.

    The test waits for 'child.txt' and 'parent.txt' to appear under
    <sandbox>/<task_id>, verifies both pids are live and related, writes
    'exit.txt', then waits for TaskState.SUCCESS and asserts that neither
    pid is listed any more.
    """
    runner = self.start_runner()
    tm = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(tm)
    process_state, run_number = tm.get_active_processes()[0]
    assert process_state.process == 'process'
    assert run_number == 0

    child_pidfile = os.path.join(runner.sandbox, runner.task_id, 'child.txt')
    while not os.path.exists(child_pidfile):
        time.sleep(0.1)
    parent_pidfile = os.path.join(runner.sandbox, runner.task_id, 'parent.txt')
    while not os.path.exists(parent_pidfile):
        time.sleep(0.1)
    with open(child_pidfile) as fp:
        child_pid = int(fp.read().rstrip())
    with open(parent_pidfile) as fp:
        parent_pid = int(fp.read().rstrip())

    ps = ProcessProviderFactory.get()
    ps.collect_all()
    assert parent_pid in ps.pids()
    assert child_pid in ps.pids()
    assert child_pid in ps.children_of(parent_pid)

    with open(os.path.join(runner.sandbox, runner.task_id, 'exit.txt'), 'w') as fp:
        fp.write('go away!')

    # Compare by value rather than identity: an `is not` test only works if
    # the returned state happens to be the very same object as the constant.
    while tm.task_state() != TaskState.SUCCESS:
        time.sleep(0.1)

    state = tm.get_state()
    assert state.processes['process'][0].state == ProcessState.SUCCESS

    ps.collect_all()
    assert parent_pid not in ps.pids()
    assert child_pid not in ps.pids()
def test_sample_by_process(self, mock_get_active_processes, mock_sample):
    """sample_by_process returns the mocked sample for the one active process.

    Also checks that the active-process lookup happened once and that the
    sample mock was called once with the collector stored for that process.
    """
    process_name = 'fake-process-name'
    task_monitor = TaskMonitor('.', 'fake-task-id')
    status = ProcessStatus(process=process_name)
    mock_get_active_processes.return_value = [(status, 1)]
    expected_sample = ProcessSample.empty()
    mock_sample.return_value = expected_sample

    resource_monitor = TaskResourceMonitor('fake-task-id', task_monitor)

    assert expected_sample == resource_monitor.sample_by_process(process_name)
    assert mock_get_active_processes.mock_calls == [mock.call(task_monitor)]
    collector = resource_monitor._process_collectors[status]
    assert mock_sample.mock_calls == [mock.call(collector)]
def start(self, timeout=MAX_WAIT):
    """Fork the task runner and return once the underlying task is running, up to timeout."""
    self.forking.set()
    self._monitor = TaskMonitor(TaskPath(root=self._checkpoint_root), self._task_id)

    runner_cmd = self._cmdline()
    log.info('Forking off runner with cmdline: %s' % ' '.join(runner_cmd))

    # The runner's working directory is taken from MESOS_DIRECTORY; when the
    # variable is unset, cwd=None is passed to Popen.
    working_dir = os.environ.get('MESOS_DIRECTORY')
    try:
        self._popen = subprocess.Popen(runner_cmd, cwd=working_dir)
    except OSError as e:
        # Surface spawn failures as TaskError.
        raise TaskError(e)

    self.forked.set()
    self.wait_start(timeout=timeout)
def test_coordinator_kill(self):
    """Kill the coordinator, then the process, then the runner, and check the history.

    After the three kills, the runs of 'ignorant_process' must be recorded
    as LOST, KILLED and RUNNING, in that order. The still-running third run
    is SIGKILLed during cleanup.
    """
    runner = self.start_runner()
    tm = TaskMonitor(runner.tempdir, runner.task_id)
    self.wait_until_running(tm)
    process_state, run_number = tm.get_active_processes()[0]
    assert process_state.process == 'ignorant_process'
    assert run_number == 0
    os.kill(process_state.coordinator_pid, signal.SIGKILL)

    # Wait for a run numbered above 0 to become active.
    while True:
        active_procs = tm.get_active_processes()
        if active_procs and active_procs[0][1] > 0:
            break
        time.sleep(0.2)
    self.wait_until_running(tm)

    process_state, run_number = tm.get_active_processes()[0]
    assert process_state.process == 'ignorant_process'
    assert run_number == 1
    os.kill(process_state.pid, signal.SIGKILL)

    # Wait for a run numbered above 1 to become active.
    while True:
        active_procs = tm.get_active_processes()
        if active_procs and active_procs[0][1] > 1:
            break
        time.sleep(0.2)
    self.wait_until_running(tm)

    os.kill(runner.po.pid, signal.SIGKILL)

    # Fetch the state before entering the try block: the cleanup in `finally`
    # dereferences `state`, so it must already be bound. Otherwise a failing
    # get_state() would be masked by an unbound-name error raised in cleanup.
    state = tm.get_state()
    try:
        assert state.processes['ignorant_process'][0].state == ProcessState.LOST
        assert state.processes['ignorant_process'][1].state == ProcessState.KILLED
        assert state.processes['ignorant_process'][2].state == ProcessState.RUNNING
    finally:
        # Clean up the third run regardless of the assertions' outcome.
        os.kill(state.processes['ignorant_process'][2].coordinator_pid, signal.SIGKILL)
        os.kill(state.processes['ignorant_process'][2].pid, signal.SIGKILL)
def start(self, timeout=MAX_WAIT):
    """Fork the task runner and return once the underlying task is running, up to timeout."""
    self.forking.set()

    # Make the runner pex executable. An EPERM failure is ignored; any other
    # OSError is reported as a TaskError.
    try:
        chmod_plus_x(self._runner_pex)
    except OSError as e:
        if e.errno != errno.EPERM:
            raise TaskError('Failed to chmod +x runner: %s' % e)

    self._monitor = TaskMonitor(TaskPath(root=self._checkpoint_root), self._task_id)

    runner_cmd = self._cmdline()
    log.info('Forking off runner with cmdline: %s' % ' '.join(runner_cmd))

    try:
        self._popen = subprocess.Popen(runner_cmd)
    except OSError as e:
        raise TaskError(e)

    self.forked.set()

    log.debug('Waiting for task to start.')

    def task_started():
        # Started means the monitor reports the task as active or finished.
        return self._monitor and (self._monitor.active or self._monitor.finished)

    # Poll in POLL_INTERVAL steps until the task starts or the timeout elapses.
    waited = Amount(0, Time.SECONDS)
    while not task_started() and waited < timeout:
        log.debug('  - sleeping...')
        self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
        waited += self.POLL_INTERVAL

    if task_started():
        return

    log.error('Task did not start with in deadline, forcing loss.')
    self.lose()
    raise TaskError('Task did not start within deadline.')
def test_coordinator_kill(self):
    """Kill the coordinator, then the process, then the runner, and check the history.

    After the three kills, the runs of 'ignorant_process' must be recorded
    as LOST, KILLED and RUNNING, in that order. The still-running third run
    is SIGKILLed during cleanup.
    """
    runner = self.start_runner()
    tm = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(tm)
    process_state, run_number = tm.get_active_processes()[0]
    assert process_state.process == 'ignorant_process'
    assert run_number == 0
    os.kill(process_state.coordinator_pid, signal.SIGKILL)

    # Wait for a run numbered above 0 to become active.
    while True:
        active_procs = tm.get_active_processes()
        if active_procs and active_procs[0][1] > 0:
            break
        time.sleep(0.2)
    self.wait_until_running(tm)

    process_state, run_number = tm.get_active_processes()[0]
    assert process_state.process == 'ignorant_process'
    assert run_number == 1
    os.kill(process_state.pid, signal.SIGKILL)

    # Wait for a run numbered above 1 to become active.
    while True:
        active_procs = tm.get_active_processes()
        if active_procs and active_procs[0][1] > 1:
            break
        time.sleep(0.2)
    self.wait_until_running(tm)

    os.kill(runner.po.pid, signal.SIGKILL)

    # Fetch the state before entering the try block: the cleanup in `finally`
    # dereferences `state`, so it must already be bound. Otherwise a failing
    # get_state() would be masked by an unbound-name error raised in cleanup.
    state = tm.get_state()
    try:
        assert state.processes['ignorant_process'][0].state == ProcessState.LOST
        assert state.processes['ignorant_process'][1].state == ProcessState.KILLED
        assert state.processes['ignorant_process'][2].state == ProcessState.RUNNING
    finally:
        # Clean up the third run regardless of the assertions' outcome.
        os.kill(state.processes['ignorant_process'][2].coordinator_pid, signal.SIGKILL)
        os.kill(state.processes['ignorant_process'][2].pid, signal.SIGKILL)