def test_process_kill(self):
        """SIGKILL the active process and check how the runs are recorded.

        Expects the task to end in SUCCESS with two runs of 'process': the
        first KILLED with return code -SIGKILL, the second SUCCESS.
        """
        runner = self.start_runner()
        monitor = TaskMonitor(runner.pathspec, runner.task_id)
        self.wait_until_running(monitor)

        active_process, active_run = monitor.get_active_processes()[0]
        assert active_process.process == 'process'
        assert active_run == 0
        os.kill(active_process.pid, signal.SIGKILL)

        # Poll until the runner object exposes a 'state' attribute.
        while not hasattr(runner, 'state'):
            time.sleep(0.1)

        assert runner.state.statuses[-1].state == TaskState.SUCCESS
        assert 'process' in runner.state.processes
        runs = runner.state.processes['process']
        assert len(runs) == 2
        assert runs[0].state == ProcessState.KILLED
        assert runs[0].return_code == -signal.SIGKILL
        assert runs[1].state == ProcessState.SUCCESS
Example #2
0
    def test_basic_as_job(self):
        """Launch HELLO_WORLD wrapped in MESOS_JOB and check the status updates sent.

        Expects the 'hello_world_hello_world-001' process in the final runner
        state and exactly three updates: STARTING, RUNNING, FINISHED.
        """
        driver = ProxyDriver()

        with temporary_dir() as checkpoint_root:
            executor = ThermosExecutor(
                runner_provider=make_provider(checkpoint_root),
                sandbox_provider=DefaultTestSandboxProvider)
            task = make_task(MESOS_JOB(task=HELLO_WORLD), instanceId=0)
            executor.launchTask(driver, task)
            executor.runner_started.wait()
            # Poll until the executor has set its status manager.
            while executor._status_manager is None:
                time.sleep(0.1)
            executor.terminated.wait()
            monitor = TaskMonitor(TaskPath(root=checkpoint_root),
                                  task_id=HELLO_WORLD_TASK_ID)
            runner_state = monitor.get_state()

        assert 'hello_world_hello_world-001' in runner_state.processes, (
            'Could not find processes, got: %s' %
            ' '.join(runner_state.processes))

        updates = driver.method_calls['sendStatusUpdate']
        assert len(updates) == 3
        # Each recorded call is indexed as call[0][0] to reach the status object.
        status_updates = [call[0][0] for call in updates]
        expected_states = (
            mesos_pb.TASK_STARTING,
            mesos_pb.TASK_RUNNING,
            mesos_pb.TASK_FINISHED,
        )
        for update, expected in zip(status_updates, expected_states):
            assert update.state == expected
Example #3
0
 def add_active_task(self, task_id):
   """Start observing an active task and record it in self._active_tasks.

   Only logs and returns when task_id is already in self.finished_tasks, or
   when the task monitor's state has no header. Otherwise a resource monitor
   is built for the header's sandbox, started, and stored together with the
   task monitor as an ActiveObservedTask keyed by task_id.
   """
   if task_id in self.finished_tasks:
     log.error('Found an active task (%s) in finished tasks?' % task_id)
     return
   task_monitor = TaskMonitor(self._pathspec, task_id)
   # Take a single state snapshot so the header check and the sandbox lookup
   # see the same data, and the state is not fetched twice.
   header = task_monitor.get_state().header
   if not header:
     log.info('Unable to load task "%s"' % task_id)
     return
   resource_monitor = self._resource_monitor(task_monitor, header.sandbox)
   resource_monitor.start()
   self._active_tasks[task_id] = ActiveObservedTask(
     task_id=task_id, pathspec=self._pathspec,
     task_monitor=task_monitor, resource_monitor=resource_monitor
   )
    def test_coordinator_dead_kill(self):
        """Force-kill a task after its pids have already been SIGKILLed.

        SIGKILLs runner.po, the coordinator and the process, then has a
        TaskRunner obtained via TaskRunner.get() perform a forced kill. The
        single 'ignorant_process' run is expected to end up LOST.
        """
        runner = self.start_runner()
        monitor = TaskMonitor(runner.pathspec, runner.task_id)
        self.wait_until_running(monitor)
        active_process, active_run = monitor.get_active_processes()[0]
        assert active_process.process == "ignorant_process"
        assert active_run == 0

        # Kill order matters: runner.po first, then coordinator, then process.
        victims = (
            runner.po.pid,
            active_process.coordinator_pid,
            active_process.pid,
        )
        for victim_pid in victims:
            os.kill(victim_pid, signal.SIGKILL)

        killer = TaskRunner.get(runner.task_id, runner.root)
        assert killer is not None
        killer.kill(force=True)

        runs = monitor.get_state().processes["ignorant_process"]
        assert len(runs) == 1
        assert runs[0].state == ProcessState.LOST
    def test_coordinator_dead_kill(self):
        """Force-kill a task after its pids have already been SIGKILLed.

        SIGKILLs runner.po, the coordinator and the process, then has a
        TaskRunner obtained via TaskRunner.get() perform a forced kill. The
        single 'ignorant_process' run is expected to end up LOST.
        """
        runner = self.start_runner()
        monitor = TaskMonitor(runner.pathspec, runner.task_id)
        self.wait_until_running(monitor)
        active_process, active_run = monitor.get_active_processes()[0]
        assert active_process.process == 'ignorant_process'
        assert active_run == 0

        # Kill order matters: runner.po first, then coordinator, then process.
        victims = (
            runner.po.pid,
            active_process.coordinator_pid,
            active_process.pid,
        )
        for victim_pid in victims:
            os.kill(victim_pid, signal.SIGKILL)

        killer = TaskRunner.get(runner.task_id, runner.root)
        assert killer is not None
        killer.kill(force=True)

        runs = monitor.get_state().processes['ignorant_process']
        assert len(runs) == 1
        assert runs[0].state == ProcessState.LOST
    def test_preemption_wait(self):
        """Check that a forced kill with a one-second preemption wait takes about a second.

        Also expects the task's last status and the last 'ignorant_process'
        run to be KILLED afterwards.
        """
        runner = self.start_runner()
        monitor = TaskMonitor(runner.pathspec, runner.task_id)
        self.wait_until_running(monitor)
        active_process, active_run = monitor.get_active_processes()[0]
        assert active_process.process == "ignorant_process"
        assert active_run == 0

        preempter = TaskRunner.get(runner.task_id, runner.root)
        assert preempter is not None
        started = time.time()
        preempter.kill(force=True, preemption_wait=Amount(1, Time.SECONDS))
        elapsed = time.time() - started

        # The tolerance is arbitrary: the kill must complete within half a
        # second of the requested preemption wait.
        assert abs(elapsed - 1.0) < 0.5

        assert preempter.state.statuses[-1].state == TaskState.KILLED
        last_run = preempter.state.processes["ignorant_process"][-1]
        assert last_run.state == ProcessState.KILLED
  def test_basic(self):
    """Launch HELLO_WORLD_MTI and check the status updates sent to the driver.

    Expects the 'hello_world_hello_world-001' process in the final runner
    state and exactly three updates: STARTING, RUNNING, FINISHED.
    """
    driver = ProxyDriver()

    with temporary_dir() as checkpoint_root:
      executor = ThermosExecutor(
          runner_provider=make_provider(checkpoint_root),
          sandbox_provider=DefaultTestSandboxProvider)
      task = make_task(HELLO_WORLD_MTI)
      executor.launchTask(driver, task)
      executor.terminated.wait()
      monitor = TaskMonitor(TaskPath(root=checkpoint_root), task_id=HELLO_WORLD_TASK_ID)
      runner_state = monitor.get_state()

    assert 'hello_world_hello_world-001' in runner_state.processes, (
      'Could not find processes, got: %s' % ' '.join(runner_state.processes))

    updates = driver.method_calls['sendStatusUpdate']
    assert len(updates) == 3
    # Each recorded call is indexed as call[0][0] to reach the status object.
    status_updates = [call[0][0] for call in updates]
    expected_states = (
      mesos_pb.TASK_STARTING,
      mesos_pb.TASK_RUNNING,
      mesos_pb.TASK_FINISHED,
    )
    for update, expected in zip(status_updates, expected_states):
      assert update.state == expected
    def test_preemption_wait(self):
        """Check that a forced kill with a one-second preemption wait takes about a second.

        Also expects the task's last status and the last 'ignorant_process'
        run to be KILLED afterwards.
        """
        runner = self.start_runner()
        monitor = TaskMonitor(runner.pathspec, runner.task_id)
        self.wait_until_running(monitor)
        active_process, active_run = monitor.get_active_processes()[0]
        assert active_process.process == 'ignorant_process'
        assert active_run == 0

        preempter = TaskRunner.get(runner.task_id, runner.root)
        assert preempter is not None
        started = time.time()
        preempter.kill(force=True, preemption_wait=Amount(1, Time.SECONDS))
        elapsed = time.time() - started

        # The tolerance is arbitrary: the kill must complete within half a
        # second of the requested preemption wait.
        assert abs(elapsed - 1.0) < 0.5

        assert preempter.state.statuses[-1].state == TaskState.KILLED
        last_run = preempter.state.processes['ignorant_process'][-1]
        assert last_run.state == ProcessState.KILLED
    def test_coordinator_kill(self):
        """SIGKILL the coordinator of the active process and check the recorded runs.

        Expects the task to end in SUCCESS with two runs of 'process': the
        first LOST, the second SUCCESS.
        """
        runner = self.start_runner()
        monitor = TaskMonitor(runner.pathspec, runner.task_id)
        self.wait_until_running(monitor)

        active_process, active_run = monitor.get_active_processes()[0]
        assert active_process.process == "process"
        assert active_run == 0
        os.kill(active_process.coordinator_pid, signal.SIGKILL)

        # Poll until the runner object exposes a 'state' attribute.
        while not hasattr(runner, "state"):
            time.sleep(0.1)

        assert runner.state.statuses[-1].state == TaskState.SUCCESS
        assert "process" in runner.state.processes
        runs = runner.state.processes["process"]
        assert len(runs) == 2
        assert runs[0].state == ProcessState.LOST
        assert runs[1].state == ProcessState.SUCCESS
    def test_pg_is_killed(self):
        """Check that neither the parent nor the child pid survives the task.

        Waits for 'child.txt' and 'parent.txt' to appear under
        <sandbox>/<task_id>, reads a pid from each, and confirms both pids are
        live and that the child is a child of the parent. It then writes
        'exit.txt' (presumably the signal the fixture task waits for; the
        fixture is not visible here), waits for the task to reach SUCCESS, and
        asserts both pids are gone.
        """
        runner = self.start_runner()
        tm = TaskMonitor(runner.pathspec, runner.task_id)
        self.wait_until_running(tm)
        process_state, run_number = tm.get_active_processes()[0]
        assert process_state.process == 'process'
        assert run_number == 0

        task_dir = os.path.join(runner.sandbox, runner.task_id)
        child_pidfile = os.path.join(task_dir, 'child.txt')
        parent_pidfile = os.path.join(task_dir, 'parent.txt')
        # Wait for the child pid file first, then the parent one.
        for pidfile in (child_pidfile, parent_pidfile):
            while not os.path.exists(pidfile):
                time.sleep(0.1)
        with open(child_pidfile) as fp:
            child_pid = int(fp.read().rstrip())
        with open(parent_pidfile) as fp:
            parent_pid = int(fp.read().rstrip())

        ps = ProcessProviderFactory.get()
        ps.collect_all()
        assert parent_pid in ps.pids()
        assert child_pid in ps.pids()
        assert child_pid in ps.children_of(parent_pid)

        with open(os.path.join(task_dir, 'exit.txt'), 'w') as fp:
            fp.write('go away!')

        # Compare by value, as every other state check here does: an identity
        # test would loop forever on an equal but non-identical value.
        while tm.task_state() != TaskState.SUCCESS:
            time.sleep(0.1)

        state = tm.get_state()
        assert state.processes['process'][0].state == ProcessState.SUCCESS

        # Refresh the process table before checking that both pids are gone.
        ps.collect_all()
        assert parent_pid not in ps.pids()
        assert child_pid not in ps.pids()
    def test_pg_is_killed(self):
        """Check that neither the parent nor the child pid survives the task.

        Waits for "child.txt" and "parent.txt" to appear under
        <sandbox>/<task_id>, reads a pid from each, and confirms both pids are
        live and that the child is a child of the parent. It then writes
        "exit.txt" (presumably the signal the fixture task waits for; the
        fixture is not visible here), waits for the task to reach SUCCESS, and
        asserts both pids are gone.
        """
        runner = self.start_runner()
        tm = TaskMonitor(runner.pathspec, runner.task_id)
        self.wait_until_running(tm)
        process_state, run_number = tm.get_active_processes()[0]
        assert process_state.process == "process"
        assert run_number == 0

        task_dir = os.path.join(runner.sandbox, runner.task_id)
        child_pidfile = os.path.join(task_dir, "child.txt")
        parent_pidfile = os.path.join(task_dir, "parent.txt")
        # Wait for the child pid file first, then the parent one.
        for pidfile in (child_pidfile, parent_pidfile):
            while not os.path.exists(pidfile):
                time.sleep(0.1)
        with open(child_pidfile) as fp:
            child_pid = int(fp.read().rstrip())
        with open(parent_pidfile) as fp:
            parent_pid = int(fp.read().rstrip())

        ps = ProcessProviderFactory.get()
        ps.collect_all()
        assert parent_pid in ps.pids()
        assert child_pid in ps.pids()
        assert child_pid in ps.children_of(parent_pid)

        with open(os.path.join(task_dir, "exit.txt"), "w") as fp:
            fp.write("go away!")

        # Compare by value, as every other state check here does: an identity
        # test would loop forever on an equal but non-identical value.
        while tm.task_state() != TaskState.SUCCESS:
            time.sleep(0.1)

        state = tm.get_state()
        assert state.processes["process"][0].state == ProcessState.SUCCESS

        # Refresh the process table before checking that both pids are gone.
        ps.collect_all()
        assert parent_pid not in ps.pids()
        assert child_pid not in ps.pids()
Example #12
0
    def start(self, timeout=MAX_WAIT):
        """Spawn the runner subprocess and wait, up to timeout, for the task to start.

        Sets self.forking before spawning and self.forked once the subprocess
        exists. The task counts as started when the task monitor reports it
        as active or finished.

        Raises:
            TaskError: if making the runner executable fails with anything
                other than EPERM, if spawning the subprocess raises OSError,
                or if the task has not started when the wait ends (self.lose()
                is called first in that case).
        """
        self.forking.set()

        try:
            chmod_plus_x(self._runner_pex)
        except OSError as chmod_err:
            # EPERM is tolerated; any other errno aborts the start.
            if chmod_err.errno != errno.EPERM:
                raise TaskError('Failed to chmod +x runner: %s' % chmod_err)

        checkpoint_path = TaskPath(root=self._checkpoint_root)
        self._monitor = TaskMonitor(checkpoint_path, self._task_id)

        cmdline_args = self._cmdline()
        log.info('Forking off runner with cmdline: %s' %
                 ' '.join(cmdline_args))

        try:
            self._popen = subprocess.Popen(cmdline_args)
        except OSError as spawn_err:
            raise TaskError(spawn_err)

        self.forked.set()

        log.debug('Waiting for task to start.')

        def is_started():
            monitor = self._monitor
            if not monitor:
                return monitor
            return monitor.active or monitor.finished

        # Poll in POLL_INTERVAL steps, tracking the total time slept so far.
        waited = Amount(0, Time.SECONDS)
        while not is_started() and waited < timeout:
            log.debug('  - sleeping...')
            self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
            waited += self.POLL_INTERVAL

        if is_started():
            return

        log.error('Task did not start with in deadline, forcing loss.')
        self.lose()
        raise TaskError('Task did not start within deadline.')
    def test_coordinator_kill(self):
        """Kill the coordinator, then the process, then runner.po, and check the run history.

        Expected 'ignorant_process' history: run 0 LOST (after its coordinator
        is SIGKILLed), run 1 KILLED (after the process is SIGKILLed), and
        run 2 still RUNNING after runner.po is SIGKILLed. The run-2 coordinator
        and process are SIGKILLed on the way out, pass or fail.
        """
        runner = self.start_runner()
        tm = TaskMonitor(runner.pathspec, runner.task_id)
        self.wait_until_running(tm)

        def wait_for_run_above(previous_run):
            # Poll until the first active process reports a run number
            # greater than previous_run.
            while True:
                active_procs = tm.get_active_processes()
                if active_procs and active_procs[0][1] > previous_run:
                    return
                time.sleep(0.2)

        process_state, run_number = tm.get_active_processes()[0]
        assert process_state.process == 'ignorant_process'
        assert run_number == 0
        os.kill(process_state.coordinator_pid, signal.SIGKILL)

        wait_for_run_above(0)
        self.wait_until_running(tm)

        process_state, run_number = tm.get_active_processes()[0]
        assert process_state.process == 'ignorant_process'
        assert run_number == 1
        os.kill(process_state.pid, signal.SIGKILL)

        wait_for_run_above(1)
        self.wait_until_running(tm)

        os.kill(runner.po.pid, signal.SIGKILL)

        # Fetch the state before entering the try block: the cleanup in
        # `finally` reads `state`, so it must be bound even if an assertion
        # fails, and a get_state() failure should surface as itself rather
        # than as a NameError raised by the cleanup.
        state = tm.get_state()
        try:
            runs = state.processes['ignorant_process']
            assert runs[0].state == ProcessState.LOST
            assert runs[1].state == ProcessState.KILLED
            assert runs[2].state == ProcessState.RUNNING
        finally:
            os.kill(state.processes['ignorant_process'][2].coordinator_pid,
                    signal.SIGKILL)
            os.kill(state.processes['ignorant_process'][2].pid, signal.SIGKILL)
    def test_coordinator_kill(self):
        """Kill the coordinator, then the process, then runner.po, and check the run history.

        Expected "ignorant_process" history: run 0 LOST (after its coordinator
        is SIGKILLed), run 1 KILLED (after the process is SIGKILLed), and
        run 2 still RUNNING after runner.po is SIGKILLed. The run-2 coordinator
        and process are SIGKILLed on the way out, pass or fail.
        """
        runner = self.start_runner()
        tm = TaskMonitor(runner.pathspec, runner.task_id)
        self.wait_until_running(tm)

        def wait_for_run_above(previous_run):
            # Poll until the first active process reports a run number
            # greater than previous_run.
            while True:
                active_procs = tm.get_active_processes()
                if active_procs and active_procs[0][1] > previous_run:
                    return
                time.sleep(0.2)

        process_state, run_number = tm.get_active_processes()[0]
        assert process_state.process == "ignorant_process"
        assert run_number == 0
        os.kill(process_state.coordinator_pid, signal.SIGKILL)

        wait_for_run_above(0)
        self.wait_until_running(tm)

        process_state, run_number = tm.get_active_processes()[0]
        assert process_state.process == "ignorant_process"
        assert run_number == 1
        os.kill(process_state.pid, signal.SIGKILL)

        wait_for_run_above(1)
        self.wait_until_running(tm)

        os.kill(runner.po.pid, signal.SIGKILL)

        # Fetch the state before entering the try block: the cleanup in
        # `finally` reads `state`, so it must be bound even if an assertion
        # fails, and a get_state() failure should surface as itself rather
        # than as a NameError raised by the cleanup.
        state = tm.get_state()
        try:
            runs = state.processes["ignorant_process"]
            assert runs[0].state == ProcessState.LOST
            assert runs[1].state == ProcessState.KILLED
            assert runs[2].state == ProcessState.RUNNING
        finally:
            os.kill(state.processes["ignorant_process"][2].coordinator_pid, signal.SIGKILL)
            os.kill(state.processes["ignorant_process"][2].pid, signal.SIGKILL)
Example #15
0
def tail(args, options):
    """Tail the logs of a task process.

    Usage: thermos tail task_name [process_name]
  """
    # NOTE(review): the docstring above is kept verbatim in case the CLI
    # framework surfaces it as help text; confirm before rewording it.
    argc = len(args)
    if argc == 0:
        app.error('Expected a task to tail, got nothing!')
    if argc not in (1, 2):
        app.error(
            'Expected at most two arguments (task and optional process), got %d'
            % argc)

    task_id = args[0]
    detector = TaskDetector(root=options.root)
    checkpoint_file = detector.get_checkpoint(task_id)
    checkpoint = CheckpointDispatcher.from_file(checkpoint_file)
    log_dir = checkpoint.header.log_dir

    # Collect every (process name, run number) pair known for this task,
    # narrowed to the requested process when one was given.
    candidates = [(name, number)
                  for name, number in detector.get_process_runs(task_id, log_dir)]
    if argc == 2:
        wanted = args[1]
        candidates = [(name, number) for name, number in candidates
                      if name == wanted]

    if not candidates:
        print('ERROR: No processes found.', file=sys.stderr)
        sys.exit(1)

    names = {name for name, _ in candidates}
    if len(names) != 1:
        print('ERROR: More than one process matches query.', file=sys.stderr)
        sys.exit(1)

    # Exactly one process matched; tail its highest-numbered run.
    process = names.pop()
    run = max(number for _, number in candidates)

    process_path = TaskPath(root=options.root,
                            task_id=task_id,
                            process=process,
                            run=run,
                            log_dir=log_dir)
    logdir = process_path.getpath('process_logdir')
    stream_name = 'stderr' if options.use_stderr else 'stdout'
    logfile = os.path.join(logdir, stream_name)

    monitor = TaskMonitor(TaskPath(root=options.root), task_id)

    def log_is_active():
        # True while the selected process/run is among the active processes.
        return any(
            status.process == process and active_run == run
            for status, active_run in monitor.get_active_processes())

    if not log_is_active():
        print('Tail of terminal log %s' % logfile)
        for line in tail_closed(logfile):
            print(line.rstrip())
        return

    # While following a live log, re-check every few seconds whether the run
    # is still active and stop following once it is not.
    recheck_interval = 5.0
    next_check = time.time() + recheck_interval
    print('Tail of active log %s' % logfile)
    for line in tail_f(logfile, include_last=True, forever=False):
        print(line.rstrip())
        if time.time() <= next_check:
            continue
        if not log_is_active():
            break
        next_check = time.time() + recheck_interval