Beispiel #1
0
    def test_basic_as_job(self):
        """Launch HELLO_WORLD wrapped in a MESOS_JOB and verify its checkpoint and status updates."""
        driver = ProxyDriver()

        with temporary_dir() as checkpoint_root:
            executor = ThermosExecutor(
                runner_provider=make_provider(checkpoint_root),
                sandbox_provider=DefaultTestSandboxProvider)
            executor.launchTask(
                driver,
                make_task(MESOS_JOB(task=HELLO_WORLD), instanceId=0))
            executor.runner_started.wait()
            # Poll until the executor has populated its status manager.
            while executor._status_manager is None:
                time.sleep(0.1)
            executor.terminated.wait()
            monitor = TaskMonitor(TaskPath(root=checkpoint_root),
                                  task_id=HELLO_WORLD_TASK_ID)
            runner_state = monitor.get_state()

        # The checkpointed runner state must contain the hello world process.
        assert 'hello_world_hello_world-001' in runner_state.processes, (
            'Could not find processes, got: %s' %
            ' '.join(runner_state.processes))

        # Exactly three status updates are expected, in lifecycle order.
        updates = driver.method_calls['sendStatusUpdate']
        assert len(updates) == 3
        starting, running, finished = [call[0][0] for call in updates]
        assert starting.state == mesos_pb.TASK_STARTING
        assert running.state == mesos_pb.TASK_RUNNING
        assert finished.state == mesos_pb.TASK_FINISHED
Beispiel #2
0
def test_other_user_fails_nonroot():
  """Constructing a TestProcess for another user is expected to raise Process.PermissionError."""
  with temporary_dir() as checkpoint_root:
    pathspec = TaskPath(root=checkpoint_root, task_id='task', process='process', run=0)
    sandbox_dir = setup_sandbox(checkpoint_root, pathspec)

    with pytest.raises(Process.PermissionError):
      # Only construction is exercised here; the process is never started.
      TestProcess(
          'process',
          'echo hello world',
          0,
          pathspec,
          sandbox_dir,
          user=get_other_nonroot_user().pw_name)
Beispiel #3
0
 def run_with_class(process_class):
   """Start a process_class instance whose command echoes to an open file's fd.

   Returns whatever wait_for_rc yields for the process checkpoint.
   """
   with temporary_dir() as checkpoint_root:
     pathspec = TaskPath(root=checkpoint_root, task_id='task', process='process', run=0)
     sandbox_dir = setup_sandbox(checkpoint_root, pathspec)
     target_path = os.path.join(sandbox_dir, 'silly_pants')
     with open(target_path, 'w') as silly_pants:
       # The command redirects its output to the numeric descriptor of the open file.
       cmdline = 'echo test >&%s' % silly_pants.fileno()
       proc = process_class('process', cmdline, 0, pathspec, sandbox_dir)
       proc.start()
       return wait_for_rc(pathspec.getpath('process_checkpoint'))
Beispiel #4
0
 def __init__(self, root, resource_monitor_class=TaskResourceMonitor):
   """Initialise the observer thread state for the checkpoint root `root`.

   resource_monitor_class must be a subclass of ResourceMonitorBase, otherwise
   ValueError is raised. The thread is marked as a daemon.
   """
   self._pathspec = TaskPath(root=root)
   self._detector = TaskDetector(root)
   monitor_is_valid = issubclass(resource_monitor_class, ResourceMonitorBase)
   if not monitor_is_valid:
     raise ValueError("resource monitor class must implement ResourceMonitorBase!")
   self._resource_monitor = resource_monitor_class
   # task_id => ActiveObservedTask
   self._active_tasks = dict()
   # task_id => FinishedObservedTask
   self._finished_tasks = dict()
   self._stop_event = threading.Event()
   # Both bases are initialised explicitly rather than through super().
   ExceptionalThread.__init__(self)
   Lockable.__init__(self)
   self.daemon = True
Beispiel #5
0
def test_simple_process_other_user(*args):
  """Run a process as another non-root user and check the privilege-drop calls.

  *args absorbs positional arguments injected by decorators that are not
  visible in this block (presumably mock.patch objects -- confirm).
  """
  with temporary_dir() as td:
    some_user = get_other_nonroot_user()
    taskpath = TaskPath(root=td, task_id='task', process='process', run=0)
    sandbox = setup_sandbox(td, taskpath)

    p = TestProcess('process', 'echo hello world', 0, taskpath, sandbox, user=some_user.pw_name)
    p.start()
    # Block until the process checkpoint reports completion; the rc itself is not checked here.
    wait_for_rc(taskpath.getpath('process_checkpoint'))

    # since we're not actually root, the best we can do is check the right things were attempted
    # NOTE(review): if os.setgroups/setgid/setuid are Mock objects, `.calledwith` is an
    # auto-created attribute rather than an assertion helper, so these asserts can never fail.
    # Switching to `assert_called_with` needs confirmation that the calls are recorded in this
    # process (and not only in a forked child) -- verify before tightening.
    # NOTE(review): `pw_name in g` tests membership in the group struct itself, not in
    # g.gr_mem -- confirm which group list the code under test actually passes to setgroups.
    assert os.setgroups.calledwith([g.gr_gid for g in grp.getgrall() if some_user.pw_name in g])
    # setgid takes the group id and setuid the user id (the original had them swapped).
    assert os.setgid.calledwith(some_user.pw_gid)
    assert os.setuid.calledwith(some_user.pw_uid)
Beispiel #6
0
def test_simple_process():
  """Run `echo hello world` and check its return code and captured stdout."""
  with temporary_dir() as checkpoint_root:
    pathspec = TaskPath(root=checkpoint_root, task_id='task', process='process', run=0)
    sandbox_dir = setup_sandbox(checkpoint_root, pathspec)

    proc = TestProcess('process', 'echo hello world', 0, pathspec, sandbox_dir)
    proc.start()
    exit_code = wait_for_rc(pathspec.getpath('process_checkpoint'))
    assert exit_code == 0

    # The process' stdout is expected as a file named 'stdout' in its log directory.
    stdout_path = pathspec.with_filename('stdout').getpath('process_logdir')
    assert os.path.exists(stdout_path)
    with open(stdout_path, 'r') as stdout_file:
      assert stdout_file.read() == 'hello world\n'
Beispiel #7
0
def test_log_permissions():
  """The stdout/stderr logs of a process run as the current user are owned by that user."""
  with temporary_dir() as checkpoint_root:
    pathspec = TaskPath(root=checkpoint_root, task_id='task', process='process', run=0)
    sandbox_dir = setup_sandbox(checkpoint_root, pathspec)

    proc = TestProcess('process', 'echo hello world', 0, pathspec, sandbox_dir)
    proc.start()
    # Wait for completion; the return code itself is not inspected.
    wait_for_rc(pathspec.getpath('process_checkpoint'))

    stdout_path = pathspec.with_filename('stdout').getpath('process_logdir')
    stderr_path = pathspec.with_filename('stderr').getpath('process_logdir')
    log_paths = (stdout_path, stderr_path)

    # Both logs must exist before their ownership is checked.
    for log_path in log_paths:
      assert os.path.exists(log_path)
    for log_path in log_paths:
      assert os.stat(log_path).st_uid == os.getuid()
Beispiel #8
0
def test_log_permissions_other_user(*mocks):
  """Run a process as another user and check the log files exist and chown was attempted.

  *mocks absorbs positional arguments injected by decorators that are not
  visible in this block (presumably mock.patch objects -- confirm).
  """
  with temporary_dir() as checkpoint_root:
    other_user = get_other_nonroot_user()
    pathspec = TaskPath(root=checkpoint_root, task_id='task', process='process', run=0)
    sandbox_dir = setup_sandbox(checkpoint_root, pathspec)

    proc = TestProcess(
        'process', 'echo hello world', 0, pathspec, sandbox_dir, user=other_user.pw_name)
    proc.start()
    # Wait for completion; the return code itself is not inspected.
    wait_for_rc(pathspec.getpath('process_checkpoint'))

    # since we're not actually root, the best we can do is check the right things were attempted
    stdout_path = pathspec.with_filename('stdout').getpath('process_logdir')
    stderr_path = pathspec.with_filename('stderr').getpath('process_logdir')
    assert os.path.exists(stdout_path)
    assert os.path.exists(stderr_path)
    # NOTE(review): if os.chown is a Mock, `.calledwith` is an auto-created attribute and
    # always truthy, so these two asserts cannot fail -- consider `assert_any_call` once the
    # patching set-up has been confirmed.
    assert os.chown.calledwith(stdout_path, other_user.pw_uid, other_user.pw_gid)
    assert os.chown.calledwith(stderr_path, other_user.pw_uid, other_user.pw_gid)
Beispiel #9
0
    def start(self, timeout=MAX_WAIT):
        """Fork the task runner and wait, up to `timeout`, for the task to start.

        Raises TaskError when the runner cannot be made executable (for any
        reason other than EPERM), when spawning the runner fails, or when the
        task is neither active nor finished once the timeout has elapsed.
        """
        self.forking.set()

        try:
            chmod_plus_x(self._runner_pex)
        except OSError as e:
            # EPERM is tolerated; every other failure aborts the start.
            if e.errno != errno.EPERM:
                raise TaskError('Failed to chmod +x runner: %s' % e)

        pathspec = TaskPath(root=self._checkpoint_root)
        self._monitor = TaskMonitor(pathspec, self._task_id)

        argv = self._cmdline()
        log.info('Forking off runner with cmdline: %s' % ' '.join(argv))

        try:
            self._popen = subprocess.Popen(argv)
        except OSError as e:
            raise TaskError(e)

        self.forked.set()

        log.debug('Waiting for task to start.')

        def is_started():
            # Started means the monitor reports the task as active or finished.
            if not self._monitor:
                return self._monitor
            return self._monitor.active or self._monitor.finished

        # Poll the monitor until the task shows up or the deadline passes.
        waited = Amount(0, Time.SECONDS)
        while True:
            if is_started() or not (waited < timeout):
                break
            log.debug('  - sleeping...')
            self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
            waited += self.POLL_INTERVAL

        if is_started():
            return

        log.error('Task did not start with in deadline, forcing loss.')
        self.lose()
        raise TaskError('Task did not start within deadline.')
Beispiel #10
0
    def setup_task(self, task, root, finished=False, corrupt=False):
        """Set up the checkpoint stream for the given task in the given checkpoint root.

        :param task: the task to create a runner for
        :param root: the checkpoint root the runner writes into
        :param finished: if true, kill the runner after its checkpoint is initialized
        :param corrupt: if true, overwrite the runner checkpoint with invalid content
        :return: the task id of the runner that was created
        """
        class FastTaskRunner(TaskRunner):
            # Shrink the coordinator sleep interval for this test runner.
            COORDINATOR_INTERVAL_SLEEP = Amount(1, Time.MICROSECONDS)

        tr = FastTaskRunner(task=task,
                            checkpoint_root=root,
                            sandbox=os.path.join(root, 'sandbox',
                                                 task.name().get()),
                            clock=ThreadedClock(time.time()))
        with tr.control():
            # initialize checkpoint stream
            pass
        if finished:
            tr.kill()
        if corrupt:
            # The runner checkpoint path is keyed by task_id; the original passed the id
            # under the keyword `tr`, unlike every other TaskPath call that resolves
            # 'runner_checkpoint'.
            ckpt_file = TaskPath(root=root,
                                 task_id=tr.task_id).getpath('runner_checkpoint')
            with open(ckpt_file, 'w') as f:
                f.write("definitely not a valid checkpoint stream")
        return tr.task_id
Beispiel #11
0
    def __init__(self, task, portmap=None, success_rate=100, random_seed=31337):
        """Prepare the on-disk job file and temporary directories for running `task`.

        task = Thermos task
        portmap = port map (a fresh empty dict is used when omitted)
        success_rate = success rate of writing checkpoint to disk
        random_seed = stored on the instance as-is
        """
        self.task = task

        # The serialized task is kept on disk (cleanup=False) beyond this block.
        with temporary_file(cleanup=False) as fp:
            self.job_filename = fp.name
            fp.write(ThermosTaskWrapper(task).to_json())

        # NOTE(review): tempfile.mktemp() only reserves a name and is documented as racy;
        # left unchanged because callers may rely on the file not existing yet.
        self.state_filename = tempfile.mktemp()
        self.tempdir = tempfile.mkdtemp()
        self.task_id = '%s-runner-base' % int(time.time() * 1000000)
        self.sandbox = os.path.join(self.tempdir, 'sandbox')
        # A mutable default argument would be shared by every instance created
        # without an explicit portmap, so build a new dict per instance instead.
        self.portmap = {} if portmap is None else portmap
        self.cleaned = False
        self.pathspec = TaskPath(root=self.tempdir, task_id=self.task_id)
        self.script_filename = None
        self.success_rate = success_rate
        self.random_seed = random_seed
        self._run_count = 0
Beispiel #12
0
 def get(cls, task_id, checkpoint_root):
   """
     Build a TaskRunner for task_id from the state stored under checkpoint_root.

     Returns None when the task JSON is missing, fails to load, contains no
     tasks, when the checkpoint or its header is missing, or when rebuilding
     the runner raises.
   """
   pathspec = TaskPath(root=checkpoint_root, task_id=task_id, state='active')
   task_json = pathspec.getpath('task_path')
   task_checkpoint = pathspec.getpath('runner_checkpoint')
   if not os.path.exists(task_json):
     return None
   loaded = ThermosConfigLoader.load_json(task_json)
   if loaded is None or len(loaded.tasks()) == 0:
     return None
   try:
     checkpoint = CheckpointDispatcher.from_file(task_checkpoint)
     if checkpoint is None or checkpoint.header is None:
       return None
     header = checkpoint.header
     # Sandbox, log directory and ports are taken from the checkpoint header.
     return cls(
         loaded.tasks()[0].task(),
         checkpoint_root,
         header.sandbox,
         log_dir=header.log_dir,
         task_id=task_id,
         portmap=header.ports)
   except Exception as e:
     log.error('Failed to reconstitute checkpoint in TaskRunner.get: %s' % e, exc_info=True)
     return None
def test_legacy_task_roots():
    """checkpoint_path lives under the default root unless an explicit root is given."""
    default_path = TaskPath().given(task_id='foo').getpath('checkpoint_path')
    assert default_path.startswith(TaskPath.DEFAULT_CHECKPOINT_ROOT)

    custom_root = '/var/lib/mesos'
    custom_path = TaskPath(root=custom_root).given(task_id='foo').getpath('checkpoint_path')
    assert custom_path.startswith('/var/lib/mesos')
def test_legacy_log_dirs():
    """process_logbase is <default root>/logs/<task_id> unless log_dir is given explicitly."""
    default_logbase = TaskPath().given(task_id='foo').getpath('process_logbase')
    expected_default = os.path.join(TaskPath.DEFAULT_CHECKPOINT_ROOT, 'logs', 'foo')
    assert default_logbase == expected_default

    explicit_logbase = TaskPath(log_dir='sloth_love_chunk').given(
        task_id='foo').getpath('process_logbase')
    assert explicit_logbase == 'sloth_love_chunk'
Beispiel #15
0
def tail(args, options):
    """Tail the logs of a task process.

    Usage: thermos tail task_name [process_name]
  """
    if len(args) == 0:
        app.error('Expected a task to tail, got nothing!')
    if len(args) not in (1, 2):
        app.error(
            'Expected at most two arguments (task and optional process), got %d'
            % len(args))

    task_id = args[0]
    detector = TaskDetector(root=options.root)
    checkpoint = CheckpointDispatcher.from_file(
        detector.get_checkpoint(task_id))
    # from_file can yield no checkpoint (or one without a header); fail with a
    # readable error instead of an AttributeError on `.header.log_dir`.
    if checkpoint is None or checkpoint.header is None:
        print('ERROR: Could not read checkpoint for task %s.' % task_id,
              file=sys.stderr)
        sys.exit(1)
    log_dir = checkpoint.header.log_dir

    process_runs = [(process, run)
                    for (process,
                         run) in detector.get_process_runs(task_id, log_dir)]
    if len(args) == 2:
        # Restrict to the process named on the command line.
        wanted = args[1]
        process_runs = [(process, run) for (process, run) in process_runs
                        if process == wanted]

    if len(process_runs) == 0:
        print('ERROR: No processes found.', file=sys.stderr)
        sys.exit(1)

    processes = set(process for process, _ in process_runs)
    if len(processes) != 1:
        print('ERROR: More than one process matches query.', file=sys.stderr)
        sys.exit(1)

    # Exactly one process matched; tail its highest-numbered run.
    process = processes.pop()
    run = max(run for _, run in process_runs)

    logdir = TaskPath(root=options.root,
                      task_id=task_id,
                      process=process,
                      run=run,
                      log_dir=log_dir).getpath('process_logdir')
    logfile = os.path.join(logdir,
                           'stderr' if options.use_stderr else 'stdout')

    monitor = TaskMonitor(TaskPath(root=options.root), task_id)

    def log_is_active():
        # The log is active while the monitor still lists this process/run as active.
        for process_status, process_run in monitor.get_active_processes():
            if process_status.process == process and process_run == run:
                return True
        return False

    if not log_is_active():
        print('Tail of terminal log %s' % logfile)
        for line in tail_closed(logfile):
            print(line.rstrip())
        return

    # Follow the live log, re-checking every 5 seconds whether the run is still active.
    next_check = time.time() + 5.0
    print('Tail of active log %s' % logfile)
    for line in tail_f(logfile, include_last=True, forever=False):
        print(line.rstrip())
        if time.time() > next_check:
            if not log_is_active():
                break
            next_check = time.time() + 5.0
Beispiel #16
0
 def _runner_ckpt(self, task_id):
   """Resolve the 'runner_checkpoint' path of task_id under this object's checkpoint root."""
   pathspec = TaskPath(root=self._checkpoint_root, task_id=task_id)
   return pathspec.getpath('runner_checkpoint')
Beispiel #17
0
  def __init__(self, task, checkpoint_root, sandbox, log_dir=None,
               task_id=None, portmap=None, user=None, chroot=False, clock=time,
               universal_handler=None, planner_class=TaskPlanner):
    """
      Validate and interpolate the task, build its plans and checkpoint dispatcher, then
      replay any existing runner checkpoint.

      required:
        task (config.Task) = the task to run
        checkpoint_root (path) = the checkpoint root
        sandbox (path) = the sandbox in which the task will be run
                         [if None, cwd will be assumed, but garbage collection will be
                          disabled for this task.]

      optional:
        log_dir (string)  = directory to house stdout/stderr logs. If not specified, logs will be
                            written into the sandbox directory under .logs/
        task_id (string)  = bind to this task id.  if not specified, will synthesize an id based
                            upon task.name()
        portmap (dict)    = a map (string => integer) from name to port, e.g. { 'http': 80 }
        user (string)     = the user to run the task as.  if not current user, requires setuid
                            privileges.
        chroot (boolean)  = whether or not to chroot into the sandbox prior to exec.
        clock (time interface) = the clock to use throughout
        universal_handler = checkpoint record handler (only used for testing)
        planner_class (TaskPlanner class) = TaskPlanner class to use for constructing the task
                            planning policy.

      raises:
        TypeError   = planner_class is not a TaskPlanner subclass
        ValueError  = a different user was requested but the effective uid is not 0
        InvalidTask = the task fails validation or cannot be fully interpolated
    """
    if not issubclass(planner_class, TaskPlanner):
      raise TypeError('planner_class must be a TaskPlanner.')
    self._clock = clock
    launch_time = self._clock.time()
    # Fractional part of the launch time as zero-padded microseconds (despite the _ms name).
    launch_time_ms = '%06d' % int((launch_time - int(launch_time)) * 10**6)
    if not task_id:
      # Synthesize an id of the form <task name>-<YYYYmmdd-HHMMSS>.<microseconds>.
      self._task_id = '%s-%s.%s' % (task.name(),
                                    time.strftime('%Y%m%d-%H%M%S', time.localtime(launch_time)),
                                    launch_time_ms)
    else:
      self._task_id = task_id
    current_user = TaskRunnerHelper.get_actual_user()
    self._user = user or current_user
    # TODO(wickman) This should be delegated to the ProcessPlatform / Helper
    if self._user != current_user:
      # Running as a different user is only allowed when the effective uid is 0.
      if os.geteuid() != 0:
        raise ValueError('task specifies user as %s, but %s does not have setuid permission!' % (
          self._user, current_user))
    self._portmap = portmap or {}
    self._launch_time = launch_time
    # Default the log directory to <sandbox>/.logs when none is given.
    self._log_dir = log_dir or os.path.join(sandbox, '.logs')
    self._pathspec = TaskPath(root=checkpoint_root, task_id=self._task_id, log_dir=self._log_dir)
    # Validate the raw task and its ports; validator errors are re-raised as InvalidTask.
    try:
      ThermosTaskValidator.assert_valid_task(task)
      ThermosTaskValidator.assert_valid_ports(task, self._portmap)
    except ThermosTaskValidator.InvalidTaskError as e:
      raise self.InvalidTask('Invalid task: %s' % e)
    context = ThermosContext(
        task_id=self._task_id,
        ports=self._portmap,
        user=self._user)
    # Bind the thermos context into the task; any references left over are reported as errors.
    self._task, uninterp = (task % Environment(thermos=context)).interpolate()
    if len(uninterp) > 0:
      raise self.InvalidTask('Failed to interpolate task, missing: %s' %
          ', '.join(str(ref) for ref in uninterp))
    try:
      ThermosTaskValidator.assert_same_task(self._pathspec, self._task)
    except ThermosTaskValidator.InvalidTaskError as e:
      raise self.InvalidTask('Invalid task: %s' % e)
    self._plan = None # plan currently being executed (updated by Handlers)
    # Two plans over the same task: one for non-final processes, one for final processes.
    self._regular_plan = planner_class(self._task, clock=clock,
        process_filter=lambda proc: proc.final().get() == False)
    self._finalizing_plan = planner_class(self._task, clock=clock,
        process_filter=lambda proc: proc.final().get() == True)
    self._chroot = chroot
    self._sandbox = sandbox
    self._terminal_state = None
    self._ckpt = None
    # process name => process definition, for every process of the interpolated task
    self._process_map = dict((p.name().get(), p) for p in self._task.processes())
    self._task_processes = {}
    # Instantiate one stage object per entry in STAGES, each bound to this runner.
    self._stages = dict((state, stage(self)) for state, stage in self.STAGES.items())
    self._finalization_start = None
    self._preemption_deadline = None
    self._watcher = ProcessMuxer(self._pathspec)
    self._state   = RunnerState(processes = {})

    # create runner state
    universal_handler = universal_handler or TaskRunnerUniversalHandler
    self._dispatcher = CheckpointDispatcher()
    self._dispatcher.register_handler(universal_handler(self))
    self._dispatcher.register_handler(TaskRunnerProcessHandler(self))
    self._dispatcher.register_handler(TaskRunnerTaskHandler(self))

    # recover checkpointed runner state and update plan
    self._recovery = True
    self._replay_runner_ckpt()
Beispiel #18
0
 def __init__(self, root):
     """Store `root` as the root directory and create a default TaskPath."""
     # The TaskPath is created without arguments; `root` is only stored here.
     self._pathspec = TaskPath()
     self._root_dir = root
Beispiel #19
0
    def kill(cls,
             task_id,
             checkpoint_root,
             force=False,
             terminal_status=TaskState.KILLED,
             clock=time):
        """
      An implementation of Task killing that doesn't require a fully hydrated TaskRunner object.
      Terminal status must be either KILLED or LOST state.

      task_id = id of the task whose runner checkpoint is updated
      checkpoint_root = checkpoint root containing that task
      force = passed to open_checkpoint; also finalizes tasks with uninitialized state
      terminal_status = TaskState.KILLED or TaskState.LOST, written as the final task state
      clock = object providing time(), used for the timestamps that are written

      Raises cls.Error if terminal_status is neither KILLED nor LOST.
    """
        if terminal_status not in (TaskState.KILLED, TaskState.LOST):
            # Parenthesized so that the fallback to the raw value applies to the name
            # lookup; `'...' % name or value` formats first and never reaches `value`.
            raise cls.Error('terminal_status must be KILLED or LOST (got %s)' %
                            (TaskState._VALUES_TO_NAMES.get(terminal_status)
                             or terminal_status))
        pathspec = TaskPath(root=checkpoint_root, task_id=task_id)
        checkpoint = pathspec.getpath('runner_checkpoint')
        state = CheckpointDispatcher.from_file(checkpoint)

        if state is None or state.header is None or state.statuses is None:
            if force:
                log.error(
                    'Task has uninitialized TaskState - forcibly finalizing')
                cls.finalize_task(pathspec)
            else:
                log.error('Cannot update states in uninitialized TaskState!')
            return

        ckpt = cls.open_checkpoint(checkpoint, force=force, state=state)

        def write_task_state(state):
            update = TaskStatus(state=state,
                                timestamp_ms=int(clock.time() * 1000),
                                runner_pid=os.getpid(),
                                runner_uid=os.getuid())
            ckpt.write(RunnerCkpt(task_status=update))

        def write_process_status(status):
            ckpt.write(RunnerCkpt(process_status=status))

        # The checkpoint handle is closed on every path, including the
        # already-terminal case, which previously returned without closing it.
        with closing(ckpt):
            if cls.is_task_terminal(state.statuses[-1].state):
                log.info('Task is already in terminal state!  Finalizing.')
            else:
                write_task_state(TaskState.ACTIVE)
                for process, history in state.processes.items():
                    process_status = history[-1]
                    if cls.is_process_terminal(process_status.state):
                        continue
                    if cls.kill_process(state, process):
                        write_process_status(
                            ProcessStatus(process=process,
                                          state=ProcessState.KILLED,
                                          seq=process_status.seq + 1,
                                          return_code=-9,
                                          stop_time=clock.time()))
                    elif process_status.state != ProcessState.WAITING:
                        # Could not be killed and was not merely waiting: record it as lost.
                        write_process_status(
                            ProcessStatus(process=process,
                                          state=ProcessState.LOST,
                                          seq=process_status.seq + 1))
                write_task_state(terminal_status)
        cls.finalize_task(pathspec)
Beispiel #20
0
 def __init__(self, checkpoint_root):
     """Create the TaskPath rooted at `checkpoint_root` and keep it on the instance."""
     rooted_path = TaskPath(root=checkpoint_root)
     self._path = rooted_path