def tail(args, options):
  """Tail the logs of a task process.

    Usage: thermos tail task_name [process_name]
  """
  if len(args) == 0:
    app.error('Expected a task to tail, got nothing!')
  if len(args) not in (1, 2):
    app.error('Expected at most two arguments (task and optional process), got %d' % len(args))

  task_id = args[0]
  detector = TaskDetector(root=options.root)
  checkpoint = CheckpointDispatcher.from_file(detector.get_checkpoint(task_id))
  log_dir = checkpoint.header.log_dir
  process_runs = [(process, run) for (process, run) in detector.get_process_runs(task_id, log_dir)]
  if len(args) == 2:
    process_runs = [(process, run) for (process, run) in process_runs if process == args[1]]

  if len(process_runs) == 0:
    print('ERROR: No processes found.', file=sys.stderr)
    sys.exit(1)

  processes = set([process for process, _ in process_runs])
  if len(processes) != 1:
    print('ERROR: More than one process matches query.', file=sys.stderr)
    sys.exit(1)

  process = processes.pop()
  run = max([run for _, run in process_runs])

  logdir = TaskPath(root=options.root, task_id=args[0], process=process,
     run=run, log_dir=log_dir).getpath('process_logdir')
  logfile = os.path.join(logdir, 'stderr' if options.use_stderr else 'stdout')

  monitor = TaskMonitor(TaskPath(root=options.root), args[0])
  def log_is_active():
    active_processes = monitor.get_active_processes()
    for process_status, process_run in active_processes:
      if process_status.process == process and process_run == run:
        return True
    return False

  if not log_is_active():
    print('Tail of terminal log %s' % logfile)
    for line in tail_closed(logfile):
      print(line.rstrip())
    return

  now = time.time()
  next_check = now + 5.0
  print('Tail of active log %s' % logfile)
  for line in tail_f(logfile, include_last=True, forever=False):
    print(line.rstrip())
    if time.time() > next_check:
      if not log_is_active():
        break
      else:
        next_check = time.time() + 5.0
Example #2
0
def make_taskpath(td):
  return TaskPath(
      root=td,
      task_id='task',
      process='process',
      run=0,
      log_dir=os.path.join(td, '.logs'))
Example #3
0
 def get(cls, task_id, checkpoint_root):
     """
   Get a TaskRunner bound to the task_id in checkpoint_root.
 """
     path = TaskPath(root=checkpoint_root, task_id=task_id, state='active')
     task_json = path.getpath('task_path')
     task_checkpoint = path.getpath('runner_checkpoint')
     if not os.path.exists(task_json):
         return None
     task = ThermosConfigLoader.load_json(task_json)
     if task is None:
         return None
     if len(task.tasks()) == 0:
         return None
     try:
         checkpoint = CheckpointDispatcher.from_file(task_checkpoint)
         if checkpoint is None or checkpoint.header is None:
             return None
         return cls(task.tasks()[0].task(),
                    checkpoint_root,
                    checkpoint.header.sandbox,
                    log_dir=checkpoint.header.log_dir,
                    task_id=task_id,
                    portmap=checkpoint.header.ports,
                    hostname=checkpoint.header.hostname)
     except Exception as e:
         log.error(
             'Failed to reconstitute checkpoint in TaskRunner.get: %s' % e,
             exc_info=True)
         return None
Example #4
0
    def test_basic_as_job(self):
        proxy_driver = ProxyDriver()

        with temporary_dir() as tempdir:
            te = AuroraExecutor(runner_provider=make_provider(tempdir),
                                sandbox_provider=DefaultTestSandboxProvider())
            te.launchTask(proxy_driver,
                          make_task(MESOS_JOB(task=HELLO_WORLD), instanceId=0))
            te.runner_started.wait()
            while te._status_manager is None:
                time.sleep(0.1)
            te.terminated.wait()
            tm = TaskMonitor(TaskPath(root=tempdir),
                             task_id=HELLO_WORLD_TASK_ID)
            runner_state = tm.get_state()

        assert 'hello_world_hello_world-001' in runner_state.processes, (
            'Could not find processes, got: %s' %
            ' '.join(runner_state.processes))
        updates = proxy_driver.method_calls['sendStatusUpdate']
        assert len(updates) == 3
        status_updates = [arg_tuple[0][0] for arg_tuple in updates]
        assert status_updates[0].state == mesos_pb2.TASK_STARTING
        assert status_updates[1].state == mesos_pb2.TASK_RUNNING
        assert status_updates[2].state == mesos_pb2.TASK_FINISHED
Example #5
0
    def logs(self, task_id, process, run=None):
        """
      Given a task_id and a process and (optional) run number, return a dict:
      {
        stderr: [dir, filename]
        stdout: [dir, filename]
      }

      If the run number is unspecified, uses the latest run.

      TODO(wickman)  Just return the filenames directly?
    """
        runner_state = self.raw_state(task_id)
        if runner_state is None or runner_state.header is None:
            return {}
        run = self.get_run_number(runner_state, process, run)
        if run is None:
            return {}
        observed_task = self.all_tasks.get(task_id, None)
        if not observed_task:
            return {}

        log_path = TaskPath(
            root=observed_task.root,
            task_id=task_id,
            process=process,
            run=run,
            log_dir=runner_state.header.log_dir,
        ).getpath('process_logdir')

        return dict(stdout=[log_path, 'stdout'], stderr=[log_path, 'stderr'])
def write_header(root, sandbox, task_id):
    log_dir = os.path.join(sandbox, '.logs')
    path = TaskPath(root=root, task_id=task_id, log_dir=log_dir)
    header = RunnerHeader(task_id=task_id, sandbox=sandbox, log_dir=log_dir)
    ckpt = TaskRunnerHelper.open_checkpoint(path.getpath('runner_checkpoint'))
    ckpt.write(RunnerCkpt(runner_header=header))
    ckpt.close()
Example #7
0
  def _runner_ckpt(self, task):
    """Return the runner checkpoint file for a given task.

    :param task: An instance of a task to retrieve checkpoint path
    :type task: :class:`RootedTask` instance
    """
    return TaskPath(root=task.checkpoint_root, task_id=task.task_id).getpath('runner_checkpoint')
Example #8
0
 def erase_logs(self, task_id):
     for fn in self.get_logs(task_id, with_size=False):
         safe_delete(fn)
     state = self.state(task_id)
     if state and state.header:
         safe_rmtree(
             TaskPath(
                 root=self._root,
                 task_id=task_id,
                 log_dir=state.header.log_dir).getpath('process_logbase'))
Example #9
0
    def __init__(self, checkpoint_root, task_id):
        """
    :param checkpoint_root: The checkpoint root to find the given task.
    :param task_id: The task_id of the task whose state we wish to manage.
    """

        self._detector = TaskDetector(checkpoint_root)
        self._task_id = task_id
        self._pathspec = TaskPath(root=checkpoint_root, task_id=task_id)
        self._state = CheckpointDispatcher.from_file(
            self._detector.get_checkpoint(task_id))
 def __init__(self, root, resource_monitor_class=TaskResourceMonitor):
   self._pathspec = TaskPath(root=root)
   self._detector = TaskDetector(root)
   if not issubclass(resource_monitor_class, ResourceMonitorBase):
     raise ValueError("resource monitor class must implement ResourceMonitorBase!")
   self._resource_monitor = resource_monitor_class
   self._active_tasks = {}    # task_id => ActiveObservedTask
   self._finished_tasks = {}  # task_id => FinishedObservedTask
   self._stop_event = threading.Event()
   ExceptionalThread.__init__(self)
   Lockable.__init__(self)
   self.daemon = True
Example #11
0
 def from_assigned_task(self, assigned_task, sandbox):
     task_id = assigned_task.taskId
     resources = mesos_task_instance_from_assigned_task(
         assigned_task).task().resources()
     task_path = TaskPath(root=self._checkpoint_root, task_id=task_id)
     task_monitor = TaskMonitor(task_path, task_id)
     resource_monitor = TaskResourceMonitor(
         task_monitor,
         sandbox.root,
         disk_collector=self._disk_collector,
         disk_collection_interval=self._disk_collection_interval)
     return ResourceManager(resources, resource_monitor)
def test_other_user_fails_nonroot():
    with temporary_dir() as td:
        taskpath = TaskPath(root=td, task_id='task', process='process', run=0)
        sandbox = setup_sandbox(td, taskpath)
        process = TestProcess('process',
                              'echo hello world',
                              0,
                              taskpath,
                              sandbox,
                              user=get_other_nonroot_user().pw_name)
        with pytest.raises(Process.PermissionError):
            process.start()
Example #13
0
  def kill(cls, task_id, checkpoint_root, force=False,
           terminal_status=TaskState.KILLED, clock=time):
    """
      An implementation of Task killing that doesn't require a fully hydrated TaskRunner object.
      Terminal status must be either KILLED or LOST state.
    """
    if terminal_status not in (TaskState.KILLED, TaskState.LOST):
      raise cls.Error('terminal_status must be KILLED or LOST (got %s)' %
                      TaskState._VALUES_TO_NAMES.get(terminal_status) or terminal_status)
    pathspec = TaskPath(root=checkpoint_root, task_id=task_id)
    checkpoint = pathspec.getpath('runner_checkpoint')
    state = CheckpointDispatcher.from_file(checkpoint)

    if state is None or state.header is None or state.statuses is None:
      if force:
        log.error('Task has uninitialized TaskState - forcibly finalizing')
        cls.finalize_task(pathspec)
        return
      else:
        log.error('Cannot update states in uninitialized TaskState!')
        return

    ckpt = cls.open_checkpoint(checkpoint, force=force, state=state)

    def write_task_state(state):
      update = TaskStatus(state=state, timestamp_ms=int(clock.time() * 1000),
                          runner_pid=os.getpid(), runner_uid=os.getuid())
      ckpt.write(RunnerCkpt(task_status=update))

    def write_process_status(status):
      ckpt.write(RunnerCkpt(process_status=status))

    if cls.is_task_terminal(state.statuses[-1].state):
      log.info('Task is already in terminal state!  Finalizing.')
      cls.finalize_task(pathspec)
      return

    with closing(ckpt):
      write_task_state(TaskState.ACTIVE)
      for process, history in state.processes.items():
        process_status = history[-1]
        if not cls.is_process_terminal(process_status.state):
          if cls.kill_process(state, process):
            write_process_status(ProcessStatus(process=process,
              state=ProcessState.KILLED, seq=process_status.seq + 1, return_code=-9,
              stop_time=clock.time()))
          else:
            if process_status.state is not ProcessState.WAITING:
              write_process_status(ProcessStatus(process=process,
                state=ProcessState.LOST, seq=process_status.seq + 1))
      write_task_state(terminal_status)
    cls.finalize_task(pathspec)
Example #14
0
 def get_metadata(self, task_id, with_size=True):
     runner_ckpt = self._detector.get_checkpoint(task_id)
     process_ckpts = [
         ckpt for ckpt in self._detector.get_process_checkpoints(task_id)
     ]
     json_spec = TaskPath(root=self._root,
                          task_id=task_id,
                          state='finished').getpath('task_path')
     for path in [json_spec, runner_ckpt] + process_ckpts:
         if with_size:
             yield path, safe_bsize(path)
         else:
             yield path
Example #15
0
def test_garbage_collector(safe_rmtree, safe_delete):
    with temporary_dir() as sandbox, temporary_dir(
    ) as checkpoint_root, temporary_dir() as log_dir:

        path = TaskPath(root=checkpoint_root, task_id='test', log_dir=log_dir)

        touch(os.path.join(sandbox, 'test_file1'))
        touch(os.path.join(sandbox, 'test_file2'))
        safe_mkdir(
            os.path.dirname(path.given(state='finished').getpath('task_path')))
        safe_mkdir(os.path.dirname(path.getpath('runner_checkpoint')))
        touch(path.given(state='finished').getpath('task_path'))

        header = RunnerHeader(task_id='test', sandbox=sandbox, log_dir=log_dir)
        ckpt = TaskRunnerHelper.open_checkpoint(
            path.getpath('runner_checkpoint'))
        ckpt.write(RunnerCkpt(runner_header=header))
        ckpt.close()

        gc = TaskGarbageCollector(checkpoint_root, task_id='test')
        assert gc._state.header.log_dir == log_dir
        assert gc._state.header.sandbox == sandbox

        # erase metadata
        gc.erase_metadata()
        safe_delete.assert_has_calls([
            call(path.given(state='finished').getpath('task_path')),
            call(path.getpath('runner_checkpoint'))
        ],
                                     any_order=True)
        safe_rmtree.assert_has_calls([call(path.getpath('checkpoint_path'))])

        safe_delete.reset_mock()
        safe_rmtree.reset_mock()

        # erase logs
        gc.erase_logs()
        safe_rmtree.assert_has_calls([call(log_dir)])

        safe_delete.reset_mock()
        safe_rmtree.reset_mock()

        # erase sandbox
        gc.erase_data()

        safe_delete.assert_has_calls([
            call(os.path.join(sandbox, 'test_file1')),
            call(os.path.join(sandbox, 'test_file2'))
        ],
                                     any_order=True)
        safe_rmtree.assert_has_calls([call(sandbox)])
    def test_sample_by_process_no_process(self, mock_get_active_processes):
        task_path = TaskPath(root='.')

        task_monitor = TaskMonitor(task_path, 'fake-task-id')
        mock_get_active_processes.return_value = []

        task_resource_monitor = TaskResourceMonitor(task_monitor, '.')

        with self.assertRaises(ValueError):
            task_resource_monitor.sample_by_process('fake-process-name')

        assert mock_get_active_processes.mock_calls == [
            mock.call(task_monitor)
        ]
def test_simple_process():
    with temporary_dir() as td:
        taskpath = TaskPath(root=td, task_id='task', process='process', run=0)
        sandbox = setup_sandbox(td, taskpath)

        p = TestProcess('process', 'echo hello world', 0, taskpath, sandbox)
        p.start()
        rc = wait_for_rc(taskpath.getpath('process_checkpoint'))

        assert rc == 0
        stdout = taskpath.with_filename('stdout').getpath('process_logdir')
        assert os.path.exists(stdout)
        with open(stdout, 'r') as fp:
            assert fp.read() == 'hello world\n'
 def run_with_class(process_class):
     with temporary_dir() as td:
         taskpath = TaskPath(root=td,
                             task_id='task',
                             process='process',
                             run=0)
         sandbox = setup_sandbox(td, taskpath)
         with open(os.path.join(sandbox, 'silly_pants'),
                   'w') as silly_pants:
             p = process_class('process',
                               'echo test >&%s' % silly_pants.fileno(), 0,
                               taskpath, sandbox)
             p.start()
             return wait_for_rc(taskpath.getpath('process_checkpoint'))
Example #19
0
def test_empty_garbage_collector():
    with temporary_dir() as checkpoint_root:
        path = TaskPath(root=checkpoint_root, task_id='does_not_exist')
        gc = TaskGarbageCollector(checkpoint_root, 'does_not_exist')

        assert gc.get_age() == 0

        # assume runner, finished task
        assert set(gc.get_metadata(with_size=False)) == set([
            path.getpath('runner_checkpoint'),
            path.given(state='finished').getpath('task_path'),
        ])

        assert list(gc.get_logs()) == []
        assert list(gc.get_data()) == []
Example #20
0
  def __init__(self, root, task_id):
    """Construct a TaskMonitor.

    :param root: The checkpoint root of the task.
    :param task_id: The task id of the task.
    """
    pathspec = TaskPath(root=root, task_id=task_id)
    self._dispatcher = CheckpointDispatcher()
    self._runnerstate = RunnerState(processes={})
    self._runner_ckpt = pathspec.getpath('runner_checkpoint')
    self._active_file, self._finished_file = (pathspec.given(state=state).getpath('task_path')
        for state in ('active', 'finished'))
    self._ckpt_head = 0
    self._apply_states()
    self._lock = threading.Lock()
def test_log_permissions():
    with temporary_dir() as td:
        taskpath = TaskPath(root=td, task_id='task', process='process', run=0)
        sandbox = setup_sandbox(td, taskpath)

        p = TestProcess('process', 'echo hello world', 0, taskpath, sandbox)
        p.start()
        wait_for_rc(taskpath.getpath('process_checkpoint'))

        stdout = taskpath.with_filename('stdout').getpath('process_logdir')
        stderr = taskpath.with_filename('stderr').getpath('process_logdir')
        assert os.path.exists(stdout)
        assert os.path.exists(stderr)
        assert os.stat(stdout).st_uid == os.getuid()
        assert os.stat(stderr).st_uid == os.getuid()
Example #22
0
  def start(self, timeout=MAX_WAIT):
    """Fork the task runner and return once the underlying task is running, up to timeout."""
    self.forking.set()

    self._monitor = TaskMonitor(TaskPath(root=self._checkpoint_root), self._task_id)

    cmdline_args = self._cmdline()
    log.info('Forking off runner with cmdline: %s' % ' '.join(cmdline_args))

    cwd = os.environ.get('MESOS_DIRECTORY')
    try:
      self._popen = subprocess.Popen(cmdline_args, cwd=cwd)
    except OSError as e:
      raise TaskError(e)

    self.forked.set()

    self.wait_start(timeout=timeout)
Example #23
0
 def setup_task(self, task, root, finished=False, corrupt=False):
   """Set up the checkpoint stream for the given task in the given checkpoint root, optionally
   finished and/or with a corrupt stream"""
   class FastTaskRunner(TaskRunner):
     COORDINATOR_INTERVAL_SLEEP = Amount(1, Time.MILLISECONDS)
   tr = FastTaskRunner(
       task=task,
       checkpoint_root=root,
       sandbox=os.path.join(root, 'sandbox', task.name().get()),
       clock=ThreadedClock(time.time()))
   with tr.control():
     # initialize checkpoint stream
     pass
   if finished:
     tr.kill()
   if corrupt:
     ckpt_file = TaskPath(root=root, tr=tr.task_id).getpath('runner_checkpoint')
     with open(ckpt_file, 'w') as f:
       f.write("definitely not a valid checkpoint stream")
   return tr.task_id
def test_simple_process_other_user(*args):
    with temporary_dir() as td:
        some_user = get_other_nonroot_user()
        taskpath = TaskPath(root=td, task_id='task', process='process', run=0)
        sandbox = setup_sandbox(td, taskpath)

        p = TestProcess('process',
                        'echo hello world',
                        0,
                        taskpath,
                        sandbox,
                        user=some_user.pw_name)
        p.start()
        wait_for_rc(taskpath.getpath('process_checkpoint'))

        # since we're not actually root, the best we can do is check the right things were attempted
        assert os.setgroups.calledwith(
            [g.gr_gid for g in grp.getgrall() if some_user.pw_name in g])
        assert os.setgid.calledwith(some_user.pw_uid)
        assert os.setuid.calledwith(some_user.pw_gid)
    def test_sample_by_process(self, mock_get_active_processes, mock_sample):
        fake_process_name = 'fake-process-name'
        task_path = TaskPath(root='.')
        task_monitor = TaskMonitor(task_path, 'fake-task-id')
        fake_process_status = ProcessStatus(process=fake_process_name)
        mock_get_active_processes.return_value = [(fake_process_status, 1)]
        fake_process_sample = ProcessSample.empty()
        mock_sample.return_value = fake_process_sample

        task_resource_monitor = TaskResourceMonitor(task_monitor, '.')

        assert fake_process_sample == task_resource_monitor.sample_by_process(
            fake_process_name)
        assert mock_get_active_processes.mock_calls == [
            mock.call(task_monitor)
        ]
        assert mock_sample.mock_calls == [
            mock.call(
                task_resource_monitor._process_collectors[fake_process_status])
        ]
def test_log_permissions_other_user(*mocks):
    with temporary_dir() as td:
        some_user = get_other_nonroot_user()
        taskpath = TaskPath(root=td, task_id='task', process='process', run=0)
        sandbox = setup_sandbox(td, taskpath)

        p = TestProcess('process',
                        'echo hello world',
                        0,
                        taskpath,
                        sandbox,
                        user=some_user.pw_name)
        p.start()
        wait_for_rc(taskpath.getpath('process_checkpoint'))

        # since we're not actually root, the best we can do is check the right things were attempted
        stdout = taskpath.with_filename('stdout').getpath('process_logdir')
        stderr = taskpath.with_filename('stderr').getpath('process_logdir')
        assert os.path.exists(stdout)
        assert os.path.exists(stderr)
        assert os.chown.calledwith(stdout, some_user.pw_uid, some_user.pw_gid)
        assert os.chown.calledwith(stderr, some_user.pw_uid, some_user.pw_gid)
Example #27
0
  def __init__(self, task, success_rate=100, random_seed=31337, **extra_task_runner_args):
    """
      task = Thermos task
      portmap = port map
      success_rate = success rate of writing checkpoint to disk
    """
    self.task = task

    with temporary_file(cleanup=False) as fp:
      self.job_filename = fp.name
      fp.write(ThermosTaskWrapper(task).to_json())

    self.state_filename = tempfile.mktemp()
    self.tempdir = tempfile.mkdtemp()
    self.task_id = '%s-runner-base' % int(time.time() * 1000000)
    self.sandbox = os.path.join(self.tempdir, 'sandbox')
    self.extra_task_runner_args = extra_task_runner_args
    self.cleaned = False
    self.pathspec = TaskPath(root=self.tempdir, task_id=self.task_id)
    self.script_filename = None
    self.success_rate = success_rate
    self.random_seed = random_seed
    self._run_count = 0
  def start(self, timeout=MAX_WAIT):
    """Fork the task runner and return once the underlying task is running, up to timeout."""
    self.forking.set()

    try:
      chmod_plus_x(self._runner_pex)
    except OSError as e:
      if e.errno != errno.EPERM:
        raise TaskError('Failed to chmod +x runner: %s' % e)

    self._monitor = TaskMonitor(TaskPath(root=self._checkpoint_root), self._task_id)

    cmdline_args = self._cmdline()
    log.info('Forking off runner with cmdline: %s' % ' '.join(cmdline_args))

    try:
      self._popen = subprocess.Popen(cmdline_args)
    except OSError as e:
      raise TaskError(e)

    self.forked.set()

    log.debug('Waiting for task to start.')

    def is_started():
      return self._monitor and (self._monitor.active or self._monitor.finished)

    waited = Amount(0, Time.SECONDS)
    while not is_started() and waited < timeout:
      log.debug('  - sleeping...')
      self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
      waited += self.POLL_INTERVAL

    if not is_started():
      log.error('Task did not start with in deadline, forcing loss.')
      self.lose()
      raise TaskError('Task did not start within deadline.')
Example #29
0
 def _runner_ckpt(self, task_id):
     """Return the runner checkpoint file for a given task_id."""
     return TaskPath(root=self._checkpoint_root,
                     task_id=task_id).getpath('runner_checkpoint')
Example #30
0
    def __init__(self,
                 task,
                 checkpoint_root,
                 sandbox,
                 log_dir=None,
                 task_id=None,
                 portmap=None,
                 user=None,
                 chroot=False,
                 clock=time,
                 universal_handler=None,
                 planner_class=TaskPlanner,
                 hostname=None,
                 process_logger_destination=None,
                 process_logger_mode=None,
                 rotate_log_size_mb=None,
                 rotate_log_backups=None,
                 preserve_env=False):
        """
      required:
        task (config.Task) = the task to run
        checkpoint_root (path) = the checkpoint root
        sandbox (path) = the sandbox in which the path will be run
                         [if None, cwd will be assumed, but garbage collection will be
                          disabled for this task.]

      optional:
        log_dir (string)  = directory to house stdout/stderr logs. If not specified, logs will be
                            written into the sandbox directory under .logs/
        task_id (string)  = bind to this task id.  if not specified, will synthesize an id based
                            upon task.name()
        portmap (dict)    = a map (string => integer) from name to port, e.g. { 'http': 80 }
        user (string)     = the user to run the task as.  if not current user, requires setuid
                            privileges.
        chroot (boolean)  = whether or not to chroot into the sandbox prior to exec.
        clock (time interface) = the clock to use throughout
        universal_handler = checkpoint record handler (only used for testing)
        planner_class (TaskPlanner class) = TaskPlanner class to use for constructing the task
                            planning policy.
        process_logger_destination (string) = The destination of logger to use for all processes.
        process_logger_mode (string) = The mode of logger to use for all processes.
        rotate_log_size_mb (integer) = The maximum size of the rotated stdout/stderr logs in MiB.
        rotate_log_backups (integer) = The maximum number of rotated stdout/stderr log backups.
        preserve_env (boolean) = whether or not env variables for the runner should be in the
                                 env for the task being run
    """
        if not issubclass(planner_class, TaskPlanner):
            raise TypeError('planner_class must be a TaskPlanner.')
        self._clock = clock
        launch_time = self._clock.time()
        launch_time_ms = '%06d' % int(
            (launch_time - int(launch_time)) * (10**6))
        if not task_id:
            self._task_id = '%s-%s.%s' % (
                task.name(),
                time.strftime('%Y%m%d-%H%M%S',
                              time.localtime(launch_time)), launch_time_ms)
        else:
            self._task_id = task_id
        current_user = TaskRunnerHelper.get_actual_user()
        self._user = user or current_user
        # TODO(wickman) This should be delegated to the ProcessPlatform / Helper
        if self._user != current_user:
            if os.geteuid() != 0:
                raise ValueError(
                    'task specifies user as %s, but %s does not have setuid permission!'
                    % (self._user, current_user))
        self._portmap = portmap or {}
        self._launch_time = launch_time
        self._log_dir = log_dir or os.path.join(sandbox, '.logs')
        self._process_logger_destination = process_logger_destination
        self._process_logger_mode = process_logger_mode
        self._rotate_log_size_mb = rotate_log_size_mb
        self._rotate_log_backups = rotate_log_backups
        self._pathspec = TaskPath(root=checkpoint_root,
                                  task_id=self._task_id,
                                  log_dir=self._log_dir)
        self._hostname = hostname or socket.gethostname()
        try:
            ThermosTaskValidator.assert_valid_task(task)
            ThermosTaskValidator.assert_valid_ports(task, self._portmap)
        except ThermosTaskValidator.InvalidTaskError as e:
            raise self.InvalidTask('Invalid task: %s' % e)
        context = ThermosContext(task_id=self._task_id,
                                 ports=self._portmap,
                                 user=self._user)
        self._task, uninterp = (task %
                                Environment(thermos=context)).interpolate()
        if len(uninterp) > 0:
            raise self.InvalidTask('Failed to interpolate task, missing: %s' %
                                   ', '.join(str(ref) for ref in uninterp))
        try:
            ThermosTaskValidator.assert_same_task(self._pathspec, self._task)
        except ThermosTaskValidator.InvalidTaskError as e:
            raise self.InvalidTask('Invalid task: %s' % e)
        self._plan = None  # plan currently being executed (updated by Handlers)
        self._regular_plan = planner_class(
            self._task,
            clock=clock,
            process_filter=lambda proc: proc.final().get() is False)
        self._finalizing_plan = planner_class(
            self._task,
            clock=clock,
            process_filter=lambda proc: proc.final().get() is True)
        self._chroot = chroot
        self._sandbox = sandbox
        self._terminal_state = None
        self._ckpt = None
        self._process_map = dict(
            (p.name().get(), p) for p in self._task.processes())
        self._task_processes = {}
        self._stages = dict(
            (state, stage(self)) for state, stage in self.STAGES.items())
        self._finalization_start = None
        self._preemption_deadline = None
        self._watcher = ProcessMuxer(self._pathspec)
        self._state = RunnerState(processes={})
        self._preserve_env = preserve_env

        # create runner state
        universal_handler = universal_handler or TaskRunnerUniversalHandler
        self._dispatcher = CheckpointDispatcher()
        self._dispatcher.register_handler(universal_handler(self))
        self._dispatcher.register_handler(TaskRunnerProcessHandler(self))
        self._dispatcher.register_handler(TaskRunnerTaskHandler(self))

        # recover checkpointed runner state and update plan
        self._recovery = True
        self._replay_runner_ckpt()