Ejemplo n.º 1
0
def read(args, options):
    """Replay a thermos checkpoint.

  Usage: thermos read [options] checkpoint_filename
  Options:
    --simple	Do not replay the full task state machine.  Only print out the contents of
                each checkpoint log message.
  """
    # NOTE(review): the docstring above looks like it doubles as the command's
    # usage text, so its wording and layout are intentionally left untouched.
    if len(args) != 1:
        app.error('Expected one checkpoint file, got %s' % len(args))
    if not os.path.exists(args[0]):
        app.error('Could not find %s' % args[0])

    dispatcher = CheckpointDispatcher()
    state = RunnerState(processes={})
    # The file is consumed record-by-record through ThriftRecordReader, so it is
    # opened in binary mode: text mode would decode the bytes and translate
    # newlines on interpreters/platforms where 'r' is not byte-transparent.
    with open(args[0], 'rb') as fp:
        try:
            for record in ThriftRecordReader(fp, RunnerCkpt):
                if options.simple:
                    # --simple: dump each record verbatim, do not rebuild state.
                    print('CKPT: %s' % record)
                else:
                    dispatcher.dispatch(state, record)
        except RecordIO.Error as err:
            print("Failed to recover from %s: %s" % (fp.name, err))
            return

    if options.simple:
        return

    # Without a header the replayed state cannot be summarized.
    if state is None or state.header is None:
        print('Checkpoint stream CORRUPT or outdated format')
        return

    header = state.header
    print('Recovered Task Header:')
    print('  id:      %s' % header.task_id)
    print('  user:    %s' % header.user)
    print('  host:    %s' % header.hostname)
    print('  sandbox: %s' % header.sandbox)
    if header.ports:
        print('  ports:   %s' %
              ' '.join('%s->%s' % (name, port)
                       for (name, port) in header.ports.items()))

    print('Recovered Task States:')
    for task_status in state.statuses:
        # timestamp_ms is divided by 1000.0 to get seconds for time.localtime().
        when = time.asctime(time.localtime(task_status.timestamp_ms / 1000.0))
        print('  %s [pid: %d] => %s' %
              (when, task_status.runner_pid,
               TaskState._VALUES_TO_NAMES[task_status.state]))

    print('Recovered Processes:')
    for process, process_history in state.processes.items():
        print('  %s   runs: %s' % (process, len(process_history)))
        # Print the runs from the highest index down to run 0.
        for k, run in reversed(list(enumerate(process_history))):
            rc = run.return_code if run.return_code is not None else ''
            if run.stop_time:
                finish = time.asctime(time.localtime(run.stop_time))
            else:
                finish = 'None'
            print('    %2d: pid=%d, rc=%s, finish:%s, state:%s' %
                  (k, run.pid, rc, finish,
                   ProcessState._VALUES_TO_NAMES.get(run.state, 'Unknown')))
Ejemplo n.º 2
0
 def __init__(self, pathspec, task_id):
     """Initialize checkpoint-tracking state for a single task.

     `pathspec` is bound to `task_id` to resolve the 'runner_checkpoint' path
     and the 'task_path' for the 'active' and 'finished' states; the current
     states are then applied once via self._apply_states().
     """
     self._task_id = task_id
     self._dispatcher = CheckpointDispatcher()
     self._runnerstate = RunnerState(processes={})

     bound_to_task = pathspec.given(task_id=task_id)
     self._runner_ckpt = bound_to_task.getpath('runner_checkpoint')

     # One 'task_path' per task state, resolved in the order (active, finished).
     state_paths = [
         pathspec.given(task_id=task_id, state=state_name).getpath('task_path')
         for state_name in ('active', 'finished')
     ]
     self._active_file, self._finished_file = state_paths

     # Starts at zero before the first _apply_states() call.
     self._ckpt_head = 0
     # NOTE(review): _apply_states() runs before self._lock is created; confirm
     # it never needs the lock during construction.
     self._apply_states()
     self._lock = threading.Lock()
Ejemplo n.º 3
0
def main(args):
    """Dump a runner checkpoint stream and optionally the state it assembles to.

    Reads the checkpoint named by the ``ckpt`` option, prints every record, and,
    when the ``assemble`` option is set, dispatches each record into a
    RunnerState and prints the recovered header, task states and processes.

    Exits with status 1 (after printing help) if positional arguments are given
    or no checkpoint was supplied.  Returns early if the stream cannot be read.
    """
    values = app.get_options()

    if len(args) > 0:
        # Message plus a blank line, matching the previous output.
        sys.stderr.write("ERROR: unrecognized arguments: %s\n\n" % " ".join(args))
        app.help()
        sys.exit(1)

    if not values.ckpt:
        sys.stderr.write("ERROR: must supply --checkpoint\n")
        app.help()
        sys.exit(1)

    wrs = RunnerState(processes={})
    dispatcher = CheckpointDispatcher()
    # The context manager closes the file on every exit path; binary mode because
    # the stream is consumed record-by-record through ThriftRecordReader.
    with open(values.ckpt, "rb") as fp:
        try:
            for wts in ThriftRecordReader(fp, RunnerCkpt):
                # Single-argument print() produces identical text on Python 2 and 3.
                print('Recovering:  %s' % (wts,))
                if values.assemble is True:
                    dispatcher.dispatch(wrs, wts)
        except RecordIO.Error as err:
            print('Error recovering checkpoint stream: %s' % err)
            return

    print('\n\n\n')
    if values.assemble:
        print('Recovered Task Header')
        pprint.pprint(wrs.header, indent=4)

        print('\nRecovered Task States')
        for task_status in wrs.statuses:
            # timestamp_ms is divided by 1000.0 to get seconds for time.localtime().
            when = time.asctime(time.localtime(task_status.timestamp_ms / 1000.0))
            print('  %s [pid: %d] => %s' % (
                when, task_status.runner_pid,
                TaskState._VALUES_TO_NAMES[task_status.state]))

        print('\nRecovered Processes')
        pprint.pprint(wrs.processes, indent=4)
Ejemplo n.º 4
0
  def __init__(self, task, checkpoint_root, sandbox, log_dir=None,
               task_id=None, portmap=None, user=None, chroot=False, clock=time,
               universal_handler=None, planner_class=TaskPlanner):
    """
      Build a runner for `task` and replay any previously checkpointed runner state.

      required:
        task (config.Task) = the task to run
        checkpoint_root (path) = the checkpoint root
        sandbox (path) = the sandbox in which the task will be run; also the parent of the
                         default log directory.  NOTE(review): earlier documentation says None
                         means "use cwd, with garbage collection disabled" -- not verifiable
                         from this constructor alone.

      optional:
        log_dir (string)  = directory for stdout/stderr logs; defaults to <sandbox>/.logs
        task_id (string)  = bind to this task id; when omitted an id is synthesized from
                            task.name() and the launch time
        portmap (dict)    = a map (string => integer) from name to port, e.g. { 'http': 80 }
        user (string)     = the user to run the task as; defaults to the current user.  Asking
                            for a different user when the effective uid is not 0 raises
                            ValueError.
        chroot (boolean)  = whether or not to chroot into the sandbox prior to exec.
        clock (time interface) = the clock to use throughout
        universal_handler = checkpoint record handler (only used for testing)
        planner_class (TaskPlanner class) = TaskPlanner class used to build the regular and
                            finalizing plans.

      raises:
        TypeError        if planner_class is not a TaskPlanner subclass
        ValueError       if another user is requested without the needed privileges
        self.InvalidTask if validation or interpolation of the task fails
    """
    if not issubclass(planner_class, TaskPlanner):
      raise TypeError('planner_class must be a TaskPlanner.')

    self._clock = clock
    launch_time = self._clock.time()
    # Sub-second part of the launch time, zero-padded to six digits.
    sub_second = launch_time - int(launch_time)
    launch_time_ms = '%06d' % int(sub_second * 10**6)
    if task_id:
      self._task_id = task_id
    else:
      task_name = task.name()
      launch_stamp = time.strftime('%Y%m%d-%H%M%S', time.localtime(launch_time))
      self._task_id = '%s-%s.%s' % (task_name, launch_stamp, launch_time_ms)

    current_user = TaskRunnerHelper.get_actual_user()
    self._user = user or current_user
    # TODO(wickman) This should be delegated to the ProcessPlatform / Helper
    if self._user != current_user and os.geteuid() != 0:
      raise ValueError('task specifies user as %s, but %s does not have setuid permission!' % (
        self._user, current_user))

    self._portmap = portmap or {}
    self._launch_time = launch_time
    if log_dir:
      self._log_dir = log_dir
    else:
      self._log_dir = os.path.join(sandbox, '.logs')
    self._pathspec = TaskPath(root=checkpoint_root, task_id=self._task_id, log_dir=self._log_dir)

    # Validate the raw task and its ports before interpolation.
    try:
      ThermosTaskValidator.assert_valid_task(task)
      ThermosTaskValidator.assert_valid_ports(task, self._portmap)
    except ThermosTaskValidator.InvalidTaskError as err:
      raise self.InvalidTask('Invalid task: %s' % err)

    thermos_context = ThermosContext(
        task_id=self._task_id,
        ports=self._portmap,
        user=self._user)
    bound_task = task % Environment(thermos=thermos_context)
    self._task, unresolved = bound_task.interpolate()
    if len(unresolved) > 0:
      raise self.InvalidTask('Failed to interpolate task, missing: %s' %
          ', '.join(str(ref) for ref in unresolved))

    # Validate the interpolated task against the path spec.
    try:
      ThermosTaskValidator.assert_same_task(self._pathspec, self._task)
    except ThermosTaskValidator.InvalidTaskError as err:
      raise self.InvalidTask('Invalid task: %s' % err)

    # The explicit == False / == True comparisons are kept on purpose: they are not
    # interchangeable with plain truthiness for every possible value.
    def is_regular(proc):
      return proc.final().get() == False

    def is_final(proc):
      return proc.final().get() == True

    self._plan = None # plan currently being executed (updated by Handlers)
    self._regular_plan = planner_class(self._task, clock=clock, process_filter=is_regular)
    self._finalizing_plan = planner_class(self._task, clock=clock, process_filter=is_final)
    self._chroot = chroot
    self._sandbox = sandbox
    self._terminal_state = None
    self._ckpt = None

    # Map each process name to its process object.
    process_map = {}
    for process in self._task.processes():
      process_map[process.name().get()] = process
    self._process_map = process_map
    self._task_processes = {}

    # Instantiate one stage object per entry in STAGES, each bound to this runner.
    stages = {}
    for stage_state, stage_factory in self.STAGES.items():
      stages[stage_state] = stage_factory(self)
    self._stages = stages

    self._finalization_start = None
    self._preemption_deadline = None
    self._watcher = ProcessMuxer(self._pathspec)
    self._state = RunnerState(processes={})

    # create runner state
    universal_handler = universal_handler or TaskRunnerUniversalHandler
    self._dispatcher = CheckpointDispatcher()
    # Registration order: universal handler, then process handler, then task handler.
    for handler_factory in (universal_handler, TaskRunnerProcessHandler, TaskRunnerTaskHandler):
      self._dispatcher.register_handler(handler_factory(self))

    # recover checkpointed runner state and update plan
    self._recovery = True
    self._replay_runner_ckpt()
Ejemplo n.º 5
0
    def inspect(self, task_id):
        """
        Reconstruct the checkpoint stream for `task_id` and return a CheckpointInspection.

        The runner checkpoint is replayed first, then the records selected from the
        process muxer.  Along the way the bound runners, forked coordinators and running
        processes are collected together with the latest timestamps seen.

        Returns None if the runner checkpoint cannot be opened or read.
        """
        dispatcher = CheckpointDispatcher()
        state = RunnerState(processes={})
        muxer = ProcessMuxer(self._path.given(task_id=task_id))

        runner_processes = []
        coordinator_processes = set()
        processes = set()

        def consume_process_record(record):
            # Record a (pid, uid, time) tuple for a FORKED or RUNNING process status.
            status = record.process_status
            if not status:
                return
            try:
                user_uid = pwd.getpwnam(state.header.user).pw_uid
            except KeyError:
                log.error('Could not find user: %s' % state.header.user)
                return
            if status.state == ProcessState.FORKED:
                coordinator_processes.add(
                    (status.coordinator_pid, user_uid, status.fork_time))
            elif status.state == ProcessState.RUNNING:
                processes.add((status.pid, user_uid, status.start_time))

        # replay runner checkpoint
        runner_pid = None
        runner_latest_update = 0
        try:
            ckpt_path = self._path.given(task_id=task_id).getpath('runner_checkpoint')
            with open(ckpt_path) as fp:
                with closing(ThriftRecordReader(fp, RunnerCkpt)) as ckpt:
                    for record in ckpt:
                        dispatcher.dispatch(state, record)
                        record_time = self.get_timestamp(record.process_status)
                        runner_latest_update = max(runner_latest_update, record_time)
                        # collect all bound runners
                        task_status = record.task_status
                        if task_status:
                            # Only record a runner when its pid differs from the last one seen.
                            if task_status.runner_pid != runner_pid:
                                runner_processes.append(
                                    (task_status.runner_pid,
                                     task_status.runner_uid or 0,
                                     task_status.timestamp_ms))
                                runner_pid = task_status.runner_pid
                        elif record.process_status:
                            consume_process_record(record)
        except (IOError, OSError, RecordIO.Error) as err:
            log.debug('Error inspecting task runner checkpoint: %s' % err)
            return

        # register existing processes in muxer
        for process_name in state.processes:
            muxer.register(process_name)

        # read process checkpoints
        process_latest_update = runner_latest_update
        for process_record in muxer.select():
            process_time = self.get_timestamp(process_record.process_status)
            process_latest_update = max(process_latest_update, process_time)
            consume_process_record(process_record)

        return CheckpointInspection(
            runner_latest_update=runner_latest_update,
            process_latest_update=process_latest_update,
            runner_processes=runner_processes,
            coordinator_processes=coordinator_processes,
            processes=processes)