예제 #1
0
def read(args, options):
  """Replay a thermos checkpoint.

  Usage: thermos read [options] checkpoint_filename
  Options:
    --simple	Do not replay the full task state machine.  Only print out the contents of
                each checkpoint log message.
  """
  if len(args) != 1:
    app.error('Expected one checkpoint file, got %s' % len(args))
  if not os.path.exists(args[0]):
    app.error('Could not find %s' % args[0])

  dispatcher = CheckpointDispatcher()
  state = RunnerState(processes={})
  with open(args[0], 'r') as fp:
    try:
      for record in ThriftRecordReader(fp, RunnerCkpt):
        if not options.simple:
          dispatcher.dispatch(state, record)
        else:
          print('CKPT: %s' % record)
    except RecordIO.Error as err:
      print("Failed to recover from %s: %s" % (fp.name, err))
      return

  if not options.simple:
    if state is None or state.header is None:
      print('Checkpoint stream CORRUPT or outdated format')
      return
    print('Recovered Task Header:')
    print('  id:      %s' % state.header.task_id)
    print('  user:    %s' % state.header.user)
    print('  host:    %s' % state.header.hostname)
    print('  sandbox: %s' % state.header.sandbox)
    if state.header.ports:
      print('  ports:   %s' % ' '.join(
        '%s->%s' % (name, port) for (name, port) in state.header.ports.items()))
    print('Recovered Task States:')
    for task_status in state.statuses:
      print('  %s [pid: %d] => %s' % (
        time.asctime(time.localtime(task_status.timestamp_ms / 1000.0)),
        task_status.runner_pid,
        TaskState._VALUES_TO_NAMES[task_status.state]))
    print('Recovered Processes:')
    for process, process_history in state.processes.items():
      print('  %s   runs: %s' % (process, len(process_history)))
      for k in reversed(range(len(process_history))):
        run = process_history[k]
        print('    %2d: pid=%d, rc=%s, finish:%s, state:%s' % (
          k,
          run.pid,
          run.return_code if run.return_code is not None else '',
          time.asctime(time.localtime(run.stop_time)) if run.stop_time else 'None',
          ProcessState._VALUES_TO_NAMES.get(run.state, 'Unknown')))
예제 #2
0
 def __init__(self, pathspec, task_id):
     self._task_id = task_id
     self._dispatcher = CheckpointDispatcher()
     self._runnerstate = RunnerState(processes={})
     self._runner_ckpt = pathspec.given(
         task_id=task_id).getpath('runner_checkpoint')
     self._active_file, self._finished_file = (pathspec.given(
         task_id=task_id, state=state).getpath('task_path')
                                               for state in ('active',
                                                             'finished'))
     self._ckpt_head = 0
     self._apply_states()
     self._lock = threading.Lock()
예제 #3
0
  def __init__(self, root, task_id):
    """Construct a TaskMonitor.

    :param root: The checkpoint root of the task.
    :param task_id: The task id of the task.
    """
    pathspec = TaskPath(root=root, task_id=task_id)
    self._dispatcher = CheckpointDispatcher()
    self._runnerstate = RunnerState(processes={})
    self._runner_ckpt = pathspec.getpath('runner_checkpoint')
    self._active_file, self._finished_file = (pathspec.given(state=state).getpath('task_path')
        for state in ('active', 'finished'))
    self._ckpt_head = 0
    self._apply_states()
    self._lock = threading.Lock()
def main(args):
  values = app.get_options()

  if len(args) > 0:
    print("ERROR: unrecognized arguments: %s\n" % (" ".join(args)), file=sys.stderr)
    app.help()
    sys.exit(1)

  if not values.ckpt:
    print("ERROR: must supply --checkpoint", file=sys.stderr)
    app.help()
    sys.exit(1)

  fp = file(values.ckpt, "r")
  rr = ThriftRecordReader(fp, RunnerCkpt)
  wrs = RunnerState(processes={})
  dispatcher = CheckpointDispatcher()
  try:
    for wts in rr:
      print('Recovering: %s' % wts)
      if values.assemble is True:
        dispatcher.dispatch(wrs, wts)
  except RecordIO.Error as err:
    print('Error recovering checkpoint stream: %s' % err, file=sys.stderr)
    return
  print('\n\n\n')
  if values.assemble:
    print('Recovered Task Header')
    pprint.pprint(wrs.header, indent=4)

    print('\nRecovered Task States')
    for task_status in wrs.statuses:
      print('  %s [pid: %d] => %s' % (
          time.asctime(time.localtime(task_status.timestamp_ms / 1000.0)),
          task_status.runner_pid,
          TaskState._VALUES_TO_NAMES[task_status.state]))

    print('\nRecovered Processes')
    pprint.pprint(wrs.processes, indent=4)
예제 #5
0
    def __init__(self,
                 task,
                 checkpoint_root,
                 sandbox,
                 log_dir=None,
                 task_id=None,
                 portmap=None,
                 user=None,
                 chroot=False,
                 clock=time,
                 universal_handler=None,
                 planner_class=TaskPlanner,
                 hostname=None,
                 process_logger_destination=None,
                 process_logger_mode=None,
                 rotate_log_size_mb=None,
                 rotate_log_backups=None,
                 preserve_env=False):
        """
      required:
        task (config.Task) = the task to run
        checkpoint_root (path) = the checkpoint root
        sandbox (path) = the sandbox in which the path will be run
                         [if None, cwd will be assumed, but garbage collection will be
                          disabled for this task.]

      optional:
        log_dir (string)  = directory to house stdout/stderr logs. If not specified, logs will be
                            written into the sandbox directory under .logs/
        task_id (string)  = bind to this task id.  if not specified, will synthesize an id based
                            upon task.name()
        portmap (dict)    = a map (string => integer) from name to port, e.g. { 'http': 80 }
        user (string)     = the user to run the task as.  if not current user, requires setuid
                            privileges.
        chroot (boolean)  = whether or not to chroot into the sandbox prior to exec.
        clock (time interface) = the clock to use throughout
        universal_handler = checkpoint record handler (only used for testing)
        planner_class (TaskPlanner class) = TaskPlanner class to use for constructing the task
                            planning policy.
        process_logger_destination (string) = The destination of logger to use for all processes.
        process_logger_mode (string) = The mode of logger to use for all processes.
        rotate_log_size_mb (integer) = The maximum size of the rotated stdout/stderr logs in MiB.
        rotate_log_backups (integer) = The maximum number of rotated stdout/stderr log backups.
        preserve_env (boolean) = whether or not env variables for the runner should be in the
                                 env for the task being run
    """
        if not issubclass(planner_class, TaskPlanner):
            raise TypeError('planner_class must be a TaskPlanner.')
        self._clock = clock
        launch_time = self._clock.time()
        launch_time_ms = '%06d' % int(
            (launch_time - int(launch_time)) * (10**6))
        if not task_id:
            self._task_id = '%s-%s.%s' % (
                task.name(),
                time.strftime('%Y%m%d-%H%M%S',
                              time.localtime(launch_time)), launch_time_ms)
        else:
            self._task_id = task_id
        current_user = TaskRunnerHelper.get_actual_user()
        self._user = user or current_user
        # TODO(wickman) This should be delegated to the ProcessPlatform / Helper
        if self._user != current_user:
            if os.geteuid() != 0:
                raise ValueError(
                    'task specifies user as %s, but %s does not have setuid permission!'
                    % (self._user, current_user))
        self._portmap = portmap or {}
        self._launch_time = launch_time
        self._log_dir = log_dir or os.path.join(sandbox, '.logs')
        self._process_logger_destination = process_logger_destination
        self._process_logger_mode = process_logger_mode
        self._rotate_log_size_mb = rotate_log_size_mb
        self._rotate_log_backups = rotate_log_backups
        self._pathspec = TaskPath(root=checkpoint_root,
                                  task_id=self._task_id,
                                  log_dir=self._log_dir)
        self._hostname = hostname or socket.gethostname()
        try:
            ThermosTaskValidator.assert_valid_task(task)
            ThermosTaskValidator.assert_valid_ports(task, self._portmap)
        except ThermosTaskValidator.InvalidTaskError as e:
            raise self.InvalidTask('Invalid task: %s' % e)
        context = ThermosContext(task_id=self._task_id,
                                 ports=self._portmap,
                                 user=self._user)
        self._task, uninterp = (task %
                                Environment(thermos=context)).interpolate()
        if len(uninterp) > 0:
            raise self.InvalidTask('Failed to interpolate task, missing: %s' %
                                   ', '.join(str(ref) for ref in uninterp))
        try:
            ThermosTaskValidator.assert_same_task(self._pathspec, self._task)
        except ThermosTaskValidator.InvalidTaskError as e:
            raise self.InvalidTask('Invalid task: %s' % e)
        self._plan = None  # plan currently being executed (updated by Handlers)
        self._regular_plan = planner_class(
            self._task,
            clock=clock,
            process_filter=lambda proc: proc.final().get() is False)
        self._finalizing_plan = planner_class(
            self._task,
            clock=clock,
            process_filter=lambda proc: proc.final().get() is True)
        self._chroot = chroot
        self._sandbox = sandbox
        self._terminal_state = None
        self._ckpt = None
        self._process_map = dict(
            (p.name().get(), p) for p in self._task.processes())
        self._task_processes = {}
        self._stages = dict(
            (state, stage(self)) for state, stage in self.STAGES.items())
        self._finalization_start = None
        self._preemption_deadline = None
        self._watcher = ProcessMuxer(self._pathspec)
        self._state = RunnerState(processes={})
        self._preserve_env = preserve_env

        # create runner state
        universal_handler = universal_handler or TaskRunnerUniversalHandler
        self._dispatcher = CheckpointDispatcher()
        self._dispatcher.register_handler(universal_handler(self))
        self._dispatcher.register_handler(TaskRunnerProcessHandler(self))
        self._dispatcher.register_handler(TaskRunnerTaskHandler(self))

        # recover checkpointed runner state and update plan
        self._recovery = True
        self._replay_runner_ckpt()
예제 #6
0
    def inspect(self, task_id):
        """
      Reconstructs the checkpoint stream and returns a CheckpointInspection.
    """
        dispatcher = CheckpointDispatcher()
        state = RunnerState(processes={})
        muxer = ProcessMuxer(self._path.given(task_id=task_id))

        runner_processes = []
        coordinator_processes = set()
        processes = set()

        def consume_process_record(record):
            if not record.process_status:
                return
            try:
                user_uid = pwd.getpwnam(state.header.user).pw_uid
            except KeyError:
                log.error('Could not find user: %s' % state.header.user)
                return
            if record.process_status.state == ProcessState.FORKED:
                coordinator_processes.add(
                    (record.process_status.coordinator_pid, user_uid,
                     record.process_status.fork_time))
            elif record.process_status.state == ProcessState.RUNNING:
                processes.add((record.process_status.pid, user_uid,
                               record.process_status.start_time))

        # replay runner checkpoint
        runner_pid = None
        runner_latest_update = 0
        try:
            with open(
                    self._path.given(
                        task_id=task_id).getpath('runner_checkpoint')) as fp:
                with closing(ThriftRecordReader(fp, RunnerCkpt)) as ckpt:
                    for record in ckpt:
                        dispatcher.dispatch(state, record)
                        runner_latest_update = max(
                            runner_latest_update,
                            self.get_timestamp(record.process_status))
                        # collect all bound runners
                        if record.task_status:
                            if record.task_status.runner_pid != runner_pid:
                                runner_processes.append(
                                    (record.task_status.runner_pid,
                                     record.task_status.runner_uid
                                     or 0, record.task_status.timestamp_ms))
                                runner_pid = record.task_status.runner_pid
                        elif record.process_status:
                            consume_process_record(record)
        except (IOError, OSError, RecordIO.Error) as err:
            log.debug('Error inspecting task runner checkpoint: %s' % err)
            return

        # register existing processes in muxer
        for process_name in state.processes:
            muxer.register(process_name)

        # read process checkpoints
        process_latest_update = runner_latest_update
        for record in muxer.select():
            process_latest_update = max(
                process_latest_update,
                self.get_timestamp(record.process_status))
            consume_process_record(record)

        return CheckpointInspection(
            runner_latest_update=runner_latest_update,
            process_latest_update=process_latest_update,
            runner_processes=runner_processes,
            coordinator_processes=coordinator_processes,
            processes=processes)