Python CheckpointDispatcher Examples, apache.thermos.common.ckpt.CheckpointDispatcher Python Examples

Example #1

0

Show file

File: thermos.py Project: zhanglong2015/incubator-aurora

def read(args, options):
  """Replay a thermos checkpoint.

  Usage: thermos read [options] checkpoint_filename
  Options:
    --simple	Do not replay the full task state machine.  Only print out the contents of
                each checkpoint log message.
  """
  # NOTE(review): the docstring above is left byte-identical because it reads like
  # user-facing command help; confirm before rewording it.
  if len(args) != 1:
    app.error('Expected one checkpoint file, got %s' % len(args))
  if not os.path.exists(args[0]):
    app.error('Could not find %s' % args[0])

  dispatcher = CheckpointDispatcher()
  state = RunnerState(processes={})
  # The checkpoint is a serialized record stream, so read raw bytes rather than text.
  with open(args[0], 'rb') as fp:
    try:
      for record in ThriftRecordReader(fp, RunnerCkpt):
        if options.simple:
          print('CKPT: %s' % record)
        else:
          dispatcher.dispatch(state, record)
    except RecordIO.Error as err:
      print("Failed to recover from %s: %s" % (fp.name, err))
      return

  if options.simple:
    # In simple mode every record was already printed above; nothing was replayed.
    return

  if state is None or state.header is None:
    print('Checkpoint stream CORRUPT or outdated format')
    return

  header = state.header
  print('Recovered Task Header:')
  print('  id:      %s' % header.task_id)
  print('  user:    %s' % header.user)
  print('  host:    %s' % header.hostname)
  print('  sandbox: %s' % header.sandbox)
  if header.ports:
    print('  ports:   %s' % ' '.join(
      '%s->%s' % (name, port) for (name, port) in header.ports.items()))

  print('Recovered Task States:')
  # Guard against an unset status list so a header-only stream does not raise TypeError.
  for task_status in state.statuses or []:
    print('  %s [pid: %d] => %s' % (
      time.asctime(time.localtime(task_status.timestamp_ms / 1000.0)),
      task_status.runner_pid,
      TaskState._VALUES_TO_NAMES[task_status.state]))

  print('Recovered Processes:')
  for process, process_history in state.processes.items():
    print('  %s   runs: %s' % (process, len(process_history)))
    # Most recent run first, keeping each run's original index for display.
    for k, run in reversed(list(enumerate(process_history))):
      # A run may not have a pid recorded; '%d' would raise TypeError on None.
      print('    %2d: pid=%s, rc=%s, finish:%s, state:%s' % (
        k,
        run.pid if run.pid is not None else 'None',
        run.return_code if run.return_code is not None else '',
        time.asctime(time.localtime(run.stop_time)) if run.stop_time else 'None',
        ProcessState._VALUES_TO_NAMES.get(run.state, 'Unknown')))

Example #2

0

Show file

File: monitor.py Project: xuchenw/incubator-aurora

 def __init__(self, pathspec, task_id):
     """Prepare replay state and file paths for ``task_id``, then call ``_apply_states``.

     :param pathspec: Path template; bound to the task through ``given(...)``.
     :param task_id: Id of the task this object tracks.
     """
     self._task_id = task_id
     self._dispatcher = CheckpointDispatcher()
     self._runnerstate = RunnerState(processes={})

     runner_ckpt = pathspec.given(task_id=task_id).getpath('runner_checkpoint')
     self._runner_ckpt = runner_ckpt

     # Resolve both task-file locations first, then bind them in one step.
     task_files = [
         pathspec.given(task_id=task_id, state=task_state).getpath('task_path')
         for task_state in ('active', 'finished')
     ]
     self._active_file, self._finished_file = task_files

     self._ckpt_head = 0
     self._apply_states()
     self._lock = threading.Lock()

Example #3

0

Show file

File: monitor.py Project: radhikari54/Mastering-Mesos

  def __init__(self, root, task_id):
    """Create a TaskMonitor for one task.

    :param root: The checkpoint root of the task.
    :param task_id: The task id of the task.
    """
    task_path = TaskPath(root=root, task_id=task_id)
    self._dispatcher = CheckpointDispatcher()
    self._runnerstate = RunnerState(processes={})
    self._runner_ckpt = task_path.getpath('runner_checkpoint')

    # Resolve the 'active' path before the 'finished' one, then bind both together.
    active_file = task_path.given(state='active').getpath('task_path')
    finished_file = task_path.given(state='finished').getpath('task_path')
    self._active_file, self._finished_file = active_file, finished_file

    self._ckpt_head = 0
    self._apply_states()
    self._lock = threading.Lock()

Example #4

0

Show file

File: gc_executor.py Project: songaal/aurora

 def get_states(self, task):
   """Collect (timestamp in seconds, state) pairs for the task; [] if replay fails."""
   status_stream = CheckpointDispatcher.iter_statuses(self._runner_ckpt(task))
   history = []
   try:
     # Iterating the stream is what may raise, so the loop lives inside the try.
     for status in status_stream:
       history.append((status.timestamp_ms / 1000.0, status.state))
   except CheckpointDispatcher.ErrorRecoveringState:
     return []
   return history

Example #5

0

Show file

File: runner.py Project: StephanErb/aurora

 def get(cls, task_id, checkpoint_root):
     """Build a TaskRunner bound to the active task ``task_id`` in ``checkpoint_root``.

     Returns None when the task's JSON file is missing, the loaded config is None or
     has no tasks, the checkpoint or its header is missing, or any exception is raised
     while reconstituting the runner (the exception is logged).
     """
     active_path = TaskPath(root=checkpoint_root, task_id=task_id, state="active")
     task_json = active_path.getpath("task_path")
     task_checkpoint = active_path.getpath("runner_checkpoint")

     if not os.path.exists(task_json):
         return None

     task = ThermosConfigLoader.load_json(task_json)
     if task is None or len(task.tasks()) == 0:
         return None

     try:
         checkpoint = CheckpointDispatcher.from_file(task_checkpoint)
         if checkpoint is None or checkpoint.header is None:
             return None
         thermos_task = task.tasks()[0].task()
         header = checkpoint.header
         return cls(
             thermos_task,
             checkpoint_root,
             header.sandbox,
             log_dir=header.log_dir,
             task_id=task_id,
             portmap=header.ports,
             hostname=header.hostname,
         )
     except Exception as e:
         message = "Failed to reconstitute checkpoint in TaskRunner.get: %s" % e
         log.error(message, exc_info=True)
         return None

Example #6

0

Show file

File: thermos.py Project: sumanau7/incubator-aurora

def tail(args, options):
    """Tail the logs of a task process.

    Usage: thermos tail task_name [process_name]
  """
    # NOTE(review): the docstring above is left byte-identical because it reads like
    # user-facing command help; confirm before rewording it.
    if len(args) == 0:
        app.error("Expected a task to tail, got nothing!")
    if len(args) not in (1, 2):
        app.error("Expected at most two arguments (task and optional process), got %d" % len(args))

    task_id = args[0]
    detector = TaskDetector(root=options.root)
    checkpoint = CheckpointDispatcher.from_file(detector.get_checkpoint(task_id))
    # from_file may yield no state or a state without a header; fail with a clear
    # message instead of an AttributeError on the next line.
    if checkpoint is None or checkpoint.header is None:
        print("ERROR: Could not recover checkpoint for task %s." % task_id, file=sys.stderr)
        sys.exit(1)

    log_dir = checkpoint.header.log_dir
    process_runs = list(detector.get_process_runs(task_id, log_dir))
    if len(args) == 2:
        # Narrow to the single process requested on the command line.
        process_runs = [(process, run) for (process, run) in process_runs if process == args[1]]

    if len(process_runs) == 0:
        print("ERROR: No processes found.", file=sys.stderr)
        sys.exit(1)

    processes = {process for process, _ in process_runs}
    if len(processes) != 1:
        print("ERROR: More than one process matches query.", file=sys.stderr)
        sys.exit(1)

    process = processes.pop()
    # Tail the highest-numbered run of the selected process.
    run = max(run for _, run in process_runs)

    logdir = TaskPath(root=options.root, task_id=args[0], process=process, run=run, log_dir=log_dir).getpath(
        "process_logdir"
    )
    logfile = os.path.join(logdir, "stderr" if options.use_stderr else "stdout")

    monitor = TaskMonitor(TaskPath(root=options.root), args[0])

    def log_is_active():
        """Return True while the monitor reports the selected process run as active."""
        active_processes = monitor.get_active_processes()
        for process_status, process_run in active_processes:
            if process_status.process == process and process_run == run:
                return True
        return False

    if not log_is_active():
        print("Tail of terminal log %s" % logfile)
        for line in tail_closed(logfile):
            print(line.rstrip())
        return

    next_check = time.time() + 5.0
    print("Tail of active log %s" % logfile)
    for line in tail_f(logfile, include_last=True, forever=False):
        print(line.rstrip())
        # Re-check liveness at most every five seconds and stop once the run is done.
        if time.time() > next_check:
            if not log_is_active():
                break
            else:
                next_check = time.time() + 5.0

Example #7

0

Show file

File: gc_executor.py Project: josephglanville/incubator-aurora

 def get_states(self, task_id):
   """Return a list of (timestamp in seconds, state) tuples, or [] when replay fails."""
   checkpoint_statuses = CheckpointDispatcher.iter_statuses(self._runner_ckpt(task_id))
   try:
     # Build lazily, then materialise inside the try so replay errors are caught here.
     pairs = ((entry.timestamp_ms / 1000.0, entry.state) for entry in checkpoint_statuses)
     return list(pairs)
   except CheckpointDispatcher.ErrorRecoveringState:
     return []

Example #8

0

Show file

File: runner.py Project: theevocater/aurora

 def get(cls, task_id, checkpoint_root):
     """Return a TaskRunner bound to ``task_id`` in ``checkpoint_root``, or None.

     None is returned if the task JSON does not exist, the loaded config is None or
     empty, the checkpoint or its header cannot be recovered, or reconstitution raises
     (in which case the error is logged with a traceback).
     """
     path = TaskPath(root=checkpoint_root, task_id=task_id, state='active')
     task_json, task_checkpoint = path.getpath('task_path'), path.getpath('runner_checkpoint')

     if not os.path.exists(task_json):
         return None

     task = ThermosConfigLoader.load_json(task_json)
     if task is None:
         return None
     if len(task.tasks()) == 0:
         return None

     try:
         checkpoint = CheckpointDispatcher.from_file(task_checkpoint)
         header_missing = checkpoint is None or checkpoint.header is None
         if header_missing:
             return None
         first_task = task.tasks()[0].task()
         return cls(first_task,
                    checkpoint_root,
                    checkpoint.header.sandbox,
                    log_dir=checkpoint.header.log_dir,
                    task_id=task_id,
                    portmap=checkpoint.header.ports,
                    hostname=checkpoint.header.hostname)
     except Exception as e:
         log.error('Failed to reconstitute checkpoint in TaskRunner.get: %s' % e, exc_info=True)
         return None

Example #9

0

Show file

 def open_checkpoint(cls, filename, force=False, state=None):
     """Acquire a locked checkpoint stream and return a synced record writer for it.

     :param filename: Path of the checkpoint file; its directory is created if needed.
     :param force: When the lock is unavailable, try to kill the current holder and retry.
     :param state: State handed to ``kill_runner``; read from ``filename`` if not given.
     :raises cls.PermissionError: if the lock could not be obtained.
     """
     safe_mkdir(os.path.dirname(filename))
     handle = lock_file(filename, "a+")
     lock_unavailable = handle in (None, False)

     if lock_unavailable and not force:
         log.error('Found existing runner, cannot take control.')
     elif lock_unavailable:
         log.info('Found existing runner, forcing leadership forfeit.')
         state = state or CheckpointDispatcher.from_file(filename)
         if cls.kill_runner(state):
             log.info('Successfully killed leader.')
             # TODO(wickman)  Blocking may not be the best idea here.  Perhaps block up to
             # a maximum timeout.  But blocking is necessary because os.kill does not immediately
             # release the lock if we're in force mode.
             handle = lock_file(filename, "a+", blocking=True)

     if handle in (None, False):
         raise cls.PermissionError(
             'Could not open locked checkpoint: %s, lock_file = %s' % (filename, handle))

     writer = ThriftRecordWriter(handle)
     writer.set_sync(True)
     return writer

Example #10

0

Show file

File: gc_executor.py Project: josephglanville/incubator-aurora

 def get_sandbox(self, task_id):
   """Return the first sandbox found in the task's checkpoint updates, else None."""
   try:
     updates = CheckpointDispatcher.iter_updates(self._runner_ckpt(task_id))
     for update in updates:
       header = update.runner_header
       if header and header.sandbox:
         return header.sandbox
   except CheckpointDispatcher.ErrorRecoveringState:
     return None
   # No update carried a sandbox.
   return None

Example #11

0

Show file

File: gc_executor.py Project: songaal/aurora

 def get_sandbox(self, task):
   """Look up the task's sandbox in its checkpoint; None if absent or unrecoverable."""
   sandbox = None
   try:
     for update in CheckpointDispatcher.iter_updates(self._runner_ckpt(task)):
       if not update.runner_header:
         continue
       if update.runner_header.sandbox:
         # The first header that names a sandbox wins.
         sandbox = update.runner_header.sandbox
         break
   except CheckpointDispatcher.ErrorRecoveringState:
     return None
   return sandbox

Example #12

0

Show file

File: runner.py Project: zmyer/aurora

  def run(self):
    """Write the job script, run it in a subprocess, and load the resulting states.

    Sets ``self.state`` from the state file (an empty RunnerState if loading fails) and
    ``self.reconstructed_state`` from the runner checkpoint (None if replay fails), then
    marks the object initialized.

    :return: the subprocess return code.
    """
    self._run_count += 1
    atexit.register(self.cleanup)

    # Remove the script written by a previous call, if there was one.
    if self.script_filename:
      os.unlink(self.script_filename)

    with temporary_file(cleanup=False) as script:
      self.script_filename = script.name
      substitutions = {
        'filename': self.job_filename,
        'sandbox': self.sandbox,
        'root': self.tempdir,
        'task_id': self.task_id,
        'state_filename': self.state_filename,
        'success_rate': self.success_rate,
        'random_seed': self.random_seed + self._run_count,
        'extra_task_runner_args': self.extra_task_runner_args,
      }
      script.write(self.RUN_JOB_SCRIPT % substitutions)

    with environment_as(PYTHONPATH=os.pathsep.join(sys.path)):
      command = [sys.executable, self.script_filename]
      self.po = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
      try:
        out, err = self.po.communicate()
      except OSError as e:
        # Only ECHILD is tolerated; anything else propagates.
        if e.errno != errno.ECHILD:
          raise
        out = err = 'Killed'

    exit_code = self.po.returncode
    if exit_code != 0:
      config = 'Nonexistent!'
      if os.path.exists(self.job_filename):
        with open(self.job_filename) as job_file:
          config = job_file.read()
      if 'THERMOS_DEBUG' in os.environ:
        print("Runner failed!\n\n\nconfig:%s\n\n\nstdout:%s\n\n\nstderr:%s\n\n\n" % (
            config, out, err))

    try:
      with open(self.state_filename, 'r') as state_file:
        serialized = state_file.read()
        self.state = thrift_deserialize(RunnerState(), serialized)
    except Exception as e:
      if 'THERMOS_DEBUG' in os.environ:
        print('Failed to load Runner state: %s' % e, file=sys.stderr)
      self.state = RunnerState()

    try:
      checkpoint_path = self.pathspec.getpath('runner_checkpoint')
      self.reconstructed_state = CheckpointDispatcher.from_file(checkpoint_path)
    except Exception as e:
      print('Failed to replay checkpoint: %s' % e, file=sys.stderr)
      self.reconstructed_state = None

    self.initialized = True
    return exit_code

Example #13

0

Show file

File: thermos.py Project: zhanglong2015/incubator-aurora

def tail(args, options):
  """Tail the logs of a task process.

    Usage: thermos tail task_name [process_name]
  """
  # NOTE(review): the docstring above is left byte-identical because it reads like
  # user-facing command help; confirm before rewording it.
  if len(args) == 0:
    app.error('Expected a task to tail, got nothing!')
  if len(args) not in (1, 2):
    app.error('Expected at most two arguments (task and optional process), got %d' % len(args))

  task_id = args[0]
  detector = TaskDetector(root=options.root)
  checkpoint = CheckpointDispatcher.from_file(detector.get_checkpoint(task_id))
  # from_file may yield no state or a state without a header; report that explicitly
  # rather than failing with an AttributeError on the next line.
  if checkpoint is None or checkpoint.header is None:
    print('ERROR: Could not recover checkpoint for task %s.' % task_id, file=sys.stderr)
    sys.exit(1)

  log_dir = checkpoint.header.log_dir
  process_runs = list(detector.get_process_runs(task_id, log_dir))
  if len(args) == 2:
    # Narrow to the single process requested on the command line.
    process_runs = [(process, run) for (process, run) in process_runs if process == args[1]]

  if len(process_runs) == 0:
    print('ERROR: No processes found.', file=sys.stderr)
    sys.exit(1)

  processes = {process for process, _ in process_runs}
  if len(processes) != 1:
    print('ERROR: More than one process matches query.', file=sys.stderr)
    sys.exit(1)

  process = processes.pop()
  # Tail the highest-numbered run of the selected process.
  run = max(run for _, run in process_runs)

  logdir = TaskPath(root=options.root, task_id=args[0], process=process,
     run=run, log_dir=log_dir).getpath('process_logdir')
  logfile = os.path.join(logdir, 'stderr' if options.use_stderr else 'stdout')

  monitor = TaskMonitor(TaskPath(root=options.root), args[0])

  def log_is_active():
    """Return True while the monitor reports the selected process run as active."""
    active_processes = monitor.get_active_processes()
    for process_status, process_run in active_processes:
      if process_status.process == process and process_run == run:
        return True
    return False

  if not log_is_active():
    print('Tail of terminal log %s' % logfile)
    for line in tail_closed(logfile):
      print(line.rstrip())
    return

  next_check = time.time() + 5.0
  print('Tail of active log %s' % logfile)
  for line in tail_f(logfile, include_last=True, forever=False):
    print(line.rstrip())
    # Re-check liveness at most every five seconds and stop once the run is done.
    if time.time() > next_check:
      if not log_is_active():
        break
      else:
        next_check = time.time() + 5.0

Example #14

0

Show file

File: garbage.py Project: songaal/aurora

    def __init__(self, checkpoint_root, task_id):
        """Bind to a task and load its state from the task's checkpoint file.

        :param checkpoint_root: The checkpoint root to find the given task.
        :param task_id: The task_id of the task whose state we wish to manage.
        """
        detector = TaskDetector(checkpoint_root)
        self._detector = detector
        self._task_id = task_id
        self._pathspec = TaskPath(root=checkpoint_root, task_id=task_id)

        # The detector supplies the checkpoint location that the state is replayed from.
        checkpoint_file = detector.get_checkpoint(task_id)
        self._state = CheckpointDispatcher.from_file(checkpoint_file)

Example #15

0

Show file

File: monitor.py Project: betepahos/incubator-aurora

 def __init__(self, pathspec, task_id):
   """Initialise replay state and file paths for ``task_id`` and call ``_apply_states``.

   :param pathspec: Path template; bound to the task through ``given(...)``.
   :param task_id: Id of the task this object tracks.
   """
   self._task_id = task_id
   self._dispatcher = CheckpointDispatcher()
   self._runnerstate = RunnerState(processes={})
   self._runner_ckpt = pathspec.given(task_id=task_id).getpath('runner_checkpoint')

   # Resolve the 'active' location first, then 'finished', before binding either one.
   active_file = pathspec.given(task_id=task_id, state='active').getpath('task_path')
   finished_file = pathspec.given(task_id=task_id, state='finished').getpath('task_path')
   self._active_file = active_file
   self._finished_file = finished_file

   self._ckpt_head = 0
   self._apply_states()
   self._lock = threading.Lock()

Example #16

0

Show file

File: helper.py Project: betepahos/incubator-aurora

  def kill(cls, task_id, checkpoint_root, force=False,
           terminal_status=TaskState.KILLED, clock=time):
    """
      An implementation of Task killing that doesn't require a fully hydrated TaskRunner object.
      Terminal status must be either KILLED or LOST state.

      :param task_id: Id of the task to kill.
      :param checkpoint_root: Checkpoint root under which the task's checkpoint lives.
      :param force: Passed to open_checkpoint; also forces finalization of a task whose
                    recovered state is uninitialized.
      :param terminal_status: Final task state to record (TaskState.KILLED or TaskState.LOST).
      :param clock: Object providing time(); used for the timestamps that get written.
      :raises cls.Error: if terminal_status is neither KILLED nor LOST.
    """
    if terminal_status not in (TaskState.KILLED, TaskState.LOST):
      # Parenthesised so the raw value is the fallback when the state has no name; without
      # the parentheses '%' binds tighter than 'or' and the fallback never applies.
      raise cls.Error('terminal_status must be KILLED or LOST (got %s)' %
                      (TaskState._VALUES_TO_NAMES.get(terminal_status) or terminal_status))
    pathspec = TaskPath(root=checkpoint_root, task_id=task_id)
    checkpoint = pathspec.getpath('runner_checkpoint')
    state = CheckpointDispatcher.from_file(checkpoint)

    # An empty status list is treated like a missing one: there is no last state to inspect.
    if state is None or state.header is None or not state.statuses:
      if force:
        log.error('Task has uninitialized TaskState - forcibly finalizing')
        cls.finalize_task(pathspec)
        return
      else:
        log.error('Cannot update states in uninitialized TaskState!')
        return

    ckpt = cls.open_checkpoint(checkpoint, force=force, state=state)

    def write_task_state(task_state):
      update = TaskStatus(state=task_state, timestamp_ms=int(clock.time() * 1000),
                          runner_pid=os.getpid(), runner_uid=os.getuid())
      ckpt.write(RunnerCkpt(task_status=update))

    def write_process_status(status):
      ckpt.write(RunnerCkpt(process_status=status))

    # The writer is closed on every path, including the already-terminal early return.
    with closing(ckpt):
      if cls.is_task_terminal(state.statuses[-1].state):
        log.info('Task is already in terminal state!  Finalizing.')
        cls.finalize_task(pathspec)
        return

      write_task_state(TaskState.ACTIVE)
      for process, history in state.processes.items():
        process_status = history[-1]
        if not cls.is_process_terminal(process_status.state):
          if cls.kill_process(state, process):
            write_process_status(ProcessStatus(process=process,
              state=ProcessState.KILLED, seq=process_status.seq + 1, return_code=-9,
              stop_time=clock.time()))
          else:
            # Compare enum values by equality, not identity.
            if process_status.state != ProcessState.WAITING:
              write_process_status(ProcessStatus(process=process,
                state=ProcessState.LOST, seq=process_status.seq + 1))
      write_task_state(terminal_status)
    cls.finalize_task(pathspec)

Example #17

0

Show file

  def kill(cls, task_id, checkpoint_root, force=False,
           terminal_status=TaskState.KILLED, clock=time):
    """
      An implementation of Task killing that doesn't require a fully hydrated TaskRunner object.
      Terminal status must be either KILLED or LOST state.

      :param task_id: Id of the task to kill.
      :param checkpoint_root: Checkpoint root under which the task's checkpoint lives.
      :param force: Passed to open_checkpoint; also forces finalization of a task whose
                    recovered state is uninitialized.
      :param terminal_status: Final task state to record (TaskState.KILLED or TaskState.LOST).
      :param clock: Object providing time(); used for the timestamps that get written.
      :raises cls.Error: if terminal_status is neither KILLED nor LOST.
    """
    if terminal_status not in (TaskState.KILLED, TaskState.LOST):
      # '%' binds tighter than 'or', so the fallback must be grouped explicitly to be
      # used when the state value has no registered name.
      state_label = TaskState._VALUES_TO_NAMES.get(terminal_status) or terminal_status
      raise cls.Error('terminal_status must be KILLED or LOST (got %s)' % state_label)
    pathspec = TaskPath(root=checkpoint_root, task_id=task_id)
    checkpoint = pathspec.getpath('runner_checkpoint')
    state = CheckpointDispatcher.from_file(checkpoint)

    # An empty status list is treated like a missing one: there is no last state to inspect.
    if state is None or state.header is None or not state.statuses:
      if force:
        log.error('Task has uninitialized TaskState - forcibly finalizing')
        cls.finalize_task(pathspec)
        return
      else:
        log.error('Cannot update states in uninitialized TaskState!')
        return

    ckpt = cls.open_checkpoint(checkpoint, force=force, state=state)

    def write_task_state(task_state):
      update = TaskStatus(state=task_state, timestamp_ms=int(clock.time() * 1000),
                          runner_pid=os.getpid(), runner_uid=os.getuid())
      ckpt.write(RunnerCkpt(task_status=update))

    def write_process_status(status):
      ckpt.write(RunnerCkpt(process_status=status))

    # Close the writer on every exit path so an early return does not leave it open.
    with closing(ckpt):
      if cls.is_task_terminal(state.statuses[-1].state):
        log.info('Task is already in terminal state!  Finalizing.')
        cls.finalize_task(pathspec)
        return

      write_task_state(TaskState.ACTIVE)
      for process, history in state.processes.items():
        process_status = history[-1]
        if not cls.is_process_terminal(process_status.state):
          if cls.kill_process(state, process):
            write_process_status(ProcessStatus(process=process,
              state=ProcessState.KILLED, seq=process_status.seq + 1, return_code=-9,
              stop_time=clock.time()))
          else:
            # Compare enum values by equality, not identity.
            if process_status.state != ProcessState.WAITING:
              write_process_status(ProcessStatus(process=process,
                state=ProcessState.LOST, seq=process_status.seq + 1))
      write_task_state(terminal_status)
    cls.finalize_task(pathspec)

Example #18

0

Show file

File: thermos_ckpt.py Project: zhanglong2015/incubator-aurora

def main(args):
  """Print every record of the checkpoint given by --checkpoint, optionally assembled.

  Each record is printed as it is read.  When the ``assemble`` option is set, records
  are also dispatched into a RunnerState whose header, task states and processes are
  printed once the stream has been read.

  :param args: Positional command-line arguments; none are accepted.
  """
  values = app.get_options()

  if len(args) > 0:
    print("ERROR: unrecognized arguments: %s\n" % (" ".join(args)), file=sys.stderr)
    app.help()
    sys.exit(1)

  if not values.ckpt:
    print("ERROR: must supply --checkpoint", file=sys.stderr)
    app.help()
    sys.exit(1)

  wrs = RunnerState(processes={})
  dispatcher = CheckpointDispatcher()
  # open() replaces the Python-2-only file() builtin; the with-block closes the handle
  # on every path.  Binary mode because the stream holds serialized records.
  with open(values.ckpt, "rb") as fp:
    try:
      for wts in ThriftRecordReader(fp, RunnerCkpt):
        print('Recovering: %s' % wts)
        # Same truthiness test as the reporting section below.
        if values.assemble:
          dispatcher.dispatch(wrs, wts)
    except RecordIO.Error as err:
      print('Error recovering checkpoint stream: %s' % err, file=sys.stderr)
      return

  print('\n\n\n')
  if values.assemble:
    print('Recovered Task Header')
    pprint.pprint(wrs.header, indent=4)

    print('\nRecovered Task States')
    # Guard against an unset status list so a stream without status records is reportable.
    for task_status in wrs.statuses or []:
      print('  %s [pid: %d] => %s' % (
          time.asctime(time.localtime(task_status.timestamp_ms / 1000.0)),
          task_status.runner_pid,
          TaskState._VALUES_TO_NAMES[task_status.state]))

    print('\nRecovered Processes')
    pprint.pprint(wrs.processes, indent=4)

Example #19

0

Show file

File: monitor.py Project: rowoot/aurora

    def __init__(self, root, task_id):
        """Set up a TaskMonitor.

        :param root: The checkpoint root of the task.
        :param task_id: The task id of the task.
        """
        pathspec = TaskPath(root=root, task_id=task_id)
        self._dispatcher = CheckpointDispatcher()
        self._runnerstate = RunnerState(processes={})
        self._runner_ckpt = pathspec.getpath("runner_checkpoint")

        # Resolve the task file for each state in order, then bind the pair at once.
        task_files = []
        for state in ("active", "finished"):
            task_files.append(pathspec.given(state=state).getpath("task_path"))
        self._active_file, self._finished_file = task_files

        self._ckpt_head = 0
        self._apply_states()
        self._lock = threading.Lock()

Example #20

0

Show file

 def format_task(detector, task_id):
     """Print a summary of one task; the amount of detail grows with ``options.verbose``.

     Level 0 prints the task id and checkpoint owner.  Level 1 adds the latest task
     state and launch time, level 2 the user, ports and sandbox, and level 3 the last
     run of each process.
     """
     ckpt_path = detector.get_checkpoint(task_id)
     ckpt_stat = os.stat(ckpt_path)
     try:
         owner = pwd.getpwuid(ckpt_stat.st_uid).pw_name
     except KeyError:
         # No passwd entry for this uid; show the numeric id instead.
         owner = 'uid:%s' % ckpt_stat.st_uid
     print('  %-20s [owner: %8s]' % (task_id, owner), end='')

     verbosity = options.verbose
     if verbosity == 0:
         print()

     if verbosity > 0:
         state = CheckpointDispatcher.from_file(ckpt_path)
         if state is None or state.header is None:
             print(' - checkpoint stream CORRUPT or outdated format')
             return
         latest_state = state.statuses[-1].state
         state_name = TaskState._VALUES_TO_NAMES.get(latest_state, 'Unknown')
         print('  state: %8s' % state_name, end='')
         launched_at = time.localtime(state.header.launch_time_ms / 1000.0)
         print(' start: %25s' % time.asctime(launched_at))

     if verbosity > 1:
         print('    user: %s' % state.header.user, end='')
         if not state.header.ports:
             print(' ports: None')
         else:
             port_pairs = ('%s -> %s' % (key, val)
                           for key, val in state.header.ports.items())
             print(' ports: %s' % ' '.join(port_pairs))
         print('    sandbox: %s' % state.header.sandbox)

     if verbosity > 2:
         print('    process table:')
         for process, process_history in state.processes.items():
             print('      - %s runs: %s' % (process, len(process_history)),
                   end='')
             last_run = process_history[-1]
             pid_text = last_run.pid or 'None'
             rc_text = last_run.return_code if last_run.return_code is not None else ''
             if last_run.stop_time:
                 finish_text = time.asctime(time.localtime(last_run.stop_time))
             else:
                 finish_text = 'None'
             run_state = ProcessState._VALUES_TO_NAMES.get(last_run.state, 'Unknown')
             print(' last: pid=%s, rc=%s, finish:%s, state:%s' %
                   (pid_text, rc_text, finish_text, run_state))
         print()

Example #21

0

Show file

File: thermos.py Project: sumanau7/incubator-aurora

 def format_task(task_id):
     """Print a summary of one task; the amount of detail grows with ``options.verbose``.

     Level 0 prints the task id and checkpoint owner.  Level 1 adds the latest task
     state and launch time, level 2 the user, ports and sandbox, and level 3 the last
     run of each process.
     """
     checkpoint_filename = detector.get_checkpoint(task_id)
     checkpoint_stat = os.stat(checkpoint_filename)
     try:
         checkpoint_owner = pwd.getpwuid(checkpoint_stat.st_uid).pw_name
     except KeyError:
         # getpwuid raises KeyError for an unknown uid; catching only that keeps
         # unrelated failures (and KeyboardInterrupt) from being swallowed.
         checkpoint_owner = "uid:%s" % checkpoint_stat.st_uid
     print("  %-20s [owner: %8s]" % (task_id, checkpoint_owner), end="")
     if options.verbose == 0:
         print()
     if options.verbose > 0:
         state = CheckpointDispatcher.from_file(checkpoint_filename)
         if state is None or state.header is None:
             print(" - checkpoint stream CORRUPT or outdated format")
             return
         print("  state: %8s" % TaskState._VALUES_TO_NAMES.get(state.statuses[-1].state, "Unknown"), end="")
         print(" start: %25s" % time.asctime(time.localtime(state.header.launch_time_ms / 1000.0)))
     if options.verbose > 1:
         print("    user: %s" % state.header.user, end="")
         if state.header.ports:
             print(" ports: %s" % " ".join("%s -> %s" % (key, val) for key, val in state.header.ports.items()))
         else:
             print(" ports: None")
         print("    sandbox: %s" % state.header.sandbox)
     if options.verbose > 2:
         print("    process table:")
         for process, process_history in state.processes.items():
             print("      - %s runs: %s" % (process, len(process_history)), end="")
             # Only the most recent run of each process is shown.
             last_run = process_history[-1]
             print(
                 " last: pid=%s, rc=%s, finish:%s, state:%s"
                 % (
                     last_run.pid or "None",
                     last_run.return_code if last_run.return_code is not None else "",
                     time.asctime(time.localtime(last_run.stop_time)) if last_run.stop_time else "None",
                     ProcessState._VALUES_TO_NAMES.get(last_run.state, "Unknown"),
                 )
             )
         print()

Example #22

0

Show file

File: thermos.py Project: dominichamon/incubator-aurora

 def format_task(task_id):
   """Print one task's summary line(s); higher ``options.verbose`` prints more detail.

   Level 0 prints only the task id and checkpoint owner.  Level 1 adds the latest task
   state and launch time, level 2 the user, ports and sandbox, and level 3 the last
   run of every process.
   """
   ckpt_file = detector.get_checkpoint(task_id)
   owner_uid = os.stat(ckpt_file).st_uid
   try:
     owner_name = pwd.getpwuid(owner_uid).pw_name
   except KeyError:
     # The uid has no passwd entry; fall back to the numeric id.
     owner_name = 'uid:%s' % owner_uid
   print('  %-20s [owner: %8s]' % (task_id, owner_name), end='')

   if options.verbose == 0:
     print()

   if options.verbose > 0:
     state = CheckpointDispatcher.from_file(ckpt_file)
     if state is None or state.header is None:
       print(' - checkpoint stream CORRUPT or outdated format')
       return
     current = TaskState._VALUES_TO_NAMES.get(state.statuses[-1].state, 'Unknown')
     print('  state: %8s' % current, end='')
     started = time.asctime(time.localtime(state.header.launch_time_ms / 1000.0))
     print(' start: %25s' % started)

   if options.verbose > 1:
     print('    user: %s' % state.header.user, end='')
     ports = state.header.ports
     if ports:
       mappings = ['%s -> %s' % (key, val) for key, val in ports.items()]
       print(' ports: %s' % ' '.join(mappings))
     else:
       print(' ports: None')
     print('    sandbox: %s' % state.header.sandbox)

   if options.verbose > 2:
     print('    process table:')
     for name, history in state.processes.items():
       print('      - %s runs: %s' % (name, len(history)), end='')
       latest = history[-1]
       details = (
         latest.pid or 'None',
         latest.return_code if latest.return_code is not None else '',
         time.asctime(time.localtime(latest.stop_time)) if latest.stop_time else 'None',
         ProcessState._VALUES_TO_NAMES.get(latest.state, 'Unknown'),
       )
       print(' last: pid=%s, rc=%s, finish:%s, state:%s' % details)
     print()

Example #23

0

Show file

File: helper.py Project: betepahos/incubator-aurora

 def open_checkpoint(cls, filename, force=False, state=None):
   """Return a synced ThriftRecordWriter over a locked checkpoint file.

   :param filename: Path of the checkpoint file; its directory is created if needed.
   :param force: When the lock is unavailable, try to kill the current holder and retry.
   :param state: State handed to ``kill_runner``; read from ``filename`` if not given.
   :raises cls.PermissionError: if the lock could not be obtained.
   """
   def lock_failed(candidate):
     # Same membership test the lock result is checked against before and after retry.
     return candidate in (None, False)

   safe_mkdir(os.path.dirname(filename))
   fp = lock_file(filename, "a+")

   if lock_failed(fp):
     if not force:
       log.error('Found existing runner, cannot take control.')
     else:
       log.info('Found existing runner, forcing leadership forfeit.')
       state = state or CheckpointDispatcher.from_file(filename)
       if cls.kill_runner(state):
         log.info('Successfully killed leader.')
         # TODO(wickman)  Blocking may not be the best idea here.  Perhaps block up to
         # a maximum timeout.  But blocking is necessary because os.kill does not immediately
         # release the lock if we're in force mode.
         fp = lock_file(filename, "a+", blocking=True)

   if lock_failed(fp):
     failure = 'Could not open locked checkpoint: %s, lock_file = %s' % (filename, fp)
     raise cls.PermissionError(failure)

   ckpt = ThriftRecordWriter(fp)
   ckpt.set_sync(True)
   return ckpt

Example #24

0

Show file

File: runner.py Project: apache/aurora

class TaskRunner(object):
  """
    Run a ThermosTask.

    This class encapsulates the core logic to run and control the state of a Thermos task.
    Typically, it will be instantiated directly to control a new task, but a TaskRunner can also be
    synthesised from an existing task's checkpoint root (see TaskRunner.get).
  """
  # Exceptions specific to TaskRunner all derive from TaskRunner.Error.
  class Error(Exception): pass
  class InternalError(Error): pass
  class InvalidTask(Error): pass
  class PermissionError(Error): pass
  class StateError(Error): pass

  # Maximum amount of time we spend waiting for new updates from the checkpoint streams
  # before doing housecleaning (checking for LOST tasks, dead PIDs.)
  MAX_ITERATION_TIME = Amount(10, Time.SECONDS)

  # Minimum amount of time we wait between polls for updates on coordinator checkpoints.
  COORDINATOR_INTERVAL_SLEEP = Amount(1, Time.SECONDS)

  # Amount of time we're willing to wait after forking before we expect the runner to have
  # exec'ed the child process.
  LOST_TIMEOUT = Amount(60, Time.SECONDS)

  # Active task stages: maps a task state to the stage class that drives it.  Each class is
  # instantiated once per runner in __init__ and looked up by the current task state in _run.
  STAGES = {
    TaskState.ACTIVE: TaskRunnerStage_ACTIVE,
    TaskState.CLEANING: TaskRunnerStage_CLEANING,
    TaskState.FINALIZING: TaskRunnerStage_FINALIZING
  }

  @classmethod
  def get(cls, task_id, checkpoint_root):
    """
      Reconstitute the TaskRunner bound to task_id from an active task in checkpoint_root.

      Returns None when the task's config file is missing, cannot be loaded or contains no
      tasks, when the checkpoint has no header, or when rebuilding the runner fails for any
      reason (the failure is logged).
    """
    active_path = TaskPath(root=checkpoint_root, task_id=task_id, state='active')
    config_file = active_path.getpath('task_path')
    checkpoint_file = active_path.getpath('runner_checkpoint')
    if not os.path.exists(config_file):
      return None
    loaded = ThermosConfigLoader.load_json(config_file)
    if loaded is None or len(loaded.tasks()) == 0:
      return None
    try:
      recovered = CheckpointDispatcher.from_file(checkpoint_file)
      if recovered is None or recovered.header is None:
        return None
      header = recovered.header
      return cls(
          loaded.tasks()[0].task(),
          checkpoint_root,
          header.sandbox,
          log_dir=header.log_dir,
          task_id=task_id,
          portmap=header.ports,
          hostname=header.hostname)
    except Exception as e:
      log.error('Failed to reconstitute checkpoint in TaskRunner.get: %s', e, exc_info=True)
      return None

  def __init__(self, task, checkpoint_root, sandbox, log_dir=None,
               task_id=None, portmap=None, user=None, chroot=False, clock=time,
               universal_handler=None, planner_class=TaskPlanner, hostname=None,
               process_logger_destination=None, process_logger_mode=None,
               rotate_log_size_mb=None, rotate_log_backups=None,
               preserve_env=False, mesos_containerizer_path=None, container_sandbox=None):
    """
      Construct a TaskRunner and replay any existing runner checkpoint for the task.

      required:
        task (config.Task) = the task to run
        checkpoint_root (path) = the checkpoint root
        sandbox (path) = the sandbox in which the path will be run
                         [if None, cwd will be assumed, but garbage collection will be
                          disabled for this task.]

      optional:
        log_dir (string)  = directory to house stdout/stderr logs. If not specified, logs will be
                            written into the sandbox directory under .logs/
        task_id (string)  = bind to this task id.  if not specified, will synthesize an id based
                            upon task.name()
        portmap (dict)    = a map (string => integer) from name to port, e.g. { 'http': 80 }
        user (string)     = the user to run the task as.  if not current user, requires setuid
                            privileges.
        chroot (boolean)  = whether or not to chroot into the sandbox prior to exec.
        clock (time interface) = the clock to use throughout
        universal_handler = checkpoint record handler (only used for testing)
        planner_class (TaskPlanner class) = TaskPlanner class to use for constructing the task
                            planning policy.
        process_logger_destination (string) = The destination of logger to use for all processes.
        process_logger_mode (string) = The mode of logger to use for all processes.
        rotate_log_size_mb (integer) = The maximum size of the rotated stdout/stderr logs in MiB.
        rotate_log_backups (integer) = The maximum number of rotated stdout/stderr log backups.
        preserve_env (boolean) = whether or not env variables for the runner should be in the
                                 env for the task being run
        mesos_containerizer_path = the path to the mesos-containerizer executable that will be used
                                   to isolate the task's filesystem (if using a filesystem image).
        container_sandbox = the path within the isolated filesystem where the task's sandbox is
                            mounted.

      raises:
        TypeError   = planner_class is not a subclass of TaskPlanner
        ValueError  = a user other than the current one is requested while the effective
                      uid is not 0
        InvalidTask = the task fails validation or cannot be fully interpolated
    """
    if not issubclass(planner_class, TaskPlanner):
      raise TypeError('planner_class must be a TaskPlanner.')
    self._clock = clock
    launch_time = self._clock.time()
    # NOTE: despite its name, this is the sub-second part of launch_time expressed in
    # microseconds (six zero-padded digits); it only serves to make the synthesized id unique.
    launch_time_ms = '%06d' % int((launch_time - int(launch_time)) * (10 ** 6))
    if not task_id:
      self._task_id = '%s-%s.%s' % (task.name(),
                                    time.strftime('%Y%m%d-%H%M%S', time.localtime(launch_time)),
                                    launch_time_ms)
    else:
      self._task_id = task_id
    current_user = TaskRunnerHelper.get_actual_user()
    self._user = user or current_user
    # TODO(wickman) This should be delegated to the ProcessPlatform / Helper
    if self._user != current_user:
      if os.geteuid() != 0:
        raise ValueError('task specifies user as %s, but %s does not have setuid permission!' % (
          self._user, current_user))
    self._portmap = portmap or {}
    self._launch_time = launch_time
    # NOTE(review): the docstring allows sandbox=None, but this join then requires an explicit
    # log_dir -- confirm that callers always pass one in that case.
    self._log_dir = log_dir or os.path.join(sandbox, '.logs')
    self._process_logger_destination = process_logger_destination
    self._process_logger_mode = process_logger_mode
    self._rotate_log_size_mb = rotate_log_size_mb
    self._rotate_log_backups = rotate_log_backups
    self._pathspec = TaskPath(root=checkpoint_root, task_id=self._task_id, log_dir=self._log_dir)
    self._hostname = hostname or socket.gethostname()
    try:
      ThermosTaskValidator.assert_valid_task(task)
      ThermosTaskValidator.assert_valid_ports(task, self._portmap)
    except ThermosTaskValidator.InvalidTaskError as e:
      raise self.InvalidTask('Invalid task: %s' % e)
    context = ThermosContext(
        task_id=self._task_id,
        ports=self._portmap,
        user=self._user)
    # Bind the thermos context into the task; any references left over are reported as errors.
    self._task, uninterp = (task % Environment(thermos=context)).interpolate()
    if len(uninterp) > 0:
      raise self.InvalidTask('Failed to interpolate task, missing: %s' %
          ', '.join(str(ref) for ref in uninterp))
    try:
      ThermosTaskValidator.assert_same_task(self._pathspec, self._task)
    except ThermosTaskValidator.InvalidTaskError as e:
      raise self.InvalidTask('Invalid task: %s' % e)
    self._plan = None  # plan currently being executed (updated by Handlers)
    # Processes whose `final` flag is False form the regular plan; those with it set to True
    # form the finalizing plan.
    self._regular_plan = planner_class(self._task, clock=clock,
        process_filter=lambda proc: proc.final().get() is False)
    self._finalizing_plan = planner_class(self._task, clock=clock,
        process_filter=lambda proc: proc.final().get() is True)
    self._chroot = chroot
    self._sandbox = sandbox
    self._container_sandbox = container_sandbox
    self._terminal_state = None
    self._ckpt = None  # opened by control()
    self._process_map = dict((p.name().get(), p) for p in self._task.processes())
    self._task_processes = {}
    self._stages = dict((state, stage(self)) for state, stage in self.STAGES.items())
    self._finalization_start = None
    self._preemption_deadline = None
    self._watcher = ProcessMuxer(self._pathspec)
    self._state = RunnerState(processes={})
    self._preserve_env = preserve_env
    self._mesos_containerizer_path = mesos_containerizer_path

    # create runner state
    universal_handler = universal_handler or TaskRunnerUniversalHandler
    self._dispatcher = CheckpointDispatcher()
    self._dispatcher.register_handler(universal_handler(self))
    self._dispatcher.register_handler(TaskRunnerProcessHandler(self))
    self._dispatcher.register_handler(TaskRunnerTaskHandler(self))

    # recover checkpointed runner state and update plan
    # While _recovery is True, _ckpt_write() does not write to the checkpoint stream.
    self._recovery = True
    self._replay_runner_ckpt()

  @property
  def task(self):
    """The interpolated task this runner executes."""
    return self._task

  @property
  def task_id(self):
    """The id this runner is bound to (given at construction or synthesized from the task name)."""
    return self._task_id

  @property
  def state(self):
    """The RunnerState built up by dispatching checkpoint records."""
    return self._state

  @property
  def processes(self):
    """The dict of task processes held by this runner, keyed by process name."""
    return self._task_processes

  def task_state(self):
    """Return the state of the most recent task status, or TaskState.ACTIVE if none exists."""
    if not self._state.statuses:
      return TaskState.ACTIVE
    latest_status = self._state.statuses[-1]
    return latest_status.state

  def close_ckpt(self):
    """Force close the checkpoint stream.  This is necessary for runners terminated through
       exception propagation.

       The stream only exists once control() has opened it; calling this before then is a
       no-op instead of failing on the missing stream."""
    if self._ckpt is None:
      log.debug('No checkpoint stream is open, nothing to close.')
      return
    log.debug('Closing the checkpoint stream.')
    self._ckpt.close()

  @contextmanager
  def control(self, force=False):
    """
      Bind to the checkpoint associated with this task, position to the end of the log if
      it exists, or create it if it doesn't.  Fails if we cannot get "leadership" i.e. a
      file lock on the checkpoint stream.

      force is passed through to TaskRunnerHelper.open_checkpoint.

      Raises StateError if the task is already terminal and PermissionError if the checkpoint
      cannot be opened.  Exceptions (subclasses of Exception) raised inside the managed block
      are logged and not re-raised; the checkpoint stream is closed afterwards.
    """
    if self.is_terminal():
      raise self.StateError('Cannot take control of a task in terminal state.')
    if self._sandbox:
      safe_mkdir(self._sandbox)
    ckpt_file = self._pathspec.getpath('runner_checkpoint')
    try:
      self._ckpt = TaskRunnerHelper.open_checkpoint(ckpt_file, force=force, state=self._state)
    except TaskRunnerHelper.PermissionError:
      raise self.PermissionError('Unable to open checkpoint %s' % ckpt_file)
    log.debug('Flipping recovery mode off.')
    # From here on _ckpt_write() actually writes records to the checkpoint stream.
    self._recovery = False
    self._set_task_status(self.task_state())
    self._resume_task()
    try:
      yield
    except Exception as e:
      log.error('Caught exception in self.control(): %s', e)
      log.error('  %s', traceback.format_exc())
    # NOTE(review): not in a finally block, so this is skipped for exceptions that are not
    # subclasses of Exception -- presumably why close_ckpt() exists; confirm.
    self._ckpt.close()

  def _resume_task(self):
    """
      Bring the runner state up to date after taking control of the checkpoint.

      Applies the process checkpoint updates that do not mutate the runner checkpoint,
      makes sure the checkpoint header exists, then replays the remaining updates.
      Raises StateError if the task turns out to be terminal.
    """
    assert self._ckpt is not None
    unapplied_updates = self._replay_process_ckpts()
    if self.is_terminal():
      raise self.StateError('Cannot resume terminal task.')
    # The header is initialized before the pending updates are dispatched.
    self._initialize_ckpt_header()
    self._replay(unapplied_updates)

  def _ckpt_write(self, record):
    """
      Append `record` to the checkpoint stream; does nothing while in recovery mode.
    """
    if self._recovery:
      return
    self._ckpt.write(record)

  def _replay(self, checkpoints):
    """
      Dispatch each RunnerCkpt in `checkpoints`, in order, against the runner state.
    """
    dispatch = self._dispatcher.dispatch
    for record in checkpoints:
      dispatch(self._state, record)

  def _replay_runner_ckpt(self):
    """
      Replay this task's runner checkpoint file, if it exists, dispatching every record in
      recovery mode.
    """
    ckpt_file = self._pathspec.getpath('runner_checkpoint')
    if not os.path.exists(ckpt_file):
      return
    with open(ckpt_file, 'r') as fp:
      for record in ThriftRecordReader(fp, RunnerCkpt):
        log.debug('Replaying runner checkpoint record: %s', record)
        self._dispatcher.dispatch(self._state, record, recovery=True)

  def _replay_process_ckpts(self):
    """
      Apply, in recovery mode, the pending process checkpoint updates that would not mutate
      the runner checkpoint stream.  Returns the list of updates that would mutate it,
      leaving them unapplied.
    """
    pending = []
    for update in self._watcher.select():
      if self._dispatcher.would_update(self._state, update):
        pending.append(update)
        continue
      self._dispatcher.dispatch(self._state, update, recovery=True)
    return pending

  def _initialize_ckpt_header(self):
    """
      Dispatch a RunnerHeader for this checkpoint stream, unless the runner state already
      has a header.
    """
    if self._state.header is not None:
      return

    try:
      uid = pwd.getpwnam(self._user).pw_uid
    except KeyError:
      # This will cause failures downstream, but they will at least be correctly
      # reflected in the process state.
      log.error('Unknown user %s.', self._user)
      uid = None

    header_record = RunnerCkpt(runner_header=RunnerHeader(
        task_id=self._task_id,
        launch_time_ms=int(self._launch_time * 1000),
        sandbox=self._sandbox,
        log_dir=self._log_dir,
        hostname=self._hostname,
        user=self._user,
        uid=uid,
        ports=self._portmap))
    self._dispatcher.dispatch(self._state, header_record)

  def _set_task_status(self, state):
    """Dispatch a TaskStatus record for `state`, stamped with the current time, pid and uid."""
    now_ms = int(self._clock.time() * 1000)
    status = TaskStatus(
        state=state,
        timestamp_ms=now_ms,
        runner_pid=os.getpid(),
        runner_uid=os.getuid())
    self._dispatcher.dispatch(self._state, RunnerCkpt(task_status=status), self._recovery)

  def _finalization_remaining(self):
    """
      Return how much time is left for finalization, never less than 0.

      A preemption deadline, when set, takes precedence.  Otherwise the configured
      finalization wait applies, counted from the finalization start; if finalization has
      not started, sys.float_info.max is returned.
    """
    # If a preemption deadline has been set, use that.
    if self._preemption_deadline:
      until_deadline = self._preemption_deadline - self._clock.time()
      return max(0, until_deadline)

    # Otherwise, use the finalization wait provided in the configuration.
    allocation = self.task.finalization_wait().get()
    if self._finalization_start is None:
      return sys.float_info.max
    elapsed = max(0, self._clock.time() - self._finalization_start)
    return max(0, allocation - elapsed)

  def _set_process_status(self, process_name, process_state, **kw):
    """
      Dispatch a ProcessStatus record moving `process_name` to `process_state`.

      A `sequence_number` keyword forces the record's sequence number; otherwise it is the
      current run's sequence number plus one, or 0 for a process with no run yet (which must
      be entering WAITING).  Remaining keywords are passed on to ProcessStatus.
    """
    try:
      sequence_number = kw.pop('sequence_number')
    except KeyError:
      previous_run = self._current_process_run(process_name)
      if previous_run:
        sequence_number = previous_run.seq + 1
      else:
        assert process_state == ProcessState.WAITING
        sequence_number = 0
      log.debug('_set_process_status(%s <= %s, seq=%s[auto])', process_name,
        ProcessState._VALUES_TO_NAMES.get(process_state), sequence_number)
    else:
      log.debug('_set_process_status(%s <= %s, seq=%s[force])', process_name,
        ProcessState._VALUES_TO_NAMES.get(process_state), sequence_number)
    status = ProcessStatus(
        process=process_name, state=process_state, seq=sequence_number, **kw)
    self._dispatcher.dispatch(self._state, RunnerCkpt(process_status=status), self._recovery)

  def _task_process_from_process_name(self, process_name, sequence_number):
    """
      Construct a Process() object from a process_name, populated with its
      correct run number and fully interpolated commandline.

      Raises InternalError if process_name is not one of the task's processes.
    """
    # The run number is the index of the latest entry in this process's recorded history.
    run_number = len(self.state.processes[process_name]) - 1
    pathspec = self._pathspec.given(process=process_name, run=run_number)
    process = self._process_map.get(process_name)
    if process is None:
      raise self.InternalError('FATAL: Could not find process: %s' % process_name)
    def close_ckpt_and_fork():
      # Fork wrapper handed to Process: in the child (pid == 0) the runner's checkpoint
      # stream, if open, is closed.
      pid = os.fork()
      if pid == 0 and self._ckpt is not None:
        self._ckpt.close()
      return pid

    (logger_destination,
     logger_mode,
     rotate_log_size,
     rotate_log_backups) = self._build_process_logger_args(process)

    return Process(
      process.name().get(),
      process.cmdline().get(),
      sequence_number,
      pathspec,
      self._sandbox,
      self._user,
      chroot=self._chroot,
      fork=close_ckpt_and_fork,
      logger_destination=logger_destination,
      logger_mode=logger_mode,
      rotate_log_size=rotate_log_size,
      rotate_log_backups=rotate_log_backups,
      preserve_env=self._preserve_env,
      mesos_containerizer_path=self._mesos_containerizer_path,
      container_sandbox=self._container_sandbox)

  # Schema defaults used when neither flags nor process config specify a value.
  _DEFAULT_LOGGER = Logger()
  _DEFAULT_ROTATION = RotatePolicy()

  def _build_process_logger_args(self, process):
    """
      Work out the logging configuration for `process` as a
      (destination, mode, rotate size, rotate backups) tuple.

      A logger configured on the process wins over the runner-level flags; the flags win
      over the schema defaults.  Size and backups are None unless the mode is ROTATE.
    """
    destination = self._DEFAULT_LOGGER.destination().get()
    mode = self._DEFAULT_LOGGER.mode().get()

    logger = process.logger()
    configured = logger is not Empty
    if configured:
      destination = logger.destination().get()
      mode = logger.mode().get()
    else:
      destination = self._process_logger_destination or destination
      mode = self._process_logger_mode or mode

    if mode != LoggerMode.ROTATE:
      return destination, mode, None, None

    size = Amount(self._DEFAULT_ROTATION.log_size().get(), Data.BYTES)
    backups = self._DEFAULT_ROTATION.backups().get()
    if configured:
      rotate = logger.rotate()
      if rotate is not Empty:
        size = Amount(rotate.log_size().get(), Data.BYTES)
        backups = rotate.backups().get()
    else:
      if self._rotate_log_size_mb:
        size = Amount(self._rotate_log_size_mb, Data.MB)
      backups = self._rotate_log_backups or backups

    return destination, mode, size, backups

  def deadlocked(self, plan=None):
    """Return True if `plan` (the regular plan by default) has no running, runnable or waiting
    processes and yet is not complete."""
    target = plan or self._regular_plan
    now = self._clock.time()
    running = list(target.running)
    runnable = list(target.runnable_at(now))
    waiting = list(target.waiting_at(now))
    log.debug('running:%d runnable:%d waiting:%d complete:%s',
      len(running), len(runnable), len(waiting), target.is_complete())
    has_work = bool(running or runnable or waiting)
    return not has_work and not target.is_complete()

  def is_healthy(self):
    """Return whether the TaskRunner is healthy, i.e. the regular plan is not deadlocked and
    the number of failed processes is below max_failures (0 disables the limit)."""
    max_failures = self._task.max_failures().get()
    deadlocked = self.deadlocked()
    failed_count = len(self._regular_plan.failed)
    under_failure_limit = max_failures == 0 or failed_count < max_failures
    healthy = not deadlocked and under_failure_limit
    log.debug('max_failures:%d failed:%d under_failure_limit:%s deadlocked:%s ==> health:%s',
      max_failures, failed_count, under_failure_limit, deadlocked, healthy)
    return healthy

  def _current_process_run(self, process_name):
    """Return the latest recorded run of process_name, or None if it has no recorded runs."""
    known = self._state.processes
    if process_name in known:
      history = known[process_name]
      if len(history) != 0:
        return history[-1]
    return None

  def is_process_lost(self, process_name):
    """Determine whether the current run of process_name should be considered LOST.

    A run is lost when either:
      - it is FORKED and more than LOST_TIMEOUT has passed since its fork time, or
      - it is RUNNING, no coordinator pid is found for it, and the watcher has no data
        pending for it.

    This method only reports the condition; recording the LOST status is left to the
    caller (see _run_plan).

    Returns True if the run is lost, False otherwise.
    Raises InternalError if the process has no recorded run.
    """
    current_run = self._current_process_run(process_name)
    if not current_run:
      raise self.InternalError('No current_run for process %s!' % process_name)

    def forked_but_never_came_up():
      return current_run.state == ProcessState.FORKED and (
        self._clock.time() - current_run.fork_time > self.LOST_TIMEOUT.as_(Time.SECONDS))

    def running_but_coordinator_died():
      if current_run.state != ProcessState.RUNNING:
        return False
      coordinator_pid, _, _ = TaskRunnerHelper.scan_process(self.state, process_name)
      if coordinator_pid is not None:
        return False
      elif self._watcher.has_data(process_name):
        return False
      return True

    # Evaluate each predicate exactly once and reuse the results for logging, rather than
    # repeating the clock read and the TaskRunnerHelper.scan_process call for the debug output.
    never_came_up = forked_but_never_came_up()
    coordinator_died = running_but_coordinator_died()

    if never_came_up or coordinator_died:
      log.info('Detected a LOST task: %s', current_run)
      log.debug('  forked_but_never_came_up: %s', never_came_up)
      log.debug('  running_but_coordinator_died: %s', coordinator_died)
      return True

    return False

  def _run_plan(self, plan):
    """
      Perform one scheduling pass over `plan`.

      Running processes detected as lost are marked LOST, then runnable processes are
      forked, limited by the task's max_concurrency (0 means no limit).  A process that
      fails to launch is marked FAILED.

      Returns True if at least one process was launched during this pass.
    """
    log.debug('Schedule pass:')

    # Snapshot of the running processes at the start of the pass; pick_processes() counts
    # against it when enforcing max_concurrency.
    running = list(plan.running)
    log.debug('running: %s', ' '.join(plan.running))
    log.debug('finished: %s', ' '.join(plan.finished))

    launched = []
    for process_name in plan.running:
      if self.is_process_lost(process_name):
        self._set_process_status(process_name, ProcessState.LOST)

    now = self._clock.time()
    runnable = list(plan.runnable_at(now))
    waiting = list(plan.waiting_at(now))
    log.debug('runnable: %s', ' '.join(runnable))
    log.debug('waiting: %s', ' '.join(
        '%s[T-%.1fs]' % (process, plan.get_wait(process)) for process in waiting))

    def pick_processes(process_list):
      max_concurrency = self._task.max_concurrency().get()
      if max_concurrency == 0:
        return process_list
      num_to_pick = max(max_concurrency - len(running), 0)
      return process_list[:num_to_pick]

    for process_name in pick_processes(runnable):
      tp = self._task_processes.get(process_name)
      if tp:
        current_run = self._current_process_run(process_name)
        assert current_run.state == ProcessState.WAITING
      else:
        # Dispatching the WAITING status is expected to register the task process in
        # self._task_processes, which is read back on the next line.
        self._set_process_status(process_name, ProcessState.WAITING)
        tp = self._task_processes[process_name]
      log.info('Forking Process(%s)', process_name)
      try:
        tp.start()
        launched.append(tp)
      except Process.Error as e:
        log.error('Failed to launch process: %s', e)
        self._set_process_status(process_name, ProcessState.FAILED)

    return len(launched) > 0

  def _terminate_plan(self, plan):
    """Terminate orphans, then every running process of `plan` whose latest run is FORKED or
    RUNNING."""
    TaskRunnerHelper.terminate_orphans(self.state)

    for name in plan.running:
      latest = self._current_process_run(name)
      if not latest or latest.state not in (ProcessState.FORKED, ProcessState.RUNNING):
        continue
      TaskRunnerHelper.terminate_process(self.state, name)

  def has_running_processes(self):
    """
      Returns True if any entry of the scanned process tree contains a truthy value,
      i.e. any process associated with this task still has an active pid.
    """
    for process_set in TaskRunnerHelper.scan_tree(self.state).values():
      if any(process_set):
        return True
    return False

  def has_active_processes(self):
    """
      Returns True if the latest run of any process is in a non-terminal state.
      Processes without a recorded run are ignored.
    """
    for name in self.state.processes:
      latest = self._current_process_run(name)
      if latest and not TaskRunnerHelper.is_process_terminal(latest.state):
        return True
    return False

  def collect_updates(self, timeout=None):
    """
      Poll the process checkpoint streams and dispatch whatever updates arrive first.

      Returns the number of updates applied by the first non-empty poll, or 0 if there are
      no active processes or `timeout` seconds of polling pass without updates.  With
      timeout=None, polling continues until updates arrive.
    """
    if not self.has_active_processes():
      return 0

    poll_interval = self.COORDINATOR_INTERVAL_SLEEP.as_(Time.SECONDS)
    waited = 0.0

    while True:
      updates = self._watcher.select()
      for update in updates:
        self._dispatcher.dispatch(self._state, update, self._recovery)
      if updates:
        return len(updates)
      timed_out = timeout is not None and waited >= timeout
      if timed_out:
        return 0
      waited += poll_interval
      self._clock.sleep(poll_interval)

  def is_terminal(self):
    """Return whether TaskRunnerHelper considers the current task state terminal."""
    return TaskRunnerHelper.is_task_terminal(self.task_state())

  def terminal_state(self):
    """Return the terminal state for the task: the forced one if set (see kill), otherwise
    SUCCESS when the runner is healthy and FAILED when it is not."""
    forced = self._terminal_state
    if not forced:
      return TaskState.SUCCESS if self.is_healthy() else TaskState.FAILED
    log.debug('Forced terminal state: %s' %
        TaskState._VALUES_TO_NAMES.get(forced, 'UNKNOWN'))
    return forced

  def run(self, force=False):
    """
      Entrypoint to runner. Assume control of checkpoint stream, and execute TaskRunnerStages
      until runner is terminal.

      Returns immediately if the task is already terminal.  force is passed to control().
    """
    if self.is_terminal():
      return
    with self.control(force):
      self._run()

  def _run(self):
    """
      Main loop: run the stage for the current task state until the task is terminal.

      Each iteration runs the stage, then either transitions to the stage's next state (when
      the stage reports no more work) or collects process updates for up to the wait the
      stage returned, and finally reaps child processes.
    """
    while not self.is_terminal():
      start = self._clock.time()
      # step 1: execute stage corresponding to the state we're currently in
      runner = self._stages[self.task_state()]
      iteration_wait = runner.run()
      if iteration_wait is None:
        # None from the stage means it is done; move to its follow-up state and start over.
        log.debug('Run loop: No more work to be done in state %s' %
            TaskState._VALUES_TO_NAMES.get(self.task_state(), 'UNKNOWN'))
        self._set_task_status(runner.transition_to())
        continue
      log.debug('Run loop: Work to be done within %.1fs', iteration_wait)
      # step 2: check child process checkpoint streams for updates
      if not self.collect_updates(iteration_wait):
        # If we don't collect any updates, at least 'touch' the checkpoint stream
        # so as to prevent garbage collection.
        elapsed = self._clock.time() - start
        if elapsed < iteration_wait:
          # collect_updates() can return 0 early (e.g. no active processes), so sleep out
          # the rest of the iteration here.
          log.debug('Update collection only took %.1fs, idling %.1fs',
              elapsed, iteration_wait - elapsed)
          self._clock.sleep(iteration_wait - elapsed)
        log.debug('Run loop: No updates collected, touching checkpoint.')
        os.utime(self._pathspec.getpath('runner_checkpoint'), None)
      # step 3: reap any zombie child processes
      TaskRunnerHelper.reap_children()

  def kill(self, force=False, terminal_status=TaskState.KILLED,
           preemption_wait=Amount(1, Time.MINUTES)):
    """
      Kill all processes associated with this task and set task/process states as terminal_status
      (defaults to KILLED)

      force: passed to control().
      terminal_status: must be TaskState.KILLED or TaskState.LOST; it becomes the forced
        terminal state reported by terminal_state().
      preemption_wait: Amount of time from now used to set the preemption deadline that
        _finalization_remaining() counts down to.
    """
    log.debug('Runner issued kill: force:%s, preemption_wait:%s',
      force, preemption_wait)
    assert terminal_status in (TaskState.KILLED, TaskState.LOST)
    self._preemption_deadline = self._clock.time() + preemption_wait.as_(Time.SECONDS)
    with self.control(force):
      if self.is_terminal():
        log.warning('Task is not in ACTIVE state, cannot issue kill.')
        return
      self._terminal_state = terminal_status
      # An ACTIVE task is moved to CLEANING; any other non-terminal state is left as is and
      # the run loop continues from there.
      if self.task_state() == TaskState.ACTIVE:
        self._set_task_status(TaskState.CLEANING)
      self._run()

  def lose(self, force=False):
    """
      Mark a task as LOST and kill any straggling processes.

      Equivalent to kill() with a LOST terminal status and a zero preemption wait.
    """
    self.kill(force, preemption_wait=Amount(0, Time.SECONDS), terminal_status=TaskState.LOST)

  def _kill(self):
    """
      Walk the scanned process tree and push every process towards a terminal state.

      For a process whose latest run is already terminal, kill_process is called (with a
      warning first if pids are still present).  Otherwise the process is marked KILLED if
      any pid is present, or LOST if none is and it is not still WAITING.
    """
    processes = TaskRunnerHelper.scan_tree(self._state)
    for process, pid_tuple in processes.items():
      # NOTE(review): _current_process_run() returns None for a process without recorded
      # runs, which would fail on `.state` below -- confirm scan_tree never yields such one.
      current_run = self._current_process_run(process)
      coordinator_pid, pid, tree = pid_tuple
      if TaskRunnerHelper.is_process_terminal(current_run.state):
        if coordinator_pid or pid or tree:
          log.warning('Terminal process (%s) still has running pids:', process)
          log.warning('  coordinator_pid: %s', coordinator_pid)
          log.warning('              pid: %s', pid)
          log.warning('             tree: %s', tree)
        TaskRunnerHelper.kill_process(self.state, process)
      else:
        if coordinator_pid or pid or tree:
          log.info('Transitioning %s to KILLED', process)
          self._set_process_status(process, ProcessState.KILLED,
            stop_time=self._clock.time(), return_code=-1)
        else:
          # The message is logged even for WAITING processes, whose status is left untouched.
          log.info('Transitioning %s to LOST', process)
          if current_run.state != ProcessState.WAITING:
            self._set_process_status(process, ProcessState.LOST)

Example #25

0

Show file

File: garbage.py Project: aalzabarah/incubator-aurora

 def state(self, task_id):
   """Return the checkpoint state for task_id, reading it from the task's checkpoint file on
   first use and serving it from the cache afterwards."""
   cache = self._states
   if task_id not in cache:
     checkpoint_path = self._detector.get_checkpoint(task_id)
     cache[task_id] = CheckpointDispatcher.from_file(checkpoint_path)
   return cache[task_id]

Example #26

0

Show file

File: monitor.py Project: radhikari54/Mastering-Mesos

class TaskMonitor(object):
  """
    Rebuilds and tracks the state of a single Thermos task by reading its runner checkpoint,
    and reports which of the task's processes are currently running.
  """

  def __init__(self, root, task_id):
    """Construct a TaskMonitor.

    :param root: The checkpoint root of the task.
    :param task_id: The task id of the task.
    """
    pathspec = TaskPath(root=root, task_id=task_id)
    self._dispatcher = CheckpointDispatcher()
    self._runnerstate = RunnerState(processes={})
    self._runner_ckpt = pathspec.getpath('runner_checkpoint')
    self._active_file = pathspec.given(state='active').getpath('task_path')
    self._finished_file = pathspec.given(state='finished').getpath('task_path')
    # Offset in the runner checkpoint up to which records have been consumed.
    self._ckpt_head = 0
    self._apply_states()
    self._lock = threading.Lock()

  def _apply_states(self):
    """
      Stat the runner checkpoint and, if it extends past the last read offset, dispatch the
      new records and advance the offset.

      Returns True if the read offset moved, False otherwise (including when the checkpoint
      does not exist yet).  OSErrors other than ENOENT are re-raised.
    """
    try:
      stream_size = os.stat(self._runner_ckpt).st_size
      if self._ckpt_head >= stream_size:
        return False

      with open(self._runner_ckpt, 'r') as fp:
        fp.seek(self._ckpt_head)
        reader = ThriftRecordReader(fp, RunnerCkpt)
        record = reader.try_read()
        while record:
          try:
            self._dispatcher.dispatch(self._runnerstate, record)
          except CheckpointDispatcher.InvalidSequenceNumber as e:
            log.error('Checkpoint stream is corrupt: %s' % e)
            break
          record = reader.try_read()
        previous_head, self._ckpt_head = self._ckpt_head, fp.tell()
      return previous_head != self._ckpt_head
    except OSError as e:
      if e.errno != errno.ENOENT:
        raise
      # The log doesn't yet exist, will retry later.
      log.warning('Could not read from checkpoint %s' % self._runner_ckpt)
      return False

  def refresh(self):
    """
      Apply any new checkpoint updates while holding the lock.  Returns True if updates were
      applied, False otherwise.
    """
    with self._lock:
      applied = self._apply_states()
    return applied

  def get_sandbox(self):
    """Get the sandbox of this task, or None if it has not yet been discovered."""
    header = self.get_state().header
    return header.sandbox if header else None

  def get_state(self):
    """Get a deep copy of the latest state of this Task."""
    with self._lock:
      self._apply_states()
      snapshot = copy.deepcopy(self._runnerstate)
    return snapshot

  def task_state(self):
    """Return the state of the latest task status, or TaskState.ACTIVE if there is none."""
    statuses = self.get_state().statuses
    if not statuses:
      return TaskState.ACTIVE
    return statuses[-1].state

  @property
  def active(self):
    """Whether the task's 'active' task file exists."""
    return os.path.exists(self._active_file)

  @property
  def finished(self):
    """Whether the task's 'finished' task file exists."""
    return os.path.exists(self._finished_file)

  def get_active_processes(self):
    """
      Get active processes.  Returned is a list of tuples of the form:
        (ProcessStatus object of running object, its run number)
    """
    with self._lock:
      self._apply_states()
      return [(runs[-1], len(runs) - 1)
              for runs in self._runnerstate.processes.values()
              if len(runs) != 0 and runs[-1].state == ProcessState.RUNNING]

Example #27

0

Show file

File: runner.py Project: theevocater/aurora

    def __init__(self,
                 task,
                 checkpoint_root,
                 sandbox,
                 log_dir=None,
                 task_id=None,
                 portmap=None,
                 user=None,
                 chroot=False,
                 clock=time,
                 universal_handler=None,
                 planner_class=TaskPlanner,
                 hostname=None,
                 process_logger_destination=None,
                 process_logger_mode=None,
                 rotate_log_size_mb=None,
                 rotate_log_backups=None,
                 preserve_env=False):
        """
        Set up a runner for the given task and replay any existing runner checkpoint.

        required:
          task (config.Task) = the task to run
          checkpoint_root (path) = the checkpoint root
          sandbox (path) = the sandbox in which the path will be run
                           [if None, cwd will be assumed, but garbage collection will be
                            disabled for this task.]

        optional:
          log_dir (string)  = directory to house stdout/stderr logs. If not specified, logs will
                              be written into the sandbox directory under .logs/
          task_id (string)  = bind to this task id.  if not specified, will synthesize an id
                              based upon task.name()
          portmap (dict)    = a map (string => integer) from name to port, e.g. { 'http': 80 }
          user (string)     = the user to run the task as.  if not current user, requires setuid
                              privileges.
          chroot (boolean)  = whether or not to chroot into the sandbox prior to exec.
          clock (time interface) = the clock to use throughout
          universal_handler = checkpoint record handler (only used for testing)
          planner_class (TaskPlanner class) = TaskPlanner class to use for constructing the task
                              planning policy.
          hostname (string) = hostname kept for this runner; defaults to socket.gethostname()
          process_logger_destination (string) = The destination of logger to use for all
                              processes.
          process_logger_mode (string) = The mode of logger to use for all processes.
          rotate_log_size_mb (integer) = The maximum size of the rotated stdout/stderr logs in
                              MiB.
          rotate_log_backups (integer) = The maximum number of rotated stdout/stderr log backups.
          preserve_env (boolean) = whether or not env variables for the runner should be in the
                                   env for the task being run

        raises:
          TypeError   if planner_class is not a subclass of TaskPlanner
          ValueError  if a user other than the current one is requested and the effective uid
                      is not 0
          InvalidTask if the task fails validation or cannot be fully interpolated
        """
        if not issubclass(planner_class, TaskPlanner):
            raise TypeError('planner_class must be a TaskPlanner.')
        self._clock = clock
        launch_time = self._clock.time()
        # NOTE: despite the "_ms" suffix, this is the fractional second of launch_time scaled by
        # 10**6 and zero-padded to six digits; it is only used in the synthesized task id.
        launch_time_ms = '%06d' % int(
            (launch_time - int(launch_time)) * (10**6))
        if not task_id:
            # Synthesize "<task name>-<YYYYmmdd-HHMMSS>.<fraction>" from the launch time.
            self._task_id = '%s-%s.%s' % (
                task.name(),
                time.strftime('%Y%m%d-%H%M%S',
                              time.localtime(launch_time)), launch_time_ms)
        else:
            self._task_id = task_id
        current_user = TaskRunnerHelper.get_actual_user()
        self._user = user or current_user
        # TODO(wickman) This should be delegated to the ProcessPlatform / Helper
        # Running as a different user is only accepted when the effective uid is 0.
        if self._user != current_user:
            if os.geteuid() != 0:
                raise ValueError(
                    'task specifies user as %s, but %s does not have setuid permission!'
                    % (self._user, current_user))
        self._portmap = portmap or {}
        self._launch_time = launch_time
        # NOTE(review): os.path.join() needs a real path, so a None sandbox only works here when
        # log_dir is passed explicitly.
        self._log_dir = log_dir or os.path.join(sandbox, '.logs')
        self._process_logger_destination = process_logger_destination
        self._process_logger_mode = process_logger_mode
        self._rotate_log_size_mb = rotate_log_size_mb
        self._rotate_log_backups = rotate_log_backups
        self._pathspec = TaskPath(root=checkpoint_root,
                                  task_id=self._task_id,
                                  log_dir=self._log_dir)
        self._hostname = hostname or socket.gethostname()
        # Validate the raw task and its port requirements before interpolating it.
        try:
            ThermosTaskValidator.assert_valid_task(task)
            ThermosTaskValidator.assert_valid_ports(task, self._portmap)
        except ThermosTaskValidator.InvalidTaskError as e:
            raise self.InvalidTask('Invalid task: %s' % e)
        # Bind the thermos context (task id, ports, user) into the task; anything left
        # uninterpolated is an error.
        context = ThermosContext(task_id=self._task_id,
                                 ports=self._portmap,
                                 user=self._user)
        self._task, uninterp = (task %
                                Environment(thermos=context)).interpolate()
        if len(uninterp) > 0:
            raise self.InvalidTask('Failed to interpolate task, missing: %s' %
                                   ', '.join(str(ref) for ref in uninterp))
        try:
            ThermosTaskValidator.assert_same_task(self._pathspec, self._task)
        except ThermosTaskValidator.InvalidTaskError as e:
            raise self.InvalidTask('Invalid task: %s' % e)
        self._plan = None  # plan currently being executed (updated by Handlers)
        # Two plans over the same task: one for processes whose `final` flag is False and one
        # for processes whose `final` flag is True.
        self._regular_plan = planner_class(
            self._task,
            clock=clock,
            process_filter=lambda proc: proc.final().get() is False)
        self._finalizing_plan = planner_class(
            self._task,
            clock=clock,
            process_filter=lambda proc: proc.final().get() is True)
        self._chroot = chroot
        self._sandbox = sandbox
        self._terminal_state = None
        self._ckpt = None  # checkpoint stream handle; stays None until one is opened
        # Map of process name -> process definition from the interpolated task.
        self._process_map = dict(
            (p.name().get(), p) for p in self._task.processes())
        self._task_processes = {}
        # Instantiate one stage object per entry in STAGES, each constructed with this runner.
        self._stages = dict(
            (state, stage(self)) for state, stage in self.STAGES.items())
        self._finalization_start = None
        self._preemption_deadline = None
        self._watcher = ProcessMuxer(self._pathspec)
        self._state = RunnerState(processes={})
        self._preserve_env = preserve_env

        # create runner state
        # Handlers receive this runner instance; universal_handler may be swapped out (testing).
        universal_handler = universal_handler or TaskRunnerUniversalHandler
        self._dispatcher = CheckpointDispatcher()
        self._dispatcher.register_handler(universal_handler(self))
        self._dispatcher.register_handler(TaskRunnerProcessHandler(self))
        self._dispatcher.register_handler(TaskRunnerTaskHandler(self))

        # recover checkpointed runner state and update plan
        # The runner starts in recovery mode and replays the runner checkpoint last, once every
        # attribute above has been initialized.
        self._recovery = True
        self._replay_runner_ckpt()

Example #28

0

Show file

File: monitor.py Project: rowoot/aurora

class TaskMonitor(object):
    """
    Reconstructs and tracks the state of a single Thermos task from its runner checkpoint.

    The monitor reads the checkpoint stream incrementally and also reports which of the task's
    processes are currently running.
    """

    def __init__(self, root, task_id):
        """Construct a TaskMonitor.

        :param root: The checkpoint root of the task.
        :param task_id: The task id of the task.
        """
        pathspec = TaskPath(root=root, task_id=task_id)
        self._dispatcher = CheckpointDispatcher()
        self._runnerstate = RunnerState(processes={})
        self._runner_ckpt = pathspec.getpath("runner_checkpoint")
        self._active_file = pathspec.given(state="active").getpath("task_path")
        self._finished_file = pathspec.given(state="finished").getpath("task_path")
        self._ckpt_head = 0
        self._apply_states()
        self._lock = threading.Lock()

    def _apply_states(self):
        """
        Pull records appended to the runner checkpoint since the last call into the runner state.

        The current file size is compared with the remembered read offset.  When the file has
        grown, records are read from that offset and dispatched, and the offset is moved to
        wherever reading stopped.  Returns True if the offset moved, False otherwise (including
        when the checkpoint file does not exist yet).
        """
        try:
            stream_size = os.stat(self._runner_ckpt).st_size
            if self._ckpt_head >= stream_size:
                return False
            with open(self._runner_ckpt, "r") as fp:
                fp.seek(self._ckpt_head)
                reader = ThriftRecordReader(fp, RunnerCkpt)
                runner_update = reader.try_read()
                while runner_update:
                    try:
                        self._dispatcher.dispatch(self._runnerstate, runner_update)
                    except CheckpointDispatcher.InvalidSequenceNumber as e:
                        log.error("Checkpoint stream is corrupt: %s" % e)
                        break
                    runner_update = reader.try_read()
                previous_head, self._ckpt_head = self._ckpt_head, fp.tell()
            return previous_head != self._ckpt_head
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
            # The log doesn't yet exist, will retry later.
            log.warning("Could not read from checkpoint %s" % self._runner_ckpt)
            return False

    def refresh(self):
        """
        Apply any new checkpoint records while holding the lock.

        Returns True if updates were applied, False otherwise.
        """
        with self._lock:
            applied = self._apply_states()
        return applied

    def get_sandbox(self):
        """Get the sandbox of this task, or None if it has not yet been discovered."""
        header = self.get_state().header
        return header.sandbox if header else None

    def get_state(self):
        """Apply pending updates and return a deep copy of the latest runner state."""
        with self._lock:
            self._apply_states()
            return copy.deepcopy(self._runnerstate)

    def task_state(self):
        """Return the state of the most recent task status, or TaskState.ACTIVE if none exist."""
        statuses = self.get_state().statuses
        if not statuses:
            return TaskState.ACTIVE
        return statuses[-1].state

    @property
    def active(self):
        """True when the task's "active" task path exists on disk."""
        return os.path.exists(self._active_file)

    @property
    def finished(self):
        """True when the task's "finished" task path exists on disk."""
        return os.path.exists(self._finished_file)

    def get_active_processes(self):
        """
        Report the processes whose most recent run is in the RUNNING state.

        Returned is a list of tuples of the form:
          (ProcessStatus object of the latest run, its zero-based run number)
        """
        with self._lock:
            self._apply_states()
            histories = [runs for runs in self._runnerstate.processes.values() if len(runs) != 0]
            return [(runs[-1], len(runs) - 1)
                    for runs in histories
                    if runs[-1].state == ProcessState.RUNNING]

Example #29

0

Show file

File: runner.py Project: theevocater/aurora

class TaskRunner(object):
    """
    Run a ThermosTask.

    This class encapsulates the core logic to run and control the state of a Thermos task.
    Typically, it will be instantiated directly to control a new task, but a TaskRunner can also be
    synthesised from an existing task's checkpoint root
  """
    class Error(Exception):
        """Base class for the exceptions declared on this runner."""

    class InternalError(Error):
        """Raised when the runner's own bookkeeping is inconsistent (e.g. unknown process)."""

    class InvalidTask(Error):
        """Raised when the task fails validation or cannot be fully interpolated."""

    class PermissionError(Error):
        """Raised when the runner checkpoint cannot be opened."""

    class StateError(Error):
        """Raised when an operation is attempted on a task in a terminal state."""

    # Maximum amount of time we spend waiting for new updates from the checkpoint streams
    # before doing housecleaning (checking for LOST tasks, dead PIDs.)
    MAX_ITERATION_TIME = Amount(10, Time.SECONDS)

    # Minimum amount of time we wait between polls for updates on coordinator checkpoints.
    COORDINATOR_INTERVAL_SLEEP = Amount(1, Time.SECONDS)

    # Amount of time we're willing to wait after forking before we expect the runner to have
    # exec'ed the child process.
    LOST_TIMEOUT = Amount(60, Time.SECONDS)

    # Active task stages
    STAGES = {
        TaskState.ACTIVE: TaskRunnerStage_ACTIVE,
        TaskState.CLEANING: TaskRunnerStage_CLEANING,
        TaskState.FINALIZING: TaskRunnerStage_FINALIZING
    }

    @classmethod
    def get(cls, task_id, checkpoint_root):
        """
        Rebuild a TaskRunner for task_id from the active state stored under checkpoint_root.

        Returns None when the task's JSON config is missing, fails to load or contains no tasks,
        when the checkpoint (or its header) is absent, or when reconstruction raises -- in which
        case the error is logged with its traceback.
        """
        active_path = TaskPath(root=checkpoint_root, task_id=task_id, state='active')
        config_path = active_path.getpath('task_path')
        checkpoint_path = active_path.getpath('runner_checkpoint')
        if not os.path.exists(config_path):
            return None
        task = ThermosConfigLoader.load_json(config_path)
        if task is None or len(task.tasks()) == 0:
            return None
        try:
            checkpoint = CheckpointDispatcher.from_file(checkpoint_path)
            header = None if checkpoint is None else checkpoint.header
            if header is None:
                return None
            # Sandbox, log dir, ports and hostname all come from the checkpointed header.
            return cls(task.tasks()[0].task(),
                       checkpoint_root,
                       header.sandbox,
                       log_dir=header.log_dir,
                       task_id=task_id,
                       portmap=header.ports,
                       hostname=header.hostname)
        except Exception as e:
            log.error(
                'Failed to reconstitute checkpoint in TaskRunner.get: %s' % e,
                exc_info=True)
            return None

    def __init__(self,
                 task,
                 checkpoint_root,
                 sandbox,
                 log_dir=None,
                 task_id=None,
                 portmap=None,
                 user=None,
                 chroot=False,
                 clock=time,
                 universal_handler=None,
                 planner_class=TaskPlanner,
                 hostname=None,
                 process_logger_destination=None,
                 process_logger_mode=None,
                 rotate_log_size_mb=None,
                 rotate_log_backups=None,
                 preserve_env=False):
        """
        Set up a runner for the given task and replay any existing runner checkpoint.

        required:
          task (config.Task) = the task to run
          checkpoint_root (path) = the checkpoint root
          sandbox (path) = the sandbox in which the path will be run
                           [if None, cwd will be assumed, but garbage collection will be
                            disabled for this task.]

        optional:
          log_dir (string)  = directory to house stdout/stderr logs. If not specified, logs will
                              be written into the sandbox directory under .logs/
          task_id (string)  = bind to this task id.  if not specified, will synthesize an id
                              based upon task.name()
          portmap (dict)    = a map (string => integer) from name to port, e.g. { 'http': 80 }
          user (string)     = the user to run the task as.  if not current user, requires setuid
                              privileges.
          chroot (boolean)  = whether or not to chroot into the sandbox prior to exec.
          clock (time interface) = the clock to use throughout
          universal_handler = checkpoint record handler (only used for testing)
          planner_class (TaskPlanner class) = TaskPlanner class to use for constructing the task
                              planning policy.
          hostname (string) = hostname kept for this runner; defaults to socket.gethostname()
          process_logger_destination (string) = The destination of logger to use for all
                              processes.
          process_logger_mode (string) = The mode of logger to use for all processes.
          rotate_log_size_mb (integer) = The maximum size of the rotated stdout/stderr logs in
                              MiB.
          rotate_log_backups (integer) = The maximum number of rotated stdout/stderr log backups.
          preserve_env (boolean) = whether or not env variables for the runner should be in the
                                   env for the task being run

        raises:
          TypeError   if planner_class is not a subclass of TaskPlanner
          ValueError  if a user other than the current one is requested and the effective uid
                      is not 0
          InvalidTask if the task fails validation or cannot be fully interpolated
        """
        if not issubclass(planner_class, TaskPlanner):
            raise TypeError('planner_class must be a TaskPlanner.')
        self._clock = clock
        launch_time = self._clock.time()
        # NOTE: despite the "_ms" suffix, this is the fractional second of launch_time scaled by
        # 10**6 and zero-padded to six digits; it is only used in the synthesized task id.
        launch_time_ms = '%06d' % int(
            (launch_time - int(launch_time)) * (10**6))
        if not task_id:
            # Synthesize "<task name>-<YYYYmmdd-HHMMSS>.<fraction>" from the launch time.
            self._task_id = '%s-%s.%s' % (
                task.name(),
                time.strftime('%Y%m%d-%H%M%S',
                              time.localtime(launch_time)), launch_time_ms)
        else:
            self._task_id = task_id
        current_user = TaskRunnerHelper.get_actual_user()
        self._user = user or current_user
        # TODO(wickman) This should be delegated to the ProcessPlatform / Helper
        # Running as a different user is only accepted when the effective uid is 0.
        if self._user != current_user:
            if os.geteuid() != 0:
                raise ValueError(
                    'task specifies user as %s, but %s does not have setuid permission!'
                    % (self._user, current_user))
        self._portmap = portmap or {}
        self._launch_time = launch_time
        # NOTE(review): os.path.join() needs a real path, so a None sandbox only works here when
        # log_dir is passed explicitly.
        self._log_dir = log_dir or os.path.join(sandbox, '.logs')
        self._process_logger_destination = process_logger_destination
        self._process_logger_mode = process_logger_mode
        self._rotate_log_size_mb = rotate_log_size_mb
        self._rotate_log_backups = rotate_log_backups
        self._pathspec = TaskPath(root=checkpoint_root,
                                  task_id=self._task_id,
                                  log_dir=self._log_dir)
        self._hostname = hostname or socket.gethostname()
        # Validate the raw task and its port requirements before interpolating it.
        try:
            ThermosTaskValidator.assert_valid_task(task)
            ThermosTaskValidator.assert_valid_ports(task, self._portmap)
        except ThermosTaskValidator.InvalidTaskError as e:
            raise self.InvalidTask('Invalid task: %s' % e)
        # Bind the thermos context (task id, ports, user) into the task; anything left
        # uninterpolated is an error.
        context = ThermosContext(task_id=self._task_id,
                                 ports=self._portmap,
                                 user=self._user)
        self._task, uninterp = (task %
                                Environment(thermos=context)).interpolate()
        if len(uninterp) > 0:
            raise self.InvalidTask('Failed to interpolate task, missing: %s' %
                                   ', '.join(str(ref) for ref in uninterp))
        try:
            ThermosTaskValidator.assert_same_task(self._pathspec, self._task)
        except ThermosTaskValidator.InvalidTaskError as e:
            raise self.InvalidTask('Invalid task: %s' % e)
        self._plan = None  # plan currently being executed (updated by Handlers)
        # Two plans over the same task: one for processes whose `final` flag is False and one
        # for processes whose `final` flag is True.
        self._regular_plan = planner_class(
            self._task,
            clock=clock,
            process_filter=lambda proc: proc.final().get() is False)
        self._finalizing_plan = planner_class(
            self._task,
            clock=clock,
            process_filter=lambda proc: proc.final().get() is True)
        self._chroot = chroot
        self._sandbox = sandbox
        self._terminal_state = None
        self._ckpt = None  # checkpoint stream handle; stays None until control() opens it
        # Map of process name -> process definition from the interpolated task.
        self._process_map = dict(
            (p.name().get(), p) for p in self._task.processes())
        self._task_processes = {}
        # Instantiate one stage object per entry in STAGES, each constructed with this runner.
        self._stages = dict(
            (state, stage(self)) for state, stage in self.STAGES.items())
        self._finalization_start = None
        self._preemption_deadline = None
        self._watcher = ProcessMuxer(self._pathspec)
        self._state = RunnerState(processes={})
        self._preserve_env = preserve_env

        # create runner state
        # Handlers receive this runner instance; universal_handler may be swapped out (testing).
        universal_handler = universal_handler or TaskRunnerUniversalHandler
        self._dispatcher = CheckpointDispatcher()
        self._dispatcher.register_handler(universal_handler(self))
        self._dispatcher.register_handler(TaskRunnerProcessHandler(self))
        self._dispatcher.register_handler(TaskRunnerTaskHandler(self))

        # recover checkpointed runner state and update plan
        # While _recovery is True, _ckpt_write() skips writing to the checkpoint stream; the
        # replay runs last, once every attribute above has been initialized.
        self._recovery = True
        self._replay_runner_ckpt()

    @property
    def task(self):
        return self._task

    @property
    def task_id(self):
        return self._task_id

    @property
    def state(self):
        return self._state

    @property
    def processes(self):
        return self._task_processes

    def task_state(self):
        return self._state.statuses[
            -1].state if self._state.statuses else TaskState.ACTIVE

    def close_ckpt(self):
        """Force close the checkpoint stream.

        This is necessary for runners terminated through exception propagation.  It is a no-op
        when no checkpoint stream has been opened yet (self._ckpt is still None), matching the
        None check performed before closing the stream in the forked child.
        """
        log.debug('Closing the checkpoint stream.')
        # self._ckpt starts out as None and is only assigned once control() opens the stream.
        if self._ckpt is not None:
            self._ckpt.close()

    @contextmanager
    def control(self, force=False):
        """
      Bind to the checkpoint associated with this task, position to the end of the log if
      it exists, or create it if it doesn't.  Fails if we cannot get "leadership" i.e. a
      file lock on the checkpoint stream.
    """
        if self.is_terminal():
            raise self.StateError(
                'Cannot take control of a task in terminal state.')
        if self._sandbox:
            safe_mkdir(self._sandbox)
        ckpt_file = self._pathspec.getpath('runner_checkpoint')
        try:
            self._ckpt = TaskRunnerHelper.open_checkpoint(ckpt_file,
                                                          force=force,
                                                          state=self._state)
        except TaskRunnerHelper.PermissionError:
            raise self.PermissionError('Unable to open checkpoint %s' %
                                       ckpt_file)
        log.debug('Flipping recovery mode off.')
        self._recovery = False
        self._set_task_status(self.task_state())
        self._resume_task()
        try:
            yield
        except Exception as e:
            log.error('Caught exception in self.control(): %s' % e)
            log.error('  %s' % traceback.format_exc())
        self._ckpt.close()

    def _resume_task(self):
        assert self._ckpt is not None
        unapplied_updates = self._replay_process_ckpts()
        if self.is_terminal():
            raise self.StateError('Cannot resume terminal task.')
        self._initialize_ckpt_header()
        self._replay(unapplied_updates)

    def _ckpt_write(self, record):
        """
      Write to the checkpoint stream if we're not in recovery mode.
    """
        if not self._recovery:
            self._ckpt.write(record)

    def _replay(self, checkpoints):
        """
      Replay a sequence of RunnerCkpts.
    """
        for checkpoint in checkpoints:
            self._dispatcher.dispatch(self._state, checkpoint)

    def _replay_runner_ckpt(self):
        """
      Replay the checkpoint stream associated with this task.
    """
        ckpt_file = self._pathspec.getpath('runner_checkpoint')
        if os.path.exists(ckpt_file):
            with open(ckpt_file, 'r') as fp:
                ckpt_recover = ThriftRecordReader(fp, RunnerCkpt)
                for record in ckpt_recover:
                    log.debug('Replaying runner checkpoint record: %s' %
                              record)
                    self._dispatcher.dispatch(self._state,
                                              record,
                                              recovery=True)

    def _replay_process_ckpts(self):
        """
      Replay the unmutating process checkpoints.  Return the unapplied process updates that
      would mutate the runner checkpoint stream.
    """
        process_updates = self._watcher.select()
        unapplied_process_updates = []
        for process_update in process_updates:
            if self._dispatcher.would_update(self._state, process_update):
                unapplied_process_updates.append(process_update)
            else:
                self._dispatcher.dispatch(self._state,
                                          process_update,
                                          recovery=True)
        return unapplied_process_updates

    def _initialize_ckpt_header(self):
        """
      Initializes the RunnerHeader for this checkpoint stream if it has not already
      been constructed.
    """
        if self._state.header is None:
            try:
                uid = pwd.getpwnam(self._user).pw_uid
            except KeyError:
                # This will cause failures downstream, but they will at least be correctly
                # reflected in the process state.
                log.error('Unknown user %s.' % self._user)
                uid = None

            header = RunnerHeader(task_id=self._task_id,
                                  launch_time_ms=int(self._launch_time * 1000),
                                  sandbox=self._sandbox,
                                  log_dir=self._log_dir,
                                  hostname=self._hostname,
                                  user=self._user,
                                  uid=uid,
                                  ports=self._portmap)
            runner_ckpt = RunnerCkpt(runner_header=header)
            self._dispatcher.dispatch(self._state, runner_ckpt)

    def _set_task_status(self, state):
        """
        Dispatch a TaskStatus for the given state, stamped with the current clock time (in ms)
        and this process's pid and uid.
        """
        status = TaskStatus(state=state,
                            timestamp_ms=int(self._clock.time() * 1000),
                            runner_pid=os.getpid(),
                            runner_uid=os.getuid())
        self._dispatcher.dispatch(self._state,
                                  RunnerCkpt(task_status=status),
                                  self._recovery)

    def _finalization_remaining(self):
        # If a preemption deadline has been set, use that.
        if self._preemption_deadline:
            return max(0, self._preemption_deadline - self._clock.time())

        # Otherwise, use the finalization wait provided in the configuration.
        finalization_allocation = self.task.finalization_wait().get()
        if self._finalization_start is None:
            return sys.float_info.max
        else:
            waited = max(0, self._clock.time() - self._finalization_start)
            return max(0, finalization_allocation - waited)

    def _set_process_status(self, process_name, process_state, **kw):
        """
        Dispatch a ProcessStatus update for process_name.

        A 'sequence_number' keyword forces that sequence number.  Otherwise it is derived from
        the process's current run: one past its seq, or 0 when there is no run yet (in which
        case the new state must be WAITING).  Remaining keywords are passed to ProcessStatus.
        """
        forced = 'sequence_number' in kw
        if forced:
            sequence_number = kw.pop('sequence_number')
        else:
            current_run = self._current_process_run(process_name)
            if current_run:
                sequence_number = current_run.seq + 1
            else:
                assert process_state == ProcessState.WAITING
                sequence_number = 0

        state_name = ProcessState._VALUES_TO_NAMES.get(process_state)
        if forced:
            log.debug('_set_process_status(%s <= %s, seq=%s[force])' %
                      (process_name, state_name, sequence_number))
        else:
            log.debug('_set_process_status(%s <= %s, seq=%s[auto])' %
                      (process_name, state_name, sequence_number))

        status = ProcessStatus(process=process_name,
                               state=process_state,
                               seq=sequence_number,
                               **kw)
        self._dispatcher.dispatch(self._state,
                                  RunnerCkpt(process_status=status),
                                  self._recovery)

    def _task_process_from_process_name(self, process_name, sequence_number):
        """
      Construct a Process() object from a process_name, populated with its
      correct run number and fully interpolated commandline.
    """
        run_number = len(self.state.processes[process_name]) - 1
        pathspec = self._pathspec.given(process=process_name, run=run_number)
        process = self._process_map.get(process_name)
        if process is None:
            raise self.InternalError('FATAL: Could not find process: %s' %
                                     process_name)

        def close_ckpt_and_fork():
            pid = os.fork()
            if pid == 0 and self._ckpt is not None:
                self._ckpt.close()
            return pid

        (logger_destination, logger_mode, rotate_log_size,
         rotate_log_backups) = self._build_process_logger_args(process)

        return Process(process.name().get(),
                       process.cmdline().get(),
                       sequence_number,
                       pathspec,
                       self._sandbox,
                       self._user,
                       chroot=self._chroot,
                       fork=close_ckpt_and_fork,
                       logger_destination=logger_destination,
                       logger_mode=logger_mode,
                       rotate_log_size=rotate_log_size,
                       rotate_log_backups=rotate_log_backups,
                       preserve_env=self._preserve_env)

    def _build_process_logger_args(self, process):
        """
        Build the logging configuration for a process from runner flags and process config.

        The process's own logger configuration is used when it has one.  Otherwise the
        runner-level settings apply, falling back to a FILE destination and STANDARD mode when
        those are unset.

        Returns a (destination, mode, size, backups) tuple.  size and backups stay None unless a
        runner-level mode is set or the process logger is in ROTATE mode.
        """
        destination, mode, size, backups = None, None, None, None
        logger = process.logger()
        if logger is Empty:
            # No per-process logger config: use the runner-level flags.
            if self._process_logger_destination:
                destination = self._process_logger_destination
            else:
                destination = LoggerDestination.FILE

            if self._process_logger_mode:
                # Assign the bare mode value; a trailing comma here would wrap it in a 1-tuple.
                mode = self._process_logger_mode
                size = Amount(self._rotate_log_size_mb, Data.MB)
                backups = self._rotate_log_backups
            else:
                mode = LoggerMode.STANDARD
        else:
            destination = logger.destination().get()
            mode = logger.mode().get()
            if mode == LoggerMode.ROTATE:
                rotate = logger.rotate()
                size = Amount(rotate.log_size().get(), Data.BYTES)
                backups = rotate.backups().get()
        return destination, mode, size, backups

    def deadlocked(self, plan=None):
        """Report whether a plan is stuck: nothing is running, runnable or waiting, yet the plan
        has not completed.  The regular plan is checked when no plan is given."""
        if not plan:
            plan = self._regular_plan
        now = self._clock.time()
        running = list(plan.running)
        runnable = list(plan.runnable_at(now))
        waiting = list(plan.waiting_at(now))
        log.debug(
            'running:%d runnable:%d waiting:%d complete:%s' %
            (len(running), len(runnable), len(waiting), plan.is_complete()))
        has_candidates = bool(running or runnable or waiting)
        return not has_candidates and not plan.is_complete()

    def is_healthy(self):
        """Report whether the runner is healthy: the regular plan is not deadlocked and its
        failure count is below the task's max_failures (a max_failures of 0 means no limit)."""
        max_failures = self._task.max_failures().get()
        deadlocked = self.deadlocked()
        if max_failures == 0:
            under_failure_limit = True
        else:
            under_failure_limit = len(self._regular_plan.failed) < max_failures
        healthy = not deadlocked and under_failure_limit
        log.debug(
            'max_failures:%d failed:%d under_failure_limit:%s deadlocked:%s ==> health:%s'
            %
            (max_failures, len(self._regular_plan.failed), under_failure_limit,
             deadlocked, healthy))
        return healthy

    def _current_process_run(self, process_name):
        if process_name not in self._state.processes or len(
                self._state.processes[process_name]) == 0:
            return None
        return self._state.processes[process_name][-1]

    def is_process_lost(self, process_name):
        """Determine whether or not we should mark a task as LOST and do so if necessary."""
        current_run = self._current_process_run(process_name)
        if not current_run:
            raise self.InternalError('No current_run for process %s!' %
                                     process_name)

        def forked_but_never_came_up():
            return current_run.state == ProcessState.FORKED and (
                self._clock.time() - current_run.fork_time >
                self.LOST_TIMEOUT.as_(Time.SECONDS))

        def running_but_coordinator_died():
            if current_run.state != ProcessState.RUNNING:
                return False
            coordinator_pid, _, _ = TaskRunnerHelper.scan_process(
                self.state, process_name)
            if coordinator_pid is not None:
                return False
            elif self._watcher.has_data(process_name):
                return False
            return True

        if forked_but_never_came_up() or running_but_coordinator_died():
            log.info('Detected a LOST task: %s' % current_run)
            log.debug('  forked_but_never_came_up: %s' %
                      forked_but_never_came_up())
            log.debug('  running_but_coordinator_died: %s' %
                      running_but_coordinator_died())
            return True

        return False

    def _run_plan(self, plan):
        """
        Perform one scheduling pass over plan.

        Running processes that appear lost are marked LOST, then as many runnable processes are
        forked as the task's max_concurrency allows (0 means no limit).  A process that fails to
        launch is marked FAILED.

        Returns True if at least one process was launched during this pass.
        """
        log.debug('Schedule pass:')

        # Snapshot of the running set, taken before any LOST transitions below;
        # pick_processes() uses its size to enforce max_concurrency.
        running = list(plan.running)
        log.debug('running: %s' % ' '.join(plan.running))
        log.debug('finished: %s' % ' '.join(plan.finished))

        launched = []
        for process_name in plan.running:
            if self.is_process_lost(process_name):
                self._set_process_status(process_name, ProcessState.LOST)

        now = self._clock.time()
        runnable = list(plan.runnable_at(now))
        waiting = list(plan.waiting_at(now))
        log.debug('runnable: %s' % ' '.join(runnable))
        log.debug('waiting: %s' % ' '.join('%s[T-%.1fs]' %
                                           (process, plan.get_wait(process))
                                           for process in waiting))

        def pick_processes(process_list):
            # Limit the candidates to the free concurrency slots (0 disables the limit).
            if self._task.max_concurrency().get() == 0:
                return process_list
            num_to_pick = max(
                self._task.max_concurrency().get() - len(running), 0)
            return process_list[:num_to_pick]

        for process_name in pick_processes(runnable):
            tp = self._task_processes.get(process_name)
            if tp:
                current_run = self._current_process_run(process_name)
                assert current_run.state == ProcessState.WAITING
            else:
                # Setting WAITING is expected to populate self._task_processes[process_name].
                self._set_process_status(process_name, ProcessState.WAITING)
                tp = self._task_processes[process_name]
            log.info('Forking Process(%s)' % process_name)
            try:
                tp.start()
                launched.append(tp)
            except Process.Error as e:
                log.error('Failed to launch process: %s' % e)
                self._set_process_status(process_name, ProcessState.FAILED)

        return len(launched) > 0

    def _terminate_plan(self, plan):
        """Terminate each running process of `plan` whose current run is FORKED or RUNNING."""
        for process_name in plan.running:
            latest_run = self._current_process_run(process_name)
            if not latest_run:
                continue
            if latest_run.state in (ProcessState.FORKED, ProcessState.RUNNING):
                TaskRunnerHelper.terminate_process(self.state, process_name)

    def has_running_processes(self):
        """
      Returns True if the scanned process tree of this task contains any active pid.
    """
        scanned = TaskRunnerHelper.scan_tree(self.state)
        for pid_group in scanned.values():
            if any(pid_group):
                return True
        return False

    def has_active_processes(self):
        """
      Returns True if at least one process has a current run in a non-terminal state.
    """
        for process in self.state.processes:
            current = self._current_process_run(process)
            # Processes without a current run are ignored.
            if current and not TaskRunnerHelper.is_process_terminal(
                    current.state):
                return True
        return False

    def collect_updates(self, timeout=None):
        """
      Polls the process checkpoint streams and dispatches whatever updates arrive.

      Returns the number of process checkpoints applied, or 0 if there are no active
      processes or nothing arrived before `timeout` seconds of polling (None waits
      indefinitely).
    """
        if not self.has_active_processes():
            return 0

        poll_interval = self.COORDINATOR_INTERVAL_SLEEP.as_(Time.SECONDS)
        waited = 0.0

        while True:
            updates = self._watcher.select()
            for update in updates:
                self._dispatcher.dispatch(self._state, update, self._recovery)
            if updates:
                return len(updates)
            # The deadline is checked before sleeping, so the accumulated
            # wait can overshoot `timeout` by at most one interval.
            if timeout is not None and waited >= timeout:
                return 0
            waited += poll_interval
            self._clock.sleep(poll_interval)

    def is_terminal(self):
        """Report whether the current task state is a terminal one."""
        current_state = self.task_state()
        return TaskRunnerHelper.is_task_terminal(current_state)

    def terminal_state(self):
        """Return the forced terminal state if one is set, else SUCCESS or FAILED by health."""
        forced = self._terminal_state
        if not forced:
            if self.is_healthy():
                return TaskState.SUCCESS
            return TaskState.FAILED
        forced_name = TaskState._VALUES_TO_NAMES.get(forced, 'UNKNOWN')
        log.debug('Forced terminal state: %s' % forced_name)
        return forced

    def run(self, force=False):
        """
      Runner entrypoint: unless the task is already terminal, take control of the
      checkpoint stream (passing `force` through) and drive the run loop.
    """
        if not self.is_terminal():
            with self.control(force):
                self._run()

    def _run(self):
        """Drive the stage for the current task state until the task is terminal."""
        while not self.is_terminal():
            loop_started = self._clock.time()
            # step 1: run the stage registered for the current task state
            stage = self._stages[self.task_state()]
            wait_hint = stage.run()
            if wait_hint is None:
                state_name = TaskState._VALUES_TO_NAMES.get(
                    self.task_state(), 'UNKNOWN')
                log.debug('Run loop: No more work to be done in state %s' %
                          state_name)
                self._set_task_status(stage.transition_to())
                continue
            log.debug('Run loop: Work to be done within %.1fs' % wait_hint)
            # step 2: poll the child process checkpoint streams
            collected = self.collect_updates(wait_hint)
            if not collected:
                # Nothing arrived: idle out the rest of the interval, then
                # 'touch' the checkpoint so it is not garbage collected.
                spent = self._clock.time() - loop_started
                if spent < wait_hint:
                    log.debug(
                        'Update collection only took %.1fs, idling %.1fs' %
                        (spent, wait_hint - spent))
                    self._clock.sleep(wait_hint - spent)
                log.debug(
                    'Run loop: No updates collected, touching checkpoint.')
                checkpoint_path = self._pathspec.getpath('runner_checkpoint')
                os.utime(checkpoint_path, None)
            # step 3: reap zombie children
            TaskRunnerHelper.reap_children()

    def kill(self,
             force=False,
             terminal_status=TaskState.KILLED,
             preemption_wait=Amount(1, Time.MINUTES)):
        """
      Kill every process of this task and finish the task in `terminal_status`
      (KILLED by default; LOST is the only other accepted value).

      `preemption_wait` sets the preemption deadline relative to the current clock time.
    """
        log.debug('Runner issued kill: force:%s, preemption_wait:%s' %
                  (force, preemption_wait))
        assert terminal_status in (TaskState.KILLED, TaskState.LOST)
        now = self._clock.time()
        self._preemption_deadline = now + preemption_wait.as_(Time.SECONDS)
        with self.control(force):
            if not self.is_terminal():
                self._terminal_state = terminal_status
                # An ACTIVE task is first moved to CLEANING before the run
                # loop takes over.
                if self.task_state() == TaskState.ACTIVE:
                    self._set_task_status(TaskState.CLEANING)
                self._run()
            else:
                log.warning('Task is not in ACTIVE state, cannot issue kill.')
                return

    def lose(self, force=False):
        """
      Kill the task with a zero preemption wait and LOST as its terminal status.
    """
        no_wait = Amount(0, Time.SECONDS)
        self.kill(force,
                  preemption_wait=no_wait,
                  terminal_status=TaskState.LOST)

    def _kill(self):
        """Kill or re-label every process found by scanning the task's process tree.

        Terminal processes are killed (with a warning if pids are still
        present).  Non-terminal processes become KILLED when pids are still
        present, otherwise LOST unless they are merely WAITING.
        """
        scanned = TaskRunnerHelper.scan_tree(self._state)
        for process, pids in scanned.items():
            current_run = self._current_process_run(process)
            coordinator_pid, pid, tree = pids
            has_pids = bool(coordinator_pid or pid or tree)

            if TaskRunnerHelper.is_process_terminal(current_run.state):
                if has_pids:
                    log.warning(
                        'Terminal process (%s) still has running pids:' %
                        process)
                    log.warning('  coordinator_pid: %s' % coordinator_pid)
                    log.warning('              pid: %s' % pid)
                    log.warning('             tree: %s' % tree)
                TaskRunnerHelper.kill_process(self.state, process)
                continue

            if has_pids:
                log.info('Transitioning %s to KILLED' % process)
                self._set_process_status(process,
                                         ProcessState.KILLED,
                                         stop_time=self._clock.time(),
                                         return_code=-1)
                continue

            log.info('Transitioning %s to LOST' % process)
            if current_run.state != ProcessState.WAITING:
                self._set_process_status(process, ProcessState.LOST)

Example #30

0

Show file

File: runner.py Project: apache/aurora

  def __init__(self, task, checkpoint_root, sandbox, log_dir=None,
               task_id=None, portmap=None, user=None, chroot=False, clock=time,
               universal_handler=None, planner_class=TaskPlanner, hostname=None,
               process_logger_destination=None, process_logger_mode=None,
               rotate_log_size_mb=None, rotate_log_backups=None,
               preserve_env=False, mesos_containerizer_path=None, container_sandbox=None):
    """
      Validate and interpolate the task, set up runner state and checkpoint handlers, and
      replay the runner checkpoint.

      required:
        task (config.Task) = the task to run
        checkpoint_root (path) = the checkpoint root
        sandbox (path) = the sandbox in which the task will be run
                         [if None, cwd will be assumed, but garbage collection will be
                          disabled for this task.]

      optional:
        log_dir (string)  = directory to house stdout/stderr logs. If not specified, logs will be
                            written into the sandbox directory under .logs/
        task_id (string)  = bind to this task id.  if not specified, will synthesize an id based
                            upon task.name()
        portmap (dict)    = a map (string => integer) from name to port, e.g. { 'http': 80 }
        user (string)     = the user to run the task as.  if not current user, requires setuid
                            privileges.
        chroot (boolean)  = whether or not to chroot into the sandbox prior to exec.
        clock (time interface) = the clock to use throughout
        universal_handler = checkpoint record handler (only used for testing)
        planner_class (TaskPlanner class) = TaskPlanner class to use for constructing the task
                            planning policy.
        hostname (string) = the hostname to use.  if not specified, socket.gethostname() is used.
        process_logger_destination (string) = The destination of logger to use for all processes.
        process_logger_mode (string) = The mode of logger to use for all processes.
        rotate_log_size_mb (integer) = The maximum size of the rotated stdout/stderr logs in MiB.
        rotate_log_backups (integer) = The maximum number of rotated stdout/stderr log backups.
        preserve_env (boolean) = whether or not env variables for the runner should be in the
                                 env for the task being run
        mesos_containerizer_path = the path to the mesos-containerizer executable that will be used
                                   to isolate the task's filesystem (if using a filesystem image).
        container_sandbox = the path within the isolated filesystem where the task's sandbox is
                            mounted.

      raises:
        TypeError        = planner_class is not a subclass of TaskPlanner
        ValueError       = a user other than the current one was requested without euid 0
        self.InvalidTask = the task fails validation or cannot be fully interpolated
    """
    if not issubclass(planner_class, TaskPlanner):
      raise TypeError('planner_class must be a TaskPlanner.')
    self._clock = clock
    launch_time = self._clock.time()
    # NOTE: despite its name, this is the fractional second of launch_time scaled by 10**6
    # (i.e. microseconds), zero-padded to six digits.
    launch_time_ms = '%06d' % int((launch_time - int(launch_time)) * (10 ** 6))
    if not task_id:
      # Synthesize an id of the form <task name>-<YYYYmmdd-HHMMSS>.<fraction>.
      self._task_id = '%s-%s.%s' % (task.name(),
                                    time.strftime('%Y%m%d-%H%M%S', time.localtime(launch_time)),
                                    launch_time_ms)
    else:
      self._task_id = task_id
    current_user = TaskRunnerHelper.get_actual_user()
    self._user = user or current_user
    # TODO(wickman) This should be delegated to the ProcessPlatform / Helper
    # Running as a different user is only accepted when the effective uid is 0.
    if self._user != current_user:
      if os.geteuid() != 0:
        raise ValueError('task specifies user as %s, but %s does not have setuid permission!' % (
          self._user, current_user))
    self._portmap = portmap or {}
    self._launch_time = launch_time
    self._log_dir = log_dir or os.path.join(sandbox, '.logs')
    self._process_logger_destination = process_logger_destination
    self._process_logger_mode = process_logger_mode
    self._rotate_log_size_mb = rotate_log_size_mb
    self._rotate_log_backups = rotate_log_backups
    self._pathspec = TaskPath(root=checkpoint_root, task_id=self._task_id, log_dir=self._log_dir)
    self._hostname = hostname or socket.gethostname()
    # Validate the raw task and its port bindings before interpolation.
    try:
      ThermosTaskValidator.assert_valid_task(task)
      ThermosTaskValidator.assert_valid_ports(task, self._portmap)
    except ThermosTaskValidator.InvalidTaskError as e:
      raise self.InvalidTask('Invalid task: %s' % e)
    context = ThermosContext(
        task_id=self._task_id,
        ports=self._portmap,
        user=self._user)
    # Bind the thermos context into the task; any references left unresolved make it invalid.
    self._task, uninterp = (task % Environment(thermos=context)).interpolate()
    if len(uninterp) > 0:
      raise self.InvalidTask('Failed to interpolate task, missing: %s' %
          ', '.join(str(ref) for ref in uninterp))
    # Check the interpolated task against the task path spec.
    try:
      ThermosTaskValidator.assert_same_task(self._pathspec, self._task)
    except ThermosTaskValidator.InvalidTaskError as e:
      raise self.InvalidTask('Invalid task: %s' % e)
    self._plan = None  # plan currently being executed (updated by Handlers)
    # Two planners over the same task: one restricted to processes whose `final` is False,
    # the other to processes whose `final` is True.
    self._regular_plan = planner_class(self._task, clock=clock,
        process_filter=lambda proc: proc.final().get() is False)
    self._finalizing_plan = planner_class(self._task, clock=clock,
        process_filter=lambda proc: proc.final().get() is True)
    self._chroot = chroot
    self._sandbox = sandbox
    self._container_sandbox = container_sandbox
    self._terminal_state = None
    self._ckpt = None
    # Process configs indexed by process name.
    self._process_map = dict((p.name().get(), p) for p in self._task.processes())
    self._task_processes = {}
    # One stage instance per entry of STAGES, each constructed with this runner.
    self._stages = dict((state, stage(self)) for state, stage in self.STAGES.items())
    self._finalization_start = None
    self._preemption_deadline = None
    self._watcher = ProcessMuxer(self._pathspec)
    self._state = RunnerState(processes={})
    self._preserve_env = preserve_env
    self._mesos_containerizer_path = mesos_containerizer_path

    # create runner state
    universal_handler = universal_handler or TaskRunnerUniversalHandler
    self._dispatcher = CheckpointDispatcher()
    self._dispatcher.register_handler(universal_handler(self))
    self._dispatcher.register_handler(TaskRunnerProcessHandler(self))
    self._dispatcher.register_handler(TaskRunnerTaskHandler(self))

    # recover checkpointed runner state and update plan
    # The recovery flag is set before the replay starts.
    self._recovery = True
    self._replay_runner_ckpt()

Example #31

0

Show file

 def state(self):
   """Return the task's RunnerState, loading it from the runner checkpoint on first access."""
   if self._state is not None:
     return self._state
   ckpt_path = self._pathspec.given(task_id=self._task_id).getpath('runner_checkpoint')
   self._state = CheckpointDispatcher.from_file(ckpt_path)
   return self._state

Example #32

0

Show file

 def state(self, task_id):
     """Return the state for `task_id`, reading its checkpoint once and caching the result."""
     if task_id in self._states:
         return self._states[task_id]
     loaded = CheckpointDispatcher.from_file(
         self._detector.get_checkpoint(task_id))
     self._states[task_id] = loaded
     return self._states[task_id]

Example #33

0

Show file

File: observed_task.py Project: radhikari54/Mastering-Mesos

 def state(self):
   """Return a deep copy of the cached RunnerState, or an empty RunnerState if it is falsy.

   The state is read from the runner checkpoint the first time it is needed.
   """
   if self._state is None:
     checkpoint_path = self._pathspec.getpath('runner_checkpoint')
     self._state = CheckpointDispatcher.from_file(checkpoint_path)
   if self._state:
     # Hand out a copy so callers cannot mutate the cached state.
     return copy.deepcopy(self._state)
   return RunnerState(processes={})

Example #34

0

Show file

File: inspector.py Project: sumanau7/incubator-aurora

  def inspect(self, task_id):
    """
      Replays the runner checkpoint and the process checkpoints of `task_id` and returns
      a CheckpointInspection summarising the runner, coordinator and process pids seen.

      Returns None if the runner checkpoint cannot be opened or read.
    """
    dispatcher = CheckpointDispatcher()
    state = RunnerState(processes = {})
    muxer = ProcessMuxer(self._path.given(task_id=task_id))

    runner_processes = []
    coordinator_processes = set()
    processes = set()

    def consume_process_record(record):
      status = record.process_status
      if not status:
        return
      try:
        user_uid = pwd.getpwnam(state.header.user).pw_uid
      except KeyError:
        log.error('Could not find user: %s' % state.header.user)
        return
      if status.state == ProcessState.FORKED:
        coordinator_processes.add((status.coordinator_pid, user_uid, status.fork_time))
      elif status.state == ProcessState.RUNNING:
        processes.add((status.pid, user_uid, status.start_time))

    # replay runner checkpoint
    last_runner_pid = None
    runner_latest_update = 0
    try:
      with open(self._path.given(task_id=task_id).getpath('runner_checkpoint')) as fp, \
           closing(ThriftRecordReader(fp, RunnerCkpt)) as ckpt:
        for record in ckpt:
          dispatcher.dispatch(state, record)
          stamp = self.get_timestamp(record.process_status)
          runner_latest_update = max(runner_latest_update, stamp)
          # collect all bound runners: one entry each time the runner pid changes
          task_status = record.task_status
          if task_status:
            if task_status.runner_pid != last_runner_pid:
              runner_processes.append((task_status.runner_pid,
                                       task_status.runner_uid or 0,
                                       task_status.timestamp_ms))
              last_runner_pid = task_status.runner_pid
          elif record.process_status:
            consume_process_record(record)
    except (IOError, OSError, RecordIO.Error) as err:
      log.debug('Error inspecting task runner checkpoint: %s' % err)
      return

    # register existing processes in muxer
    for process_name in state.processes:
      muxer.register(process_name)

    # read process checkpoints
    process_latest_update = runner_latest_update
    for record in muxer.select():
      stamp = self.get_timestamp(record.process_status)
      process_latest_update = max(process_latest_update, stamp)
      consume_process_record(record)

    return CheckpointInspection(
      runner_latest_update=runner_latest_update,
      process_latest_update=process_latest_update,
      runner_processes=runner_processes,
      coordinator_processes=coordinator_processes,
      processes=processes)

Example #35

0

Show file

    def inspect(self, task_id):
        """
      Rebuilds the checkpoint stream of `task_id` and summarises it as a
      CheckpointInspection; returns None when the runner checkpoint is unreadable.
    """
        dispatcher = CheckpointDispatcher()
        state = RunnerState(processes={})
        muxer = ProcessMuxer(self._path.given(task_id=task_id))

        runner_processes = []
        coordinator_processes = set()
        processes = set()

        def consume_process_record(record):
            """Record the coordinator or process pid carried by a process record."""
            proc_status = record.process_status
            if not proc_status:
                return
            try:
                user_uid = pwd.getpwnam(state.header.user).pw_uid
            except KeyError:
                log.error('Could not find user: %s' % state.header.user)
                return
            if proc_status.state == ProcessState.FORKED:
                entry = (proc_status.coordinator_pid, user_uid,
                         proc_status.fork_time)
                coordinator_processes.add(entry)
            elif proc_status.state == ProcessState.RUNNING:
                entry = (proc_status.pid, user_uid, proc_status.start_time)
                processes.add(entry)

        # replay runner checkpoint
        seen_runner_pid = None
        runner_latest_update = 0
        try:
            with open(
                    self._path.given(
                        task_id=task_id).getpath('runner_checkpoint')) as fp:
                with closing(ThriftRecordReader(fp, RunnerCkpt)) as reader:
                    for record in reader:
                        dispatcher.dispatch(state, record)
                        runner_latest_update = max(
                            runner_latest_update,
                            self.get_timestamp(record.process_status))
                        if record.task_status:
                            # collect all bound runners: a new entry whenever
                            # the runner pid differs from the previous one
                            bound = record.task_status
                            if bound.runner_pid != seen_runner_pid:
                                runner_processes.append(
                                    (bound.runner_pid, bound.runner_uid or 0,
                                     bound.timestamp_ms))
                                seen_runner_pid = bound.runner_pid
                        elif record.process_status:
                            consume_process_record(record)
        except (IOError, OSError, RecordIO.Error) as err:
            log.debug('Error inspecting task runner checkpoint: %s' % err)
            return

        # register existing processes in muxer
        for process_name in state.processes:
            muxer.register(process_name)

        # read process checkpoints, starting from the runner's latest timestamp
        process_latest_update = runner_latest_update
        for record in muxer.select():
            record_stamp = self.get_timestamp(record.process_status)
            process_latest_update = max(process_latest_update, record_stamp)
            consume_process_record(record)

        return CheckpointInspection(
            runner_latest_update=runner_latest_update,
            process_latest_update=process_latest_update,
            runner_processes=runner_processes,
            coordinator_processes=coordinator_processes,
            processes=processes)