Example #1
0
 def get(cls, task_id, checkpoint_root):
     """
   Get a TaskRunner bound to the task_id in checkpoint_root.
 """
     path = TaskPath(root=checkpoint_root, task_id=task_id, state="active")
     task_json = path.getpath("task_path")
     task_checkpoint = path.getpath("runner_checkpoint")
     if not os.path.exists(task_json):
         return None
     task = ThermosConfigLoader.load_json(task_json)
     if task is None:
         return None
     if len(task.tasks()) == 0:
         return None
     try:
         checkpoint = CheckpointDispatcher.from_file(task_checkpoint)
         if checkpoint is None or checkpoint.header is None:
             return None
         return cls(
             task.tasks()[0].task(),
             checkpoint_root,
             checkpoint.header.sandbox,
             log_dir=checkpoint.header.log_dir,
             task_id=task_id,
             portmap=checkpoint.header.ports,
             hostname=checkpoint.header.hostname,
         )
     except Exception as e:
         log.error("Failed to reconstitute checkpoint in TaskRunner.get: %s" % e, exc_info=True)
         return None
Example #2
0
 def open_checkpoint(cls, filename, force=False, state=None):
     """
   Acquire a locked checkpoint stream.
 """
     safe_mkdir(os.path.dirname(filename))
     fp = lock_file(filename, "a+")
     if fp in (None, False):
         if force:
             log.info('Found existing runner, forcing leadership forfeit.')
             state = state or CheckpointDispatcher.from_file(filename)
             if cls.kill_runner(state):
                 log.info('Successfully killed leader.')
                 # TODO(wickman)  Blocking may not be the best idea here.  Perhaps block up to
                 # a maximum timeout.  But blocking is necessary because os.kill does not immediately
                 # release the lock if we're in force mode.
                 fp = lock_file(filename, "a+", blocking=True)
         else:
             log.error('Found existing runner, cannot take control.')
     if fp in (None, False):
         raise cls.PermissionError(
             'Could not open locked checkpoint: %s, lock_file = %s' %
             (filename, fp))
     ckpt = ThriftRecordWriter(fp)
     ckpt.set_sync(True)
     return ckpt
Example #3
0
 def get(cls, task_id, checkpoint_root):
     """
   Get a TaskRunner bound to the task_id in checkpoint_root.
 """
     path = TaskPath(root=checkpoint_root, task_id=task_id, state='active')
     task_json = path.getpath('task_path')
     task_checkpoint = path.getpath('runner_checkpoint')
     if not os.path.exists(task_json):
         return None
     task = ThermosConfigLoader.load_json(task_json)
     if task is None:
         return None
     if len(task.tasks()) == 0:
         return None
     try:
         checkpoint = CheckpointDispatcher.from_file(task_checkpoint)
         if checkpoint is None or checkpoint.header is None:
             return None
         return cls(task.tasks()[0].task(),
                    checkpoint_root,
                    checkpoint.header.sandbox,
                    log_dir=checkpoint.header.log_dir,
                    task_id=task_id,
                    portmap=checkpoint.header.ports,
                    hostname=checkpoint.header.hostname)
     except Exception as e:
         log.error(
             'Failed to reconstitute checkpoint in TaskRunner.get: %s' % e,
             exc_info=True)
         return None
Example #4
0
def tail(args, options):
    """Tail the logs of a task process.

    Usage: thermos tail task_name [process_name]
  """
    if len(args) == 0:
        app.error("Expected a task to tail, got nothing!")
    if len(args) not in (1, 2):
        app.error("Expected at most two arguments (task and optional process), got %d" % len(args))

    task_id = args[0]
    detector = TaskDetector(root=options.root)
    checkpoint = CheckpointDispatcher.from_file(detector.get_checkpoint(task_id))
    log_dir = checkpoint.header.log_dir
    process_runs = [(process, run) for (process, run) in detector.get_process_runs(task_id, log_dir)]
    if len(args) == 2:
        process_runs = [(process, run) for (process, run) in process_runs if process == args[1]]

    if len(process_runs) == 0:
        print("ERROR: No processes found.", file=sys.stderr)
        sys.exit(1)

    processes = set([process for process, _ in process_runs])
    if len(processes) != 1:
        print("ERROR: More than one process matches query.", file=sys.stderr)
        sys.exit(1)

    process = processes.pop()
    run = max([run for _, run in process_runs])

    logdir = TaskPath(root=options.root, task_id=args[0], process=process, run=run, log_dir=log_dir).getpath(
        "process_logdir"
    )
    logfile = os.path.join(logdir, "stderr" if options.use_stderr else "stdout")

    monitor = TaskMonitor(TaskPath(root=options.root), args[0])

    def log_is_active():
        active_processes = monitor.get_active_processes()
        for process_status, process_run in active_processes:
            if process_status.process == process and process_run == run:
                return True
        return False

    if not log_is_active():
        print("Tail of terminal log %s" % logfile)
        for line in tail_closed(logfile):
            print(line.rstrip())
        return

    now = time.time()
    next_check = now + 5.0
    print("Tail of active log %s" % logfile)
    for line in tail_f(logfile, include_last=True, forever=False):
        print(line.rstrip())
        if time.time() > next_check:
            if not log_is_active():
                break
            else:
                next_check = time.time() + 5.0
def tail(args, options):
  """Tail the logs of a task process.

    Usage: thermos tail task_name [process_name]
  """
  if len(args) == 0:
    app.error('Expected a task to tail, got nothing!')
  if len(args) not in (1, 2):
    app.error('Expected at most two arguments (task and optional process), got %d' % len(args))

  task_id = args[0]
  detector = TaskDetector(root=options.root)
  checkpoint = CheckpointDispatcher.from_file(detector.get_checkpoint(task_id))
  log_dir = checkpoint.header.log_dir
  process_runs = [(process, run) for (process, run) in detector.get_process_runs(task_id, log_dir)]
  if len(args) == 2:
    process_runs = [(process, run) for (process, run) in process_runs if process == args[1]]

  if len(process_runs) == 0:
    print('ERROR: No processes found.', file=sys.stderr)
    sys.exit(1)

  processes = set([process for process, _ in process_runs])
  if len(processes) != 1:
    print('ERROR: More than one process matches query.', file=sys.stderr)
    sys.exit(1)

  process = processes.pop()
  run = max([run for _, run in process_runs])

  logdir = TaskPath(root=options.root, task_id=args[0], process=process,
     run=run, log_dir=log_dir).getpath('process_logdir')
  logfile = os.path.join(logdir, 'stderr' if options.use_stderr else 'stdout')

  monitor = TaskMonitor(TaskPath(root=options.root), args[0])
  def log_is_active():
    active_processes = monitor.get_active_processes()
    for process_status, process_run in active_processes:
      if process_status.process == process and process_run == run:
        return True
    return False

  if not log_is_active():
    print('Tail of terminal log %s' % logfile)
    for line in tail_closed(logfile):
      print(line.rstrip())
    return

  now = time.time()
  next_check = now + 5.0
  print('Tail of active log %s' % logfile)
  for line in tail_f(logfile, include_last=True, forever=False):
    print(line.rstrip())
    if time.time() > next_check:
      if not log_is_active():
        break
      else:
        next_check = time.time() + 5.0
Example #6
0
  def run(self):
    self._run_count += 1
    atexit.register(self.cleanup)

    if self.script_filename:
      os.unlink(self.script_filename)

    with temporary_file(cleanup=False) as fp:
      self.script_filename = fp.name
      fp.write(self.RUN_JOB_SCRIPT % {
        'filename': self.job_filename,
        'sandbox': self.sandbox,
        'root': self.tempdir,
        'task_id': self.task_id,
        'state_filename': self.state_filename,
        'success_rate': self.success_rate,
        'random_seed': self.random_seed + self._run_count,
        'extra_task_runner_args': self.extra_task_runner_args,
      })

    with environment_as(PYTHONPATH=os.pathsep.join(sys.path)):
      self.po = subprocess.Popen([sys.executable, self.script_filename],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
      try:
        so, se = self.po.communicate()
      except OSError as e:
        if e.errno == errno.ECHILD:
          so = se = 'Killed'
        else:
          raise

    rc = self.po.returncode
    if rc != 0:
      if os.path.exists(self.job_filename):
        with open(self.job_filename) as fp:
          config = fp.read()
      else:
        config = 'Nonexistent!'
      if 'THERMOS_DEBUG' in os.environ:
        print("Runner failed!\n\n\nconfig:%s\n\n\nstdout:%s\n\n\nstderr:%s\n\n\n" % (
            config, so, se))

    try:
      with open(self.state_filename, 'r') as fp:
        self.state = thrift_deserialize(RunnerState(), fp.read())
    except Exception as e:
      if 'THERMOS_DEBUG' in os.environ:
        print('Failed to load Runner state: %s' % e, file=sys.stderr)
      self.state = RunnerState()

    try:
      self.reconstructed_state = CheckpointDispatcher.from_file(
          self.pathspec.getpath('runner_checkpoint'))
    except Exception as e:
      print('Failed to replay checkpoint: %s' % e, file=sys.stderr)
      self.reconstructed_state = None
    self.initialized = True
    return rc
Example #7
0
    def __init__(self, checkpoint_root, task_id):
        """
    :param checkpoint_root: The checkpoint root to find the given task.
    :param task_id: The task_id of the task whose state we wish to manage.
    """

        self._detector = TaskDetector(checkpoint_root)
        self._task_id = task_id
        self._pathspec = TaskPath(root=checkpoint_root, task_id=task_id)
        self._state = CheckpointDispatcher.from_file(
            self._detector.get_checkpoint(task_id))
Example #8
0
  def kill(cls, task_id, checkpoint_root, force=False,
           terminal_status=TaskState.KILLED, clock=time):
    """
      An implementation of Task killing that doesn't require a fully hydrated TaskRunner object.
      Terminal status must be either KILLED or LOST state.
    """
    if terminal_status not in (TaskState.KILLED, TaskState.LOST):
      raise cls.Error('terminal_status must be KILLED or LOST (got %s)' %
                      TaskState._VALUES_TO_NAMES.get(terminal_status) or terminal_status)
    pathspec = TaskPath(root=checkpoint_root, task_id=task_id)
    checkpoint = pathspec.getpath('runner_checkpoint')
    state = CheckpointDispatcher.from_file(checkpoint)

    if state is None or state.header is None or state.statuses is None:
      if force:
        log.error('Task has uninitialized TaskState - forcibly finalizing')
        cls.finalize_task(pathspec)
        return
      else:
        log.error('Cannot update states in uninitialized TaskState!')
        return

    ckpt = cls.open_checkpoint(checkpoint, force=force, state=state)

    def write_task_state(state):
      update = TaskStatus(state=state, timestamp_ms=int(clock.time() * 1000),
                          runner_pid=os.getpid(), runner_uid=os.getuid())
      ckpt.write(RunnerCkpt(task_status=update))

    def write_process_status(status):
      ckpt.write(RunnerCkpt(process_status=status))

    if cls.is_task_terminal(state.statuses[-1].state):
      log.info('Task is already in terminal state!  Finalizing.')
      cls.finalize_task(pathspec)
      return

    with closing(ckpt):
      write_task_state(TaskState.ACTIVE)
      for process, history in state.processes.items():
        process_status = history[-1]
        if not cls.is_process_terminal(process_status.state):
          if cls.kill_process(state, process):
            write_process_status(ProcessStatus(process=process,
              state=ProcessState.KILLED, seq=process_status.seq + 1, return_code=-9,
              stop_time=clock.time()))
          else:
            if process_status.state is not ProcessState.WAITING:
              write_process_status(ProcessStatus(process=process,
                state=ProcessState.LOST, seq=process_status.seq + 1))
      write_task_state(terminal_status)
    cls.finalize_task(pathspec)
Example #9
0
  def kill(cls, task_id, checkpoint_root, force=False,
           terminal_status=TaskState.KILLED, clock=time):
    """
      An implementation of Task killing that doesn't require a fully hydrated TaskRunner object.
      Terminal status must be either KILLED or LOST state.
    """
    if terminal_status not in (TaskState.KILLED, TaskState.LOST):
      raise cls.Error('terminal_status must be KILLED or LOST (got %s)' %
                      TaskState._VALUES_TO_NAMES.get(terminal_status) or terminal_status)
    pathspec = TaskPath(root=checkpoint_root, task_id=task_id)
    checkpoint = pathspec.getpath('runner_checkpoint')
    state = CheckpointDispatcher.from_file(checkpoint)

    if state is None or state.header is None or state.statuses is None:
      if force:
        log.error('Task has uninitialized TaskState - forcibly finalizing')
        cls.finalize_task(pathspec)
        return
      else:
        log.error('Cannot update states in uninitialized TaskState!')
        return

    ckpt = cls.open_checkpoint(checkpoint, force=force, state=state)

    def write_task_state(state):
      update = TaskStatus(state=state, timestamp_ms=int(clock.time() * 1000),
                          runner_pid=os.getpid(), runner_uid=os.getuid())
      ckpt.write(RunnerCkpt(task_status=update))

    def write_process_status(status):
      ckpt.write(RunnerCkpt(process_status=status))

    if cls.is_task_terminal(state.statuses[-1].state):
      log.info('Task is already in terminal state!  Finalizing.')
      cls.finalize_task(pathspec)
      return

    with closing(ckpt):
      write_task_state(TaskState.ACTIVE)
      for process, history in state.processes.items():
        process_status = history[-1]
        if not cls.is_process_terminal(process_status.state):
          if cls.kill_process(state, process):
            write_process_status(ProcessStatus(process=process,
              state=ProcessState.KILLED, seq=process_status.seq + 1, return_code=-9,
              stop_time=clock.time()))
          else:
            if process_status.state is not ProcessState.WAITING:
              write_process_status(ProcessStatus(process=process,
                state=ProcessState.LOST, seq=process_status.seq + 1))
      write_task_state(terminal_status)
    cls.finalize_task(pathspec)
Example #10
0
 def format_task(detector, task_id):
     checkpoint_filename = detector.get_checkpoint(task_id)
     checkpoint_stat = os.stat(checkpoint_filename)
     try:
         checkpoint_owner = pwd.getpwuid(checkpoint_stat.st_uid).pw_name
     except KeyError:
         checkpoint_owner = 'uid:%s' % checkpoint_stat.st_uid
     print('  %-20s [owner: %8s]' % (task_id, checkpoint_owner), end='')
     if options.verbose == 0:
         print()
     if options.verbose > 0:
         state = CheckpointDispatcher.from_file(checkpoint_filename)
         if state is None or state.header is None:
             print(' - checkpoint stream CORRUPT or outdated format')
             return
         print('  state: %8s' % TaskState._VALUES_TO_NAMES.get(
             state.statuses[-1].state, 'Unknown'),
               end='')
         print(' start: %25s' % time.asctime(
             time.localtime(state.header.launch_time_ms / 1000.0)))
     if options.verbose > 1:
         print('    user: %s' % state.header.user, end='')
         if state.header.ports:
             print(' ports: %s' %
                   ' '.join('%s -> %s' % (key, val)
                            for key, val in state.header.ports.items()))
         else:
             print(' ports: None')
         print('    sandbox: %s' % state.header.sandbox)
     if options.verbose > 2:
         print('    process table:')
         for process, process_history in state.processes.items():
             print('      - %s runs: %s' % (process, len(process_history)),
                   end='')
             last_run = process_history[-1]
             print(' last: pid=%s, rc=%s, finish:%s, state:%s' %
                   (last_run.pid or 'None', last_run.return_code
                    if last_run.return_code is not None else '',
                    time.asctime(time.localtime(last_run.stop_time))
                    if last_run.stop_time else 'None',
                    ProcessState._VALUES_TO_NAMES.get(
                        last_run.state, 'Unknown')))
         print()
Example #11
0
 def format_task(task_id):
     checkpoint_filename = detector.get_checkpoint(task_id)
     checkpoint_stat = os.stat(checkpoint_filename)
     try:
         checkpoint_owner = pwd.getpwuid(checkpoint_stat.st_uid).pw_name
     except:
         checkpoint_owner = "uid:%s" % checkpoint_stat.st_uid
     print("  %-20s [owner: %8s]" % (task_id, checkpoint_owner), end="")
     if options.verbose == 0:
         print()
     if options.verbose > 0:
         state = CheckpointDispatcher.from_file(checkpoint_filename)
         if state is None or state.header is None:
             print(" - checkpoint stream CORRUPT or outdated format")
             return
         print("  state: %8s" % TaskState._VALUES_TO_NAMES.get(state.statuses[-1].state, "Unknown"), end="")
         print(" start: %25s" % time.asctime(time.localtime(state.header.launch_time_ms / 1000.0)))
     if options.verbose > 1:
         print("    user: %s" % state.header.user, end="")
         if state.header.ports:
             print(" ports: %s" % " ".join("%s -> %s" % (key, val) for key, val in state.header.ports.items()))
         else:
             print(" ports: None")
         print("    sandbox: %s" % state.header.sandbox)
     if options.verbose > 2:
         print("    process table:")
         for process, process_history in state.processes.items():
             print("      - %s runs: %s" % (process, len(process_history)), end="")
             last_run = process_history[-1]
             print(
                 " last: pid=%s, rc=%s, finish:%s, state:%s"
                 % (
                     last_run.pid or "None",
                     last_run.return_code if last_run.return_code is not None else "",
                     time.asctime(time.localtime(last_run.stop_time)) if last_run.stop_time else "None",
                     ProcessState._VALUES_TO_NAMES.get(last_run.state, "Unknown"),
                 )
             )
         print()
Example #12
0
 def format_task(task_id):
   checkpoint_filename = detector.get_checkpoint(task_id)
   checkpoint_stat = os.stat(checkpoint_filename)
   try:
     checkpoint_owner = pwd.getpwuid(checkpoint_stat.st_uid).pw_name
   except KeyError:
     checkpoint_owner = 'uid:%s' % checkpoint_stat.st_uid
   print('  %-20s [owner: %8s]' % (task_id, checkpoint_owner), end='')
   if options.verbose == 0:
     print()
   if options.verbose > 0:
     state = CheckpointDispatcher.from_file(checkpoint_filename)
     if state is None or state.header is None:
       print(' - checkpoint stream CORRUPT or outdated format')
       return
     print('  state: %8s' % TaskState._VALUES_TO_NAMES.get(state.statuses[-1].state, 'Unknown'),
       end='')
     print(' start: %25s' % time.asctime(time.localtime(state.header.launch_time_ms / 1000.0)))
   if options.verbose > 1:
     print('    user: %s' % state.header.user, end='')
     if state.header.ports:
       print(' ports: %s' % ' '.join('%s -> %s' % (key, val)
                                        for key, val in state.header.ports.items()))
     else:
       print(' ports: None')
     print('    sandbox: %s' % state.header.sandbox)
   if options.verbose > 2:
     print('    process table:')
     for process, process_history in state.processes.items():
       print('      - %s runs: %s' % (process, len(process_history)), end='')
       last_run = process_history[-1]
       print(' last: pid=%s, rc=%s, finish:%s, state:%s' % (
         last_run.pid or 'None',
         last_run.return_code if last_run.return_code is not None else '',
         time.asctime(time.localtime(last_run.stop_time)) if last_run.stop_time else 'None',
         ProcessState._VALUES_TO_NAMES.get(last_run.state, 'Unknown')))
     print()
Example #13
0
 def open_checkpoint(cls, filename, force=False, state=None):
   """
     Acquire a locked checkpoint stream.
   """
   safe_mkdir(os.path.dirname(filename))
   fp = lock_file(filename, "a+")
   if fp in (None, False):
     if force:
       log.info('Found existing runner, forcing leadership forfeit.')
       state = state or CheckpointDispatcher.from_file(filename)
       if cls.kill_runner(state):
         log.info('Successfully killed leader.')
         # TODO(wickman)  Blocking may not be the best idea here.  Perhaps block up to
         # a maximum timeout.  But blocking is necessary because os.kill does not immediately
         # release the lock if we're in force mode.
         fp = lock_file(filename, "a+", blocking=True)
     else:
       log.error('Found existing runner, cannot take control.')
   if fp in (None, False):
     raise cls.PermissionError('Could not open locked checkpoint: %s, lock_file = %s' %
       (filename, fp))
   ckpt = ThriftRecordWriter(fp)
   ckpt.set_sync(True)
   return ckpt
 def state(self):
   """Return final state of Task (RunnerState, read from disk and cached for future access)"""
   if self._state is None:
     path = self._pathspec.getpath('runner_checkpoint')
     self._state = CheckpointDispatcher.from_file(path)
   return copy.deepcopy(self._state) if self._state else RunnerState(processes={})
Example #15
0
 def state(self):
   """Return final state of Task (RunnerState, read from disk and cached for future access)"""
   if self._state is None:
     path = self._pathspec.given(task_id=self._task_id).getpath('runner_checkpoint')
     self._state = CheckpointDispatcher.from_file(path)
   return self._state
Example #16
0
 def state(self, task_id):
     if task_id not in self._states:
         self._states[task_id] = CheckpointDispatcher.from_file(
             self._detector.get_checkpoint(task_id))
     return self._states[task_id]
Example #17
0
 def state(self, task_id):
   if task_id not in self._states:
     self._states[task_id] = CheckpointDispatcher.from_file(self._detector.get_checkpoint(task_id))
   return self._states[task_id]