Ejemplo n.º 1
0
 def open_checkpoint(cls, filename, force=False, state=None):
     """
   Acquire a locked checkpoint stream.
 """
     safe_mkdir(os.path.dirname(filename))
     fp = lock_file(filename, "a+")
     if fp in (None, False):
         if force:
             log.info('Found existing runner, forcing leadership forfeit.')
             state = state or CheckpointDispatcher.from_file(filename)
             if cls.kill_runner(state):
                 log.info('Successfully killed leader.')
                 # TODO(wickman)  Blocking may not be the best idea here.  Perhaps block up to
                 # a maximum timeout.  But blocking is necessary because os.kill does not immediately
                 # release the lock if we're in force mode.
                 fp = lock_file(filename, "a+", blocking=True)
         else:
             log.error('Found existing runner, cannot take control.')
     if fp in (None, False):
         raise cls.PermissionError(
             'Could not open locked checkpoint: %s, lock_file = %s' %
             (filename, fp))
     ckpt = ThriftRecordWriter(fp)
     ckpt.set_sync(True)
     return ckpt
Ejemplo n.º 2
0
 def state(self):
     """Return final state of Task (RunnerState, read from disk and cached for future access)"""
     if self._state is None:
         path = self._pathspec.given(
             task_id=self._task_id).getpath('runner_checkpoint')
         self._state = CheckpointDispatcher.from_file(path)
     return self._state
Ejemplo n.º 3
0
    def run(self):
        self._run_count += 1
        atexit.register(self.cleanup)

        if self.script_filename:
            os.unlink(self.script_filename)

        with temporary_file(cleanup=False) as fp:
            self.script_filename = fp.name
            fp.write(
                self.RUN_JOB_SCRIPT % {
                    'filename': self.job_filename,
                    'sandbox': self.sandbox,
                    'root': self.tempdir,
                    'task_id': self.task_id,
                    'state_filename': self.state_filename,
                    'portmap': repr(self.portmap),
                    'success_rate': self.success_rate,
                    'random_seed': self.random_seed + self._run_count,
                })

        with environment_as(PYTHONPATH=os.pathsep.join(sys.path)):
            self.po = subprocess.Popen([sys.executable, self.script_filename],
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            try:
                so, se = self.po.communicate()
            except OSError as e:
                if e.errno == errno.ECHILD:
                    so = se = 'Killed'
                else:
                    raise

        rc = self.po.returncode
        if rc != 0:
            if os.path.exists(self.job_filename):
                config = open(self.job_filename).read()
            else:
                config = 'Nonexistent!'
            if 'THERMOS_DEBUG' in os.environ:
                print(
                    "Runner failed!\n\n\nconfig:%s\n\n\nstdout:%s\n\n\nstderr:%s\n\n\n"
                    % (config, so, se))

        try:
            with open(self.state_filename, 'r') as fp:
                self.state = thrift_deserialize(RunnerState(), fp.read())
        except Exception as e:
            if 'THERMOS_DEBUG' in os.environ:
                print('Failed to load Runner state: %s' % e, file=sys.stderr)
            self.state = RunnerState()

        try:
            self.reconstructed_state = CheckpointDispatcher.from_file(
                self.pathspec.getpath('runner_checkpoint'))
        except:
            self.reconstructed_state = None
        self.initialized = True
        return rc
Ejemplo n.º 4
0
def tail(args, options):
  """Tail the logs of a task process.

    Usage: thermos tail task_name [process_name]
  """
  if len(args) == 0:
    app.error('Expected a task to tail, got nothing!')
  if len(args) not in (1, 2):
    app.error('Expected at most two arguments (task and optional process), got %d' % len(args))

  task_id = args[0]
  detector = TaskDetector(root=options.root)
  checkpoint = CheckpointDispatcher.from_file(detector.get_checkpoint(task_id))
  log_dir = checkpoint.header.log_dir
  process_runs = [(process, run) for (process, run) in detector.get_process_runs(task_id, log_dir)]
  if len(args) == 2:
    process_runs = [(process, run) for (process, run) in process_runs if process == args[1]]

  if len(process_runs) == 0:
    print('ERROR: No processes found.', file=sys.stderr)
    sys.exit(1)

  processes = set([process for process, _ in process_runs])
  if len(processes) != 1:
    print('ERROR: More than one process matches query.', file=sys.stderr)
    sys.exit(1)

  process = processes.pop()
  run = max([run for _, run in process_runs])

  logdir = TaskPath(root=options.root, task_id=args[0], process=process,
     run=run, log_dir=log_dir).getpath('process_logdir')
  logfile = os.path.join(logdir, 'stderr' if options.use_stderr else 'stdout')

  monitor = TaskMonitor(TaskPath(root = options.root), args[0])
  def log_is_active():
    active_processes = monitor.get_active_processes()
    for process_status, process_run in active_processes:
      if process_status.process == process and process_run == run:
        return True
    return False

  if not log_is_active():
    print('Tail of terminal log %s' % logfile)
    for line in tail_closed(logfile):
      print(line.rstrip())
    return

  now = time.time()
  next_check = now + 5.0
  print('Tail of active log %s' % logfile)
  for line in tail_f(logfile, include_last=True, forever=False):
    print(line.rstrip())
    if time.time() > next_check:
      if not log_is_active():
        break
      else:
        next_check = time.time() + 5.0
Ejemplo n.º 5
0
  def run(self):
    self._run_count += 1
    atexit.register(self.cleanup)

    if self.script_filename:
      os.unlink(self.script_filename)

    with temporary_file(cleanup=False) as fp:
      self.script_filename = fp.name
      fp.write(self.RUN_JOB_SCRIPT % {
        'filename': self.job_filename,
        'sandbox': self.sandbox,
        'root': self.tempdir,
        'task_id': self.task_id,
        'state_filename': self.state_filename,
        'portmap': repr(self.portmap),
        'success_rate': self.success_rate,
        'random_seed': self.random_seed + self._run_count,
      })

    with environment_as(PYTHONPATH=os.pathsep.join(sys.path)):
      self.po = subprocess.Popen([sys.executable, self.script_filename],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
      try:
        so, se = self.po.communicate()
      except OSError as e:
        if e.errno == errno.ECHILD:
          so = se = 'Killed'
        else:
          raise

    rc = self.po.returncode
    if rc != 0:
      if os.path.exists(self.job_filename):
        config = open(self.job_filename).read()
      else:
        config = 'Nonexistent!'
      if 'THERMOS_DEBUG' in os.environ:
        print("Runner failed!\n\n\nconfig:%s\n\n\nstdout:%s\n\n\nstderr:%s\n\n\n" % (
            config, so, se))

    try:
      with open(self.state_filename, 'r') as fp:
        self.state = thrift_deserialize(RunnerState(), fp.read())
    except Exception as e:
      if 'THERMOS_DEBUG' in os.environ:
        print('Failed to load Runner state: %s' % e, file=sys.stderr)
      self.state = RunnerState()

    try:
      self.reconstructed_state = CheckpointDispatcher.from_file(
        self.pathspec.getpath('runner_checkpoint'))
    except:
      self.reconstructed_state = None
    self.initialized = True
    return rc
Ejemplo n.º 6
0
 def format_task(task_id):
     checkpoint_filename = detector.get_checkpoint(task_id)
     checkpoint_stat = os.stat(checkpoint_filename)
     try:
         checkpoint_owner = pwd.getpwuid(checkpoint_stat.st_uid).pw_name
     except:
         checkpoint_owner = 'uid:%s' % checkpoint_stat.st_uid
     print('  %-20s [owner: %8s]' % (task_id, checkpoint_owner), end='')
     if options.verbose == 0:
         print()
     if options.verbose > 0:
         state = CheckpointDispatcher.from_file(checkpoint_filename)
         if state is None or state.header is None:
             print(' - checkpoint stream CORRUPT or outdated format')
             return
         print('  state: %8s' % TaskState._VALUES_TO_NAMES.get(
             state.statuses[-1].state, 'Unknown'),
               end='')
         print(' start: %25s' % time.asctime(
             time.localtime(state.header.launch_time_ms / 1000.0)))
     if options.verbose > 1:
         print('    user: %s' % state.header.user, end='')
         if state.header.ports:
             print(' ports: %s' %
                   ' '.join('%s -> %s' % (key, val)
                            for key, val in state.header.ports.items()))
         else:
             print(' ports: None')
         print('    sandbox: %s' % state.header.sandbox)
     if options.verbose > 2:
         print('    process table:')
         for process, process_history in state.processes.items():
             print('      - %s runs: %s' % (process, len(process_history)),
                   end='')
             last_run = process_history[-1]
             print(' last: pid=%s, rc=%s, finish:%s, state:%s' %
                   (last_run.pid or 'None', last_run.return_code
                    if last_run.return_code is not None else '',
                    time.asctime(time.localtime(last_run.stop_time))
                    if last_run.stop_time else 'None',
                    ProcessState._VALUES_TO_NAMES.get(
                        last_run.state, 'Unknown')))
         print()
Ejemplo n.º 7
0
 def open_checkpoint(cls, filename, force=False, state=None):
     """
   Acquire a locked checkpoint stream.
 """
     safe_mkdir(os.path.dirname(filename))
     fp = lock_file(filename, "a+")
     if fp in (None, False):
         if force:
             log.info("Found existing runner, forcing leadership forfeit.")
             state = state or CheckpointDispatcher.from_file(filename)
             if cls.kill_runner(state):
                 log.info("Successfully killed leader.")
                 # TODO(wickman)  Blocking may not be the best idea here.  Perhaps block up to
                 # a maximum timeout.  But blocking is necessary because os.kill does not immediately
                 # release the lock if we're in force mode.
                 fp = lock_file(filename, "a+", blocking=True)
         else:
             log.error("Found existing runner, cannot take control.")
     if fp in (None, False):
         raise cls.PermissionError("Could not open locked checkpoint: %s, lock_file = %s" % (filename, fp))
     ckpt = ThriftRecordWriter(fp)
     ckpt.set_sync(True)
     return ckpt
Ejemplo n.º 8
0
 def format_task(task_id):
   checkpoint_filename = detector.get_checkpoint(task_id)
   checkpoint_stat = os.stat(checkpoint_filename)
   try:
     checkpoint_owner = pwd.getpwuid(checkpoint_stat.st_uid).pw_name
   except:
     checkpoint_owner = 'uid:%s' % checkpoint_stat.st_uid
   print('  %-20s [owner: %8s]' % (task_id, checkpoint_owner), end='')
   if options.verbose == 0:
     print()
   if options.verbose > 0:
     state = CheckpointDispatcher.from_file(checkpoint_filename)
     if state is None or state.header is None:
       print(' - checkpoint stream CORRUPT or outdated format')
       return
     print('  state: %8s' % TaskState._VALUES_TO_NAMES.get(state.statuses[-1].state, 'Unknown'),
       end='')
     print(' start: %25s' % time.asctime(time.localtime(state.header.launch_time_ms/1000.0)))
   if options.verbose > 1:
     print('    user: %s' % state.header.user, end='')
     if state.header.ports:
       print(' ports: %s' % ' '.join('%s -> %s' % (key, val)
                                        for key, val in state.header.ports.items()))
     else:
       print(' ports: None')
     print('    sandbox: %s' % state.header.sandbox)
   if options.verbose > 2:
     print('    process table:')
     for process, process_history in state.processes.items():
       print('      - %s runs: %s' % (process, len(process_history)), end='')
       last_run = process_history[-1]
       print(' last: pid=%s, rc=%s, finish:%s, state:%s' % (
         last_run.pid or 'None',
         last_run.return_code if last_run.return_code is not None else '',
         time.asctime(time.localtime(last_run.stop_time)) if last_run.stop_time else 'None',
         ProcessState._VALUES_TO_NAMES.get(last_run.state, 'Unknown')))
     print()
Ejemplo n.º 9
0
 def get(cls, task_id, checkpoint_root):
   """
     Get a TaskRunner bound to the task_id in checkpoint_root.
   """
   path = TaskPath(root=checkpoint_root, task_id=task_id, state='active')
   task_json = path.getpath('task_path')
   task_checkpoint = path.getpath('runner_checkpoint')
   if not os.path.exists(task_json):
     return None
   task = ThermosConfigLoader.load_json(task_json)
   if task is None:
     return None
   if len(task.tasks()) == 0:
     return None
   try:
     checkpoint = CheckpointDispatcher.from_file(task_checkpoint)
     if checkpoint is None or checkpoint.header is None:
       return None
     return cls(task.tasks()[0].task(), checkpoint_root, checkpoint.header.sandbox,
                log_dir=checkpoint.header.log_dir, task_id=task_id,
                portmap=checkpoint.header.ports)
   except Exception as e:
     log.error('Failed to reconstitute checkpoint in TaskRunner.get: %s' % e, exc_info=True)
     return None
Ejemplo n.º 10
0
    def kill(cls, task_id, checkpoint_root, force=False, terminal_status=TaskState.KILLED, clock=time):
        """
      An implementation of Task killing that doesn't require a fully hydrated TaskRunner object.
      Terminal status must be either KILLED or LOST state.
    """
        if terminal_status not in (TaskState.KILLED, TaskState.LOST):
            raise cls.Error(
                "terminal_status must be KILLED or LOST (got %s)" % TaskState._VALUES_TO_NAMES.get(terminal_status)
                or terminal_status
            )
        pathspec = TaskPath(root=checkpoint_root, task_id=task_id)
        checkpoint = pathspec.getpath("runner_checkpoint")
        state = CheckpointDispatcher.from_file(checkpoint)

        if state is None or state.header is None or state.statuses is None:
            if force:
                log.error("Task has uninitialized TaskState - forcibly finalizing")
                cls.finalize_task(pathspec)
                return
            else:
                log.error("Cannot update states in uninitialized TaskState!")
                return

        ckpt = cls.open_checkpoint(checkpoint, force=force, state=state)

        def write_task_state(state):
            update = TaskStatus(
                state=state, timestamp_ms=int(clock.time() * 1000), runner_pid=os.getpid(), runner_uid=os.getuid()
            )
            ckpt.write(RunnerCkpt(task_status=update))

        def write_process_status(status):
            ckpt.write(RunnerCkpt(process_status=status))

        if cls.is_task_terminal(state.statuses[-1].state):
            log.info("Task is already in terminal state!  Finalizing.")
            cls.finalize_task(pathspec)
            return

        with closing(ckpt):
            write_task_state(TaskState.ACTIVE)
            for process, history in state.processes.items():
                process_status = history[-1]
                if not cls.is_process_terminal(process_status.state):
                    if cls.kill_process(state, process):
                        write_process_status(
                            ProcessStatus(
                                process=process,
                                state=ProcessState.KILLED,
                                seq=process_status.seq + 1,
                                return_code=-9,
                                stop_time=clock.time(),
                            )
                        )
                    else:
                        if process_status.state is not ProcessState.WAITING:
                            write_process_status(
                                ProcessStatus(process=process, state=ProcessState.LOST, seq=process_status.seq + 1)
                            )
            write_task_state(terminal_status)
        cls.finalize_task(pathspec)
Ejemplo n.º 11
0
 def state(self, task_id):
   if task_id not in self._states:
     self._states[task_id] = CheckpointDispatcher.from_file(self._detector.get_checkpoint(task_id))
   return self._states[task_id]
Ejemplo n.º 12
0
    def kill(cls,
             task_id,
             checkpoint_root,
             force=False,
             terminal_status=TaskState.KILLED,
             clock=time):
        """
      An implementation of Task killing that doesn't require a fully hydrated TaskRunner object.
      Terminal status must be either KILLED or LOST state.
    """
        if terminal_status not in (TaskState.KILLED, TaskState.LOST):
            raise cls.Error('terminal_status must be KILLED or LOST (got %s)' %
                            TaskState._VALUES_TO_NAMES.get(terminal_status)
                            or terminal_status)
        pathspec = TaskPath(root=checkpoint_root, task_id=task_id)
        checkpoint = pathspec.getpath('runner_checkpoint')
        state = CheckpointDispatcher.from_file(checkpoint)

        if state is None or state.header is None or state.statuses is None:
            if force:
                log.error(
                    'Task has uninitialized TaskState - forcibly finalizing')
                cls.finalize_task(pathspec)
                return
            else:
                log.error('Cannot update states in uninitialized TaskState!')
                return

        ckpt = cls.open_checkpoint(checkpoint, force=force, state=state)

        def write_task_state(state):
            update = TaskStatus(state=state,
                                timestamp_ms=int(clock.time() * 1000),
                                runner_pid=os.getpid(),
                                runner_uid=os.getuid())
            ckpt.write(RunnerCkpt(task_status=update))

        def write_process_status(status):
            ckpt.write(RunnerCkpt(process_status=status))

        if cls.is_task_terminal(state.statuses[-1].state):
            log.info('Task is already in terminal state!  Finalizing.')
            cls.finalize_task(pathspec)
            return

        with closing(ckpt):
            write_task_state(TaskState.ACTIVE)
            for process, history in state.processes.items():
                process_status = history[-1]
                if not cls.is_process_terminal(process_status.state):
                    if cls.kill_process(state, process):
                        write_process_status(
                            ProcessStatus(process=process,
                                          state=ProcessState.KILLED,
                                          seq=process_status.seq + 1,
                                          return_code=-9,
                                          stop_time=clock.time()))
                    else:
                        if process_status.state is not ProcessState.WAITING:
                            write_process_status(
                                ProcessStatus(process=process,
                                              state=ProcessState.LOST,
                                              seq=process_status.seq + 1))
            write_task_state(terminal_status)
        cls.finalize_task(pathspec)
Ejemplo n.º 13
0
def tail(args, options):
    """Tail the logs of a task process.

    Usage: thermos tail task_name [process_name]
  """
    if len(args) == 0:
        app.error('Expected a task to tail, got nothing!')
    if len(args) not in (1, 2):
        app.error(
            'Expected at most two arguments (task and optional process), got %d'
            % len(args))

    task_id = args[0]
    detector = TaskDetector(root=options.root)
    checkpoint = CheckpointDispatcher.from_file(
        detector.get_checkpoint(task_id))
    log_dir = checkpoint.header.log_dir
    process_runs = [(process, run)
                    for (process,
                         run) in detector.get_process_runs(task_id, log_dir)]
    if len(args) == 2:
        process_runs = [(process, run) for (process, run) in process_runs
                        if process == args[1]]

    if len(process_runs) == 0:
        print('ERROR: No processes found.', file=sys.stderr)
        sys.exit(1)

    processes = set([process for process, _ in process_runs])
    if len(processes) != 1:
        print('ERROR: More than one process matches query.', file=sys.stderr)
        sys.exit(1)

    process = processes.pop()
    run = max([run for _, run in process_runs])

    logdir = TaskPath(root=options.root,
                      task_id=args[0],
                      process=process,
                      run=run,
                      log_dir=log_dir).getpath('process_logdir')
    logfile = os.path.join(logdir,
                           'stderr' if options.use_stderr else 'stdout')

    monitor = TaskMonitor(TaskPath(root=options.root), args[0])

    def log_is_active():
        active_processes = monitor.get_active_processes()
        for process_status, process_run in active_processes:
            if process_status.process == process and process_run == run:
                return True
        return False

    if not log_is_active():
        print('Tail of terminal log %s' % logfile)
        for line in tail_closed(logfile):
            print(line.rstrip())
        return

    now = time.time()
    next_check = now + 5.0
    print('Tail of active log %s' % logfile)
    for line in tail_f(logfile, include_last=True, forever=False):
        print(line.rstrip())
        if time.time() > next_check:
            if not log_is_active():
                break
            else:
                next_check = time.time() + 5.0
Ejemplo n.º 14
0
 def state(self):
   """Return final state of Task (RunnerState, read from disk and cached for future access)"""
   if self._state is None:
     path = self._pathspec.given(task_id=self._task_id).getpath('runner_checkpoint')
     self._state = CheckpointDispatcher.from_file(path)
   return self._state