Example #1
def tail(args, options):
    """Tail the logs of a task process.

    Usage: thermos tail task_name [process_name]
  """
    if len(args) == 0:
        app.error("Expected a task to tail, got nothing!")
    if len(args) not in (1, 2):
        app.error("Expected at most two arguments (task and optional process), got %d" % len(args))

    task_id = args[0]
    detector = TaskDetector(root=options.root)
    checkpoint = CheckpointDispatcher.from_file(detector.get_checkpoint(task_id))
    log_dir = checkpoint.header.log_dir
    process_runs = list(detector.get_process_runs(task_id, log_dir))
    if len(args) == 2:
        process_runs = [(process, run) for (process, run) in process_runs if process == args[1]]

    if len(process_runs) == 0:
        print("ERROR: No processes found.", file=sys.stderr)
        sys.exit(1)

    processes = set([process for process, _ in process_runs])
    if len(processes) != 1:
        print("ERROR: More than one process matches query.", file=sys.stderr)
        sys.exit(1)

    process = processes.pop()
    run = max([run for _, run in process_runs])

    logdir = TaskPath(root=options.root, task_id=args[0], process=process, run=run, log_dir=log_dir).getpath(
        "process_logdir"
    )
    logfile = os.path.join(logdir, "stderr" if options.use_stderr else "stdout")

    monitor = TaskMonitor(TaskPath(root=options.root), args[0])

    def log_is_active():
        active_processes = monitor.get_active_processes()
        for process_status, process_run in active_processes:
            if process_status.process == process and process_run == run:
                return True
        return False

    if not log_is_active():
        print("Tail of terminal log %s" % logfile)
        for line in tail_closed(logfile):
            print(line.rstrip())
        return

    now = time.time()
    next_check = now + 5.0
    print("Tail of active log %s" % logfile)
    for line in tail_f(logfile, include_last=True, forever=False):
        print(line.rstrip())
        if time.time() > next_check:
            if not log_is_active():
                break
            else:
                next_check = time.time() + 5.0
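# Usage sketch (not part of the original code): invoking tail() directly.
# The checkpoint root, task id, and process name below are illustrative, and
# _TailOptions is a hypothetical stand-in for the parsed CLI options.
class _TailOptions(object):
    root = '/var/run/thermos'  # checkpoint root to inspect
    use_stderr = False         # tail stdout rather than stderr

tail(['my_task_id', 'hello_world'], _TailOptions())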
Example #2
  def partition_tasks(self):
    """Return active/finished tasks as discovered from the checkpoint root."""
    detector = TaskDetector(root=self._checkpoint_root)
    active_tasks = set(
        t_id for _, t_id in detector.get_task_ids(state='active'))
    finished_tasks = set(
        t_id for _, t_id in detector.get_task_ids(state='finished'))
    return active_tasks, finished_tasks
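# Sketch (assumption: TaskDetector.get_task_ids() yields (state, task_id)
# pairs, as the unpacking above implies; the root below is illustrative).
detector = TaskDetector(root='/var/run/thermos')
for state, task_id in detector.get_task_ids():
  print('%s\t%s' % (state, task_id))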
def tail(args, options):
  """Tail the logs of a task process.

    Usage: thermos tail task_name [process_name]
  """
  if len(args) == 0:
    app.error('Expected a task to tail, got nothing!')
  if len(args) not in (1, 2):
    app.error('Expected at most two arguments (task and optional process), got %d' % len(args))

  task_id = args[0]
  detector = TaskDetector(root=options.root)
  checkpoint = CheckpointDispatcher.from_file(detector.get_checkpoint(task_id))
  log_dir = checkpoint.header.log_dir
  process_runs = list(detector.get_process_runs(task_id, log_dir))
  if len(args) == 2:
    process_runs = [(process, run) for (process, run) in process_runs if process == args[1]]

  if len(process_runs) == 0:
    print('ERROR: No processes found.', file=sys.stderr)
    sys.exit(1)

  processes = set([process for process, _ in process_runs])
  if len(processes) != 1:
    print('ERROR: More than one process matches query.', file=sys.stderr)
    sys.exit(1)

  process = processes.pop()
  run = max([run for _, run in process_runs])

  logdir = TaskPath(root=options.root, task_id=args[0], process=process,
     run=run, log_dir=log_dir).getpath('process_logdir')
  logfile = os.path.join(logdir, 'stderr' if options.use_stderr else 'stdout')

  monitor = TaskMonitor(TaskPath(root=options.root), args[0])
  def log_is_active():
    active_processes = monitor.get_active_processes()
    for process_status, process_run in active_processes:
      if process_status.process == process and process_run == run:
        return True
    return False

  if not log_is_active():
    print('Tail of terminal log %s' % logfile)
    for line in tail_closed(logfile):
      print(line.rstrip())
    return

  now = time.time()
  next_check = now + 5.0
  print('Tail of active log %s' % logfile)
  for line in tail_f(logfile, include_last=True, forever=False):
    print(line.rstrip())
    if time.time() > next_check:
      if not log_is_active():
        break
      else:
        next_check = time.time() + 5.0
  def __init__(self, root, resource_monitor_class=TaskResourceMonitor):
    self._pathspec = TaskPath(root=root)
    self._detector = TaskDetector(root)
    if not issubclass(resource_monitor_class, ResourceMonitorBase):
      raise ValueError("resource monitor class must implement ResourceMonitorBase!")
    self._resource_monitor = resource_monitor_class
    self._active_tasks = {}    # task_id => ActiveObservedTask
    self._finished_tasks = {}  # task_id => FinishedObservedTask
    self._stop_event = threading.Event()
    ExceptionalThread.__init__(self)
    Lockable.__init__(self)
    self.daemon = True
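# Illustration (hypothetical call, not in the original code): the
# issubclass() guard in the __init__ above (from TaskObserver, shown later)
# makes construction fail fast when the supplied resource monitor class does
# not derive from ResourceMonitorBase.
try:
  TaskObserver('/var/run/thermos', resource_monitor_class=object)
except ValueError as e:
  print(e)  # resource monitor class must implement ResourceMonitorBase!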
Example #5
  def partition_tasks(self):
    """Return active/finished tasks as discovered from the checkpoint roots."""
    active_tasks, finished_tasks = set(), set()

    for checkpoint_root in self._path_detector.get_paths():
      detector = TaskDetector(root=checkpoint_root)

      active_tasks.update(RootedTask(checkpoint_root, task_id)
          for _, task_id in detector.get_task_ids(state='active'))
      finished_tasks.update(RootedTask(checkpoint_root, task_id)
          for _, task_id in detector.get_task_ids(state='finished'))

    return active_tasks, finished_tasks
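# RootedTask above is assumed to be a simple (checkpoint_root, task_id)
# record; a minimal stand-in for illustration:
from collections import namedtuple

RootedTask = namedtuple('RootedTask', ['checkpoint_root', 'task_id'])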
Example #6
  def partition_tasks(self):
    """Return active/finished tasks as discovered from the checkpoint roots."""
    active_tasks, finished_tasks = set(), set()

    for checkpoint_root in self._path_detector.get_paths():
      detector = TaskDetector(root=checkpoint_root)

      active_tasks.update(RootedTask(checkpoint_root, task_id)
          for _, task_id in detector.get_task_ids(state='active'))
      finished_tasks.update(RootedTask(checkpoint_root, task_id)
          for _, task_id in detector.get_task_ids(state='finished'))

    return active_tasks, finished_tasks
def tasks_from_re(expressions, root, state=None):
  task_ids = [t_id for _, t_id in TaskDetector(root=root).get_task_ids(state=state)]
  matched_tasks = set()
  for task_expr in map(re.compile, expressions):
    for task_id in task_ids:
      if task_expr.match(task_id):
        matched_tasks.add(task_id)
  return matched_tasks
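# Usage sketch (illustrative arguments): collect the ids of active tasks
# whose ids match a prefix under a single checkpoint root.
matched = tasks_from_re([r'^www-data'], '/var/run/thermos', state='active')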
Example #8
  def _refresh_detectors(self):
    new_paths = set(self._path_detector.get_paths())
    old_paths = set(self._detectors)

    for path in old_paths - new_paths:
      self._detectors.pop(path)

    for path in new_paths - old_paths:
      self._detectors[path] = TaskDetector(root=path)
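# Sketch of the contract assumed above: get_paths() returns the checkpoint
# roots currently live, and _detectors keeps one TaskDetector per live root.
# _StubPathDetector is a hypothetical illustration, not the real interface.
class _StubPathDetector(object):
  def __init__(self, paths):
    self._paths = list(paths)

  def get_paths(self):
    return list(self._paths)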
  def __init__(self, root, resource_monitor_class=TaskResourceMonitor):
    self._pathspec = TaskPath(root=root)
    self._detector = TaskDetector(root)
    if not issubclass(resource_monitor_class, ResourceMonitorBase):
      raise ValueError("resource monitor class must implement ResourceMonitorBase!")
    self._resource_monitor = resource_monitor_class
    self._active_tasks = {}    # task_id => ActiveObservedTask
    self._finished_tasks = {}  # task_id => FinishedObservedTask
    self._stop_event = threading.Event()
    ExceptionalThread.__init__(self)
    Lockable.__init__(self)
    self.daemon = True
Example #10
def tasks_from_re(expressions, state=None):
    path_detector = get_path_detector()

    matched_tasks = set()

    for root in path_detector.get_paths():
        task_ids = [
            t_id for _, t_id in TaskDetector(root).get_task_ids(state=state)
        ]
        for task_expr in map(re.compile, expressions):
            for task_id in task_ids:
                if task_expr.match(task_id):
                    matched_tasks.add((root, task_id))

    return matched_tasks
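# Usage sketch: with multiple checkpoint roots, matches come back as
# (root, task_id) pairs; get_path_detector() is assumed to be provided by
# the surrounding module, as above.
for root, task_id in tasks_from_re([r'.*'], state='finished'):
  print('%s: %s' % (root, task_id))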
class TaskObserver(ExceptionalThread, Lockable):
  """
    The TaskObserver monitors the thermos checkpoint root for active/finished
    tasks.  It is intended to be the oracle of the state of all thermos tasks
    on a machine.

    It currently returns JSON, but really should just return objects.  We should
    then build an object->json translator.
  """
  class UnexpectedError(Exception): pass
  class UnexpectedState(Exception): pass

  POLLING_INTERVAL = Amount(1, Time.SECONDS)

  def __init__(self, root, resource_monitor_class=TaskResourceMonitor):
    self._pathspec = TaskPath(root=root)
    self._detector = TaskDetector(root)
    if not issubclass(resource_monitor_class, ResourceMonitorBase):
      raise ValueError("resource monitor class must implement ResourceMonitorBase!")
    self._resource_monitor = resource_monitor_class
    self._active_tasks = {}    # task_id => ActiveObservedTask
    self._finished_tasks = {}  # task_id => FinishedObservedTask
    self._stop_event = threading.Event()
    ExceptionalThread.__init__(self)
    Lockable.__init__(self)
    self.daemon = True

  @property
  def active_tasks(self):
    """Return a dictionary of active Tasks"""
    return self._active_tasks

  @property
  def finished_tasks(self):
    """Return a dictionary of finished Tasks"""
    return self._finished_tasks

  @property
  def all_tasks(self):
    """Return a dictionary of all Tasks known by the TaskObserver"""
    tasks = dict(self.active_tasks)
    tasks.update(self.finished_tasks)
    return tasks

  def stop(self):
    self._stop_event.set()

  def start(self):
    ExceptionalThread.start(self)

  @Lockable.sync
  def add_active_task(self, task_id):
    if task_id in self.finished_tasks:
      log.error('Found an active task (%s) in finished tasks?' % task_id)
      return
    task_monitor = TaskMonitor(self._pathspec, task_id)
    if not task_monitor.get_state().header:
      log.info('Unable to load task "%s"' % task_id)
      return
    sandbox = task_monitor.get_state().header.sandbox
    resource_monitor = self._resource_monitor(task_monitor, sandbox)
    resource_monitor.start()
    self._active_tasks[task_id] = ActiveObservedTask(
      task_id=task_id, pathspec=self._pathspec,
      task_monitor=task_monitor, resource_monitor=resource_monitor
    )

  @Lockable.sync
  def add_finished_task(self, task_id):
    self._finished_tasks[task_id] = FinishedObservedTask(
      task_id=task_id, pathspec=self._pathspec
    )

  @Lockable.sync
  def active_to_finished(self, task_id):
    self.remove_active_task(task_id)
    self.add_finished_task(task_id)

  @Lockable.sync
  def remove_active_task(self, task_id):
    task = self.active_tasks.pop(task_id)
    task.resource_monitor.kill()

  @Lockable.sync
  def remove_finished_task(self, task_id):
    self.finished_tasks.pop(task_id)

  def run(self):
    """
      The internal thread for the observer.  This periodically polls the
      checkpoint root for new tasks, or transitions of tasks from active to
      finished state.
    """
    while not self._stop_event.is_set():
      time.sleep(self.POLLING_INTERVAL.as_(Time.SECONDS))

      active_tasks = [task_id for _, task_id in self._detector.get_task_ids(state='active')]
      finished_tasks = [task_id for _, task_id in self._detector.get_task_ids(state='finished')]

      with self.lock:

        # Ensure all tasks currently detected on the system are observed appropriately
        for active in active_tasks:
          if active not in self.active_tasks:
            log.debug('task_id %s (unknown) -> active' % active)
            self.add_active_task(active)
        for finished in finished_tasks:
          if finished in self.active_tasks:
            log.debug('task_id %s active -> finished' % finished)
            self.active_to_finished(finished)
          elif finished not in self.finished_tasks:
            log.debug('task_id %s (unknown) -> finished' % finished)
            self.add_finished_task(finished)

        # Remove ObservedTasks for tasks no longer detected on the system
        for unknown in set(self.active_tasks) - set(active_tasks + finished_tasks):
          log.debug('task_id %s active -> (unknown)' % unknown)
          self.remove_active_task(unknown)
        for unknown in set(self.finished_tasks) - set(active_tasks + finished_tasks):
          log.debug('task_id %s finished -> (unknown)' % unknown)
          self.remove_finished_task(unknown)

  @Lockable.sync
  def process_from_name(self, task_id, process_id):
    if task_id in self.all_tasks:
      task = self.all_tasks[task_id].task
      if task:
        for process in task.processes():
          if process.name().get() == process_id:
            return process

  @Lockable.sync
  def task_count(self):
    """
      Return the count of tasks that could be read properly from disk.
      This may be <= self.task_id_count().
    """
    return dict(
      active=len(self.active_tasks),
      finished=len(self.finished_tasks),
      all=len(self.all_tasks),
    )

  @Lockable.sync
  def task_id_count(self):
    """
      Return the raw count of active and finished task_ids from the TaskDetector.
    """
    num_active = len(list(self._detector.get_task_ids(state='active')))
    num_finished = len(list(self._detector.get_task_ids(state='finished')))
    return dict(active=num_active, finished=num_finished, all=num_active + num_finished)

  def _get_tasks_of_type(self, type):
    """Convenience function to return all tasks of a given type"""
    tasks = {
      'active': self.active_tasks,
      'finished': self.finished_tasks,
      'all': self.all_tasks,
    }.get(type, None)

    if tasks is None:
      log.error('Unknown task type %s' % type)
      return {}

    return tasks

  @Lockable.sync
  def state(self, task_id):
    """Return a dict containing mapped information about a task's state"""
    real_state = self.raw_state(task_id)
    if real_state is None or real_state.header is None:
      return {}
    else:
      return dict(
        task_id=real_state.header.task_id,
        launch_time=real_state.header.launch_time_ms / 1000.0,
        sandbox=real_state.header.sandbox,
        hostname=real_state.header.hostname,
        user=real_state.header.user
      )

  @Lockable.sync
  def raw_state(self, task_id):
    """
      Return the current runner state (thrift blob: gen.apache.thermos.ttypes.RunnerState)
      of a given task id
    """
    if task_id not in self.all_tasks:
      return None
    return self.all_tasks[task_id].state

  @Lockable.sync
  def _task_processes(self, task_id):
    """
      Return the processes of a task given its task_id.

      Returns a map from state to processes in that state, where possible
      states are: waiting, running, success, failed, killed.
    """
    if task_id not in self.all_tasks:
      return {}
    state = self.raw_state(task_id)
    if state is None or state.header is None:
      return {}

    waiting, running, success, failed, killed = [], [], [], [], []
    for process, runs in state.processes.items():
      # No runs ==> nothing started.
      if len(runs) == 0:
        waiting.append(process)
      else:
        if runs[-1].state in (None, ProcessState.WAITING, ProcessState.LOST):
          waiting.append(process)
        elif runs[-1].state in (ProcessState.FORKED, ProcessState.RUNNING):
          running.append(process)
        elif runs[-1].state == ProcessState.SUCCESS:
          success.append(process)
        elif runs[-1].state == ProcessState.FAILED:
          failed.append(process)
        elif runs[-1].state == ProcessState.KILLED:
          killed.append(process)
        else:
          # TODO(wickman)  Consider log.error instead of raising.
          raise self.UnexpectedState(
            "Unexpected ProcessHistoryState: %s" % state.processes[process].state)

    return dict(waiting=waiting, running=running, success=success, failed=failed, killed=killed)

  @Lockable.sync
  def main(self, type=None, offset=None, num=None):
    """Return a set of information about tasks, optionally filtered

      Args:
        type = (all|active|finished|None) [default: all]
        offset = offset into the list of task_ids [default: 0]
        num = number of results to return [default: 20]

      Tasks are sorted by interest:
        - active tasks are sorted by start time
        - finished tasks are sorted by completion time

      Returns:
        {
          tasks: [task_id_1, ..., task_id_N],
          type: query type,
          offset: next offset,
          num: next num
        }

    """
    type = type or 'all'
    offset = offset or 0
    num = num or 20

    # Get a list of all ObservedTasks of requested type
    tasks = sorted((task for task in self._get_tasks_of_type(type).values()),
                   key=attrgetter('mtime'), reverse=True)

    # Filter by requested offset + number of results
    end = num
    if offset < 0:
      offset = offset % len(tasks) if len(tasks) > abs(offset) else 0
    end += offset
    tasks = tasks[offset:end]

    def task_row(observed_task):
      """Generate an output row for a Task"""
      task = self._task(observed_task.task_id)
      # tasks include those which could not be found properly and are hence empty {}
      if task:
        return dict(
            task_id=observed_task.task_id,
            name=task['name'],
            role=task['user'],
            launch_timestamp=task['launch_timestamp'],
            state=task['state'],
            state_timestamp=task['state_timestamp'],
            ports=task['ports'],
            **task['resource_consumption'])

    return dict(
      tasks=[row for row in map(task_row, tasks) if row is not None],
      type=type,
      offset=offset,
      num=num,
      task_count=self.task_count()[type],
    )

  def _sample(self, task_id):
    if task_id not in self.active_tasks:
      log.debug("Task %s not found in active tasks" % task_id)
      sample = ProcessSample.empty().to_dict()
      sample['disk'] = 0
    else:
      resource_sample = self.active_tasks[task_id].resource_monitor.sample()[1]
      sample = resource_sample.process_sample.to_dict()
      sample['disk'] = resource_sample.disk_usage
      log.debug("Got sample for task %s: %s" % (task_id, sample))
    return sample

  @Lockable.sync
  def task_statuses(self, task_id):
    """
      Return the sequence of task states.

      [(task_state [string], timestamp), ...]
    """

    # Unknown task_id.
    if task_id not in self.all_tasks:
      return []

    task = self.all_tasks[task_id]
    if task is None:
      return []

    state = self.raw_state(task_id)
    if state is None or state.header is None:
      return []

    # Get the timestamp of the transition into the current state.
    return [
      (TaskState._VALUES_TO_NAMES.get(st.state, 'UNKNOWN'), st.timestamp_ms / 1000)
      for st in state.statuses]

  @Lockable.sync
  def _task(self, task_id):
    """
      Return composite information about a particular task_id, following the
      schema below.

      {
         task_id: string,
         name: string,
         user: string,
         launch_timestamp: seconds,
         state: string [ACTIVE, SUCCESS, FAILED]
         ports: { name1: 'url', name2: 'url2' }
         resource_consumption: { cpu:, ram:, disk: }
         processes: { -> names only
            waiting: [],
            running: [],
            success: [],
            failed:  []
         }
      }
    """
    # Unknown task_id.
    if task_id not in self.all_tasks:
      return {}

    task = self.all_tasks[task_id].task
    if task is None:
      # TODO(wickman)  Can this happen?
      log.error('Could not find task: %s' % task_id)
      return {}

    state = self.raw_state(task_id)
    if state is None or state.header is None:
      # TODO(wickman)  Can this happen?
      return {}

    # Get the timestamp of the transition into the current state.
    current_state = state.statuses[-1].state
    last_state = state.statuses[0]
    state_timestamp = 0
    for status in state.statuses:
      if status.state == current_state and last_state != current_state:
        state_timestamp = status.timestamp_ms / 1000
      last_state = status.state

    return dict(
       task_id=task_id,
       name=task.name().get(),
       launch_timestamp=state.statuses[0].timestamp_ms / 1000,
       state=TaskState._VALUES_TO_NAMES[state.statuses[-1].state],
       state_timestamp=state_timestamp,
       user=state.header.user,
       resource_consumption=self._sample(task_id),
       ports=state.header.ports,
       processes=self._task_processes(task_id),
       task_struct=task,
    )

  @Lockable.sync
  def _get_process_resource_consumption(self, task_id, process_name):
    if task_id not in self.active_tasks:
      log.debug("Task %s not found in active tasks" % task_id)
      return ProcessSample.empty().to_dict()
    sample = self.active_tasks[task_id].resource_monitor.sample_by_process(process_name).to_dict()
    log.debug('Resource consumption (%s, %s) => %s' % (task_id, process_name, sample))
    return sample

  @Lockable.sync
  def _get_process_tuple(self, history, run):
    """
      Return the basic description of a process run if it exists, otherwise
      an empty dictionary.

      {
        process_name: string
        process_run: int
        state: string [WAITING, FORKED, RUNNING, SUCCESS, KILLED, FAILED, LOST]
        (optional) start_time: seconds from epoch
        (optional) stop_time: seconds from epoch
      }
    """
    if len(history) == 0:
      return {}
    if run >= len(history):
      return {}
    else:
      process_run = history[run]
      run = run % len(history)
      d = dict(
        process_name=process_run.process,
        process_run=run,
        state=ProcessState._VALUES_TO_NAMES[process_run.state],
      )
      if process_run.start_time:
        d.update(start_time=process_run.start_time)
      if process_run.stop_time:
        d.update(stop_time=process_run.stop_time)
      return d

  @Lockable.sync
  def process(self, task_id, process, run=None):
    """
      Returns a process run, where the schema is given below:

      {
        process_name: string
        process_run: int
        used: { cpu: float, ram: int bytes, disk: int bytes }
        start_time: (time since epoch in millis (utc))
        stop_time: (time since epoch in millis (utc))
        state: string [WAITING, FORKED, RUNNING, SUCCESS, KILLED, FAILED, LOST]
      }

      If run is None, return the latest run.
    """
    state = self.raw_state(task_id)
    if state is None or state.header is None:
      return {}
    if process not in state.processes:
      return {}
    history = state.processes[process]
    run = int(run) if run is not None else -1
    tup = self._get_process_tuple(history, run)
    if not tup:
      return {}
    if tup.get('state') == 'RUNNING':
      tup.update(used=self._get_process_resource_consumption(task_id, process))
    return tup

  @Lockable.sync
  def _processes(self, task_id):
    """
      Return
        {
          process1: { ... }
          process2: { ... }
          ...
          processN: { ... }
        }

      where each value describes the latest run of that process, in the
      schema defined by process().
    """

    if task_id not in self.all_tasks:
      return {}
    state = self.raw_state(task_id)
    if state is None or state.header is None:
      return {}

    processes = self._task_processes(task_id)
    d = dict()
    for process_type in processes:
      for process_name in processes[process_type]:
        d[process_name] = self.process(task_id, process_name)
    return d

  @Lockable.sync
  def processes(self, task_ids):
    """
      Given a list of task_ids, returns a map of task_id => processes, where processes
      is defined by the schema in _processes.
    """
    if not isinstance(task_ids, (list, tuple)):
      return {}
    return dict((task_id, self._processes(task_id)) for task_id in task_ids)

  @Lockable.sync
  def get_run_number(self, runner_state, process, run=None):
    if runner_state is not None and runner_state.processes is not None:
      if process not in runner_state.processes:
        return None
      run = run if run is not None else -1
      num_runs = len(runner_state.processes[process])
      if num_runs > 0 and run < num_runs:
        return run % num_runs

  @Lockable.sync
  def logs(self, task_id, process, run=None):
    """
      Given a task_id and a process and (optional) run number, return a dict:
      {
        stderr: [dir, filename]
        stdout: [dir, filename]
      }

      If the run number is unspecified, uses the latest run.

      TODO(wickman)  Just return the filenames directly?
    """
    runner_state = self.raw_state(task_id)
    if runner_state is None or runner_state.header is None:
      return {}
    run = self.get_run_number(runner_state, process, run)
    if run is None:
      return {}
    log_path = self._pathspec.given(task_id=task_id, process=process, run=run,
                                    log_dir=runner_state.header.log_dir).getpath('process_logdir')
    return dict(
      stdout=[log_path, 'stdout'],
      stderr=[log_path, 'stderr']
    )

  @staticmethod
  def _sanitize_path(base_path, relpath):
    """
      Attempts to sanitize a path through path normalization, also making sure
      that the relative path is contained inside of base_path.
    """
    if relpath is None:
      relpath = "."
    normalized_base = os.path.realpath(base_path)
    normalized = os.path.realpath(os.path.join(base_path, relpath))
    # Compare against the base with a trailing separator so that a sibling
    # directory such as base_path + '2' cannot pass the containment check.
    if normalized == normalized_base or normalized.startswith(normalized_base + os.sep):
      return (normalized_base, os.path.relpath(normalized, normalized_base))
    return (None, None)

  @Lockable.sync
  def valid_file(self, task_id, path):
    """
      Like valid_path, but also verify the given path is a file
    """
    chroot, path = self.valid_path(task_id, path)
    if chroot and path and os.path.isfile(os.path.join(chroot, path)):
      return chroot, path
    return None, None

  @Lockable.sync
  def valid_path(self, task_id, path):
    """
      Given a task_id and a path within that task_id's sandbox, verify:
        (1) it's actually in the sandbox and not outside
        (2) it's a valid, existing path
      Returns chroot and the pathname relative to that chroot.
    """
    runner_state = self.raw_state(task_id)
    if runner_state is None or runner_state.header is None:
      return None, None
    try:
      chroot = runner_state.header.sandbox
    except AttributeError:
      return None, None
    chroot, path = self._sanitize_path(chroot, path)
    if chroot and path:
      return chroot, path
    return None, None

  @Lockable.sync
  def files(self, task_id, path=None):
    """
      Returns dictionary
      {
        task_id: task_id
        chroot: absolute directory on machine
        path: sanitized relative path w.r.t. chroot
        dirs: list of directories
        files: list of files
      }
    """
    # TODO(jon): DEPRECATED: most of the necessary logic is handled directly in the templates.
    # Also, global s/chroot/sandbox/?
    empty = dict(task_id=task_id, chroot=None, path=None, dirs=None, files=None)
    path = path if path is not None else '.'
    runner_state = self.raw_state(task_id)
    if runner_state is None:
      return empty
    try:
      chroot = runner_state.header.sandbox
    except AttributeError:
      return empty
    if chroot is None:  # chroot-less job
      return empty
    chroot, path = self._sanitize_path(chroot, path)
    if (chroot is None or path is None
        or not os.path.isdir(os.path.join(chroot, path))):
      return empty
    names = os.listdir(os.path.join(chroot, path))
    dirs, files = [], []
    for name in names:
      if os.path.isdir(os.path.join(chroot, path, name)):
        dirs.append(name)
      else:
        files.append(name)
    return dict(
      task_id=task_id,
      chroot=chroot,
      path=path,
      dirs=dirs,
      files=files
    )
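# Lifecycle sketch (illustrative root path): construct the observer, start
# its polling thread, and stop it on shutdown.
observer = TaskObserver('/var/run/thermos')
observer.start()
try:
  time.sleep(5)  # give the poller a few cycles to discover tasks
  print(observer.task_id_count())
finally:
  observer.stop()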
  def partition_tasks(self):
    """Return active/finished tasks as discovered from the checkpoint root."""
    detector = TaskDetector(root=self._checkpoint_root)
    active_tasks = set(t_id for _, t_id in detector.get_task_ids(state='active'))
    finished_tasks = set(t_id for _, t_id in detector.get_task_ids(state='finished'))
    return active_tasks, finished_tasks
Example #13
def status(args, options):
  """Get the status of task(s).

    Usage: thermos status [options] [task_name(s) or task_regexp(s)]

    Options:
      --verbosity=LEVEL     Verbosity level for logging. [default: 0]
      --only=TYPE           Only print tasks of TYPE (options: active, finished)
  """
  detector = TaskDetector(root=options.root)

  def format_task(task_id):
    checkpoint_filename = detector.get_checkpoint(task_id)
    checkpoint_stat = os.stat(checkpoint_filename)
    try:
      checkpoint_owner = pwd.getpwuid(checkpoint_stat.st_uid).pw_name
    except KeyError:
      checkpoint_owner = 'uid:%s' % checkpoint_stat.st_uid
    print('  %-20s [owner: %8s]' % (task_id, checkpoint_owner), end='')
    if options.verbose == 0:
      print()
    if options.verbose > 0:
      state = CheckpointDispatcher.from_file(checkpoint_filename)
      if state is None or state.header is None:
        print(' - checkpoint stream CORRUPT or outdated format')
        return
      print('  state: %8s' % TaskState._VALUES_TO_NAMES.get(state.statuses[-1].state, 'Unknown'),
        end='')
      print(' start: %25s' % time.asctime(time.localtime(state.header.launch_time_ms / 1000.0)))
    if options.verbose > 1:
      print('    user: %s' % state.header.user, end='')
      if state.header.ports:
        print(' ports: %s' % ' '.join('%s -> %s' % (key, val)
                                         for key, val in state.header.ports.items()))
      else:
        print(' ports: None')
      print('    sandbox: %s' % state.header.sandbox)
    if options.verbose > 2:
      print('    process table:')
      for process, process_history in state.processes.items():
        print('      - %s runs: %s' % (process, len(process_history)), end='')
        last_run = process_history[-1]
        print(' last: pid=%s, rc=%s, finish:%s, state:%s' % (
          last_run.pid or 'None',
          last_run.return_code if last_run.return_code is not None else '',
          time.asctime(time.localtime(last_run.stop_time)) if last_run.stop_time else 'None',
          ProcessState._VALUES_TO_NAMES.get(last_run.state, 'Unknown')))
      print()

  # Materialize the matchers so both scans below can reuse them.
  matchers = [re.compile(pattern) for pattern in args or ['.*']]
  active = [t_id for _, t_id in detector.get_task_ids(state='active')
            if any(pattern.match(t_id) for pattern in matchers)]
  finished = [t_id for _, t_id in detector.get_task_ids(state='finished')
              if any(pattern.match(t_id) for pattern in matchers)]

  found = False
  if options.only is None or options.only == 'active':
    if active:
      print('Active tasks:')
      found = True
      for task_id in active:
        format_task(task_id)
      print()

  if options.only is None or options.only == 'finished':
    if finished:
      print('Finished tasks:')
      found = True
      for task_id in finished:
        format_task(task_id)
      print()

  if not found:
    print('No tasks found in root [%s]' % options.root)
    sys.exit(1)
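# Usage sketch (illustrative values): show active tasks matching a regexp at
# verbosity 1; _StatusOptions is a hypothetical stand-in for parsed options.
class _StatusOptions(object):
  root = '/var/run/thermos'
  verbose = 1
  only = 'active'

status([r'^www-data'], _StatusOptions())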
Example #14
def test_task_detector():
  with temporary_dir() as root:
    active_log_dir = os.path.join(root, 'active_log')
    finished_log_dir = os.path.join(root, 'finished_log')

    path = TaskPath(root=root)
    detector = TaskDetector(root)

    # test empty paths

    assert list(detector.get_task_ids(state='active')) == []
    assert list(detector.get_task_ids(state='finished')) == []
    assert set(detector.get_task_ids()) == set()

    assert detector.get_checkpoint(task_id='active_task') == path.given(
        task_id='active_task').getpath('runner_checkpoint')

    assert detector.get_checkpoint(task_id='finished_task') == path.given(
        task_id='finished_task').getpath('runner_checkpoint')

    assert set(detector.get_process_checkpoints('active_task')) == set()
    assert set(detector.get_process_checkpoints('finished_task')) == set()
    assert set(detector.get_process_runs('active_task', active_log_dir)) == set()
    assert set(detector.get_process_runs('finished_task', finished_log_dir)) == set()
    assert set(detector.get_process_logs('active_task', active_log_dir)) == set()
    assert set(detector.get_process_logs('finished_task', finished_log_dir)) == set()

    # create paths

    paths = [
        path.given(state='active', task_id='active_task').getpath('task_path'),
        path.given(state='finished', task_id='finished_task').getpath('task_path'),
        path.given(task_id='active_task').getpath('runner_checkpoint'),
        path.given(task_id='finished_task').getpath('runner_checkpoint'),
        path.given(
            task_id='active_task',
            process='hello_world',
            run='0',
            log_dir=active_log_dir
        ).with_filename('stdout').getpath('process_logdir'),
        path.given(
            task_id='finished_task',
            process='goodbye_world',
            run='1',
            log_dir=finished_log_dir
        ).with_filename('stderr').getpath('process_logdir'),
        path.given(task_id='active_task', process='hello_world').getpath('process_checkpoint'),
        path.given(task_id='finished_task', process='goodbye_world').getpath('process_checkpoint'),
    ]

    for p in paths:
      touch(p)

    detector = TaskDetector(root)

    assert list(detector.get_task_ids(state='active')) == list([('active', 'active_task')])
    assert list(detector.get_task_ids(state='finished')) == list([('finished', 'finished_task')])
    assert set(detector.get_task_ids()) == set(
        [('active', 'active_task'), ('finished', 'finished_task')])

    assert list(detector.get_process_checkpoints('active_task')) == [
        path.given(task_id='active_task', process='hello_world').getpath('process_checkpoint')]

    assert list(detector.get_process_checkpoints('finished_task')) == [
        path.given(task_id='finished_task', process='goodbye_world').getpath('process_checkpoint')]

    assert list(detector.get_process_runs('active_task', active_log_dir)) == [
        ('hello_world', 0)]
    assert list(detector.get_process_runs('finished_task', finished_log_dir)) == [
        ('goodbye_world', 1)]

    assert list(detector.get_process_logs('active_task', active_log_dir)) == [
        path.given(
            task_id='active_task',
            process='hello_world',
            run='0',
            log_dir=active_log_dir
        ).with_filename('stdout').getpath('process_logdir')]

    assert list(detector.get_process_logs('finished_task', finished_log_dir)) == [
        path.given(
            task_id='finished_task',
            process='goodbye_world',
            run='1',
            log_dir=finished_log_dir
        ).with_filename('stderr').getpath('process_logdir')]
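# touch() above is assumed to create the file along with any missing parent
# directories; a minimal sketch of such a helper:
import os

def touch(path):
  dirname = os.path.dirname(path)
  if dirname and not os.path.isdir(dirname):
    os.makedirs(dirname)
  with open(path, 'a'):
    pass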
class TaskObserver(ExceptionalThread, Lockable):
  """
    The TaskObserver monitors the thermos checkpoint root for active/finished
    tasks.  It is intended to be the oracle of the state of all thermos tasks
    on a machine.

    It currently returns JSON, but really should just return objects.  We should
    then build an object->json translator.
  """
  class UnexpectedError(Exception): pass
  class UnexpectedState(Exception): pass

  POLLING_INTERVAL = Amount(1, Time.SECONDS)

  def __init__(self, root, resource_monitor_class=TaskResourceMonitor):
    self._pathspec = TaskPath(root=root)
    self._detector = TaskDetector(root)
    if not issubclass(resource_monitor_class, ResourceMonitorBase):
      raise ValueError("resource monitor class must implement ResourceMonitorBase!")
    self._resource_monitor = resource_monitor_class
    self._active_tasks = {}    # task_id => ActiveObservedTask
    self._finished_tasks = {}  # task_id => FinishedObservedTask
    self._stop_event = threading.Event()
    ExceptionalThread.__init__(self)
    Lockable.__init__(self)
    self.daemon = True

  @property
  def active_tasks(self):
    """Return a dictionary of active Tasks"""
    return self._active_tasks

  @property
  def finished_tasks(self):
    """Return a dictionary of finished Tasks"""
    return self._finished_tasks

  @property
  def all_tasks(self):
    """Return a dictionary of all Tasks known by the TaskObserver"""
    tasks = dict(self.active_tasks)
    tasks.update(self.finished_tasks)
    return tasks

  def stop(self):
    self._stop_event.set()

  def start(self):
    ExceptionalThread.start(self)

  @Lockable.sync
  def add_active_task(self, task_id):
    if task_id in self.finished_tasks:
      log.error('Found an active task (%s) in finished tasks?' % task_id)
      return
    task_monitor = TaskMonitor(self._pathspec, task_id)
    if not task_monitor.get_state().header:
      log.info('Unable to load task "%s"' % task_id)
      return
    sandbox = task_monitor.get_state().header.sandbox
    resource_monitor = self._resource_monitor(task_monitor, sandbox)
    resource_monitor.start()
    self._active_tasks[task_id] = ActiveObservedTask(
      task_id=task_id, pathspec=self._pathspec,
      task_monitor=task_monitor, resource_monitor=resource_monitor
    )

  @Lockable.sync
  def add_finished_task(self, task_id):
    self._finished_tasks[task_id] = FinishedObservedTask(
      task_id=task_id, pathspec=self._pathspec
    )

  @Lockable.sync
  def active_to_finished(self, task_id):
    self.remove_active_task(task_id)
    self.add_finished_task(task_id)

  @Lockable.sync
  def remove_active_task(self, task_id):
    task = self.active_tasks.pop(task_id)
    task.resource_monitor.kill()

  @Lockable.sync
  def remove_finished_task(self, task_id):
    self.finished_tasks.pop(task_id)

  def run(self):
    """
      The internal thread for the observer.  This periodically polls the
      checkpoint root for new tasks, or transitions of tasks from active to
      finished state.
    """
    while not self._stop_event.is_set():
      time.sleep(self.POLLING_INTERVAL.as_(Time.SECONDS))

      active_tasks = [task_id for _, task_id in self._detector.get_task_ids(state='active')]
      finished_tasks = [task_id for _, task_id in self._detector.get_task_ids(state='finished')]

      with self.lock:

        # Ensure all tasks currently detected on the system are observed appropriately
        for active in active_tasks:
          if active not in self.active_tasks:
            log.debug('task_id %s (unknown) -> active' % active)
            self.add_active_task(active)
        for finished in finished_tasks:
          if finished in self.active_tasks:
            log.debug('task_id %s active -> finished' % finished)
            self.active_to_finished(finished)
          elif finished not in self.finished_tasks:
            log.debug('task_id %s (unknown) -> finished' % finished)
            self.add_finished_task(finished)

        # Remove ObservedTasks for tasks no longer detected on the system
        for unknown in set(self.active_tasks) - set(active_tasks + finished_tasks):
          log.debug('task_id %s active -> (unknown)' % unknown)
          self.remove_active_task(unknown)
        for unknown in set(self.finished_tasks) - set(active_tasks + finished_tasks):
          log.debug('task_id %s finished -> (unknown)' % unknown)
          self.remove_finished_task(unknown)

  @Lockable.sync
  def process_from_name(self, task_id, process_id):
    if task_id in self.all_tasks:
      task = self.all_tasks[task_id].task
      if task:
        for process in task.processes():
          if process.name().get() == process_id:
            return process

  @Lockable.sync
  def task_count(self):
    """
      Return the count of tasks that could be read properly from disk.
      This may be <= self.task_id_count().
    """
    return dict(
      active=len(self.active_tasks),
      finished=len(self.finished_tasks),
      all=len(self.all_tasks),
    )

  @Lockable.sync
  def task_id_count(self):
    """
      Return the raw count of active and finished task_ids from the TaskDetector.
    """
    num_active = len(list(self._detector.get_task_ids(state='active')))
    num_finished = len(list(self._detector.get_task_ids(state='finished')))
    return dict(active=num_active, finished=num_finished, all=num_active + num_finished)

  def _get_tasks_of_type(self, type):
    """Convenience function to return all tasks of a given type"""
    tasks = {
      'active': self.active_tasks,
      'finished': self.finished_tasks,
      'all': self.all_tasks,
    }.get(type, None)

    if tasks is None:
      log.error('Unknown task type %s' % type)
      return {}

    return tasks

  @Lockable.sync
  def state(self, task_id):
    """Return a dict containing mapped information about a task's state"""
    real_state = self.raw_state(task_id)
    if real_state is None or real_state.header is None:
      return {}
    else:
      return dict(
        task_id=real_state.header.task_id,
        launch_time=real_state.header.launch_time_ms / 1000.0,
        sandbox=real_state.header.sandbox,
        hostname=real_state.header.hostname,
        user=real_state.header.user
      )

  @Lockable.sync
  def raw_state(self, task_id):
    """
      Return the current runner state (thrift blob: gen.apache.thermos.ttypes.RunnerState)
      of a given task id
    """
    if task_id not in self.all_tasks:
      return None
    return self.all_tasks[task_id].state

  @Lockable.sync
  def _task_processes(self, task_id):
    """
      Return the processes of a task given its task_id.

      Returns a map from state to processes in that state, where possible
      states are: waiting, running, success, failed, killed.
    """
    if task_id not in self.all_tasks:
      return {}
    state = self.raw_state(task_id)
    if state is None or state.header is None:
      return {}

    waiting, running, success, failed, killed = [], [], [], [], []
    for process, runs in state.processes.items():
      # No runs ==> nothing started.
      if len(runs) == 0:
        waiting.append(process)
      else:
        if runs[-1].state in (None, ProcessState.WAITING, ProcessState.LOST):
          waiting.append(process)
        elif runs[-1].state in (ProcessState.FORKED, ProcessState.RUNNING):
          running.append(process)
        elif runs[-1].state == ProcessState.SUCCESS:
          success.append(process)
        elif runs[-1].state == ProcessState.FAILED:
          failed.append(process)
        elif runs[-1].state == ProcessState.KILLED:
          killed.append(process)
        else:
          # TODO(wickman)  Consider log.error instead of raising.
          raise self.UnexpectedState(
            "Unexpected ProcessHistoryState: %s" % state.processes[process].state)

    return dict(waiting=waiting, running=running, success=success, failed=failed, killed=killed)

  @Lockable.sync
  def main(self, type=None, offset=None, num=None):
    """Return a set of information about tasks, optionally filtered

      Args:
        type = (all|active|finished|None) [default: all]
        offset = offset into the list of task_ids [default: 0]
        num = number of results to return [default: 20]

      Tasks are sorted by interest:
        - active tasks are sorted by start time
        - finished tasks are sorted by completion time

      Returns:
        {
          tasks: [task_id_1, ..., task_id_N],
          type: query type,
          offset: next offset,
          num: next num
        }

    """
    type = type or 'all'
    offset = offset or 0
    num = num or 20

    # Get a list of all ObservedTasks of requested type
    tasks = sorted((task for task in self._get_tasks_of_type(type).values()),
                   key=attrgetter('mtime'), reverse=True)

    # Filter by requested offset + number of results
    end = num
    if offset < 0:
      offset = offset % len(tasks) if len(tasks) > abs(offset) else 0
    end += offset
    tasks = tasks[offset:end]

    def task_row(observed_task):
      """Generate an output row for a Task"""
      task = self._task(observed_task.task_id)
      # tasks include those which could not be found properly and are hence empty {}
      if task:
        return dict(
            task_id=observed_task.task_id,
            name=task['name'],
            role=task['user'],
            launch_timestamp=task['launch_timestamp'],
            state=task['state'],
            state_timestamp=task['state_timestamp'],
            ports=task['ports'],
            **task['resource_consumption'])

    return dict(
      tasks=[row for row in map(task_row, tasks) if row is not None],
      type=type,
      offset=offset,
      num=num,
      task_count=self.task_count()[type],
    )

  def _sample(self, task_id):
    if task_id not in self.active_tasks:
      log.debug("Task %s not found in active tasks" % task_id)
      sample = ProcessSample.empty().to_dict()
      sample['disk'] = 0
    else:
      resource_sample = self.active_tasks[task_id].resource_monitor.sample()[1]
      sample = resource_sample.process_sample.to_dict()
      sample['disk'] = resource_sample.disk_usage
      log.debug("Got sample for task %s: %s" % (task_id, sample))
    return sample

  @Lockable.sync
  def task_statuses(self, task_id):
    """
      Return the sequence of task states.

      [(task_state [string], timestamp), ...]
    """

    # Unknown task_id.
    if task_id not in self.all_tasks:
      return []

    task = self.all_tasks[task_id]
    if task is None:
      return []

    state = self.raw_state(task_id)
    if state is None or state.header is None:
      return []

    # Get the timestamp of the transition into the current state.
    return [
      (TaskState._VALUES_TO_NAMES.get(st.state, 'UNKNOWN'), st.timestamp_ms / 1000)
      for st in state.statuses]

  @Lockable.sync
  def tasks(self, task_ids):
    """
      Return information about an iterable of tasks [task_id1, task_id2, ...]
      in the following form.

      {
        task_id1 : self._task(task_id1),
        task_id2 : self._task(task_id2),
        ...
      }
    """
    res = {}
    for task_id in task_ids:
      d = self._task(task_id)
      if not d:  # unknown task_id: _task() returns {}
        continue
      task_struct = d.pop('task_struct')
      d['task'] = task_struct.get()
      res[task_id] = d
    return res

  @Lockable.sync
  def _task(self, task_id):
    """
      Return composite information about a particular task_id, following the
      schema below.

      {
         task_id: string,
         name: string,
         user: string,
         launch_timestamp: seconds,
         state: string [ACTIVE, SUCCESS, FAILED]
         ports: { name1: 'url', name2: 'url2' }
         resource_consumption: { cpu:, ram:, disk: }
         processes: { -> names only
            waiting: [],
            running: [],
            success: [],
            failed:  []
         }
      }
    """
    # Unknown task_id.
    if task_id not in self.all_tasks:
      return {}

    task = self.all_tasks[task_id].task
    if task is None:
      # TODO(wickman)  Can this happen?
      log.error('Could not find task: %s' % task_id)
      return {}

    state = self.raw_state(task_id)
    if state is None or state.header is None:
      # TODO(wickman)  Can this happen?
      return {}

    # Get the timestamp of the transition into the current state.
    current_state = state.statuses[-1].state
    last_state = state.statuses[0]
    state_timestamp = 0
    for status in state.statuses:
      if status.state == current_state and last_state != current_state:
        state_timestamp = status.timestamp_ms / 1000
      last_state = status.state

    return dict(
       task_id=task_id,
       name=task.name().get(),
       launch_timestamp=state.statuses[0].timestamp_ms / 1000,
       state=TaskState._VALUES_TO_NAMES[state.statuses[-1].state],
       state_timestamp=state_timestamp,
       user=state.header.user,
       resource_consumption=self._sample(task_id),
       ports=state.header.ports,
       processes=self._task_processes(task_id),
       task_struct=task,
    )

  @Lockable.sync
  def _get_process_resource_consumption(self, task_id, process_name):
    if task_id not in self.active_tasks:
      log.debug("Task %s not found in active tasks" % task_id)
      return ProcessSample.empty().to_dict()
    sample = self.active_tasks[task_id].resource_monitor.sample_by_process(process_name).to_dict()
    log.debug('Resource consumption (%s, %s) => %s' % (task_id, process_name, sample))
    return sample

  @Lockable.sync
  def _get_process_tuple(self, history, run):
    """
      Return the basic description of a process run if it exists, otherwise
      an empty dictionary.

      {
        process_name: string
        process_run: int
        (optional) return_code: int
        state: string [WAITING, FORKED, RUNNING, SUCCESS, KILLED, FAILED, LOST]
        (optional) start_time: seconds from epoch
        (optional) stop_time: seconds from epoch
      }
    """
    if len(history) == 0:
      return {}
    if run >= len(history):
      return {}
    else:
      process_run = history[run]
      run = run % len(history)
      d = dict(
        process_name=process_run.process,
        process_run=run,
        state=ProcessState._VALUES_TO_NAMES[process_run.state],
      )
      if process_run.start_time:
        d.update(start_time=process_run.start_time)
      if process_run.stop_time:
        d.update(stop_time=process_run.stop_time)
      if process_run.return_code:
        d.update(return_code=process_run.return_code)
      return d

  @Lockable.sync
  def process(self, task_id, process, run=None):
    """
      Returns a process run, where the schema is given below:

      {
        process_name: string
        process_run: int
        used: { cpu: float, ram: int bytes, disk: int bytes }
        start_time: (time since epoch in millis (utc))
        stop_time: (time since epoch in millis (utc))
        state: string [WAITING, FORKED, RUNNING, SUCCESS, KILLED, FAILED, LOST]
      }

      If run is None, return the latest run.
    """
    state = self.raw_state(task_id)
    if state is None or state.header is None:
      return {}
    if process not in state.processes:
      return {}
    history = state.processes[process]
    run = int(run) if run is not None else -1
    tup = self._get_process_tuple(history, run)
    if not tup:
      return {}
    if tup.get('state') == 'RUNNING':
      tup.update(used=self._get_process_resource_consumption(task_id, process))
    return tup

  @Lockable.sync
  def _processes(self, task_id):
    """
      Return
        {
          process1: { ... }
          process2: { ... }
          ...
          processN: { ... }
        }

      where each value describes the latest run of that process, in the
      schema defined by process().
    """

    if task_id not in self.all_tasks:
      return {}
    state = self.raw_state(task_id)
    if state is None or state.header is None:
      return {}

    processes = self._task_processes(task_id)
    d = dict()
    for process_type in processes:
      for process_name in processes[process_type]:
        d[process_name] = self.process(task_id, process_name)
    return d

  @Lockable.sync
  def processes(self, task_ids):
    """
      Given a list of task_ids, returns a map of task_id => processes, where processes
      is defined by the schema in _processes.
    """
    if not isinstance(task_ids, (list, tuple)):
      return {}
    return dict((task_id, self._processes(task_id)) for task_id in task_ids)

  @Lockable.sync
  def get_run_number(self, runner_state, process, run=None):
    if runner_state is not None and runner_state.processes is not None:
      if process not in runner_state.processes:
        return None
      run = run if run is not None else -1
      num_runs = len(runner_state.processes[process])
      if num_runs > 0 and run < num_runs:
        return run % num_runs

  @Lockable.sync
  def logs(self, task_id, process, run=None):
    """
      Given a task_id and a process and (optional) run number, return a dict:
      {
        stderr: [dir, filename]
        stdout: [dir, filename]
      }

      If the run number is unspecified, uses the latest run.

      TODO(wickman)  Just return the filenames directly?
    """
    runner_state = self.raw_state(task_id)
    if runner_state is None or runner_state.header is None:
      return {}
    run = self.get_run_number(runner_state, process, run)
    if run is None:
      return {}
    log_path = self._pathspec.given(task_id=task_id, process=process, run=run,
                                    log_dir=runner_state.header.log_dir).getpath('process_logdir')
    return dict(
      stdout=[log_path, 'stdout'],
      stderr=[log_path, 'stderr']
    )

  @staticmethod
  def _sanitize_path(base_path, relpath):
    """
      Attempts to sanitize a path through path normalization, also making sure
      that the relative path is contained inside of base_path.
    """
    if relpath is None:
      relpath = "."
    normalized_base = os.path.realpath(base_path)
    normalized = os.path.realpath(os.path.join(base_path, relpath))
    # Compare against the base with a trailing separator so that a sibling
    # directory such as base_path + '2' cannot pass the containment check.
    if normalized == normalized_base or normalized.startswith(normalized_base + os.sep):
      return (normalized_base, os.path.relpath(normalized, normalized_base))
    return (None, None)

  @Lockable.sync
  def valid_file(self, task_id, path):
    """
      Like valid_path, but also verify the given path is a file
    """
    chroot, path = self.valid_path(task_id, path)
    if chroot and path and os.path.isfile(os.path.join(chroot, path)):
      return chroot, path
    return None, None

  @Lockable.sync
  def valid_path(self, task_id, path):
    """
      Given a task_id and a path within that task_id's sandbox, verify:
        (1) it's actually in the sandbox and not outside
        (2) it's a valid, existing path
      Returns chroot and the pathname relative to that chroot.
    """
    runner_state = self.raw_state(task_id)
    if runner_state is None or runner_state.header is None:
      return None, None
    try:
      chroot = runner_state.header.sandbox
    except AttributeError:
      return None, None
    chroot, path = self._sanitize_path(chroot, path)
    if chroot and path:
      return chroot, path
    return None, None

  @Lockable.sync
  def files(self, task_id, path=None):
    """
      Returns dictionary
      {
        task_id: task_id
        chroot: absolute directory on machine
        path: sanitized relative path w.r.t. chroot
        dirs: list of directories
        files: list of files
      }
    """
    # TODO(jon): DEPRECATED: most of the necessary logic is handled directly in the templates.
    # Also, global s/chroot/sandbox/?
    empty = dict(task_id=task_id, chroot=None, path=None, dirs=None, files=None)
    path = path if path is not None else '.'
    runner_state = self.raw_state(task_id)
    if runner_state is None:
      return empty
    try:
      chroot = runner_state.header.sandbox
    except AttributeError:
      return empty
    if chroot is None:  # chroot-less job
      return empty
    chroot, path = self._sanitize_path(chroot, path)
    if (chroot is None or path is None
        or not os.path.isdir(os.path.join(chroot, path))):
      return empty
    names = os.listdir(os.path.join(chroot, path))
    dirs, files = [], []
    for name in names:
      if os.path.isdir(os.path.join(chroot, path, name)):
        dirs.append(name)
      else:
        files.append(name)
    return dict(
      task_id=task_id,
      chroot=chroot,
      path=path,
      dirs=dirs,
      files=files
    )
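# Query sketch (assumes an observer constructed and started as in the class
# above; 'hello_world' is a hypothetical process name): page through active
# tasks, then fetch per-task detail and log locations.
observer = TaskObserver('/var/run/thermos')
observer.start()
page = observer.main(type='active', offset=0, num=10)
for row in page['tasks']:
  detail = observer.tasks([row['task_id']])
  log_files = observer.logs(row['task_id'], 'hello_world')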