def write_header(root, sandbox, task_id):
    log_dir = os.path.join(sandbox, '.logs')
    path = TaskPath(root=root, task_id=task_id, log_dir=log_dir)
    header = RunnerHeader(task_id=task_id, sandbox=sandbox, log_dir=log_dir)
    ckpt = TaskRunnerHelper.open_checkpoint(path.getpath('runner_checkpoint'))
    ckpt.write(RunnerCkpt(runner_header=header))
    ckpt.close()
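For orientation, a minimal call to the helper above might look like the following; the checkpoint root, sandbox path, and task id are purely illustrative values, not ones taken from a real deployment.

# Illustrative only: write a RunnerHeader checkpoint for a hypothetical task.
write_header(root='/var/run/thermos',
             sandbox='/var/lib/thermos/sandboxes/hello-001',
             task_id='hello-001')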
Example #2
 def get(cls, task_id, checkpoint_root):
     """Get a TaskRunner bound to the task_id in checkpoint_root."""
     path = TaskPath(root=checkpoint_root, task_id=task_id, state="active")
     task_json = path.getpath("task_path")
     task_checkpoint = path.getpath("runner_checkpoint")
     if not os.path.exists(task_json):
         return None
     task = ThermosConfigLoader.load_json(task_json)
     if task is None:
         return None
     if len(task.tasks()) == 0:
         return None
     try:
         checkpoint = CheckpointDispatcher.from_file(task_checkpoint)
         if checkpoint is None or checkpoint.header is None:
             return None
         return cls(
             task.tasks()[0].task(),
             checkpoint_root,
             checkpoint.header.sandbox,
             log_dir=checkpoint.header.log_dir,
             task_id=task_id,
             portmap=checkpoint.header.ports,
             hostname=checkpoint.header.hostname,
         )
     except Exception as e:
         log.error("Failed to reconstitute checkpoint in TaskRunner.get: %s" % e, exc_info=True)
         return None
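A hedged caller sketch for the classmethod above: it relies only on the behavior shown (a TaskRunner instance on success, None otherwise), and the task id and checkpoint root are invented for illustration.

# Illustrative only: try to reconstitute a runner and fall back gracefully.
runner = TaskRunner.get('hello-001', '/var/run/thermos')
if runner is None:
  log.warning('No usable checkpoint found for hello-001')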
def tail(args, options):
  """Tail the logs of a task process.

    Usage: thermos tail task_name [process_name]
  """
  if len(args) == 0:
    app.error('Expected a task to tail, got nothing!')
  if len(args) not in (1, 2):
    app.error('Expected at most two arguments (task and optional process), got %d' % len(args))

  task_id = args[0]
  detector = TaskDetector(root=options.root)
  checkpoint = CheckpointDispatcher.from_file(detector.get_checkpoint(task_id))
  log_dir = checkpoint.header.log_dir
  process_runs = [(process, run) for (process, run) in detector.get_process_runs(task_id, log_dir)]
  if len(args) == 2:
    process_runs = [(process, run) for (process, run) in process_runs if process == args[1]]

  if len(process_runs) == 0:
    print('ERROR: No processes found.', file=sys.stderr)
    sys.exit(1)

  processes = set([process for process, _ in process_runs])
  if len(processes) != 1:
    print('ERROR: More than one process matches query.', file=sys.stderr)
    sys.exit(1)

  process = processes.pop()
  run = max([run for _, run in process_runs])

  logdir = TaskPath(root=options.root, task_id=args[0], process=process,
     run=run, log_dir=log_dir).getpath('process_logdir')
  logfile = os.path.join(logdir, 'stderr' if options.use_stderr else 'stdout')

  monitor = TaskMonitor(TaskPath(root=options.root), args[0])
  def log_is_active():
    active_processes = monitor.get_active_processes()
    for process_status, process_run in active_processes:
      if process_status.process == process and process_run == run:
        return True
    return False

  if not log_is_active():
    print('Tail of terminal log %s' % logfile)
    for line in tail_closed(logfile):
      print(line.rstrip())
    return

  now = time.time()
  next_check = now + 5.0
  print('Tail of active log %s' % logfile)
  for line in tail_f(logfile, include_last=True, forever=False):
    print(line.rstrip())
    if time.time() > next_check:
      if not log_is_active():
        break
      else:
        next_check = time.time() + 5.0
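For reference, a sketch of driving tail() programmatically; the options object is a stand-in exposing only the attributes the function actually reads (root and use_stderr), and the task and process names are invented.

from types import SimpleNamespace

# Illustrative only: follow the stdout log of process 'hello_world' in task 'hello-001'.
opts = SimpleNamespace(root='/var/run/thermos', use_stderr=False)
tail(['hello-001', 'hello_world'], opts)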
 def run_with_class(process_class):
   with temporary_dir() as td:
     taskpath = TaskPath(root=td, task_id='task', process='process', run=0)
     sandbox = setup_sandbox(td, taskpath)
     with open(os.path.join(sandbox, 'silly_pants'), 'w') as silly_pants:
       p = process_class('process', 'echo test >&%s' % silly_pants.fileno(),
           0, taskpath, sandbox)
       p.start()
       return wait_for_rc(taskpath.getpath('process_checkpoint'))
Example #7
    def __init__(self, checkpoint_root, task_id):
        """
    :param checkpoint_root: The checkpoint root to find the given task.
    :param task_id: The task_id of the task whose state we wish to manage.
    """

        self._detector = TaskDetector(checkpoint_root)
        self._task_id = task_id
        self._pathspec = TaskPath(root=checkpoint_root, task_id=task_id)
        self._state = CheckpointDispatcher.from_file(
            self._detector.get_checkpoint(task_id))
 def __init__(self, root, resource_monitor_class=TaskResourceMonitor):
   self._pathspec = TaskPath(root=root)
   self._detector = TaskDetector(root)
   if not issubclass(resource_monitor_class, ResourceMonitorBase):
     raise ValueError("resource monitor class must implement ResourceMonitorBase!")
   self._resource_monitor = resource_monitor_class
   self._active_tasks = {}    # task_id => ActiveObservedTask
   self._finished_tasks = {}  # task_id => FinishedObservedTask
   self._stop_event = threading.Event()
   ExceptionalThread.__init__(self)
   Lockable.__init__(self)
   self.daemon = True
Example #9
  def kill(cls, task_id, checkpoint_root, force=False,
           terminal_status=TaskState.KILLED, clock=time):
    """
      An implementation of Task killing that doesn't require a fully hydrated TaskRunner object.
      Terminal status must be either KILLED or LOST state.
    """
    if terminal_status not in (TaskState.KILLED, TaskState.LOST):
      raise cls.Error('terminal_status must be KILLED or LOST (got %s)' %
                      (TaskState._VALUES_TO_NAMES.get(terminal_status) or terminal_status))
    pathspec = TaskPath(root=checkpoint_root, task_id=task_id)
    checkpoint = pathspec.getpath('runner_checkpoint')
    state = CheckpointDispatcher.from_file(checkpoint)

    if state is None or state.header is None or state.statuses is None:
      if force:
        log.error('Task has uninitialized TaskState - forcibly finalizing')
        cls.finalize_task(pathspec)
        return
      else:
        log.error('Cannot update states in uninitialized TaskState!')
        return

    ckpt = cls.open_checkpoint(checkpoint, force=force, state=state)

    def write_task_state(state):
      update = TaskStatus(state=state, timestamp_ms=int(clock.time() * 1000),
                          runner_pid=os.getpid(), runner_uid=os.getuid())
      ckpt.write(RunnerCkpt(task_status=update))

    def write_process_status(status):
      ckpt.write(RunnerCkpt(process_status=status))

    if cls.is_task_terminal(state.statuses[-1].state):
      log.info('Task is already in terminal state!  Finalizing.')
      cls.finalize_task(pathspec)
      return

    with closing(ckpt):
      write_task_state(TaskState.ACTIVE)
      for process, history in state.processes.items():
        process_status = history[-1]
        if not cls.is_process_terminal(process_status.state):
          if cls.kill_process(state, process):
            write_process_status(ProcessStatus(process=process,
              state=ProcessState.KILLED, seq=process_status.seq + 1, return_code=-9,
              stop_time=clock.time()))
          else:
            if process_status.state is not ProcessState.WAITING:
              write_process_status(ProcessStatus(process=process,
                state=ProcessState.LOST, seq=process_status.seq + 1))
      write_task_state(terminal_status)
    cls.finalize_task(pathspec)
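A hedged sketch of invoking the kill helper above; the cls.open_checkpoint call suggests it lives on TaskRunnerHelper (as used elsewhere in these examples), but that placement and the argument values here are assumptions.

# Illustrative only: force-kill a task and record it as LOST rather than KILLED.
TaskRunnerHelper.kill('hello-001', '/var/run/thermos',
                      force=True, terminal_status=TaskState.LOST)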
def test_simple_process():
  with temporary_dir() as td:
    taskpath = TaskPath(root=td, task_id='task', process='process', run=0)
    sandbox = setup_sandbox(td, taskpath)

    p = TestProcess('process', 'echo hello world', 0, taskpath, sandbox)
    p.start()
    rc = wait_for_rc(taskpath.getpath('process_checkpoint'))

    assert rc == 0
    stdout = taskpath.with_filename('stdout').getpath('process_logdir')
    assert os.path.exists(stdout)
    with open(stdout, 'r') as fp:
      assert fp.read() == 'hello world\n'
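The process tests in these examples lean on a setup_sandbox helper that is not shown. A hypothetical stand-in, assuming it only needs to create and return a sandbox directory under the temporary root, might look like this:

def setup_sandbox(td, taskpath):
  # Hypothetical helper: create a sandbox directory for the test process and return its path.
  sandbox = os.path.join(td, 'sandbox')
  os.makedirs(sandbox)
  return sandbox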
def test_simple_process_other_user(*args):
  with temporary_dir() as td:
    some_user = get_other_nonroot_user()
    taskpath = TaskPath(root=td, task_id='task', process='process', run=0)
    sandbox = setup_sandbox(td, taskpath)

    p = TestProcess('process', 'echo hello world', 0, taskpath, sandbox, user=some_user.pw_name)
    p.start()
    rc = wait_for_rc(taskpath.getpath('process_checkpoint'))

    # since we're not actually root, the best we can do is check the right things were attempted
    assert os.setgroups.calledwith([g.gr_gid for g in grp.getgrall() if some_user.pw_name in g.gr_mem])
    assert os.setgid.calledwith(some_user.pw_gid)
    assert os.setuid.calledwith(some_user.pw_uid)
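Similarly, get_other_nonroot_user is referenced but never defined in these snippets. A hypothetical stand-in, assuming it just needs any passwd entry that is neither root nor the current user, could be:

import pwd

def get_other_nonroot_user():
  # Hypothetical helper: pick some account other than root and the invoking user.
  for entry in pwd.getpwall():
    if entry.pw_uid not in (0, os.getuid()):
      return entry
  raise RuntimeError('no suitable non-root user found')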
def test_log_permissions():
    with temporary_dir() as td:
        taskpath = TaskPath(root=td, task_id='task', process='process', run=0)
        sandbox = setup_sandbox(td, taskpath)

        p = TestProcess('process', 'echo hello world', 0, taskpath, sandbox)
        p.start()
        wait_for_rc(taskpath.getpath('process_checkpoint'))

        stdout = taskpath.with_filename('stdout').getpath('process_logdir')
        stderr = taskpath.with_filename('stderr').getpath('process_logdir')
        assert os.path.exists(stdout)
        assert os.path.exists(stderr)
        assert os.stat(stdout).st_uid == os.getuid()
        assert os.stat(stderr).st_uid == os.getuid()
Example #16
  def __init__(self, root, task_id):
    """Construct a TaskMonitor.

    :param root: The checkpoint root of the task.
    :param task_id: The task id of the task.
    """
    pathspec = TaskPath(root=root, task_id=task_id)
    self._dispatcher = CheckpointDispatcher()
    self._runnerstate = RunnerState(processes={})
    self._runner_ckpt = pathspec.getpath('runner_checkpoint')
    self._active_file, self._finished_file = (pathspec.given(state=state).getpath('task_path')
        for state in ('active', 'finished'))
    self._ckpt_head = 0
    self._apply_states()
    self._lock = threading.Lock()
Example #17
def test_empty_garbage_collector():
    with temporary_dir() as checkpoint_root:
        path = TaskPath(root=checkpoint_root, task_id='does_not_exist')
        gc = TaskGarbageCollector(checkpoint_root, 'does_not_exist')

        assert gc.get_age() == 0

        # assume runner, finished task
        assert set(gc.get_metadata(with_size=False)) == set([
            path.getpath('runner_checkpoint'),
            path.given(state='finished').getpath('task_path'),
        ])

        assert list(gc.get_logs()) == []
        assert list(gc.get_data()) == []
Example #20
    def logs(self, task_id, process, run=None):
        """
      Given a task_id and a process and (optional) run number, return a dict:
      {
        stderr: [dir, filename]
        stdout: [dir, filename]
      }

      If the run number is unspecified, uses the latest run.

      TODO(wickman)  Just return the filenames directly?
    """
        runner_state = self.raw_state(task_id)
        if runner_state is None or runner_state.header is None:
            return {}
        run = self.get_run_number(runner_state, process, run)
        if run is None:
            return {}
        observed_task = self.all_tasks.get(task_id, None)
        if not observed_task:
            return {}

        log_path = TaskPath(
            root=observed_task.root,
            task_id=task_id,
            process=process,
            run=run,
            log_dir=runner_state.header.log_dir,
        ).getpath('process_logdir')

        return dict(stdout=[log_path, 'stdout'], stderr=[log_path, 'stderr'])
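A hedged consumer sketch for the [dir, filename] pairs documented above; observer stands in for whatever object exposes this logs() method, and the task and process names are invented.

# Illustrative only: turn the logs() result into absolute file paths.
paths = observer.logs('hello-001', 'hello_world')
if paths:
  stdout_file = os.path.join(*paths['stdout'])
  stderr_file = os.path.join(*paths['stderr'])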
Example #21
def make_taskpath(td):
  return TaskPath(
      root=td,
      task_id='task',
      process='process',
      run=0,
      log_dir=os.path.join(td, '.logs'))
Example #22
  def _runner_ckpt(self, task):
    """Return the runner checkpoint file for a given task.

    :param task: An instance of a task to retrieve checkpoint path
    :type task: :class:`RootedTask` instance
    """
    return TaskPath(root=task.checkpoint_root, task_id=task.task_id).getpath('runner_checkpoint')
Example #23
    def test_basic_as_job(self):
        proxy_driver = ProxyDriver()

        with temporary_dir() as tempdir:
            te = AuroraExecutor(runner_provider=make_provider(tempdir),
                                sandbox_provider=DefaultTestSandboxProvider())
            te.launchTask(proxy_driver,
                          make_task(MESOS_JOB(task=HELLO_WORLD), instanceId=0))
            te.runner_started.wait()
            while te._status_manager is None:
                time.sleep(0.1)
            te.terminated.wait()
            tm = TaskMonitor(TaskPath(root=tempdir),
                             task_id=HELLO_WORLD_TASK_ID)
            runner_state = tm.get_state()

        assert 'hello_world_hello_world-001' in runner_state.processes, (
            'Could not find processes, got: %s' %
            ' '.join(runner_state.processes))
        updates = proxy_driver.method_calls['sendStatusUpdate']
        assert len(updates) == 3
        status_updates = [arg_tuple[0][0] for arg_tuple in updates]
        assert status_updates[0].state == mesos_pb2.TASK_STARTING
        assert status_updates[1].state == mesos_pb2.TASK_RUNNING
        assert status_updates[2].state == mesos_pb2.TASK_FINISHED
def test_log_permissions_other_user(*mocks):
  with temporary_dir() as td:
    some_user = get_other_nonroot_user()
    taskpath = TaskPath(root=td, task_id='task', process='process', run=0)
    sandbox = setup_sandbox(td, taskpath)

    p = TestProcess('process', 'echo hello world', 0, taskpath, sandbox, user=some_user.pw_name)
    p.start()
    rc = wait_for_rc(taskpath.getpath('process_checkpoint'))

    # since we're not actually root, the best we can do is check the right things were attempted
    stdout = taskpath.with_filename('stdout').getpath('process_logdir')
    stderr = taskpath.with_filename('stderr').getpath('process_logdir')
    assert os.path.exists(stdout)
    assert os.path.exists(stderr)
    assert os.chown.calledwith(stdout, some_user.pw_uid, some_user.pw_gid)
    assert os.chown.calledwith(stderr, some_user.pw_uid, some_user.pw_gid)
Example #26
def test_garbage_collector(safe_rmtree, safe_delete):
    with temporary_dir() as sandbox, temporary_dir(
    ) as checkpoint_root, temporary_dir() as log_dir:

        path = TaskPath(root=checkpoint_root, task_id='test', log_dir=log_dir)

        touch(os.path.join(sandbox, 'test_file1'))
        touch(os.path.join(sandbox, 'test_file2'))
        safe_mkdir(
            os.path.dirname(path.given(state='finished').getpath('task_path')))
        safe_mkdir(os.path.dirname(path.getpath('runner_checkpoint')))
        touch(path.given(state='finished').getpath('task_path'))

        header = RunnerHeader(task_id='test', sandbox=sandbox, log_dir=log_dir)
        ckpt = TaskRunnerHelper.open_checkpoint(
            path.getpath('runner_checkpoint'))
        ckpt.write(RunnerCkpt(runner_header=header))
        ckpt.close()

        gc = TaskGarbageCollector(checkpoint_root, task_id='test')
        assert gc._state.header.log_dir == log_dir
        assert gc._state.header.sandbox == sandbox

        # erase metadata
        gc.erase_metadata()
        safe_delete.assert_has_calls([
            call(path.given(state='finished').getpath('task_path')),
            call(path.getpath('runner_checkpoint'))
        ],
                                     any_order=True)
        safe_rmtree.assert_has_calls([call(path.getpath('checkpoint_path'))])

        safe_delete.reset_mock()
        safe_rmtree.reset_mock()

        # erase logs
        gc.erase_logs()
        safe_rmtree.assert_has_calls([call(log_dir)])

        safe_delete.reset_mock()
        safe_rmtree.reset_mock()

        # erase sandbox
        gc.erase_data()

        safe_delete.assert_has_calls([
            call(os.path.join(sandbox, 'test_file1')),
            call(os.path.join(sandbox, 'test_file2'))
        ],
                                     any_order=True)
        safe_rmtree.assert_has_calls([call(sandbox)])
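Read as a whole, the calls exercised by this test suggest the following cleanup flow; the checkpoint root and task id below are illustrative, not real values.

# Illustrative only: wipe logs, sandbox data, then checkpoint metadata for a finished task.
gc = TaskGarbageCollector('/var/run/thermos', task_id='finished-001')
gc.erase_logs()
gc.erase_data()
gc.erase_metadata()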
Example #27
 def erase_logs(self, task_id):
     for fn in self.get_logs(task_id, with_size=False):
         safe_delete(fn)
     state = self.state(task_id)
     if state and state.header:
         safe_rmtree(
             TaskPath(
                 root=self._root,
                 task_id=task_id,
                 log_dir=state.header.log_dir).getpath('process_logbase'))
def test_other_user_fails_nonroot():
    with temporary_dir() as td:
        taskpath = TaskPath(root=td, task_id='task', process='process', run=0)
        sandbox = setup_sandbox(td, taskpath)
        process = TestProcess('process',
                              'echo hello world',
                              0,
                              taskpath,
                              sandbox,
                              user=get_other_nonroot_user().pw_name)
        with pytest.raises(Process.PermissionError):
            process.start()
Example #30
 def from_assigned_task(self, assigned_task, sandbox):
     task_id = assigned_task.taskId
     resources = mesos_task_instance_from_assigned_task(
         assigned_task).task().resources()
     task_path = TaskPath(root=self._checkpoint_root, task_id=task_id)
     task_monitor = TaskMonitor(task_path, task_id)
     resource_monitor = TaskResourceMonitor(
         task_monitor,
         sandbox.root,
         disk_collector=self._disk_collector,
         disk_collection_interval=self._disk_collection_interval)
     return ResourceManager(resources, resource_monitor)
Example #33
 def get_metadata(self, task_id, with_size=True):
     runner_ckpt = self._detector.get_checkpoint(task_id)
     process_ckpts = [
         ckpt for ckpt in self._detector.get_process_checkpoints(task_id)
     ]
     json_spec = TaskPath(root=self._root,
                          task_id=task_id,
                          state='finished').getpath('task_path')
     for path in [json_spec, runner_ckpt] + process_ckpts:
         if with_size:
             yield path, safe_bsize(path)
         else:
             yield path
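A hedged consumer sketch for the generator above; collector stands in for the object that defines get_metadata, the task id is invented, and it assumes every yielded path exists so each size is a number.

# Illustrative only: total on-disk size of a task's checkpoint metadata.
total_bytes = sum(size for _, size in collector.get_metadata('hello-001', with_size=True))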
Example #34
  def __init__(self, task, success_rate=100, random_seed=31337, **extra_task_runner_args):
    """
      task = Thermos task
      portmap = port map
      success_rate = success rate of writing checkpoint to disk
    """
    self.task = task

    with temporary_file(cleanup=False) as fp:
      self.job_filename = fp.name
      fp.write(ThermosTaskWrapper(task).to_json())

    self.state_filename = tempfile.mktemp()
    self.tempdir = tempfile.mkdtemp()
    self.task_id = '%s-runner-base' % int(time.time() * 1000000)
    self.sandbox = os.path.join(self.tempdir, 'sandbox')
    self.extra_task_runner_args = extra_task_runner_args
    self.cleaned = False
    self.pathspec = TaskPath(root=self.tempdir, task_id=self.task_id)
    self.script_filename = None
    self.success_rate = success_rate
    self.random_seed = random_seed
    self._run_count = 0
    def test_sample_by_process_no_process(self, mock_get_active_processes):
        task_path = TaskPath(root='.')

        task_monitor = TaskMonitor(task_path, 'fake-task-id')
        mock_get_active_processes.return_value = []

        task_resource_monitor = TaskResourceMonitor(task_monitor, '.')

        with self.assertRaises(ValueError):
            task_resource_monitor.sample_by_process('fake-process-name')

        assert mock_get_active_processes.mock_calls == [
            mock.call(task_monitor)
        ]
Example #37
  def start(self, timeout=MAX_WAIT):
    """Fork the task runner and return once the underlying task is running, up to timeout."""
    self.forking.set()

    self._monitor = TaskMonitor(TaskPath(root=self._checkpoint_root), self._task_id)

    cmdline_args = self._cmdline()
    log.info('Forking off runner with cmdline: %s' % ' '.join(cmdline_args))

    cwd = os.environ.get('MESOS_DIRECTORY')
    try:
      self._popen = subprocess.Popen(cmdline_args, cwd=cwd)
    except OSError as e:
      raise TaskError(e)

    self.forked.set()

    self.wait_start(timeout=timeout)
Example #38
 def setup_task(self, task, root, finished=False, corrupt=False):
   """Set up the checkpoint stream for the given task in the given checkpoint root, optionally
   finished and/or with a corrupt stream"""
   class FastTaskRunner(TaskRunner):
     COORDINATOR_INTERVAL_SLEEP = Amount(1, Time.MILLISECONDS)
   tr = FastTaskRunner(
       task=task,
       checkpoint_root=root,
       sandbox=os.path.join(root, 'sandbox', task.name().get()),
       clock=ThreadedClock(time.time()))
   with tr.control():
     # initialize checkpoint stream
     pass
   if finished:
     tr.kill()
   if corrupt:
     ckpt_file = TaskPath(root=root, task_id=tr.task_id).getpath('runner_checkpoint')
     with open(ckpt_file, 'w') as f:
       f.write("definitely not a valid checkpoint stream")
   return tr.task_id
    def test_sample_by_process(self, mock_get_active_processes, mock_sample):
        fake_process_name = 'fake-process-name'
        task_path = TaskPath(root='.')
        task_monitor = TaskMonitor(task_path, 'fake-task-id')
        fake_process_status = ProcessStatus(process=fake_process_name)
        mock_get_active_processes.return_value = [(fake_process_status, 1)]
        fake_process_sample = ProcessSample.empty()
        mock_sample.return_value = fake_process_sample

        task_resource_monitor = TaskResourceMonitor(task_monitor, '.')

        assert fake_process_sample == task_resource_monitor.sample_by_process(
            fake_process_name)
        assert mock_get_active_processes.mock_calls == [
            mock.call(task_monitor)
        ]
        assert mock_sample.mock_calls == [
            mock.call(
                task_resource_monitor._process_collectors[fake_process_status])
        ]
  def start(self, timeout=MAX_WAIT):
    """Fork the task runner and return once the underlying task is running, up to timeout."""
    self.forking.set()

    try:
      chmod_plus_x(self._runner_pex)
    except OSError as e:
      if e.errno != errno.EPERM:
        raise TaskError('Failed to chmod +x runner: %s' % e)

    self._monitor = TaskMonitor(TaskPath(root=self._checkpoint_root), self._task_id)

    cmdline_args = self._cmdline()
    log.info('Forking off runner with cmdline: %s' % ' '.join(cmdline_args))

    try:
      self._popen = subprocess.Popen(cmdline_args)
    except OSError as e:
      raise TaskError(e)

    self.forked.set()

    log.debug('Waiting for task to start.')

    def is_started():
      return self._monitor and (self._monitor.active or self._monitor.finished)

    waited = Amount(0, Time.SECONDS)
    while not is_started() and waited < timeout:
      log.debug('  - sleeping...')
      self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
      waited += self.POLL_INTERVAL

    if not is_started():
      log.error('Task did not start within deadline, forcing loss.')
      self.lose()
      raise TaskError('Task did not start within deadline.')
Example #41
 def __init__(self, root, task_id):
   self._root = root
   self._task_id = task_id
   self._pathspec = TaskPath(root=self._root, task_id=self._task_id)
   self._mtime = self._get_mtime()
class TaskObserver(ExceptionalThread, Lockable):
  """
    The TaskObserver monitors the thermos checkpoint root for active/finished
    tasks.  It serves as the oracle of the state of all thermos tasks on
    a machine.

    It currently returns JSON, but really should just return objects.  We should
    then build an object->json translator.
  """
  class UnexpectedError(Exception): pass
  class UnexpectedState(Exception): pass

  POLLING_INTERVAL = Amount(1, Time.SECONDS)

  def __init__(self, root, resource_monitor_class=TaskResourceMonitor):
    self._pathspec = TaskPath(root=root)
    self._detector = TaskDetector(root)
    if not issubclass(resource_monitor_class, ResourceMonitorBase):
      raise ValueError("resource monitor class must implement ResourceMonitorBase!")
    self._resource_monitor = resource_monitor_class
    self._active_tasks = {}    # task_id => ActiveObservedTask
    self._finished_tasks = {}  # task_id => FinishedObservedTask
    self._stop_event = threading.Event()
    ExceptionalThread.__init__(self)
    Lockable.__init__(self)
    self.daemon = True

  @property
  def active_tasks(self):
    """Return a dictionary of active Tasks"""
    return self._active_tasks

  @property
  def finished_tasks(self):
    """Return a dictionary of finished Tasks"""
    return self._finished_tasks

  @property
  def all_tasks(self):
    """Return a dictionary of all Tasks known by the TaskObserver"""
    return dict(list(self.active_tasks.items()) + list(self.finished_tasks.items()))

  def stop(self):
    self._stop_event.set()

  def start(self):
    ExceptionalThread.start(self)

  @Lockable.sync
  def add_active_task(self, task_id):
    if task_id in self.finished_tasks:
      log.error('Found an active task (%s) in finished tasks?' % task_id)
      return
    task_monitor = TaskMonitor(self._pathspec, task_id)
    if not task_monitor.get_state().header:
      log.info('Unable to load task "%s"' % task_id)
      return
    sandbox = task_monitor.get_state().header.sandbox
    resource_monitor = self._resource_monitor(task_monitor, sandbox)
    resource_monitor.start()
    self._active_tasks[task_id] = ActiveObservedTask(
      task_id=task_id, pathspec=self._pathspec,
      task_monitor=task_monitor, resource_monitor=resource_monitor
    )

  @Lockable.sync
  def add_finished_task(self, task_id):
    self._finished_tasks[task_id] = FinishedObservedTask(
      task_id=task_id, pathspec=self._pathspec
    )

  @Lockable.sync
  def active_to_finished(self, task_id):
    self.remove_active_task(task_id)
    self.add_finished_task(task_id)

  @Lockable.sync
  def remove_active_task(self, task_id):
    task = self.active_tasks.pop(task_id)
    task.resource_monitor.kill()

  @Lockable.sync
  def remove_finished_task(self, task_id):
    self.finished_tasks.pop(task_id)

  def run(self):
    """
      The internal thread for the observer.  This periodically polls the
      checkpoint root for new tasks, or transitions of tasks from active to
      finished state.
    """
    while not self._stop_event.is_set():
      time.sleep(self.POLLING_INTERVAL.as_(Time.SECONDS))

      active_tasks = [task_id for _, task_id in self._detector.get_task_ids(state='active')]
      finished_tasks = [task_id for _, task_id in self._detector.get_task_ids(state='finished')]

      with self.lock:

        # Ensure all tasks currently detected on the system are observed appropriately
        for active in active_tasks:
          if active not in self.active_tasks:
            log.debug('task_id %s (unknown) -> active' % active)
            self.add_active_task(active)
        for finished in finished_tasks:
          if finished in self.active_tasks:
            log.debug('task_id %s active -> finished' % finished)
            self.active_to_finished(finished)
          elif finished not in self.finished_tasks:
            log.debug('task_id %s (unknown) -> finished' % finished)
            self.add_finished_task(finished)

        # Remove ObservedTasks for tasks no longer detected on the system
        for unknown in set(self.active_tasks) - set(active_tasks + finished_tasks):
          log.debug('task_id %s active -> (unknown)' % unknown)
          self.remove_active_task(unknown)
        for unknown in set(self.finished_tasks) - set(active_tasks + finished_tasks):
          log.debug('task_id %s finished -> (unknown)' % unknown)
          self.remove_finished_task(unknown)

  @Lockable.sync
  def process_from_name(self, task_id, process_id):
    if task_id in self.all_tasks:
      task = self.all_tasks[task_id].task
      if task:
        for process in task.processes():
          if process.name().get() == process_id:
            return process

  @Lockable.sync
  def task_count(self):
    """
      Return the count of tasks that could be read properly from disk.
      This may be <= self.task_id_count()
    """
    return dict(
      active=len(self.active_tasks),
      finished=len(self.finished_tasks),
      all=len(self.all_tasks),
    )

  @Lockable.sync
  def task_id_count(self):
    """
      Return the raw count of active and finished task_ids from the TaskDetector.
    """
    num_active = len(list(self._detector.get_task_ids(state='active')))
    num_finished = len(list(self._detector.get_task_ids(state='finished')))
    return dict(active=num_active, finished=num_finished, all=num_active + num_finished)

  def _get_tasks_of_type(self, type):
    """Convenience function to return all tasks of a given type"""
    tasks = {
      'active': self.active_tasks,
      'finished': self.finished_tasks,
      'all': self.all_tasks,
    }.get(type, None)

    if tasks is None:
      log.error('Unknown task type %s' % type)
      return {}

    return tasks

  @Lockable.sync
  def state(self, task_id):
    """Return a dict containing mapped information about a task's state"""
    real_state = self.raw_state(task_id)
    if real_state is None or real_state.header is None:
      return {}
    else:
      return dict(
        task_id=real_state.header.task_id,
        launch_time=real_state.header.launch_time_ms / 1000.0,
        sandbox=real_state.header.sandbox,
        hostname=real_state.header.hostname,
        user=real_state.header.user
      )

  @Lockable.sync
  def raw_state(self, task_id):
    """
      Return the current runner state (thrift blob: gen.apache.thermos.ttypes.RunnerState)
      of a given task id
    """
    if task_id not in self.all_tasks:
      return None
    return self.all_tasks[task_id].state

  @Lockable.sync
  def _task_processes(self, task_id):
    """
      Return the processes of a task given its task_id.

      Returns a map from state to processes in that state, where possible
      states are: waiting, running, success, failed.
    """
    if task_id not in self.all_tasks:
      return {}
    state = self.raw_state(task_id)
    if state is None or state.header is None:
      return {}

    waiting, running, success, failed, killed = [], [], [], [], []
    for process, runs in state.processes.items():
      # No runs ==> nothing started.
      if len(runs) == 0:
        waiting.append(process)
      else:
        if runs[-1].state in (None, ProcessState.WAITING, ProcessState.LOST):
          waiting.append(process)
        elif runs[-1].state in (ProcessState.FORKED, ProcessState.RUNNING):
          running.append(process)
        elif runs[-1].state == ProcessState.SUCCESS:
          success.append(process)
        elif runs[-1].state == ProcessState.FAILED:
          failed.append(process)
        elif runs[-1].state == ProcessState.KILLED:
          killed.append(process)
        else:
          # TODO(wickman)  Consider log.error instead of raising.
          raise self.UnexpectedState(
            "Unexpected ProcessHistoryState: %s" % runs[-1].state)

    return dict(waiting=waiting, running=running, success=success, failed=failed, killed=killed)

  @Lockable.sync
  def main(self, type=None, offset=None, num=None):
    """Return a set of information about tasks, optionally filtered

      Args:
        type = (all|active|finished|None) [default: all]
        offset = offset into the list of task_ids [default: 0]
        num = number of results to return [default: 20]

      Tasks are sorted by interest:
        - active tasks are sorted by start time
        - finished tasks are sorted by completion time

      Returns:
        {
          tasks: [task_id_1, ..., task_id_N],
          type: query type,
          offset: next offset,
          num: next num
        }

    """
    type = type or 'all'
    offset = offset or 0
    num = num or 20

    # Get a list of all ObservedTasks of requested type
    tasks = sorted((task for task in self._get_tasks_of_type(type).values()),
                   key=attrgetter('mtime'), reverse=True)

    # Filter by requested offset + number of results
    end = num
    if offset < 0:
      offset = offset % len(tasks) if len(tasks) > abs(offset) else 0
    end += offset
    tasks = tasks[offset:end]

    def task_row(observed_task):
      """Generate an output row for a Task"""
      task = self._task(observed_task.task_id)
      # tasks include those which could not be found properly and are hence empty {}
      if task:
        return dict(
            task_id=observed_task.task_id,
            name=task['name'],
            role=task['user'],
            launch_timestamp=task['launch_timestamp'],
            state=task['state'],
            state_timestamp=task['state_timestamp'],
            ports=task['ports'],
            **task['resource_consumption'])

    return dict(
      tasks=list(filter(None, map(task_row, tasks))),
      type=type,
      offset=offset,
      num=num,
      task_count=self.task_count()[type],
    )

  def _sample(self, task_id):
    if task_id not in self.active_tasks:
      log.debug("Task %s not found in active tasks" % task_id)
      sample = ProcessSample.empty().to_dict()
      sample['disk'] = 0
    else:
      resource_sample = self.active_tasks[task_id].resource_monitor.sample()[1]
      sample = resource_sample.process_sample.to_dict()
      sample['disk'] = resource_sample.disk_usage
      log.debug("Got sample for task %s: %s" % (task_id, sample))
    return sample

  @Lockable.sync
  def task_statuses(self, task_id):
    """
      Return the sequence of task states.

      [(task_state [string], timestamp), ...]
    """

    # Unknown task_id.
    if task_id not in self.all_tasks:
      return []

    task = self.all_tasks[task_id]
    if task is None:
      return []

    state = self.raw_state(task_id)
    if state is None or state.header is None:
      return []

    # Get the timestamp of the transition into the current state.
    return [
      (TaskState._VALUES_TO_NAMES.get(st.state, 'UNKNOWN'), st.timestamp_ms / 1000)
      for st in state.statuses]

  @Lockable.sync
  def _task(self, task_id):
    """
      Return composite information about a particular task task_id, given the below
      schema.

      {
         task_id: string,
         name: string,
         user: string,
         launch_timestamp: seconds,
         state: string [ACTIVE, SUCCESS, FAILED]
         ports: { name1: 'url', name2: 'url2' }
         resource_consumption: { cpu:, ram:, disk: }
         processes: { -> names only
            waiting: [],
            running: [],
            success: [],
            failed:  []
         }
      }
    """
    # Unknown task_id.
    if task_id not in self.all_tasks:
      return {}

    task = self.all_tasks[task_id].task
    if task is None:
      # TODO(wickman)  Can this happen?
      log.error('Could not find task: %s' % task_id)
      return {}

    state = self.raw_state(task_id)
    if state is None or state.header is None:
      # TODO(wickman)  Can this happen?
      return {}

    # Get the timestamp of the transition into the current state.
    current_state = state.statuses[-1].state
    last_state = state.statuses[0]
    state_timestamp = 0
    for status in state.statuses:
      if status.state == current_state and last_state != current_state:
        state_timestamp = status.timestamp_ms / 1000
      last_state = status.state

    return dict(
       task_id=task_id,
       name=task.name().get(),
       launch_timestamp=state.statuses[0].timestamp_ms / 1000,
       state=TaskState._VALUES_TO_NAMES[state.statuses[-1].state],
       state_timestamp=state_timestamp,
       user=state.header.user,
       resource_consumption=self._sample(task_id),
       ports=state.header.ports,
       processes=self._task_processes(task_id),
       task_struct=task,
    )

  @Lockable.sync
  def _get_process_resource_consumption(self, task_id, process_name):
    if task_id not in self.active_tasks:
      log.debug("Task %s not found in active tasks" % task_id)
      return ProcessSample.empty().to_dict()
    sample = self.active_tasks[task_id].resource_monitor.sample_by_process(process_name).to_dict()
    log.debug('Resource consumption (%s, %s) => %s' % (task_id, process_name, sample))
    return sample

  @Lockable.sync
  def _get_process_tuple(self, history, run):
    """
      Return the basic description of a process run if it exists, otherwise
      an empty dictionary.

      {
        process_name: string
        process_run: int
        state: string [WAITING, FORKED, RUNNING, SUCCESS, KILLED, FAILED, LOST]
        (optional) start_time: seconds from epoch
        (optional) stop_time: seconds from epoch
      }
    """
    if len(history) == 0:
      return {}
    if run >= len(history):
      return {}
    else:
      process_run = history[run]
      run = run % len(history)
      d = dict(
        process_name=process_run.process,
        process_run=run,
        state=ProcessState._VALUES_TO_NAMES[process_run.state],
      )
      if process_run.start_time:
        d.update(start_time=process_run.start_time)
      if process_run.stop_time:
        d.update(stop_time=process_run.stop_time)
      return d

  @Lockable.sync
  def process(self, task_id, process, run=None):
    """
      Returns a process run, where the schema is given below:

      {
        process_name: string
        process_run: int
        used: { cpu: float, ram: int bytes, disk: int bytes }
        start_time: (time since epoch in millis (utc))
        stop_time: (time since epoch in millis (utc))
        state: string [WAITING, FORKED, RUNNING, SUCCESS, KILLED, FAILED, LOST]
      }

      If run is None, return the latest run.
    """
    state = self.raw_state(task_id)
    if state is None or state.header is None:
      return {}
    if process not in state.processes:
      return {}
    history = state.processes[process]
    run = int(run) if run is not None else -1
    tup = self._get_process_tuple(history, run)
    if not tup:
      return {}
    if tup.get('state') == 'RUNNING':
      tup.update(used=self._get_process_resource_consumption(task_id, process))
    return tup

  @Lockable.sync
  def _processes(self, task_id):
    """
      Return
        {
          process1: { ... }
          process2: { ... }
          ...
          processN: { ... }
        }

      where processK is the latest run of processK and in the schema as
      defined by process().
    """

    if task_id not in self.all_tasks:
      return {}
    state = self.raw_state(task_id)
    if state is None or state.header is None:
      return {}

    processes = self._task_processes(task_id)
    d = dict()
    for process_type in processes:
      for process_name in processes[process_type]:
        d[process_name] = self.process(task_id, process_name)
    return d

  @Lockable.sync
  def processes(self, task_ids):
    """
      Given a list of task_ids, returns a map of task_id => processes, where processes
      is defined by the schema in _processes.
    """
    if not isinstance(task_ids, (list, tuple)):
      return {}
    return dict((task_id, self._processes(task_id)) for task_id in task_ids)

  @Lockable.sync
  def get_run_number(self, runner_state, process, run=None):
    if runner_state is not None and runner_state.processes is not None:
      run = run if run is not None else -1
      if run < len(runner_state.processes[process]):
        if len(runner_state.processes[process]) > 0:
          return run % len(runner_state.processes[process])

  @Lockable.sync
  def logs(self, task_id, process, run=None):
    """
      Given a task_id and a process and (optional) run number, return a dict:
      {
        stderr: [dir, filename]
        stdout: [dir, filename]
      }

      If the run number is unspecified, uses the latest run.

      TODO(wickman)  Just return the filenames directly?
    """
    runner_state = self.raw_state(task_id)
    if runner_state is None or runner_state.header is None:
      return {}
    run = self.get_run_number(runner_state, process, run)
    if run is None:
      return {}
    log_path = self._pathspec.given(task_id=task_id, process=process, run=run,
                                    log_dir=runner_state.header.log_dir).getpath('process_logdir')
    return dict(
      stdout=[log_path, 'stdout'],
      stderr=[log_path, 'stderr']
    )

  @staticmethod
  def _sanitize_path(base_path, relpath):
    """
      Attempts to sanitize a path through path normalization, also making sure
      that the relative path is contained inside of base_path.
    """
    if relpath is None:
      relpath = "."
    normalized_base = os.path.realpath(base_path)
    normalized = os.path.realpath(os.path.join(base_path, relpath))
    if normalized.startswith(normalized_base):
      return (normalized_base, os.path.relpath(normalized, normalized_base))
    return (None, None)

  @Lockable.sync
  def valid_file(self, task_id, path):
    """
      Like valid_path, but also verify the given path is a file
    """
    chroot, path = self.valid_path(task_id, path)
    if chroot and path and os.path.isfile(os.path.join(chroot, path)):
      return chroot, path
    return None, None

  @Lockable.sync
  def valid_path(self, task_id, path):
    """
      Given a task_id and a path within that task_id's sandbox, verify:
        (1) it's actually in the sandbox and not outside
        (2) it's a valid, existing path
      Returns chroot and the pathname relative to that chroot.
    """
    runner_state = self.raw_state(task_id)
    if runner_state is None or runner_state.header is None:
      return None, None
    try:
      chroot = runner_state.header.sandbox
    except AttributeError:
      return None, None
    chroot, path = self._sanitize_path(chroot, path)
    if chroot and path:
      return chroot, path
    return None, None

  @Lockable.sync
  def files(self, task_id, path=None):
    """
      Returns dictionary
      {
        task_id: task_id
        chroot: absolute directory on machine
        path: sanitized relative path w.r.t. chroot
        dirs: list of directories
        files: list of files
      }
    """
    # TODO(jon): DEPRECATED: most of the necessary logic is handled directly in the templates.
    # Also, global s/chroot/sandbox/?
    empty = dict(task_id=task_id, chroot=None, path=None, dirs=None, files=None)
    path = path if path is not None else '.'
    runner_state = self.raw_state(task_id)
    if runner_state is None:
      return empty
    try:
      chroot = runner_state.header.sandbox
    except AttributeError:
      return empty
    if chroot is None:  # chroot-less job
      return empty
    chroot, path = self._sanitize_path(chroot, path)
    if (chroot is None or path is None
        or not os.path.isdir(os.path.join(chroot, path))):
      return empty
    names = os.listdir(os.path.join(chroot, path))
    dirs, files = [], []
    for name in names:
      if os.path.isdir(os.path.join(chroot, path, name)):
        dirs.append(name)
      else:
        files.append(name)
    return dict(
      task_id=task_id,
      chroot=chroot,
      path=path,
      dirs=dirs,
      files=files
    )
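A minimal sketch of driving the TaskObserver defined above, using only the constructor and methods shown; the checkpoint root is illustrative.

# Illustrative only: observe a checkpoint root and print the first page of active tasks.
observer = TaskObserver('/var/run/thermos')
observer.start()
print(observer.main(type='active', offset=0, num=20))
observer.stop()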
Example #43
def test_task_detector():
  with temporary_dir() as root:
    active_log_dir = os.path.join(root, 'active_log')
    finished_log_dir = os.path.join(root, 'finished_log')

    path = TaskPath(root=root)
    detector = TaskDetector(root)

    # test empty paths

    assert list(detector.get_task_ids(state='active')) == []
    assert list(detector.get_task_ids(state='finished')) == []
    assert set(detector.get_task_ids()) == set()

    assert detector.get_checkpoint(task_id='active_task') == path.given(
        task_id='active_task').getpath('runner_checkpoint')

    assert detector.get_checkpoint(task_id='finished_task') == path.given(
        task_id='finished_task').getpath('runner_checkpoint')

    assert set(detector.get_process_checkpoints('active_task')) == set()
    assert set(detector.get_process_checkpoints('finished_task')) == set()
    assert set(detector.get_process_runs('active_task', active_log_dir)) == set()
    assert set(detector.get_process_runs('finished_task', finished_log_dir)) == set()
    assert set(detector.get_process_logs('active_task', active_log_dir)) == set()
    assert set(detector.get_process_logs('finished_task', finished_log_dir)) == set()

    # create paths

    paths = [
        path.given(state='active', task_id='active_task').getpath('task_path'),
        path.given(state='finished', task_id='finished_task').getpath('task_path'),
        path.given(task_id='active_task').getpath('runner_checkpoint'),
        path.given(task_id='finished_task').getpath('runner_checkpoint'),
        path.given(
            task_id='active_task',
            process='hello_world',
            run='0',
            log_dir=active_log_dir
        ).with_filename('stdout').getpath('process_logdir'),
        path.given(
            task_id='finished_task',
            process='goodbye_world',
            run='1',
            log_dir=finished_log_dir
        ).with_filename('stderr').getpath('process_logdir'),
        path.given(task_id='active_task', process='hello_world').getpath('process_checkpoint'),
        path.given(task_id='finished_task', process='goodbye_world').getpath('process_checkpoint'),
    ]

    for p in paths:
      touch(p)

    detector = TaskDetector(root)

    assert list(detector.get_task_ids(state='active')) == list([('active', 'active_task')])
    assert list(detector.get_task_ids(state='finished')) == list([('finished', 'finished_task')])
    assert set(detector.get_task_ids()) == set(
        [('active', 'active_task'), ('finished', 'finished_task')])

    assert list(detector.get_process_checkpoints('active_task')) == [
        path.given(task_id='active_task', process='hello_world').getpath('process_checkpoint')]

    assert list(detector.get_process_checkpoints('finished_task')) == [
        path.given(task_id='finished_task', process='goodbye_world').getpath('process_checkpoint')]

    assert list(detector.get_process_runs('active_task', active_log_dir)) == [
        ('hello_world', 0)]
    assert list(detector.get_process_runs('finished_task', finished_log_dir)) == [
        ('goodbye_world', 1)]

    assert list(detector.get_process_logs('active_task', active_log_dir)) == [
        path.given(
            task_id='active_task',
            process='hello_world',
            run='0',
            log_dir=active_log_dir
        ).with_filename('stdout').getpath('process_logdir')]

    assert list(detector.get_process_logs('finished_task', finished_log_dir)) == [
        path.given(
            task_id='finished_task',
            process='goodbye_world',
            run='1',
            log_dir=finished_log_dir
        ).with_filename('stderr').getpath('process_logdir')]
Example #44
0
    def __init__(self,
                 task,
                 checkpoint_root,
                 sandbox,
                 log_dir=None,
                 task_id=None,
                 portmap=None,
                 user=None,
                 chroot=False,
                 clock=time,
                 universal_handler=None,
                 planner_class=TaskPlanner,
                 hostname=None,
                 process_logger_destination=None,
                 process_logger_mode=None,
                 rotate_log_size_mb=None,
                 rotate_log_backups=None,
                 preserve_env=False):
        """
      required:
        task (config.Task) = the task to run
        checkpoint_root (path) = the checkpoint root
        sandbox (path) = the sandbox in which the task will be run
                         [if None, cwd will be assumed, but garbage collection will be
                          disabled for this task.]

      optional:
        log_dir (string)  = directory to house stdout/stderr logs. If not specified, logs will be
                            written into the sandbox directory under .logs/
        task_id (string)  = bind to this task id.  if not specified, will synthesize an id based
                            upon task.name()
        portmap (dict)    = a map (string => integer) from name to port, e.g. { 'http': 80 }
        user (string)     = the user to run the task as.  if not current user, requires setuid
                            privileges.
        chroot (boolean)  = whether or not to chroot into the sandbox prior to exec.
        clock (time interface) = the clock to use throughout
        universal_handler = checkpoint record handler (only used for testing)
        planner_class (TaskPlanner class) = TaskPlanner class to use for constructing the task
                            planning policy.
        process_logger_destination (string) = The destination of logger to use for all processes.
        process_logger_mode (string) = The mode of logger to use for all processes.
        rotate_log_size_mb (integer) = The maximum size of the rotated stdout/stderr logs in MiB.
        rotate_log_backups (integer) = The maximum number of rotated stdout/stderr log backups.
        preserve_env (boolean) = whether or not env variables for the runner should be in the
                                 env for the task being run
    """
        if not issubclass(planner_class, TaskPlanner):
            raise TypeError('planner_class must be a TaskPlanner.')
        self._clock = clock
        launch_time = self._clock.time()
        launch_time_ms = '%06d' % int(
            (launch_time - int(launch_time)) * (10**6))
        if not task_id:
            self._task_id = '%s-%s.%s' % (
                task.name(),
                time.strftime('%Y%m%d-%H%M%S',
                              time.localtime(launch_time)), launch_time_ms)
        else:
            self._task_id = task_id
        current_user = TaskRunnerHelper.get_actual_user()
        self._user = user or current_user
        # TODO(wickman) This should be delegated to the ProcessPlatform / Helper
        if self._user != current_user:
            if os.geteuid() != 0:
                raise ValueError(
                    'task specifies user as %s, but %s does not have setuid permission!'
                    % (self._user, current_user))
        self._portmap = portmap or {}
        self._launch_time = launch_time
        self._log_dir = log_dir or os.path.join(sandbox, '.logs')
        self._process_logger_destination = process_logger_destination
        self._process_logger_mode = process_logger_mode
        self._rotate_log_size_mb = rotate_log_size_mb
        self._rotate_log_backups = rotate_log_backups
        self._pathspec = TaskPath(root=checkpoint_root,
                                  task_id=self._task_id,
                                  log_dir=self._log_dir)
        self._hostname = hostname or socket.gethostname()
        try:
            ThermosTaskValidator.assert_valid_task(task)
            ThermosTaskValidator.assert_valid_ports(task, self._portmap)
        except ThermosTaskValidator.InvalidTaskError as e:
            raise self.InvalidTask('Invalid task: %s' % e)
        context = ThermosContext(task_id=self._task_id,
                                 ports=self._portmap,
                                 user=self._user)
        self._task, uninterp = (task %
                                Environment(thermos=context)).interpolate()
        if len(uninterp) > 0:
            raise self.InvalidTask('Failed to interpolate task, missing: %s' %
                                   ', '.join(str(ref) for ref in uninterp))
        try:
            ThermosTaskValidator.assert_same_task(self._pathspec, self._task)
        except ThermosTaskValidator.InvalidTaskError as e:
            raise self.InvalidTask('Invalid task: %s' % e)
        self._plan = None  # plan currently being executed (updated by Handlers)
        self._regular_plan = planner_class(
            self._task,
            clock=clock,
            process_filter=lambda proc: proc.final().get() is False)
        self._finalizing_plan = planner_class(
            self._task,
            clock=clock,
            process_filter=lambda proc: proc.final().get() is True)
        self._chroot = chroot
        self._sandbox = sandbox
        self._terminal_state = None
        self._ckpt = None
        self._process_map = dict(
            (p.name().get(), p) for p in self._task.processes())
        self._task_processes = {}
        self._stages = dict(
            (state, stage(self)) for state, stage in self.STAGES.items())
        self._finalization_start = None
        self._preemption_deadline = None
        self._watcher = ProcessMuxer(self._pathspec)
        self._state = RunnerState(processes={})
        self._preserve_env = preserve_env

        # create runner state
        universal_handler = universal_handler or TaskRunnerUniversalHandler
        self._dispatcher = CheckpointDispatcher()
        self._dispatcher.register_handler(universal_handler(self))
        self._dispatcher.register_handler(TaskRunnerProcessHandler(self))
        self._dispatcher.register_handler(TaskRunnerTaskHandler(self))

        # recover checkpointed runner state and update plan
        self._recovery = True
        self._replay_runner_ckpt()
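A minimal construction sketch based on the parameters documented above; the task object, paths, task_id, and port number are placeholders, and the Thermos Task config (`my_task`) is assumed to be built elsewhere.

task_runner = TaskRunner(
    my_task,                          # placeholder config.Task, built elsewhere
    '/var/run/thermos',               # checkpoint root (placeholder path)
    '/var/lib/thermos/sandboxes/t1',  # sandbox directory (placeholder path)
    task_id='my_task-20230101-000000.000001',  # optional; synthesized from task.name() if omitted
    portmap={'http': 8080},
    log_dir=None)                     # defaults to <sandbox>/.logs
task_runner.run()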
Example #45
0
 def __init__(self, root):
   self._root_dir = root
   self._pathspec = TaskPath()
Example #46
0
class CheckpointInspector(object):
  def __init__(self, checkpoint_root):
    self._path = TaskPath(root=checkpoint_root)

  @staticmethod
  def get_timestamp(process_record):
    if process_record:
      for timestamp in ('fork_time', 'start_time', 'stop_time'):
        stamp = getattr(process_record, timestamp, None)
        if stamp:
          return stamp
    return 0

  def inspect(self, task_id):
    """
      Reconstructs the checkpoint stream and returns a CheckpointInspection.
    """
    dispatcher = CheckpointDispatcher()
    state = RunnerState(processes={})
    muxer = ProcessMuxer(self._path.given(task_id=task_id))

    runner_processes = []
    coordinator_processes = set()
    processes = set()

    def consume_process_record(record):
      if not record.process_status:
        return
      try:
        user_uid = pwd.getpwnam(state.header.user).pw_uid
      except KeyError:
        log.error('Could not find user: %s' % state.header.user)
        return
      if record.process_status.state == ProcessState.FORKED:
        coordinator_processes.add((record.process_status.coordinator_pid, user_uid,
                                   record.process_status.fork_time))
      elif record.process_status.state == ProcessState.RUNNING:
        processes.add((record.process_status.pid, user_uid,
                       record.process_status.start_time))

    # replay runner checkpoint
    runner_pid = None
    runner_latest_update = 0
    try:
      with open(self._path.given(task_id=task_id).getpath('runner_checkpoint')) as fp:
        with closing(ThriftRecordReader(fp, RunnerCkpt)) as ckpt:
          for record in ckpt:
            dispatcher.dispatch(state, record)
            runner_latest_update = max(runner_latest_update,
                self.get_timestamp(record.process_status))
            # collect all bound runners
            if record.task_status:
              if record.task_status.runner_pid != runner_pid:
                runner_processes.append((record.task_status.runner_pid,
                                         record.task_status.runner_uid or 0,
                                         record.task_status.timestamp_ms))
                runner_pid = record.task_status.runner_pid
            elif record.process_status:
              consume_process_record(record)
    except (IOError, OSError, RecordIO.Error) as err:
      log.debug('Error inspecting task runner checkpoint: %s' % err)
      return

    # register existing processes in muxer
    for process_name in state.processes:
      muxer.register(process_name)

    # read process checkpoints
    process_latest_update = runner_latest_update
    for record in muxer.select():
      process_latest_update = max(process_latest_update, self.get_timestamp(record.process_status))
      consume_process_record(record)

    return CheckpointInspection(
      runner_latest_update=runner_latest_update,
      process_latest_update=process_latest_update,
      runner_processes=runner_processes,
      coordinator_processes=coordinator_processes,
      processes=processes)
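A minimal usage sketch for the inspector above; the checkpoint root and task_id are placeholders.

inspector = CheckpointInspector('/var/run/thermos')
inspection = inspector.inspect('my_task_id')
if inspection is None:
  print('Could not read the runner checkpoint for this task.')
else:
  print('Latest runner update:  %s' % inspection.runner_latest_update)
  print('Latest process update: %s' % inspection.process_latest_update)
  print('Runner pids: %s' % [pid for pid, _, _ in inspection.runner_processes])
  print('Coordinator pids: %s' % [pid for pid, _, _ in inspection.coordinator_processes])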
Example #47
0
class Runner(object):
  RUN_JOB_SCRIPT = """
# this is a hack to process wheel nspkg declarations
import os, sys, site
for path in sys.path:
  if path.endswith('.whl') and os.path.isdir(path):
    site.addsitedir(path)

import os
import random
import sys
from twitter.common import log
from twitter.common.log.options import LogOptions
from apache.thermos.config.loader import ThermosConfigLoader
from apache.thermos.core.helper import TaskRunnerHelper
from apache.thermos.core.runner import TaskRunner, TaskRunnerUniversalHandler
from thrift.TSerialization import serialize as thrift_serialize

random.seed(%(random_seed)d)

log.init('runner_base')
LogOptions.set_disk_log_level('DEBUG')

task = ThermosConfigLoader.load_json('%(filename)s')
task = task.tasks()[0].task

success_rate=%(success_rate)d

class AngryHandler(TaskRunnerUniversalHandler):
  def checkpoint(self, record):
    if not self._runner._recovery:
      if random.randint(0, 100) <= success_rate:
        super(AngryHandler, self).checkpoint(record)
      else:
        sys.exit(1)

sandbox = os.path.join('%(sandbox)s', '%(task_id)s')
args = %(extra_task_runner_args)r
args['task_id'] = '%(task_id)s'
args['universal_handler'] = AngryHandler

runner = TaskRunner(task, '%(root)s', sandbox, **args)
runner.run()

with open('%(state_filename)s', 'w') as fp:
  fp.write(thrift_serialize(runner.state))
"""

  def __init__(self, task, success_rate=100, random_seed=31337, **extra_task_runner_args):
    """
      task = Thermos task
      success_rate = success rate of writing checkpoint to disk
      extra_task_runner_args = extra keyword arguments passed through to TaskRunner
    """
    self.task = task

    with temporary_file(cleanup=False) as fp:
      self.job_filename = fp.name
      fp.write(ThermosTaskWrapper(task).to_json())

    self.state_filename = tempfile.mktemp()
    self.tempdir = tempfile.mkdtemp()
    self.task_id = '%s-runner-base' % int(time.time() * 1000000)
    self.sandbox = os.path.join(self.tempdir, 'sandbox')
    self.extra_task_runner_args = extra_task_runner_args
    self.cleaned = False
    self.pathspec = TaskPath(root=self.tempdir, task_id=self.task_id)
    self.script_filename = None
    self.success_rate = success_rate
    self.random_seed = random_seed
    self._run_count = 0

  @property
  def pid(self):
    return self.po.pid

  @property
  def root(self):
    return self.tempdir

  def run(self):
    self._run_count += 1
    atexit.register(self.cleanup)

    if self.script_filename:
      os.unlink(self.script_filename)

    with temporary_file(cleanup=False) as fp:
      self.script_filename = fp.name
      fp.write(self.RUN_JOB_SCRIPT % {
        'filename': self.job_filename,
        'sandbox': self.sandbox,
        'root': self.tempdir,
        'task_id': self.task_id,
        'state_filename': self.state_filename,
        'success_rate': self.success_rate,
        'random_seed': self.random_seed + self._run_count,
        'extra_task_runner_args': self.extra_task_runner_args,
      })

    with environment_as(PYTHONPATH=os.pathsep.join(sys.path)):
      self.po = subprocess.Popen([sys.executable, self.script_filename],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
      try:
        so, se = self.po.communicate()
      except OSError as e:
        if e.errno == errno.ECHILD:
          so = se = 'Killed'
        else:
          raise

    rc = self.po.returncode
    if rc != 0:
      if os.path.exists(self.job_filename):
        with open(self.job_filename) as fp:
          config = fp.read()
      else:
        config = 'Nonexistent!'
      if 'THERMOS_DEBUG' in os.environ:
        print("Runner failed!\n\n\nconfig:%s\n\n\nstdout:%s\n\n\nstderr:%s\n\n\n" % (
            config, so, se))

    try:
      with open(self.state_filename, 'r') as fp:
        self.state = thrift_deserialize(RunnerState(), fp.read())
    except Exception as e:
      if 'THERMOS_DEBUG' in os.environ:
        print('Failed to load Runner state: %s' % e, file=sys.stderr)
      self.state = RunnerState()

    try:
      self.reconstructed_state = CheckpointDispatcher.from_file(
          self.pathspec.getpath('runner_checkpoint'))
    except Exception as e:
      print('Failed to replay checkpoint: %s' % e, file=sys.stderr)
      self.reconstructed_state = None
    self.initialized = True
    return rc

  def cleanup(self):
    if not self.cleaned:
      if hasattr(self, 'po'):
        try:
          self.po.kill()
        except Exception as e:
          print('Failed to kill runner: %s' % e, file=sys.stderr)
      os.unlink(self.job_filename)
      os.unlink(self.script_filename)
      if 'THERMOS_DEBUG' not in os.environ:
        shutil.rmtree(self.tempdir, ignore_errors=True)
      else:
        print('Logs saved in %s' % self.tempdir)
      self.cleaned = True
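A sketch of how this test harness might be driven; `hello_world_task` is a placeholder for a Thermos Task config defined elsewhere, and a success_rate below 100 makes the AngryHandler abort a fraction of checkpoint writes.

runner = Runner(hello_world_task, success_rate=90)  # hello_world_task: placeholder Task config
rc = runner.run()                      # fork the RUN_JOB_SCRIPT subprocess and wait for it
assert runner.initialized
state = runner.state                   # RunnerState deserialized from the state file
replayed = runner.reconstructed_state  # state replayed from the runner checkpoint stream
runner.cleanup()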
Example #48
0
  def __init__(self, task, checkpoint_root, sandbox, log_dir=None,
               task_id=None, portmap=None, user=None, chroot=False, clock=time,
               universal_handler=None, planner_class=TaskPlanner, hostname=None,
               process_logger_destination=None, process_logger_mode=None,
               rotate_log_size_mb=None, rotate_log_backups=None,
               preserve_env=False, mesos_containerizer_path=None, container_sandbox=None):
    """
      required:
        task (config.Task) = the task to run
        checkpoint_root (path) = the checkpoint root
        sandbox (path) = the sandbox in which the task will be run
                         [if None, cwd will be assumed, but garbage collection will be
                          disabled for this task.]

      optional:
        log_dir (string)  = directory to house stdout/stderr logs. If not specified, logs will be
                            written into the sandbox directory under .logs/
        task_id (string)  = bind to this task id.  if not specified, will synthesize an id based
                            upon task.name()
        portmap (dict)    = a map (string => integer) from name to port, e.g. { 'http': 80 }
        user (string)     = the user to run the task as.  if not current user, requires setuid
                            privileges.
        chroot (boolean)  = whether or not to chroot into the sandbox prior to exec.
        clock (time interface) = the clock to use throughout
        universal_handler = checkpoint record handler (only used for testing)
        planner_class (TaskPlanner class) = TaskPlanner class to use for constructing the task
                            planning policy.
        process_logger_destination (string) = The destination of logger to use for all processes.
        process_logger_mode (string) = The mode of logger to use for all processes.
        rotate_log_size_mb (integer) = The maximum size of the rotated stdout/stderr logs in MiB.
        rotate_log_backups (integer) = The maximum number of rotated stdout/stderr log backups.
        preserve_env (boolean) = whether or not env variables for the runner should be in the
                                 env for the task being run
        mesos_containerizer_path = the path to the mesos-containerizer executable that will be used
                                   to isolate the task's filesystem (if using a filesystem image).
        container_sandbox = the path within the isolated filesystem where the task's sandbox is
                            mounted.
    """
    if not issubclass(planner_class, TaskPlanner):
      raise TypeError('planner_class must be a TaskPlanner.')
    self._clock = clock
    launch_time = self._clock.time()
    launch_time_ms = '%06d' % int((launch_time - int(launch_time)) * (10 ** 6))
    if not task_id:
      self._task_id = '%s-%s.%s' % (task.name(),
                                    time.strftime('%Y%m%d-%H%M%S', time.localtime(launch_time)),
                                    launch_time_ms)
    else:
      self._task_id = task_id
    current_user = TaskRunnerHelper.get_actual_user()
    self._user = user or current_user
    # TODO(wickman) This should be delegated to the ProcessPlatform / Helper
    if self._user != current_user:
      if os.geteuid() != 0:
        raise ValueError('task specifies user as %s, but %s does not have setuid permission!' % (
          self._user, current_user))
    self._portmap = portmap or {}
    self._launch_time = launch_time
    self._log_dir = log_dir or os.path.join(sandbox, '.logs')
    self._process_logger_destination = process_logger_destination
    self._process_logger_mode = process_logger_mode
    self._rotate_log_size_mb = rotate_log_size_mb
    self._rotate_log_backups = rotate_log_backups
    self._pathspec = TaskPath(root=checkpoint_root, task_id=self._task_id, log_dir=self._log_dir)
    self._hostname = hostname or socket.gethostname()
    try:
      ThermosTaskValidator.assert_valid_task(task)
      ThermosTaskValidator.assert_valid_ports(task, self._portmap)
    except ThermosTaskValidator.InvalidTaskError as e:
      raise self.InvalidTask('Invalid task: %s' % e)
    context = ThermosContext(
        task_id=self._task_id,
        ports=self._portmap,
        user=self._user)
    self._task, uninterp = (task % Environment(thermos=context)).interpolate()
    if len(uninterp) > 0:
      raise self.InvalidTask('Failed to interpolate task, missing: %s' %
          ', '.join(str(ref) for ref in uninterp))
    try:
      ThermosTaskValidator.assert_same_task(self._pathspec, self._task)
    except ThermosTaskValidator.InvalidTaskError as e:
      raise self.InvalidTask('Invalid task: %s' % e)
    self._plan = None  # plan currently being executed (updated by Handlers)
    self._regular_plan = planner_class(self._task, clock=clock,
        process_filter=lambda proc: proc.final().get() is False)
    self._finalizing_plan = planner_class(self._task, clock=clock,
        process_filter=lambda proc: proc.final().get() is True)
    self._chroot = chroot
    self._sandbox = sandbox
    self._container_sandbox = container_sandbox
    self._terminal_state = None
    self._ckpt = None
    self._process_map = dict((p.name().get(), p) for p in self._task.processes())
    self._task_processes = {}
    self._stages = dict((state, stage(self)) for state, stage in self.STAGES.items())
    self._finalization_start = None
    self._preemption_deadline = None
    self._watcher = ProcessMuxer(self._pathspec)
    self._state = RunnerState(processes={})
    self._preserve_env = preserve_env
    self._mesos_containerizer_path = mesos_containerizer_path

    # create runner state
    universal_handler = universal_handler or TaskRunnerUniversalHandler
    self._dispatcher = CheckpointDispatcher()
    self._dispatcher.register_handler(universal_handler(self))
    self._dispatcher.register_handler(TaskRunnerProcessHandler(self))
    self._dispatcher.register_handler(TaskRunnerTaskHandler(self))

    # recover checkpointed runner state and update plan
    self._recovery = True
    self._replay_runner_ckpt()
Example #49
0
class TaskRunner(object):
  """
    Run a ThermosTask.

    This class encapsulates the core logic to run and control the state of a Thermos task.
    Typically, it will be instantiated directly to control a new task, but a TaskRunner can also be
    synthesised from an existing task's checkpoint root.
  """
  class Error(Exception): pass
  class InternalError(Error): pass
  class InvalidTask(Error): pass
  class PermissionError(Error): pass
  class StateError(Error): pass

  # Maximum amount of time we spend waiting for new updates from the checkpoint streams
  # before doing housecleaning (checking for LOST tasks, dead PIDs.)
  MAX_ITERATION_TIME = Amount(10, Time.SECONDS)

  # Minimum amount of time we wait between polls for updates on coordinator checkpoints.
  COORDINATOR_INTERVAL_SLEEP = Amount(1, Time.SECONDS)

  # Amount of time we're willing to wait after forking before we expect the runner to have
  # exec'ed the child process.
  LOST_TIMEOUT = Amount(60, Time.SECONDS)

  # Active task stages
  STAGES = {
    TaskState.ACTIVE: TaskRunnerStage_ACTIVE,
    TaskState.CLEANING: TaskRunnerStage_CLEANING,
    TaskState.FINALIZING: TaskRunnerStage_FINALIZING
  }

  @classmethod
  def get(cls, task_id, checkpoint_root):
    """
      Get a TaskRunner bound to the task_id in checkpoint_root.
    """
    path = TaskPath(root=checkpoint_root, task_id=task_id, state='active')
    task_json = path.getpath('task_path')
    task_checkpoint = path.getpath('runner_checkpoint')
    if not os.path.exists(task_json):
      return None
    task = ThermosConfigLoader.load_json(task_json)
    if task is None:
      return None
    if len(task.tasks()) == 0:
      return None
    try:
      checkpoint = CheckpointDispatcher.from_file(task_checkpoint)
      if checkpoint is None or checkpoint.header is None:
        return None
      return cls(task.tasks()[0].task(), checkpoint_root, checkpoint.header.sandbox,
                 log_dir=checkpoint.header.log_dir, task_id=task_id,
                 portmap=checkpoint.header.ports, hostname=checkpoint.header.hostname)
    except Exception as e:
      log.error('Failed to reconstitute checkpoint in TaskRunner.get: %s', e, exc_info=True)
      return None

  def __init__(self, task, checkpoint_root, sandbox, log_dir=None,
               task_id=None, portmap=None, user=None, chroot=False, clock=time,
               universal_handler=None, planner_class=TaskPlanner, hostname=None,
               process_logger_destination=None, process_logger_mode=None,
               rotate_log_size_mb=None, rotate_log_backups=None,
               preserve_env=False, mesos_containerizer_path=None, container_sandbox=None):
    """
      required:
        task (config.Task) = the task to run
        checkpoint_root (path) = the checkpoint root
        sandbox (path) = the sandbox in which the task will be run
                         [if None, cwd will be assumed, but garbage collection will be
                          disabled for this task.]

      optional:
        log_dir (string)  = directory to house stdout/stderr logs. If not specified, logs will be
                            written into the sandbox directory under .logs/
        task_id (string)  = bind to this task id.  if not specified, will synthesize an id based
                            upon task.name()
        portmap (dict)    = a map (string => integer) from name to port, e.g. { 'http': 80 }
        user (string)     = the user to run the task as.  if not current user, requires setuid
                            privileges.
        chroot (boolean)  = whether or not to chroot into the sandbox prior to exec.
        clock (time interface) = the clock to use throughout
        universal_handler = checkpoint record handler (only used for testing)
        planner_class (TaskPlanner class) = TaskPlanner class to use for constructing the task
                            planning policy.
        process_logger_destination (string) = The destination of logger to use for all processes.
        process_logger_mode (string) = The mode of logger to use for all processes.
        rotate_log_size_mb (integer) = The maximum size of the rotated stdout/stderr logs in MiB.
        rotate_log_backups (integer) = The maximum number of rotated stdout/stderr log backups.
        preserve_env (boolean) = whether or not env variables for the runner should be in the
                                 env for the task being run
        mesos_containerizer_path = the path to the mesos-containerizer executable that will be used
                                   to isolate the task's filesystem (if using a filesystem image).
        container_sandbox = the path within the isolated filesystem where the task's sandbox is
                            mounted.
    """
    if not issubclass(planner_class, TaskPlanner):
      raise TypeError('planner_class must be a TaskPlanner.')
    self._clock = clock
    launch_time = self._clock.time()
    launch_time_ms = '%06d' % int((launch_time - int(launch_time)) * (10 ** 6))
    if not task_id:
      self._task_id = '%s-%s.%s' % (task.name(),
                                    time.strftime('%Y%m%d-%H%M%S', time.localtime(launch_time)),
                                    launch_time_ms)
    else:
      self._task_id = task_id
    current_user = TaskRunnerHelper.get_actual_user()
    self._user = user or current_user
    # TODO(wickman) This should be delegated to the ProcessPlatform / Helper
    if self._user != current_user:
      if os.geteuid() != 0:
        raise ValueError('task specifies user as %s, but %s does not have setuid permission!' % (
          self._user, current_user))
    self._portmap = portmap or {}
    self._launch_time = launch_time
    self._log_dir = log_dir or os.path.join(sandbox, '.logs')
    self._process_logger_destination = process_logger_destination
    self._process_logger_mode = process_logger_mode
    self._rotate_log_size_mb = rotate_log_size_mb
    self._rotate_log_backups = rotate_log_backups
    self._pathspec = TaskPath(root=checkpoint_root, task_id=self._task_id, log_dir=self._log_dir)
    self._hostname = hostname or socket.gethostname()
    try:
      ThermosTaskValidator.assert_valid_task(task)
      ThermosTaskValidator.assert_valid_ports(task, self._portmap)
    except ThermosTaskValidator.InvalidTaskError as e:
      raise self.InvalidTask('Invalid task: %s' % e)
    context = ThermosContext(
        task_id=self._task_id,
        ports=self._portmap,
        user=self._user)
    self._task, uninterp = (task % Environment(thermos=context)).interpolate()
    if len(uninterp) > 0:
      raise self.InvalidTask('Failed to interpolate task, missing: %s' %
          ', '.join(str(ref) for ref in uninterp))
    try:
      ThermosTaskValidator.assert_same_task(self._pathspec, self._task)
    except ThermosTaskValidator.InvalidTaskError as e:
      raise self.InvalidTask('Invalid task: %s' % e)
    self._plan = None  # plan currently being executed (updated by Handlers)
    self._regular_plan = planner_class(self._task, clock=clock,
        process_filter=lambda proc: proc.final().get() is False)
    self._finalizing_plan = planner_class(self._task, clock=clock,
        process_filter=lambda proc: proc.final().get() is True)
    self._chroot = chroot
    self._sandbox = sandbox
    self._container_sandbox = container_sandbox
    self._terminal_state = None
    self._ckpt = None
    self._process_map = dict((p.name().get(), p) for p in self._task.processes())
    self._task_processes = {}
    self._stages = dict((state, stage(self)) for state, stage in self.STAGES.items())
    self._finalization_start = None
    self._preemption_deadline = None
    self._watcher = ProcessMuxer(self._pathspec)
    self._state = RunnerState(processes={})
    self._preserve_env = preserve_env
    self._mesos_containerizer_path = mesos_containerizer_path

    # create runner state
    universal_handler = universal_handler or TaskRunnerUniversalHandler
    self._dispatcher = CheckpointDispatcher()
    self._dispatcher.register_handler(universal_handler(self))
    self._dispatcher.register_handler(TaskRunnerProcessHandler(self))
    self._dispatcher.register_handler(TaskRunnerTaskHandler(self))

    # recover checkpointed runner state and update plan
    self._recovery = True
    self._replay_runner_ckpt()

  @property
  def task(self):
    return self._task

  @property
  def task_id(self):
    return self._task_id

  @property
  def state(self):
    return self._state

  @property
  def processes(self):
    return self._task_processes

  def task_state(self):
    return self._state.statuses[-1].state if self._state.statuses else TaskState.ACTIVE

  def close_ckpt(self):
    """Force close the checkpoint stream.  This is necessary for runners terminated through
       exception propagation."""
    log.debug('Closing the checkpoint stream.')
    self._ckpt.close()

  @contextmanager
  def control(self, force=False):
    """
      Bind to the checkpoint associated with this task, position to the end of the log if
      it exists, or create it if it doesn't.  Fails if we cannot get "leadership" i.e. a
      file lock on the checkpoint stream.
    """
    if self.is_terminal():
      raise self.StateError('Cannot take control of a task in terminal state.')
    if self._sandbox:
      safe_mkdir(self._sandbox)
    ckpt_file = self._pathspec.getpath('runner_checkpoint')
    try:
      self._ckpt = TaskRunnerHelper.open_checkpoint(ckpt_file, force=force, state=self._state)
    except TaskRunnerHelper.PermissionError:
      raise self.PermissionError('Unable to open checkpoint %s' % ckpt_file)
    log.debug('Flipping recovery mode off.')
    self._recovery = False
    self._set_task_status(self.task_state())
    self._resume_task()
    try:
      yield
    except Exception as e:
      log.error('Caught exception in self.control(): %s', e)
      log.error('  %s', traceback.format_exc())
    self._ckpt.close()

  def _resume_task(self):
    assert self._ckpt is not None
    unapplied_updates = self._replay_process_ckpts()
    if self.is_terminal():
      raise self.StateError('Cannot resume terminal task.')
    self._initialize_ckpt_header()
    self._replay(unapplied_updates)

  def _ckpt_write(self, record):
    """
      Write to the checkpoint stream if we're not in recovery mode.
    """
    if not self._recovery:
      self._ckpt.write(record)

  def _replay(self, checkpoints):
    """
      Replay a sequence of RunnerCkpts.
    """
    for checkpoint in checkpoints:
      self._dispatcher.dispatch(self._state, checkpoint)

  def _replay_runner_ckpt(self):
    """
      Replay the checkpoint stream associated with this task.
    """
    ckpt_file = self._pathspec.getpath('runner_checkpoint')
    if os.path.exists(ckpt_file):
      with open(ckpt_file, 'r') as fp:
        ckpt_recover = ThriftRecordReader(fp, RunnerCkpt)
        for record in ckpt_recover:
          log.debug('Replaying runner checkpoint record: %s', record)
          self._dispatcher.dispatch(self._state, record, recovery=True)

  def _replay_process_ckpts(self):
    """
      Replay the unmutating process checkpoints.  Return the unapplied process updates that
      would mutate the runner checkpoint stream.
    """
    process_updates = self._watcher.select()
    unapplied_process_updates = []
    for process_update in process_updates:
      if self._dispatcher.would_update(self._state, process_update):
        unapplied_process_updates.append(process_update)
      else:
        self._dispatcher.dispatch(self._state, process_update, recovery=True)
    return unapplied_process_updates

  def _initialize_ckpt_header(self):
    """
      Initializes the RunnerHeader for this checkpoint stream if it has not already
      been constructed.
    """
    if self._state.header is None:
      try:
        uid = pwd.getpwnam(self._user).pw_uid
      except KeyError:
        # This will cause failures downstream, but they will at least be correctly
        # reflected in the process state.
        log.error('Unknown user %s.', self._user)
        uid = None

      header = RunnerHeader(
          task_id=self._task_id,
          launch_time_ms=int(self._launch_time * 1000),
          sandbox=self._sandbox,
          log_dir=self._log_dir,
          hostname=self._hostname,
          user=self._user,
          uid=uid,
          ports=self._portmap)
      runner_ckpt = RunnerCkpt(runner_header=header)
      self._dispatcher.dispatch(self._state, runner_ckpt)

  def _set_task_status(self, state):
    update = TaskStatus(state=state, timestamp_ms=int(self._clock.time() * 1000),
                        runner_pid=os.getpid(), runner_uid=os.getuid())
    runner_ckpt = RunnerCkpt(task_status=update)
    self._dispatcher.dispatch(self._state, runner_ckpt, self._recovery)

  def _finalization_remaining(self):
    # If a preemption deadline has been set, use that.
    if self._preemption_deadline:
      return max(0, self._preemption_deadline - self._clock.time())

    # Otherwise, use the finalization wait provided in the configuration.
    finalization_allocation = self.task.finalization_wait().get()
    if self._finalization_start is None:
      return sys.float_info.max
    else:
      waited = max(0, self._clock.time() - self._finalization_start)
      return max(0, finalization_allocation - waited)

  def _set_process_status(self, process_name, process_state, **kw):
    if 'sequence_number' in kw:
      sequence_number = kw.pop('sequence_number')
      log.debug('_set_process_status(%s <= %s, seq=%s[force])', process_name,
        ProcessState._VALUES_TO_NAMES.get(process_state), sequence_number)
    else:
      current_run = self._current_process_run(process_name)
      if not current_run:
        assert process_state == ProcessState.WAITING
        sequence_number = 0
      else:
        sequence_number = current_run.seq + 1
      log.debug('_set_process_status(%s <= %s, seq=%s[auto])', process_name,
        ProcessState._VALUES_TO_NAMES.get(process_state), sequence_number)
    runner_ckpt = RunnerCkpt(process_status=ProcessStatus(
        process=process_name, state=process_state, seq=sequence_number, **kw))
    self._dispatcher.dispatch(self._state, runner_ckpt, self._recovery)

  def _task_process_from_process_name(self, process_name, sequence_number):
    """
      Construct a Process() object from a process_name, populated with its
      correct run number and fully interpolated commandline.
    """
    run_number = len(self.state.processes[process_name]) - 1
    pathspec = self._pathspec.given(process=process_name, run=run_number)
    process = self._process_map.get(process_name)
    if process is None:
      raise self.InternalError('FATAL: Could not find process: %s' % process_name)
    def close_ckpt_and_fork():
      pid = os.fork()
      if pid == 0 and self._ckpt is not None:
        self._ckpt.close()
      return pid

    (logger_destination,
     logger_mode,
     rotate_log_size,
     rotate_log_backups) = self._build_process_logger_args(process)

    return Process(
      process.name().get(),
      process.cmdline().get(),
      sequence_number,
      pathspec,
      self._sandbox,
      self._user,
      chroot=self._chroot,
      fork=close_ckpt_and_fork,
      logger_destination=logger_destination,
      logger_mode=logger_mode,
      rotate_log_size=rotate_log_size,
      rotate_log_backups=rotate_log_backups,
      preserve_env=self._preserve_env,
      mesos_containerizer_path=self._mesos_containerizer_path,
      container_sandbox=self._container_sandbox)

  _DEFAULT_LOGGER = Logger()
  _DEFAULT_ROTATION = RotatePolicy()

  def _build_process_logger_args(self, process):
    """
      Build the appropriate logging configuration based on flags + process
      configuration settings.

      If no configuration (neither flags nor process config), default to
      "standard" mode.
    """

    destination, mode, size, backups = (self._DEFAULT_LOGGER.destination().get(),
                                        self._DEFAULT_LOGGER.mode().get(),
                                        None,
                                        None)

    logger = process.logger()
    if logger is Empty:
      if self._process_logger_destination:
        destination = self._process_logger_destination
      if self._process_logger_mode:
        mode = self._process_logger_mode
    else:
      destination = logger.destination().get()
      mode = logger.mode().get()

    if mode == LoggerMode.ROTATE:
      size = Amount(self._DEFAULT_ROTATION.log_size().get(), Data.BYTES)
      backups = self._DEFAULT_ROTATION.backups().get()
      if logger is Empty:
        if self._rotate_log_size_mb:
          size = Amount(self._rotate_log_size_mb, Data.MB)
        if self._rotate_log_backups:
          backups = self._rotate_log_backups
      else:
        rotate = logger.rotate()
        if rotate is not Empty:
          size = Amount(rotate.log_size().get(), Data.BYTES)
          backups = rotate.backups().get()

    return destination, mode, size, backups

  def deadlocked(self, plan=None):
    """Check whether a plan is deadlocked, i.e. there are no running/runnable processes, and the
    plan is not complete."""
    plan = plan or self._regular_plan
    now = self._clock.time()
    running = list(plan.running)
    runnable = list(plan.runnable_at(now))
    waiting = list(plan.waiting_at(now))
    log.debug('running:%d runnable:%d waiting:%d complete:%s',
      len(running), len(runnable), len(waiting), plan.is_complete())
    return len(running + runnable + waiting) == 0 and not plan.is_complete()

  def is_healthy(self):
    """Check whether the TaskRunner is healthy. A healthy TaskRunner is not deadlocked and has not
    reached its max_failures count."""
    max_failures = self._task.max_failures().get()
    deadlocked = self.deadlocked()
    under_failure_limit = max_failures == 0 or len(self._regular_plan.failed) < max_failures
    log.debug('max_failures:%d failed:%d under_failure_limit:%s deadlocked:%s ==> health:%s',
      max_failures, len(self._regular_plan.failed), under_failure_limit, deadlocked,
      not deadlocked and under_failure_limit)
    return not deadlocked and under_failure_limit

  def _current_process_run(self, process_name):
    if process_name not in self._state.processes or len(self._state.processes[process_name]) == 0:
      return None
    return self._state.processes[process_name][-1]

  def is_process_lost(self, process_name):
    """Determine whether or not we should mark a task as LOST and do so if necessary."""
    current_run = self._current_process_run(process_name)
    if not current_run:
      raise self.InternalError('No current_run for process %s!' % process_name)

    def forked_but_never_came_up():
      return current_run.state == ProcessState.FORKED and (
        self._clock.time() - current_run.fork_time > self.LOST_TIMEOUT.as_(Time.SECONDS))

    def running_but_coordinator_died():
      if current_run.state != ProcessState.RUNNING:
        return False
      coordinator_pid, _, _ = TaskRunnerHelper.scan_process(self.state, process_name)
      if coordinator_pid is not None:
        return False
      elif self._watcher.has_data(process_name):
        return False
      return True

    if forked_but_never_came_up() or running_but_coordinator_died():
      log.info('Detected a LOST task: %s', current_run)
      log.debug('  forked_but_never_came_up: %s', forked_but_never_came_up())
      log.debug('  running_but_coordinator_died: %s', running_but_coordinator_died())
      return True

    return False

  def _run_plan(self, plan):
    log.debug('Schedule pass:')

    running = list(plan.running)
    log.debug('running: %s', ' '.join(plan.running))
    log.debug('finished: %s', ' '.join(plan.finished))

    launched = []
    for process_name in plan.running:
      if self.is_process_lost(process_name):
        self._set_process_status(process_name, ProcessState.LOST)

    now = self._clock.time()
    runnable = list(plan.runnable_at(now))
    waiting = list(plan.waiting_at(now))
    log.debug('runnable: %s', ' '.join(runnable))
    log.debug('waiting: %s', ' '.join(
        '%s[T-%.1fs]' % (process, plan.get_wait(process)) for process in waiting))

    def pick_processes(process_list):
      if self._task.max_concurrency().get() == 0:
        return process_list
      num_to_pick = max(self._task.max_concurrency().get() - len(running), 0)
      return process_list[:num_to_pick]

    for process_name in pick_processes(runnable):
      tp = self._task_processes.get(process_name)
      if tp:
        current_run = self._current_process_run(process_name)
        assert current_run.state == ProcessState.WAITING
      else:
        self._set_process_status(process_name, ProcessState.WAITING)
        tp = self._task_processes[process_name]
      log.info('Forking Process(%s)', process_name)
      try:
        tp.start()
        launched.append(tp)
      except Process.Error as e:
        log.error('Failed to launch process: %s', e)
        self._set_process_status(process_name, ProcessState.FAILED)

    return len(launched) > 0

  def _terminate_plan(self, plan):
    TaskRunnerHelper.terminate_orphans(self.state)

    for process in plan.running:
      last_run = self._current_process_run(process)
      if last_run and last_run.state in (ProcessState.FORKED, ProcessState.RUNNING):
        TaskRunnerHelper.terminate_process(self.state, process)

  def has_running_processes(self):
    """
      Returns True if any processes associated with this task have active pids.
    """
    process_tree = TaskRunnerHelper.scan_tree(self.state)
    return any(any(process_set) for process_set in process_tree.values())

  def has_active_processes(self):
    """
      Returns True if any processes are in non-terminal states.
    """
    return any(not TaskRunnerHelper.is_process_terminal(run.state) for run in
        filter(None, (self._current_process_run(process) for process in self.state.processes)))

  def collect_updates(self, timeout=None):
    """
      Collects and applies updates from process checkpoint streams.  Returns the number
      of applied process checkpoints.
    """
    if not self.has_active_processes():
      return 0

    sleep_interval = self.COORDINATOR_INTERVAL_SLEEP.as_(Time.SECONDS)
    total_time = 0.0

    while True:
      process_updates = self._watcher.select()
      for process_update in process_updates:
        self._dispatcher.dispatch(self._state, process_update, self._recovery)
      if process_updates:
        return len(process_updates)
      if timeout is not None and total_time >= timeout:
        return 0
      total_time += sleep_interval
      self._clock.sleep(sleep_interval)

  def is_terminal(self):
    return TaskRunnerHelper.is_task_terminal(self.task_state())

  def terminal_state(self):
    if self._terminal_state:
      log.debug('Forced terminal state: %s' %
          TaskState._VALUES_TO_NAMES.get(self._terminal_state, 'UNKNOWN'))
      return self._terminal_state
    else:
      return TaskState.SUCCESS if self.is_healthy() else TaskState.FAILED

  def run(self, force=False):
    """
      Entrypoint to runner. Assume control of checkpoint stream, and execute TaskRunnerStages
      until runner is terminal.
    """
    if self.is_terminal():
      return
    with self.control(force):
      self._run()

  def _run(self):
    while not self.is_terminal():
      start = self._clock.time()
      # step 1: execute stage corresponding to the state we're currently in
      runner = self._stages[self.task_state()]
      iteration_wait = runner.run()
      if iteration_wait is None:
        log.debug('Run loop: No more work to be done in state %s' %
            TaskState._VALUES_TO_NAMES.get(self.task_state(), 'UNKNOWN'))
        self._set_task_status(runner.transition_to())
        continue
      log.debug('Run loop: Work to be done within %.1fs', iteration_wait)
      # step 2: check child process checkpoint streams for updates
      if not self.collect_updates(iteration_wait):
        # If we don't collect any updates, at least 'touch' the checkpoint stream
        # so as to prevent garbage collection.
        elapsed = self._clock.time() - start
        if elapsed < iteration_wait:
          log.debug('Update collection only took %.1fs, idling %.1fs',
              elapsed, iteration_wait - elapsed)
          self._clock.sleep(iteration_wait - elapsed)
        log.debug('Run loop: No updates collected, touching checkpoint.')
        os.utime(self._pathspec.getpath('runner_checkpoint'), None)
      # step 3: reap any zombie child processes
      TaskRunnerHelper.reap_children()

  def kill(self, force=False, terminal_status=TaskState.KILLED,
           preemption_wait=Amount(1, Time.MINUTES)):
    """
      Kill all processes associated with this task and set task/process states as terminal_status
      (defaults to KILLED)
    """
    log.debug('Runner issued kill: force:%s, preemption_wait:%s',
      force, preemption_wait)
    assert terminal_status in (TaskState.KILLED, TaskState.LOST)
    self._preemption_deadline = self._clock.time() + preemption_wait.as_(Time.SECONDS)
    with self.control(force):
      if self.is_terminal():
        log.warning('Task is not in ACTIVE state, cannot issue kill.')
        return
      self._terminal_state = terminal_status
      if self.task_state() == TaskState.ACTIVE:
        self._set_task_status(TaskState.CLEANING)
      self._run()

  def lose(self, force=False):
    """
      Mark a task as LOST and kill any straggling processes.
    """
    self.kill(force, preemption_wait=Amount(0, Time.SECONDS), terminal_status=TaskState.LOST)

  def _kill(self):
    processes = TaskRunnerHelper.scan_tree(self._state)
    for process, pid_tuple in processes.items():
      current_run = self._current_process_run(process)
      coordinator_pid, pid, tree = pid_tuple
      if TaskRunnerHelper.is_process_terminal(current_run.state):
        if coordinator_pid or pid or tree:
          log.warning('Terminal process (%s) still has running pids:', process)
          log.warning('  coordinator_pid: %s', coordinator_pid)
          log.warning('              pid: %s', pid)
          log.warning('             tree: %s', tree)
        TaskRunnerHelper.kill_process(self.state, process)
      else:
        if coordinator_pid or pid or tree:
          log.info('Transitioning %s to KILLED', process)
          self._set_process_status(process, ProcessState.KILLED,
            stop_time=self._clock.time(), return_code=-1)
        else:
          log.info('Transitioning %s to LOST', process)
          if current_run.state != ProcessState.WAITING:
            self._set_process_status(process, ProcessState.LOST)
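A lifecycle sketch tying the above together; the checkpoint root and task_id are placeholders. TaskRunner.get() reattaches to an existing task's checkpoint, run() drives the task to a terminal state, and kill() tears it down instead.

runner = TaskRunner.get('my_task_id', '/var/run/thermos')
if runner is None:
  print('No active task / readable checkpoint found for this task_id.')
else:
  runner.run()                                    # execute stages until terminal
  print('Final state: %s' % runner.task_state())
  # To tear the task down rather than let it finish:
  # runner.kill(terminal_status=TaskState.KILLED)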
Example #50
0
 def _runner_ckpt(self, task_id):
     """Return the runner checkpoint file for a given task_id."""
     return TaskPath(root=self._checkpoint_root,
                     task_id=task_id).getpath('runner_checkpoint')
Example #51
0
class ObservedTask(AbstractClass):
  """ Represents a Task being observed """

  @classmethod
  def safe_mtime(cls, path):
    try:
      return os.path.getmtime(path)
    except OSError:
      return None

  def __init__(self, root, task_id):
    self._root = root
    self._task_id = task_id
    self._pathspec = TaskPath(root=self._root, task_id=self._task_id)
    self._mtime = self._get_mtime()

  @property
  def root(self):
    return self._root

  @abstractproperty
  def type(self):
    """Indicates the type of task (active or finished)"""

  def _read_task(self, memoized={}):
    """Read the corresponding task from disk and return a ThermosTask.  Memoizes already-read tasks.
    """
    if self._task_id not in memoized:
      path = self._pathspec.given(state=self.type).getpath('task_path')
      if os.path.exists(path):
        task = ThermosTaskWrapper.from_file(path)
        if task is None:
          log.error('Error reading ThermosTask from %s in observer.' % path)
        else:
          context = self.context(self._task_id)
          if not context:
            log.warning('Task not yet available: %s' % self._task_id)
          task = task.task() % Environment(thermos=context)
          memoized[self._task_id] = task

    return memoized.get(self._task_id, None)

  def _get_mtime(self):
    """Retrieve the mtime of the task's state directory"""
    get_path = lambda state: self._pathspec.given(state=state).getpath('task_path')
    mtime = self.safe_mtime(get_path('active'))
    if mtime is None:
      mtime = self.safe_mtime(get_path('finished'))
    if mtime is None:
      log.error("Couldn't get mtime for task %s!" % self._task_id)
    return mtime

  def context(self, task_id):
    state = self.state
    if state.header is None:
      return None
    return ThermosContext(
      ports=state.header.ports if state.header.ports else {},
      task_id=state.header.task_id,
      user=state.header.user,
    )

  @property
  def task(self):
    """Return a ThermosTask representing this task"""
    return self._read_task()

  @property
  def task_id(self):
    """Return the task's task_id"""
    return self._task_id

  @property
  def mtime(self):
    """Return mtime of task file"""
    return self._mtime

  @abstractproperty
  def state(self):
    """Return state of task (gen.apache.thermos.ttypes.RunnerState)"""
Example #52
0
 def __init__(self, checkpoint_root):
   self._path = TaskPath(root=checkpoint_root)
class TaskDetector(object):
  """
    Helper class in front of TaskPath to detect active/finished/running tasks. Performs no
    introspection on the state of a task; merely detects based on file paths on disk.
  """
  class MatchingError(Exception): pass

  def __init__(self, root):
    self._root_dir = root
    self._pathspec = TaskPath()

  def get_task_ids(self, state=None):
    paths = glob.glob(self._pathspec.given(root=self._root_dir,
                                           task_id="*",
                                           state=state or '*')
                                    .getpath('task_path'))
    path_re = re.compile(self._pathspec.given(root=re.escape(self._root_dir),
                                              task_id="(\S+)",
                                              state='(\S+)')
                                       .getpath('task_path'))
    for path in paths:
      try:
        task_state, task_id = path_re.match(path).groups()
      except:
        continue
      if state is None or task_state == state:
        yield (task_state, task_id)

  def get_process_runs(self, task_id, log_dir):
    paths = glob.glob(self._pathspec.given(root=self._root_dir,
                                           task_id=task_id,
                                           log_dir=log_dir,
                                           process='*',
                                           run='*')
                                    .getpath('process_logdir'))
    path_re = re.compile(self._pathspec.given(root=re.escape(self._root_dir),
                                              task_id=re.escape(task_id),
                                              log_dir=log_dir,
                                              process=r'(\S+)',
                                              run=r'(\d+)')
                                       .getpath('process_logdir'))
    for path in paths:
      try:
        process, run = path_re.match(path).groups()
      except:
        continue
      yield process, int(run)

  def get_process_logs(self, task_id, log_dir):
    for process, run in self.get_process_runs(task_id, log_dir):
      for logtype in ('stdout', 'stderr'):
        path = (self._pathspec.with_filename(logtype).given(root=self._root_dir,
                                                           task_id=task_id,
                                                           log_dir=log_dir,
                                                           process=process,
                                                           run=run)
                                                     .getpath('process_logdir'))
        if os.path.exists(path):
          yield path

  def get_checkpoint(self, task_id):
    return self._pathspec.given(root=self._root_dir, task_id=task_id).getpath('runner_checkpoint')

  def get_process_checkpoints(self, task_id):
    matching_paths = glob.glob(self._pathspec.given(root=self._root_dir,
                                                    task_id=task_id,
                                                    process='*')
                                             .getpath('process_checkpoint'))
    path_re = re.compile(self._pathspec.given(root=re.escape(self._root_dir),
                                              task_id=re.escape(task_id),
                                              process=r'(\S+)')
                                       .getpath('process_checkpoint'))
    for path in matching_paths:
      try:
        process, = path_re.match(path).groups()
      except:
        continue
      yield path
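A usage sketch for TaskDetector; the root path is a placeholder. Log discovery needs each task's log_dir, which normally comes from the runner header of its checkpoint stream.

detector = TaskDetector(root='/var/run/thermos')
for task_state, task_id in detector.get_task_ids():
  ckpt_path = detector.get_checkpoint(task_id)
  print('%s task %s, checkpoint at %s' % (task_state, task_id, ckpt_path))
  checkpoint = CheckpointDispatcher.from_file(ckpt_path)
  if checkpoint and checkpoint.header:
    for log_path in detector.get_process_logs(task_id, checkpoint.header.log_dir):
      print('  log: %s' % log_path)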