Example 1
    def test_basic_as_job(self):
        proxy_driver = ProxyDriver()

        with temporary_dir() as tempdir:
            te = AuroraExecutor(runner_provider=make_provider(tempdir),
                                sandbox_provider=DefaultTestSandboxProvider())
            te.launchTask(proxy_driver,
                          make_task(MESOS_JOB(task=HELLO_WORLD), instanceId=0))
            te.runner_started.wait()
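            # Poll until the executor has created its status manager.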
            while te._status_manager is None:
                time.sleep(0.1)
            te.terminated.wait()
            tm = TaskMonitor(TaskPath(root=tempdir),
                             task_id=HELLO_WORLD_TASK_ID)
            runner_state = tm.get_state()

        assert 'hello_world_hello_world-001' in runner_state.processes, (
            'Could not find processes, got: %s' %
            ' '.join(runner_state.processes))
        updates = proxy_driver.method_calls['sendStatusUpdate']
        assert len(updates) == 3
        status_updates = [arg_tuple[0][0] for arg_tuple in updates]
        assert status_updates[0].state == mesos_pb2.TASK_STARTING
        assert status_updates[1].state == mesos_pb2.TASK_RUNNING
        assert status_updates[2].state == mesos_pb2.TASK_FINISHED
Example 2
    def test_process_kill(self):
        runner = self.start_runner()
        tm = TaskMonitor(runner.tempdir, runner.task_id)
        self.wait_until_running(tm)

        process_state, run_number = tm.get_active_processes()[0]
        assert process_state.process == 'process'
        assert run_number == 0
        os.kill(process_state.pid, signal.SIGKILL)

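        # Poll until the runner has finished and exposed its final state.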
        while True:
            if not hasattr(runner, 'state'):
                time.sleep(0.1)
            else:
                break

        assert runner.state.statuses[-1].state == TaskState.SUCCESS
        assert 'process' in runner.state.processes
        assert len(runner.state.processes['process']) == 2
        assert runner.state.processes['process'][0].state == ProcessState.KILLED
        assert runner.state.processes['process'][0].return_code == -signal.SIGKILL
        assert runner.state.processes['process'][1].state == ProcessState.SUCCESS
Example 3
 def add_active_task(self, task_id):
     if task_id in self.finished_tasks:
         log.error("Found an active task (%s) in finished tasks?" % task_id)
         return
     task_monitor = TaskMonitor(self._pathspec, task_id)
     if not task_monitor.get_state().header:
         log.info('Unable to load task "%s"' % task_id)
         return
     sandbox = task_monitor.get_state().header.sandbox
     resource_monitor = self._resource_monitor(task_monitor, sandbox)
     resource_monitor.start()
     self._active_tasks[task_id] = ActiveObservedTask(
         task_id=task_id, pathspec=self._pathspec, task_monitor=task_monitor, resource_monitor=resource_monitor
     )
Example 4
 def add_active_task(self, task_id):
   if task_id in self.finished_tasks:
     log.error('Found an active task (%s) in finished tasks?' % task_id)
     return
   task_monitor = TaskMonitor(self._pathspec, task_id)
   if not task_monitor.get_state().header:
     log.info('Unable to load task "%s"' % task_id)
     return
   sandbox = task_monitor.get_state().header.sandbox
   resource_monitor = self._resource_monitor(task_monitor, sandbox)
   resource_monitor.start()
   self._active_tasks[task_id] = ActiveObservedTask(
     task_id=task_id, pathspec=self._pathspec,
     task_monitor=task_monitor, resource_monitor=resource_monitor
   )
Example 5
    def __on_active(self, root, task_id):
        log.debug('on_active(%r, %r)', root, task_id)
        if task_id in self.finished_tasks:
            log.error('Found an active task (%s) in finished tasks?', task_id)
            return
        task_monitor = TaskMonitor(root, task_id)

        if self._disable_task_resource_collection:
            resource_monitor = NullTaskResourceMonitor()
        else:
            disk_collector_provider = DiskCollectorProvider(
                self._enable_mesos_disk_collector,
                self._disk_collector_settings)

            resource_monitor = TaskResourceMonitor(
                task_id,
                task_monitor,
                disk_collector_provider=disk_collector_provider,
                process_collection_interval=self._task_process_collection_interval,
                disk_collection_interval=self._disk_collector_settings.disk_collection_interval)

        resource_monitor.start()
        self._active_tasks[task_id] = ActiveObservedTask(
            root, task_id, task_monitor, resource_monitor)
Example 6
  def test_sample_by_process_from_history(self, mock_get_active_processes):

    fake_process_name_1 = 'fake-process-name-1'
    fake_process_name_2 = 'fake-process-name-2'
    task_path = '.'
    task_monitor = TaskMonitor(task_path, 'fake-task-id')
    fake_process_status_1 = ProcessStatus(process=fake_process_name_1)
    fake_process_status_2 = ProcessStatus(process=fake_process_name_2)
    mock_get_active_processes.return_value = [(fake_process_status_1, 1),
                                              (fake_process_status_2, 2)]

    fake_history = ResourceHistory(2)
    fake_history.add(time(), ResourceMonitorBase.FullResourceResult(
        {fake_process_status_1: ResourceMonitorBase.ProcResourceResult(ProcessSample.empty(), 1),
         fake_process_status_2: ResourceMonitorBase.ProcResourceResult(ProcessSample.empty(), 2),
         }, 10))

    task_resource_monitor = TaskResourceMonitor('fake-task-id', task_monitor,
        history_provider=self.FakeResourceHistoryProvider(fake_history))

    assert task_resource_monitor.name == 'TaskResourceMonitor[fake-task-id]'
    assert task_resource_monitor.sample_by_process(fake_process_name_1) == ProcessSample.empty()
    assert task_resource_monitor.sample_by_process(fake_process_name_2) == ProcessSample.empty()

    _, sample = task_resource_monitor.sample()
    assert sample.num_procs == 3  # 1 pid in fake_process_status_1 and 2 in fake_process_status_2
    assert sample.process_sample == ProcessSample.empty()
    assert sample.disk_usage == 10
    assert mock_get_active_processes.mock_calls == [mock.call(task_monitor),
        mock.call(task_monitor)]
Example 7
 def from_assigned_task(self, assigned_task, sandbox):
     task_id = assigned_task.taskId
     resources = mesos_task_instance_from_assigned_task(
         assigned_task).task().resources()
     task_monitor = TaskMonitor(self._checkpoint_root, task_id)
     resource_monitor = TaskResourceMonitor(
         task_id, task_monitor, **self._resource_monitor_options)
     return ResourceManager(resources, resource_monitor)
Example 8
def tail(args, options):
  """Tail the logs of a task process.

    Usage: thermos tail task_name [process_name]
  """
  if len(args) == 0:
    app.error('Expected a task to tail, got nothing!')
  if len(args) not in (1, 2):
    app.error('Expected at most two arguments (task and optional process), got %d' % len(args))

  task_id = args[0]
  detector = TaskDetector(root=options.root)
  checkpoint = CheckpointDispatcher.from_file(detector.get_checkpoint(task_id))
  log_dir = checkpoint.header.log_dir
  process_runs = [(process, run) for (process, run) in detector.get_process_runs(task_id, log_dir)]
  if len(args) == 2:
    process_runs = [(process, run) for (process, run) in process_runs if process == args[1]]

  if len(process_runs) == 0:
    print('ERROR: No processes found.', file=sys.stderr)
    sys.exit(1)

  processes = set([process for process, _ in process_runs])
  if len(processes) != 1:
    print('ERROR: More than one process matches query.', file=sys.stderr)
    sys.exit(1)

  process = processes.pop()
  run = max([run for _, run in process_runs])

  logdir = TaskPath(root=options.root, task_id=args[0], process=process,
     run=run, log_dir=log_dir).getpath('process_logdir')
  logfile = os.path.join(logdir, 'stderr' if options.use_stderr else 'stdout')

  monitor = TaskMonitor(TaskPath(root=options.root), args[0])
  def log_is_active():
    active_processes = monitor.get_active_processes()
    for process_status, process_run in active_processes:
      if process_status.process == process and process_run == run:
        return True
    return False

  if not log_is_active():
    print('Tail of terminal log %s' % logfile)
    for line in tail_closed(logfile):
      print(line.rstrip())
    return

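  # Log is still active: tail it, re-checking every 5 seconds whether the process is still running.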
  now = time.time()
  next_check = now + 5.0
  print('Tail of active log %s' % logfile)
  for line in tail_f(logfile, include_last=True, forever=False):
    print(line.rstrip())
    if time.time() > next_check:
      if not log_is_active():
        break
      else:
        next_check = time.time() + 5.0
Example 9
    def test_pg_is_killed(self):
        runner = self.start_runner()
        tm = TaskMonitor(runner.tempdir, runner.task_id)
        self.wait_until_running(tm)
        process_state, run_number = tm.get_active_processes()[0]
        assert process_state.process == 'process'
        assert run_number == 0

        child_pidfile = os.path.join(runner.sandbox, runner.task_id,
                                     'child.txt')
        while not os.path.exists(child_pidfile):
            time.sleep(0.1)
        parent_pidfile = os.path.join(runner.sandbox, runner.task_id,
                                      'parent.txt')
        while not os.path.exists(parent_pidfile):
            time.sleep(0.1)
        with open(child_pidfile) as fp:
            child_pid = int(fp.read().rstrip())
        with open(parent_pidfile) as fp:
            parent_pid = int(fp.read().rstrip())

        ps = ProcessProviderFactory.get()
        ps.collect_all()
        assert parent_pid in ps.pids()
        assert child_pid in ps.pids()
        assert child_pid in ps.children_of(parent_pid)

        with open(os.path.join(runner.sandbox, runner.task_id, 'exit.txt'),
                  'w') as fp:
            fp.write('go away!')

        while tm.task_state() is not TaskState.SUCCESS:
            time.sleep(0.1)

        state = tm.get_state()
        assert state.processes['process'][0].state == ProcessState.SUCCESS

        # Another test case may have called setup_child_subreaping(). We therefore have to reap any
        # terminated re-parented child processes to ensure that we don't list already terminated
        # processes (i.e. zombies) in ps.pids() below.
        TaskRunnerHelper.reap_children()

        ps.collect_all()
        assert parent_pid not in ps.pids()
        assert child_pid not in ps.pids()
Example 10
  def test_coordinator_dead_kill(self):
    runner = self.start_runner()
    tm = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(tm)
    process_state, run_number = tm.get_active_processes()[0]
    assert process_state.process == 'ignorant_process'
    assert run_number == 0

    os.kill(runner.po.pid, signal.SIGKILL)
    os.kill(process_state.coordinator_pid, signal.SIGKILL)
    os.kill(process_state.pid, signal.SIGKILL)

    killer = TaskRunner.get(runner.task_id, runner.root)
    assert killer is not None
    killer.kill(force=True)

    state = tm.get_state()
    assert len(state.processes['ignorant_process']) == 1
    assert state.processes['ignorant_process'][0].state == ProcessState.LOST
Example 11
  def test_preemption_wait(self):
    runner = self.start_runner()
    tm = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(tm)
    process_state, run_number = tm.get_active_processes()[0]
    assert process_state.process == 'ignorant_process'
    assert run_number == 0

    preempter = TaskRunner.get(runner.task_id, runner.root)
    assert preempter is not None
    now = time.time()
    preempter.kill(force=True, preemption_wait=Amount(1, Time.SECONDS))
    duration = time.time() - now

    # This is arbitrary, but make sure we finish within half a second of
    # requested preemption wait.
    assert abs(duration - 1.0) < 0.5

    assert preempter.state.statuses[-1].state == TaskState.KILLED
    assert preempter.state.processes['ignorant_process'][-1].state == ProcessState.KILLED
Example 12
    def test_coordinator_dead_kill(self):
        runner = self.start_runner()
        tm = TaskMonitor(runner.tempdir, runner.task_id)
        self.wait_until_running(tm)
        process_state, run_number = tm.get_active_processes()[0]
        assert process_state.process == 'ignorant_process'
        assert run_number == 0

        os.kill(runner.po.pid, signal.SIGKILL)
        os.kill(process_state.coordinator_pid, signal.SIGKILL)
        os.kill(process_state.pid, signal.SIGKILL)

        killer = TaskRunner.get(runner.task_id, runner.root)
        assert killer is not None
        killer.kill(force=True)

        state = tm.get_state()
        assert len(state.processes['ignorant_process']) == 1
        assert state.processes['ignorant_process'][0].state == ProcessState.LOST
Example 13
    def test_basic(self):
        proxy_driver = ProxyDriver()

        with temporary_dir() as tempdir:
            te = AuroraExecutor(runner_provider=make_provider(tempdir), sandbox_provider=DefaultTestSandboxProvider())
            te.launchTask(proxy_driver, make_task(HELLO_WORLD_MTI))
            te.terminated.wait()
            tm = TaskMonitor(tempdir, task_id=HELLO_WORLD_TASK_ID)
            runner_state = tm.get_state()

        assert "hello_world_hello_world-001" in runner_state.processes, "Could not find processes, got: %s" % " ".join(
            runner_state.processes
        )

        updates = proxy_driver.method_calls["sendStatusUpdate"]
        assert len(updates) == 3
        status_updates = [arg_tuple[0][0] for arg_tuple in updates]
        assert status_updates[0].state == mesos_pb2.TASK_STARTING
        assert status_updates[1].state == mesos_pb2.TASK_RUNNING
        assert status_updates[2].state == mesos_pb2.TASK_FINISHED
Example 14
  def test_coordinator_kill(self):
    runner = self.start_runner()
    tm = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(tm)

    process_state, run_number = tm.get_active_processes()[0]
    assert process_state.process == 'process'
    assert run_number == 0
    os.kill(process_state.coordinator_pid, signal.SIGKILL)

    while True:
      if not hasattr(runner, 'state'):
        time.sleep(0.1)
      else:
        break

    assert runner.state.statuses[-1].state == TaskState.SUCCESS
    assert 'process' in runner.state.processes
    assert len(runner.state.processes['process']) == 2
    assert runner.state.processes['process'][0].state == ProcessState.LOST
    assert runner.state.processes['process'][1].state == ProcessState.SUCCESS
Example 15
 def from_assigned_task(self, assigned_task, sandbox):
     task_id = assigned_task.taskId
     resources = mesos_task_instance_from_assigned_task(
         assigned_task).task().resources()
     task_path = TaskPath(root=self._checkpoint_root, task_id=task_id)
     task_monitor = TaskMonitor(task_path, task_id)
     resource_monitor = TaskResourceMonitor(
         task_monitor,
         sandbox.root,
         disk_collector=self._disk_collector,
         disk_collection_interval=self._disk_collection_interval)
     return ResourceManager(resources, resource_monitor)
Example 16
    def test_basic(self):
        proxy_driver = ProxyDriver()

        with temporary_dir() as tempdir:
            te = AuroraExecutor(runner_provider=make_provider(tempdir),
                                sandbox_provider=DefaultTestSandboxProvider())
            te.launchTask(proxy_driver, make_task(HELLO_WORLD_MTI))
            te.terminated.wait()
            tm = TaskMonitor(tempdir, task_id=HELLO_WORLD_TASK_ID)
            runner_state = tm.get_state()

        assert 'hello_world_hello_world-001' in runner_state.processes, (
            'Could not find processes, got: %s' %
            ' '.join(runner_state.processes))

        updates = proxy_driver.method_calls['sendStatusUpdate']
        assert len(updates) == 3
        status_updates = [arg_tuple[0][0] for arg_tuple in updates]
        assert status_updates[0].state == mesos_pb2.TASK_STARTING
        assert status_updates[1].state == mesos_pb2.TASK_RUNNING
        assert status_updates[2].state == mesos_pb2.TASK_FINISHED
Example 17
    def test_preemption_wait(self):
        runner = self.start_runner()
        tm = TaskMonitor(runner.tempdir, runner.task_id)
        self.wait_until_running(tm)
        process_state, run_number = tm.get_active_processes()[0]
        assert process_state.process == 'ignorant_process'
        assert run_number == 0

        preempter = TaskRunner.get(runner.task_id, runner.root)
        assert preempter is not None
        now = time.time()
        preempter.kill(force=True, preemption_wait=Amount(1, Time.SECONDS))
        duration = time.time() - now

        # This is arbitrary, but make sure we finish within half a second of
        # requested preemption wait.
        assert abs(duration - 1.0) < 0.5

        assert preempter.state.statuses[-1].state == TaskState.KILLED
        assert preempter.state.processes['ignorant_process'][-1].state == ProcessState.KILLED
Example 18
  def test_sample_by_process_no_process(self, mock_get_active_processes):
    task_path = '.'

    task_monitor = TaskMonitor(task_path, 'fake-task-id')
    mock_get_active_processes.return_value = []

    task_resource_monitor = TaskResourceMonitor('fake-task-id', task_monitor)

    with self.assertRaises(ValueError):
      task_resource_monitor.sample_by_process('fake-process-name')

    assert mock_get_active_processes.mock_calls == [mock.call(task_monitor)]
Example 19
    def test_pg_is_killed(self):
        runner = self.start_runner()
        tm = TaskMonitor(runner.tempdir, runner.task_id)
        self.wait_until_running(tm)
        process_state, run_number = tm.get_active_processes()[0]
        assert process_state.process == 'process'
        assert run_number == 0

        child_pidfile = os.path.join(runner.sandbox, runner.task_id,
                                     'child.txt')
        while not os.path.exists(child_pidfile):
            time.sleep(0.1)
        parent_pidfile = os.path.join(runner.sandbox, runner.task_id,
                                      'parent.txt')
        while not os.path.exists(parent_pidfile):
            time.sleep(0.1)
        with open(child_pidfile) as fp:
            child_pid = int(fp.read().rstrip())
        with open(parent_pidfile) as fp:
            parent_pid = int(fp.read().rstrip())

        ps = ProcessProviderFactory.get()
        ps.collect_all()
        assert parent_pid in ps.pids()
        assert child_pid in ps.pids()
        assert child_pid in ps.children_of(parent_pid)

        with open(os.path.join(runner.sandbox, runner.task_id, 'exit.txt'),
                  'w') as fp:
            fp.write('go away!')

        while tm.task_state() is not TaskState.SUCCESS:
            time.sleep(0.1)

        state = tm.get_state()
        assert state.processes['process'][0].state == ProcessState.SUCCESS

        ps.collect_all()
        assert parent_pid not in ps.pids()
        assert child_pid not in ps.pids()
Example 20
    def test_process_kill(self):
        runner = self.start_runner()
        tm = TaskMonitor(runner.tempdir, runner.task_id)
        self.wait_until_running(tm)

        process_state, run_number = tm.get_active_processes()[0]
        assert process_state.process == "process"
        assert run_number == 0
        os.kill(process_state.pid, signal.SIGKILL)

        while True:
            if not hasattr(runner, "state"):
                time.sleep(0.1)
            else:
                break

        assert runner.state.statuses[-1].state == TaskState.SUCCESS
        assert "process" in runner.state.processes
        assert len(runner.state.processes["process"]) == 2
        assert runner.state.processes["process"][0].state == ProcessState.KILLED
        assert runner.state.processes["process"][0].return_code == -signal.SIGKILL
        assert runner.state.processes["process"][1].state == ProcessState.SUCCESS
Example 21
  def test_basic_as_job(self):
    proxy_driver = ProxyDriver()

    with temporary_dir() as tempdir:
      te = AuroraExecutor(
          runner_provider=make_provider(tempdir),
          sandbox_provider=DefaultTestSandboxProvider())
      te.launchTask(proxy_driver, make_task(MESOS_JOB(task=HELLO_WORLD), instanceId=0))
      te.runner_started.wait()
      while te._status_manager is None:
        time.sleep(0.1)
      te.terminated.wait()
      tm = TaskMonitor(tempdir, task_id=HELLO_WORLD_TASK_ID)
      runner_state = tm.get_state()

    assert 'hello_world_hello_world-001' in runner_state.processes, (
      'Could not find processes, got: %s' % ' '.join(runner_state.processes))
    updates = proxy_driver.method_calls['sendStatusUpdate']
    assert len(updates) == 3
    status_updates = [arg_tuple[0][0] for arg_tuple in updates]
    assert status_updates[0].state == mesos_pb2.TASK_STARTING
    assert status_updates[1].state == mesos_pb2.TASK_RUNNING
    assert status_updates[2].state == mesos_pb2.TASK_FINISHED
Example 22
 def __on_active(self, root, task_id):
   log.debug('on_active(%r, %r)' % (root, task_id))
   if task_id in self.finished_tasks:
     log.error('Found an active task (%s) in finished tasks?' % task_id)
     return
   task_monitor = TaskMonitor(root, task_id)
   resource_monitor = self._resource_monitor_class(task_id, task_monitor)
   resource_monitor.start()
   self._active_tasks[task_id] = ActiveObservedTask(
       root,
       task_id,
       task_monitor,
       resource_monitor
   )
Example 23
  def test_pg_is_killed(self):
    runner = self.start_runner()
    tm = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(tm)
    process_state, run_number = tm.get_active_processes()[0]
    assert process_state.process == 'process'
    assert run_number == 0

    child_pidfile = os.path.join(runner.sandbox, runner.task_id, 'child.txt')
    while not os.path.exists(child_pidfile):
      time.sleep(0.1)
    parent_pidfile = os.path.join(runner.sandbox, runner.task_id, 'parent.txt')
    while not os.path.exists(parent_pidfile):
      time.sleep(0.1)
    with open(child_pidfile) as fp:
      child_pid = int(fp.read().rstrip())
    with open(parent_pidfile) as fp:
      parent_pid = int(fp.read().rstrip())

    ps = ProcessProviderFactory.get()
    ps.collect_all()
    assert parent_pid in ps.pids()
    assert child_pid in ps.pids()
    assert child_pid in ps.children_of(parent_pid)

    with open(os.path.join(runner.sandbox, runner.task_id, 'exit.txt'), 'w') as fp:
      fp.write('go away!')

    while tm.task_state() is not TaskState.SUCCESS:
      time.sleep(0.1)

    state = tm.get_state()
    assert state.processes['process'][0].state == ProcessState.SUCCESS

    ps.collect_all()
    assert parent_pid not in ps.pids()
    assert child_pid not in ps.pids()
Example 24
  def test_sample_by_process(self, mock_get_active_processes, mock_sample):
    fake_process_name = 'fake-process-name'
    task_path = '.'
    task_monitor = TaskMonitor(task_path, 'fake-task-id')
    fake_process_status = ProcessStatus(process=fake_process_name)
    mock_get_active_processes.return_value = [(fake_process_status, 1)]
    fake_process_sample = ProcessSample.empty()
    mock_sample.return_value = fake_process_sample

    task_resource_monitor = TaskResourceMonitor('fake-task-id', task_monitor)

    assert fake_process_sample == task_resource_monitor.sample_by_process(fake_process_name)
    assert mock_get_active_processes.mock_calls == [mock.call(task_monitor)]
    assert mock_sample.mock_calls == [mock.call(
        task_resource_monitor._process_collectors[fake_process_status])]
Example 25
  def start(self, timeout=MAX_WAIT):
    """Fork the task runner and return once the underlying task is running, up to timeout."""
    self.forking.set()

    self._monitor = TaskMonitor(TaskPath(root=self._checkpoint_root), self._task_id)

    cmdline_args = self._cmdline()
    log.info('Forking off runner with cmdline: %s' % ' '.join(cmdline_args))

    cwd = os.environ.get('MESOS_DIRECTORY')
    try:
      self._popen = subprocess.Popen(cmdline_args, cwd=cwd)
    except OSError as e:
      raise TaskError(e)

    self.forked.set()

    self.wait_start(timeout=timeout)
Example 26
    def test_coordinator_kill(self):
        runner = self.start_runner()
        tm = TaskMonitor(runner.tempdir, runner.task_id)
        self.wait_until_running(tm)
        process_state, run_number = tm.get_active_processes()[0]
        assert process_state.process == 'ignorant_process'
        assert run_number == 0
        os.kill(process_state.coordinator_pid, signal.SIGKILL)

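        # Wait until the process is relaunched as a new run after its coordinator is killed.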
        while True:
            active_procs = tm.get_active_processes()
            if active_procs and active_procs[0][1] > 0:
                break
            time.sleep(0.2)
        self.wait_until_running(tm)

        process_state, run_number = tm.get_active_processes()[0]
        assert process_state.process == 'ignorant_process'
        assert run_number == 1
        os.kill(process_state.pid, signal.SIGKILL)

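        # Wait until the process is relaunched again after being killed directly.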
        while True:
            active_procs = tm.get_active_processes()
            if active_procs and active_procs[0][1] > 1:
                break
            time.sleep(0.2)
        self.wait_until_running(tm)

        os.kill(runner.po.pid, signal.SIGKILL)

        try:
            state = tm.get_state()
            assert state.processes['ignorant_process'][0].state == ProcessState.LOST
            assert state.processes['ignorant_process'][1].state == ProcessState.KILLED
            assert state.processes['ignorant_process'][2].state == ProcessState.RUNNING
        finally:
            os.kill(state.processes['ignorant_process'][2].coordinator_pid,
                    signal.SIGKILL)
            os.kill(state.processes['ignorant_process'][2].pid, signal.SIGKILL)
Example 27
  def start(self, timeout=MAX_WAIT):
    """Fork the task runner and return once the underlying task is running, up to timeout."""
    self.forking.set()

    try:
      chmod_plus_x(self._runner_pex)
    except OSError as e:
      if e.errno != errno.EPERM:
        raise TaskError('Failed to chmod +x runner: %s' % e)

    self._monitor = TaskMonitor(TaskPath(root=self._checkpoint_root), self._task_id)

    cmdline_args = self._cmdline()
    log.info('Forking off runner with cmdline: %s' % ' '.join(cmdline_args))

    try:
      self._popen = subprocess.Popen(cmdline_args)
    except OSError as e:
      raise TaskError(e)

    self.forked.set()

    log.debug('Waiting for task to start.')

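    # The task counts as started once the monitor observes it as active or finished.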
    def is_started():
      return self._monitor and (self._monitor.active or self._monitor.finished)

    waited = Amount(0, Time.SECONDS)
    while not is_started() and waited < timeout:
      log.debug('  - sleeping...')
      self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
      waited += self.POLL_INTERVAL

    if not is_started():
      log.error('Task did not start within deadline, forcing loss.')
      self.lose()
      raise TaskError('Task did not start within deadline.')
Example 28
  def test_coordinator_kill(self):
    runner = self.start_runner()
    tm = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(tm)
    process_state, run_number = tm.get_active_processes()[0]
    assert process_state.process == 'ignorant_process'
    assert run_number == 0
    os.kill(process_state.coordinator_pid, signal.SIGKILL)

    while True:
      active_procs = tm.get_active_processes()
      if active_procs and active_procs[0][1] > 0:
        break
      time.sleep(0.2)
    self.wait_until_running(tm)

    process_state, run_number = tm.get_active_processes()[0]
    assert process_state.process == 'ignorant_process'
    assert run_number == 1
    os.kill(process_state.pid, signal.SIGKILL)

    while True:
      active_procs = tm.get_active_processes()
      if active_procs and active_procs[0][1] > 1:
        break
      time.sleep(0.2)
    self.wait_until_running(tm)

    os.kill(runner.po.pid, signal.SIGKILL)

    try:
      state = tm.get_state()
      assert state.processes['ignorant_process'][0].state == ProcessState.LOST
      assert state.processes['ignorant_process'][1].state == ProcessState.KILLED
      assert state.processes['ignorant_process'][2].state == ProcessState.RUNNING
    finally:
      os.kill(state.processes['ignorant_process'][2].coordinator_pid, signal.SIGKILL)
      os.kill(state.processes['ignorant_process'][2].pid, signal.SIGKILL)