Exemple #1
0
 def _write_process_update(self, **kw):
   """Emit a ProcessStatus record onto the coordinator's checkpoint stream.

   The keyword arguments seed the ProcessStatus; its ``seq`` and ``process``
   fields are then overwritten with this object's current sequence number and
   name.  The sequence number is advanced only after the write call returns.
   """
   status_update = ProcessStatus(**kw)
   # Assigned after construction so they win over any seq/process given in kw.
   status_update.seq = self._seq
   status_update.process = self.name()
   checkpoint_record = RunnerCkpt(process_status=status_update)
   self._ckpt_write(checkpoint_record)
   self._seq = self._seq + 1
Exemple #2
0
  def test_sample_by_process_from_history(self, mock_get_active_processes):
    """Check sample_by_process() and sample() on a monitor built with a pre-populated history.

    Two fake processes (owning 1 and 2 pids respectively) are recorded in a
    ResourceHistory that is handed to the TaskResourceMonitor through a fake
    history provider.
    """
    first_name, second_name = 'fake-process-name-1', 'fake-process-name-2'
    task_monitor = TaskMonitor('.', 'fake-task-id')
    first_status = ProcessStatus(process=first_name)
    second_status = ProcessStatus(process=second_name)
    mock_get_active_processes.return_value = [(first_status, 1), (second_status, 2)]

    # One history entry: an empty sample per process plus a disk usage of 10.
    history = ResourceHistory(2)
    per_process_results = {
        first_status: ResourceMonitorBase.ProcResourceResult(ProcessSample.empty(), 1),
        second_status: ResourceMonitorBase.ProcResourceResult(ProcessSample.empty(), 2),
    }
    history.add(time(), ResourceMonitorBase.FullResourceResult(per_process_results, 10))

    provider = self.FakeResourceHistoryProvider(history)
    task_resource_monitor = TaskResourceMonitor(
        'fake-task-id', task_monitor, history_provider=provider)

    assert task_resource_monitor.name == 'TaskResourceMonitor[fake-task-id]'
    for process_name in (first_name, second_name):
      assert task_resource_monitor.sample_by_process(process_name) == ProcessSample.empty()

    _, aggregate = task_resource_monitor.sample()
    # 1 pid in the first process status and 2 in the second.
    assert aggregate.num_procs == 3
    assert aggregate.process_sample == ProcessSample.empty()
    assert aggregate.disk_usage == 10

    # One lookup of the active processes per sample_by_process() call above.
    expected_calls = [mock.call(task_monitor), mock.call(task_monitor)]
    assert mock_get_active_processes.mock_calls == expected_calls
Exemple #3
0
 def _write_process_update(self, **kw):
     """Append a process status update to the coordinator's checkpoint stream.

     ``kw`` supplies the initial ProcessStatus fields.  ``seq`` and ``process``
     are set afterwards from this object's sequence counter and name, and the
     counter is incremented once the write call has returned.
     """
     update = ProcessStatus(**kw)
     # Set after construction so these override any seq/process passed in kw.
     update.seq = self._seq
     update.process = self.name()
     record = RunnerCkpt(process_status=update)
     self._ckpt_write(record)
     self._seq = self._seq + 1
Exemple #4
0
  def kill(cls, task_id, checkpoint_root, force=False,
           terminal_status=TaskState.KILLED, clock=time):
    """
      An implementation of Task killing that doesn't require a fully hydrated TaskRunner object.
      Terminal status must be either KILLED or LOST state.

      task_id          = id of the task whose checkpoint should be updated
      checkpoint_root  = root directory used to build the TaskPath for the task
      force            = passed through to open_checkpoint(); additionally, when the recovered
                         state is uninitialized, finalize the task instead of giving up
      terminal_status  = TaskState written as the final task status (KILLED or LOST)
      clock            = object exposing time(), used for timestamps in the written records

      Raises cls.Error if terminal_status is neither KILLED nor LOST.
    """
    if terminal_status not in (TaskState.KILLED, TaskState.LOST):
      # Use the raw value as the fallback when it has no symbolic name.  The fallback must be
      # given to get(): '%' binds tighter than 'or', so a trailing "or terminal_status" after
      # the format expression would never be evaluated.
      raise cls.Error('terminal_status must be KILLED or LOST (got %s)' %
                      TaskState._VALUES_TO_NAMES.get(terminal_status, terminal_status))
    pathspec = TaskPath(root=checkpoint_root, task_id=task_id)
    checkpoint = pathspec.getpath('runner_checkpoint')
    state = CheckpointDispatcher.from_file(checkpoint)

    if state is None or state.header is None or state.statuses is None:
      if force:
        log.error('Task has uninitialized TaskState - forcibly finalizing')
        cls.finalize_task(pathspec)
      else:
        log.error('Cannot update states in uninitialized TaskState!')
      return

    ckpt = cls.open_checkpoint(checkpoint, force=force, state=state)

    def write_task_state(task_state):
      # Append a task-level status record stamped with this process' pid/uid.
      update = TaskStatus(state=task_state, timestamp_ms=int(clock.time() * 1000),
                          runner_pid=os.getpid(), runner_uid=os.getuid())
      ckpt.write(RunnerCkpt(task_status=update))

    def write_process_status(status):
      # Append a process-level status record.
      ckpt.write(RunnerCkpt(process_status=status))

    # Every path past open_checkpoint() goes through closing(), so the checkpoint handle is
    # released even when the task turns out to be terminal already.
    with closing(ckpt):
      if cls.is_task_terminal(state.statuses[-1].state):
        log.info('Task is already in terminal state!  Finalizing.')
      else:
        write_task_state(TaskState.ACTIVE)
        for process, history in state.processes.items():
          process_status = history[-1]
          if cls.is_process_terminal(process_status.state):
            continue
          if cls.kill_process(state, process):
            write_process_status(ProcessStatus(process=process,
              state=ProcessState.KILLED, seq=process_status.seq + 1, return_code=-9,
              stop_time=clock.time()))
          elif process_status.state != ProcessState.WAITING:
            # kill_process() returned false for a non-WAITING process: record it as LOST.
            # Compared by value; identity comparison of integers is implementation-dependent.
            write_process_status(ProcessStatus(process=process,
              state=ProcessState.LOST, seq=process_status.seq + 1))
        write_task_state(terminal_status)
    cls.finalize_task(pathspec)
Exemple #5
0
def make_runner_state(cpid=COORDINATOR_PID,
                      pid=PID,
                      user=USER1,
                      pname=PROCESS_NAME):
    """Build a RunnerState containing one process with a single status entry.

    The status entry uses CREATE_TIME for both its fork and start times and
    carries the given pid, coordinator pid and process name; the header
    carries the given user.
    """
    header = RunnerHeader(user=user)
    only_status = ProcessStatus(
        fork_time=CREATE_TIME,
        start_time=CREATE_TIME,
        pid=pid,
        coordinator_pid=cpid,
        process=pname,
    )
    return RunnerState(header=header, processes={pname: [only_status]})
Exemple #6
0
  def test_sample_by_process(self, mock_get_active_processes, mock_sample):
    """Check that sample_by_process() returns the mocked sample for the single active process."""
    process_name = 'fake-process-name'
    task_monitor = TaskMonitor('.', 'fake-task-id')
    process_status = ProcessStatus(process=process_name)
    mock_get_active_processes.return_value = [(process_status, 1)]
    expected_sample = ProcessSample.empty()
    mock_sample.return_value = expected_sample

    task_resource_monitor = TaskResourceMonitor('fake-task-id', task_monitor)
    observed_sample = task_resource_monitor.sample_by_process(process_name)

    assert expected_sample == observed_sample
    assert mock_get_active_processes.mock_calls == [mock.call(task_monitor)]
    # The sample mock must have been invoked once, with the collector stored for this process.
    collector = task_resource_monitor._process_collectors[process_status]
    assert mock_sample.mock_calls == [mock.call(collector)]
Exemple #7
0
 def _set_process_status(self, process_name, process_state, **kw):
   """Dispatch a ProcessStatus update for process_name into the runner state.

   If ``sequence_number`` is passed in kw it is used as-is (and removed from
   kw).  Otherwise the sequence number is derived from the process' current
   run: one past its seq, or 0 when there is no current run, in which case the
   requested state must be WAITING.  Remaining kw are forwarded to
   ProcessStatus.
   """
   forced = 'sequence_number' in kw
   if forced:
     seq = kw.pop('sequence_number')
   else:
     latest_run = self._current_process_run(process_name)
     if latest_run:
       seq = latest_run.seq + 1
     else:
       assert process_state == ProcessState.WAITING
       seq = 0

   state_name = ProcessState._VALUES_TO_NAMES.get(process_state)
   if forced:
     log.debug('_set_process_status(%s <= %s, seq=%s[force])' % (process_name,
       state_name, seq))
   else:
     log.debug('_set_process_status(%s <= %s, seq=%s[auto])' % (process_name,
       state_name, seq))

   status_update = ProcessStatus(
       process=process_name, state=process_state, seq=seq, **kw)
   runner_ckpt = RunnerCkpt(process_status=status_update)
   self._dispatcher.dispatch(self._state, runner_ckpt, self._recovery)
Exemple #8
0
    def dispatch(self, state, runner_ckpt, recovery=False, truncate=False):
        """
        Given a RunnerState and a RunnerCkpt to apply to it, determine the appropriate action and
        dispatch to the appropriate handlers.

        Exactly one of runner_ckpt.runner_header, runner_ckpt.task_status or
        runner_ckpt.process_status is handled per call; they are checked in that order and the
        first one that is not None wins.

        state          = RunnerState to be updated (mutated in place)
        runner_ckpt    = RunnerCkpt update to apply
        recovery       = if true, enable recovery mode (accept out-of-order sequence updates)
        truncate       = if true, store only the latest task/process states, instead of
                         history for all runs.

        Raises ErrorRecoveringState on failure (header already set, process update for a
        process with no recorded history, or a RunnerCkpt with none of the three fields set).
        Raises InvalidSequenceNumber for an out-of-order process update when recovery is false.
        """
        # case 1: runner_header
        #   -> Initialization of the task stream.
        if runner_ckpt.runner_header is not None:
            if state.header is not None:
                # A header may only be bound once per state.
                raise self.ErrorRecoveringState(
                    "Attempting to rebind task with different parameters!")
            else:
                log.debug('Initializing TaskRunner header to %s' %
                          runner_ckpt.runner_header)
                state.header = runner_ckpt.runner_header
                self._run_header_dispatch(runner_ckpt.runner_header)
            return

        # case 2: task_status
        #   -> State transition on the task (ACTIVE, FAILED, SUCCESS, LOST)
        if runner_ckpt.task_status is not None:
            if state.statuses is None:
                # First task status seen: there is no previous state to report.
                state.statuses = []
                old_state = None
            else:
                old_state = state.statuses[-1].state
            if not truncate:
                state.statuses.append(runner_ckpt.task_status)
            else:
                # Truncating: keep only the newest task status.
                state.statuses = [runner_ckpt.task_status]
            new_state = runner_ckpt.task_status.state
            log.debug(
                'Flipping task state from %s to %s' %
                (TaskState._VALUES_TO_NAMES.get(old_state, '(undefined)'),
                 TaskState._VALUES_TO_NAMES.get(new_state, '(undefined)')))
            self._run_task_dispatch(new_state, runner_ckpt.task_status)
            return

        # case 3: process_status
        #   -> State transition on a process itself
        #        (WAITING, FORKED, RUNNING, SUCCESS, KILLED, FAILED, LOST)
        if runner_ckpt.process_status is not None:
            process_update = runner_ckpt.process_status
            name = process_update.process
            # Latest recorded status for this process, or None if it has never been seen.
            current_run = state.processes[name][
                -1] if name in state.processes else None
            # Once a process has history, each update must carry exactly the next sequence
            # number.  In recovery mode a mismatch is skipped; otherwise it is an error.
            if current_run and process_update.seq != current_run.seq + 1:
                if recovery:
                    log.debug('Skipping replayed out-of-order update: %s' %
                              process_update)
                    return
                else:
                    raise self.InvalidSequenceNumber(
                        "Out of order sequence number! %s => %s" %
                        (current_run, process_update))

            # One special case for WAITING: Initialize a new target ProcessState.
            if process_update.state == ProcessState.WAITING:
                # A new run may only begin if there is no run yet or the last one is terminal.
                assert current_run is None or self.is_terminal(current_run)
                if name not in state.processes:
                    # Never-seen process: start its history with a placeholder at seq=-1.
                    state.processes[name] = [ProcessStatus(seq=-1)]
                else:
                    # Known process: the placeholder for the new run carries over the previous
                    # run's seq; it is appended, or replaces the history when truncating.
                    if not truncate:
                        state.processes[name].append(
                            ProcessStatus(seq=current_run.seq))
                    else:
                        state.processes[name] = [
                            ProcessStatus(seq=current_run.seq)
                        ]

            # Run the process state machine.
            log.debug('Running state machine for process=%s/seq=%s' %
                      (name, process_update.seq))
            # A non-WAITING update for a process with no recorded history cannot be applied.
            if not state.processes or name not in state.processes:
                raise self.ErrorRecoveringState(
                    "Encountered potentially out of order "
                    "process update.  Are you sure this is a full checkpoint stream?"
                )
            # Apply the update to the newest entry (the placeholder, for a WAITING update).
            self._update_process_state(state.processes[name][-1],
                                       process_update)
            self._run_process_dispatch(process_update.state, process_update)
            return

        # None of the three RunnerCkpt fields was set.
        raise self.ErrorRecoveringState("Empty RunnerCkpt encountered!")