def _write_process_update(self, **kw):
    """Emit one ProcessStatus record through this object's checkpoint writer.

    The keyword arguments populate the ProcessStatus. Its ``seq`` and
    ``process`` fields are then overwritten with ``self._seq`` and
    ``self.name()``, so values passed for those two keys do not survive.
    ``self._seq`` is incremented only after the write call returns.
    """
    update = ProcessStatus(**kw)
    update.seq = self._seq
    update.process = self.name()
    record = RunnerCkpt(process_status=update)
    self._ckpt_write(record)
    self._seq += 1
def test_sample_by_process_from_history(self, mock_get_active_processes):
    """Check per-process and aggregate samples served from a pre-filled history."""
    first_name = 'fake-process-name-1'
    second_name = 'fake-process-name-2'
    monitor = TaskMonitor('.', 'fake-task-id')

    first_status = ProcessStatus(process=first_name)
    second_status = ProcessStatus(process=second_name)
    mock_get_active_processes.return_value = [(first_status, 1), (second_status, 2)]

    # Seed the history with a single entry covering both processes.
    history = ResourceHistory(2)
    recorded_at = time()
    per_process = {
        first_status: ResourceMonitorBase.ProcResourceResult(ProcessSample.empty(), 1),
        second_status: ResourceMonitorBase.ProcResourceResult(ProcessSample.empty(), 2),
    }
    history.add(recorded_at, ResourceMonitorBase.FullResourceResult(per_process, 10))

    resource_monitor = TaskResourceMonitor(
        'fake-task-id',
        monitor,
        history_provider=self.FakeResourceHistoryProvider(history))

    assert resource_monitor.name == 'TaskResourceMonitor[fake-task-id]'
    for process_name in (first_name, second_name):
        assert resource_monitor.sample_by_process(process_name) == ProcessSample.empty()

    _, aggregate = resource_monitor.sample()
    # 1 pid in first_status and 2 in second_status
    assert aggregate.num_procs == 3
    assert aggregate.process_sample == ProcessSample.empty()
    assert aggregate.disk_usage == 10

    # One lookup of the active processes per sample_by_process() call above.
    assert mock_get_active_processes.mock_calls == [mock.call(monitor)] * 2
def kill(cls, task_id, checkpoint_root, force=False,
         terminal_status=TaskState.KILLED, clock=time):
    """
    An implementation of Task killing that doesn't require a fully hydrated TaskRunner object.
    Terminal status must be either KILLED or LOST state.

    task_id = id of the task, used together with checkpoint_root to build the TaskPath
    checkpoint_root = root directory handed to TaskPath
    force = forwarded to open_checkpoint(); when the recovered state is uninitialized,
            a true value finalizes the task anyway instead of only logging an error
    terminal_status = task state written once all processes have been handled
    clock = object exposing time(); supplies the timestamps written to the checkpoint

    Returns None.  Raises cls.Error if terminal_status is neither KILLED nor LOST.
    """
    if terminal_status not in (TaskState.KILLED, TaskState.LOST):
        # Fall back to the raw value when the status has no symbolic name.  The lookup is
        # done before formatting: '%' binds tighter than 'or', so an inline
        # "fmt % name or value" never reaches the fallback.
        status_label = TaskState._VALUES_TO_NAMES.get(terminal_status, terminal_status)
        raise cls.Error('terminal_status must be KILLED or LOST (got %s)' % (status_label,))

    pathspec = TaskPath(root=checkpoint_root, task_id=task_id)
    checkpoint = pathspec.getpath('runner_checkpoint')
    state = CheckpointDispatcher.from_file(checkpoint)

    if state is None or state.header is None or state.statuses is None:
        if force:
            log.error('Task has uninitialized TaskState - forcibly finalizing')
            cls.finalize_task(pathspec)
        else:
            log.error('Cannot update states in uninitialized TaskState!')
        return

    ckpt = cls.open_checkpoint(checkpoint, force=force, state=state)

    def write_task_state(task_state):
        update = TaskStatus(state=task_state,
                            timestamp_ms=int(clock.time() * 1000),
                            runner_pid=os.getpid(),
                            runner_uid=os.getuid())
        ckpt.write(RunnerCkpt(task_status=update))

    def write_process_status(status):
        ckpt.write(RunnerCkpt(process_status=status))

    # The checkpoint handle is closed on every path (including the already-terminal one
    # and on errors) before the task is finalized.
    with closing(ckpt):
        if cls.is_task_terminal(state.statuses[-1].state):
            log.info('Task is already in terminal state! Finalizing.')
        else:
            write_task_state(TaskState.ACTIVE)
            for process, history in state.processes.items():
                process_status = history[-1]
                if cls.is_process_terminal(process_status.state):
                    continue
                if cls.kill_process(state, process):
                    write_process_status(ProcessStatus(process=process,
                                                       state=ProcessState.KILLED,
                                                       seq=process_status.seq + 1,
                                                       return_code=-9,
                                                       stop_time=clock.time()))
                elif process_status.state != ProcessState.WAITING:
                    # Could not be killed and had progressed past WAITING: record it as LOST.
                    write_process_status(ProcessStatus(process=process,
                                                       state=ProcessState.LOST,
                                                       seq=process_status.seq + 1))
            write_task_state(terminal_status)

    cls.finalize_task(pathspec)
def make_runner_state(cpid=COORDINATOR_PID, pid=PID, user=USER1, pname=PROCESS_NAME):
    """Build a RunnerState holding one process with a single recorded status.

    The status uses CREATE_TIME for both its fork and start times.
    """
    header = RunnerHeader(user=user)
    only_status = ProcessStatus(
        fork_time=CREATE_TIME,
        start_time=CREATE_TIME,
        pid=pid,
        coordinator_pid=cpid,
        process=pname)
    return RunnerState(header=header, processes={pname: [only_status]})
def test_sample_by_process(self, mock_get_active_processes, mock_sample):
    """Check that sample_by_process() returns the sample of the matching collector."""
    process_name = 'fake-process-name'
    monitor = TaskMonitor('.', 'fake-task-id')

    status = ProcessStatus(process=process_name)
    mock_get_active_processes.return_value = [(status, 1)]

    expected_sample = ProcessSample.empty()
    mock_sample.return_value = expected_sample

    resource_monitor = TaskResourceMonitor('fake-task-id', monitor)

    assert expected_sample == resource_monitor.sample_by_process(process_name)
    assert mock_get_active_processes.mock_calls == [mock.call(monitor)]

    # The collector registered for this process status is the one that was sampled.
    collector = resource_monitor._process_collectors[status]
    assert mock_sample.mock_calls == [mock.call(collector)]
def _set_process_status(self, process_name, process_state, **kw):
    """Dispatch a ProcessStatus update for ``process_name`` into the runner state.

    If ``sequence_number`` is supplied as a keyword it is used as-is. Otherwise
    the number is derived from the process's current run: 0 when there is no
    run yet (only allowed for WAITING), else the current sequence plus one.
    The remaining keywords are passed through to ProcessStatus.
    """
    if 'sequence_number' in kw:
        sequence_number = kw.pop('sequence_number')
        log_format = '_set_process_status(%s <= %s, seq=%s[force])'
    else:
        latest_run = self._current_process_run(process_name)
        if latest_run:
            sequence_number = latest_run.seq + 1
        else:
            # A process with no run on record can only be entering WAITING.
            assert process_state == ProcessState.WAITING
            sequence_number = 0
        log_format = '_set_process_status(%s <= %s, seq=%s[auto])'

    log.debug(log_format % (
        process_name,
        ProcessState._VALUES_TO_NAMES.get(process_state),
        sequence_number))

    status_update = ProcessStatus(
        process=process_name,
        state=process_state,
        seq=sequence_number,
        **kw)
    runner_ckpt = RunnerCkpt(process_status=status_update)
    self._dispatcher.dispatch(self._state, runner_ckpt, self._recovery)
def dispatch(self, state, runner_ckpt, recovery=False, truncate=False):
    """
    Given a RunnerState and a RunnerCkpt to apply to it, determine the appropriate action and
    dispatch to the appropriate handlers.

    At most one record is applied per call; the fields of runner_ckpt are checked in the
    order runner_header, task_status, process_status and the first one that is set wins.

      state = RunnerState to be updated (mutated in place)
      runner_ckpt = RunnerCkpt update to apply
      recovery = if true, enable recovery mode: a process update whose sequence number is
                 not the current run's seq + 1 is logged and skipped instead of raising
      truncate = if true, store only the latest task/process states, instead of
                 history for all runs.

    Returns None.

    Raises ErrorRecoveringState on failure: a header arriving when state already has one,
    a process update for a process with no recorded run, or a RunnerCkpt with none of the
    three fields set.
    Raises InvalidSequenceNumber for an out-of-order process update when recovery is false.
    """
    # case 1: runner_header
    #   -> Initialization of the task stream.
    if runner_ckpt.runner_header is not None:
        if state.header is not None:
            raise self.ErrorRecoveringState(
                "Attempting to rebind task with different parameters!")
        else:
            log.debug('Initializing TaskRunner header to %s' % runner_ckpt.runner_header)
            state.header = runner_ckpt.runner_header
            self._run_header_dispatch(runner_ckpt.runner_header)
        return

    # case 2: task_status
    #   -> State transition on the task (ACTIVE, FAILED, SUCCESS, LOST)
    if runner_ckpt.task_status is not None:
        if state.statuses is None:
            # First task status seen: there is no previous state to report.
            state.statuses = []
            old_state = None
        else:
            old_state = state.statuses[-1].state
        if not truncate:
            state.statuses.append(runner_ckpt.task_status)
        else:
            # Truncated mode keeps only the most recent task status.
            state.statuses = [runner_ckpt.task_status]
        new_state = runner_ckpt.task_status.state
        log.debug(
            'Flipping task state from %s to %s' %
            (TaskState._VALUES_TO_NAMES.get(old_state, '(undefined)'),
             TaskState._VALUES_TO_NAMES.get(new_state, '(undefined)')))
        self._run_task_dispatch(new_state, runner_ckpt.task_status)
        return

    # case 3: process_status
    #   -> State transition on a process itself
    #        (WAITING, FORKED, RUNNING, SUCCESS, KILLED, FAILED, LOST)
    if runner_ckpt.process_status is not None:
        process_update = runner_ckpt.process_status
        name = process_update.process
        # Latest recorded run of this process, or None if it has never been seen.
        current_run = state.processes[name][-1] if name in state.processes else None
        # Updates must arrive with consecutive sequence numbers.
        if current_run and process_update.seq != current_run.seq + 1:
            if recovery:
                log.debug('Skipping replayed out-of-order update: %s' %
                          process_update)
                return
            else:
                raise self.InvalidSequenceNumber(
                    "Out of order sequence number! %s => %s" %
                    (current_run, process_update))

        # One special case for WAITING: Initialize a new target ProcessState.
        if process_update.state == ProcessState.WAITING:
            # A new run may only begin when there is no run yet or the last one is terminal.
            assert current_run is None or self.is_terminal(current_run)
            if name not in state.processes:
                # Placeholder with seq=-1, to be filled in by the update applied below.
                state.processes[name] = [ProcessStatus(seq=-1)]
            else:
                # Start a fresh run entry carrying over the previous run's sequence number;
                # in truncated mode it replaces the history instead of extending it.
                if not truncate:
                    state.processes[name].append(
                        ProcessStatus(seq=current_run.seq))
                else:
                    state.processes[name] = [
                        ProcessStatus(seq=current_run.seq)
                    ]

        # Run the process state machine.
        log.debug('Running state machine for process=%s/seq=%s' %
                  (name, process_update.seq))
        # A non-WAITING update for a process with no recorded run cannot be applied.
        if not state.processes or name not in state.processes:
            raise self.ErrorRecoveringState(
                "Encountered potentially out of order "
                "process update. Are you sure this is a full checkpoint stream?"
            )
        # Apply the update to the latest run entry, then run the handlers for the new state.
        self._update_process_state(state.processes[name][-1], process_update)
        self._run_process_dispatch(process_update.state, process_update)
        return

    # None of the three record kinds was set.
    raise self.ErrorRecoveringState("Empty RunnerCkpt encountered!")