def on_worker_event(machine_id: UUID, event: WorkerEvent) -> func.HttpResponse:
    """Apply a worker ``running`` or ``done`` event to the task and node records.

    Args:
        machine_id: Identifier of the node that reported the event.
        event: Worker event carrying either a ``running`` or a ``done`` payload.

    Returns:
        ``ok(BoolResult(result=True))`` once the task, the node and the
        resulting ``TaskEvent`` have been saved.

    Raises:
        RequestException: With ``ErrorCode.INVALID_REQUEST`` when the event
            carries neither a ``running`` nor a ``done`` payload.
    """
    if event.running:
        task_id = event.running.task_id
    elif event.done:
        task_id = event.done.task_id
    else:
        # Reject unknown event types before ``task_id`` is needed. Without
        # this guard the lookup below fails on an unbound ``task_id`` and the
        # intended INVALID_REQUEST error is never produced.
        err = Error(
            code=ErrorCode.INVALID_REQUEST,
            errors=["invalid worker event type"],
        )
        raise RequestException(err)

    task = get_task_checked(task_id)
    node = get_node_checked(machine_id)
    node_task = NodeTasks(
        machine_id=machine_id, task_id=task_id, state=NodeTaskState.running
    )

    if event.running:
        # Never move a task or node that is already winding down back to an
        # active state.
        if task.state not in TaskState.shutting_down():
            task.state = TaskState.running
        if node.state not in NodeState.ready_for_reset():
            node.state = NodeState.busy
        node_task.save()
        task.on_start()
    else:
        # ``event.done`` -- guaranteed by the validation at the top.
        #
        # Only record exit status if the task isn't already shutting down.
        #
        # It's ok for the agent to fail because resources vanish out from
        # underneath it during deletion.
        if task.state not in TaskState.shutting_down():
            exit_status = event.done.exit_status
            if not exit_status.success:
                logging.error("task failed: status = %s", exit_status)
                task.error = Error(
                    code=ErrorCode.TASK_FAILED,
                    errors=[
                        "task failed. exit_status = %s" % exit_status,
                        event.done.stdout,
                        event.done.stderr,
                    ],
                )
            task.state = TaskState.stopping
        if node.state not in NodeState.ready_for_reset():
            node.state = NodeState.done
        node_task.delete()

    task.save()
    node.save()
    task_event = TaskEvent(task_id=task_id, machine_id=machine_id, event_data=event)
    task_event.save()
    return ok(BoolResult(result=True))
def mark_stopping(self) -> None:
    """Move this task to ``TaskState.stopping`` unless it is already shutting down.

    A call made while the current state is one of
    ``TaskState.shutting_down()`` is logged at debug level and otherwise
    ignored.
    """
    if self.state not in TaskState.shutting_down():
        self.set_state(TaskState.stopping)
        return

    logging.debug(
        "ignoring post-task stop calls to stop %s:%s", self.job_id, self.task_id
    )
def post(req: func.HttpRequest) -> func.HttpResponse:
    """Answer whether a node may schedule the requested task.

    Returns ``not_ok`` when the request cannot be parsed as a
    ``CanScheduleRequest`` or when no node matches ``machine_id``.
    Otherwise returns ``ok(CanSchedule(...))`` where:

    * ``work_stopped`` is true when the task lookup yields an ``Error`` or
      the task state is one of ``TaskState.shutting_down()``;
    * ``allowed`` is true only when the node can process new work and the
      work has not stopped.
    """
    request = parse_request(CanScheduleRequest, req)
    if isinstance(request, Error):
        return not_ok(request, context="CanScheduleRequest")

    node = Node.get_by_machine_id(request.machine_id)
    if not node:
        missing_node = Error(
            code=ErrorCode.UNABLE_TO_FIND, errors=["unable to find node"]
        )
        return not_ok(missing_node, context=request.machine_id)

    # Ask the node first, then look up the task, keeping the call order stable.
    node_accepts_work = bool(node.can_process_new_work())

    task = Task.get_by_task_id(request.task_id)
    work_stopped = isinstance(task, Error) or task.state in TaskState.shutting_down()

    allowed = node_accepts_work and not work_stopped
    return ok(CanSchedule(allowed=allowed, work_stopped=work_stopped))
def check_task(self, task_id: UUID, scalesets: List[Scaleset]) -> Optional[str]:
    """Return a description of why a task is unhealthy, or ``None`` if it is fine.

    Checks are made in this order, and the first match wins:

    1. a scaleset in the task's pool whose state is not one of the
       available states;
    2. an error recorded on the task itself;
    3. a task state in ``TaskState.shutting_down()``.
    """
    task = self.of.tasks.get(task_id)

    def job_and_type() -> tuple:
        # Resolved lazily so the job lookup only happens on a failure path.
        return (
            self.jobs[self.tasks[task_id]].config.name,
            task.config.task.type.name,
        )

    # Check if the scaleset the task is assigned is OK
    for scaleset in scalesets:
        pool = task.config.pool
        in_task_pool = pool is not None and scaleset.pool_name == pool.pool_name
        if in_task_pool and scaleset.state not in scaleset.state.available():
            return "task scaleset failed: %s - %s - %s (%s)" % (
                *job_and_type(),
                scaleset.state.name,
                scaleset.error,
            )

    # check if the task itself has an error
    if task.error is not None:
        return "task failed: %s - %s (%s)" % (*job_and_type(), task.error)

    # just in case someone else stopped the task
    if task.state in TaskState.shutting_down():
        return "task shutdown early: %s - %s" % job_and_type()

    return None
def on_worker_event(machine_id: UUID, event: WorkerEvent) -> None:
    """Apply a worker ``running`` or ``done`` event to the task and node records.

    Args:
        machine_id: Identifier of the node that reported the event.
        event: Worker event carrying either a ``running`` or a ``done`` payload.

    Raises:
        NotImplementedError: When the event carries neither a ``running``
            nor a ``done`` payload.
    """
    if event.running:
        task_id = event.running.task_id
    elif event.done:
        task_id = event.done.task_id
    else:
        # Every other event type is rejected here, so the dispatch below only
        # ever sees ``running`` or ``done`` events. A second, unreachable
        # "invalid worker event type" branch used to follow that dispatch.
        raise NotImplementedError

    task = get_task_checked(task_id)
    node = get_node_checked(machine_id)
    node_task = NodeTasks(
        machine_id=machine_id, task_id=task_id, state=NodeTaskState.running
    )

    if event.running:
        # Never move a task or node that is already winding down back to an
        # active state.
        if task.state not in TaskState.shutting_down():
            task.state = TaskState.running
        if node.state not in NodeState.ready_for_reset():
            node.state = NodeState.busy
            node.save()
        node_task.save()

        # Start the clock for the task if it wasn't started already
        # (as happens in 1.0.0 agents)
        task.on_start()
    else:
        # ``event.done`` -- guaranteed by the validation at the top.
        node_task.delete()

        exit_status = event.done.exit_status
        if not exit_status.success:
            logging.error("task failed. status:%s", exit_status)
            task.mark_failed(
                Error(
                    code=ErrorCode.TASK_FAILED,
                    errors=[
                        "task failed. exit_status:%s" % exit_status,
                        event.done.stdout,
                        event.done.stderr,
                    ],
                )
            )
        else:
            task.mark_stopping()

        node.to_reimage(done=True)

    task.save()

    task_event = TaskEvent(task_id=task_id, machine_id=machine_id, event_data=event)
    task_event.save()
def mark_stopping(self) -> None:
    """Move this task to ``TaskState.stopping`` unless it is already shutting down.

    A task whose state is not in ``TaskState.has_started()`` is first
    marked as failed with a "task never started" error. A call made while
    the task is already shutting down is logged at debug level and ignored.
    """
    if self.state in TaskState.shutting_down():
        logging.debug(
            "ignoring post-task stop calls to stop %s:%s", self.job_id, self.task_id
        )
    else:
        never_started = self.state not in TaskState.has_started()
        if never_started:
            failure = Error(code=ErrorCode.TASK_FAILED, errors=["task never started"])
            self.mark_failed(failure)

        self.set_state(TaskState.stopping)
def on_worker_event_running(
    machine_id: UUID, event: WorkerRunningEvent
) -> Result[None]:
    """Record that a worker reported a task as running on a node.

    Returns the ``Error`` from the task or node lookup when either fails,
    and ``None`` otherwise. The node is marked busy (unless its state is in
    ``NodeState.ready_for_reset()``) and a ``NodeTasks`` entry is saved even
    when the task itself is already shutting down; in that case the task
    record is left untouched and no ``TaskEvent`` is saved.
    """
    task = Task.get_by_task_id(event.task_id)
    if isinstance(task, Error):
        return task

    node = get_node(machine_id)
    if isinstance(node, Error):
        return node

    node_may_become_busy = node.state not in NodeState.ready_for_reset()
    if node_may_become_busy:
        node.state = NodeState.busy
        node.save()

    NodeTasks(
        machine_id=machine_id, task_id=event.task_id, state=NodeTaskState.running
    ).save()

    if task.state in TaskState.shutting_down():
        logging.info(
            "ignoring task start from node. machine_id:%s %s:%s (state: %s)",
            machine_id,
            task.job_id,
            task.task_id,
            task.state,
        )
    else:
        logging.info(
            "task started on node. machine_id:%s %s:%s",
            machine_id,
            task.job_id,
            task.task_id,
        )
        task.state = TaskState.running
        task.save()

        # Start the clock for the task if it wasn't started already
        # (as happens in 1.0.0 agents)
        task.on_start()

        running_event = WorkerEvent(running=event)
        TaskEvent(
            task_id=task.task_id,
            machine_id=machine_id,
            event_data=running_event,
        ).save()

    return None
def check_current_job(self) -> Job:
    """Fetch the current job and verify that neither it nor its tasks stopped early.

    Returns:
        The freshly fetched job.

    Raises:
        StoppedEarly: When the job state is in ``JobState.shutting_down()``,
            or when any task of the job is listed in a
            ``TaskState.shutting_down()`` state.
    """
    job = self.onefuzz.jobs.get(self.job.job_id)
    if job.state in JobState.shutting_down():
        raise StoppedEarly("job unexpectedly stopped early")

    stopped_tasks = self.onefuzz.tasks.list(
        job_id=self.job.job_id, state=TaskState.shutting_down()
    )
    # One line per stopped task; the task's error is appended when it has one.
    errors = [
        (
            "%s: %s" % (task.config.task.type, task.error)
            if task.error
            else "%s" % task.config.task.type
        )
        for task in stopped_tasks
    ]

    if not errors:
        return job
    raise StoppedEarly("tasks stopped unexpectedly.\n%s" % "\n".join(errors))
def on_worker_event_running(
    machine_id: UUID, event: WorkerRunningEvent
) -> Result[None]:
    """Record that a worker reported a task as running on a node.

    Args:
        machine_id: Identifier of the node that reported the event.
        event: The ``running`` payload; ``event.task_id`` names the task.

    Returns:
        The ``Error`` from the task or node lookup when either fails,
        otherwise ``None``.

    The node is set to ``NodeState.busy`` (unless its state is in
    ``NodeState.ready_for_reset()``) and a ``NodeTasks`` entry is saved even
    when the task is already shutting down; in that case the task state is
    left untouched and no ``TaskEvent`` is saved.
    """
    task = Task.get_by_task_id(event.task_id)
    if isinstance(task, Error):
        return task

    node = get_node(machine_id)
    if isinstance(node, Error):
        return node

    if node.state not in NodeState.ready_for_reset():
        node.set_state(NodeState.busy)

    node_task = NodeTasks(
        machine_id=machine_id, task_id=event.task_id, state=NodeTaskState.running
    )
    node_task.save()

    if task.state in TaskState.shutting_down():
        logging.info(
            "ignoring task start from node. "
            "machine_id:%s job_id:%s task_id:%s (state: %s)",
            machine_id,
            task.job_id,
            task.task_id,
            task.state,
        )
        return None

    # The format previously read "job_id%s" (missing colon), which fused the
    # job id to its label and did not match the message above.
    logging.info(
        "task started on node. machine_id:%s job_id:%s task_id:%s",
        machine_id,
        task.job_id,
        task.task_id,
    )
    task.set_state(TaskState.running)

    task_event = TaskEvent(
        task_id=task.task_id,
        machine_id=machine_id,
        event_data=WorkerEvent(running=event),
    )
    task_event.save()

    return None
def mark_failed(
    self, error: Error, tasks_in_job: Optional[List["Task"]] = None
) -> None:
    """Record ``error`` on this task, move it to stopping and fail its dependants.

    The call is logged at debug level and otherwise ignored when the task
    is already in a ``TaskState.shutting_down()`` state, or when it already
    has an error recorded.

    Args:
        error: The failure to record on the task.
        tasks_in_job: Passed through unchanged to ``mark_dependants_failed``.
    """
    if self.state in TaskState.shutting_down():
        logging.debug(
            "ignoring post-task stop failures for %s:%s", self.job_id, self.task_id
        )
    elif self.error is not None:
        logging.debug(
            "ignoring additional task error %s:%s", self.job_id, self.task_id
        )
    else:
        logging.error("task failed %s:%s - %s", self.job_id, self.task_id, error)

        self.error = error
        self.set_state(TaskState.stopping)

        self.mark_dependants_failed(tasks_in_job=tasks_in_job)
def get_running_tasks_checked(self) -> List[Task]:
    """Refresh ``self.job`` and return its tasks, failing if anything stopped early.

    Returns:
        Every task listed for the job.

    Raises:
        StoppedEarly: When the refreshed job state is in
            ``JobState.shutting_down()``, or when at least one task state is
            in ``TaskState.shutting_down()``.
    """
    self.job = self.onefuzz.jobs.get(self.job.job_id)
    if self.job.state in JobState.shutting_down():
        raise StoppedEarly("job unexpectedly stopped early")

    tasks = []
    errors = []
    for task in self.onefuzz.tasks.list(job_id=self.job.job_id):
        tasks.append(task)
        if task.state not in TaskState.shutting_down():
            continue

        # Describe the stopped task, including its error when it has one.
        message = (
            "%s: %s" % (task.config.task.type, task.error)
            if task.error
            else "%s" % task.config.task.type
        )
        errors.append(message)

    if not errors:
        return tasks
    raise StoppedEarly("tasks stopped unexpectedly.\n%s" % "\n".join(errors))
def stop_if_complete(self, done: bool = False) -> bool:
    """Stop this node once every task assigned to it is shutting down.

    Args:
        done: Forwarded to ``self.stop``.

    Returns:
        ``True`` when the node was stopped, ``False`` when at least one of
        its tasks is still outside ``TaskState.shutting_down()``.
    """
    from ..tasks.main import Task

    for node_task in NodeTasks.get_by_machine_id(self.machine_id):
        task = Task.get_by_task_id(node_task.task_id)
        # Invalid tasks (lookups that yield an Error) are ignored when
        # deciding if the node should be shutdown.
        still_active = (
            not isinstance(task, Error)
            and task.state not in TaskState.shutting_down()
        )
        if still_active:
            return False

    logging.info(
        "node: stopping busy node with all tasks complete: %s",
        self.machine_id,
    )
    self.stop(done=done)
    return True