def on_worker_event(machine_id: UUID, event: WorkerEvent) -> func.HttpResponse:
    """Apply a worker state-change event to its task and node records.

    Handles two event kinds: `running` (task started on the node) and
    `done` (task exited).  Persists the raw event as a TaskEvent and
    returns an ok/BoolResult response.

    Raises:
        RequestException: when the event carries neither `running` nor `done`.
    """
    # Fix: previously there was no else-branch here, so an event with
    # neither `running` nor `done` left `task_id` unbound and crashed with
    # NameError below instead of surfacing the intended RequestException.
    if event.running:
        task_id = event.running.task_id
    elif event.done:
        task_id = event.done.task_id
    else:
        err = Error(
            code=ErrorCode.INVALID_REQUEST,
            errors=["invalid worker event type"],
        )
        raise RequestException(err)

    task = get_task_checked(task_id)
    node = get_node_checked(machine_id)
    node_task = NodeTasks(
        machine_id=machine_id, task_id=task_id, state=NodeTaskState.running
    )

    if event.running:
        # don't revive a task that is already being torn down
        if task.state not in TaskState.shutting_down():
            task.state = TaskState.running
        if node.state not in NodeState.ready_for_reset():
            node.state = NodeState.busy
        node_task.save()
        task.on_start()
    else:
        # event.done is guaranteed by the dispatch above.
        #
        # Only record exit status if the task isn't already shutting down.
        #
        # It's ok for the agent to fail because resources vanish out from
        # underneath it during deletion.
        if task.state not in TaskState.shutting_down():
            exit_status = event.done.exit_status
            if not exit_status.success:
                logging.error("task failed: status = %s", exit_status)
                task.error = Error(
                    code=ErrorCode.TASK_FAILED,
                    errors=[
                        "task failed. exit_status = %s" % exit_status,
                        event.done.stdout,
                        event.done.stderr,
                    ],
                )
            task.state = TaskState.stopping
        if node.state not in NodeState.ready_for_reset():
            node.state = NodeState.done
        node_task.delete()

    task.save()
    node.save()

    # keep the raw event for audit / debugging
    task_event = TaskEvent(task_id=task_id, machine_id=machine_id, event_data=event)
    task_event.save()
    return ok(BoolResult(result=True))
def mark_stopping(self) -> None:
    """Transition this task to `stopping`, failing it first if it never ran.

    A task already in a shutting-down state is left alone (duplicate stop
    calls are expected and logged at debug level).
    """
    already_stopping = self.state in TaskState.shutting_down()
    if already_stopping:
        logging.debug(
            "ignoring post-task stop calls to stop %s:%s",
            self.job_id,
            self.task_id,
        )
        return

    # a task that never reached a started state is recorded as a failure
    if self.state not in TaskState.has_started():
        self.mark_failed(
            Error(code=ErrorCode.TASK_FAILED, errors=["task never started"])
        )

    self.set_state(TaskState.stopping)
def post(req: func.HttpRequest) -> func.HttpResponse:
    """Answer whether the requesting node may schedule the given task.

    `allowed` is False when the node can't take new work or the work has
    stopped; `work_stopped` is True when the task is missing or shutting
    down.
    """
    request = parse_request(CanScheduleRequest, req)
    if isinstance(request, Error):
        return not_ok(request, context="CanScheduleRequest")

    node = Node.get_by_machine_id(request.machine_id)
    if not node:
        return not_ok(
            Error(code=ErrorCode.UNABLE_TO_FIND, errors=["unable to find node"]),
            context=request.machine_id,
        )

    allowed = bool(node.can_process_new_work())

    task = Task.get_by_task_id(request.task_id)
    work_stopped = (
        isinstance(task, Error) or task.state in TaskState.shutting_down()
    )
    if work_stopped:
        allowed = False

    return ok(CanSchedule(allowed=allowed, work_stopped=work_stopped))
def get_queue_tasks() -> Sequence[Tuple[Task, Sequence[str]]]:
    """Pair each available task with its non-empty input container queues.

    Tasks with no input container queues are skipped.
    """
    matched = []
    for candidate in Task.search_states(states=TaskState.available()):
        queues = get_input_container_queues(candidate.config)
        if not queues:
            continue
        matched.append((candidate, queues))
    return matched
def mark_stopping(self) -> None:
    """Move the task to `stopping` unless it is already shutting down."""
    if self.state not in TaskState.shutting_down():
        self.set_state(TaskState.stopping)
        return

    # duplicate stop calls after shutdown has begun are expected; just log
    logging.debug(
        "ignoring post-task stop calls to stop %s:%s",
        self.job_id,
        self.task_id,
    )
def check_task(self, task_id: UUID, scalesets: List[Scaleset]) -> Optional[str]:
    """Return a human-readable failure reason for the task, or None if healthy.

    Checks, in order: the state of any scaleset the task's pool maps to,
    the task's own recorded error, and early shutdown.
    """
    task = self.of.tasks.get(task_id)

    # resolved lazily so the lookup only happens on an error path,
    # matching the original inline expression
    def job_name() -> str:
        return self.jobs[self.tasks[task_id]].config.name

    # Check if the scaleset the task is assigned is OK
    for scaleset in scalesets:
        pool = task.config.pool
        if pool is None:
            continue
        if scaleset.pool_name != pool.pool_name:
            continue
        if scaleset.state in scaleset.state.available():
            continue
        return "task scaleset failed: %s - %s - %s (%s)" % (
            job_name(),
            task.config.task.type.name,
            scaleset.state.name,
            scaleset.error,
        )

    # check if the task itself has an error
    if task.error is not None:
        return "task failed: %s - %s (%s)" % (
            job_name(),
            task.config.task.type.name,
            task.error,
        )

    # just in case someone else stopped the task
    if task.state in TaskState.shutting_down():
        return "task shutdown early: %s - %s" % (
            job_name(),
            task.config.task.type.name,
        )

    return None
def main(mytimer: func.TimerRequest, dashboard: func.Out[str]) -> None:  # noqa: F841
    """Timer entrypoint: stop expired work, drive state machines, and schedule.

    Order matters: expired work is stopped first so the subsequent
    needs-work processing and scheduling don't act on it.
    """
    expired_tasks = Task.search_expired()
    for task in expired_tasks:
        # fix: this line previously logged only task.job_id while labeling
        # it as the task, which was misleading; log both ids explicitly
        logging.info(
            "stopping expired task. job_id:%s task_id:%s", task.job_id, task.task_id
        )
        task.stopping()

    expired_jobs = Job.search_expired()
    for job in expired_jobs:
        logging.info("stopping expired job: %s", job.job_id)
        job.stopping()

    jobs = Job.search_states(states=JobState.needs_work())
    for job in jobs:
        logging.info("update job: %s", job.job_id)
        process_state_updates(job)

    tasks = Task.search_states(states=TaskState.needs_work())
    for task in tasks:
        logging.info("update task: %s", task.task_id)
        process_state_updates(task)

    schedule_tasks()

    Job.stop_never_started_jobs()

    # push accumulated events to the dashboard output binding, if any
    events = get_events()
    if events:
        dashboard.set(events)
def get_waiting(self) -> List[str]:
    """List 'type:state' labels for this job's tasks that have not started."""
    pending = []
    for task in self.onefuzz.tasks.list(job_id=self.job.job_id):
        if task.state in TaskState.has_started():
            continue
        pending.append("%s:%s" % (task.config.task.type.name, task.state.name))
    return pending
def on_worker_event(machine_id: UUID, event: WorkerEvent) -> None:
    """Apply a worker state-change event to its task and node records.

    Handles `running` (task started) and `done` (task exited) events,
    updating node state, the NodeTasks association, and the task state
    machine, then persists the raw event as a TaskEvent.

    Raises:
        NotImplementedError: event has neither `running` nor `done` set.
        RequestException: unreachable in practice (same condition as above),
            kept as a defensive second check on the dispatch below.
    """
    if event.running:
        task_id = event.running.task_id
    elif event.done:
        task_id = event.done.task_id
    else:
        raise NotImplementedError

    task = get_task_checked(task_id)
    node = get_node_checked(machine_id)
    node_task = NodeTasks(
        machine_id=machine_id, task_id=task_id, state=NodeTaskState.running
    )

    if event.running:
        # don't revive a task that is already being torn down
        if task.state not in TaskState.shutting_down():
            task.state = TaskState.running

        # a node pending reset keeps its reset-bound state
        if node.state not in NodeState.ready_for_reset():
            node.state = NodeState.busy

        # node is saved before the association is recorded
        node.save()
        node_task.save()

        # Start the clock for the task if it wasn't started already
        # (as happens in 1.0.0 agents)
        task.on_start()
    elif event.done:
        # the task is no longer on this node
        node_task.delete()

        exit_status = event.done.exit_status
        if not exit_status.success:
            logging.error("task failed. status:%s", exit_status)
            # mark_failed records the error and drives the task to stopping
            task.mark_failed(
                Error(
                    code=ErrorCode.TASK_FAILED,
                    errors=[
                        "task failed. exit_status:%s" % exit_status,
                        event.done.stdout,
                        event.done.stderr,
                    ],
                )
            )
        else:
            task.mark_stopping()

        # reimage the node now that its work is done
        node.to_reimage(done=True)
    else:
        err = Error(
            code=ErrorCode.INVALID_REQUEST,
            errors=["invalid worker event type"],
        )
        raise RequestException(err)

    task.save()

    # keep the raw event for audit / debugging
    task_event = TaskEvent(task_id=task_id, machine_id=machine_id, event_data=event)
    task_event.save()
def main(mytimer: func.TimerRequest, dashboard: func.Out[str]) -> None:  # noqa: F841
    """Timer entrypoint: requeue all entities that need state-machine work.

    Walks proxies, repro VMs, tasks, jobs, pools, and nodes in needs-work
    states and queues an update for each; then queues stops for expired
    tasks/jobs, health-checks proxies, and publishes dashboard events.
    """
    proxies = Proxy.search_states(states=VmState.needs_work())
    for proxy in proxies:
        logging.info("requeueing update proxy vm: %s", proxy.region)
        proxy.queue()

    vms = Repro.search_states(states=VmState.needs_work())
    for vm in vms:
        logging.info("requeueing update vm: %s", vm.vm_id)
        vm.queue()

    tasks = Task.search_states(states=TaskState.needs_work())
    for task in tasks:
        logging.info("requeueing update task: %s", task.task_id)
        task.queue()

    jobs = Job.search_states(states=JobState.needs_work())
    for job in jobs:
        logging.info("requeueing update job: %s", job.job_id)
        job.queue()

    pools = Pool.search_states(states=PoolState.needs_work())
    for pool in pools:
        logging.info("queuing update pool: %s (%s)", pool.pool_id, pool.name)
        pool.queue()

    nodes = Node.search_states(states=NodeState.needs_work())
    for node in nodes:
        logging.info("queuing update node: %s", node.machine_id)
        node.queue()

    # NOTE(review): this logs task.job_id while labeling it a task — likely
    # meant to include task.task_id as well; confirm before changing
    expired_tasks = Task.search_expired()
    for task in expired_tasks:
        logging.info("queuing stop for task: %s", task.job_id)
        task.queue_stop()

    expired_jobs = Job.search_expired()
    for job in expired_jobs:
        logging.info("queuing stop for job: %s", job.job_id)
        job.queue_stop()

    # Reminder, proxies are created on-demand.  If something is "wrong" with
    # a proxy, the plan is: delete and recreate it.
    for proxy in Proxy.search():
        if not proxy.is_alive():
            logging.error("proxy alive check failed, stopping: %s", proxy.region)
            proxy.state = VmState.stopping
            proxy.save()
        else:
            # healthy proxy: refresh its config instead
            proxy.save_proxy_config()

    # push accumulated events to the dashboard output binding, if any
    event = get_event()
    if event:
        dashboard.set(event)
def get_waiting(self) -> List[str]:
    """Describe tasks still waiting to start or waiting for a first heartbeat.

    Tasks that have started are refreshed from the service; those with any
    recorded events are considered live and excluded from the result.
    """
    waiting: List[str] = []
    for task in self.get_running_tasks_checked():
        if task.state not in TaskState.has_started():
            waiting.append(f"{task.config.task.type.name}:{task.state.name}")
            continue

        # started task: refresh to see whether any events (heartbeats) arrived
        refreshed = self.onefuzz.tasks.get(task.task_id)
        if refreshed.events:
            continue
        waiting.append(f"{refreshed.config.task.type.name}:waiting-for-heartbeat")
    return waiting
def on_worker_event_running(machine_id: UUID, event: WorkerRunningEvent) -> Result[None]:
    """Handle an agent's 'task running' report from a node.

    Marks the node busy, records the node/task association, moves the task
    to `running` (unless already shutting down), and persists the raw
    event as a TaskEvent.

    Returns:
        An Error when the task or node cannot be found; None on success.
    """
    task = Task.get_by_task_id(event.task_id)
    if isinstance(task, Error):
        return task

    node = get_node(machine_id)
    if isinstance(node, Error):
        return node

    # a node pending reset keeps its reset-bound state
    if node.state not in NodeState.ready_for_reset():
        node.state = NodeState.busy
        node.save()

    node_task = NodeTasks(
        machine_id=machine_id, task_id=event.task_id, state=NodeTaskState.running
    )
    node_task.save()

    # don't revive a task that is already being torn down; the association
    # above is still recorded so the node's work can be tracked
    if task.state in TaskState.shutting_down():
        logging.info(
            "ignoring task start from node. machine_id:%s %s:%s (state: %s)",
            machine_id,
            task.job_id,
            task.task_id,
            task.state,
        )
        return None

    logging.info(
        "task started on node. machine_id:%s %s:%s",
        machine_id,
        task.job_id,
        task.task_id,
    )
    task.state = TaskState.running
    task.save()

    # Start the clock for the task if it wasn't started already
    # (as happens in 1.0.0 agents)
    task.on_start()

    # keep the raw event for audit / debugging
    task_event = TaskEvent(
        task_id=task.task_id,
        machine_id=machine_id,
        event_data=WorkerEvent(running=event),
    )
    task_event.save()
    return None
def get_tasks_by_pool_name(cls, pool_name: str) -> List["Task"]:
    """Return every available task whose resolved pool matches `pool_name`."""
    candidates = cls.search_states(states=TaskState.available())
    if not candidates:
        return []

    matching = []
    for candidate in candidates:
        # tasks whose pool can't be resolved are skipped
        pool = candidate.get_pool()
        if pool and pool.name == pool_name:
            matching.append(candidate)
    return matching
def check_current_job(self) -> Job:
    """Fetch the job, raising StoppedEarly if it or any task stopped early.

    Returns:
        The refreshed Job when neither the job nor any of its tasks has
        entered a shutting-down state.

    Raises:
        StoppedEarly: the job is shutting down, or any task is, with the
            tasks' errors (when present) folded into the message.
    """
    job = self.onefuzz.jobs.get(self.job.job_id)
    if job.state in JobState.shutting_down():
        raise StoppedEarly("job unexpectedly stopped early")

    stopped_tasks = self.onefuzz.tasks.list(
        job_id=self.job.job_id, state=TaskState.shutting_down()
    )
    errors = []
    for task in stopped_tasks:
        task_type = task.config.task.type
        if task.error:
            errors.append("%s: %s" % (task_type, task.error))
        else:
            errors.append("%s" % task_type)

    if errors:
        raise StoppedEarly("tasks stopped unexpectedly.\n%s" % "\n".join(errors))
    return job
def on_worker_event_running(machine_id: UUID, event: WorkerRunningEvent) -> Result[None]:
    """Handle an agent's 'task running' report from a node.

    Marks the node busy, records the node/task association, moves the task
    to `running` (unless already shutting down), and persists the raw
    event as a TaskEvent.

    Returns:
        An Error when the task or node cannot be found; None on success.
    """
    task = Task.get_by_task_id(event.task_id)
    if isinstance(task, Error):
        return task

    node = get_node(machine_id)
    if isinstance(node, Error):
        return node

    # a node pending reset keeps its reset-bound state
    if node.state not in NodeState.ready_for_reset():
        node.set_state(NodeState.busy)

    node_task = NodeTasks(
        machine_id=machine_id, task_id=event.task_id, state=NodeTaskState.running
    )
    node_task.save()

    # don't revive a task that is already being torn down; the association
    # above is still recorded so the node's work can be tracked
    if task.state in TaskState.shutting_down():
        logging.info(
            "ignoring task start from node. "
            "machine_id:%s job_id:%s task_id:%s (state: %s)",
            machine_id,
            task.job_id,
            task.task_id,
            task.state,
        )
        return None

    # fix: message previously read "job_id%s" (missing colon), breaking the
    # machine_id:... job_id:... task_id:... convention used just above
    logging.info(
        "task started on node. machine_id:%s job_id:%s task_id:%s",
        machine_id,
        task.job_id,
        task.task_id,
    )
    task.set_state(TaskState.running)

    # keep the raw event for audit / debugging
    task_event = TaskEvent(
        task_id=task.task_id,
        machine_id=machine_id,
        event_data=WorkerEvent(running=event),
    )
    task_event.save()
    return None
def mark_failed(self, error: Error, tasks_in_job: Optional[List["Task"]] = None) -> None:
    """Record `error` on the task, move it to stopping, and fail dependants.

    No-ops (with a debug log) when the task is already shutting down or
    already carries an error — only the first failure is recorded.
    """
    if self.state in TaskState.shutting_down():
        logging.debug(
            "ignoring post-task stop failures for %s:%s",
            self.job_id,
            self.task_id,
        )
        return

    if self.error is not None:
        logging.debug(
            "ignoring additional task error %s:%s",
            self.job_id,
            self.task_id,
        )
        return

    logging.error("task failed %s:%s - %s", self.job_id, self.task_id, error)

    self.error = error
    self.set_state(TaskState.stopping)

    # anything depending on this task fails with it
    self.mark_dependants_failed(tasks_in_job=tasks_in_job)
def get_running_tasks_checked(self) -> List[Task]:
    """Refresh and return the job's tasks, raising on early shutdown.

    Returns:
        Every task in the job (including shut-down ones) when no task has
        entered a shutting-down state.

    Raises:
        StoppedEarly: the job is shutting down, or any task is, with the
            tasks' errors (when present) folded into the message.
    """
    self.job = self.onefuzz.jobs.get(self.job.job_id)
    if self.job.state in JobState.shutting_down():
        raise StoppedEarly("job unexpectedly stopped early")

    errors: List[str] = []
    tasks: List[Task] = []
    for task in self.onefuzz.tasks.list(job_id=self.job.job_id):
        if task.state in TaskState.shutting_down():
            task_type = task.config.task.type
            if task.error:
                errors.append("%s: %s" % (task_type, task.error))
            else:
                errors.append("%s" % task_type)
        tasks.append(task)

    if errors:
        raise StoppedEarly("tasks stopped unexpectedly.\n%s" % "\n".join(errors))
    return tasks
def stop_if_complete(self, done: bool = False) -> bool:
    """Stop this node once every task assigned to it has shut down.

    Returns:
        True when the node was stopped, False when any task is still live.
    """
    # imported here to avoid a module-level import cycle with tasks
    from ..tasks.main import Task

    for node_task in NodeTasks.get_by_machine_id(self.machine_id):
        assigned = Task.get_by_task_id(node_task.task_id)

        # ignore invalid tasks when deciding if the node should be
        # shutdown
        if isinstance(assigned, Error):
            continue
        if assigned.state not in TaskState.shutting_down():
            return False

    logging.info(
        "node: stopping busy node with all tasks complete: %s",
        self.machine_id,
    )
    self.stop(done=done)
    return True
def search_expired(cls) -> List["Task"]:
    """Find tasks in an available state whose end_time has already passed."""
    now = datetime.utcnow().isoformat()
    expired_filter = "end_time lt datetime'%s'" % now
    return cls.search(
        query={"state": TaskState.available()},
        raw_unchecked_filter=expired_filter,
    )