Example #1
0
def on_worker_event(machine_id: UUID, event: WorkerEvent) -> func.HttpResponse:
    """Apply a worker ``running`` / ``done`` event to the task and node records.

    Args:
        machine_id: Identifier of the node that reported the event.
        event: The worker event. ``event.running`` is checked first and takes
            precedence if both ``running`` and ``done`` are set.

    Returns:
        ``ok(BoolResult(result=True))`` once the task, node and task event
        have been saved.

    Raises:
        RequestException: If the event carries neither a ``running`` nor a
            ``done`` payload.
    """
    if event.running:
        task_id = event.running.task_id
    elif event.done:
        task_id = event.done.task_id
    else:
        # Reject unknown events up front.  Previously ``task_id`` was left
        # unbound here, so the lookups below raised UnboundLocalError and the
        # intended INVALID_REQUEST error was never reached.
        err = Error(
            code=ErrorCode.INVALID_REQUEST,
            errors=["invalid worker event type"],
        )
        raise RequestException(err)

    task = get_task_checked(task_id)
    node = get_node_checked(machine_id)
    node_task = NodeTasks(machine_id=machine_id,
                          task_id=task_id,
                          state=NodeTaskState.running)

    if event.running:
        if task.state not in TaskState.shutting_down():
            task.state = TaskState.running
        if node.state not in NodeState.ready_for_reset():
            node.state = NodeState.busy
        node_task.save()
        task.on_start()
    else:
        # ``event.done`` is set: the guard above rejected everything else.
        #
        # Only record exit status if the task isn't already shutting down.
        #
        # It's ok for the agent to fail because resources vanish out from
        # underneath it during deletion.
        if task.state not in TaskState.shutting_down():
            exit_status = event.done.exit_status

            if not exit_status.success:
                logging.error("task failed: status = %s", exit_status)

                task.error = Error(
                    code=ErrorCode.TASK_FAILED,
                    errors=[
                        "task failed. exit_status = %s" % exit_status,
                        event.done.stdout,
                        event.done.stderr,
                    ],
                )

            task.state = TaskState.stopping
        if node.state not in NodeState.ready_for_reset():
            node.state = NodeState.done
        node_task.delete()

    task.save()
    node.save()
    task_event = TaskEvent(task_id=task_id,
                           machine_id=machine_id,
                           event_data=event)
    task_event.save()
    return ok(BoolResult(result=True))
Example #2
0
    def mark_stopping(self) -> None:
        """Move the task to ``TaskState.stopping`` unless it is already shutting down.

        A task whose state is not in ``TaskState.has_started()`` is first
        passed to ``mark_failed`` with a "task never started" error.
        """
        if self.state in TaskState.shutting_down():
            logging.debug("ignoring post-task stop calls to stop %s:%s",
                          self.job_id, self.task_id)
        else:
            if self.state not in TaskState.has_started():
                never_started = Error(code=ErrorCode.TASK_FAILED,
                                      errors=["task never started"])
                self.mark_failed(never_started)

            self.set_state(TaskState.stopping)
Example #3
0
def post(req: func.HttpRequest) -> func.HttpResponse:
    """Answer whether the requesting node may start the requested task.

    The request body is parsed as a ``CanScheduleRequest``.  A parse failure
    or an unknown node yields a ``not_ok`` response.  Otherwise the reply is
    a ``CanSchedule`` whose ``work_stopped`` is true when the task lookup
    returned an ``Error`` or the task is shutting down, and whose ``allowed``
    is true only when the node can process new work and the work has not
    stopped.
    """
    request = parse_request(CanScheduleRequest, req)
    if isinstance(request, Error):
        return not_ok(request, context="CanScheduleRequest")

    node = Node.get_by_machine_id(request.machine_id)
    if not node:
        missing_node = Error(code=ErrorCode.UNABLE_TO_FIND,
                             errors=["unable to find node"])
        return not_ok(missing_node, context=request.machine_id)

    # Ask the node first, then look the task up, in that order.
    node_ready = bool(node.can_process_new_work())

    task = Task.get_by_task_id(request.task_id)
    work_stopped = (isinstance(task, Error)
                    or task.state in TaskState.shutting_down())

    return ok(
        CanSchedule(allowed=node_ready and not work_stopped,
                    work_stopped=work_stopped))
Example #4
0
def get_queue_tasks() -> Sequence[Tuple[Task, Sequence[str]]]:
    """Pair every available task with its input container queues.

    Tasks for which ``get_input_container_queues`` returns a falsy value are
    left out of the result.
    """
    # Lazily pair each task with its queues so the lookup still happens once
    # per task, in search order.
    paired = ((candidate, get_input_container_queues(candidate.config))
              for candidate in Task.search_states(
                  states=TaskState.available()))
    return [(candidate, queues) for candidate, queues in paired if queues]
Example #5
0
    def mark_stopping(self) -> None:
        """Set the task state to ``TaskState.stopping``.

        Does nothing except emit a debug log when the task is already in one
        of the ``TaskState.shutting_down()`` states.
        """
        if self.state not in TaskState.shutting_down():
            self.set_state(TaskState.stopping)
            return

        logging.debug("ignoring post-task stop calls to stop %s:%s",
                      self.job_id, self.task_id)
Example #6
0
    def check_task(self, task_id: UUID,
                   scalesets: List[Scaleset]) -> Optional[str]:
        """Return a description of why the task has failed, or ``None``.

        Checks are made in this order: a scaleset for the task's pool that is
        not in an available state, an error recorded on the task, and a task
        that is already shutting down.

        Args:
            task_id: Identifier of the task to fetch and inspect.
            scalesets: Scalesets to compare against the task's pool.

        Returns:
            A human-readable failure message, or ``None`` when none of the
            checks fire.
        """
        task = self.of.tasks.get(task_id)

        def labels() -> tuple:
            # Job name and task type name; looked up only when a message is
            # actually being built.
            return (
                self.jobs[self.tasks[task_id]].config.name,
                task.config.task.type.name,
            )

        # Check if the scaleset the task is assigned is OK
        for scaleset in scalesets:
            pool = task.config.pool
            if pool is None or scaleset.pool_name != pool.pool_name:
                continue
            if scaleset.state in scaleset.state.available():
                continue
            return "task scaleset failed: %s - %s - %s (%s)" % (
                *labels(),
                scaleset.state.name,
                scaleset.error,
            )

        # check if the task itself has an error
        if task.error is not None:
            return "task failed: %s - %s (%s)" % (*labels(), task.error)

        # just in case someone else stopped the task
        if task.state in TaskState.shutting_down():
            return "task shutdown early: %s - %s" % labels()

        return None
Example #7
0
def main(mytimer: func.TimerRequest,
         dashboard: func.Out[str]) -> None:  # noqa: F841
    """Timer entry point: stop expired work, advance state, schedule tasks.

    In order: stops expired tasks and jobs, runs ``process_state_updates`` on
    jobs and tasks whose state needs work, schedules tasks, stops jobs that
    never started, and finally forwards any pending events to ``dashboard``.
    ``mytimer`` is not read.
    """
    for expired_task in Task.search_expired():
        # NOTE(review): this logs the task's job_id, not its task_id.
        logging.info("stopping expired task: %s", expired_task.job_id)
        expired_task.stopping()

    for expired_job in Job.search_expired():
        logging.info("stopping expired job: %s", expired_job.job_id)
        expired_job.stopping()

    for pending_job in Job.search_states(states=JobState.needs_work()):
        logging.info("update job: %s", pending_job.job_id)
        process_state_updates(pending_job)

    for pending_task in Task.search_states(states=TaskState.needs_work()):
        logging.info("update task: %s", pending_task.task_id)
        process_state_updates(pending_task)

    schedule_tasks()

    Job.stop_never_started_jobs()

    pending_events = get_events()
    if pending_events:
        dashboard.set(pending_events)
Example #8
0
 def get_waiting(self) -> List[str]:
     """List ``"<task type>:<state>"`` for each job task that has not started."""
     pending = []
     for entry in self.onefuzz.tasks.list(job_id=self.job.job_id):
         if entry.state in TaskState.has_started():
             continue
         pending.append("%s:%s" % (entry.config.task.type.name,
                                   entry.state.name))
     return pending
Example #9
0
def on_worker_event(machine_id: UUID, event: WorkerEvent) -> None:
    """Record a worker ``running`` / ``done`` event against its task and node.

    Args:
        machine_id: Identifier of the node that reported the event.
        event: The worker event; ``event.running`` is checked before
            ``event.done``.

    Raises:
        NotImplementedError: If the event has neither a ``running`` nor a
            ``done`` payload.
    """
    if event.running:
        task_id = event.running.task_id
    elif event.done:
        task_id = event.done.task_id
    else:
        raise NotImplementedError

    task = get_task_checked(task_id)
    node = get_node_checked(machine_id)
    node_task = NodeTasks(machine_id=machine_id,
                          task_id=task_id,
                          state=NodeTaskState.running)

    if event.running:
        if task.state not in TaskState.shutting_down():
            task.state = TaskState.running
        if node.state not in NodeState.ready_for_reset():
            node.state = NodeState.busy
            node.save()
        node_task.save()

        # Start the clock for the task if it wasn't started already
        # (as happens in 1.0.0 agents)
        task.on_start()
    elif event.done:
        node_task.delete()

        status = event.done.exit_status
        if status.success:
            task.mark_stopping()
        else:
            logging.error("task failed. status:%s", status)
            failure = Error(
                code=ErrorCode.TASK_FAILED,
                errors=[
                    "task failed. exit_status:%s" % status,
                    event.done.stdout,
                    event.done.stderr,
                ],
            )
            task.mark_failed(failure)

        node.to_reimage(done=True)
    else:
        # Not reachable while the guard at the top of the function rejects
        # events that are neither running nor done; kept as a safety net.
        raise RequestException(
            Error(
                code=ErrorCode.INVALID_REQUEST,
                errors=["invalid worker event type"],
            )
        )

    task.save()

    TaskEvent(task_id=task_id, machine_id=machine_id, event_data=event).save()
Example #10
0
def main(mytimer: func.TimerRequest,
         dashboard: func.Out[str]) -> None:  # noqa: F841
    """Timer entry point: requeue pending updates and stop expired work.

    Queues an update for every proxy, repro VM, task, job, pool and node whose
    state needs work, queues a stop for expired tasks and jobs, checks each
    proxy's liveness, and finally forwards a pending event (if any) to
    ``dashboard``.  ``mytimer`` is not read.
    """
    for pending_proxy in Proxy.search_states(states=VmState.needs_work()):
        logging.info("requeueing update proxy vm: %s", pending_proxy.region)
        pending_proxy.queue()

    for repro_vm in Repro.search_states(states=VmState.needs_work()):
        logging.info("requeueing update vm: %s", repro_vm.vm_id)
        repro_vm.queue()

    for pending_task in Task.search_states(states=TaskState.needs_work()):
        logging.info("requeueing update task: %s", pending_task.task_id)
        pending_task.queue()

    for pending_job in Job.search_states(states=JobState.needs_work()):
        logging.info("requeueing update job: %s", pending_job.job_id)
        pending_job.queue()

    for pending_pool in Pool.search_states(states=PoolState.needs_work()):
        logging.info("queuing update pool: %s (%s)", pending_pool.pool_id,
                     pending_pool.name)
        pending_pool.queue()

    for pending_node in Node.search_states(states=NodeState.needs_work()):
        logging.info("queuing update node: %s", pending_node.machine_id)
        pending_node.queue()

    for expired_task in Task.search_expired():
        # NOTE(review): this logs the task's job_id, not its task_id.
        logging.info("queuing stop for task: %s", expired_task.job_id)
        expired_task.queue_stop()

    for expired_job in Job.search_expired():
        logging.info("queuing stop for job: %s", expired_job.job_id)
        expired_job.queue_stop()

    # Reminder, proxies are created on-demand.  If something is "wrong" with
    # a proxy, the plan is: delete and recreate it.
    for proxy in Proxy.search():
        if proxy.is_alive():
            proxy.save_proxy_config()
            continue

        logging.error("proxy alive check failed, stopping: %s", proxy.region)
        proxy.state = VmState.stopping
        proxy.save()

    pending_event = get_event()
    if pending_event:
        dashboard.set(pending_event)
Example #11
0
    def get_waiting(self) -> List[str]:
        """List ``"<task type>:<status>"`` for tasks that are still waiting.

        A task that has not started is reported with its state name.  A task
        that has started is re-fetched; it is skipped once it has events and
        reported as ``waiting-for-heartbeat`` otherwise.
        """
        pending = []
        for listed in self.get_running_tasks_checked():
            current = listed
            label = listed.state.name

            if listed.state in TaskState.has_started():
                # Re-fetch the task; the refreshed record is the one whose
                # events are checked and whose type name is reported.
                current = self.onefuzz.tasks.get(listed.task_id)
                if current.events:
                    continue
                label = "waiting-for-heartbeat"

            pending.append(f"{current.config.task.type.name}:{label}")
        return pending
Example #12
0
def on_worker_event_running(machine_id: UUID,
                            event: WorkerRunningEvent) -> Result[None]:
    """Handle a node's report that it has started running a task.

    Args:
        machine_id: Identifier of the reporting node.
        event: The running event, carrying the task id.

    Returns:
        The ``Error`` from the task or node lookup when either fails,
        otherwise ``None``.  When the task is already shutting down, the node
        and node-task records are still saved but the task is left untouched
        and no task event is recorded.
    """
    task = Task.get_by_task_id(event.task_id)
    if isinstance(task, Error):
        return task

    node = get_node(machine_id)
    if isinstance(node, Error):
        return node

    if node.state not in NodeState.ready_for_reset():
        node.state = NodeState.busy
        node.save()

    NodeTasks(
        machine_id=machine_id,
        task_id=event.task_id,
        state=NodeTaskState.running,
    ).save()

    already_stopping = task.state in TaskState.shutting_down()
    if already_stopping:
        logging.info(
            "ignoring task start from node.  machine_id:%s %s:%s (state: %s)",
            machine_id, task.job_id, task.task_id, task.state,
        )
        return None

    logging.info(
        "task started on node.  machine_id:%s %s:%s",
        machine_id, task.job_id, task.task_id,
    )
    task.state = TaskState.running
    task.save()

    # Start the clock for the task if it wasn't started already
    # (as happens in 1.0.0 agents)
    task.on_start()

    TaskEvent(
        task_id=task.task_id,
        machine_id=machine_id,
        event_data=WorkerEvent(running=event),
    ).save()

    return None
Example #13
0
    def get_tasks_by_pool_name(cls, pool_name: str) -> List["Task"]:
        """Return the available tasks whose pool is named ``pool_name``.

        Tasks for which ``get_pool()`` returns a falsy value are skipped.
        """
        candidates = cls.search_states(states=TaskState.available())

        matching = []
        # An empty/falsy search result simply yields an empty list.
        for candidate in candidates or ():
            pool = candidate.get_pool()
            if pool and pool_name == pool.name:
                matching.append(candidate)

        return matching
Example #14
0
    def check_current_job(self) -> Job:
        """Fetch the current job and fail if it, or any task, stopped early.

        Returns:
            The freshly fetched job.

        Raises:
            StoppedEarly: If the job is shutting down, or if any of its tasks
                is listed in a shutting-down state.
        """
        job = self.onefuzz.jobs.get(self.job.job_id)
        if job.state in JobState.shutting_down():
            raise StoppedEarly("job unexpectedly stopped early")

        stopped_tasks = self.onefuzz.tasks.list(
            job_id=self.job.job_id, state=TaskState.shutting_down())

        # One line per stopped task, with its error appended when it has one.
        errors = [
            ("%s: %s" % (stopped.config.task.type, stopped.error))
            if stopped.error else ("%s" % stopped.config.task.type)
            for stopped in stopped_tasks
        ]

        if not errors:
            return job

        raise StoppedEarly("tasks stopped unexpectedly.\n%s" %
                           "\n".join(errors))
Example #15
0
def on_worker_event_running(machine_id: UUID,
                            event: WorkerRunningEvent) -> Result[None]:
    """Handle a node's report that it has started running a task.

    Args:
        machine_id: Identifier of the reporting node.
        event: The running event, carrying the task id.

    Returns:
        The ``Error`` from the task or node lookup when either fails,
        otherwise ``None``.  When the task is already shutting down, the node
        state and node-task record are still updated but the task is left
        untouched and no task event is recorded.
    """
    task = Task.get_by_task_id(event.task_id)
    if isinstance(task, Error):
        return task

    node = get_node(machine_id)
    if isinstance(node, Error):
        return node

    if node.state not in NodeState.ready_for_reset():
        node.set_state(NodeState.busy)

    node_task = NodeTasks(machine_id=machine_id,
                          task_id=event.task_id,
                          state=NodeTaskState.running)
    node_task.save()

    if task.state in TaskState.shutting_down():
        logging.info(
            "ignoring task start from node. "
            "machine_id:%s job_id:%s task_id:%s (state: %s)",
            machine_id,
            task.job_id,
            task.task_id,
            task.state,
        )
        return None

    # The format string previously read "job_id%s" (missing colon), which ran
    # the label into the value and did not match the message above.
    logging.info(
        "task started on node.  machine_id:%s job_id:%s task_id:%s",
        machine_id,
        task.job_id,
        task.task_id,
    )
    task.set_state(TaskState.running)

    task_event = TaskEvent(
        task_id=task.task_id,
        machine_id=machine_id,
        event_data=WorkerEvent(running=event),
    )
    task_event.save()

    return None
Example #16
0
    def mark_failed(self,
                    error: Error,
                    tasks_in_job: Optional[List["Task"]] = None) -> None:
        """Record ``error`` on the task, stop it, and fail its dependants.

        The call is ignored (with a debug log) when the task is already
        shutting down or already has an error recorded.

        Args:
            error: The error to store on the task.
            tasks_in_job: Passed through unchanged to
                ``mark_dependants_failed``.
        """
        job_id, task_id = self.job_id, self.task_id

        if self.state in TaskState.shutting_down():
            logging.debug("ignoring post-task stop failures for %s:%s",
                          job_id, task_id)
            return

        if self.error is not None:
            logging.debug("ignoring additional task error %s:%s", job_id,
                          task_id)
            return

        logging.error("task failed %s:%s - %s", job_id, task_id, error)

        self.error = error
        self.set_state(TaskState.stopping)

        self.mark_dependants_failed(tasks_in_job=tasks_in_job)
Example #17
0
    def get_running_tasks_checked(self) -> List[Task]:
        """Refresh ``self.job`` and return its tasks, failing on early stops.

        Returns:
            Every task listed for the job.

        Raises:
            StoppedEarly: If the job is shutting down, or if any listed task
                is in a shutting-down state.
        """
        self.job = self.onefuzz.jobs.get(self.job.job_id)
        if self.job.state in JobState.shutting_down():
            raise StoppedEarly("job unexpectedly stopped early")

        collected = []
        problems = []
        for entry in self.onefuzz.tasks.list(job_id=self.job.job_id):
            if entry.state in TaskState.shutting_down():
                # Include the task's error in the report when it has one.
                problems.append(
                    ("%s: %s" % (entry.config.task.type, entry.error))
                    if entry.error else ("%s" % entry.config.task.type))
            collected.append(entry)

        if not problems:
            return collected

        raise StoppedEarly("tasks stopped unexpectedly.\n%s" %
                           "\n".join(problems))
Example #18
0
    def stop_if_complete(self, done: bool = False) -> bool:
        """Stop the node when none of its tasks is still active.

        Args:
            done: Forwarded to ``self.stop``.

        Returns:
            ``True`` if the node was stopped, ``False`` if at least one of its
            tasks is not yet shutting down.
        """
        from ..tasks.main import Task

        for entry in NodeTasks.get_by_machine_id(self.machine_id):
            task = Task.get_by_task_id(entry.task_id)
            # Tasks that cannot be looked up (an Error result) are ignored
            # when deciding if the node should be shut down.
            still_active = (not isinstance(task, Error)
                            and task.state not in TaskState.shutting_down())
            if still_active:
                return False

        logging.info(
            "node: stopping busy node with all tasks complete: %s",
            self.machine_id,
        )
        self.stop(done=done)
        return True
Example #19
0
 def search_expired(cls) -> List["Task"]:
     """Search for available-state tasks whose ``end_time`` is before now (UTC)."""
     now = datetime.utcnow().isoformat()
     return cls.search(
         query={"state": TaskState.available()},
         raw_unchecked_filter="end_time lt datetime'%s'" % now,
     )