Esempio n. 1
0
def on_worker_event(machine_id: UUID, event: WorkerEvent) -> func.HttpResponse:
    """Apply a worker "running" or "done" event to the affected task and node.

    Args:
        machine_id: Identifier of the node that reported the event.
        event: Worker event; ``event.running`` or ``event.done`` carries the
            task id and, for ``done``, the exit status and captured output.

    Returns:
        An ``ok`` response wrapping ``BoolResult(result=True)``.

    Raises:
        RequestException: If the event is neither a running nor a done event.
    """
    # Validate the event type before touching any state.  Without this check
    # ``task_id`` would be unbound for an unknown event and the lookup below
    # would fail with UnboundLocalError instead of a request error.
    if event.running:
        task_id = event.running.task_id
    elif event.done:
        task_id = event.done.task_id
    else:
        err = Error(
            code=ErrorCode.INVALID_REQUEST,
            errors=["invalid worker event type"],
        )
        raise RequestException(err)

    task = get_task_checked(task_id)
    node = get_node_checked(machine_id)
    node_task = NodeTasks(machine_id=machine_id,
                          task_id=task_id,
                          state=NodeTaskState.running)

    if event.running:
        # Never move a task or node backwards out of a shutdown/reset state.
        if task.state not in TaskState.shutting_down():
            task.state = TaskState.running
        if node.state not in NodeState.ready_for_reset():
            node.state = NodeState.busy
        node_task.save()
        task.on_start()
    else:
        # ``event.done`` is guaranteed by the validation above.
        #
        # Only record exit status if the task isn't already shutting down.
        #
        # It's ok for the agent to fail because resources vanish out from underneath
        # it during deletion.
        if task.state not in TaskState.shutting_down():
            exit_status = event.done.exit_status

            if not exit_status.success:
                logging.error("task failed: status = %s", exit_status)

                task.error = Error(
                    code=ErrorCode.TASK_FAILED,
                    errors=[
                        "task failed. exit_status = %s" % exit_status,
                        event.done.stdout,
                        event.done.stderr,
                    ],
                )

            task.state = TaskState.stopping
        if node.state not in NodeState.ready_for_reset():
            node.state = NodeState.done
        node_task.delete()

    # Persist the updated task/node and keep a record of the raw event.
    task.save()
    node.save()
    task_event = TaskEvent(task_id=task_id,
                           machine_id=machine_id,
                           event_data=event)
    task_event.save()
    return ok(BoolResult(result=True))
Esempio n. 2
0
    def mark_stopping(self) -> None:
        """Transition this task to ``TaskState.stopping``.

        A task whose state is already one of ``TaskState.shutting_down()`` is
        left untouched; the request is only logged at debug level.
        """
        already_shutting_down = self.state in TaskState.shutting_down()
        if not already_shutting_down:
            self.set_state(TaskState.stopping)
            return

        logging.debug("ignoring post-task stop calls to stop %s:%s",
                      self.job_id, self.task_id)
Esempio n. 3
0
def post(req: func.HttpRequest) -> func.HttpResponse:
    """Answer whether a node may schedule the requested task.

    The request is parsed as a ``CanScheduleRequest``.  Scheduling is allowed
    only when the node reports it can process new work and the task exists and
    is not in a shutting-down state.

    Returns:
        ``not_ok`` if the request cannot be parsed or the node is unknown,
        otherwise ``ok`` wrapping a ``CanSchedule`` with ``allowed`` and
        ``work_stopped`` flags.
    """
    request = parse_request(CanScheduleRequest, req)
    if isinstance(request, Error):
        return not_ok(request, context="CanScheduleRequest")

    node = Node.get_by_machine_id(request.machine_id)
    if not node:
        missing_node = Error(code=ErrorCode.UNABLE_TO_FIND,
                             errors=["unable to find node"])
        return not_ok(missing_node, context=request.machine_id)

    # Ask the node first, then look the task up (same order as the lookups
    # have always been performed).
    node_accepts_work = bool(node.can_process_new_work())

    task = Task.get_by_task_id(request.task_id)
    if isinstance(task, Error):
        # A task that cannot be loaded counts as stopped work.
        work_stopped = True
    else:
        work_stopped = task.state in TaskState.shutting_down()

    allowed = node_accepts_work and not work_stopped
    return ok(CanSchedule(allowed=allowed, work_stopped=work_stopped))
Esempio n. 4
0
    def check_task(self, task_id: UUID,
                   scalesets: List[Scaleset]) -> Optional[str]:
        """Return a failure description for a task, or ``None`` if it looks fine.

        Checks, in order: a scaleset in the task's pool that is not in an
        available state, an error recorded on the task, and the task being in
        a shutting-down state.

        Args:
            task_id: Id of the task to fetch and inspect.
            scalesets: Scalesets to compare against the task's pool.

        Returns:
            A human-readable message for the first problem found, else ``None``.
        """
        task = self.of.tasks.get(task_id)

        # A scaleset serving this task's pool that is no longer available
        # means the task cannot make progress.
        for scaleset in scalesets:
            pool = task.config.pool
            if pool is None or scaleset.pool_name != pool.pool_name:
                continue
            if scaleset.state in scaleset.state.available():
                continue

            job_name = self.jobs[self.tasks[task_id]].config.name
            task_type = task.config.task.type.name
            return "task scaleset failed: %s - %s - %s (%s)" % (
                job_name,
                task_type,
                scaleset.state.name,
                scaleset.error,
            )

        # The task itself recorded an error.
        if task.error is not None:
            job_name = self.jobs[self.tasks[task_id]].config.name
            task_type = task.config.task.type.name
            return "task failed: %s - %s (%s)" % (job_name, task_type,
                                                  task.error)

        # Someone else may have stopped the task.
        if task.state in TaskState.shutting_down():
            job_name = self.jobs[self.tasks[task_id]].config.name
            task_type = task.config.task.type.name
            return "task shutdown early: %s - %s" % (job_name, task_type)

        return None
Esempio n. 5
0
def on_worker_event(machine_id: UUID, event: WorkerEvent) -> None:
    """Apply a worker "running" or "done" event to the affected task and node.

    Args:
        machine_id: Identifier of the node that reported the event.
        event: Worker event; ``event.running`` or ``event.done`` carries the
            task id and, for ``done``, the exit status and captured output.

    Raises:
        NotImplementedError: If the event is neither a running nor a done
            event.
    """
    if event.running:
        task_id = event.running.task_id
    elif event.done:
        task_id = event.done.task_id
    else:
        # Unknown event kinds are rejected here, so the branches below only
        # ever need to handle ``running`` and ``done``.
        raise NotImplementedError

    task = get_task_checked(task_id)
    node = get_node_checked(machine_id)
    node_task = NodeTasks(
        machine_id=machine_id, task_id=task_id, state=NodeTaskState.running
    )

    if event.running:
        # Never move a task or node backwards out of a shutdown/reset state.
        if task.state not in TaskState.shutting_down():
            task.state = TaskState.running
        if node.state not in NodeState.ready_for_reset():
            node.state = NodeState.busy
            node.save()
        node_task.save()

        # Start the clock for the task if it wasn't started already
        # (as happens in 1.0.0 agents)
        task.on_start()
    else:
        # ``event.done`` is guaranteed by the validation above.
        node_task.delete()

        exit_status = event.done.exit_status
        if not exit_status.success:
            logging.error("task failed. status:%s", exit_status)
            task.mark_failed(
                Error(
                    code=ErrorCode.TASK_FAILED,
                    errors=[
                        "task failed. exit_status:%s" % exit_status,
                        event.done.stdout,
                        event.done.stderr,
                    ],
                )
            )
        else:
            task.mark_stopping()

        node.to_reimage(done=True)

    task.save()

    # Keep a record of the raw event alongside the task.
    task_event = TaskEvent(task_id=task_id, machine_id=machine_id, event_data=event)
    task_event.save()
Esempio n. 6
0
    def mark_stopping(self) -> None:
        """Transition this task to ``TaskState.stopping``.

        A task already in one of ``TaskState.shutting_down()`` is left alone
        (logged at debug level).  A task whose state is not in
        ``TaskState.has_started()`` is first reported through ``mark_failed``
        with a "task never started" error, then set to stopping.
        """
        if self.state in TaskState.shutting_down():
            logging.debug("ignoring post-task stop calls to stop %s:%s",
                          self.job_id, self.task_id)
        else:
            never_started = self.state not in TaskState.has_started()
            if never_started:
                failure = Error(code=ErrorCode.TASK_FAILED,
                                errors=["task never started"])
                self.mark_failed(failure)

            # Always requested, even after mark_failed above.
            self.set_state(TaskState.stopping)
Esempio n. 7
0
def on_worker_event_running(machine_id: UUID,
                            event: WorkerRunningEvent) -> Result[None]:
    """Record that a node reported a task as running.

    Args:
        machine_id: Identifier of the node that reported the event.
        event: The running event; ``event.task_id`` names the task.

    Returns:
        The ``Error`` from the task or node lookup if either fails, otherwise
        ``None``.
    """
    task = Task.get_by_task_id(event.task_id)
    if isinstance(task, Error):
        return task

    node = get_node(machine_id)
    if isinstance(node, Error):
        return node

    # Mark the node busy unless it is already waiting for a reset.
    if node.state not in NodeState.ready_for_reset():
        node.state = NodeState.busy
        node.save()

    # The node/task association is recorded even if the task is shutting down.
    NodeTasks(machine_id=machine_id,
              task_id=event.task_id,
              state=NodeTaskState.running).save()

    if task.state not in TaskState.shutting_down():
        logging.info(
            "task started on node.  machine_id:%s %s:%s",
            machine_id,
            task.job_id,
            task.task_id,
        )
        task.state = TaskState.running
        task.save()

        # Start the clock for the task if it wasn't started already
        # (as happens in 1.0.0 agents)
        task.on_start()

        TaskEvent(
            task_id=task.task_id,
            machine_id=machine_id,
            event_data=WorkerEvent(running=event),
        ).save()
        return None

    # A task that is shutting down keeps its state; no task event is stored.
    logging.info(
        "ignoring task start from node.  machine_id:%s %s:%s (state: %s)",
        machine_id,
        task.job_id,
        task.task_id,
        task.state,
    )
    return None
Esempio n. 8
0
    def check_current_job(self) -> Job:
        """Fetch the current job and verify nothing stopped unexpectedly.

        Returns:
            The freshly fetched job.

        Raises:
            StoppedEarly: If the job is in a shutting-down state, or if any of
                its tasks are listed in a shutting-down state.
        """
        job = self.onefuzz.jobs.get(self.job.job_id)
        if job.state in JobState.shutting_down():
            raise StoppedEarly("job unexpectedly stopped early")

        stopped_tasks = self.onefuzz.tasks.list(
            job_id=self.job.job_id, state=TaskState.shutting_down())

        # One line per stopped task; include the task's error when it has one.
        errors = [
            ("%s: %s" % (task.config.task.type, task.error))
            if task.error else ("%s" % task.config.task.type)
            for task in stopped_tasks
        ]

        if not errors:
            return job

        raise StoppedEarly("tasks stopped unexpectedly.\n%s" %
                           "\n".join(errors))
Esempio n. 9
0
def on_worker_event_running(machine_id: UUID,
                            event: WorkerRunningEvent) -> Result[None]:
    """Record that a node reported a task as running.

    Args:
        machine_id: Identifier of the node that reported the event.
        event: The running event; ``event.task_id`` names the task.

    Returns:
        The ``Error`` from the task or node lookup if either fails, otherwise
        ``None``.
    """
    task = Task.get_by_task_id(event.task_id)
    if isinstance(task, Error):
        return task

    node = get_node(machine_id)
    if isinstance(node, Error):
        return node

    # Mark the node busy unless it is already waiting for a reset.
    if node.state not in NodeState.ready_for_reset():
        node.set_state(NodeState.busy)

    # The node/task association is recorded even if the task is shutting down.
    node_task = NodeTasks(machine_id=machine_id,
                          task_id=event.task_id,
                          state=NodeTaskState.running)
    node_task.save()

    if task.state in TaskState.shutting_down():
        # Keep the task's state as-is; no task event is stored in this case.
        logging.info(
            "ignoring task start from node. "
            "machine_id:%s job_id:%s task_id:%s (state: %s)",
            machine_id,
            task.job_id,
            task.task_id,
            task.state,
        )
        return None

    # Same "key:value" layout as the message above (the colon after job_id
    # was previously missing).
    logging.info(
        "task started on node.  machine_id:%s job_id:%s task_id:%s",
        machine_id,
        task.job_id,
        task.task_id,
    )
    task.set_state(TaskState.running)

    task_event = TaskEvent(
        task_id=task.task_id,
        machine_id=machine_id,
        event_data=WorkerEvent(running=event),
    )
    task_event.save()

    return None
Esempio n. 10
0
    def mark_failed(self,
                    error: Error,
                    tasks_in_job: Optional[List["Task"]] = None) -> None:
        """Record a failure on this task and move it to stopping.

        The failure is ignored (debug log only) when the task is already in a
        shutting-down state or already has an error recorded.

        Args:
            error: The error to store on the task.
            tasks_in_job: Passed through to ``mark_dependants_failed``.
        """
        if self.state in TaskState.shutting_down():
            logging.debug("ignoring post-task stop failures for %s:%s",
                          self.job_id, self.task_id)
        elif self.error is not None:
            # Only the first error is kept.
            logging.debug("ignoring additional task error %s:%s", self.job_id,
                          self.task_id)
        else:
            logging.error("task failed %s:%s - %s", self.job_id, self.task_id,
                          error)

            self.error = error
            self.set_state(TaskState.stopping)

            self.mark_dependants_failed(tasks_in_job=tasks_in_job)
Esempio n. 11
0
    def get_running_tasks_checked(self) -> List[Task]:
        """Refresh ``self.job`` and return its tasks, failing on early stops.

        Returns:
            Every task listed for the job.

        Raises:
            StoppedEarly: If the job is in a shutting-down state, or if any of
                its tasks is in a shutting-down state.
        """
        self.job = self.onefuzz.jobs.get(self.job.job_id)
        if self.job.state in JobState.shutting_down():
            raise StoppedEarly("job unexpectedly stopped early")

        tasks = list(self.onefuzz.tasks.list(job_id=self.job.job_id))

        # One line per stopped task; include the task's error when it has one.
        errors = [
            ("%s: %s" % (task.config.task.type, task.error))
            if task.error else ("%s" % task.config.task.type)
            for task in tasks
            if task.state in TaskState.shutting_down()
        ]

        if not errors:
            return tasks

        raise StoppedEarly("tasks stopped unexpectedly.\n%s" %
                           "\n".join(errors))
Esempio n. 12
0
    def stop_if_complete(self, done: bool = False) -> bool:
        """Stop this node when none of its tasks are still active.

        Args:
            done: Forwarded to ``self.stop``.

        Returns:
            ``True`` if the node was stopped, ``False`` if at least one of its
            tasks is not in a shutting-down state.
        """
        from ..tasks.main import Task

        for entry in NodeTasks.get_by_machine_id(self.machine_id):
            candidate = Task.get_by_task_id(entry.task_id)

            # Tasks that cannot be loaded are ignored when deciding whether
            # the node should be shut down.
            still_active = (not isinstance(candidate, Error) and
                            candidate.state not in TaskState.shutting_down())
            if still_active:
                return False

        logging.info(
            "node: stopping busy node with all tasks complete: %s",
            self.machine_id,
        )
        self.stop(done=done)
        return True