Example #1
def on_worker_event(machine_id: UUID, event: WorkerEvent) -> func.HttpResponse:
    if event.running:
        task_id = event.running.task_id
    elif event.done:
        task_id = event.done.task_id
    else:
        # Reject unrecognized event shapes up front; otherwise `task_id`
        # would be unbound below.
        err = Error(
            code=ErrorCode.INVALID_REQUEST,
            errors=["invalid worker event type"],
        )
        raise RequestException(err)

    task = get_task_checked(task_id)
    node = get_node_checked(machine_id)
    node_task = NodeTasks(machine_id=machine_id,
                          task_id=task_id,
                          state=NodeTaskState.running)

    if event.running:
        if task.state not in TaskState.shutting_down():
            task.state = TaskState.running
        if node.state not in NodeState.ready_for_reset():
            node.state = NodeState.busy
        node_task.save()
        task.on_start()
    elif event.done:
        # Only record exit status if the task isn't already shutting down.
        #
        # It's ok for the agent to fail because resources vanish out from underneath
        # it during deletion.
        if task.state not in TaskState.shutting_down():
            exit_status = event.done.exit_status

            if not exit_status.success:
                logging.error("task failed: status = %s", exit_status)

                task.error = Error(
                    code=ErrorCode.TASK_FAILED,
                    errors=[
                        "task failed. exit_status = %s" % exit_status,
                        event.done.stdout,
                        event.done.stderr,
                    ],
                )

            task.state = TaskState.stopping
        if node.state not in NodeState.ready_for_reset():
            node.state = NodeState.done
        node_task.delete()

    task.save()
    node.save()
    task_event = TaskEvent(task_id=task_id,
                           machine_id=machine_id,
                           event_data=event)
    task_event.save()
    return ok(BoolResult(result=True))
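
Both versions of this handler (see also Example #5) dispatch on which optional field of the event is set. The event models themselves are not part of these excerpts; below is a minimal pydantic-style sketch of what they might look like, with field names inferred from the call sites above (`event.running.task_id`, `event.done.exit_status`, and so on).

from typing import Optional
from uuid import UUID

from pydantic import BaseModel


class WorkerRunningEvent(BaseModel):
    task_id: UUID


class ExitStatus(BaseModel):
    # `success` is the only field these handlers read; `code` and `signal`
    # are plausible companions for a process exit status.
    code: Optional[int] = None
    signal: Optional[int] = None
    success: bool


class WorkerDoneEvent(BaseModel):
    task_id: UUID
    exit_status: ExitStatus
    stdout: str
    stderr: str


class WorkerEvent(BaseModel):
    # exactly one of these is expected to be set
    running: Optional[WorkerRunningEvent] = None
    done: Optional[WorkerDoneEvent] = None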
Example #2
def main(mytimer: func.TimerRequest) -> None:  # noqa: F841
    # NOTE: Update pools first, so that scaleset changes caused by pool
    # updates (such as shutdown or resize) happen during this iteration of
    # `timer_worker` rather than the next one.

    pools = Pool.search()
    for pool in pools:
        if pool.state in PoolState.needs_work():
            logging.info("update pool: %s (%s)", pool.pool_id, pool.name)
            process_state_updates(pool)

        if pool.state in PoolState.available() and pool.autoscale:
            autoscale_pool(pool)

    # NOTE: Nodes and scalesets should be processed in a consistent order, so
    # that during 'pool scale down' operations, pools that are scaling down
    # tend to remove nodes from the same scalesets over time. Repeatedly
    # removing from the same scalesets makes it more likely that we end up
    # with empty scalesets, which can safely be deleted.

    Node.mark_outdated_nodes()
    Node.cleanup_busy_nodes_without_work()
    nodes = Node.search_states(states=NodeState.needs_work())
    for node in sorted(nodes, key=lambda x: x.machine_id):
        logging.info("update node: %s", node.machine_id)
        process_state_updates(node)

    scalesets = Scaleset.search()
    for scaleset in sorted(scalesets, key=lambda x: x.scaleset_id):
        process_scaleset(scaleset)
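
`process_state_updates` is assumed by this timer but not defined anywhere in these excerpts. A minimal sketch of one plausible implementation, assuming each state maps to a handler method of the same name on the object:

import logging


def process_state_updates(obj, max_updates: int = 5) -> None:
    # Hypothetical dispatcher: invoke the handler named after the current
    # state (e.g. state `init` -> obj.init()) until the state stops changing
    # or the update budget is exhausted.
    for _ in range(max_updates):
        state = obj.state
        handler = getattr(obj, state.name, None)
        if handler is None:
            return
        logging.info("processing state update: %s - %s",
                     type(obj).__name__, state.name)
        handler()
        if obj.state == state:
            return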
Example #3
    def cleanup_nodes(self) -> bool:
        if self.state == ScalesetState.halt:
            logging.info("halting scaleset: %s", self.scaleset_id)
            self.halt()
            return True

        to_reimage = []
        to_delete = []

        nodes = Node.search_states(
            scaleset_id=self.scaleset_id, states=NodeState.ready_for_reset()
        )

        if not nodes:
            logging.info("no nodes need updating: %s", self.scaleset_id)
            return False

        # ground truth of existing nodes
        azure_nodes = list_instance_ids(self.scaleset_id)

        for node in nodes:
            if node.machine_id not in azure_nodes:
                logging.info(
                    "no longer in scaleset: %s:%s", self.scaleset_id, node.machine_id
                )
                node.delete()
            elif node.delete_requested:
                to_delete.append(node)
            else:
                if ScalesetShrinkQueue(self.scaleset_id).should_shrink():
                    node.set_halt()
                    to_delete.append(node)
                elif not node.reimage_queued:
                    # only add nodes that are not already set to reschedule
                    to_reimage.append(node)

        dead_nodes = Node.get_dead_nodes(self.scaleset_id, NODE_EXPIRATION_TIME)
        for node in dead_nodes:
            node.set_halt()
            to_reimage.append(node)

        # Perform operations until they fail due to scaleset getting locked
        try:
            if to_delete:
                logging.info(
                    "deleting nodes: %s - count: %d", self.scaleset_id, len(to_delete)
                )
                self.delete_nodes(to_delete)
                for node in to_delete:
                    node.set_halt()
                    node.state = NodeState.halt
                    node.save()

            if to_reimage:
                self.reimage_nodes(to_reimage)
        except UnableToUpdate:
            logging.info("scaleset update already in progress: %s", self.scaleset_id)

        return True
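
`NodeState.ready_for_reset()` recurs throughout these examples but is never defined in them. A sketch of the enum, with the membership of `ready_for_reset` inferred from how the states are used above (`done` nodes get reimaged, while `shutdown` and `halt` nodes get deleted):

from enum import Enum
from typing import List


class NodeState(Enum):
    init = "init"
    free = "free"
    setting_up = "setting_up"
    rebooting = "rebooting"
    ready = "ready"
    busy = "busy"
    done = "done"
    shutdown = "shutdown"
    halt = "halt"

    @classmethod
    def ready_for_reset(cls) -> List["NodeState"]:
        # a node in one of these states has finished (or abandoned) its work
        # and is waiting to be reimaged or deleted
        return [cls.done, cls.shutdown, cls.halt]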
Example #4
    def to_reimage(self, done: bool = False) -> None:
        if done:
            if self.state not in NodeState.ready_for_reset():
                self.state = NodeState.done

        if not self.reimage_requested and not self.delete_requested:
            logging.info("setting reimage_requested: %s", self.machine_id)
            self.reimage_requested = True
        self.save()
Example #5
def on_worker_event(machine_id: UUID, event: WorkerEvent) -> None:
    if event.running:
        task_id = event.running.task_id
    elif event.done:
        task_id = event.done.task_id
    else:
        raise NotImplementedError

    task = get_task_checked(task_id)
    node = get_node_checked(machine_id)
    node_task = NodeTasks(
        machine_id=machine_id, task_id=task_id, state=NodeTaskState.running
    )

    if event.running:
        if task.state not in TaskState.shutting_down():
            task.state = TaskState.running
        if node.state not in NodeState.ready_for_reset():
            node.state = NodeState.busy
            node.save()
        node_task.save()

        # Start the clock for the task if it wasn't started already
        # (as happens in 1.0.0 agents)
        task.on_start()
    elif event.done:
        node_task.delete()

        exit_status = event.done.exit_status
        if not exit_status.success:
            logging.error("task failed. status:%s", exit_status)
            task.mark_failed(
                Error(
                    code=ErrorCode.TASK_FAILED,
                    errors=[
                        "task failed. exit_status:%s" % exit_status,
                        event.done.stdout,
                        event.done.stderr,
                    ],
                )
            )
        else:
            task.mark_stopping()

        node.to_reimage(done=True)
    else:
        err = Error(
            code=ErrorCode.INVALID_REQUEST,
            errors=["invalid worker event type"],
        )
        raise RequestException(err)

    task.save()

    task_event = TaskEvent(task_id=task_id, machine_id=machine_id, event_data=event)
    task_event.save()
Example #6
def main(mytimer: func.TimerRequest,
         dashboard: func.Out[str]) -> None:  # noqa: F841
    proxies = Proxy.search_states(states=VmState.needs_work())
    for proxy in proxies:
        logging.info("requeueing update proxy vm: %s", proxy.region)
        proxy.queue()

    vms = Repro.search_states(states=VmState.needs_work())
    for vm in vms:
        logging.info("requeueing update vm: %s", vm.vm_id)
        vm.queue()

    tasks = Task.search_states(states=TaskState.needs_work())
    for task in tasks:
        logging.info("requeueing update task: %s", task.task_id)
        task.queue()

    jobs = Job.search_states(states=JobState.needs_work())
    for job in jobs:
        logging.info("requeueing update job: %s", job.job_id)
        job.queue()

    pools = Pool.search_states(states=PoolState.needs_work())
    for pool in pools:
        logging.info("queuing update pool: %s (%s)", pool.pool_id, pool.name)
        pool.queue()

    nodes = Node.search_states(states=NodeState.needs_work())
    for node in nodes:
        logging.info("queuing update node: %s", node.machine_id)
        node.queue()

    expired_tasks = Task.search_expired()
    for task in expired_tasks:
        logging.info("queuing stop for task: %s", task.job_id)
        task.queue_stop()

    expired_jobs = Job.search_expired()
    for job in expired_jobs:
        logging.info("queuing stop for job: %s", job.job_id)
        job.queue_stop()

    # Reminder, proxies are created on-demand.  If something is "wrong" with
    # a proxy, the plan is: delete and recreate it.
    for proxy in Proxy.search():
        if not proxy.is_alive():
            logging.error("proxy alive check failed, stopping: %s",
                          proxy.region)
            proxy.state = VmState.stopping
            proxy.save()
        else:
            proxy.save_proxy_config()

    event = get_event()
    if event:
        dashboard.set(event)
Example #7
    def cleanup_nodes(self) -> bool:
        if self.state == ScalesetState.halt:
            self.halt()
            return True

        nodes = Node.search_states(scaleset_id=self.scaleset_id,
                                   states=NodeState.ready_for_reset())

        outdated = Node.search_outdated(
            scaleset_id=self.scaleset_id,
            states=[NodeState.free],
        )

        if not (nodes or outdated):
            logging.debug("scaleset node gc done (no nodes) %s",
                          self.scaleset_id)
            return False

        to_delete = []
        to_reimage = []

        for node in outdated:
            if node.version == "1.0.0":
                to_reimage.append(node)
            else:
                stop_message = NodeMessage(
                    agent_id=node.machine_id,
                    message=NodeCommand(stop=StopNodeCommand()),
                )
                stop_message.save()

        for node in nodes:
            # delete nodes that are not waiting on the scaleset GC
            if not node.scaleset_node_exists():
                node.delete()
            elif node.state in [NodeState.shutdown, NodeState.halt]:
                to_delete.append(node)
            else:
                to_reimage.append(node)

        # Perform operations until they fail due to scaleset getting locked
        try:
            if to_delete:
                self.delete_nodes(to_delete)
                for node in to_delete:
                    node.state = NodeState.halt
                    node.save()

            if to_reimage:
                self.reimage_nodes(to_reimage)
        except UnableToUpdate:
            logging.info("scaleset update already in progress: %s",
                         self.scaleset_id)
        return True
Example #8
def on_state_update(machine_id: UUID, state: NodeState) -> func.HttpResponse:
    node = get_node_checked(machine_id)

    if (state == NodeState.init
            or node.state not in NodeState.ready_for_reset()):
        if node.state != state:
            node.state = state
            node.save()
    else:
        logging.info("ignoring state updates from the node: %s: %s",
                     machine_id, state)

    return ok(BoolResult(result=True))
Example #9
    @classmethod
    def stop_task(cls, task_id: UUID) -> None:
        # For now, this just re-images the node.  Eventually, this
        # should send a message to the node to let the agent shut down
        # gracefully.
        nodes = NodeTasks.get_nodes_by_task_id(task_id)
        for node in nodes:
            if node.state not in NodeState.ready_for_reset():
                logging.info(
                    "stopping machine_id:%s running task:%s",
                    node.machine_id,
                    task_id,
                )
                node.stop()
Example #10
    def to_reimage(self, done: bool = False) -> None:
        if done:
            if self.state not in NodeState.ready_for_reset():
                self.state = NodeState.done

        if not self.reimage_requested and not self.delete_requested:
            logging.info("setting reimage_requested: %s", self.machine_id)
            self.reimage_requested = True

        # if we're going to reimage, make sure the node doesn't pick up new work
        # too.
        self.send_stop_if_free()

        self.save()
Example #11
def on_worker_event_running(machine_id: UUID,
                            event: WorkerRunningEvent) -> Result[None]:
    task = Task.get_by_task_id(event.task_id)
    if isinstance(task, Error):
        return task

    node = get_node(machine_id)
    if isinstance(node, Error):
        return node

    if node.state not in NodeState.ready_for_reset():
        node.state = NodeState.busy
        node.save()

    node_task = NodeTasks(machine_id=machine_id,
                          task_id=event.task_id,
                          state=NodeTaskState.running)
    node_task.save()

    if task.state in TaskState.shutting_down():
        logging.info(
            "ignoring task start from node.  machine_id:%s %s:%s (state: %s)",
            machine_id,
            task.job_id,
            task.task_id,
            task.state,
        )
        return None

    logging.info(
        "task started on node.  machine_id:%s %s:%s",
        machine_id,
        task.job_id,
        task.task_id,
    )
    task.state = TaskState.running
    task.save()

    # Start the clock for the task if it wasn't started already
    # (as happens in 1.0.0 agents)
    task.on_start()

    task_event = TaskEvent(
        task_id=task.task_id,
        machine_id=machine_id,
        event_data=WorkerEvent(running=event),
    )
    task_event.save()

    return None
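
The `Result[None]` return type here signals an errors-as-values convention: handlers return an `Error` instead of raising, and callers test with `isinstance(..., Error)`. A minimal sketch of the supporting types (the numeric `ErrorCode` values are placeholders; only the names appear in these excerpts):

from enum import Enum
from typing import List, TypeVar, Union

from pydantic import BaseModel


class ErrorCode(Enum):
    INVALID_REQUEST = 450  # placeholder value
    TASK_FAILED = 468  # placeholder value


class Error(BaseModel):
    code: ErrorCode
    errors: List[str]


A = TypeVar("A")
Result = Union[A, Error]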
Example #12
def on_state_update(
    machine_id: UUID,
    state_update: NodeStateUpdate,
) -> func.HttpResponse:
    state = state_update.state
    node = get_node_checked(machine_id)

    if (state == NodeState.init
            or node.state not in NodeState.ready_for_reset()):
        if node.state != state:
            node.state = state
            node.save()

            if state == NodeState.setting_up:
                # This field will be required in the future.
                # For now, it is optional for back compat.
                if state_update.data:
                    for task_id in state_update.data.tasks:
                        task = get_task_checked(task_id)

                        # The task state may be `running` if it has `vm_count` > 1, and
                        # another node is concurrently executing the task. If so, leave
                        # the state as-is, to represent the max progress made.
                        #
                        # Other states we would want to preserve are excluded by the
                        # outermost conditional check.
                        if task.state != TaskState.running:
                            task.state = TaskState.setting_up

                        # We don't yet call `on_start()` for the task.
                        # This will happen once we see a worker event that
                        # reports it as `running`.
                        task.save()

                        # Note: we set the node task state to `setting_up`, even though
                        # the task itself may be `running`.
                        node_task = NodeTasks(
                            machine_id=machine_id,
                            task_id=task_id,
                            state=NodeTaskState.setting_up,
                        )
                        node_task.save()
    else:
        logging.info("ignoring state updates from the node: %s: %s",
                     machine_id, state)

    return ok(BoolResult(result=True))
Example #13
def on_worker_event_running(machine_id: UUID,
                            event: WorkerRunningEvent) -> Result[None]:
    task = Task.get_by_task_id(event.task_id)
    if isinstance(task, Error):
        return task

    node = get_node(machine_id)
    if isinstance(node, Error):
        return node

    if node.state not in NodeState.ready_for_reset():
        node.set_state(NodeState.busy)

    node_task = NodeTasks(machine_id=machine_id,
                          task_id=event.task_id,
                          state=NodeTaskState.running)
    node_task.save()

    if task.state in TaskState.shutting_down():
        logging.info(
            "ignoring task start from node. "
            "machine_id:%s job_id:%s task_id:%s (state: %s)",
            machine_id,
            task.job_id,
            task.task_id,
            task.state,
        )
        return None

    logging.info(
        "task started on node.  machine_id:%s job_id%s task_id:%s",
        machine_id,
        task.job_id,
        task.task_id,
    )
    task.set_state(TaskState.running)

    task_event = TaskEvent(
        task_id=task.task_id,
        machine_id=machine_id,
        event_data=WorkerEvent(running=event),
    )
    task_event.save()

    return None
Example #14
    def can_process_new_work(self) -> bool:
        if self.is_outdated():
            logging.info(
                "can_schedule old version machine_id:%s version:%s",
                self.machine_id,
                self.version,
            )
            self.stop()
            return False

        if self.state in NodeState.ready_for_reset():
            logging.info("can_schedule node is set for reset.  machine_id:%s",
                         self.machine_id)
            return False

        if self.delete_requested:
            logging.info(
                "can_schedule is set to be deleted.  machine_id:%s",
                self.machine_id,
            )
            self.stop()
            return False

        if self.reimage_requested:
            logging.info(
                "can_schedule is set to be reimaged.  machine_id:%s",
                self.machine_id,
            )
            self.stop()
            return False

        if self.could_shrink_scaleset():
            self.set_halt()
            logging.info("node scheduled to shrink.  machine_id:%s",
                         self.machine_id)
            return False

        return True
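
Note that `can_process_new_work` is not a pure predicate: as the body shows, ineligible nodes are stopped or halted as a side effect of being checked. A hypothetical caller, sketched to show the intended usage (`send_message` is an assumption, not part of these excerpts):

def schedule_work_set(nodes, work_set) -> bool:
    # hand the work set to the first node that is still eligible; checking
    # eligibility doubles as cleanup for nodes that should stop
    for node in nodes:
        if not node.can_process_new_work():
            continue
        node.send_message(work_set)
        return True
    return False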
Example #15
def main(mytimer: func.TimerRequest,
         dashboard: func.Out[str]) -> None:  # noqa: F841
    Node.mark_outdated_nodes()
    nodes = Node.search_states(states=NodeState.needs_work())
    for node in nodes:
        logging.info("update node: %s", node.machine_id)
        process_state_updates(node)

    scalesets = Scaleset.search()
    for scaleset in scalesets:
        process_scaleset(scaleset)

    pools = Pool.search()
    for pool in pools:
        if pool.state in PoolState.needs_work():
            logging.info("update pool: %s (%s)", pool.pool_id, pool.name)
            process_state_updates(pool)
        elif pool.state in PoolState.available() and pool.autoscale:
            autoscale_pool(pool)

    event = get_event()
    if event:
        dashboard.set(event)
Example #16
    def cleanup_nodes(self) -> bool:
        if self.state == ScalesetState.halt:
            self.halt()
            return True

        nodes = Node.search_states(
            scaleset_id=self.scaleset_id, states=NodeState.ready_for_reset()
        )
        if not nodes:
            logging.debug("scaleset node gc done (no nodes) %s", self.scaleset_id)
            return False

        to_delete = []
        to_reimage = []

        for node in nodes:
            # delete nodes that are not waiting on the scaleset GC
            if not node.scaleset_node_exists():
                node.delete()
            elif node.state in [NodeState.shutdown, NodeState.halt]:
                to_delete.append(node)
            else:
                to_reimage.append(node)

        # Perform operations until they fail due to scaleset getting locked
        try:
            if to_delete:
                self.delete_nodes(to_delete)
                for node in to_delete:
                    node.state = NodeState.halt
                    node.save()

            if to_reimage:
                self.reimage_nodes(to_reimage)
        except UnableToUpdate:
            logging.info("scaleset update already in progress: %s", self.scaleset_id)
        return True
Example #17
    def cleanup_nodes(self) -> bool:
        from .pools import Pool

        logging.info(SCALESET_LOG_PREFIX + "cleaning up nodes. scaleset_id:%s",
                     self.scaleset_id)
        if self.state == ScalesetState.halt:
            logging.info(
                SCALESET_LOG_PREFIX + "halting scaleset scaleset_id:%s",
                self.scaleset_id,
            )
            self.halt()
            return True

        pool = Pool.get_by_name(self.pool_name)
        if isinstance(pool, Error):
            logging.error(
                "unable to find pool during cleanup: %s - %s",
                self.scaleset_id,
                pool,
            )
            self.set_failed(pool)
            return True

        Node.reimage_long_lived_nodes(self.scaleset_id)

        to_reimage = []
        to_delete = []

        # ground truth of existing nodes
        azure_nodes = list_instance_ids(self.scaleset_id)

        nodes = Node.search_states(scaleset_id=self.scaleset_id)

        # Delete table entries for nodes that are no longer in the scaleset,
        # e.g. due to an unknown failure.
        for node in nodes:
            if node.machine_id not in azure_nodes:
                logging.info(
                    SCALESET_LOG_PREFIX +
                    "no longer in scaleset. scaleset_id:%s machine_id:%s",
                    self.scaleset_id,
                    node.machine_id,
                )
                node.delete()

        # Scalesets can have nodes that never check in (such as broken OS setup
        # scripts).
        #
        # This will add nodes that Azure knows about but have not checked in
        # such that the `dead node` detection will eventually reimage the node.
        #
        # NOTE: If node setup takes longer than NODE_EXPIRATION_TIME (1 hour),
        # this will cause the nodes to continuously get reimaged.
        node_machine_ids = [x.machine_id for x in nodes]
        for machine_id in azure_nodes:
            if machine_id in node_machine_ids:
                continue

            logging.info(
                SCALESET_LOG_PREFIX +
                "adding missing azure node. scaleset_id:%s machine_id:%s",
                self.scaleset_id,
                machine_id,
            )

            # Note: using `new=True` makes it so that if a node has already
            # checked in, this won't overwrite it.
            Node.create(
                pool_id=pool.pool_id,
                pool_name=self.pool_name,
                machine_id=machine_id,
                scaleset_id=self.scaleset_id,
                version=__version__,
                new=True,
            )

        existing_nodes = [x for x in nodes if x.machine_id in azure_nodes]
        nodes_to_reset = [
            x for x in existing_nodes
            if x.state in NodeState.ready_for_reset()
        ]

        for node in nodes_to_reset:
            if node.delete_requested:
                to_delete.append(node)
            else:
                if ShrinkQueue(self.scaleset_id).should_shrink():
                    node.set_halt()
                    to_delete.append(node)
                elif ShrinkQueue(pool.pool_id).should_shrink():
                    node.set_halt()
                    to_delete.append(node)
                else:
                    to_reimage.append(node)

        dead_nodes = Node.get_dead_nodes(self.scaleset_id,
                                         NODE_EXPIRATION_TIME)
        if dead_nodes:
            logging.info(
                SCALESET_LOG_PREFIX +
                "reimaging uninitialized nodes or nodes with expired heartbeats. "
                + "scaleset_id:%s nodes:%s",
                self.scaleset_id,
                ",".join(str(x.machine_id) for x in dead_nodes),
            )
            for node in dead_nodes:
                if node.heartbeat:
                    error_message = "node reimaged due to expired heartbeat"
                else:
                    error_message = "node reimaged due to never receiving a heartbeat"
                error = Error(
                    code=ErrorCode.TASK_FAILED,
                    errors=[
                        error_message,
                        f"scaleset_id:{node.scaleset_id} machine_id:{node.machine_id}",
                        f"last heartbeat:{node.heartbeat}",
                    ],
                )
                node.mark_tasks_stopped_early(error=error)
                node.to_reimage(done=True)
                if node not in to_reimage:
                    to_reimage.append(node)

        # Perform operations until they fail due to scaleset getting locked
        try:
            strategy_str = os.getenv("ONEFUZZ_NODE_DISPOSAL_STRATEGY",
                                     "scale_in")
            if strategy_str == "decomission":
                strategy = NodeDisaposalStrategy.decomission
            else:
                strategy = NodeDisaposalStrategy.scale_in
            self.reimage_nodes(to_reimage, strategy)
            self.delete_nodes(to_delete, strategy)
        except UnableToUpdate:
            logging.info(
                SCALESET_LOG_PREFIX +
                "scaleset update already in progress: scaleset_id:%s",
                self.scaleset_id,
            )

        return bool(to_reimage) or bool(to_delete)
Example #18
    def cleanup_nodes(self) -> bool:
        if self.state == ScalesetState.halt:
            logging.info(
                SCALESET_LOG_PREFIX + "halting scaleset scaleset_id:%s",
                self.scaleset_id,
            )
            self.halt()
            return True

        Node.reimage_long_lived_nodes(self.scaleset_id)

        to_reimage = []
        to_delete = []

        # ground truth of existing nodes
        azure_nodes = list_instance_ids(self.scaleset_id)

        nodes = Node.search_states(scaleset_id=self.scaleset_id)

        # Delete table entries for nodes that are no longer in the scaleset,
        # e.g. due to an unknown failure.
        for node in nodes:
            if node.machine_id not in azure_nodes:
                logging.info(
                    SCALESET_LOG_PREFIX +
                    "no longer in scaleset. scaleset_id:%s machine_id:%s",
                    self.scaleset_id,
                    node.machine_id,
                )
                node.delete()

        # Scalesets can have nodes that never check in (such as broken OS setup
        # scripts).
        #
        # This will add nodes that Azure knows about but have not checked in
        # such that the `dead node` detection will eventually reimage the node.
        #
        # NOTE: If node setup takes longer than NODE_EXPIRATION_TIME (1 hour),
        # this will cause the nodes to continuously get reimaged.
        node_machine_ids = [x.machine_id for x in nodes]
        for machine_id in azure_nodes:
            if machine_id in node_machine_ids:
                continue

            logging.info(
                SCALESET_LOG_PREFIX +
                "adding missing azure node. scaleset_id:%s machine_id:%s",
                self.scaleset_id,
                machine_id,
            )

            # Note: using `new=True` makes it so that if a node has already
            # checked in, this won't overwrite it.
            Node.create(
                pool_name=self.pool_name,
                machine_id=machine_id,
                scaleset_id=self.scaleset_id,
                version=__version__,
                new=True,
            )

        existing_nodes = [x for x in nodes if x.machine_id in azure_nodes]
        nodes_to_reset = [
            x for x in existing_nodes
            if x.state in NodeState.ready_for_reset()
        ]

        for node in nodes_to_reset:
            if node.delete_requested:
                to_delete.append(node)
            else:
                if ScalesetShrinkQueue(self.scaleset_id).should_shrink():
                    node.set_halt()
                    to_delete.append(node)
                elif not node.reimage_queued:
                    # only add nodes that are not already set to reschedule
                    to_reimage.append(node)

        dead_nodes = Node.get_dead_nodes(self.scaleset_id,
                                         NODE_EXPIRATION_TIME)
        for node in dead_nodes:
            node.set_halt()
            to_reimage.append(node)

        # Perform operations until they fail due to scaleset getting locked
        try:
            if to_delete:
                logging.info(
                    SCALESET_LOG_PREFIX +
                    "deleting nodes. scaleset_id:%s count:%d",
                    self.scaleset_id,
                    len(to_delete),
                )
                self.delete_nodes(to_delete)
                for node in to_delete:
                    node.set_halt()

            if to_reimage:
                logging.info(
                    SCALESET_LOG_PREFIX +
                    "reimaging nodes: scaleset_id:%s count:%d",
                    self.scaleset_id,
                    len(to_reimage),
                )
                self.reimage_nodes(to_reimage)
        except UnableToUpdate:
            logging.info(
                SCALESET_LOG_PREFIX +
                "scaleset update already in progress: scaleset_id:%s",
                self.scaleset_id,
            )

        return bool(to_reimage) or bool(to_delete)
Example #19
def on_state_update(
    machine_id: UUID,
    state_update: NodeStateUpdate,
) -> None:
    state = state_update.state
    node = get_node_checked(machine_id)

    if state == NodeState.free:
        if node.reimage_requested or node.delete_requested:
            logging.info("stopping free node with reset flags: %s", node.machine_id)
            node.stop()
            return

        if node.could_shrink_scaleset():
            logging.info("stopping free node to resize scaleset: %s", node.machine_id)
            node.set_halt()
            return

    if state == NodeState.init:
        if node.delete_requested:
            node.stop()
            return
        node.reimage_requested = False
        node.save()
    elif node.state not in NodeState.ready_for_reset():
        if node.state != state:
            node.state = state
            node.save()

            if state == NodeState.setting_up:
                # Model-validated.
                #
                # This field will be required in the future.
                # For now, it is optional for back compat.
                setting_up_data = cast(
                    Optional[NodeSettingUpEventData],
                    state_update.data,
                )

                if setting_up_data:
                    for task_id in setting_up_data.tasks:
                        task = get_task_checked(task_id)

                        # The task state may be `running` if it has `vm_count` > 1, and
                        # another node is concurrently executing the task. If so, leave
                        # the state as-is, to represent the max progress made.
                        #
                        # Other states we would want to preserve are excluded by the
                        # outermost conditional check.
                        if task.state != TaskState.running:
                            task.state = TaskState.setting_up

                        task.on_start()
                        task.save()

                        # Note: we set the node task state to `setting_up`, even though
                        # the task itself may be `running`.
                        node_task = NodeTasks(
                            machine_id=machine_id,
                            task_id=task_id,
                            state=NodeTaskState.setting_up,
                        )
                        node_task.save()
            elif state == NodeState.done:
                # if tasks are running on the node when it reports as Done,
                # those are stopped early
                node.mark_tasks_stopped_early()

                # Model-validated.
                #
                # This field will be required in the future.
                # For now, it is optional for back compat.
                done_data = cast(Optional[NodeDoneEventData], state_update.data)
                if done_data:
                    # TODO: do something with this done data
                    if done_data.error:
                        logging.error(
                            "node 'done' with error: machine_id:%s, data:%s",
                            machine_id,
                            done_data,
                        )
    else:
        logging.info("ignoring state updates from the node: %s: %s", machine_id, state)
Example #20
    def can_process_new_work(self) -> bool:
        from .pools import Pool
        from .scalesets import Scaleset

        if (self.is_outdated()
                and os.environ.get("ONEFUZZ_ALLOW_OUTDATED_AGENT") != "true"):
            logging.info(
                "can_process_new_work agent and service versions differ, "
                "stopping node. "
                "machine_id:%s agent_version:%s service_version: %s",
                self.machine_id,
                self.version,
                __version__,
            )
            self.stop(done=True)
            return False

        if self.is_too_old():
            logging.info(
                "can_process_new_work node is too old.  machine_id:%s",
                self.machine_id)
            self.stop(done=True)
            return False

        if self.state not in NodeState.can_process_new_work():
            logging.info(
                "can_process_new_work node not in appropriate state for new work"
                "machine_id:%s state:%s",
                self.machine_id,
                self.state.name,
            )
            return False

        if self.state in NodeState.ready_for_reset():
            logging.info(
                "can_process_new_work node is set for reset.  machine_id:%s",
                self.machine_id,
            )
            return False

        if self.delete_requested:
            logging.info(
                "can_process_new_work is set to be deleted.  machine_id:%s",
                self.machine_id,
            )
            self.stop(done=True)
            return False

        if self.reimage_requested:
            logging.info(
                "can_process_new_work is set to be reimaged.  machine_id:%s",
                self.machine_id,
            )
            self.stop(done=True)
            return False

        if self.could_shrink_scaleset():
            logging.info(
                "can_process_new_work node scheduled to shrink.  machine_id:%s",
                self.machine_id,
            )
            self.set_halt()
            return False

        if self.scaleset_id:
            scaleset = Scaleset.get_by_id(self.scaleset_id)
            if isinstance(scaleset, Error):
                logging.info(
                    "can_process_new_work invalid scaleset.  "
                    "scaleset_id:%s machine_id:%s",
                    self.scaleset_id,
                    self.machine_id,
                )
                return False

            if scaleset.state not in ScalesetState.available():
                logging.info(
                    "can_process_new_work scaleset not available for work. "
                    "scaleset_id:%s machine_id:%s",
                    self.scaleset_id,
                    self.machine_id,
                )
                return False

        pool = Pool.get_by_name(self.pool_name)
        if isinstance(pool, Error):
            logging.info(
                "can_schedule - invalid pool. "
                "pool_name:%s machine_id:%s",
                self.pool_name,
                self.machine_id,
            )
            return False
        if pool.state not in PoolState.available():
            logging.info(
                "can_schedule - pool is not available for work. "
                "pool_name:%s machine_id:%s",
                self.pool_name,
                self.machine_id,
            )
            return False

        return True
Example #21
    def can_process_new_work(self) -> bool:
        from .pools import Pool
        from .scalesets import Scaleset

        if self.is_outdated():
            logging.info(
                "can_schedule agent and service versions differ, stopping node. "
                "machine_id:%s agent_version:%s service_version: %s",
                self.machine_id,
                self.version,
                __version__,
            )
            self.stop()
            return False

        if self.state in NodeState.ready_for_reset():
            logging.info("can_schedule node is set for reset.  machine_id:%s",
                         self.machine_id)
            return False

        if self.delete_requested:
            logging.info(
                "can_schedule is set to be deleted.  machine_id:%s",
                self.machine_id,
            )
            self.stop()
            return False

        if self.reimage_requested:
            logging.info(
                "can_schedule is set to be reimaged.  machine_id:%s",
                self.machine_id,
            )
            self.stop()
            return False

        if self.could_shrink_scaleset():
            self.set_halt()
            logging.info("node scheduled to shrink.  machine_id:%s",
                         self.machine_id)
            return False

        if self.scaleset_id:
            scaleset = Scaleset.get_by_id(self.scaleset_id)
            if isinstance(scaleset, Error):
                logging.info(
                    "can_schedule - invalid scaleset.  scaleset_id:%s machine_id:%s",
                    self.scaleset_id,
                    self.machine_id,
                )
                return False

            if scaleset.state not in ScalesetState.available():
                logging.info(
                    "can_schedule - scaleset not available for work. "
                    "scaleset_id:%s machine_id:%s",
                    self.scaleset_id,
                    self.machine_id,
                )
                return False

        pool = Pool.get_by_name(self.pool_name)
        if isinstance(pool, Error):
            logging.info(
                "can_schedule - invalid pool. "
                "pool_name:%s machine_id:%s",
                self.pool_name,
                self.machine_id,
            )
            return False
        if pool.state not in PoolState.available():
            logging.info(
                "can_schedule - pool is not available for work. "
                "pool_name:%s machine_id:%s",
                self.pool_name,
                self.machine_id,
            )
            return False

        return True
Example #22
    def cleanup_nodes(self) -> bool:
        if self.state == ScalesetState.halt:
            logging.info("halting scaleset: %s", self.scaleset_id)
            self.halt()
            return True

        to_reimage = []
        to_delete = []

        outdated = Node.search_outdated(scaleset_id=self.scaleset_id)
        for node in outdated:
            logging.info(
                "node is outdated: %s - node_version:%s api_version:%s",
                node.machine_id,
                node.version,
                __version__,
            )
            if node.version == "1.0.0":
                node.state = NodeState.done
                to_reimage.append(node)
            else:
                node.to_reimage()

        nodes = Node.search_states(
            scaleset_id=self.scaleset_id, states=NodeState.ready_for_reset()
        )

        if not outdated and not nodes:
            logging.info("no nodes need updating: %s", self.scaleset_id)
            return False

        # ground truth of existing nodes
        azure_nodes = list_instance_ids(self.scaleset_id)

        for node in nodes:
            if node.machine_id not in azure_nodes:
                logging.info(
                    "no longer in scaleset: %s:%s", self.scaleset_id, node.machine_id
                )
                node.delete()
            elif node.delete_requested:
                to_delete.append(node)
            else:
                if ScalesetShrinkQueue(self.scaleset_id).should_shrink():
                    node.set_halt()
                    to_delete.append(node)
                else:
                    to_reimage.append(node)

        # Perform operations until they fail due to scaleset getting locked
        try:
            if to_delete:
                logging.info(
                    "deleting nodes: %s - count: %d", self.scaleset_id, len(to_delete)
                )
                self.delete_nodes(to_delete)
                for node in to_delete:
                    node.set_halt()
                    node.state = NodeState.halt
                    node.save()

            if to_reimage:
                self.reimage_nodes(to_reimage)
        except UnableToUpdate:
            logging.info("scaleset update already in progress: %s", self.scaleset_id)

        return True
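
All of the `cleanup_nodes` variants above return a bool, which the timers in Examples #2 and #15 consume via `process_scaleset`. A plausible sketch of that driver, assuming a `ScalesetState.needs_work()` helper mirroring the pool handling and the `process_state_updates` dispatcher sketched after Example #2:

import logging


def process_scaleset(scaleset) -> None:
    logging.debug("checking scaleset for updates: %s", scaleset.scaleset_id)

    # cleanup_nodes returns True when it found work to do (including halting
    # the scaleset); in that case, skip state processing for this iteration
    if scaleset.cleanup_nodes():
        logging.debug("scaleset needed cleanup: %s", scaleset.scaleset_id)
        return

    if scaleset.state in ScalesetState.needs_work():
        process_state_updates(scaleset)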