Ejemplo n.º 1
0
def main(mytimer: func.TimerRequest) -> None:  # noqa: F841
    """Run one timer pass over pools, then nodes, then scalesets.

    ``mytimer`` is accepted as the timer trigger argument but is not read
    by the body.
    """
    # NOTE: Pools are handled first so that scalesets impacted by a pool
    # update (such as shutdown or resize) are dealt with during this same
    # `timer_worker` iteration rather than the following one.
    for pool in Pool.search():
        if pool.state in PoolState.needs_work():
            logging.info("update pool: %s (%s)", pool.pool_id, pool.name)
            process_state_updates(pool)

        # Checked independently of the branch above (not an `else`), so a
        # pool that was just updated is still considered for autoscaling.
        if pool.state in PoolState.available() and pool.autoscale:
            autoscale_pool(pool)

    # NOTE: Nodes and scalesets are processed in a consistent (sorted)
    # order.  During 'pool scale down' operations this makes it more likely
    # that nodes are removed from the same scalesets over time, which in
    # turn makes it more likely that a scaleset becomes empty and can be
    # safely deleted.
    Node.mark_outdated_nodes()
    Node.cleanup_busy_nodes_without_work()

    pending_nodes = list(Node.search_states(states=NodeState.needs_work()))
    pending_nodes.sort(key=lambda candidate: candidate.machine_id)
    for node in pending_nodes:
        logging.info("update node: %s", node.machine_id)
        process_state_updates(node)

    ordered_scalesets = list(Scaleset.search())
    ordered_scalesets.sort(key=lambda candidate: candidate.scaleset_id)
    for scaleset in ordered_scalesets:
        process_scaleset(scaleset)
Ejemplo n.º 2
0
def main(mytimer: func.TimerRequest,
         dashboard: func.Out[str]) -> None:  # noqa: F841
    """Requeue pending work, stop expired items, and health-check proxies.

    In order, this:

    * calls ``queue()`` on every proxy, repro VM, task, job, pool, and node
      found in a ``needs_work`` state;
    * calls ``queue_stop()`` on every expired task and job;
    * marks each proxy that fails ``is_alive()`` as stopping, and saves the
      proxy config of each proxy that passes;
    * forwards the result of ``get_event()`` to ``dashboard`` when it is
      truthy.

    Args:
        mytimer: Timer trigger argument; not read by the body.
        dashboard: Output binding that receives the event, if any.
    """
    proxies = Proxy.search_states(states=VmState.needs_work())
    for proxy in proxies:
        logging.info("requeueing update proxy vm: %s", proxy.region)
        proxy.queue()

    vms = Repro.search_states(states=VmState.needs_work())
    for vm in vms:
        logging.info("requeueing update vm: %s", vm.vm_id)
        vm.queue()

    tasks = Task.search_states(states=TaskState.needs_work())
    for task in tasks:
        logging.info("requeueing update task: %s", task.task_id)
        task.queue()

    jobs = Job.search_states(states=JobState.needs_work())
    for job in jobs:
        logging.info("requeueing update job: %s", job.job_id)
        job.queue()

    pools = Pool.search_states(states=PoolState.needs_work())
    for pool in pools:
        logging.info("queuing update pool: %s (%s)", pool.pool_id, pool.name)
        pool.queue()

    nodes = Node.search_states(states=NodeState.needs_work())
    for node in nodes:
        logging.info("queuing update node: %s", node.machine_id)
        node.queue()

    expired_tasks = Task.search_expired()
    for task in expired_tasks:
        # Log the task's own id (as the requeue loop above does); the
        # message is about the task, not the job it belongs to.
        logging.info("queuing stop for task: %s", task.task_id)
        task.queue_stop()

    expired_jobs = Job.search_expired()
    for job in expired_jobs:
        logging.info("queuing stop for job: %s", job.job_id)
        job.queue_stop()

    # Reminder, proxies are created on-demand.  If something is "wrong" with
    # a proxy, the plan is: delete and recreate it.
    for proxy in Proxy.search():
        if not proxy.is_alive():
            logging.error("proxy alive check failed, stopping: %s",
                          proxy.region)
            proxy.state = VmState.stopping
            proxy.save()
        else:
            proxy.save_proxy_config()

    event = get_event()
    if event:
        dashboard.set(event)
Ejemplo n.º 3
0
def schedule_workset(workset: WorkSet, pool: Pool, count: int) -> bool:
    """Schedule ``workset`` on ``pool`` ``count`` times.

    Returns ``False`` without scheduling anything when the pool is not in
    an available state, and ``False`` as soon as any single scheduling
    attempt fails (earlier successful attempts are not undone here).
    Returns ``True`` once all ``count`` attempts have succeeded.
    """
    if pool.state in PoolState.available():
        for _attempt in range(count):
            scheduled = pool.schedule_workset(workset)
            if not scheduled:
                logging.error(
                    "unable to schedule workset. pool:%s workset:%s",
                    pool.name,
                    workset,
                )
                return False
        return True

    logging.info(
        "pool not available for work: %s state: %s",
        pool.name,
        pool.state.name,
    )
    return False
Ejemplo n.º 4
0
def main(mytimer: func.TimerRequest,
         dashboard: func.Out[str]) -> None:  # noqa: F841
    """Run one timer pass over nodes, scalesets, and pools, then publish.

    Nodes in a ``needs_work`` state are updated first, followed by every
    scaleset and then every pool.  Finally, the result of ``get_event()``
    is written to ``dashboard`` when it is truthy.  ``mytimer`` is not
    read by the body.
    """
    Node.mark_outdated_nodes()
    for node in Node.search_states(states=NodeState.needs_work()):
        logging.info("update node: %s", node.machine_id)
        process_state_updates(node)

    for scaleset in Scaleset.search():
        process_scaleset(scaleset)

    for pool in Pool.search():
        if pool.state in PoolState.needs_work():
            logging.info("update pool: %s (%s)", pool.pool_id, pool.name)
            process_state_updates(pool)
            # A pool that was just updated is not autoscaled in this pass.
            continue

        if pool.state in PoolState.available() and pool.autoscale:
            autoscale_pool(pool)

    pending_event = get_event()
    if pending_event:
        dashboard.set(pending_event)
Ejemplo n.º 5
0
    def can_process_new_work(self) -> bool:
        """Report whether this node may be given new work.

        The checks below run in order; the first one that fails logs the
        reason and returns ``False``.  Several failures also act on the
        node:

        * outdated agent (unless the ``ONEFUZZ_ALLOW_OUTDATED_AGENT``
          environment variable is exactly ``"true"``), node too old,
          delete requested, or reimage requested: ``self.stop(done=True)``;
        * node could be used to shrink its scaleset: ``self.set_halt()``.

        The remaining checks (node state, reset pending, scaleset lookup
        and state, pool lookup and state) only log and return ``False``.

        Returns:
            ``True`` only when every check passes.
        """
        # Imported inside the method rather than at module level.
        # NOTE(review): presumably to avoid a circular import -- confirm.
        from .pools import Pool
        from .scalesets import Scaleset

        if (self.is_outdated()
                and os.environ.get("ONEFUZZ_ALLOW_OUTDATED_AGENT") != "true"):
            logging.info(
                "can_process_new_work agent and service versions differ, "
                "stopping node. "
                "machine_id:%s agent_version:%s service_version: %s",
                self.machine_id,
                self.version,
                __version__,
            )
            self.stop(done=True)
            return False

        if self.is_too_old():
            logging.info(
                "can_process_new_work node is too old.  machine_id:%s",
                self.machine_id)
            self.stop(done=True)
            return False

        if self.state not in NodeState.can_process_new_work():
            # The two literals are concatenated at compile time, so the
            # first must end with a separator or the message runs together
            # as "...new workmachine_id:...".
            logging.info(
                "can_process_new_work node not in appropriate state for new "
                "work.  machine_id:%s state:%s",
                self.machine_id,
                self.state.name,
            )
            return False

        if self.state in NodeState.ready_for_reset():
            logging.info(
                "can_process_new_work node is set for reset.  machine_id:%s",
                self.machine_id,
            )
            return False

        if self.delete_requested:
            logging.info(
                "can_process_new_work is set to be deleted.  machine_id:%s",
                self.machine_id,
            )
            self.stop(done=True)
            return False

        if self.reimage_requested:
            logging.info(
                "can_process_new_work is set to be reimaged.  machine_id:%s",
                self.machine_id,
            )
            self.stop(done=True)
            return False

        if self.could_shrink_scaleset():
            logging.info(
                "can_process_new_work node scheduled to shrink.  machine_id:%s",
                self.machine_id,
            )
            self.set_halt()
            return False

        # Nodes without a scaleset_id skip the scaleset checks entirely.
        if self.scaleset_id:
            scaleset = Scaleset.get_by_id(self.scaleset_id)
            # The lookup returns an Error value (rather than raising) when
            # it fails, so it must be type-checked before use.
            if isinstance(scaleset, Error):
                logging.info(
                    "can_process_new_work invalid scaleset.  "
                    "scaleset_id:%s machine_id:%s",
                    self.scaleset_id,
                    self.machine_id,
                )
                return False

            if scaleset.state not in ScalesetState.available():
                logging.info(
                    "can_process_new_work scaleset not available for work. "
                    "scaleset_id:%s machine_id:%s",
                    self.scaleset_id,
                    self.machine_id,
                )
                return False

        pool = Pool.get_by_name(self.pool_name)
        if isinstance(pool, Error):
            logging.info(
                "can_schedule - invalid pool. "
                "pool_name:%s machine_id:%s",
                self.pool_name,
                self.machine_id,
            )
            return False
        if pool.state not in PoolState.available():
            logging.info(
                "can_schedule - pool is not available for work. "
                "pool_name:%s machine_id:%s",
                self.pool_name,
                self.machine_id,
            )
            return False

        return True
Ejemplo n.º 6
0
    def can_process_new_work(self) -> bool:
        """Report whether this node may be given new work.

        Checks run in order and the first failure logs its reason and
        returns ``False``.  An outdated agent, a pending delete, or a
        pending reimage also calls ``self.stop()``; a node that could be
        used to shrink its scaleset is halted with ``self.set_halt()``.
        Returns ``True`` only when every check passes.
        """
        # Imported inside the method rather than at module level.
        # NOTE(review): presumably to avoid a circular import -- confirm.
        from .pools import Pool
        from .scalesets import Scaleset

        if self.is_outdated():
            version_mismatch_msg = (
                "can_schedule agent and service versions differ, "
                "stopping node. "
                "machine_id:%s agent_version:%s service_version: %s"
            )
            logging.info(
                version_mismatch_msg, self.machine_id, self.version,
                __version__)
            self.stop()
            return False

        if self.state in NodeState.ready_for_reset():
            logging.info(
                "can_schedule node is set for reset.  machine_id:%s",
                self.machine_id,
            )
            return False

        if self.delete_requested:
            logging.info("can_schedule is set to be deleted.  machine_id:%s",
                         self.machine_id)
            self.stop()
            return False

        if self.reimage_requested:
            logging.info("can_schedule is set to be reimaged.  machine_id:%s",
                         self.machine_id)
            self.stop()
            return False

        if self.could_shrink_scaleset():
            # The node is halted before the decision is logged.
            self.set_halt()
            logging.info(
                "node scheduled to shrink.  machine_id:%s",
                self.machine_id,
            )
            return False

        # Nodes without a scaleset_id skip the scaleset checks entirely.
        if self.scaleset_id:
            owning_scaleset = Scaleset.get_by_id(self.scaleset_id)
            # A failed lookup comes back as an Error value, not an exception.
            if isinstance(owning_scaleset, Error):
                logging.info(
                    "can_schedule - invalid scaleset.  "
                    "scaleset_id:%s machine_id:%s",
                    self.scaleset_id,
                    self.machine_id,
                )
                return False

            if owning_scaleset.state not in ScalesetState.available():
                scaleset_unavailable_msg = (
                    "can_schedule - scaleset not available for work. "
                    "scaleset_id:%s machine_id:%s"
                )
                logging.info(
                    scaleset_unavailable_msg, self.scaleset_id,
                    self.machine_id)
                return False

        owning_pool = Pool.get_by_name(self.pool_name)
        if isinstance(owning_pool, Error):
            logging.info(
                "can_schedule - invalid pool. pool_name:%s machine_id:%s",
                self.pool_name,
                self.machine_id,
            )
            return False

        if owning_pool.state in PoolState.available():
            return True

        pool_unavailable_msg = (
            "can_schedule - pool is not available for work. "
            "pool_name:%s machine_id:%s"
        )
        logging.info(pool_unavailable_msg, self.pool_name, self.machine_id)
        return False