def on_worker_event(machine_id: UUID, event: WorkerEvent) -> func.HttpResponse:
    if event.running:
        task_id = event.running.task_id
    elif event.done:
        task_id = event.done.task_id

    task = get_task_checked(task_id)
    node = get_node_checked(machine_id)
    node_task = NodeTasks(
        machine_id=machine_id, task_id=task_id, state=NodeTaskState.running
    )

    if event.running:
        if task.state not in TaskState.shutting_down():
            task.state = TaskState.running
        if node.state not in NodeState.ready_for_reset():
            node.state = NodeState.busy
        node_task.save()
        task.on_start()
    elif event.done:
        # Only record exit status if the task isn't already shutting down.
        #
        # It's ok for the agent to fail because resources vanish out from underneath
        # it during deletion.
        if task.state not in TaskState.shutting_down():
            exit_status = event.done.exit_status
            if not exit_status.success:
                logging.error("task failed: status = %s", exit_status)
                task.error = Error(
                    code=ErrorCode.TASK_FAILED,
                    errors=[
                        "task failed. exit_status = %s" % exit_status,
                        event.done.stdout,
                        event.done.stderr,
                    ],
                )
            task.state = TaskState.stopping
        if node.state not in NodeState.ready_for_reset():
            node.state = NodeState.done
        node_task.delete()
    else:
        err = Error(
            code=ErrorCode.INVALID_REQUEST,
            errors=["invalid worker event type"],
        )
        raise RequestException(err)

    task.save()
    node.save()

    task_event = TaskEvent(task_id=task_id, machine_id=machine_id, event_data=event)
    task_event.save()

    return ok(BoolResult(result=True))

def main(mytimer: func.TimerRequest) -> None:  # noqa: F841
    # NOTE: Update pools first, so that scalesets impacted by pool updates
    # (such as shutdown or resize) are handled during this iteration of
    # `timer_worker` rather than the following one.
    pools = Pool.search()
    for pool in pools:
        if pool.state in PoolState.needs_work():
            logging.info("update pool: %s (%s)", pool.pool_id, pool.name)
            process_state_updates(pool)

        if pool.state in PoolState.available() and pool.autoscale:
            autoscale_pool(pool)

    # NOTE: Nodes and Scalesets should be processed in a consistent order, so
    # that during 'pool scale down' operations, pools that are scaling down
    # tend to remove nodes from the same scalesets over time.  Repeatedly
    # removing from the same scalesets makes it more likely we reach empty
    # scalesets, which can safely be deleted.
    Node.mark_outdated_nodes()
    Node.cleanup_busy_nodes_without_work()
    nodes = Node.search_states(states=NodeState.needs_work())
    for node in sorted(nodes, key=lambda x: x.machine_id):
        logging.info("update node: %s", node.machine_id)
        process_state_updates(node)

    scalesets = Scaleset.search()
    for scaleset in sorted(scalesets, key=lambda x: x.scaleset_id):
        process_scaleset(scaleset)

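# The sorted() iteration above is what makes the "consistent order" note work.
# A minimal standalone sketch (hypothetical data, not OneFuzz APIs) of why a
# fixed order drains one scaleset to empty before touching the next:
scaleset_node_counts = {"ss-a": 3, "ss-b": 3}  # scaleset_id -> node count


def remove_one_node(counts: dict) -> None:
    # always remove from the first non-empty scaleset in sorted order
    for scaleset_id in sorted(counts):
        if counts[scaleset_id] > 0:
            counts[scaleset_id] -= 1
            return


for _ in range(4):
    remove_one_node(scaleset_node_counts)

print(scaleset_node_counts)  # {'ss-a': 0, 'ss-b': 2}: 'ss-a' can now be deleted
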
def cleanup_nodes(self) -> bool: if self.state == ScalesetState.halt: logging.info("halting scaleset: %s", self.scaleset_id) self.halt() return True to_reimage = [] to_delete = [] nodes = Node.search_states( scaleset_id=self.scaleset_id, states=NodeState.ready_for_reset() ) if not nodes: logging.info("no nodes need updating: %s", self.scaleset_id) return False # ground truth of existing nodes azure_nodes = list_instance_ids(self.scaleset_id) for node in nodes: if node.machine_id not in azure_nodes: logging.info( "no longer in scaleset: %s:%s", self.scaleset_id, node.machine_id ) node.delete() elif node.delete_requested: to_delete.append(node) else: if ScalesetShrinkQueue(self.scaleset_id).should_shrink(): node.set_halt() to_delete.append(node) elif not node.reimage_queued: # only add nodes that are not already set to reschedule to_reimage.append(node) dead_nodes = Node.get_dead_nodes(self.scaleset_id, NODE_EXPIRATION_TIME) for node in dead_nodes: node.set_halt() to_reimage.append(node) # Perform operations until they fail due to scaleset getting locked try: if to_delete: logging.info( "deleting nodes: %s - count: %d", self.scaleset_id, len(to_delete) ) self.delete_nodes(to_delete) for node in to_delete: node.set_halt() node.state = NodeState.halt node.save() if to_reimage: self.reimage_nodes(to_reimage) except UnableToUpdate: logging.info("scaleset update already in progress: %s", self.scaleset_id) return True
def to_reimage(self, done: bool = False) -> None:
    if done:
        if self.state not in NodeState.ready_for_reset():
            self.state = NodeState.done

    if not self.reimage_requested and not self.delete_requested:
        logging.info("setting reimage_requested: %s", self.machine_id)
        self.reimage_requested = True

    self.save()

def on_worker_event(machine_id: UUID, event: WorkerEvent) -> None:
    if event.running:
        task_id = event.running.task_id
    elif event.done:
        task_id = event.done.task_id
    else:
        raise NotImplementedError

    task = get_task_checked(task_id)
    node = get_node_checked(machine_id)
    node_task = NodeTasks(
        machine_id=machine_id, task_id=task_id, state=NodeTaskState.running
    )

    if event.running:
        if task.state not in TaskState.shutting_down():
            task.state = TaskState.running
        if node.state not in NodeState.ready_for_reset():
            node.state = NodeState.busy
            node.save()
        node_task.save()

        # Start the clock for the task if it wasn't started already
        # (as happens in 1.0.0 agents)
        task.on_start()
    elif event.done:
        node_task.delete()

        exit_status = event.done.exit_status
        if not exit_status.success:
            logging.error("task failed. status:%s", exit_status)
            task.mark_failed(
                Error(
                    code=ErrorCode.TASK_FAILED,
                    errors=[
                        "task failed. exit_status:%s" % exit_status,
                        event.done.stdout,
                        event.done.stderr,
                    ],
                )
            )
        else:
            task.mark_stopping()

        node.to_reimage(done=True)
    else:
        err = Error(
            code=ErrorCode.INVALID_REQUEST,
            errors=["invalid worker event type"],
        )
        raise RequestException(err)

    task.save()

    task_event = TaskEvent(task_id=task_id, machine_id=machine_id, event_data=event)
    task_event.save()

def main(mytimer: func.TimerRequest, dashboard: func.Out[str]) -> None:  # noqa: F841
    proxies = Proxy.search_states(states=VmState.needs_work())
    for proxy in proxies:
        logging.info("requeueing update proxy vm: %s", proxy.region)
        proxy.queue()

    vms = Repro.search_states(states=VmState.needs_work())
    for vm in vms:
        logging.info("requeueing update vm: %s", vm.vm_id)
        vm.queue()

    tasks = Task.search_states(states=TaskState.needs_work())
    for task in tasks:
        logging.info("requeueing update task: %s", task.task_id)
        task.queue()

    jobs = Job.search_states(states=JobState.needs_work())
    for job in jobs:
        logging.info("requeueing update job: %s", job.job_id)
        job.queue()

    pools = Pool.search_states(states=PoolState.needs_work())
    for pool in pools:
        logging.info("queuing update pool: %s (%s)", pool.pool_id, pool.name)
        pool.queue()

    nodes = Node.search_states(states=NodeState.needs_work())
    for node in nodes:
        logging.info("queuing update node: %s", node.machine_id)
        node.queue()

    expired_tasks = Task.search_expired()
    for task in expired_tasks:
        logging.info("queuing stop for task: %s", task.job_id)
        task.queue_stop()

    expired_jobs = Job.search_expired()
    for job in expired_jobs:
        logging.info("queuing stop for job: %s", job.job_id)
        job.queue_stop()

    # Reminder, proxies are created on-demand.  If something is "wrong" with
    # a proxy, the plan is: delete and recreate it.
    for proxy in Proxy.search():
        if not proxy.is_alive():
            logging.error("proxy alive check failed, stopping: %s", proxy.region)
            proxy.state = VmState.stopping
            proxy.save()
        else:
            proxy.save_proxy_config()

    event = get_event()
    if event:
        dashboard.set(event)

def cleanup_nodes(self) -> bool:
    if self.state == ScalesetState.halt:
        self.halt()
        return True

    nodes = Node.search_states(
        scaleset_id=self.scaleset_id, states=NodeState.ready_for_reset()
    )
    outdated = Node.search_outdated(
        scaleset_id=self.scaleset_id,
        states=[NodeState.free],
    )
    if not (nodes or outdated):
        logging.debug("scaleset node gc done (no nodes) %s", self.scaleset_id)
        return False

    to_delete = []
    to_reimage = []

    for node in outdated:
        if node.version == "1.0.0":
            to_reimage.append(node)
        else:
            stop_message = NodeMessage(
                agent_id=node.machine_id,
                message=NodeCommand(stop=StopNodeCommand()),
            )
            stop_message.save()

    for node in nodes:
        # delete nodes that are not waiting on the scaleset GC
        if not node.scaleset_node_exists():
            node.delete()
        elif node.state in [NodeState.shutdown, NodeState.halt]:
            to_delete.append(node)
        else:
            to_reimage.append(node)

    # Perform operations until they fail due to scaleset getting locked
    try:
        if to_delete:
            self.delete_nodes(to_delete)
            for node in to_delete:
                node.state = NodeState.halt
                node.save()

        if to_reimage:
            self.reimage_nodes(to_reimage)
    except UnableToUpdate:
        logging.info("scaleset update already in progress: %s", self.scaleset_id)

    return True

def on_state_update(machine_id: UUID, state: NodeState) -> func.HttpResponse:
    node = get_node_checked(machine_id)

    if state == NodeState.init or node.state not in NodeState.ready_for_reset():
        if node.state != state:
            node.state = state
            node.save()
    else:
        logging.info(
            "ignoring state updates from the node: %s: %s", machine_id, state
        )

    return ok(BoolResult(result=True))

def stop_task(cls, task_id: UUID) -> None:
    # For now, this just re-images the node.  Eventually, this
    # should send a message to the node to let the agent shut down
    # gracefully
    nodes = NodeTasks.get_nodes_by_task_id(task_id)
    for node in nodes:
        if node.state not in NodeState.ready_for_reset():
            logging.info(
                "stopping machine_id:%s running task:%s",
                node.machine_id,
                task_id,
            )
            node.stop()

def to_reimage(self, done: bool = False) -> None:
    if done:
        if self.state not in NodeState.ready_for_reset():
            self.state = NodeState.done

    if not self.reimage_requested and not self.delete_requested:
        logging.info("setting reimage_requested: %s", self.machine_id)
        self.reimage_requested = True

    # if we're going to reimage, make sure the node doesn't pick up new work
    # too.
    self.send_stop_if_free()
    self.save()

def on_worker_event_running(
    machine_id: UUID, event: WorkerRunningEvent
) -> Result[None]:
    task = Task.get_by_task_id(event.task_id)
    if isinstance(task, Error):
        return task

    node = get_node(machine_id)
    if isinstance(node, Error):
        return node

    if node.state not in NodeState.ready_for_reset():
        node.state = NodeState.busy
        node.save()

    node_task = NodeTasks(
        machine_id=machine_id, task_id=event.task_id, state=NodeTaskState.running
    )
    node_task.save()

    if task.state in TaskState.shutting_down():
        logging.info(
            "ignoring task start from node. machine_id:%s %s:%s (state: %s)",
            machine_id,
            task.job_id,
            task.task_id,
            task.state,
        )
        return None

    logging.info(
        "task started on node. machine_id:%s %s:%s",
        machine_id,
        task.job_id,
        task.task_id,
    )
    task.state = TaskState.running
    task.save()

    # Start the clock for the task if it wasn't started already
    # (as happens in 1.0.0 agents)
    task.on_start()

    task_event = TaskEvent(
        task_id=task.task_id,
        machine_id=machine_id,
        event_data=WorkerEvent(running=event),
    )
    task_event.save()

    return None

def on_state_update(
    machine_id: UUID,
    state_update: NodeStateUpdate,
) -> func.HttpResponse:
    state = state_update.state
    node = get_node_checked(machine_id)

    if state == NodeState.init or node.state not in NodeState.ready_for_reset():
        if node.state != state:
            node.state = state
            node.save()

        if state == NodeState.setting_up:
            # This field will be required in the future.
            # For now, it is optional for back compat.
            if state_update.data:
                for task_id in state_update.data.tasks:
                    task = get_task_checked(task_id)

                    # The task state may be `running` if it has `vm_count` > 1, and
                    # another node is concurrently executing the task.  If so, leave
                    # the state as-is, to represent the max progress made.
                    #
                    # Other states we would want to preserve are excluded by the
                    # outermost conditional check.
                    if task.state != TaskState.running:
                        task.state = TaskState.setting_up

                    # We don't yet call `on_start()` for the task.
                    # This will happen once we see a worker event that
                    # reports it as `running`.
                    task.save()

                    # Note: we set the node task state to `setting_up`, even though
                    # the task itself may be `running`.
                    node_task = NodeTasks(
                        machine_id=machine_id,
                        task_id=task_id,
                        state=NodeTaskState.setting_up,
                    )
                    node_task.save()
    else:
        logging.info(
            "ignoring state updates from the node: %s: %s", machine_id, state
        )

    return ok(BoolResult(result=True))

def on_worker_event_running(
    machine_id: UUID, event: WorkerRunningEvent
) -> Result[None]:
    task = Task.get_by_task_id(event.task_id)
    if isinstance(task, Error):
        return task

    node = get_node(machine_id)
    if isinstance(node, Error):
        return node

    if node.state not in NodeState.ready_for_reset():
        node.set_state(NodeState.busy)

    node_task = NodeTasks(
        machine_id=machine_id, task_id=event.task_id, state=NodeTaskState.running
    )
    node_task.save()

    if task.state in TaskState.shutting_down():
        logging.info(
            "ignoring task start from node. "
            "machine_id:%s job_id:%s task_id:%s (state: %s)",
            machine_id,
            task.job_id,
            task.task_id,
            task.state,
        )
        return None

    logging.info(
        "task started on node. machine_id:%s job_id:%s task_id:%s",
        machine_id,
        task.job_id,
        task.task_id,
    )
    task.set_state(TaskState.running)

    task_event = TaskEvent(
        task_id=task.task_id,
        machine_id=machine_id,
        event_data=WorkerEvent(running=event),
    )
    task_event.save()

    return None

def can_process_new_work(self) -> bool: if self.is_outdated(): logging.info( "can_schedule old version machine_id:%s version:%s", self.machine_id, self.version, ) self.stop() return False if self.state in NodeState.ready_for_reset(): logging.info("can_schedule node is set for reset. machine_id:%s", self.machine_id) return False if self.delete_requested: logging.info( "can_schedule is set to be deleted. machine_id:%s", self.machine_id, ) self.stop() return False if self.reimage_requested: logging.info( "can_schedule is set to be reimaged. machine_id:%s", self.machine_id, ) self.stop() return False if self.could_shrink_scaleset(): self.set_halt() logging.info("node scheduled to shrink. machine_id:%s", self.machine_id) return False return True
def main(mytimer: func.TimerRequest, dashboard: func.Out[str]) -> None:  # noqa: F841
    Node.mark_outdated_nodes()
    nodes = Node.search_states(states=NodeState.needs_work())
    for node in nodes:
        logging.info("update node: %s", node.machine_id)
        process_state_updates(node)

    scalesets = Scaleset.search()
    for scaleset in scalesets:
        process_scaleset(scaleset)

    pools = Pool.search()
    for pool in pools:
        if pool.state in PoolState.needs_work():
            logging.info("update pool: %s (%s)", pool.pool_id, pool.name)
            process_state_updates(pool)
        elif pool.state in PoolState.available() and pool.autoscale:
            autoscale_pool(pool)

    event = get_event()
    if event:
        dashboard.set(event)

def cleanup_nodes(self) -> bool:
    if self.state == ScalesetState.halt:
        self.halt()
        return True

    nodes = Node.search_states(
        scaleset_id=self.scaleset_id, states=NodeState.ready_for_reset()
    )
    if not nodes:
        logging.debug("scaleset node gc done (no nodes) %s", self.scaleset_id)
        return False

    to_delete = []
    to_reimage = []

    for node in nodes:
        # delete nodes that are not waiting on the scaleset GC
        if not node.scaleset_node_exists():
            node.delete()
        elif node.state in [NodeState.shutdown, NodeState.halt]:
            to_delete.append(node)
        else:
            to_reimage.append(node)

    # Perform operations until they fail due to scaleset getting locked
    try:
        if to_delete:
            self.delete_nodes(to_delete)
            for node in to_delete:
                node.state = NodeState.halt
                node.save()

        if to_reimage:
            self.reimage_nodes(to_reimage)
    except UnableToUpdate:
        logging.info("scaleset update already in progress: %s", self.scaleset_id)

    return True

def cleanup_nodes(self) -> bool:
    from .pools import Pool

    logging.info(
        SCALESET_LOG_PREFIX + "cleaning up nodes. scaleset_id:%s", self.scaleset_id
    )
    if self.state == ScalesetState.halt:
        logging.info(
            SCALESET_LOG_PREFIX + "halting scaleset scaleset_id:%s",
            self.scaleset_id,
        )
        self.halt()
        return True

    pool = Pool.get_by_name(self.pool_name)
    if isinstance(pool, Error):
        logging.error(
            "unable to find pool during cleanup: %s - %s",
            self.scaleset_id,
            pool,
        )
        self.set_failed(pool)
        return True

    Node.reimage_long_lived_nodes(self.scaleset_id)

    to_reimage = []
    to_delete = []

    # ground truth of existing nodes
    azure_nodes = list_instance_ids(self.scaleset_id)
    nodes = Node.search_states(scaleset_id=self.scaleset_id)

    # Delete nodes that exist in the table but not in the scaleset, due to an
    # unknown failure.
    for node in nodes:
        if node.machine_id not in azure_nodes:
            logging.info(
                SCALESET_LOG_PREFIX
                + "no longer in scaleset. scaleset_id:%s machine_id:%s",
                self.scaleset_id,
                node.machine_id,
            )
            node.delete()

    # Scalesets can have nodes that never check in (such as broken OS setup
    # scripts).
    #
    # This will add nodes that Azure knows about but have not checked in
    # such that the `dead node` detection will eventually reimage the node.
    #
    # NOTE: If node setup takes longer than NODE_EXPIRATION_TIME (1 hour),
    # this will cause the nodes to continuously get reimaged.
    node_machine_ids = [x.machine_id for x in nodes]
    for machine_id in azure_nodes:
        if machine_id in node_machine_ids:
            continue

        logging.info(
            SCALESET_LOG_PREFIX
            + "adding missing azure node. scaleset_id:%s machine_id:%s",
            self.scaleset_id,
            machine_id,
        )

        # Note, using `new=True` makes it such that if a node already has
        # checked in, this won't overwrite it.
        Node.create(
            pool_id=pool.pool_id,
            pool_name=self.pool_name,
            machine_id=machine_id,
            scaleset_id=self.scaleset_id,
            version=__version__,
            new=True,
        )

    existing_nodes = [x for x in nodes if x.machine_id in azure_nodes]
    nodes_to_reset = [
        x for x in existing_nodes if x.state in NodeState.ready_for_reset()
    ]

    for node in nodes_to_reset:
        if node.delete_requested:
            to_delete.append(node)
        else:
            if ShrinkQueue(self.scaleset_id).should_shrink():
                node.set_halt()
                to_delete.append(node)
            elif ShrinkQueue(pool.pool_id).should_shrink():
                node.set_halt()
                to_delete.append(node)
            else:
                to_reimage.append(node)

    dead_nodes = Node.get_dead_nodes(self.scaleset_id, NODE_EXPIRATION_TIME)
    if dead_nodes:
        logging.info(
            SCALESET_LOG_PREFIX
            + "reimaging uninitialized nodes or nodes with expired heartbeats. "
            + "scaleset_id:%s nodes:%s",
            self.scaleset_id,
            ",".join(str(x.machine_id) for x in dead_nodes),
        )
        for node in dead_nodes:
            if node.heartbeat:
                error_message = "node reimaged due to expired heartbeat"
            else:
                error_message = "node reimaged due to never receiving a heartbeat"

            error = Error(
                code=ErrorCode.TASK_FAILED,
                errors=[
                    error_message,
                    f"scaleset_id:{node.scaleset_id} machine_id:{node.machine_id}",
                    f"last heartbeat:{node.heartbeat}",
                ],
            )
            node.mark_tasks_stopped_early(error=error)
            node.to_reimage(done=True)
            if node not in to_reimage:
                to_reimage.append(node)

    # Perform operations until they fail due to scaleset getting locked
    try:
        strategy_str = os.getenv("ONEFUZZ_NODE_DISPOSAL_STRATEGY", "scale_in")
        if strategy_str == "decomission":
            strategy = NodeDisaposalStrategy.decomission
        else:
            strategy = NodeDisaposalStrategy.scale_in
        self.reimage_nodes(to_reimage, strategy)
        self.delete_nodes(to_delete, strategy)
    except UnableToUpdate:
        logging.info(
            SCALESET_LOG_PREFIX
            + "scaleset update already in progress: scaleset_id:%s",
            self.scaleset_id,
        )

    return bool(to_reimage) or bool(to_delete)

def cleanup_nodes(self) -> bool:
    if self.state == ScalesetState.halt:
        logging.info(
            SCALESET_LOG_PREFIX + "halting scaleset scaleset_id:%s",
            self.scaleset_id,
        )
        self.halt()
        return True

    Node.reimage_long_lived_nodes(self.scaleset_id)

    to_reimage = []
    to_delete = []

    # ground truth of existing nodes
    azure_nodes = list_instance_ids(self.scaleset_id)
    nodes = Node.search_states(scaleset_id=self.scaleset_id)

    # Delete nodes that exist in the table but not in the scaleset, due to an
    # unknown failure.
    for node in nodes:
        if node.machine_id not in azure_nodes:
            logging.info(
                SCALESET_LOG_PREFIX
                + "no longer in scaleset. scaleset_id:%s machine_id:%s",
                self.scaleset_id,
                node.machine_id,
            )
            node.delete()

    # Scalesets can have nodes that never check in (such as broken OS setup
    # scripts).
    #
    # This will add nodes that Azure knows about but have not checked in
    # such that the `dead node` detection will eventually reimage the node.
    #
    # NOTE: If node setup takes longer than NODE_EXPIRATION_TIME (1 hour),
    # this will cause the nodes to continuously get reimaged.
    node_machine_ids = [x.machine_id for x in nodes]
    for machine_id in azure_nodes:
        if machine_id in node_machine_ids:
            continue

        logging.info(
            SCALESET_LOG_PREFIX
            + "adding missing azure node. scaleset_id:%s machine_id:%s",
            self.scaleset_id,
            machine_id,
        )

        # Note, using `new=True` makes it such that if a node already has
        # checked in, this won't overwrite it.
        Node.create(
            pool_name=self.pool_name,
            machine_id=machine_id,
            scaleset_id=self.scaleset_id,
            version=__version__,
            new=True,
        )

    existing_nodes = [x for x in nodes if x.machine_id in azure_nodes]
    nodes_to_reset = [
        x for x in existing_nodes if x.state in NodeState.ready_for_reset()
    ]

    for node in nodes_to_reset:
        if node.delete_requested:
            to_delete.append(node)
        else:
            if ScalesetShrinkQueue(self.scaleset_id).should_shrink():
                node.set_halt()
                to_delete.append(node)
            elif not node.reimage_queued:
                # only add nodes that are not already set to reschedule
                to_reimage.append(node)

    dead_nodes = Node.get_dead_nodes(self.scaleset_id, NODE_EXPIRATION_TIME)
    for node in dead_nodes:
        node.set_halt()
        to_reimage.append(node)

    # Perform operations until they fail due to scaleset getting locked
    try:
        if to_delete:
            logging.info(
                SCALESET_LOG_PREFIX + "deleting nodes. scaleset_id:%s count:%d",
                self.scaleset_id,
                len(to_delete),
            )
            self.delete_nodes(to_delete)
            for node in to_delete:
                node.set_halt()

        if to_reimage:
            logging.info(
                SCALESET_LOG_PREFIX + "reimaging nodes: scaleset_id:%s count:%d",
                self.scaleset_id,
                len(to_reimage),
            )
            self.reimage_nodes(to_reimage)
    except UnableToUpdate:
        logging.info(
            SCALESET_LOG_PREFIX
            + "scaleset update already in progress: scaleset_id:%s",
            self.scaleset_id,
        )

    return bool(to_reimage) or bool(to_delete)

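# Both cleanup variants above rely on Node.get_dead_nodes() and
# NODE_EXPIRATION_TIME to catch nodes that stopped heartbeating or never
# checked in.  A minimal sketch of that expiry check (hypothetical helper,
# assuming NODE_EXPIRATION_TIME is a one-hour timedelta):
from datetime import datetime, timedelta, timezone

NODE_EXPIRATION_TIME = timedelta(hours=1)


def is_dead(last_seen: datetime, now: datetime) -> bool:
    # `last_seen` is the last heartbeat, or the node's creation time if it
    # never checked in; either way the node counts as dead once the window
    # passes and is reimaged.
    return now - last_seen > NODE_EXPIRATION_TIME


# example: a node created two hours ago that never heartbeated is "dead"
print(
    is_dead(
        datetime.now(timezone.utc) - timedelta(hours=2),
        datetime.now(timezone.utc),
    )
)  # True
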
def on_state_update(
    machine_id: UUID,
    state_update: NodeStateUpdate,
) -> None:
    state = state_update.state
    node = get_node_checked(machine_id)

    if state == NodeState.free:
        if node.reimage_requested or node.delete_requested:
            logging.info("stopping free node with reset flags: %s", node.machine_id)
            node.stop()
            return

        if node.could_shrink_scaleset():
            logging.info("stopping free node to resize scaleset: %s", node.machine_id)
            node.set_halt()
            return

    if state == NodeState.init:
        if node.delete_requested:
            node.stop()
            return
        node.reimage_requested = False
        node.save()
    elif node.state not in NodeState.ready_for_reset():
        if node.state != state:
            node.state = state
            node.save()

        if state == NodeState.setting_up:
            # Model-validated.
            #
            # This field will be required in the future.
            # For now, it is optional for back compat.
            setting_up_data = cast(
                Optional[NodeSettingUpEventData],
                state_update.data,
            )

            if setting_up_data:
                for task_id in setting_up_data.tasks:
                    task = get_task_checked(task_id)

                    # The task state may be `running` if it has `vm_count` > 1, and
                    # another node is concurrently executing the task.  If so, leave
                    # the state as-is, to represent the max progress made.
                    #
                    # Other states we would want to preserve are excluded by the
                    # outermost conditional check.
                    if task.state != TaskState.running:
                        task.state = TaskState.setting_up
                        task.on_start()
                        task.save()

                    # Note: we set the node task state to `setting_up`, even though
                    # the task itself may be `running`.
                    node_task = NodeTasks(
                        machine_id=machine_id,
                        task_id=task_id,
                        state=NodeTaskState.setting_up,
                    )
                    node_task.save()
        elif state == NodeState.done:
            # if tasks are running on the node when it reports as Done
            # those are stopped early
            node.mark_tasks_stopped_early()

            # Model-validated.
            #
            # This field will be required in the future.
            # For now, it is optional for back compat.
            done_data = cast(Optional[NodeDoneEventData], state_update.data)
            if done_data:
                # TODO: do something with this done data
                if done_data.error:
                    logging.error(
                        "node 'done' with error: machine_id:%s, data:%s",
                        machine_id,
                        done_data,
                    )
    else:
        logging.info(
            "ignoring state updates from the node: %s: %s", machine_id, state
        )

def can_process_new_work(self) -> bool:
    from .pools import Pool
    from .scalesets import Scaleset

    if (
        self.is_outdated()
        and os.environ.get("ONEFUZZ_ALLOW_OUTDATED_AGENT") != "true"
    ):
        logging.info(
            "can_process_new_work agent and service versions differ, "
            "stopping node. "
            "machine_id:%s agent_version:%s service_version: %s",
            self.machine_id,
            self.version,
            __version__,
        )
        self.stop(done=True)
        return False

    if self.is_too_old():
        logging.info(
            "can_process_new_work node is too old. machine_id:%s", self.machine_id
        )
        self.stop(done=True)
        return False

    if self.state not in NodeState.can_process_new_work():
        logging.info(
            "can_process_new_work node not in appropriate state for new work. "
            "machine_id:%s state:%s",
            self.machine_id,
            self.state.name,
        )
        return False

    if self.state in NodeState.ready_for_reset():
        logging.info(
            "can_process_new_work node is set for reset. machine_id:%s",
            self.machine_id,
        )
        return False

    if self.delete_requested:
        logging.info(
            "can_process_new_work is set to be deleted. machine_id:%s",
            self.machine_id,
        )
        self.stop(done=True)
        return False

    if self.reimage_requested:
        logging.info(
            "can_process_new_work is set to be reimaged. machine_id:%s",
            self.machine_id,
        )
        self.stop(done=True)
        return False

    if self.could_shrink_scaleset():
        logging.info(
            "can_process_new_work node scheduled to shrink. machine_id:%s",
            self.machine_id,
        )
        self.set_halt()
        return False

    if self.scaleset_id:
        scaleset = Scaleset.get_by_id(self.scaleset_id)
        if isinstance(scaleset, Error):
            logging.info(
                "can_process_new_work invalid scaleset. "
                "scaleset_id:%s machine_id:%s",
                self.scaleset_id,
                self.machine_id,
            )
            return False

        if scaleset.state not in ScalesetState.available():
            logging.info(
                "can_process_new_work scaleset not available for work. "
                "scaleset_id:%s machine_id:%s",
                self.scaleset_id,
                self.machine_id,
            )
            return False

    pool = Pool.get_by_name(self.pool_name)
    if isinstance(pool, Error):
        logging.info(
            "can_schedule - invalid pool. pool_name:%s machine_id:%s",
            self.pool_name,
            self.machine_id,
        )
        return False

    if pool.state not in PoolState.available():
        logging.info(
            "can_schedule - pool is not available for work. "
            "pool_name:%s machine_id:%s",
            self.pool_name,
            self.machine_id,
        )
        return False

    return True

def can_process_new_work(self) -> bool:
    from .pools import Pool
    from .scalesets import Scaleset

    if self.is_outdated():
        logging.info(
            "can_schedule agent and service versions differ, stopping node. "
            "machine_id:%s agent_version:%s service_version: %s",
            self.machine_id,
            self.version,
            __version__,
        )
        self.stop()
        return False

    if self.state in NodeState.ready_for_reset():
        logging.info(
            "can_schedule node is set for reset. machine_id:%s", self.machine_id
        )
        return False

    if self.delete_requested:
        logging.info(
            "can_schedule is set to be deleted. machine_id:%s",
            self.machine_id,
        )
        self.stop()
        return False

    if self.reimage_requested:
        logging.info(
            "can_schedule is set to be reimaged. machine_id:%s",
            self.machine_id,
        )
        self.stop()
        return False

    if self.could_shrink_scaleset():
        self.set_halt()
        logging.info("node scheduled to shrink. machine_id:%s", self.machine_id)
        return False

    if self.scaleset_id:
        scaleset = Scaleset.get_by_id(self.scaleset_id)
        if isinstance(scaleset, Error):
            logging.info(
                "can_schedule - invalid scaleset. scaleset_id:%s machine_id:%s",
                self.scaleset_id,
                self.machine_id,
            )
            return False

        if scaleset.state not in ScalesetState.available():
            logging.info(
                "can_schedule - scaleset not available for work. "
                "scaleset_id:%s machine_id:%s",
                self.scaleset_id,
                self.machine_id,
            )
            return False

    pool = Pool.get_by_name(self.pool_name)
    if isinstance(pool, Error):
        logging.info(
            "can_schedule - invalid pool. pool_name:%s machine_id:%s",
            self.pool_name,
            self.machine_id,
        )
        return False

    if pool.state not in PoolState.available():
        logging.info(
            "can_schedule - pool is not available for work. "
            "pool_name:%s machine_id:%s",
            self.pool_name,
            self.machine_id,
        )
        return False

    return True

def cleanup_nodes(self) -> bool: if self.state == ScalesetState.halt: logging.info("halting scaleset: %s", self.scaleset_id) self.halt() return True to_reimage = [] to_delete = [] outdated = Node.search_outdated(scaleset_id=self.scaleset_id) for node in outdated: logging.info( "node is outdated: %s - node_version:%s api_version:%s", node.machine_id, node.version, __version__, ) if node.version == "1.0.0": node.state = NodeState.done to_reimage.append(node) else: node.to_reimage() nodes = Node.search_states( scaleset_id=self.scaleset_id, states=NodeState.ready_for_reset() ) if not outdated and not nodes: logging.info("no nodes need updating: %s", self.scaleset_id) return False # ground truth of existing nodes azure_nodes = list_instance_ids(self.scaleset_id) for node in nodes: if node.machine_id not in azure_nodes: logging.info( "no longer in scaleset: %s:%s", self.scaleset_id, node.machine_id ) node.delete() elif node.delete_requested: to_delete.append(node) else: if ScalesetShrinkQueue(self.scaleset_id).should_shrink(): node.set_halt() to_delete.append(node) else: to_reimage.append(node) # Perform operations until they fail due to scaleset getting locked try: if to_delete: logging.info( "deleting nodes: %s - count: %d", self.scaleset_id, len(to_delete) ) self.delete_nodes(to_delete) for node in to_delete: node.set_halt() node.state = NodeState.halt node.save() if to_reimage: self.reimage_nodes(to_reimage) except UnableToUpdate: logging.info("scaleset update already in progress: %s", self.scaleset_id) return True