def cleanup_nodes(self) -> bool:
    """Run one garbage-collection pass over this scaleset's nodes.

    Returns:
        ``True`` when the scaleset is in the halt state (after calling
        ``self.halt()``) or when at least one node was found to process;
        ``False`` when neither node search returned anything.

    An ``UnableToUpdate`` raised by the delete/reimage calls is logged and
    swallowed; the method still returns ``True`` in that case.
    """
    if self.state == ScalesetState.halt:
        self.halt()
        return True

    resettable = Node.search_states(
        scaleset_id=self.scaleset_id, states=NodeState.ready_for_reset()
    )
    stale = Node.search_outdated(
        scaleset_id=self.scaleset_id,
        states=[NodeState.free],
    )

    if not resettable and not stale:
        logging.debug("scaleset node gc done (no nodes) %s", self.scaleset_id)
        return False

    pending_delete = []
    pending_reimage = []

    # Outdated nodes: version "1.0.0" is queued for reimage, every other
    # version is sent a stop command instead.
    for candidate in stale:
        if candidate.version == "1.0.0":
            pending_reimage.append(candidate)
            continue
        stop_request = NodeMessage(
            agent_id=candidate.machine_id,
            message=NodeCommand(stop=StopNodeCommand()),
        )
        stop_request.save()

    for candidate in resettable:
        # Nodes for which scaleset_node_exists() is falsy are deleted
        # right away rather than queued for the scaleset operations below.
        if not candidate.scaleset_node_exists():
            candidate.delete()
            continue
        if candidate.state in (NodeState.shutdown, NodeState.halt):
            bucket = pending_delete
        else:
            bucket = pending_reimage
        bucket.append(candidate)

    # Apply the queued operations; stop at the first UnableToUpdate
    # (logged below as an update already being in progress).
    try:
        if pending_delete:
            self.delete_nodes(pending_delete)
            for removed in pending_delete:
                removed.state = NodeState.halt
                removed.save()

        if pending_reimage:
            self.reimage_nodes(pending_reimage)
    except UnableToUpdate:
        logging.info("scaleset update already in progress: %s", self.scaleset_id)

    return True
def add_ssh_public_key(self, public_key: str) -> Result[None]:
    """Send an ``add_ssh_key`` command carrying ``public_key`` to this node.

    Args:
        public_key: The SSH public key text. A trailing newline is appended
            when the value does not already end with one.

    Returns:
        An ``Error`` with ``ErrorCode.INVALID_REQUEST`` when
        ``self.scaleset_id`` is ``None``; otherwise ``None`` after the
        command has been passed to ``self.send_message``.
    """
    if self.scaleset_id is None:
        return Error(
            code=ErrorCode.INVALID_REQUEST,
            errors=["only able to add ssh keys to scaleset nodes"],
        )

    # Guarantee the key is newline-terminated before sending it.
    key_line = public_key if public_key.endswith("\n") else public_key + "\n"

    key_payload = NodeCommandAddSshKey(public_key=key_line)
    command = NodeCommand(add_ssh_key=key_payload)
    self.send_message(command)
    return None
def stop_task(cls, task_id: UUID) -> None:
    """Tell every node associated with ``task_id`` to stop that task.

    Each node returned by ``NodeTasks.get_nodes_by_task_id`` is sent a
    ``stop_task`` command, then ``stop_if_complete()`` is called on it.
    When that call returns a falsy value an info message is logged.

    Args:
        task_id: Identifier of the task to stop.
    """
    # NOTE(review): the original comment describes this as "just re-images
    # the node" for now, with a graceful agent shutdown message planned
    # for later - confirm against stop_if_complete().
    for node in NodeTasks.get_nodes_by_task_id(task_id):
        command = NodeCommand(stop_task=StopTaskNodeCommand(task_id=task_id))
        node.send_message(command)

        if node.stop_if_complete():
            continue

        logging.info(
            "nodes: stopped task on node, "
            "but not reimaging due to other tasks: task_id:%s machine_id:%s",
            task_id,
            node.machine_id,
        )
def post(req: func.HttpRequest) -> func.HttpResponse:
    """Answer a node's "can I schedule this task?" request.

    Args:
        req: HTTP request parsed as a ``CanScheduleRequest``.

    Returns:
        A ``not_ok`` response when the request fails to parse or no node
        matches ``machine_id``. Otherwise an ``ok`` response wrapping a
        ``CanSchedule`` whose ``allowed`` flag is ``False`` when the node
        is outdated or the work has stopped.
    """
    request = parse_request(CanScheduleRequest, req)
    if isinstance(request, Error):
        return not_ok(request, context="CanScheduleRequest")

    node = Node.get_by_machine_id(request.machine_id)
    if not node:
        missing_node = Error(
            code=ErrorCode.UNABLE_TO_FIND, errors=["unable to find node"]
        )
        return not_ok(missing_node, context=request.machine_id)

    # An outdated node is refused work and is sent a stop command.
    node_outdated = node.is_outdated()
    if node_outdated:
        logging.info(
            "received can_schedule request from outdated node '%s' version '%s'",
            node.machine_id,
            node.version,
        )
        stop_request = NodeMessage(
            agent_id=node.machine_id,
            message=NodeCommand(stop=StopNodeCommand()),
        )
        stop_request.save()

    # Work counts as stopped when the task lookup yields an Error or the
    # task is not in the scheduled state.
    task = Task.get_by_task_id(request.task_id)
    work_stopped = isinstance(task, Error) or (task.state != TaskState.scheduled)

    allowed = not (node_outdated or work_stopped)
    return ok(CanSchedule(allowed=allowed, work_stopped=work_stopped))
def stop(self, done: bool = False) -> None:
    """Call ``to_reimage`` on this node, then send it a stop command.

    Args:
        done: Forwarded unchanged to ``self.to_reimage``.
    """
    self.to_reimage(done=done)

    stop_command = NodeCommand(stop=StopNodeCommand())
    self.send_message(stop_command)
def send_stop_if_free(self) -> None:
    """Send a ``stop_if_free`` command when the node's version allows it.

    Nothing is sent unless ``is_minimum_version`` accepts ``self.version``
    against the minimum "2.16.1".
    """
    if not is_minimum_version(version=self.version, minimum="2.16.1"):
        return

    command = NodeCommand(stop_if_free=NodeCommandStopIfFree())
    self.send_message(command)
def stop(self) -> None:
    """Call ``to_reimage`` on this node, then send it a stop command."""
    self.to_reimage()

    stop_command = NodeCommand(stop=StopNodeCommand())
    self.send_message(stop_command)