def bootup(self, nodes: Optional[List[Node]] = None) -> BootupResult:
    nodes = nodes if nodes is not None else self.get_demand().new_nodes
    if not nodes:
        logging.info("No nodes to bootup.")
        return BootupResult("success", OperationId(""), None)
    logging.debug("booting up %s", [n.name for n in nodes])
    return self.node_mgr.bootup(nodes)
def delete(self, nodes: Optional[List[Node]] = None) -> DeleteResult:
    nodes = nodes if nodes is not None else self.get_demand().unmatched_nodes
    if not nodes:
        logging.info("No nodes to delete.")
        return DeleteResult("success", OperationId(""), None)
    logging.debug("deleting %s", [n.name for n in nodes])
    return self.node_mgr.delete(nodes)
def parse_scheduler_nodes(
    config: Dict,
    pbscmd: PBSCMD,
    resource_definitions: Dict[str, PBSProResourceDefinition],
) -> List[Node]:
    """
    Gets the current state of the nodes as the scheduler sees them, including resources,
    assigned resources, jobs currently running etc.
    """
    ret: List[Node] = []

    ignore_onprem = config.get("pbspro", {}).get("ignore_onprem", False)
    ignore_hostnames_re_expr = config.get("pbspro", {}).get("ignore_hostnames_re")
    ignore_hostnames_re = None
    if ignore_hostnames_re_expr:
        try:
            ignore_hostnames_re = re.compile(ignore_hostnames_re_expr)
        except re.error:
            logging.exception(
                f"Could not parse {ignore_hostnames_re_expr} as a regular expression"
            )

    ignored_hostnames = []

    for ndict in pbscmd.pbsnodes_parsed("-a"):
        if ignore_hostnames_re and ignore_hostnames_re.match(ndict["name"]):
            ignored_hostnames.append(ndict["name"])
            continue

        if ignore_onprem and ndict.get("resources_available.ccnodeid"):
            ignored_hostnames.append(ndict["name"])
            continue

        node = parse_scheduler_node(ndict, resource_definitions)

        if not node.available.get("ccnodeid"):
            node.metadata["override_resources"] = False
            logging.fine(
                "'ccnodeid' is not defined so %s has not been joined to the cluster by the autoscaler"
                " yet, or this is not a CycleCloud managed node",
                node,
            )
        ret.append(node)

    if ignored_hostnames:
        if len(ignored_hostnames) < 5:
            logging.info(
                f"Ignored {len(ignored_hostnames)} hostnames: {','.join(ignored_hostnames)}"
            )
        else:
            logging.info(
                f"Ignored {len(ignored_hostnames)} hostnames: {','.join(ignored_hostnames[:5])}..."
            )

    return ret
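# A minimal sketch (not from the source) of the autoscale config keys consumed above.
# The values are hypothetical; only the key names come from the function itself:
#
#     config = {
#         "pbspro": {
#             "ignore_onprem": True,
#             "ignore_hostnames_re": "^onprem-.*",
#         }
#     }
#
# Any pbsnodes entry whose name matches ignore_hostnames_re is skipped, and the skipped
# hostnames are summarized in a single "Ignored ... hostnames" log line.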
def get_grow_decision(self) -> Dict[str, GrowDecision]:
    res = self._post(self.get_grow_decision.__name__, self.GROW_DECISION_API_ROUTE, data=None)
    logging.info(res.content)
    grow_decision_dict = {
        k: GrowDecision(v["CoresToGrow"], v["NodesToGrow"], v["SocketsToGrow"])
        for k, v in json.loads(res.content).items()
    }
    if not ci_in("Default", grow_decision_dict):
        grow_decision_dict["Default"] = GrowDecision(0.0, 0.0, 0.0)
    return grow_decision_dict
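# A minimal sketch (not from the source) of the response shape get_grow_decision expects:
# a JSON object keyed by node group, each value carrying CoresToGrow/NodesToGrow/SocketsToGrow.
# The group name and numbers below are hypothetical:
#
#     payload = '{"Default": {"CoresToGrow": 8.0, "NodesToGrow": 2.0, "SocketsToGrow": 0.0}}'
#     decisions = {
#         k: GrowDecision(v["CoresToGrow"], v["NodesToGrow"], v["SocketsToGrow"])
#         for k, v in json.loads(payload).items()
#     }
#     # decisions["Default"] -> GrowDecision(cores_to_grow=8.0, nodes_to_grow=2.0, sockets_to_grow=0.0)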
def retire_records(self, timeout: int = (7 * 24 * 60 * 60), commit: bool = True) -> None:
    if self.read_only:
        return

    retire_omega = self.now() - timeout
    cursor = self._execute(
        """DELETE from nodes where delete_time is not null
               AND delete_time < {}
               AND delete_time > 0""".format(retire_omega)
    )
    deleted = list(cursor)
    logging.info("Deleted %s nodes - %s", len(deleted), [(d[0], d[1]) for d in deleted])
    if commit:
        self.conn.commit()
def get_hostgroups_for_pe(self, pe_name: str) -> List[str]:
    if not self.has_pe(pe_name):
        raise RuntimeError(
            "Queue {} does not support parallel_environment {}".format(self.qname, pe_name)
        )

    ret = self.__pe_to_hostgroups[pe_name]
    if set(ret) == set([None]):
        logging.info(
            "PE %s has no specified hostgroup and will be put into hostgroup %s",
            pe_name,
            self.default_hg,
        )
        self.__pe_to_hostgroups[pe_name] = ret = [self.default_hg]

    return [h for h in ret if h]
def _post(self, function_name: str, function_route: str, data) -> Response:
    headers = {"Content-Type": "application/json"}
    url = function_route.format(self.hostname)
    res = requests.post(url, data=data, headers=headers, verify=False, cert=self._pem)
    try:
        res.raise_for_status()
        logging.info("{} resp: {}".format(function_name, str(res.content)))
        return res
    except HTTPError:
        logging.error(
            "{}: status_code:{} content:{}".format(function_name, res.status_code, res.content)
        )
        raise
def handle_draining(self, nodes: List[Node]) -> List[Node]:
    # TODO batch these up, but keep it underneath the
    # max arg limit
    ret = []
    for node in nodes:
        if not node.hostname:
            logging.info("Node %s has no hostname.", node)
            continue

        # TODO implement after we have resources added back in
        # what about deleting partially initialized nodes? I think we
        # just need to skip non-managed nodes
        # if not node.resources.get("ccnodeid"):
        #     continue
        if not node.managed and not node.resources.get("ccnodeid"):
            logging.debug("Ignoring attempt to drain unmanaged %s", node)
            continue

        if "offline" in node.metadata.get("pbs_state", ""):
            if node.assignments:
                logging.info("Node %s has jobs still running on it.", node)
                # node is already 'offline' i.e. draining, but a job is still running
                continue
            else:
                # ok - it is offline _and_ no jobs are running on it.
                ret.append(node)
        else:
            try:
                self.pbscmd.pbsnodes("-o", node.hostname)

                # Due to a delay between when 'pbsnodes -o' exits and when 'pbsnodes -a'
                # actually reports an offline state, we will just optimistically set it to
                # offline, otherwise ~50% of the time you get the old state (free)
                # response = self.pbscmd.pbsnodes_parsed("-a", node.hostname)
                # if response:
                #     node.metadata["pbs_state"] = response[0]["state"]

                node.metadata["pbs_state"] = "offline"

            except CalledProcessError as e:
                if node.private_ip:
                    logging.error(
                        "'pbsnodes -o %s' failed and this node will not be scaled down: %s",
                        node.hostname,
                        e,
                    )

    return ret
def handle_failed_nodes(self, nodes: List[Node]) -> List[Node]:
    to_delete = []
    to_drain = []
    now = datetime.datetime.now()
    for node in nodes:
        if node.state == "Failed":
            node.closed = True
            to_delete.append(node)
            continue

        if not node.resources.get("ccnodeid"):
            logging.fine("Attempting to delete %s but ccnodeid is not set yet.", node)
            continue

        job_state = node.metadata.get("pbs_state", "")
        if "down" in job_state:
            node.closed = True
            # no private_ip == no dns entry, so we can safely remove it
            if "offline" in job_state or not node.private_ip:
                to_delete.append(node)
            else:
                if self._down_long_enough(now, node):
                    to_drain.append(node)

    if to_drain:
        logging.info("Draining down nodes: %s", to_drain)
        self.handle_draining(to_drain)

    if to_delete:
        logging.info("Deleting down,offline nodes: %s", to_delete)
        return self.handle_post_delete(to_delete)
    return []
def add_nodes_to_cluster(self, nodes: List[Node]) -> List[Node]:
    self.initialize()

    all_nodes = self.pbscmd.pbsnodes_parsed("-a")
    by_ccnodeid = partition(all_nodes, lambda x: x.get("resources_available.ccnodeid"))

    ret = []
    for node in nodes:
        if not node.hostname:
            continue

        if not node.private_ip:
            continue

        node_id = node.delayed_node_id.node_id
        if not node_id:
            logging.error("%s does not have a nodeid! Skipping", node)
            continue

        if node_id in by_ccnodeid:
            skip_node = False
            for ndict in by_ccnodeid[node_id]:
                if ndict["name"].lower() != node.hostname.lower():
                    logging.error(
                        "Duplicate hostname found for the same node id! %s and %s."
                        " See 'valid_hostnames' in autoscale as a possible workaround.",
                        node,
                        ndict["name"],
                    )
                    skip_node = True
                    break

            if skip_node:
                continue

        if not is_valid_hostname(self.config, node):
            continue

        if not self._validate_reverse_dns(node):
            logging.fine(
                "%s still has a hostname that can not be looked up via reverse dns."
                " This should repair itself.",
                node,
            )
            continue

        if not node.resources.get("ccnodeid"):
            logging.info(
                "%s is not managed by CycleCloud, or at least 'ccnodeid' is not defined. Ignoring",
                node,
            )
            continue

        try:
            try:
                ndicts = self.pbscmd.qmgr_parsed("list", "node", node.hostname)
                if ndicts and ndicts[0].get("resources_available.ccnodeid"):
                    logging.info("ccnodeid is already defined on %s. Skipping", node)
                    continue
                # TODO RDH should we just delete it instead?
                logging.info("%s already exists in this cluster. Setting resources.", node)
            except CalledProcessError:
                logging.info("%s does not exist in this cluster yet. Creating.", node)
                self.pbscmd.qmgr("create", "node", node.hostname)

            for res_name, res_value in node.resources.items():
                # we set ccnodeid last, so that we can see that we have completely joined a node
                # if and only if ccnodeid has been set
                if res_name == "ccnodeid":
                    continue

                if res_value is None:
                    continue

                # TODO RDH track down
                if res_name == "group_id" and res_value == "None":
                    continue

                # skip things like host which are useful to set default resources on non-existent
                # nodes for autoscale packing, but not on actual nodes
                if res_name in self.read_only_resources:
                    continue

                if res_name not in self.resource_definitions:
                    # TODO bump to a warning?
                    logging.fine(
                        "%s is an unknown PBS resource for node %s. Skipping this resource",
                        res_name,
                        node,
                    )
                    continue

                res_value_str: str

                # pbs size does not support decimals
                if isinstance(res_value, ht.Size):
                    res_value_str = "{}{}".format(int(res_value.value), res_value.magnitude)
                elif isinstance(res_value, bool):
                    res_value_str = "1" if res_value else "0"
                else:
                    res_value_str = str(res_value)

                self.pbscmd.qmgr(
                    "set",
                    "node",
                    node.hostname,
                    "resources_available.{}={}".format(res_name, res_value_str),
                )

            self.pbscmd.qmgr(
                "set",
                "node",
                node.hostname,
                "resources_available.{}={}".format("ccnodeid", node.resources["ccnodeid"]),
            )
            self.pbscmd.pbsnodes("-r", node.hostname)
            ret.append(node)
        except SubprocessError as e:
            logging.error(
                "Could not fully add %s to cluster: %s. Will attempt next cycle", node, e
            )

    return ret
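# For illustration only: assuming PBSCMD shells out to the standard qmgr/pbsnodes CLIs,
# the join sequence above is roughly equivalent to the following commands for a
# hypothetical node "ip-0A000005" with resources {"slot_type": "execute", "ccnodeid": "abc"}:
#
#     qmgr -c "create node ip-0A000005"
#     qmgr -c "set node ip-0A000005 resources_available.slot_type=execute"
#     qmgr -c "set node ip-0A000005 resources_available.ccnodeid=abc"   # ccnodeid is set last
#     pbsnodes -r ip-0A000005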
def autoscale_hpcpack(
    config: Dict[str, Any],
    ctx_handler: DefaultContextHandler = None,
    hpcpack_rest_client: Optional[HpcRestClient] = None,
    dry_run: bool = False,
) -> None:

    if not hpcpack_rest_client:
        hpcpack_rest_client = new_rest_client(config)

    if ctx_handler:
        ctx_handler.set_context("[Sync-Status]")

    autoscale_config = config.get("autoscale") or {}
    # Load history info
    idle_timeout_seconds: int = autoscale_config.get("idle_timeout") or 600
    provisioning_timeout_seconds = autoscale_config.get("boot_timeout") or 1500
    statefile = autoscale_config.get("statefile") or "C:\\cycle\\jetpack\\config\\autoscaler_state.txt"
    archivefile = autoscale_config.get("archivefile") or "C:\\cycle\\jetpack\\config\\autoscaler_archive.txt"
    node_history = HpcNodeHistory(
        statefile=statefile,
        archivefile=archivefile,
        provisioning_timeout=provisioning_timeout_seconds,
        idle_timeout=idle_timeout_seconds,
    )

    logging.info("Synchronizing the nodes between CycleCloud and HPC Pack")

    # Initialize data for history info, CC nodes, HPC Pack nodes, and HPC grow decisions.
    # Get node list from CycleCloud
    def nodes_state_key(n: Node) -> Tuple[int, str, int]:
        try:
            state_pri = 1
            if n.state == "Deallocated":
                state_pri = 2
            elif n.state == "Stopping":
                state_pri = 3
            elif n.state == "Terminating":
                state_pri = 4
            name, index = n.name.rsplit("-", 1)
            return (state_pri, name, int(index))
        except Exception:
            return (state_pri, n.name, 0)

    node_mgr: NodeManager = new_node_manager(config)
    for b in node_mgr.get_buckets():
        b.nodes.sort(key=nodes_state_key)
    cc_nodes: List[Node] = node_mgr.get_nodes()
    cc_nodes_by_id = partition_single(cc_nodes, func=lambda n: n.delayed_node_id.node_id)

    # Get compute node list and grow decision from HPC Pack
    hpc_node_groups = hpcpack_rest_client.list_node_groups()
    grow_decisions = hpcpack_rest_client.get_grow_decision()
    logging.info("grow decision: {}".format(grow_decisions))
    hpc_cn_nodes: List[HpcNode] = hpcpack_rest_client.list_computenodes()
    hpc_cn_nodes = [n for n in hpc_cn_nodes if n.active]

    # This function will link node history items, cc nodes and hpc nodes
    node_history.synchronize(cc_nodes, hpc_cn_nodes)

    cc_nodearrays = set([b.nodearray for b in node_mgr.get_buckets()])
    logging.info("Current node arrays in CycleCloud: {}".format(cc_nodearrays))

    # Create HPC node groups for CC node arrays
    cc_map_hpc_groups = ["CycleCloudNodes"] + list(cc_nodearrays)
    for cc_grp in cc_map_hpc_groups:
        if ci_notin(cc_grp, hpc_node_groups):
            logging.info("Create HPC node group: {}".format(cc_grp))
            hpcpack_rest_client.add_node_group(cc_grp, "CycleCloud Node group")

    # Add HPC nodes into corresponding node groups
    add_cc_tag_nodes = [n.name for n in hpc_cn_nodes if n.shall_addcyclecloudtag]
    if len(add_cc_tag_nodes) > 0:
        logging.info(
            "Adding HPC nodes to node group CycleCloudNodes: {}".format(add_cc_tag_nodes)
        )
        hpcpack_rest_client.add_node_to_node_group("CycleCloudNodes", add_cc_tag_nodes)
    for cc_grp in list(cc_nodearrays):
        add_array_tag_nodes = [
            n.name
            for n in hpc_cn_nodes
            if n.shall_addnodearraytag and ci_equals(n.cc_nodearray, cc_grp)
        ]
        if len(add_array_tag_nodes) > 0:
            logging.info(
                "Adding HPC nodes to node group {}: {}".format(cc_grp, add_array_tag_nodes)
            )
            hpcpack_rest_client.add_node_to_node_group(cc_grp, add_array_tag_nodes)

    # Possible values for HPC NodeState (states marked with * shall not occur for CC nodes):
    #   Unknown, Provisioning, Offline, Starting, Online, Draining, Rejected(*), Removing, NotDeployed(*), Stopping(*)
    # Remove the following HPC Pack nodes:
    #   1. The corresponding CC node is already removed
    #   2. The corresponding CC node is stopped and the HPC node is not assigned a node template
    # Take offline the following HPC Pack nodes:
    #   1. The corresponding CC node is stopped or is going to stop
    hpc_nodes_to_remove = [
        n.name
        for n in hpc_cn_nodes
        if n.removed_cc_node or (n.stopped_cc_node and not n.template_assigned)
    ]
    hpc_nodes_to_take_offline = [
        n.name for n in hpc_cn_nodes if n.stopped_cc_node and ci_equals(n.state, "Online")
    ]
    if len(hpc_nodes_to_remove) > 0:
        logging.info("Removing the HPC nodes: {}".format(hpc_nodes_to_remove))
        if dry_run:
            logging.info("Dry-run: no real action")
        else:
            hpcpack_rest_client.remove_nodes(hpc_nodes_to_remove)
    hpc_cn_nodes = [n for n in hpc_cn_nodes if not (n.stopped_cc_node or n.removed_cc_node)]

    # Assign the default node template for unapproved CC nodes
    hpc_nodes_to_assign_template = [
        n.name for n in hpc_cn_nodes if n.bound_cc_node and not n.template_assigned
    ]
    if len(hpc_nodes_to_assign_template) > 0:
        logging.info(
            "Assigning default node template for the HPC nodes: {}".format(
                hpc_nodes_to_assign_template
            )
        )
        if dry_run:
            logging.info("Dry-run: no real action")
        else:
            hpcpack_rest_client.assign_default_compute_node_template(
                hpc_nodes_to_assign_template
            )

    ### Start scale up checking:
    logging.info("Start scale up checking ...")
    if ctx_handler:
        ctx_handler.set_context("[scale-up]")

    hpc_nodes_with_active_cc = [
        n for n in hpc_cn_nodes if n.template_assigned and n.bound_cc_node
    ]

    # Exclude the already online healthy HPC nodes before calling node_mgr.allocate
    for hpc_node in hpc_nodes_with_active_cc:
        if hpc_node.ready_for_job:
            hpc_node.bound_cc_node.closed = True

    # Terminate the provisioning-timeout CC nodes
    cc_node_to_terminate: List[Node] = []
    for cc_node in cc_nodes:
        if (
            ci_equals(cc_node.target_state, "Deallocated")
            or ci_equals(cc_node.target_state, "Terminated")
            or cc_node.create_time_remaining
        ):
            continue
        nhi = node_history.find(cc_id=cc_node.delayed_node_id.node_id)
        if not nhi.hpc_id:
            cc_node.closed = True
            cc_node_to_terminate.append(cc_node)
        else:
            hpc_node = ci_find_one(hpc_nodes_with_active_cc, nhi.hpc_id, lambda n: n.id)
            if hpc_node and hpc_node.error:
                cc_node.closed = True
                cc_node_to_terminate.append(cc_node)

    # "ComputeNodes", "CycleCloudNodes", "AzureIaaSNodes" are all treated as default
    # grow_by_socket not supported yet, treat as grow_by_node
    defaultGroups = ["Default", "ComputeNodes", "AzureIaaSNodes", "CycleCloudNodes"]
    default_cores_to_grow = default_nodes_to_grow = 0.0

    # If the current CC nodes in the node array cannot satisfy the grow decision, the group is hungry
    # For a hungry group, no idle check is required if the node health is OK
    group_hungry: Dict[str, bool] = {}
    nbrNewNodes: int = 0
    grow_groups = list(grow_decisions.keys())
    for grp in grow_groups:
        tmp = grow_decisions.pop(grp)
        if not (tmp.cores_to_grow + tmp.nodes_to_grow + tmp.sockets_to_grow):
            continue
        if ci_in(grp, defaultGroups):
            default_cores_to_grow += tmp.cores_to_grow
            default_nodes_to_grow += tmp.nodes_to_grow + tmp.sockets_to_grow
            continue
        if ci_notin(grp, cc_nodearrays):
            logging.warning(
                "No mapping node array for the grow requirement {}:{}".format(grp, tmp)
            )
            continue

        group_hungry[grp] = False
        array = ci_lookup(grp, cc_nodearrays)
        selector = {"ncpus": 1, "node.nodearray": [array]}
        target_cores = math.ceil(tmp.cores_to_grow)
        target_nodes = math.ceil(tmp.nodes_to_grow + tmp.sockets_to_grow)
        if target_nodes:
            logging.info("Allocate: {} Target Nodes: {}".format(selector, target_nodes))
            result = node_mgr.allocate(selector, node_count=target_nodes)
            logging.info(result)
            if not result or result.total_slots < target_nodes:
                group_hungry[grp] = True
        if target_cores:
            logging.info("Allocate: {} Target Cores: {}".format(selector, target_cores))
            result = node_mgr.allocate(selector, slot_count=target_cores)
            logging.info(result)
            if not result or result.total_slots < target_cores:
                group_hungry[grp] = True
        if len(node_mgr.new_nodes) > nbrNewNodes:
            group_hungry[grp] = True
        nbrNewNodes = len(node_mgr.new_nodes)

    # We then check the grow decision for the default node groups:
    checkShrinkNeeded = True
    growForDefaultGroup = bool(default_nodes_to_grow or default_cores_to_grow)
    if growForDefaultGroup:
        selector = {"ncpus": 1}
        if default_nodes_to_grow:
            target_nodes = math.ceil(default_nodes_to_grow)
            logging.info("Allocate: {} Target Nodes: {}".format(selector, target_nodes))
            result = node_mgr.allocate(selector, node_count=target_nodes)
            if not result or result.total_slots < target_nodes:
                checkShrinkNeeded = False
        if default_cores_to_grow:
            target_cores = math.ceil(default_cores_to_grow)
            logging.info("Allocate: {} Target Cores: {}".format(selector, target_cores))
            result = node_mgr.allocate(selector, slot_count=target_cores)
            if not result or result.total_slots < target_cores:
                checkShrinkNeeded = False
        if len(node_mgr.new_nodes) > nbrNewNodes:
            checkShrinkNeeded = False
        nbrNewNodes = len(node_mgr.new_nodes)

    if nbrNewNodes > 0:
        logging.info("Need to allocate {} nodes in total".format(nbrNewNodes))
        if dry_run:
            logging.info("Dry-run: skipping node bootup ...")
        else:
            logging.info("Allocating {} nodes in total".format(len(node_mgr.new_nodes)))
            bootup_result: BootupResult = node_mgr.bootup()
            logging.info(bootup_result)
            if bootup_result and bootup_result.nodes:
                for cc_node in bootup_result.nodes:
                    nhi = node_history.find(cc_id=cc_node.delayed_node_id.node_id)
                    if nhi is None:
                        nhi = node_history.insert(
                            NodeHistoryItem(cc_node.delayed_node_id.node_id)
                        )
                    else:
                        nhi.restart()
    else:
        logging.info("No need to allocate new nodes ...")

    ### Start the shrink checking
    if ctx_handler:
        ctx_handler.set_context("[scale-down]")

    cc_node_to_shutdown: List[Node] = []
    if not checkShrinkNeeded:
        logging.info("No shrink check at this round ...")
        if not dry_run:
            for nhi in node_history.items:
                if not nhi.stopped and nhi.hpc_id:
                    nhi.idle_from = None
    else:
        logging.info("Start scale down checking ...")
        # By default, we check idle for active CC nodes in HPC Pack with 'Offline', 'Starting', 'Online', 'Draining' state
        candidate_idle_check_nodes = [
            n
            for n in hpc_nodes_with_active_cc
            if (not n.bound_cc_node.keep_alive)
            and ci_in(n.state, ["Offline", "Starting", "Online", "Draining"])
        ]

        # We can exclude some nodes from idle checking:
        #   1. If HPC Pack asks for grow in the default node group(s), all healthy ONLINE nodes are considered busy
        #   2. If HPC Pack asks for grow in a certain node group, all healthy ONLINE nodes in that group are considered busy
        #   3. If a node group is hungry (new CC nodes required or grow request not satisfied),
        #      no idle check is needed for any node in that node array
        if growForDefaultGroup:
            candidate_idle_check_nodes = [
                n for n in candidate_idle_check_nodes if not n.ready_for_job
            ]
        for grp, hungry in group_hungry.items():
            if hungry:
                candidate_idle_check_nodes = [
                    n for n in candidate_idle_check_nodes if not ci_equals(grp, n.cc_nodearray)
                ]
            elif not growForDefaultGroup:
                candidate_idle_check_nodes = [
                    n
                    for n in candidate_idle_check_nodes
                    if not (ci_equals(grp, n.cc_nodearray) and n.ready_for_job)
                ]

        curtime = datetime.utcnow()
        # Offline nodes must be idle
        idle_node_names = [
            n.name for n in candidate_idle_check_nodes if ci_equals(n.state, "Offline")
        ]
        if len(candidate_idle_check_nodes) > len(idle_node_names):
            idle_nodes = hpcpack_rest_client.check_nodes_idle(
                [
                    n.name
                    for n in candidate_idle_check_nodes
                    if not ci_equals(n.state, "Offline")
                ]
            )
            if len(idle_nodes) > 0:
                idle_node_names.extend([n.node_name for n in idle_nodes])

        if len(idle_node_names) > 0:
            logging.info("The following nodes are idle: {}".format(idle_node_names))
        else:
            logging.info("No idle node found in this round.")

        retention_days = autoscale_config.get("vm_retention_days") or 7
        for nhi in node_history.items:
            if nhi.stopped:
                if nhi.stop_time + timedelta(days=retention_days) < datetime.utcnow():
                    cc_node = cc_nodes_by_id.get(nhi.cc_id)
                    if cc_node is not None:
                        cc_node_to_terminate.append(cc_node)
                continue
            if ci_in(nhi.hostname, idle_node_names):
                if nhi.idle_from is None:
                    nhi.idle_from = curtime
                elif nhi.idle_timeout(idle_timeout_seconds):
                    nhi.stop_time = curtime
                    cc_node = cc_nodes_by_id.get(nhi.cc_id)
                    if cc_node is not None:
                        cc_node_to_shutdown.append(cc_node)
            else:
                nhi.idle_from = None

    shrinking_cc_node_ids = [n.delayed_node_id.node_id for n in cc_node_to_terminate]
    shrinking_cc_node_ids.extend([n.delayed_node_id.node_id for n in cc_node_to_shutdown])
    hpc_nodes_to_bring_online = [
        n.name
        for n in hpc_nodes_with_active_cc
        if ci_equals(n.state, "Offline")
        and not n.error
        and ci_notin(n.cc_node_id, shrinking_cc_node_ids)
    ]
    hpc_nodes_to_take_offline.extend(
        [
            n.name
            for n in hpc_nodes_with_active_cc
            if ci_equals(n.state, "Online") and ci_in(n.cc_node_id, shrinking_cc_node_ids)
        ]
    )
    if len(hpc_nodes_to_bring_online) > 0:
        logging.info("Bringing the HPC nodes online: {}".format(hpc_nodes_to_bring_online))
        if dry_run:
            logging.info("Dry-run: no real action")
        else:
            hpcpack_rest_client.bring_nodes_online(hpc_nodes_to_bring_online)

    if len(hpc_nodes_to_take_offline) > 0:
        logging.info("Taking the HPC nodes offline: {}".format(hpc_nodes_to_take_offline))
        if dry_run:
            logging.info("Dry-run: no real action")
        else:
            hpcpack_rest_client.take_nodes_offline(hpc_nodes_to_take_offline)

    if len(cc_node_to_shutdown) > 0:
        logging.info(
            "Shutting down the following CycleCloud nodes: {}".format(
                [cn.name for cn in cc_node_to_shutdown]
            )
        )
        if dry_run:
            logging.info("Dry-run: skip ...")
        else:
            node_mgr.shutdown_nodes(cc_node_to_shutdown)

    if len(cc_node_to_terminate) > 0:
        logging.info(
            "Terminating the following provisioning-timeout CycleCloud nodes: {}".format(
                [cn.name for cn in cc_node_to_terminate]
            )
        )
        if dry_run:
            logging.info("Dry-run: skip ...")
        else:
            node_mgr.terminate_nodes(cc_node_to_terminate)

    if not dry_run:
        logging.info("Save node history: {}".format(node_history))
        node_history.save()
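# A minimal sketch (not from the source) of the "autoscale" config section read at the top of
# autoscale_hpcpack. The values are hypothetical; unset keys fall back to the defaults used in
# the code (600s idle, 1500s boot, 7 retention days, state files under C:\cycle\jetpack\config):
#
#     config["autoscale"] = {
#         "start_enabled": True,
#         "idle_timeout": 600,
#         "boot_timeout": 1500,
#         "vm_retention_days": 7,
#         "statefile": "C:\\cycle\\jetpack\\config\\autoscaler_state.txt",
#         "archivefile": "C:\\cycle\\jetpack\\config\\autoscaler_archive.txt",
#     }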
def new_rest_client(config: Dict[str, Any]) -> HpcRestClient:

    hpcpack_config = config.get("hpcpack") or {}
    hpc_pem_file = hpcpack_config.get("pem")
    hn_hostname = hpcpack_config.get("hn_hostname")
    return HpcRestClient(config, pem=hpc_pem_file, hostname=hn_hostname)


if __name__ == "__main__":

    config_file = ""
    if len(sys.argv) > 1:
        config_file = sys.argv[1]

    dry_run = False
    if len(sys.argv) > 2:
        dry_run = ci_in(sys.argv[2], ["true", "dryrun"])

    ctx_handler = register_result_handler(DefaultContextHandler("[initialization]"))
    config = load_config(config_file)
    logging.initialize_logging(config)

    logging.info(
        "------------------------------------------------------------------------"
    )
    if config["autoscale"]["start_enabled"]:
        autoscale_hpcpack(config, ctx_handler=ctx_handler, dry_run=dry_run)
    else:
        logging.info("Autoscaler is not enabled")
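# Illustrative invocation (the script and config file names are hypothetical): the entry point
# above accepts an optional config file as argv[1] and an optional dry-run flag as argv[2].
# Passing "true" or "dryrun" logs the planned grow/shrink actions without booting, shutting
# down, or terminating any nodes:
#
#     python autoscale_hpcpack.py C:\cycle\jetpack\config\autoscale.json dryrun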
def autoscale_grid_engine(
    config: Dict[str, Any],
    ge_env: Optional[GridEngineEnvironment] = None,
    ge_driver: Optional["GridEngineDriver"] = None,
    ctx_handler: Optional[DefaultContextHandler] = None,
    node_history: Optional[NodeHistory] = None,
    dry_run: bool = False,
) -> DemandResult:
    global _exit_code

    assert not config.get("read_only", False)
    if dry_run:
        logging.warning("Running gridengine autoscaler in dry run mode")
        # allow multiple instances
        config["lock_file"] = None
        # put in read only mode
        config["read_only"] = True

    if ge_env is None:
        ge_env = envlib.from_qconf(config)

    # interface to GE, generally by cli
    if ge_driver is None:
        # allow tests to pass in a mock
        ge_driver = new_driver(config, ge_env)

    ge_driver.initialize_environment()

    config = ge_driver.preprocess_config(config)

    logging.fine("Driver = %s", ge_driver)

    invalid_nodes = []

    # we need an instance without any scheduler nodes, so don't
    # pass in the existing nodes.
    tmp_node_mgr = new_node_manager(config)

    by_hostname = partition_single(tmp_node_mgr.get_nodes(), lambda n: n.hostname_or_uuid)

    for node in ge_env.nodes:
        # many combinations of a u and other states. However,
        # as long as a and u are in there it is down
        state = node.metadata.get("state", "")
        cc_node = by_hostname.get(node.hostname)
        ccnodeid = node.resources.get("ccnodeid")
        if cc_node:
            if not ccnodeid or ccnodeid == cc_node.delayed_node_id.node_id:
                if cc_node.state in ["Preparing", "Acquiring"]:
                    continue
        if "a" in state and "u" in state:
            invalid_nodes.append(node)

    # nodes in error state must also be deleted
    nodes_to_delete = ge_driver.clean_hosts(invalid_nodes)
    for node in nodes_to_delete:
        ge_env.delete_node(node)

    demand_calculator = calculate_demand(config, ge_env, ge_driver, ctx_handler, node_history)

    ge_driver.handle_failed_nodes(demand_calculator.node_mgr.get_failed_nodes())

    demand_result = demand_calculator.finish()

    if ctx_handler:
        ctx_handler.set_context("[joining]")

    # details here are that we pass in nodes that matter (matched) and the driver figures out
    # which ones are new and need to be added via qconf
    joined = ge_driver.handle_join_cluster(
        [x for x in demand_result.compute_nodes if x.exists]
    )

    ge_driver.handle_post_join_cluster(joined)

    if ctx_handler:
        ctx_handler.set_context("[scaling]")

    # bootup all nodes. Optionally pass in a filtered list
    if demand_result.new_nodes:
        if not dry_run:
            demand_calculator.bootup()

    if not dry_run:
        demand_calculator.update_history()

    # we also tell the driver about nodes that are unmatched. It filters them out
    # and returns a list of ones we can delete.
    idle_timeout = int(config.get("idle_timeout", 300))
    boot_timeout = int(config.get("boot_timeout", 3600))
    logging.fine("Idle timeout is %s", idle_timeout)

    unmatched_for_5_mins = demand_calculator.find_unmatched_for(at_least=idle_timeout)
    timed_out_booting = demand_calculator.find_booting(at_least=boot_timeout)

    # I don't care about nodes that have keep_alive=true
    timed_out_booting = [n for n in timed_out_booting if not n.keep_alive]

    timed_out_to_deleted = []
    unmatched_nodes_to_delete = []

    if timed_out_booting:
        logging.info("The following nodes have timed out while booting: %s", timed_out_booting)
        timed_out_to_deleted = ge_driver.handle_boot_timeout(timed_out_booting) or []

    if unmatched_for_5_mins:
        node_expr = ", ".join([str(x) for x in unmatched_for_5_mins])
        logging.info("Unmatched for at least %s seconds: %s", idle_timeout, node_expr)
        unmatched_nodes_to_delete = ge_driver.handle_draining(unmatched_for_5_mins) or []

    nodes_to_delete = []
    for node in timed_out_to_deleted + unmatched_nodes_to_delete:
        if node.assignments:
            logging.warning("%s has jobs assigned to it so we will take no action.", node)
            continue
        nodes_to_delete.append(node)

    if nodes_to_delete:
        try:
            logging.info("Deleting %s", [str(n) for n in nodes_to_delete])
            delete_result = demand_calculator.delete(nodes_to_delete)

            if delete_result:
                # in case it has anything to do after a node is deleted
                # (usually just remove it from the cluster)
                ge_driver.handle_post_delete(delete_result.nodes)
        except Exception as e:
            _exit_code = 1
            logging.warning("Deletion failed, will retry on next iteration: %s", e)
            logging.exception(str(e))

    print_demand(config, demand_result, log=not dry_run)

    return demand_result
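# A minimal sketch (not from the source) of the timeout keys read above; the values shown are
# the code defaults. A node must sit unmatched for at least idle_timeout seconds before it is
# drained, and a node still booting after boot_timeout seconds is handed to handle_boot_timeout:
#
#     config = {"idle_timeout": 300, "boot_timeout": 3600}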
def _parse_complexes(
    autoscale_config: Dict, complex_lines: List[str]
) -> Dict[str, "Complex"]:
    relevant_complexes = None
    if autoscale_config:
        relevant_complexes = autoscale_config.get("gridengine", {}).get("relevant_complexes")
        if relevant_complexes:
            # special handling of ccnodeid, since it is something we
            # create for the user
            relevant_complexes = relevant_complexes + ["ccnodeid"]

    if relevant_complexes:
        logging.info("Restricting complexes for autoscaling to %s", relevant_complexes)

    complexes: List[Complex] = []
    headers = complex_lines[0].lower().replace("#", "").split()

    required = set(["name", "type", "consumable"])
    missing = required - set(headers)
    if missing:
        logging.error(
            "Could not parse complex file as it is missing expected columns: %s."
            " Autoscale likely will not work.",
            list(missing),
        )
        return {}

    for n, line in enumerate(complex_lines[1:]):
        if line.startswith("#"):
            continue
        toks = line.split()
        if len(toks) != len(headers):
            logging.warning(
                "Could not parse complex at line {} - ignoring: '{}'".format(n, line)
            )
            continue
        c = dict(zip(headers, toks))

        try:
            if (
                relevant_complexes
                and c["name"] not in relevant_complexes
                and c["shortcut"] not in relevant_complexes
            ):
                logging.trace(
                    "Ignoring complex %s because it was not defined in gridengine.relevant_complexes",
                    c["name"],
                )
                continue

            complex = Complex(
                name=c["name"],
                shortcut=c.get("shortcut", c["name"]),
                complex_type=c["type"],
                relop=c.get("relop", "=="),
                requestable=c.get("requestable", "YES").lower() == "yes",
                consumable=c.get("consumable", "YES").lower() == "yes",
                default=c.get("default"),
                urgency=int(c.get("urgency", 0)),
            )

            complexes.append(complex)

        except Exception:
            logging.exception("Could not parse complex %s - %s", line, c)

    # TODO test RDH
    ret = partition_single(complexes, lambda x: x.name)
    shortcut_dict = partition_single(complexes, lambda x: x.shortcut)
    ret.update(shortcut_dict)
    return ret
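# For illustration only: complex_lines is typically the tabular complex configuration that
# Grid Engine prints (e.g. via "qconf -sc"): a header row followed by one complex per line.
# The rows below are hypothetical but follow that layout:
#
#     #name               shortcut   type        relop   requestable consumable default  urgency
#     slots               s          INT         <=      YES         YES        1        1000
#     ccnodeid            ccnodeid   RESTRING    ==      YES         NO         NONE     0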
def autoscale_pbspro(
    config: Dict[str, Any],
    pbs_env: Optional[PBSProEnvironment] = None,
    pbs_driver: Optional[PBSProDriver] = None,
    ctx_handler: Optional[DefaultContextHandler] = None,
    node_history: Optional[NodeHistory] = None,
    dry_run: bool = False,
) -> DemandResult:
    global _exit_code

    assert not config.get("read_only", False)
    if dry_run:
        logging.warning("Running pbs autoscaler in dry run mode")
        # allow multiple instances
        config["lock_file"] = None
        # put in read only mode
        config["read_only"] = True

    # interface to PBSPro, generally by cli
    if pbs_driver is None:
        # allow tests to pass in a mock
        pbs_driver = PBSProDriver(config)

    if pbs_env is None:
        pbs_env = envlib.from_driver(config, pbs_driver)

    pbs_driver.initialize()

    config = pbs_driver.preprocess_config(config)

    logging.debug("Driver = %s", pbs_driver)

    demand_calculator = calculate_demand(config, pbs_env, ctx_handler, node_history)

    failed_nodes = demand_calculator.node_mgr.get_failed_nodes()
    for node in pbs_env.scheduler_nodes:
        if "down" in node.metadata.get("pbs_state", ""):
            failed_nodes.append(node)
    pbs_driver.handle_failed_nodes(failed_nodes)

    demand_result = demand_calculator.finish()

    if ctx_handler:
        ctx_handler.set_context("[joining]")

    # details here are that we pass in nodes that matter (matched) and the driver figures out
    # which ones are new and need to be added
    joined = pbs_driver.add_nodes_to_cluster(
        [x for x in demand_result.compute_nodes if x.exists]
    )

    pbs_driver.handle_post_join_cluster(joined)

    if ctx_handler:
        ctx_handler.set_context("[scaling]")

    # bootup all nodes. Optionally pass in a filtered list
    if demand_result.new_nodes:
        if not dry_run:
            demand_calculator.bootup()

    if not dry_run:
        demand_calculator.update_history()

    # we also tell the driver about nodes that are unmatched. It filters them out
    # and returns a list of ones we can delete.
    idle_timeout = int(config.get("idle_timeout", 300))
    boot_timeout = int(config.get("boot_timeout", 3600))
    logging.fine("Idle timeout is %s", idle_timeout)

    unmatched_for_5_mins = demand_calculator.find_unmatched_for(at_least=idle_timeout)
    timed_out_booting = demand_calculator.find_booting(at_least=boot_timeout)

    # I don't care about nodes that have keep_alive=true
    timed_out_booting = [n for n in timed_out_booting if not n.keep_alive]

    timed_out_to_deleted = []
    unmatched_nodes_to_delete = []

    if timed_out_booting:
        logging.info("The following nodes have timed out while booting: %s", timed_out_booting)
        timed_out_to_deleted = pbs_driver.handle_boot_timeout(timed_out_booting) or []

    if unmatched_for_5_mins:
        logging.info("unmatched_for_5_mins %s", unmatched_for_5_mins)
        unmatched_nodes_to_delete = pbs_driver.handle_draining(unmatched_for_5_mins) or []

    nodes_to_delete = []
    for node in timed_out_to_deleted + unmatched_nodes_to_delete:
        if node.assignments:
            logging.warning("%s has jobs assigned to it so we will take no action.", node)
            continue
        nodes_to_delete.append(node)

    if nodes_to_delete:
        try:
            logging.info("Deleting %s", [str(n) for n in nodes_to_delete])
            delete_result = demand_calculator.delete(nodes_to_delete)

            if delete_result:
                # in case it has anything to do after a node is deleted
                # (usually just remove it from the cluster)
                pbs_driver.handle_post_delete(delete_result.nodes)
        except Exception as e:
            _exit_code = 1
            logging.warning("Deletion failed, will retry on next iteration: %s", e)
            logging.exception(str(e))

    print_demand(config, demand_result, log=not dry_run)

    return demand_result