def read_schedulers(
    pbscmd: PBSCMD, resource_definitions: Dict[str, PBSProResourceDefinition]
) -> Dict[Hostname, PBSProScheduler]:
    parser = get_pbspro_parser()

    sched_dicts = pbscmd.qmgr_parsed("list", "sched")
    server_dicts = pbscmd.qmgr_parsed("list", "server")
    server_dicts_by_host = partition_single(server_dicts, lambda s: s["server_host"])

    ret: Dict[str, PBSProScheduler] = {}

    for sched_dict in sched_dicts:
        hostname = sched_dict["sched_host"]
        server_dict = server_dicts_by_host[hostname]

        for key, value in server_dict.items():
            if key not in sched_dict:
                sched_dict[key] = value

        # this is a scheduler, so it has no parent shared resources
        resource_state = parser.parse_resource_state(
            sched_dict, parent_shared_resources=None
        )
        scheduler = PBSProScheduler(sched_dict, resource_state)
        ret[scheduler.hostname] = scheduler

    return ret
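# Hedged usage sketch (not part of the module): wiring read_schedulers up with
# read_resource_definitions. The empty config dict is an assumption for
# illustration only.
def _example_read_schedulers() -> None:
    pbscmd = PBSCMD(get_pbspro_parser())
    resource_definitions = read_resource_definitions(pbscmd, {})
    schedulers = read_schedulers(pbscmd, resource_definitions)
    for hostname, scheduler in schedulers.items():
        # each scheduler dict also carries the attributes of the server
        # running on the same host, merged in by read_schedulers above
        print(hostname, scheduler)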
def __init__(self) -> None:
    clilib.CommonCLI.__init__(self, "pbspro")
    # bootstrap parser
    set_pbspro_parser(PBSProParser({}))
    self.pbscmd = PBSCMD(get_pbspro_parser())
    # lazily initialized
    self.__pbs_env: Optional[environment.PBSProEnvironment] = None
    self.__driver: Optional[PBSProDriver] = None
def read_resource_definitions(
    pbscmd: PBSCMD, config: Dict
) -> Dict[str, "PBSProResourceDefinition"]:
    ret: Dict[str, PBSProResourceDefinition] = {}
    res_dicts = pbscmd.qmgr_parsed("list", "resource")
    res_names = {x["name"] for x in res_dicts}

    # TODO I believe this is the only one, but leaving a config option
    # as a backup plan
    read_only = config.get("pbspro", {}).get("read_only_resources", ["host", "vnode"])

    def_sched = pbscmd.qmgr_parsed("list", "sched", "default")
    sched_priv = def_sched[0]["sched_priv"]
    sched_config = os.path.join(sched_priv, "sched_config")

    from pbspro.parser import PBSProParser

    parser = PBSProParser(config)
    sched_resources = parser.parse_resources_from_sched_priv(sched_config)

    missing_res = sched_resources - res_names
    missing_res_dicts = []
    for res_name in missing_res:
        try:
            missing_res_dicts.extend(pbscmd.qmgr_parsed("list", "resource", res_name))
        except CalledProcessError as e:
            logging.warning(
                "Could not find resource %s that was defined in %s. Ignoring.",
                res_name,
                sched_config,
            )
            logging.fine(e)

    for rdict in res_dicts + missing_res_dicts:
        name = rdict["name"]
        res_type = RESOURCE_TYPES[rdict["type"]]
        flag: ResourceFlag = rdict.get("flag", "")  # type: ignore
        ret[name] = PBSProResourceDefinition(name, res_type, flag)
        if name in read_only:
            ret[name].read_only = True

    return ret
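# Hedged illustration of the config shape consulted above. The
# "read_only_resources" key is the one actually read; the values shown are
# its defaults from the function body.
_EXAMPLE_RESOURCE_CONFIG = {
    "pbspro": {
        # resources reported by qmgr that are treated as read-only
        "read_only_resources": ["host", "vnode"],
    }
}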
def parse_scheduler_nodes(
    config: Dict,
    pbscmd: PBSCMD,
    resource_definitions: Dict[str, PBSProResourceDefinition],
) -> List[Node]:
    """
    Gets the current state of the nodes as the scheduler sees them, including
    resources, assigned resources, jobs currently running etc.
    """
    ret: List[Node] = []

    ignore_onprem = config.get("pbspro", {}).get("ignore_onprem", False)
    ignore_hostnames_re_expr = config.get("pbspro", {}).get("ignore_hostnames_re")
    ignore_hostnames_re = None
    if ignore_hostnames_re_expr:
        try:
            ignore_hostnames_re = re.compile(ignore_hostnames_re_expr)
        except re.error:
            logging.exception(
                f"Could not parse {ignore_hostnames_re_expr} as a regular expression"
            )

    ignored_hostnames = []

    for ndict in pbscmd.pbsnodes_parsed("-a"):
        if ignore_hostnames_re and ignore_hostnames_re.match(ndict["name"]):
            ignored_hostnames.append(ndict["name"])
            continue

        # on-prem nodes are not CycleCloud managed, so they lack a ccnodeid
        if ignore_onprem and not ndict.get("resources_available.ccnodeid"):
            ignored_hostnames.append(ndict["name"])
            continue

        node = parse_scheduler_node(ndict, resource_definitions)

        if not node.available.get("ccnodeid"):
            node.metadata["override_resources"] = False
            logging.fine(
                "'ccnodeid' is not defined, so %s either has not yet been joined"
                " to the cluster by the autoscaler or is not a CycleCloud managed node",
                node,
            )
        ret.append(node)

    if ignored_hostnames:
        if len(ignored_hostnames) < 5:
            logging.info(
                f"Ignored {len(ignored_hostnames)} hostnames: {','.join(ignored_hostnames)}"
            )
        else:
            logging.info(
                f"Ignored {len(ignored_hostnames)} hostnames: "
                f"{','.join(ignored_hostnames[:5])}..."
            )
    return ret
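# Hedged example of the two filtering knobs read above; the regex value is an
# illustrative naming convention, not something the code prescribes.
_EXAMPLE_NODE_FILTER_CONFIG = {
    "pbspro": {
        "ignore_onprem": True,                # skip nodes without a ccnodeid resource
        "ignore_hostnames_re": "^onprem-.*",  # hypothetical hostname pattern
    }
}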
def list_queue_names(pbscmd: PBSCMD) -> List[str]:
    ret = []
    lines_less_header = pbscmd.qstat("-Q").splitlines()[1:]
    for line in lines_less_header:
        line = line.strip()
        if not line:
            continue
        if line.startswith("---"):
            continue
        qname = line.split()[0]
        ret.append(qname)
    return ret
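# Illustrative qstat -Q output consumed above (shape assumed from PBS Pro's
# default tabular format). The header row is dropped by [1:], the "---"
# separator is skipped, and list_queue_names would return ["workq", "htcq"]:
#
#   Queue        Max   Tot Ena Str   Que   Run   Hld   Wat   Trn   Ext Type
#   ----------- ----- ----- --- --- ----- ----- ----- ----- ----- ----- ----
#   workq           0     5 yes yes     5     0     0     0     0     0 Exec
#   htcq            0     0 yes yes     0     0     0     0     0     0 Exec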
def get_pbspro_parser() -> PBSProParser:
    global _PARSER
    if _PARSER is None:
        # avoid circular import
        from pbspro.pbscmd import PBSCMD
        from pbspro.resource import read_resource_definitions

        # chicken / egg issue: we want the resource definitions
        # as a member of the parser, but we need the parser to parse
        # the definitions...
        # So create temp parser with no resource definitions
        _PARSER = PBSProParser({})
        pbscmd = PBSCMD(_PARSER)
        logging.warning(
            "Using uninitialized PBSProParser: please call"
            " set_pbspro_parser before calling get_pbspro_parser"
        )
        resource_definitions = read_resource_definitions(pbscmd, {})
        _PARSER = PBSProParser(resource_definitions)
    return _PARSER
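# The call order the warning above asks for: install a fully initialized
# parser once at startup so the fallback path never runs. A minimal sketch,
# assuming an empty autoscale config:
def _example_parser_bootstrap() -> None:
    pbscmd = PBSCMD(PBSProParser({}))  # definition-free parser, enough to run qmgr
    definitions = read_resource_definitions(pbscmd, {})
    set_pbspro_parser(PBSProParser(definitions))
    get_pbspro_parser()  # returns the parser installed above; no warning logged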
def __init__(
    self,
    config: Dict,
    pbscmd: Optional[PBSCMD] = None,
    resource_definitions: Optional[Dict[str, PBSProResourceDefinition]] = None,
    down_timeout: int = 300,
) -> None:
    super().__init__("pbspro")
    self.config = config
    self.pbscmd = pbscmd or PBSCMD(get_pbspro_parser())
    self.__queues: Optional[Dict[str, PBSProQueue]] = None
    # the annotation alone does not create the attribute; initialize to None
    self.__shared_resources: Optional[Dict[str, SharedResource]] = None
    self.__resource_definitions = resource_definitions
    self.__read_only_resources: Optional[Set[str]] = None
    self.__jobs_cache: Optional[List[Job]] = None
    self.__scheduler_nodes_cache: Optional[List[Node]] = None
    self.down_timeout = down_timeout
    self.down_timeout_td = datetime.timedelta(seconds=self.down_timeout)
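# Hedged construction sketch: every argument other than config has a default,
# so a driver can be built from a bare config dict. Reading the name, 300
# seconds appears to be the grace period before a node is treated as down
# (an assumption, not confirmed by this excerpt):
#
#     driver = PBSProDriver(config={}, down_timeout=300)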
def read_queues(
    config: Dict,
    pbscmd: PBSCMD,
    resource_definitions: Dict[str, PBSProResourceDefinition],
    scheduler_shared_resources: Dict[str, conslib.SharedResource],
) -> Dict[str, PBSProQueue]:
    parser = get_pbspro_parser()
    ret: Dict[str, PBSProQueue] = {}
    qnames = list_queue_names(pbscmd)
    queue_dicts = pbscmd.qmgr_parsed("list", "queue", ",".join(qnames))

    # Queue resources include totals like ncpus - i.e. the total amount of
    # ncpus across the queue. They are informational only and meaningless
    # as a shared constraint.
    ignore_queues = config.get("pbspro", {}).get("ignore_queues", [])

    for qdict in queue_dicts:
        state_count = parser.parse_state_counts(qdict["state_count"])

        resource_state = parser.parse_resource_state(
            qdict, scheduler_shared_resources
        )

        queue = PBSProQueue(
            name=qdict["name"],
            queue_type=qdict["queue_type"],
            node_group_key=qdict.get("node_group_key"),
            node_group_enable=qdict.get("node_group_enable", "").lower() == "true",
            total_jobs=int(qdict["total_jobs"]),
            state_count=state_count,
            resource_state=resource_state,
            resources_default=parser.parse_resources_default(qdict),
            default_chunk=parser.parse_default_chunk(qdict),
            resource_definitions=resource_definitions,
            enabled=qdict["enabled"].lower() == "true"
            and qdict["name"] not in ignore_queues,
            started=qdict["started"].lower() == "true",
        )
        ret[queue.name] = queue
    return ret
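# Hedged example of the "ignore_queues" knob read above: listed queues are
# still parsed but marked disabled, so parse_jobs below skips their jobs.
# The queue name is hypothetical.
_EXAMPLE_QUEUE_CONFIG = {
    "pbspro": {
        "ignore_queues": ["maintenanceq"],
    }
}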
def parse_jobs(
    pbscmd: PBSCMD,
    resource_definitions: Dict[str, PBSProResourceDefinition],
    queues: Dict[str, PBSProQueue],
    resources_for_scheduling: Set[str],
) -> List[Job]:
    """
    Parses PBS qstat output and creates relevant hpc.autoscale.job.job.Job objects
    """
    parser = get_pbspro_parser()
    # alternate format triggered by
    # -a, -i, -G, -H, -M, -n, -r, -s, -T, or -u
    ret: List[Job] = []

    response: Dict = pbscmd.qstat_json("-f", "-t")

    for job_id, jdict in response.get("Jobs", {}).items():
        job_id = job_id.split(".")[0]

        job_state = jdict.get("job_state")
        if not job_state:
            logging.warning("No job_state defined for job %s. Skipping", job_id)
            continue

        if job_state != PBSProJobStates.Queued:
            continue

        # ensure we don't autoscale jobs from disabled or non-started queues
        qname = jdict.get("queue")
        if not qname or qname not in queues:
            logging.warning("queue was not defined for job %s: ignoring", job_id)
            continue

        queue: PBSProQueue = queues[qname]
        if not queue.enabled:
            logging.fine("Skipping job %s from disabled queue %s", job_id, qname)
            continue

        if not queue.started:
            logging.fine("Skipping job %s from non-started queue %s", job_id, qname)
            continue

        # handle array vs individual jobs
        if jdict.get("array"):
            iterations = parser.parse_range_size(jdict["array_indices_submitted"])
            remaining = parser.parse_range_size(jdict["array_indices_remaining"])
        elif "[" in job_id:
            continue
        else:
            iterations = 1
            remaining = 1

        res_list = jdict["Resource_List"]
        res_list["schedselect"] = jdict["schedselect"]
        rdict = parser.convert_resource_list(res_list)

        pack = (
            PackingStrategy.PACK
            if rdict["place"]["arrangement"] in ["free", "pack"]
            else PackingStrategy.SCATTER
        )

        # SMP style jobs
        is_smp = (
            rdict["place"].get("grouping") == "host"
            or rdict["place"]["arrangement"] == "pack"
        )

        # pack jobs do not need to define node_count
        node_count = int(rdict.get("nodect", "0"))

        smp_multiplier = 1

        if is_smp:
            smp_multiplier = max(1, iterations) * max(1, node_count)
            # for key, value in list(rdict.items()):
            #     if isinstance(value, (float, int)):
            #         value = value * smp_multiplier
            iterations = node_count = 1

        effective_node_count = max(node_count, 1)

        # htc jobs set ungrouped=true. See our default htcq.
        colocated = (
            not is_smp
            and queue.uses_placement
            and rdict.get("ungrouped", "false").lower() == "false"
        )

        sharing = rdict["place"].get("sharing")

        for n, chunk_base in enumerate(rdict["schedselect"]):
            chunk: Dict[str, Any] = {}
            chunk.update(rdict)

            if "ncpus" not in chunk_base:
                chunk["ncpus"] = chunk["ncpus"] // effective_node_count

            if smp_multiplier > 1:
                for key, value in list(chunk_base.items()):
                    if isinstance(value, (int, float)):
                        chunk_base[key] = value * smp_multiplier

            # do this _after_ rdict, since the chunks
            # will override the top level resources
            # e.g. notice that ncpus=4. This will be the rdict value
            # but the chunks have ncpus=2
            # Resource_List.ncpus = 4
            # Resource_List.nodect = 2
            # Resource_List.select = 2:ncpus=2
            chunk.update(chunk_base)

            working_constraint: Dict[str, Any] = {}
            constraints = [working_constraint]

            if colocated:
                working_constraint["in-a-placement-group"] = True

            my_job_id = job_id
            if len(rdict["schedselect"]) > 1:
                if "." in job_id:
                    job_index, host = job_id.split(".", 1)
                    my_job_id = "{}+{}.{}".format(job_index, n, host)
                else:
                    my_job_id = "{}+{}".format(job_id, n)

            if sharing == "excl":
                working_constraint["exclusive-task"] = True
            elif sharing == "exclhost":
                working_constraint["exclusive"] = True

            job_resources = {}

            for rname, rvalue in chunk.items():
                if rname in ["select", "schedselect", "place", "nodect"]:
                    continue

                if rname not in resources_for_scheduling:
                    if rname == "skipcyclesubhook":
                        continue
                    logging.warning(
                        "Ignoring resource %s as it was not defined in sched_config",
                        rname,
                    )
                    continue

                # add all resource requests here. By that, I mean
                # non resource requests, like exclusive, should be ignored.
                # required for get_non_host_constraints
                job_resources[rname] = rvalue

                resource_def = resource_definitions.get(rname)

                # constraints are for the node/host;
                # queue/scheduler level ones will be added using
                # > queue.get_non_host_constraints(job_resource)
                if not resource_def or not resource_def.is_host:
                    continue

                if rname not in working_constraint:
                    working_constraint[rname] = rvalue
                else:
                    # hit a conflict, so start a new working constraint
                    # so that we maintain precedence
                    working_constraint = {rname: rvalue}
                    constraints.append(working_constraint)

            queue_constraints = queue.get_non_host_constraints(job_resources)
            constraints.extend(queue_constraints)

            job = Job(
                name=my_job_id,
                constraints=constraints,
                iterations=iterations,
                node_count=node_count,
                colocated=colocated,
                packing_strategy=pack,
            )
            job.iterations_remaining = remaining
            ret.append(job)

    return ret
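# Worked check of the chunk override described in the comments above, using
# the same numbers: Resource_List.ncpus = 4, nodect = 2, select = 2:ncpus=2.
# chunk.update(chunk_base) lets the per-chunk ncpus win over the job total.
def _example_chunk_override() -> None:
    rdict = {"ncpus": 4, "nodect": 2}   # top-level Resource_List values
    chunk_base = {"ncpus": 2}           # one chunk of select=2:ncpus=2
    chunk = dict(rdict)
    chunk.update(chunk_base)
    assert chunk["ncpus"] == 2          # per-chunk value overrides the total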
def _initialize(self, command: str, config: Dict) -> None:
    # replace the bootstrap parser with one built from the real resource
    # definitions, then rebuild pbscmd so it uses the new parser
    resource_definitions = read_resource_definitions(self.pbscmd, config)
    set_pbspro_parser(PBSProParser(resource_definitions))
    self.pbscmd = PBSCMD(get_pbspro_parser())