def read_schedulers(
    pbscmd: PBSCMD, resource_definitions: Dict[str, PBSProResourceDefinition]
) -> Dict[Hostname, PBSProScheduler]:
    parser = get_pbspro_parser()

    sched_dicts = pbscmd.qmgr_parsed("list", "sched")
    server_dicts = pbscmd.qmgr_parsed("list", "server")
    server_dicts_by_host = partition_single(server_dicts, lambda s: s["server_host"])

    ret: Dict[str, PBSProScheduler] = {}

    for sched_dict in sched_dicts:
        hostname = sched_dict["sched_host"]
        server_dict = server_dicts_by_host[hostname]

        for key, value in server_dict.items():
            if key not in sched_dict:
                sched_dict[key] = value

        # this is a scheduler, so it has no parent shared resources
        resource_state = parser.parse_resource_state(
            sched_dict, parent_shared_resources=None
        )
        scheduler = PBSProScheduler(sched_dict, resource_state)
        ret[scheduler.hostname] = scheduler

    return ret
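# A minimal usage sketch, not from the source: wiring read_schedulers to a
# bootstrapped parser. It assumes get_pbspro_parser() has already been seeded
# with resource definitions; the helper name below is hypothetical.
def _example_read_schedulers() -> None:
    pbscmd = PBSCMD(get_pbspro_parser())
    resource_definitions = get_pbspro_parser().resource_definitions
    schedulers = read_schedulers(pbscmd, resource_definitions)
    for hostname, scheduler in schedulers.items():
        # only_explicit_psets is validated in PBSProScheduler.__init__
        print(hostname, scheduler.scheduling, scheduler.pbs_version)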
def __init__(
    self,
    sched_dict: Dict[str, str],
    resource_state: ResourceState,
) -> None:
    btype = BooleanType()
    self.do_not_span_psets = btype.parse(
        sched_dict.get("do_not_span_psets", "false")
    )
    self.scheduling = btype.parse(sched_dict["scheduling"])
    self.only_explicit_psets = btype.parse(
        sched_dict.get("only_explicit_psets", "false")
    )
    self.node_group_enable = btype.parse(
        sched_dict.get("node_group_enable", "false")
    )
    self.node_group_key = sched_dict.get("node_group_key")
    self.sched_log = sched_dict["sched_log"]
    self.sched_priv = sched_dict["sched_priv"]
    priv_config_path = os.path.join(self.sched_priv, "sched_config")
    self.resources_for_scheduling = (
        get_pbspro_parser().parse_resources_from_sched_priv(priv_config_path)
    )
    self.state = sched_dict["state"]
    self.hostname = sched_dict["sched_host"].split(".")[0]
    self.resource_state = resource_state

    try:
        self.pbs_version: Tuple = tuple(
            [int(x) for x in sched_dict["pbs_version"].split(".")]
        )
    except ValueError:
        self.pbs_version = tuple(sched_dict["pbs_version"].split("."))

    self.sched_dict = sched_dict

    if not self.only_explicit_psets:
        logging.error(
            "only_explicit_psets must be set to true. You can change this by running:"
            + ' qmgr -c "set sched default only_explicit_psets = true"'
        )
def __init__(self) -> None:
    clilib.CommonCLI.__init__(self, "pbspro")
    # bootstrap parser
    set_pbspro_parser(PBSProParser({}))
    self.pbscmd = PBSCMD(get_pbspro_parser())
    # lazily initialized
    self.__pbs_env: Optional[environment.PBSProEnvironment] = None
    self.__driver: Optional[PBSProDriver] = None
def __init__(
    self,
    config: Dict,
    pbscmd: Optional[PBSCMD] = None,
    resource_definitions: Optional[Dict[str, PBSProResourceDefinition]] = None,
    down_timeout: int = 300,
) -> None:
    super().__init__("pbspro")
    self.config = config
    self.pbscmd = pbscmd or PBSCMD(get_pbspro_parser())
    self.__queues: Optional[Dict[str, PBSProQueue]] = None
    self.__shared_resources: Optional[Dict[str, SharedResource]]
    self.__resource_definitions = resource_definitions
    self.__read_only_resources: Optional[Set[str]] = None
    self.__jobs_cache: Optional[List[Job]] = None
    self.__scheduler_nodes_cache: Optional[List[Node]] = None
    self.down_timeout = down_timeout
    self.down_timeout_td = datetime.timedelta(seconds=self.down_timeout)
def read_queues(
    config: Dict,
    pbscmd: PBSCMD,
    resource_definitions: Dict[str, PBSProResourceDefinition],
    scheduler_shared_resources: Dict[str, conslib.SharedResource],
) -> Dict[str, PBSProQueue]:
    parser = get_pbspro_parser()
    ret: Dict[str, PBSProQueue] = {}
    qnames = list_queue_names(pbscmd)
    queue_dicts = pbscmd.qmgr_parsed("list", "queue", ",".join(qnames))

    # Queue resources will include things like ncpus - i.e. the total amount of ncpus.
    # They are meaningless as a shared constraint; they are only there for info purposes.
    ignore_queues = config.get("pbspro", {}).get("ignore_queues", [])

    for qdict in queue_dicts:
        state_count = parser.parse_state_counts(qdict["state_count"])

        resource_state = parser.parse_resource_state(qdict, scheduler_shared_resources)

        queue = PBSProQueue(
            name=qdict["name"],
            queue_type=qdict["queue_type"],
            node_group_key=qdict.get("node_group_key"),
            node_group_enable=qdict.get("node_group_enable", "").lower() == "true",
            total_jobs=int(qdict["total_jobs"]),
            state_count=state_count,
            resource_state=resource_state,
            resources_default=parser.parse_resources_default(qdict),
            default_chunk=parser.parse_default_chunk(qdict),
            resource_definitions=resource_definitions,
            enabled=qdict["enabled"].lower() == "true"
            and qdict["name"] not in ignore_queues,
            started=qdict["started"].lower() == "true",
        )
        ret[queue.name] = queue

    return ret
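# Hedged sketch, not from the source: calling read_queues directly. In the real
# driver the shared resources come from the scheduler's parsed resource state;
# here an empty dict stands in for them, and the config values are illustrative.
def _example_read_queues() -> None:
    config = {"pbspro": {"ignore_queues": ["workq2"]}}
    pbscmd = PBSCMD(get_pbspro_parser())
    resource_definitions = get_pbspro_parser().resource_definitions
    queues = read_queues(config, pbscmd, resource_definitions, {})
    for qname, queue in queues.items():
        print(qname, queue.enabled, queue.started)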
def parse_scheduler_node(
    ndict: Dict[str, Any],
    resource_definitions: Dict[str, PBSProResourceDefinition],
) -> SchedulerNode:
    """
    Implementation of parsing a single scheduler node.
    """
    parser = get_pbspro_parser()

    hostname = ndict["name"]
    res_avail = parser.parse_resources_available(ndict, filter_is_host=True)
    res_assigned = parser.parse_resources_assigned(ndict, filter_is_host=True)

    node = SchedulerNode(hostname, res_avail)
    jobs_expr = ndict.get("jobs", "")

    state = ndict.get("state") or ""
    if state == "free" and jobs_expr.strip():
        state = "partially-free"

    node.metadata["pbs_state"] = state
    if "down" in state:
        node.marked_for_deletion = True

    node.metadata["last_state_change_time"] = ndict.get("last_state_change_time", "")

    for tok in jobs_expr.split(","):
        tok = tok.strip()
        if not tok:
            continue
        job_id_full, sub_job_id = tok.rsplit("/", 1)
        sched_host = ""
        if "." in job_id_full:
            job_id, sched_host = job_id_full.split(".", 1)
        else:
            job_id = job_id_full

        node.assign(job_id)

        if "job_ids_long" not in node.metadata:
            node.metadata["job_ids_long"] = [job_id_full]
        elif job_id_full not in node.metadata["job_ids_long"]:
            node.metadata["job_ids_long"].append(job_id_full)

    for res_name, value in res_assigned.items():
        resource = resource_definitions.get(res_name)

        if not resource or not resource.is_host:
            continue

        if resource.is_consumable:
            if res_name in node.available:
                node.available[res_name] -= value
            else:
                logging.warning(
                    "%s was not defined under resources_available, but was "
                    + "defined under resources_assigned for %s. Setting available to assigned.",
                    res_name,
                    node,
                )
                node.available[res_name] = value

    if "exclusive" in node.metadata["pbs_state"]:
        node.closed = True

    return node
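# Hedged example, not from the source: the "jobs" attribute parsed above is a
# comma-separated list of "<job_id>.<server>/<subjob>" tokens. This isolates the
# split logic with made-up sample values.
def _example_jobs_expr() -> None:
    jobs_expr = "100.scheduler1/0, 100.scheduler1/1, 101.scheduler1/0"
    for tok in jobs_expr.split(","):
        job_id_full, sub_job_id = tok.strip().rsplit("/", 1)
        job_id = job_id_full.split(".", 1)[0]
        print(job_id, sub_job_id)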
def resource_definitions(self) -> Dict[str, PBSProResourceDefinition]:
    if not self.__resource_definitions:
        self.__resource_definitions = get_pbspro_parser().resource_definitions
    return self.__resource_definitions
def parse_jobs(
    pbscmd: PBSCMD,
    resource_definitions: Dict[str, PBSProResourceDefinition],
    queues: Dict[str, PBSProQueue],
    resources_for_scheduling: Set[str],
) -> List[Job]:
    """
    Parses PBS qstat output and creates relevant hpc.autoscale.job.job.Job objects
    """
    parser = get_pbspro_parser()
    # alternate format triggered by
    # -a, -i, -G, -H, -M, -n, -r, -s, -T, or -u
    ret: List[Job] = []

    response: Dict = pbscmd.qstat_json("-f", "-t")

    for job_id, jdict in response.get("Jobs", {}).items():
        job_id = job_id.split(".")[0]

        job_state = jdict.get("job_state")
        if not job_state:
            logging.warning("No job_state defined for job %s. Skipping", job_id)
            continue

        if job_state != PBSProJobStates.Queued:
            continue

        # ensure we don't autoscale jobs from disabled or non-started queues
        qname = jdict.get("queue")
        if not qname or qname not in queues:
            logging.warning("queue was not defined for job %s: ignoring", job_id)
            continue

        queue: PBSProQueue = queues[qname]
        if not queue.enabled:
            logging.fine("Skipping job %s from disabled queue %s", job_id, qname)
            continue

        if not queue.started:
            logging.fine("Skipping job %s from non-started queue %s", job_id, qname)
            continue

        # handle array vs individual jobs
        if jdict.get("array"):
            iterations = parser.parse_range_size(jdict["array_indices_submitted"])
            remaining = parser.parse_range_size(jdict["array_indices_remaining"])
        elif "[" in job_id:
            continue
        else:
            iterations = 1
            remaining = 1

        res_list = jdict["Resource_List"]
        res_list["schedselect"] = jdict["schedselect"]
        rdict = parser.convert_resource_list(res_list)

        pack = (
            PackingStrategy.PACK
            if rdict["place"]["arrangement"] in ["free", "pack"]
            else PackingStrategy.SCATTER
        )

        # SMP style jobs
        is_smp = (
            rdict["place"].get("grouping") == "host"
            or rdict["place"]["arrangement"] == "pack"
        )

        # pack jobs do not need to define node_count
        node_count = int(rdict.get("nodect", "0"))

        smp_multiplier = 1

        if is_smp:
            smp_multiplier = max(1, iterations) * max(1, node_count)
            # for key, value in list(rdict.items()):
            #     if isinstance(value, (float, int)):
            #         value = value * smp_multiplier
            iterations = node_count = 1

        effective_node_count = max(node_count, 1)

        # htc jobs set ungrouped=true. see our default htcq
        colocated = (
            not is_smp
            and queue.uses_placement
            and rdict.get("ungrouped", "false").lower() == "false"
        )

        sharing = rdict["place"].get("sharing")

        for n, chunk_base in enumerate(rdict["schedselect"]):
            chunk: Dict[str, Any] = {}
            chunk.update(rdict)

            if "ncpus" not in chunk_base:
                chunk["ncpus"] = chunk["ncpus"] // effective_node_count

            if smp_multiplier > 1:
                for key, value in list(chunk_base.items()):
                    if isinstance(value, (int, float)):
                        chunk_base[key] = value * smp_multiplier

            # do this _after_ rdict, since the chunks
            # will override the top level resources
            # e.g. notice that ncpus=4. This will be the rdict value
            # but the chunks have ncpus=2
            # Resource_List.ncpus = 4
            # Resource_List.nodect = 2
            # Resource_List.select = 2:ncpus=2
            chunk.update(chunk_base)

            working_constraint: Dict[str, Any] = {}
            constraints = [working_constraint]

            if colocated:
                working_constraint["in-a-placement-group"] = True

            my_job_id = job_id
            if len(rdict["schedselect"]) > 1:
                if "." in job_id:
                    job_index, host = job_id.split(".", 1)
                    my_job_id = "{}+{}.{}".format(job_index, n, host)
                else:
                    my_job_id = "{}+{}".format(job_id, n)

            if sharing == "excl":
                working_constraint["exclusive-task"] = True
            elif sharing == "exclhost":
                working_constraint["exclusive"] = True

            job_resources = {}

            for rname, rvalue in chunk.items():
                if rname in ["select", "schedselect", "place", "nodect"]:
                    continue

                if rname not in resources_for_scheduling:
                    if rname == "skipcyclesubhook":
                        continue
                    logging.warning(
                        "Ignoring resource %s as it was not defined in sched_config",
                        rname,
                    )
                    continue

                # add all actual resource requests here; non-resource requests,
                # like exclusive, should be ignored.
                # required for get_non_host_constraints
                job_resources[rname] = rvalue

                resource_def = resource_definitions.get(rname)

                # constraints are for the node/host
                # queue/scheduler level ones will be added using
                # > queue.get_non_host_constraints(job_resource)
                if not resource_def or not resource_def.is_host:
                    continue

                if rname not in working_constraint:
                    working_constraint[rname] = rvalue
                else:
                    # hit a conflict, so start a new working constraint
                    # so we maintain precedence
                    working_constraint = {rname: rvalue}
                    constraints.append(working_constraint)

            queue_constraints = queue.get_non_host_constraints(job_resources)
            constraints.extend(queue_constraints)

            job = Job(
                name=my_job_id,
                constraints=constraints,
                iterations=iterations,
                node_count=node_count,
                colocated=colocated,
                packing_strategy=pack,
            )
            job.iterations_remaining = remaining
            ret.append(job)

    return ret
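# Illustration only, not from the source: how a chunk from the schedselect
# overrides the top-level Resource_List values, per the comment in parse_jobs.
# For "Resource_List.ncpus=4, nodect=2, select=2:ncpus=2" the per-chunk value wins.
def _example_chunk_override() -> None:
    rdict = {"ncpus": 4, "nodect": 2}  # top-level Resource_List
    chunk_base = {"ncpus": 2}          # one chunk of "2:ncpus=2"
    chunk: Dict[str, Any] = {}
    chunk.update(rdict)
    chunk.update(chunk_base)           # chunk values take precedence
    assert chunk["ncpus"] == 2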
def _initialize(self, command: str, config: Dict) -> None:
    resource_definitions = read_resource_definitions(self.pbscmd, config)
    set_pbspro_parser(PBSProParser(resource_definitions))
    self.pbscmd = PBSCMD(get_pbspro_parser())
def test_parse_scheduler_node() -> None:
    actual = parse_scheduler_node(
        {
            "name": "tux",
            "resources_available.ncpus": 4,
            "resources_available.group_id": "pg0",
            "resources_available.infiniband": True,
            "resources_assigned.ncpus": 3,
            "resources_assigned.group_id": "pg0",
            "resources_assigned.infiniband": True,
        },
        get_pbspro_parser().resource_definitions,
    )

    expected = SchedulerNode("tux", {"ncpus": 4, "group_id": "pg0", "infiniband": True})
    expected.available["ncpus"] = 1

    assert expected.hostname == actual.hostname
    assert expected.resources == actual.resources
    assert expected.available == actual.available

    # True: down for longer than 5 minutes
    actual = parse_scheduler_node(
        {
            "name": "tux",
            "resources_available.ncpus": 4,
            "state": "down",
            "last_state_change_time": "'Mon Jan 1 12:34:56 2001'",
        },
        get_pbspro_parser().resource_definitions,
    )
    assert actual.marked_for_deletion

    # True: down and offline for longer than 5 minutes
    actual = parse_scheduler_node(
        {
            "name": "tux",
            "resources_available.ncpus": 4,
            "state": "down,offline",
            "last_state_change_time": "'Mon Jan 1 12:34:56 2001'",
        },
        get_pbspro_parser().resource_definitions,
    )
    assert actual.marked_for_deletion

    # True: down and offline for less than 5 minutes
    actual = parse_scheduler_node(
        {
            "name": "tux",
            "resources_available.ncpus": 4,
            "state": "down,offline",
            "last_state_change_time": time.ctime(),
        },
        get_pbspro_parser().resource_definitions,
    )
    assert actual.marked_for_deletion

    # True: down for less than 5 minutes
    actual = parse_scheduler_node(
        {
            "name": "tux",
            "resources_available.ncpus": 4,
            "state": "down",
            "last_state_change_time": time.ctime(),
        },
        get_pbspro_parser().resource_definitions,
    )
    assert actual.marked_for_deletion