Example #1
def read_schedulers(
    pbscmd: PBSCMD, resource_definitions: Dict[str, PBSProResourceDefinition]
) -> Dict[Hostname, PBSProScheduler]:
    parser = get_pbspro_parser()
    sched_dicts = pbscmd.qmgr_parsed("list", "sched")
    server_dicts = pbscmd.qmgr_parsed("list", "server")

    server_dicts_by_host = partition_single(server_dicts,
                                            lambda s: s["server_host"])

    ret: Dict[str, PBSProScheduler] = {}

    for sched_dict in sched_dicts:
        hostname = sched_dict["sched_host"]
        server_dict = server_dicts_by_host[hostname]

        for key, value in server_dict.items():
            if key not in sched_dict:
                sched_dict[key] = value

        # this is a scheduler, so it has no parent shared resources
        resource_state = parser.parse_resource_state(
            sched_dict, parent_shared_resources=None)
        scheduler = PBSProScheduler(sched_dict, resource_state)
        ret[scheduler.hostname] = scheduler

    return ret
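
A minimal usage sketch (assuming the parser has already been initialized with real resource definitions, as in Examples #3 and #9):

pbscmd = PBSCMD(get_pbspro_parser())
resource_definitions = get_pbspro_parser().resource_definitions
schedulers = read_schedulers(pbscmd, resource_definitions)
for hostname, scheduler in schedulers.items():
    # hostname is the short sched_host (see Example #2); the scheduler wraps
    # the merged qmgr attributes of the sched and its server
    print(hostname, scheduler.scheduling, scheduler.pbs_version)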
Example #2
    def __init__(
        self,
        sched_dict: Dict[str, str],
        resource_state: ResourceState,
    ) -> None:
        btype = BooleanType()
        self.do_not_span_psets = btype.parse(
            sched_dict.get("do_not_span_psets", "false"))
        self.scheduling = btype.parse(sched_dict["scheduling"])
        self.only_explicit_psets = btype.parse(
            sched_dict.get("only_explicit_psets", "false"))
        self.node_group_enable = btype.parse(
            sched_dict.get("node_group_enable", "false"))
        self.node_group_key = sched_dict.get("node_group_key")

        self.sched_log = sched_dict["sched_log"]
        self.sched_priv = sched_dict["sched_priv"]
        priv_config_path = os.path.join(self.sched_priv, "sched_config")
        parser = get_pbspro_parser()
        self.resources_for_scheduling = parser.parse_resources_from_sched_priv(
            priv_config_path)
        self.state = sched_dict["state"]
        self.hostname = sched_dict["sched_host"].split(".")[0]
        self.resource_state = resource_state

        try:
            self.pbs_version: Tuple = tuple(
                [int(x) for x in sched_dict["pbs_version"].split(".")])
        except ValueError:
            self.pbs_version = tuple(sched_dict["pbs_version"].split("."))
        self.sched_dict = sched_dict

        if not self.only_explicit_psets:
            logging.error(
                "only_explicit_psets must be set to true. You can change this by running:"
                + ' qmgr -c "set sched default only_explicit_psets = true"')
Example #3
 def __init__(self) -> None:
     clilib.CommonCLI.__init__(self, "pbspro")
     # bootstrap parser
     set_pbspro_parser(PBSProParser({}))
     self.pbscmd = PBSCMD(get_pbspro_parser())
     # lazily initialized
     self.__pbs_env: Optional[environment.PBSProEnvironment] = None
     self.__driver: Optional[PBSProDriver] = None
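
Together with Example #9, this is a two-phase bootstrap: an empty parser is installed first so that PBSCMD can already issue commands, then the real resource definitions are read and the parser is swapped out. A sketch of the combined flow (config stands for the autoscale configuration dict):

set_pbspro_parser(PBSProParser({}))    # phase 1: parser with no resource definitions
pbscmd = PBSCMD(get_pbspro_parser())   # enough to run qmgr/qstat
resource_definitions = read_resource_definitions(pbscmd, config)
set_pbspro_parser(PBSProParser(resource_definitions))  # phase 2: real definitions
pbscmd = PBSCMD(get_pbspro_parser())   # rebuilt against the real parser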
Example #4
 def __init__(
     self,
     config: Dict,
     pbscmd: Optional[PBSCMD] = None,
     resource_definitions: Optional[Dict[str,
                                         PBSProResourceDefinition]] = None,
     down_timeout: int = 300,
 ) -> None:
     super().__init__("pbspro")
     self.config = config
     self.pbscmd = pbscmd or PBSCMD(get_pbspro_parser())
     self.__queues: Optional[Dict[str, PBSProQueue]] = None
     self.__shared_resources: Optional[Dict[str, SharedResource]] = None
     self.__resource_definitions = resource_definitions
     self.__read_only_resources: Optional[Set[str]] = None
     self.__jobs_cache: Optional[List[Job]] = None
     self.__scheduler_nodes_cache: Optional[List[Node]] = None
     self.down_timeout = down_timeout
     self.down_timeout_td = datetime.timedelta(seconds=self.down_timeout)
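
Every argument except config is defaulted, so construction can be as simple as the sketch below (assuming this is PBSProDriver.__init__, the class Example #3 references; the config contents are illustrative):

driver = PBSProDriver(config={"pbspro": {"ignore_queues": []}})
# or with a 10-minute down timeout instead of the 300-second default:
driver = PBSProDriver(config={}, down_timeout=600)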
Example #5
def read_queues(
    config: Dict,
    pbscmd: PBSCMD,
    resource_definitions: Dict[str, PBSProResourceDefinition],
    scheduler_shared_resources: Dict[str, conslib.SharedResource],
) -> Dict[str, PBSProQueue]:
    parser = get_pbspro_parser()

    ret: Dict[str, PBSProQueue] = {}
    qnames = list_queue_names(pbscmd)
    queue_dicts = pbscmd.qmgr_parsed("list", "queue", ",".join(qnames))

    # Queue resources include totals such as ncpus (the total ncpus across the
    # queue). They are informational only and meaningless as a shared constraint.
    ignore_queues = config.get("pbspro", {}).get("ignore_queues", [])

    for qdict in queue_dicts:
        state_count = parser.parse_state_counts(qdict["state_count"])

        resource_state = parser.parse_resource_state(
            qdict, scheduler_shared_resources)

        queue = PBSProQueue(
            name=qdict["name"],
            queue_type=qdict["queue_type"],
            node_group_key=qdict.get("node_group_key"),
            node_group_enable=qdict.get("node_group_enable",
                                        "").lower() == "true",
            total_jobs=int(qdict["total_jobs"]),
            state_count=state_count,
            resource_state=resource_state,
            resources_default=parser.parse_resources_default(qdict),
            default_chunk=parser.parse_default_chunk(qdict),
            resource_definitions=resource_definitions,
            enabled=qdict["enabled"].lower() == "true"
            and qdict["name"] not in ignore_queues,
            started=qdict["started"].lower() == "true",
        )
        ret[queue.name] = queue

    return ret
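
The ignore_queues list is honored through the enabled flag: a queue named there is still parsed, but comes back with enabled=False, so its jobs are skipped in Example #8. An illustrative fragment ("debugq" is a hypothetical queue name; pbscmd, resource_definitions and scheduler_shared_resources are assumed to already be in hand):

config = {"pbspro": {"ignore_queues": ["debugq"]}}
queues = read_queues(config, pbscmd, resource_definitions, scheduler_shared_resources)
assert not queues["debugq"].enabled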
Example #6
def parse_scheduler_node(
    ndict: Dict[str, Any],
    resource_definitions: Dict[str,
                               PBSProResourceDefinition]) -> SchedulerNode:
    """
    Implementation of parsing a single scheduler node.
    """
    parser = get_pbspro_parser()

    hostname = ndict["name"]
    res_avail = parser.parse_resources_available(ndict, filter_is_host=True)
    res_assigned = parser.parse_resources_assigned(ndict, filter_is_host=True)

    node = SchedulerNode(hostname, res_avail)
    jobs_expr = ndict.get("jobs", "")

    state = ndict.get("state") or ""

    if state == "free" and jobs_expr.strip():
        state = "partially-free"

    node.metadata["pbs_state"] = state

    if "down" in state:
        node.marked_for_deletion = True

    node.metadata["last_state_change_time"] = ndict.get(
        "last_state_change_time", "")

    for tok in jobs_expr.split(","):
        tok = tok.strip()
        if not tok:
            continue
        job_id_full, sub_job_id = tok.rsplit("/", 1)
        sched_host = ""
        if "." in job_id_full:
            job_id, sched_host = job_id_full.split(".", 1)
        else:
            job_id = job_id_full

        node.assign(job_id)

        if "job_ids_long" not in node.metadata:
            node.metadata["job_ids_long"] = [job_id_full]
        elif job_id_full not in node.metadata["job_ids_long"]:
            node.metadata["job_ids_long"].append(job_id_full)

    for res_name, value in res_assigned.items():
        resource = resource_definitions.get(res_name)

        if not resource or not resource.is_host:
            continue

        if resource.is_consumable:
            if res_name in node.available:
                node.available[res_name] -= value
            else:
                logging.warning(
                    "%s was not defined under resources_available, but was " +
                    "defined under resources_assigned for %s. Setting available to assigned.",
                    res_name,
                    node,
                )
                node.available[res_name] = value

    if "exclusive" in node.metadata["pbs_state"]:
        node.closed = True

    return node
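
A worked example of the jobs parsing above, mirroring the shape of the test in Example #10 (the ids are hypothetical): each comma-separated token is <job id>/<sub job id>, the scheduler-host suffix is stripped before node.assign, and the full id is deduplicated into job_ids_long.

node = parse_scheduler_node(
    {
        "name": "tux",
        "resources_available.ncpus": 4,
        "state": "free",
        "jobs": "100.sched1/0, 100.sched1/1, 101.sched1/0",
    },
    get_pbspro_parser().resource_definitions,
)
assert node.metadata["pbs_state"] == "partially-free"  # free but has jobs
assert node.metadata["job_ids_long"] == ["100.sched1", "101.sched1"]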
Example #7
 def resource_definitions(self) -> Dict[str, PBSProResourceDefinition]:
     if not self.__resource_definitions:
         self.__resource_definitions = get_pbspro_parser().resource_definitions
     return self.__resource_definitions
Example #8
def parse_jobs(
    pbscmd: PBSCMD,
    resource_definitions: Dict[str, PBSProResourceDefinition],
    queues: Dict[str, PBSProQueue],
    resources_for_scheduling: Set[str],
) -> List[Job]:
    """
    Parses PBS qstat output and creates relevant hpc.autoscale.job.job.Job objects
    """
    parser = get_pbspro_parser()
    # alternate format triggered by
    # -a, -i, -G, -H, -M, -n, -r, -s, -T, or -u
    ret: List[Job] = []

    response: Dict = pbscmd.qstat_json("-f", "-t")

    for job_id, jdict in response.get("Jobs", {}).items():
        job_id = job_id.split(".")[0]

        job_state = jdict.get("job_state")
        if not job_state:
            logging.warning("No job_state defined for job %s. Skipping",
                            job_id)
            continue

        if job_state != PBSProJobStates.Queued:
            continue

        # ensure we don't autoscale jobs from disabled or non-started queues
        qname = jdict.get("queue")
        if not qname or qname not in queues:
            logging.warning("queue was not defined for job %s: ignoring",
                            job_id)
            continue

        queue: PBSProQueue = queues[qname]
        if not queue.enabled:
            logging.fine("Skipping job %s from disabled queue %s", job_id,
                         qname)
            continue

        if not queue.started:
            logging.fine("Skipping job %s from non-started queue %s", job_id,
                         qname)
            continue

        # handle array vs individual jobs
        if jdict.get("array"):
            iterations = parser.parse_range_size(
                jdict["array_indices_submitted"])
            remaining = parser.parse_range_size(
                jdict["array_indices_remaining"])
        elif "[" in job_id:
            continue
        else:
            iterations = 1
            remaining = 1

        res_list = jdict["Resource_List"]
        res_list["schedselect"] = jdict["schedselect"]
        rdict = parser.convert_resource_list(res_list)

        pack = (PackingStrategy.PACK if rdict["place"]["arrangement"]
                in ["free", "pack"] else PackingStrategy.SCATTER)

        # SMP style jobs
        is_smp = (rdict["place"].get("grouping") == "host"
                  or rdict["place"]["arrangement"] == "pack")

        # pack jobs do not need to define node_count
        node_count = int(rdict.get("nodect", "0"))

        smp_multiplier = 1

        if is_smp:
            smp_multiplier = max(1, iterations) * max(1, node_count)
            # for key, value in list(rdict.items()):
            #     if isinstance(value, (float, int)):
            #         value = value * smp_multiplier
            iterations = node_count = 1

        effective_node_count = max(node_count, 1)

        # htc jobs set ungrouped=true. see our default htcq
        colocated = (not is_smp and queue.uses_placement
                     and rdict.get("ungrouped", "false").lower() == "false")

        sharing = rdict["place"].get("sharing")

        for n, chunk_base in enumerate(rdict["schedselect"]):
            chunk: Dict[str, Any] = {}
            chunk.update(rdict)

            if "ncpus" not in chunk_base:
                chunk["ncpus"] = chunk["ncpus"] // effective_node_count

            if smp_multiplier > 1:
                for key, value in list(chunk_base.items()):
                    if isinstance(value, (int, float)):
                        chunk_base[key] = value * smp_multiplier
            # do this _after_ rdict, since the chunks
            # will override the top level resources
            # e.g. notice that ncpus=4. This will be the rdict value
            # but the chunks have ncpus=2
            # Resource_List.ncpus = 4
            # Resource_List.nodect = 2
            # Resource_List.select = 2:ncpus=2

            chunk.update(chunk_base)
            working_constraint: Dict[str, Any] = {}
            constraints = [working_constraint]

            if colocated:
                working_constraint["in-a-placement-group"] = True

            my_job_id = job_id
            if len(rdict["schedselect"]) > 1:
                if "." in job_id:
                    job_index, host = job_id.split(".", 1)
                    my_job_id = "{}+{}.{}".format(job_index, n, host)
                else:
                    my_job_id = "{}+{}".format(job_id, n)

            if sharing == "excl":
                working_constraint["exclusive-task"] = True
            elif sharing == "exclhost":
                working_constraint["exclusive"] = True

            job_resources = {}

            for rname, rvalue in chunk.items():
                if rname in ["select", "schedselect", "place", "nodect"]:
                    continue

                if rname not in resources_for_scheduling:
                    if rname == "skipcyclesubhook":
                        continue
                    logging.warning(
                        "Ignoring resource %s as it was not defined in sched_config",
                        rname,
                    )
                    continue

                # add all resource requests here. By that, I mean
                # non resource requests, like exclusive, should be ignored
                # required for get_non_host_constraints
                job_resources[rname] = rvalue

                resource_def = resource_definitions.get(rname)

                # constraints are for the node/host
                # queue/scheduler level ones will be added using
                # > queue.get_non_host_constraints(job_resource)
                if not resource_def or not resource_def.is_host:
                    continue

                if rname not in working_constraint:
                    working_constraint[rname] = rvalue
                else:
                    # hit a conflict, so start a new working cons
                    # so we maintain precedence
                    working_constraint = {rname: rvalue}
                    constraints.append(working_constraint)

            queue_constraints = queue.get_non_host_constraints(job_resources)
            constraints.extend(queue_constraints)

            job = Job(
                name=my_job_id,
                constraints=constraints,
                iterations=iterations,
                node_count=node_count,
                colocated=colocated,
                packing_strategy=pack,
            )
            job.iterations_remaining = remaining
            ret.append(job)

    return ret
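
To make the per-chunk job naming concrete: when schedselect parses into more than one chunk, each chunk becomes its own Job with a "+<chunk index>" suffix spliced in before the scheduler-host part of the id. A runnable sketch of that splitting with a hypothetical id:

job_id = "123.sched1"
job_index, host = job_id.split(".", 1)
names = ["{}+{}.{}".format(job_index, n, host) for n in range(2)]
assert names == ["123+0.sched1", "123+1.sched1"]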
Example #9
    def _initialize(self, command: str, config: Dict) -> None:
        resource_definitions = read_resource_definitions(self.pbscmd, config)
        set_pbspro_parser(PBSProParser(resource_definitions))
        self.pbscmd = PBSCMD(get_pbspro_parser())
Example #10
def test_parse_scheduler_node() -> None:
    actual = parse_scheduler_node(
        {
            "name": "tux",
            "resources_available.ncpus": 4,
            "resources_available.group_id": "pg0",
            "resources_available.infiniband": True,
            "resources_assigned.ncpus": 3,
            "resources_assigned.group_id": "pg0",
            "resources_assigned.infiniband": True,
        },
        get_pbspro_parser().resource_definitions,
    )

    expected = SchedulerNode("tux", {
        "ncpus": 4,
        "group_id": "pg0",
        "infiniband": True
    })
    expected.available["ncpus"] = 1

    assert expected.hostname == actual.hostname
    assert expected.resources == actual.resources
    assert expected.available == actual.available
    # True: down for longer than 5 minutes
    actual = parse_scheduler_node(
        {
            "name": "tux",
            "resources_available.ncpus": 4,
            "state": "down",
            "last_state_change_time": "'Mon Jan 1 12:34:56 2001'",
        },
        get_pbspro_parser().resource_definitions,
    )

    assert actual.marked_for_deletion

    # True: down and offline for longer than 5 minutes
    actual = parse_scheduler_node(
        {
            "name": "tux",
            "resources_available.ncpus": 4,
            "state": "down,offline",
            "last_state_change_time": "'Mon Jan 1 12:34:56 2001'",
        },
        get_pbspro_parser().resource_definitions,
    )

    assert actual.marked_for_deletion

    # True: down and offline for less than 5 minutes
    actual = parse_scheduler_node(
        {
            "name": "tux",
            "resources_available.ncpus": 4,
            "state": "down,offline",
            "last_state_change_time": time.ctime(),
        },
        get_pbspro_parser().resource_definitions,
    )

    assert actual.marked_for_deletion

    # True: down for less than 5 minutes
    actual = parse_scheduler_node(
        {
            "name": "tux",
            "resources_available.ncpus": 4,
            "state": "down",
            "last_state_change_time": time.ctime(),
        },
        get_pbspro_parser().resource_definitions,
    )

    assert actual.marked_for_deletion