Code example #1
def main(nodelist, job_id, force=False):
    """main called when run as script"""
    log.debug(f"main {nodelist} {job_id}")
    # nodes are split between normal and exclusive
    # exclusive nodes are handled by PrologSlurmctld
    nodes = expand_nodelist(nodelist)
    if force:
        exclusive = normal = nodes
        prelog = "force "
    else:
        normal, exclusive = separate(is_exclusive_node, nodes)
        prelog = ""
    if job_id is None or force:
        if normal:
            hostlist = util.to_hostlist(normal)
            log.info(f"{prelog}resume {hostlist}")
            resume_nodes(normal)
    else:
        if exclusive:
            hostlist = util.to_hostlist(exclusive)
            log.info(f"{prelog}exclusive resume {hostlist} {job_id}")
            prolog_resume_nodes(job_id, exclusive)
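The helper separate is not defined in any of these excerpts. Every call site unpacks its result the same way (normal, exclusive above, invalid, valid in example #3, successful_inserts, failed_inserts in example #5), so it evidently partitions an iterable by a predicate and returns the pred-False items first and the pred-True items second. A minimal sketch consistent with that usage, not the real implementation:

def separate(pred, coll):
    """Split coll into (items where pred is False, items where pred is True).

    Sketch inferred from the call sites in these examples; the real helper may
    differ internally, but the (false_items, true_items) return order is what
    the callers rely on.
    """
    false_items, true_items = [], []
    for el in coll:
        (true_items if pred(el) else false_items).append(el)
    return false_items, true_items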
Code example #2
def main(nodelist, job_id):
    """main called when run as script"""
    log.debug(f"main {nodelist} {job_id}")
    nodes = util.to_hostnames(nodelist)
    if job_id is not None:
        _, exclusive = separate(is_exclusive_node, nodes)
        if exclusive:
            hostlist = util.to_hostlist(exclusive)
            log.info(f"epilog suspend {hostlist} job_id={job_id}")
            epilog_suspend_nodes(exclusive, job_id)
    else:
        # suspend is allowed to delete exclusive nodes
        log.info(f"suspend {nodelist}")
        suspend_nodes(nodes)
Code example #3
def delete_instances(instances):
    """Call regionInstances.bulkInsert to create instances"""
    if len(instances) == 0:
        return
    invalid, valid = separate(lambda inst: bool(lkp.instance(inst)), instances)
    log.debug("instances do not exist: {}".format(",".join(invalid)))

    if lkp.cfg.enable_reconfigure:
        count = len(valid)
        hostlist = util.to_hostlist(valid)
        log.info("delete {} subscriptions ({})".format(count, hostlist))
        execute_with_futures(subscription_delete, valid)

    requests = {inst: delete_instance_request(inst) for inst in valid}
    done, failed = batch_execute(requests)
    if failed:
        failed_nodes = [f"{n}: {e}" for n, (_, e) in failed.items()]
        node_str = "\n".join(str(el) for el in truncate_iter(failed_nodes, 5))
        log.error(f"some nodes failed to delete: {node_str}")
    wait_for_operations(done.values())
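batch_execute is likewise external to these excerpts. Both this example and example #5 unpack it as done, failed = batch_execute(requests) and read failed.items() as name -> (request, exception) pairs, while done maps each name to the returned operation. A sketch of that assumed contract (executed sequentially here purely for illustration; the real helper presumably batches the API calls):

def batch_execute_sketch(requests):
    """Assumed contract: requests maps a name to a googleapiclient-style request
    object exposing .execute(); returns (done, failed) where done[name] is the
    response/operation and failed[name] is a (request, exception) pair."""
    done, failed = {}, {}
    for name, req in requests.items():
        try:
            done[name] = req.execute()
        except Exception as e:
            failed[name] = (req, e)
    return done, failed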
Code example #4
def down_nodes(nodelist, reason):
    """set nodes down with reason"""
    if isinstance(nodelist, list):
        nodelist = util.to_hostlist(nodelist)
    run(f"{lkp.scontrol} update nodename={nodelist} state=down reason='{reason}'")
Code example #5
def resume_nodes(nodelist, placement_groups=None, exclusive=False):
    """resume nodes in nodelist"""

    def ident_key(n):
        # ident here will refer to the combination of partition and group
        return "-".join(
            (
                lkp.node_partition_name(n),
                lkp.node_group_name(n),
            )
        )

    # support already expanded list
    if isinstance(nodelist, str):
        nodelist = expand_nodelist(nodelist)

    nodes = sorted(nodelist, key=ident_key)
    if len(nodes) == 0:
        return
    # itertools.groupby only merges adjacent items, hence the sort above; key each
    # chunk by partition-group plus a chunk index so that a group larger than
    # BULK_INSERT_LIMIT does not have its earlier chunks overwritten in the dict
    grouped_nodes = {
        f"{ident}-{i}": chunk
        for ident, group in groupby(nodes, ident_key)
        for i, chunk in enumerate(chunked(group, n=BULK_INSERT_LIMIT))
    }
    log.debug(f"grouped_nodes: {grouped_nodes}")

    # make all bulkInsert requests and execute with batch
    inserts = {
        ident: create_instances_request(nodes, placement_groups, exclusive)
        for ident, nodes in grouped_nodes.items()
    }
    started, failed = batch_execute(inserts)
    if failed:
        failed_reqs = [f"{e}" for _, (_, e) in failed.items()]
        log.error("bulkInsert API failures: {}".format("\n".join(failed_reqs)))
        for ident, (_, exc) in failed.items():
            down_nodes(grouped_nodes[ident], exc._get_reason())

    # wait for all bulkInserts to complete and log any errors
    bulk_operations = [wait_for_operation(op) for op in started.values()]
    for bulk_op in bulk_operations:
        if "error" in bulk_op:
            error = bulk_op["error"]["errors"][0]
            log.error(
                f"bulkInsert operation error: {error['code']} operationName:'{bulk_op['name']}'"
            )

    # Fetch all insert operations from all bulkInserts. Group by error code and log
    successful_inserts, failed_inserts = separate(
        lambda op: "error" in op, get_insert_operations(bulk_operations)
    )
    # Apparently multiple errors are possible... so join with +.
    # grouped_inserts could be made into a dict, but it's not really necessary. Save some memory.
    grouped_inserts = util.groupby_unsorted(
        failed_inserts,
        lambda op: "+".join(err["code"] for err in op["error"]["errors"]),
    )
    for code, failed_ops in grouped_inserts:
        # at least one insert failure; materialize the group so it can be read twice
        # (groupby_unsorted may hand back a one-shot iterator, and calling next()
        # after the list comprehension below would otherwise fail)
        failed_ops = list(failed_ops)
        failed_nodes = [parse_self_link(op["targetLink"]).instance for op in failed_ops]
        hostlist = util.to_hostlist(failed_nodes)
        count = len(failed_nodes)
        log.error(
            f"{count} instances failed to start due to insert operation error: {code} ({hostlist})"
        )
        down_nodes(hostlist, code)
        if log.isEnabledFor(logging.DEBUG):
            msg = "\n".join(
                err["message"] for err in failed_ops[0]["error"]["errors"]
            )
            log.debug(f"{code} message from first node: {msg}")

    # If reconfigure enabled, create subscriptions for successfully started instances
    if lkp.cfg.enable_reconfigure and len(successful_inserts):
        started_nodes = [
            parse_self_link(op["targetLink"]).instance for op in successful_inserts
        ]
        count = len(started_nodes)
        hostlist = util.to_hostlist(started_nodes)
        log.info("create {} subscriptions ({})".format(count, hostlist))
        execute_with_futures(subscription_create, started_nodes)
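Two details of the grouping in resume_nodes are easy to trip over: itertools.groupby only merges adjacent items (which is why the list is sorted by the same key first), and a group larger than BULK_INSERT_LIMIT is split into several chunks that need distinct dict keys. A toy illustration with made-up node names and a stand-in for the chunked helper:

from itertools import groupby, islice

def chunked_sketch(iterable, n):
    """Yield successive lists of at most n items (stand-in for the chunked helper)."""
    it = iter(iterable)
    while True:
        piece = list(islice(it, n))
        if not piece:
            return
        yield piece

def toy_key(name):
    # "partition-group" prefix of a node name like "part1-grp1-0"
    return name.rsplit("-", 1)[0]

toy_nodes = sorted(["part1-grp1-0", "part1-grp1-1", "part1-grp1-2", "part2-grp1-0"], key=toy_key)
groups = {
    f"{ident}-{i}": chunk
    for ident, grp in groupby(toy_nodes, toy_key)
    for i, chunk in enumerate(chunked_sketch(grp, 2))
}
# groups == {'part1-grp1-0': ['part1-grp1-0', 'part1-grp1-1'],
#            'part1-grp1-1': ['part1-grp1-2'],
#            'part2-grp1-0': ['part2-grp1-0']}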
Code example #6
def main(arg_nodes, arg_job_id):
    log.debug(f"deleting nodes:{arg_nodes} job_id:{job_id}")
    compute = googleapiclient.discovery.build('compute', 'v1',
                                              cache_discovery=False)

    # Get node list
    nodes_str = util.run(f"{SCONTROL} show hostnames {arg_nodes}",
                         check=True, get_stdout=True).stdout
    node_list = nodes_str.splitlines()

    # Get static node list
    exc_nodes_hostlist = util.run(
        f"{SCONTROL} show config | "
        "awk '/SuspendExcNodes.*=/{print $3}'", shell=True,
        get_stdout=True).stdout
    nodes_exc_str = util.run(f"{SCONTROL} show hostnames {exc_nodes_hostlist}",
                             check=True, get_stdout=True).stdout
    node_exc_list = sorted(nodes_exc_str.splitlines(), key=util.get_pid)

    # Generate new arg_nodes without the static (SuspendExcNodes) nodes
    dynamic_nodes = list(set(node_list) - set(node_exc_list))
    node_list = dynamic_nodes
    arg_nodes = util.to_hostlist(SCONTROL, dynamic_nodes)

    if len(node_list) == 0:
        log.debug(f"Static nodes removed from request. No nodes remain in request.")
        return

    pid = util.get_pid(node_list[0])
    if (arg_job_id and not cfg.instance_defs[pid].exclusive):
        # Don't delete from calls by EpilogSlurmctld
        return

    if arg_job_id:
        # Mark nodes as off limits to new jobs while powering down.
        # Note: If PrologSlurmctld fails with a non-zero exit code,
        # "powering_up" flag would get stuck on the node. In 20.11 and prior:
        # state=down followed by state=power_down could clear it. In 21.08,
        # state=power_down_force can clear it.
        util.run(
            f"{SCONTROL} update node={arg_nodes} state=power_down_force")

    # retry_list and operations (used below) appear to be module-level state that
    # delete_instances() populates; they are not defined in this excerpt
    while True:
        delete_instances(compute, node_list, arg_job_id)
        if not len(retry_list):
            break

        log.debug("got {} nodes to retry ({})"
                  .format(len(retry_list), ','.join(retry_list)))
        node_list = list(retry_list)
        del retry_list[:]

    if arg_job_id:
        for operation in operations.values():
            try:
                util.wait_for_operation(compute, cfg.project, operation)
            except Exception:
                log.exception(f"Error in deleting {operation['name']} to slurm")
        # now that the instances are gone, resume to put back in service
        util.run(f"{SCONTROL} update node={arg_nodes} state=resume")

    log.debug("done deleting instances")

    if (arg_job_id and
            cfg.instance_defs[pid].enable_placement and
            cfg.instance_defs[pid].machine_type.split('-')[0] == "c2" and
            len(node_list) > 1):
        delete_placement_groups(compute, node_list, arg_job_id)

    log.info(f"done deleting nodes:{arg_nodes} job_id:{job_id}")