def main(nodelist, job_id, force=False):
    """main called when run as script"""
    log.debug(f"main {nodelist} {job_id}")
    # nodes are split between normal and exclusive
    # exclusive nodes are handled by PrologSlurmctld
    nodes = expand_nodelist(nodelist)

    if force:
        exclusive = normal = nodes
        prelog = "force "
    else:
        normal, exclusive = separate(is_exclusive_node, nodes)
        prelog = ""

    if job_id is None or force:
        if normal:
            hostlist = util.to_hostlist(normal)
            log.info(f"{prelog}resume {hostlist}")
            resume_nodes(normal)
    else:
        if exclusive:
            hostlist = util.to_hostlist(exclusive)
            log.info(f"{prelog}exclusive resume {hostlist} {job_id}")
            prolog_resume_nodes(job_id, exclusive)
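# Illustrative sketch (not part of the original module): `separate` is the
# partition helper used throughout these scripts. Judging from its call sites
# (normal/exclusive, invalid/valid, successful/failed), it returns the
# non-matching items first and the matching items second. The real
# util.separate may be implemented differently; the name below is suffixed
# to avoid shadowing it.
def separate_sketch(pred, coll):
    """Split coll into ([x for which pred is falsy], [x for which pred is truthy])."""
    no, yes = [], []
    for el in coll:
        (yes if pred(el) else no).append(el)
    return no, yes

# e.g. separate_sketch(str.isupper, ["a", "B", "c"]) -> (["a", "c"], ["B"])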
def main(nodelist, job_id):
    """main called when run as script"""
    log.debug(f"main {nodelist} {job_id}")
    nodes = util.to_hostnames(nodelist)

    if job_id is not None:
        _, exclusive = separate(is_exclusive_node, nodes)
        if exclusive:
            hostlist = util.to_hostlist(exclusive)
            log.info(f"epilog suspend {hostlist} job_id={job_id}")
            epilog_suspend_nodes(exclusive, job_id)
    else:
        # suspend is allowed to delete exclusive nodes
        log.info(f"suspend {nodelist}")
        suspend_nodes(nodes)
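# Hedged sketch of how this suspend entry point is typically wired up when
# the script runs as Slurm's SuspendProgram. The argument parsing shown here
# is illustrative; the real script's CLI may differ.
if __name__ == "__main__":
    import argparse
    import os

    parser = argparse.ArgumentParser(description="suspend nodes")
    parser.add_argument("nodelist", help="nodes to suspend, in Slurm hostlist form")
    args = parser.parse_args()
    # SLURM_JOB_ID is assumed to be set only when invoked from
    # EpilogSlurmctld; plain SuspendProgram calls leave it unset.
    main(args.nodelist, os.environ.get("SLURM_JOB_ID"))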
def delete_instances(instances):
    """delete the given instances, batching the delete requests"""
    if len(instances) == 0:
        return
    invalid, valid = separate(lambda inst: bool(lkp.instance(inst)), instances)
    if invalid:
        log.debug("instances do not exist: {}".format(",".join(invalid)))

    if lkp.cfg.enable_reconfigure:
        count = len(valid)
        hostlist = util.to_hostlist(valid)
        log.info("delete {} subscriptions ({})".format(count, hostlist))
        execute_with_futures(subscription_delete, valid)

    requests = {inst: delete_instance_request(inst) for inst in valid}
    done, failed = batch_execute(requests)
    if failed:
        failed_nodes = [f"{n}: {e}" for n, (_, e) in failed.items()]
        node_str = "\n".join(str(el) for el in truncate_iter(failed_nodes, 5))
        log.error(f"some nodes failed to delete: {node_str}")
    wait_for_operations(done.values())
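# Hedged sketch of the truncate_iter helper used above to cap the failure
# log at five entries. The real util.truncate_iter may differ, e.g. in its
# elision marker.
def truncate_iter_sketch(iterable, max_count, elide="..."):
    """Yield at most max_count items, then a single elision marker."""
    for i, item in enumerate(iterable):
        if i >= max_count:
            yield elide
            return
        yield item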
def down_nodes(nodelist, reason):
    """set nodes down with reason"""
    if isinstance(nodelist, list):
        nodelist = util.to_hostlist(nodelist)
    run(f"{lkp.scontrol} update nodename={nodelist} state=down reason='{reason}'")
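# Usage note: down_nodes accepts either an expanded list of node names or an
# already-collapsed Slurm hostlist string (node names below are
# hypothetical), e.g.
#   down_nodes(["cluster-compute-0", "cluster-compute-1"], "insert failed")
#   down_nodes("cluster-compute-[0-1]", "insert failed")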
def resume_nodes(nodelist, placement_groups=None, exclusive=False):
    """resume nodes in nodelist"""

    def ident_key(n):
        # ident here will refer to the combination of partition and group
        return "-".join(
            (
                lkp.node_partition_name(n),
                lkp.node_group_name(n),
            )
        )

    # support already expanded list
    nodes = nodelist
    if isinstance(nodes, str):
        nodelist = expand_nodelist(nodelist)
    nodes = sorted(nodelist, key=ident_key)
    if len(nodes) == 0:
        return

    grouped_nodes = {
        # the chunk index is folded into the key so that multiple
        # BULK_INSERT_LIMIT-sized chunks of one partition-group do not
        # overwrite each other in the dict
        f"{ident}-{i}": chunk
        for ident, nodes in groupby(nodes, ident_key)
        for i, chunk in enumerate(chunked(nodes, n=BULK_INSERT_LIMIT))
    }
    log.debug(f"grouped_nodes: {grouped_nodes}")

    # make all bulkInsert requests and execute with batch
    inserts = {
        ident: create_instances_request(nodes, placement_groups, exclusive)
        for ident, nodes in grouped_nodes.items()
    }
    started, failed = batch_execute(inserts)
    if failed:
        failed_reqs = [f"{e}" for _, (_, e) in failed.items()]
        log.error("bulkInsert API failures: {}".format("\n".join(failed_reqs)))
        for ident, (_, exc) in failed.items():
            down_nodes(grouped_nodes[ident], exc._get_reason())

    # wait for all bulkInserts to complete and log any errors
    bulk_operations = [wait_for_operation(op) for op in started.values()]
    for bulk_op in bulk_operations:
        if "error" in bulk_op:
            error = bulk_op["error"]["errors"][0]
            log.error(
                f"bulkInsert operation error: {error['code']} operationName:'{bulk_op['name']}'"
            )

    # Fetch all insert operations from all bulkInserts. Group by error code and log
    successful_inserts, failed_inserts = separate(
        lambda op: "error" in op, get_insert_operations(bulk_operations)
    )

    # Apparently multiple errors are possible... so join with +.
    # grouped_inserts could be made into a dict, but it's not really necessary. Save some memory.
    grouped_inserts = util.groupby_unsorted(
        failed_inserts,
        lambda op: "+".join(err["code"] for err in op["error"]["errors"]),
    )
    for code, failed_ops in grouped_inserts:
        # at least one insert failure; materialize the group first, since
        # groupby_unsorted yields one-shot generators that cannot be
        # consumed twice
        failed_ops = list(failed_ops)
        failed_nodes = [parse_self_link(op["targetLink"]).instance for op in failed_ops]
        hostlist = util.to_hostlist(failed_nodes)
        count = len(failed_nodes)
        log.error(
            f"{count} instances failed to start due to insert operation error: {code} ({hostlist})"
        )
        down_nodes(hostlist, code)
        if log.isEnabledFor(logging.DEBUG):
            msg = "\n".join(
                err["message"] for err in failed_ops[0]["error"]["errors"]
            )
            log.debug(f"{code} message from first node: {msg}")

    # If reconfigure enabled, create subscriptions for successfully started instances
    if lkp.cfg.enable_reconfigure and len(successful_inserts):
        started_nodes = [
            parse_self_link(op["targetLink"]).instance for op in successful_inserts
        ]
        count = len(started_nodes)
        hostlist = util.to_hostlist(started_nodes)
        log.info("create {} subscriptions ({})".format(count, hostlist))
        execute_with_futures(subscription_create, started_nodes)
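# Hedged sketch of util.groupby_unsorted as used above: it groups items by
# key without requiring the input to be pre-sorted (unlike
# itertools.groupby). This sketch returns lists for simplicity; the real
# helper appears to yield one-shot generators, which is why resume_nodes
# materializes each group before indexing into it.
def groupby_unsorted_sketch(seq, key):
    from collections import defaultdict

    groups = defaultdict(list)
    for item in seq:
        groups[key(item)].append(item)
    return groups.items()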
def main(arg_nodes, arg_job_id):
    log.debug(f"deleting nodes:{arg_nodes} job_id:{arg_job_id}")
    compute = googleapiclient.discovery.build('compute', 'v1',
                                              cache_discovery=False)

    # Get node list
    nodes_str = util.run(f"{SCONTROL} show hostnames {arg_nodes}",
                         check=True, get_stdout=True).stdout
    node_list = nodes_str.splitlines()

    # Get static node list
    exc_nodes_hostlist = util.run(
        f"{SCONTROL} show config | "
        "awk '/SuspendExcNodes.*=/{print $3}'",
        shell=True, get_stdout=True).stdout
    nodes_exc_str = util.run(f"{SCONTROL} show hostnames {exc_nodes_hostlist}",
                             check=True, get_stdout=True).stdout
    node_exc_list = sorted(nodes_exc_str.splitlines(), key=util.get_pid)

    # Generate new arg_nodes without static nodes
    dynamic_nodes = [node for node in node_list if node not in node_exc_list]
    node_list = dynamic_nodes
    arg_nodes = util.to_hostlist(SCONTROL, dynamic_nodes)

    if len(node_list) == 0:
        log.debug("Static nodes removed from request. "
                  "No nodes remain in request.")
        return

    pid = util.get_pid(node_list[0])
    if arg_job_id and not cfg.instance_defs[pid].exclusive:
        # Don't delete from calls by EpilogSlurmctld
        return

    if arg_job_id:
        # Mark nodes as off limits to new jobs while powering down.
        # Note: If PrologSlurmctld fails with a non-zero exit code, the
        # "powering_up" flag would get stuck on the node. In 20.11 and prior,
        # state=down followed by state=power_down could clear it. In 21.08,
        # state=power_down_force can clear it.
        util.run(
            f"{SCONTROL} update node={arg_nodes} state=power_down_force")

    # retry_list and operations are presumably module-level globals
    # populated by delete_instances
    while True:
        delete_instances(compute, node_list, arg_job_id)
        if not len(retry_list):
            break

        log.debug("got {} nodes to retry ({})"
                  .format(len(retry_list), ','.join(retry_list)))
        node_list = list(retry_list)
        del retry_list[:]

    if arg_job_id:
        for operation in operations.values():
            try:
                util.wait_for_operation(compute, cfg.project, operation)
            except Exception:
                log.exception(f"error while waiting for operation {operation['name']}")
        # now that the instances are gone, resume to put back in service
        util.run(f"{SCONTROL} update node={arg_nodes} state=resume")

    log.debug("done deleting instances")

    if (arg_job_id and
            cfg.instance_defs[pid].enable_placement and
            cfg.instance_defs[pid].machine_type.split('-')[0] == "c2" and
            len(node_list) > 1):
        delete_placement_groups(compute, node_list, arg_job_id)

    log.info(f"done deleting nodes:{arg_nodes} job_id:{arg_job_id}")
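# Example of the static-node filtering above (hypothetical values): if
# `scontrol show config` reports a line such as
#   SuspendExcNodes         = cluster-compute-[0-2]
# the awk filter extracts "cluster-compute-[0-2]", which is expanded to
# individual hostnames and removed from the delete request, so only
# dynamic nodes are ever deleted.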