def setup_compute():
    """run compute node setup"""
    log.info("Setting up compute")
    util.chown_slurm(dirs.scripts / "config.yaml", mode=0o600)
    install_custom_scripts()

    setup_nss_slurm()
    setup_network_storage()

    # template = lkp.node_template_info(zone=lkp.zone)
    # if (not cfg.instance_defs[pid].image_hyperthreads and
    #         shutil.which('google_mpi_tuning')):
    #     run("google_mpi_tuning --nosmt")

    # Count NVIDIA devices via stdout. The pipeline's returncode is useless
    # here: `wc -l` always exits 0, so checking returncode would never detect
    # a GPU.
    gpu_count = int(
        run("lspci | grep --ignore-case 'NVIDIA' | wc -l", shell=True).stdout
    )
    if gpu_count > 0:
        run("nvidia-smi")

    run_custom_scripts()

    setup_slurmd_cronjob()
    run("systemctl restart munge", timeout=30)
    run("systemctl enable slurmd", timeout=30)
    run("systemctl restart slurmd", timeout=30)
    run("systemctl enable slurmeventd", timeout=30)
    run("systemctl restart slurmeventd", timeout=30)

    log.info("Check status of cluster services")
    run("systemctl status munge", timeout=30)
    run("systemctl status slurmd", timeout=30)
    run("systemctl status slurmeventd", timeout=30)

    log.info("Done setting up compute")
def gen_cloud_gres_conf(lkp=lkp):
    """generate cloud_gres.conf"""
    gpu_nodes = defaultdict(list)
    for part_name, partition in lkp.cfg.partitions.items():
        for node in partition.partition_nodes.values():
            template_info = lkp.template_info(node.instance_template)
            gpu_count = template_info.gpu_count
            if gpu_count == 0:
                continue
            gpu_nodes[gpu_count].extend(filter(None, nodeset_lists(node, part_name)))

    lines = [
        dict_to_conf(
            {
                "NodeName": names,
                "Name": "gpu",
                "File": "/dev/nvidia{}".format(f"[0-{i-1}]" if i > 1 else "0"),
            }
        )
        for i, names in gpu_nodes.items()
    ]
    lines.append("\n")

    content = FILE_PREAMBLE + "\n".join(lines)
    conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "cloud_gres.conf"
    conf_file_bak = conf_file.with_suffix(".conf.bak")
    if conf_file.is_file():
        shutil.copy2(conf_file, conf_file_bak)
    conf_file.write_text(content)
    util.chown_slurm(conf_file, mode=0o600)
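# For illustration (cluster and nodeset names below are hypothetical, not from
# this config): a nodeset whose template reports gpu_count=4 would yield a
# cloud_gres.conf entry along the lines of
#   NodeName=mycluster-debug-node-[0-9] Name=gpu File=/dev/nvidia[0-3]
# assuming dict_to_conf() renders each key=value pair on a single line.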
def fetch_devel_scripts():
    """download scripts from project metadata if they are present"""
    meta_json = project_metadata(f"{cfg.slurm_cluster_name}-slurm-devel")
    if not meta_json:
        return
    metadata_devel = json.loads(meta_json)

    meta_entries = [
        ("slurmeventd.py", "slurmeventd"),
        ("resume.py", "slurm-resume"),
        ("slurmsync.py", "slurmsync"),
        ("util.py", "util-script"),
        ("setup.py", "setup-script"),
        ("startup.sh", "startup-script"),
        ("load_bq.py", "loadbq"),
    ]

    for script, name in meta_entries:
        if name not in metadata_devel:
            log.debug(f"{name} not found in project metadata, not updating")
            continue
        log.info(f"updating {script} from metadata")
        content = metadata_devel[name]
        path = (dirs.scripts / script).resolve()
        # make sure parent dir exists
        path.parent.mkdirp()
        path.write_text(content)
        util.chown_slurm(path, mode=0o755)
def install_slurm_conf(lkp):
    """install slurm.conf"""
    if lkp.cfg.ompi_version:
        mpi_default = "pmi2"
    else:
        mpi_default = "none"

    conf_options = {
        "name": lkp.cfg.slurm_cluster_name,
        "control_host": lkp.control_host,
        "scripts": dirs.scripts,
        "slurmlog": dirs.log,
        "state_save": slurmdirs.state,
        "mpi_default": mpi_default,
    }
    conf_resp = project_metadata(f"{cfg.slurm_cluster_name}-slurm-tpl-slurm-conf")
    conf = conf_resp.format(**conf_options)

    conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "slurm.conf"
    conf_file_bak = conf_file.with_suffix(".conf.bak")
    if conf_file.is_file():
        shutil.copy2(conf_file, conf_file_bak)
    conf_file.write_text(conf)
    util.chown_slurm(conf_file, mode=0o644)
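# The slurm.conf template fetched from metadata is rendered with str.format(),
# so it is expected to contain placeholders matching the conf_options keys:
# {name}, {control_host}, {scripts}, {slurmlog}, {state_save}, {mpi_default}.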
def setup_jwt_key():
    jwt_key = Path(slurmdirs.state / "jwt_hs256.key")
    if jwt_key.exists():
        log.info("JWT key already exists. Skipping key generation.")
    else:
        run("dd if=/dev/urandom bs=32 count=1 > " + str(jwt_key), shell=True)
    util.chown_slurm(jwt_key, mode=0o400)
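# A pure-Python equivalent of the dd invocation above (a sketch only; this
# module shells out instead) would be:
#
#   import secrets
#   jwt_key.write_bytes(secrets.token_bytes(32))  # 32 random bytes, like bs=32 count=1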
def gen_cloud_conf(lkp=lkp, cloud_parameters=None):
    """generate cloud.conf"""
    content = make_cloud_conf(lkp, cloud_parameters=cloud_parameters)

    conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "cloud.conf"
    conf_file_bak = conf_file.with_suffix(".conf.bak")
    if conf_file.is_file():
        shutil.copy2(conf_file, conf_file_bak)
    conf_file.write_text(content)
    util.chown_slurm(conf_file, mode=0o644)
def install_cgroup_conf():
    """install cgroup.conf"""
    conf = project_metadata(f"{cfg.slurm_cluster_name}-slurm-tpl-cgroup-conf")

    conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "cgroup.conf"
    conf_file_bak = conf_file.with_suffix(".conf.bak")
    if conf_file.is_file():
        shutil.copy2(conf_file, conf_file_bak)
    conf_file.write_text(conf)
    util.chown_slurm(conf_file, mode=0o600)
def configure_dirs():
    for p in dirs.values():
        p.mkdirp()
    util.chown_slurm(dirs.slurm)
    util.chown_slurm(dirs.scripts)
    for p in slurmdirs.values():
        p.mkdirp()
        util.chown_slurm(p)

    etc_slurm = Path("/etc/slurm")
    if etc_slurm.exists() and etc_slurm.is_symlink():
        etc_slurm.unlink()
    etc_slurm.symlink_to(slurmdirs.etc)

    scripts_etc = dirs.scripts / "etc"
    if scripts_etc.exists() and scripts_etc.is_symlink():
        scripts_etc.unlink()
    scripts_etc.symlink_to(slurmdirs.etc)

    scripts_log = dirs.scripts / "log"
    if scripts_log.exists() and scripts_log.is_symlink():
        scripts_log.unlink()
    scripts_log.symlink_to(dirs.log)
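# Resulting symlink layout (targets shown symbolically):
#   /etc/slurm        -> slurmdirs.etc
#   dirs.scripts/etc  -> slurmdirs.etc
#   dirs.scripts/log  -> dirs.log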
def install_slurmdbd_conf(lkp):
    """install slurmdbd.conf"""
    conf_options = NSDict(
        {
            "control_host": lkp.control_host,
            "slurmlog": dirs.log,
            "state_save": slurmdirs.state,
            "db_name": "slurm_acct_db",
            "db_user": "slurm",
            "db_pass": '""',
            "db_host": "localhost",
            "db_port": "3306",
        }
    )
    if lkp.cfg.cloudsql:
        secret_name = f"{cfg.slurm_cluster_name}-slurm-secret-cloudsql"
        payload = json.loads(access_secret_version(util.project, secret_name))

        if payload["db_name"] and payload["db_name"] != "":
            conf_options.db_name = payload["db_name"]
        if payload["user"] and payload["user"] != "":
            conf_options.db_user = payload["user"]
        if payload["password"] and payload["password"] != "":
            conf_options.db_pass = payload["password"]

        db_host_str = payload["server_ip"].split(":")
        if db_host_str[0] and db_host_str[0] != "":
            conf_options.db_host = db_host_str[0]
            conf_options.db_port = db_host_str[1] if len(db_host_str) >= 2 else "3306"

    conf_resp = project_metadata(f"{cfg.slurm_cluster_name}-slurm-tpl-slurmdbd-conf")
    conf = conf_resp.format(**conf_options)

    conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "slurmdbd.conf"
    conf_file_bak = conf_file.with_suffix(".conf.bak")
    if conf_file.is_file():
        shutil.copy2(conf_file, conf_file_bak)
    conf_file.write_text(conf)
    util.chown_slurm(conf_file, mode=0o600)
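# The cloudsql secret payload is expected to be JSON of roughly this shape
# (values illustrative only):
#   {"db_name": "slurm_acct_db", "user": "slurm", "password": "...",
#    "server_ip": "10.0.0.5:3306"}
# server_ip may omit the ":port" suffix, in which case 3306 is used.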
import logging
import re
import sys
from collections import namedtuple
from pathlib import Path
from time import sleep

from google.cloud import pubsub_v1

import setup
import util
from util import cfg, lkp, project
from util import config_root_logger, handle_exception, publish_message, run

filename = Path(__file__).name
logfile = (Path(cfg.slurm_log_dir if cfg else ".") / filename).with_suffix(".log")
util.chown_slurm(logfile, mode=0o600)
config_root_logger(filename, level="DEBUG", util_level="DEBUG", logfile=logfile)
log = logging.getLogger(filename)

project_id = project
subscription_id = lkp.hostname
subscriber = pubsub_v1.SubscriberClient()
subscription_path = subscriber.subscription_path(project_id, subscription_id)

StateTuple = namedtuple("StateTuple", "base,flags")
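# subscription_path() expands to the fully qualified Pub/Sub resource name,
# i.e. "projects/<project_id>/subscriptions/<hostname>".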
help="Force attempted creation of the nodelist, whether nodes are exclusive or not.", ) parser.add_argument( "--debug", "-d", dest="debug", action="store_true", help="Enable debugging output" ) if __name__ == "__main__": if "SLURM_JOB_NODELIST" in os.environ: argv = [ *sys.argv[1:], os.environ["SLURM_JOB_NODELIST"], os.environ["SLURM_JOB_ID"], ] args = parser.parse_args(argv) else: args = parser.parse_args() util.chown_slurm(LOGFILE, mode=0o600) if args.debug: util.config_root_logger( filename, level="DEBUG", util_level="DEBUG", logfile=LOGFILE ) else: util.config_root_logger( filename, level="INFO", util_level="ERROR", logfile=LOGFILE ) sys.excepthook = util.handle_exception main(args.nodelist, args.job_id, args.force)
def setup_controller():
    """Run controller setup"""
    log.info("Setting up controller")
    util.chown_slurm(dirs.scripts / "config.yaml", mode=0o600)
    install_custom_scripts()

    install_slurm_conf(lkp)
    install_slurmdbd_conf(lkp)

    gen_cloud_conf()
    gen_cloud_gres_conf()
    install_gres_conf()
    install_cgroup_conf()

    setup_jwt_key()
    setup_munge_key()

    if cfg.controller_secondary_disk:
        setup_secondary_disks()
    setup_network_storage()

    run_custom_scripts()

    if not cfg.cloudsql:
        configure_mysql()

    run("systemctl enable slurmdbd", timeout=30)
    run("systemctl restart slurmdbd", timeout=30)

    # Wait for slurmdbd to come up
    time.sleep(5)

    sacctmgr = f"{slurmdirs.prefix}/bin/sacctmgr -i"
    result = run(
        f"{sacctmgr} add cluster {cfg.slurm_cluster_name}", timeout=30, check=False
    )
    if "already exists" in result.stdout:
        log.info(result.stdout)
    elif result.returncode > 1:
        result.check_returncode()  # will raise error

    run("systemctl enable slurmctld", timeout=30)
    run("systemctl restart slurmctld", timeout=30)
    run("systemctl enable slurmrestd", timeout=30)
    run("systemctl restart slurmrestd", timeout=30)

    # Export at the end to signal that everything is up
    run("systemctl enable nfs-server", timeout=30)
    run("systemctl start nfs-server", timeout=30)
    run("systemctl enable slurmeventd", timeout=30)
    run("systemctl restart slurmeventd", timeout=30)

    setup_nfs_exports()
    setup_sync_cronjob()

    log.info("Check status of cluster services")
    run("systemctl status munge", timeout=30)
    run("systemctl status slurmdbd", timeout=30)
    run("systemctl status slurmctld", timeout=30)
    run("systemctl status slurmrestd", timeout=30)
    run("systemctl status slurmeventd", timeout=30)

    slurmsync.sync_slurm()
    run("systemctl enable slurm_load_bq.timer", timeout=30)
    run("systemctl start slurm_load_bq.timer", timeout=30)
    run("systemctl status slurm_load_bq.timer", timeout=30)

    log.info("Done setting up controller")
def install_custom_scripts(clean=False):
    """download custom scripts from project metadata"""
    script_pattern = re.compile(
        rf"{cfg.slurm_cluster_name}-slurm-(?P<path>\S+)-script-(?P<name>\S+)"
    )
    metadata_keys = project_metadata("/").splitlines()

    def match_name(meta_key):
        m = script_pattern.match(meta_key)
        if not m:
            # key does not match, skip
            return None
        # returned path is `partition.d/<part_name>/<name>`
        # or `<controller/compute>.d/<name>`
        parts = m["path"].split("-")
        parts[0] += ".d"
        name, _, ext = m["name"].rpartition("_")
        name = ".".join((name, ext))
        return meta_key, Path(*parts, name)

    def filter_role(meta_entry):
        if not meta_entry:
            return False
        key, path = meta_entry
        # path is <role>.d/script.sh or partition.d/<part>/script.sh
        # role is <role> or 'partition', part is None or <part>
        role, part, *_ = chain(path.parent.parts, (None,))
        role = role[:-2]  # strip off added '.d'
        # login only needs their login scripts
        if lkp.instance_role == "login":
            suffix = instance_metadata("attributes/slurm_login_suffix")
            script_types = [f"login_{suffix}"]
            return role in script_types
        # compute needs compute, prolog, epilog, and the matching partition
        if lkp.instance_role == "compute":
            script_types = ["compute", "prolog", "epilog"]
            return role in script_types or (
                part and part == lkp.node_partition_name()
            )
        # controller downloads them all for good measure
        return True

    custom_scripts = list(filter(filter_role, map(match_name, metadata_keys)))
    log.info(
        "installing custom scripts: {}".format(
            ",".join(str(path) for key, path in custom_scripts)
        )
    )

    if clean:
        path = Path(dirs.custom_scripts)
        if path.exists() and path.is_dir():
            # rm -rf custom_scripts
            shutil.rmtree(path)

    dirs.custom_scripts.mkdirp()
    for key, path in custom_scripts:
        fullpath = (dirs.custom_scripts / path).resolve()
        fullpath.parent.mkdirp()
        for par in path.parents:
            util.chown_slurm(dirs.custom_scripts / par)
        log.debug(path)
        content = project_metadata(key)
        fullpath.write_text(content)
        util.chown_slurm(fullpath, mode=0o755)
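# For illustration, with slurm_cluster_name "mycluster" (hypothetical),
# metadata keys map to install paths like so:
#   mycluster-slurm-compute-script-foo_sh         -> compute.d/foo.sh
#   mycluster-slurm-partition-debug-script-bar_py -> partition.d/debug/bar.py
# The trailing "_<ext>" is folded back into ".<ext>" because GCE metadata keys
# cannot contain dots.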
def install_gres_conf():
    """install gres.conf as a symlink to the generated cloud_gres.conf"""
    conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "cloud_gres.conf"
    gres_conf = Path(lkp.cfg.output_dir or slurmdirs.etc) / "gres.conf"
    if not gres_conf.exists():
        gres_conf.symlink_to(conf_file)
    util.chown_slurm(gres_conf, mode=0o600)