def get_home_dir(user_name, is_admin=False):
    """Build the home directory path of a user below the cluster NAS path.

    :param user_name: user name substituted into ``HOME_FMT``; not used when
        ``is_admin`` is true
    :param is_admin: when true the path is built from ``ADMIN_HOME_FMT``
    :return: the formatted home directory path
    """
    # Leading and trailing slashes are removed from the configured NAS path
    # before it is substituted into the format string.
    nas_root = get_cluster_info()["nas_path"].strip("/")
    if not is_admin:
        return HOME_FMT.format(nas_root, user_name)
    return ADMIN_HOME_FMT.format(nas_root)
def add(params):
    """Create a new LDAP user from the given parameters.

    The process exits instead of returning on any failure:

    * 40 - ``user_name`` or ``password`` is missing
    * 45 - the user already exists
    * 1  - any unexpected error while talking to LDAP

    :param params: dict carrying ``user_name`` and ``password``
    :return: None on success
    """
    user_name = params.get("user_name", None)
    password = params.get("password", None)
    if not (user_name and password):
        # Never write a clear-text password to the log: only report whether
        # a value was supplied at all.
        logger.error("Required params: user_name: [%s], password: [%s]",
                     user_name, "***" if password else password)
        sys.exit(40)

    ldap_client = None
    try:
        ldap_client = new_ldap_client()
        if ldap_client.user_exist(user_name):
            logger.error("The user[%s] already exist.", user_name)
            sys.exit(45)

        # NOTE: the new user is put into the group given by "admin_gid";
        # this function does not create that group.
        cluster_info = get_cluster_info()
        gid = cluster_info["admin_gid"]

        uid = ldap_client.generate_uid_number()
        home_dir = get_home_dir(user_name)
        ldap_client.create_user(user_name, uid, password, home_dir, gid)
        logger.info("create user[%s], done.", user_name)
    except Exception:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the exit(45) above is not swallowed here.
        logger.error("Failed to create user: [%s]", traceback.format_exc())
        sys.exit(1)
    finally:
        if ldap_client:
            ldap_client.close()
def main(argv):
    """Parse ``argv`` and dispatch the requested queue action.

    Returns without doing anything on nodes that are not the master
    controller. In every other case the process ends through ``sys.exit``:
    with the value returned by the action handler, or with 40 when the
    arguments cannot be parsed, the queue payload cannot be decoded, or the
    action is unknown.

    :param argv: command line arguments handed to ``ArgsParser.parse``
    """
    role = get_role()
    # Only one master controller node executes this command.
    cluster_info = get_cluster_info()
    on_master = (role == ROLE_CONTROLLER and
                 int(cluster_info["sid"]) == MASTER_CONTROLLER_SID)
    if not on_master:
        logger.info("queue action[%s] not handler on non-master [%s|%s].",
                    argv, role, cluster_info["sid"])
        return

    parser = ArgsParser()
    if not parser.parse(argv):
        sys.exit(40)

    if parser.action not in ACTION_MAP:
        logger.error("can not handle the action[%s].", parser.action)
        sys.exit(40)

    if parser.directive:
        queue = parser.directive["queue"]
        if isinstance(queue, str):
            # A string payload is decoded first; a None result is treated
            # as a decoding failure.
            queue = json_loads(queue)
            if queue is None:
                logger.error("Failed to load queue[%s] to json!",
                             parser.directive)
                sys.exit(40)
        exit_code = ACTION_MAP[parser.action](queue)
    else:
        exit_code = ACTION_MAP[parser.action]()
    sys.exit(exit_code)
def delete(params):
    """Delete an existing LDAP user.

    The process exits instead of returning on any failure:

    * 40 - ``user_name`` is missing
    * 46 - the name equals the cluster's configured ``user`` (admin user)
    * 44 - the user does not exist
    * 1  - any unexpected error while talking to LDAP

    :param params: dict carrying ``user_name``
    :return: None on success
    """
    user_name = params.get("user_name")
    if not user_name:
        logger.error("required params: user_name[%s].", user_name)
        sys.exit(40)

    admin_name = get_cluster_info()["user"]
    if user_name == admin_name:
        logger.error("The admin user[%s] can not be deleted.", user_name)
        sys.exit(46)

    client = None
    try:
        client = new_ldap_client()
        if client.user_exist(user_name):
            client.delete_user(user_name)
            logger.info("The user[%s] has been deleted.", user_name)
        else:
            logger.error("The user[%s] not exist.", user_name)
            sys.exit(44)
    except Exception:
        logger.error("Failed to delete user[%s]: [%s]",
                     user_name, traceback.format_exc())
        sys.exit(1)
    finally:
        # Release the connection whether or not the deletion succeeded.
        if client:
            client.close()
def main(argv):
    """Parse ``argv`` and dispatch the requested software action.

    Returns without doing anything on nodes that are not the master
    controller. Otherwise the process ends through ``sys.exit``: with the
    value returned by the action handler, with 40 for unparsable arguments,
    an undecodable software payload or an unknown action, and with 1 when an
    unexpected exception is raised.

    :param argv: command line arguments handed to ``ArgsParser.parse``
    """
    try:
        # Only one master controller node executes this command.
        role = get_role()
        cluster_info = get_cluster_info()
        on_master = (role == ROLE_CONTROLLER and
                     int(cluster_info["sid"]) == MASTER_CONTROLLER_SID)
        if not on_master:
            return

        parser = ArgsParser()
        if not parser.parse(argv):
            sys.exit(40)

        if parser.action not in ACTION_MAP:
            logger.error("can not handle the action[%s].", parser.action)
            sys.exit(40)

        if parser.directive:
            software = parser.directive["software"]
            if isinstance(software, str):
                # A string payload is decoded first; a None result is
                # treated as a decoding failure.
                software = json_loads(software)
                if software is None:
                    logger.error("Failed to load software[%s] to json!",
                                 parser.directive)
                    sys.exit(40)
            exit_code = ACTION_MAP[parser.action](software)
        else:
            exit_code = ACTION_MAP[parser.action]()
        sys.exit(exit_code)
    except Exception:
        # SystemExit is not an Exception subclass, so the sys.exit() calls
        # above pass through this handler untouched.
        logger.error("Failed to update software: [%s]",
                     traceback.format_exc())
        sys.exit(1)
def generate_conf():
    """Render the slurm configuration file from its template.

    The template placeholders are filled with the cluster name, the first
    and last line of ``RESOURCE_INFO_FILE`` and a compact node name
    expression such as ``node[1-6,8,12-15]`` derived from the cluster's
    ``sids``. An existing ``SLURM_CONF`` is backed up first; the result is
    written to a temporary file and then moved over ``SLURM_CONF``.
    """
    cluster_info = get_cluster_info()
    cls_name = cluster_info["cluster_name"]

    # The first line of resource.info describes the controller, the last
    # line the compute nodes; with a single line only the former is set.
    with open(RESOURCE_INFO_FILE, "r") as info:
        resource_lines = [entry for entry in info.readlines() if entry]
    ctl_resource = resource_lines[0] if resource_lines else ""
    cmp_resource = resource_lines[-1] if len(resource_lines) > 1 else ""

    # Collapse the sorted sids into ranges, eg: [1,2,3,5] -> "1-3", "5".
    sids = sorted(int(s) for s in cluster_info["sids"].split(",") if s)
    ranges = []
    range_start = sids[0]
    previous = range_start
    for current in sids:
        if current - previous > 1:
            # A gap closes the running range.
            if previous == range_start:
                ranges.append("{}".format(range_start))
            else:
                ranges.append("{}-{}".format(range_start, previous))
            range_start = current
        previous = current
    # Close the range that was still open at the end of the sids.
    if previous == range_start:
        ranges.append("{}".format(range_start))
    else:
        ranges.append("{}-{}".format(range_start, previous))
    node_name = "{}[{}]".format(COMPUTE_HOSTNAME_PREFIX, ",".join(ranges))

    # backup last slurm configuration
    if os_path.exists(SLURM_CONF):
        backup(BACKUP_SLURM_CONF_CMD)

    # Fill the template into a temporary file, then move it into place.
    substitutions = {
        "CLUSTER_NAME": cls_name,
        "CONTROLLER_RESOURCE": ctl_resource,
        "COMPUTE_RESOURCE": cmp_resource,
        "DEFAULT_NODE_NAME": node_name,
    }
    tmp_file = "{}/slurm.conf.tmp".format(APP_HOME)
    with open(tmp_file, "w") as conf, open(SLURM_CONF_TMPL, "r") as tmpl:
        for raw in tmpl.readlines():
            conf.write(raw.format(**substitutions) if raw else raw)
    run_shell("mv {} {}".format(tmp_file, SLURM_CONF))
def metadata_reload():
    """Re-apply the cluster user described in the cluster metadata.

    Reads ``user``, ``password``, ``uid`` and ``gid`` from the cluster info
    and passes them to ``add_user``.

    :return: 0
    """
    logger.info("cluster metadata reload...")
    info = get_cluster_info()
    # create user
    add_user(info.get("user"), info.get("password"),
             info.get("uid"), info.get("gid"))
    logger.info("setup done.")
    return 0
def setup():
    """Initialize the cluster by creating the user from the cluster metadata.

    Reads ``user``, ``password``, ``uid`` and ``gid`` from the cluster info
    and passes them to ``add_user``.

    :return: 0
    """
    logger.info("initialize cluster...")
    info = get_cluster_info()
    # create user
    add_user(info.get("user"), info.get("password"),
             info.get("uid"), info.get("gid"))
    logger.info("setup done.")
    return 0
def uninstall(software):
    """Uninstall every software entry from the shared software home.

    :param software: iterable of dicts, each with a ``name`` key and an
        optional ``uninstaller`` key
    :return: 0 when everything was uninstalled, 54 when any entry is not
        installed, otherwise the first non-zero result of ``_uninstall``
    """
    logger.info("uninstall software[%s]..", software)
    nas_mount_point = get_cluster_info()["nas_mount_point"]
    software_home = SOFTWARE_HOME_FMT.format(nas_mount_point)

    # Check all entries up front so nothing is removed when one is missing.
    for s in software:
        if not os.path.exists("{}/{}".format(software_home, s["name"])):
            logger.error("The software[%s] not exist!", s["name"])
            return 54

    for s in software:
        ret = _uninstall(s["name"], software_home, s.get("uninstaller"))
        # Compare by value: "is not 0" tests object identity, which only
        # works by accident of small-int caching.
        if ret != 0:
            return ret
    return 0
def add_admin_user():
    """Make sure the admin group and the admin user exist in LDAP.

    The group identified by ``admin_gid`` is created (named after
    ``admin_user``) when missing, then the admin user itself is created when
    missing. Exceptions raised by the LDAP client propagate to the caller;
    the connection is closed in every case.
    """
    ldap_client = new_ldap_client()
    try:
        cluster_info = get_cluster_info()
        gid = cluster_info["admin_gid"]
        if not ldap_client.group_exist(gid):
            gname = cluster_info["admin_user"]
            ldap_client.create_group(gname, gid)

        if not ldap_client.user_exist(cluster_info["admin_user"],
                                      cluster_info["admin_uid"]):
            home_dir = get_home_dir("", is_admin=True)
            ldap_client.create_user(cluster_info["admin_user"],
                                    cluster_info["admin_uid"],
                                    cluster_info["admin_password"],
                                    home_dir, gid)
    finally:
        # Close the connection even when a lookup or create call fails, so
        # the client is not leaked on the error path.
        ldap_client.close()
def _install(software, source, software_home, installer=None):
    """
    Download, unpack and install one software package.

    :param source: full url to download software, eg: root@xxx/aa/bb/cc.tar.gz
    :param software_home: the home that software would to be installed
    :param software: software name (install dir: software_home/software)
    :param installer: install script, relative to the unpacked directory;
        when omitted, ``install.sh`` of the package is used if present,
        otherwise the unpacked directory is copied into ``software_home``
    :return: 0 on success, 1 when any step raised an exception
    """
    logger.info("Do install software[%s] from source[%s]..", software, source)
    package = source.split("/")[-1]
    nas_mount_point = get_cluster_info()["nas_mount_point"]
    workdir = SOFTWARE_WORKDIR_FMT.format(nas_mount_point, software)
    package_path = "{}/{}".format(workdir, package)
    try:
        # download
        _download(source, workdir)

        # un-tar
        run_shell("tar -zxf {}".format(package_path), cwd=workdir, timeout=180)

        # The first directory found in the work dir is taken as the root of
        # the unpacked package.
        un_tar_dir = ""
        for entry in os.listdir(workdir):
            candidate = "{}/{}".format(workdir, entry)
            if os.path.isdir(candidate):
                un_tar_dir = candidate
                break

        # install
        if installer:
            install_cmd = "bash {}/{}".format(un_tar_dir, installer)
        else:
            # Test the script path itself. Testing the whole command string
            # ("bash <path>") never matches a real file, which made the
            # bundled install.sh unreachable.
            default_script = "{}/install.sh".format(un_tar_dir)
            if os.path.exists(default_script):
                install_cmd = "bash {}".format(default_script)
            else:
                install_cmd = "cp -nrf {} {}/".format(un_tar_dir, software_home)
        run_shell("export SOFTWARE_HOME={} && {}".format(software_home,
                                                         install_cmd),
                  timeout=120)

        # clean
        if os.path.exists(workdir):
            run_shell("rm -rf {}".format(workdir))
    except Exception:
        logger.error("Failed to install software[%s]: \n%s",
                     software, traceback.format_exc())
        return 1
    return 0
def get_ctl_machine():
    """Build the ``SlurmctldHost`` lines for all controllers.

    Each controller is read as a pair whose first item is appended to
    ``CONTROLLER_HOSTNAME_PREFIX`` and whose second item is put in
    parentheses, eg:

        SlurmctldHost=controller1(10.0.191.219)
        SlurmctldHost=controller2(10.0.3.169)

    The controller list of the cluster info is sorted in place by the first
    item of each pair.

    :return: the lines joined with newlines, without a trailing newline
    """
    controllers = get_cluster_info()["controllers"]
    controllers.sort(key=lambda item: item[0])

    host_fmt = "SlurmctldHost={}{}({})"
    # 1st one must be controller1
    primary = controllers[0]
    hosts = [host_fmt.format(CONTROLLER_HOSTNAME_PREFIX,
                             primary[0], primary[1])]
    # Every controller whose first item is greater than 1 gets its own line.
    hosts.extend(
        host_fmt.format(CONTROLLER_HOSTNAME_PREFIX, entry[0], entry[1])
        for entry in controllers if entry[0] > 1)
    return "\n".join(hosts)
def start():
    """Start the services of this node according to its role.

    Steps, in order: prepare the slurm directories on the NAS, generate the
    slurm configuration (controller) or remove the role's ``clear_files``
    (other roles), start the role's services, and on the master controller
    create the admin directories and admin user and install the software.

    :return: 1 for a role without services, the result of
        ``init_software()`` on the master controller, otherwise 0
    """
    role = get_role()
    nas_root = get_nas_mount_point()
    cluster_name = get_cluster_name()

    # mkdir /nas_mount_point/opt/slurm/state_save_loc for StateSaveLocation
    run_shell("mkdir -p {}/opt/slurm/state_save_loc/{}/".format(nas_root,
                                                                cluster_name))
    run_shell("ln -sf {}/opt/slurm/ /opt/slurm".format(nas_root))

    # start before
    if role != ROLE_CONTROLLER:
        for stale in clear_files[role]:
            if path.exists(stale):
                run_shell("rm {}".format(stale))
    else:
        logger.info("Generating slurm configurations...")
        generate_slurm_conf()

    # start service
    if role not in ROLE_SERVICES:
        logger.error("Un-support role[%s].", role)
        return 1
    for service in ROLE_SERVICES[role]:
        logger.info("Start service {}".format(service))
        run_shell("systemctl start {}".format(service))

    # start post
    cluster_info = get_cluster_info()
    nas_root = get_nas_mount_point()
    is_master = (role == ROLE_CONTROLLER and
                 int(cluster_info["sid"]) == MASTER_CONTROLLER_SID)
    if is_master:
        logger.info("create admin dirs..")
        for sub_dir in ("opt", "home/", "data/"):
            run_shell("mkdir -p {}/{}".format(nas_root, sub_dir))
        # create admin user
        add_admin_user()
        # install software
        return init_software()

    logger.info("%s started.", role)
    return 0
def update_cmpnodes_res_conf(filename):
    """
    update compute node resource in slurm configuration

    Every line of ``filename`` that starts with ``NodeName=node`` is replaced
    by a newline followed by ``NODE_RESOURCE_FMT`` filled with the current
    compute node list and the last line of ``RESOURCE_INFO_FILE``; all other
    lines are kept unchanged.

    :param filename: path of the slurm configuration file, rewritten in place
    :return: 0 on success, -1 when ``filename`` is not an existing file
    """
    # read slurm.conf
    if not os.path.isfile(filename):
        logger.error("%s is not exists", filename)
        return -1

    cluster_info = get_cluster_info()

    # 1. get compute node resource: the last line of resource.info (the
    # first line belongs to the controller; with one line there is none).
    with open(RESOURCE_INFO_FILE, "r") as info:
        resource_lines = [entry for entry in info.readlines() if entry]
    cmp_resource = resource_lines[-1] if len(resource_lines) > 1 else ""

    # get modified compute node
    node_list = get_slurm_cmpnode_list(cluster_info)

    pieces = []
    with open(filename, 'r') as conf:
        for current in conf:
            if current.startswith("NodeName=node"):
                # put Slurm Partition conf here
                replacement = NODE_RESOURCE_FMT.format(node_list, cmp_resource)
                pieces.append("\n")
                pieces.append(replacement)
            else:
                pieces.append(current)

    write_file(filename, "".join(pieces))
    return 0
def install(software_list, ignore_exist=False):
    """Install every software entry into the shared software home.

    :param software_list: list of dicts, each with ``name`` and ``source``
        keys and an optional ``installer`` key; may be empty or None
    :param ignore_exist: when true, entries that are already installed are
        skipped instead of aborting the whole call
    :return: 0 on success, 55 when an entry already exists and
        ``ignore_exist`` is false, otherwise the first non-zero result of
        ``_install``
    """
    logger.info("install software[%s]..", software_list)
    nas_mount_point = get_cluster_info()["nas_mount_point"]
    software_home = SOFTWARE_HOME_FMT.format(nas_mount_point)

    # Nothing to install: also covers None, which cannot be iterated.
    if not software_list:
        return 0

    run_shell("mkdir -p {}".format(software_home))

    # First pass: find the entries that are already installed, before
    # anything is installed.
    exist_info = {}
    for s in software_list:
        # software exist
        if os.path.exists("{}/{}".format(software_home, s["name"])):
            exist_info[s["name"]] = True
            logger.error("The software[%s] already exist!", s["name"])
            if not ignore_exist:
                return 55

    # Second pass: install whatever is not present yet.
    for s in software_list:
        if not exist_info.get(s["name"], False):
            ret = _install(s["name"], s["source"], software_home,
                           s.get("installer"))
            # Compare by value: "is not 0" tests object identity, which
            # only works by accident of small-int caching.
            if ret != 0:
                return ret
    return 0
def generate_slurm_conf():
    """Render the slurm configuration file from its template.

    The template placeholders are filled with the cluster name, the
    controller host lines from ``get_ctl_machine()``, the first and last
    line of ``RESOURCE_INFO_FILE`` and the compute node list from
    ``get_slurm_cmpnode_list()``. An existing ``SLURM_CONF`` is backed up
    first; the result is written to a temporary file and then moved over
    ``SLURM_CONF``.
    """
    cluster_info = get_cluster_info()
    cls_name = cluster_info["cluster_name"]

    # The first line of resource.info describes the controller, the last
    # line the compute nodes; with a single line only the former is set.
    with open(RESOURCE_INFO_FILE, "r") as info:
        resource_lines = [entry for entry in info.readlines() if entry]
    ctl_resource = resource_lines[0] if resource_lines else ""
    cmp_resource = resource_lines[-1] if len(resource_lines) > 1 else ""

    node_list = get_slurm_cmpnode_list(cluster_info)
    ctl_machine = get_ctl_machine()

    # backup last slurm configuration
    if os_path.exists(SLURM_CONF):
        backup(BACKUP_SLURM_CONF_CMD)

    # Fill the template into a temporary file, then move it into place.
    substitutions = {
        "CLUSTER_NAME": cls_name,
        "CONTROL_MACHINE": ctl_machine,
        "CONTROLLER_RESOURCE": ctl_resource,
        "COMPUTE_RESOURCE": cmp_resource,
        "DEFAULT_NODE_NAME": node_list,
    }
    tmp_file = "{}/slurm.conf.tmp".format(APP_HOME)
    with open(tmp_file, "w") as conf, open(SLURM_CONF_TMPL, "r") as tmpl:
        for raw in tmpl.readlines():
            conf.write(raw.format(**substitutions) if raw else raw)
    run_shell("mv {} {}".format(tmp_file, SLURM_CONF))