コード例 #1
0
ファイル: userctl.py プロジェクト: QingCloudAppcenter/EHPC
def get_home_dir(user_name, is_admin=False):
    """Build the home directory path for a user on the cluster NAS.

    :param user_name: account name substituted into HOME_FMT; not used when
        is_admin is true
    :param is_admin: when true, return the path built from ADMIN_HOME_FMT
    :return: the formatted home directory path
    """
    # Leading/trailing slashes are removed before the path is substituted.
    nas_root = get_cluster_info()["nas_path"].strip("/")
    if not is_admin:
        return HOME_FMT.format(nas_root, user_name)
    return ADMIN_HOME_FMT.format(nas_root)
コード例 #2
0
ファイル: userctl.py プロジェクト: QingCloudAppcenter/EHPC
def add(params):
    """Create a new LDAP user in the cluster admin group.

    :param params: mapping that must provide non-empty "user_name" and
        "password" entries
    :return: None; the process exits with 40 when a required parameter is
        missing, 45 when the user already exists and 1 on any other failure
    """
    user_name = params.get("user_name", None)
    password = params.get("password", None)
    if not (user_name and password):
        # Never write the supplied password to the log: only report whether
        # one was given (masked) or not (the original falsy value).
        logger.error("Required params: user_name: [%s], password: [%s]",
                     user_name, "***" if password else password)
        sys.exit(40)

    ldap_client = None
    try:
        ldap_client = new_ldap_client()
        if ldap_client.user_exist(user_name):
            logger.error("The user[%s] already exist.", user_name)
            # SystemExit is not an Exception subclass, so it bypasses the
            # handler below while the finally clause still closes the client.
            sys.exit(45)

        cluster_info = get_cluster_info()
        gid = cluster_info["admin_gid"]

        uid = ldap_client.generate_uid_number()
        home_dir = get_home_dir(user_name)
        ldap_client.create_user(user_name, uid, password, home_dir, gid)
        logger.info("create user[%s], done.", user_name)
    except Exception:
        logger.error("Failed to create user: [%s]", traceback.format_exc())
        sys.exit(1)
    finally:
        if ldap_client:
            ldap_client.close()
コード例 #3
0
def main(argv):
    """Dispatch a queue action on the master controller node.

    :param argv: command-line arguments handed to ArgsParser.parse
    :return: None when this node is not the master controller; otherwise the
        process exits with the handler's return value, or with 40 when the
        arguments, the queue payload or the action cannot be handled
    """
    role = get_role()
    # Only run this command on the single master controller node.
    cluster_info = get_cluster_info()
    on_master = role == ROLE_CONTROLLER and \
        int(cluster_info["sid"]) == MASTER_CONTROLLER_SID
    if not on_master:
        logger.info("queue action[%s] not handler on non-master [%s|%s].",
                    argv, role, cluster_info["sid"])
        return

    parser = ArgsParser()
    if not parser.parse(argv):
        sys.exit(40)

    if parser.action not in ACTION_MAP:
        logger.error("can not handle the action[%s].", parser.action)
        sys.exit(40)

    handler = ACTION_MAP[parser.action]
    if not parser.directive:
        # No directive: the handler is invoked without arguments.
        sys.exit(handler())

    queue = parser.directive["queue"]
    if isinstance(queue, str):
        # A string payload is decoded first; a None result is reported as a
        # decode failure.
        queue = json_loads(queue)
        if queue is None:
            logger.error("Failed to load queue[%s] to json!",
                         parser.directive)
            sys.exit(40)

    sys.exit(handler(queue))
コード例 #4
0
ファイル: userctl.py プロジェクト: QingCloudAppcenter/EHPC
def delete(params):
    """Remove an existing, non-admin user from LDAP.

    :param params: mapping that must provide a non-empty "user_name" entry
    :return: None; the process exits with 40 when the name is missing, 46
        when it names the cluster admin user, 44 when the user does not
        exist and 1 on any other failure
    """
    target = params.get("user_name")
    if not target:
        logger.error("required params: user_name[%s].", target)
        sys.exit(40)

    admin_name = get_cluster_info()["user"]
    if target == admin_name:
        logger.error("The admin user[%s] can not be deleted.", target)
        sys.exit(46)

    client = None
    try:
        client = new_ldap_client()
        known = client.user_exist(target)
        if not known:
            logger.error("The user[%s] not exist.", target)
            sys.exit(44)

        client.delete_user(target)
        logger.info("The user[%s] has been deleted.", target)
    except Exception:
        trace = traceback.format_exc()
        logger.error("Failed to delete user[%s]: [%s]", target, trace)
        sys.exit(1)
    finally:
        # Runs on success, on failure and on the sys.exit paths above.
        if client:
            client.close()
コード例 #5
0
ファイル: softwarectl.py プロジェクト: ljlu1504/EHPC
def main(argv):
    """Dispatch a software action on the master controller node.

    :param argv: command-line arguments handed to ArgsParser.parse
    :return: None when this node is not the master controller; otherwise the
        process exits with the handler's return value, with 40 when the
        arguments, the software payload or the action cannot be handled, or
        with 1 when an unexpected exception is raised
    """
    try:
        # Only run this command on the single master controller node.
        role = get_role()
        cluster_info = get_cluster_info()
        on_master = role == ROLE_CONTROLLER and \
            int(cluster_info["sid"]) == MASTER_CONTROLLER_SID
        if not on_master:
            return

        parser = ArgsParser()
        if not parser.parse(argv):
            sys.exit(40)

        if parser.action not in ACTION_MAP:
            logger.error("can not handle the action[%s].", parser.action)
            sys.exit(40)

        handler = ACTION_MAP[parser.action]
        if not parser.directive:
            # No directive: the handler is invoked without arguments.
            sys.exit(handler())

        software = parser.directive["software"]
        if isinstance(software, str):
            # A string payload is decoded first; a None result is reported
            # as a decode failure.
            software = json_loads(software)
            if software is None:
                logger.error("Failed to load software[%s] to json!",
                             parser.directive)
                sys.exit(40)

        sys.exit(handler(software))
    except Exception:
        # sys.exit raises SystemExit, which is not caught here.
        logger.error("Failed to update software: [%s]", traceback.format_exc())
        sys.exit(1)
コード例 #6
0
def generate_conf():
    """Render slurm.conf from its template and move it into place.

    Reads the controller/compute resource lines from RESOURCE_INFO_FILE,
    builds the default compute NodeName expression from the cluster sids,
    backs up an existing SLURM_CONF and writes the rendered template to a
    temporary file that is then moved over SLURM_CONF.

    :return: None
    """
    cluster_info = get_cluster_info()
    cls_name = cluster_info["cluster_name"]

    # Lines read from a file keep their trailing newline, so a plain
    # truthiness test never skips blank lines; filter on the stripped text
    # so a trailing empty line cannot be taken as the compute resource.
    with open(RESOURCE_INFO_FILE, "r") as info:
        resources = [line for line in info if line.strip()]
    # The first non-blank line describes the controller, the last one (when
    # there is more than one) the compute nodes.
    ctl_resource = resources[0] if resources else ""
    cmp_resource = resources[-1] if len(resources) > 1 else ""

    # generate default NodeName, eg: node[1-6,8,12-15]
    sids = sorted(int(s) for s in cluster_info["sids"].split(",") if s)

    node_name = "{}[".format(COMPUTE_HOSTNAME_PREFIX)  # eg: node[
    start_sid = sids[0]  # first sid of the run currently being collected
    last_sid = start_sid
    for sid in sids:
        if sid - last_sid > 1:
            # A gap closes the current run: emit it as "a," or "a-b,".
            if last_sid == start_sid:
                node_name = "{}{},".format(node_name, start_sid)
            else:
                node_name = "{}{}-{},".format(node_name, start_sid,
                                              last_sid)
            start_sid = sid
        last_sid = sid

    # Emit the final run and close the bracket.
    if last_sid == start_sid:
        node_name = "{}{}]".format(node_name, start_sid)
    else:
        node_name = "{}{}-{}]".format(node_name, start_sid, last_sid)

    # backup last slurm configuration
    if os_path.exists(SLURM_CONF):
        backup(BACKUP_SLURM_CONF_CMD)

    # Fill in the template line by line into a temporary file, then move it
    # over the live configuration.
    tmp_file = "{}/slurm.conf.tmp".format(APP_HOME)
    with open(tmp_file, "w") as conf:
        with open(SLURM_CONF_TMPL, "r") as tmpl:
            for line in tmpl:
                conf.write(line.format(CLUSTER_NAME=cls_name,
                                       CONTROLLER_RESOURCE=ctl_resource,
                                       COMPUTE_RESOURCE=cmp_resource,
                                       DEFAULT_NODE_NAME=node_name))

    run_shell("mv {} {}".format(tmp_file, SLURM_CONF))
コード例 #7
0
ファイル: appctl.hpc.py プロジェクト: ljlu1504/EHPC
def metadata_reload():
    """Re-apply the cluster user account after a metadata reload.

    Reads the "user", "password", "uid" and "gid" entries from the cluster
    info and passes them to add_user.

    :return: 0
    """
    logger.info("cluster metadata reload...")
    cluster_info = get_cluster_info()
    username = cluster_info.get("user")
    password = cluster_info.get("password")
    gid = cluster_info.get("gid")
    uid = cluster_info.get("uid")
    # create user
    add_user(username, password, uid, gid)
    # Report completion of the reload; the previous "setup done." text
    # was indistinguishable from the setup() log line.
    logger.info("metadata reload done.")
    return 0
コード例 #8
0
ファイル: appctl.hpc.py プロジェクト: ljlu1504/EHPC
def setup():
    """Initialize the cluster by creating the configured user account.

    :return: 0
    """
    logger.info("initialize cluster...")
    info = get_cluster_info()
    # Missing keys yield None, which is passed through to add_user as-is.
    account = info.get("user")
    secret = info.get("password")
    group_id = info.get("gid")
    user_id = info.get("uid")
    add_user(account, secret, user_id, group_id)
    logger.info("setup done.")
    return 0
コード例 #9
0
ファイル: softwarectl.py プロジェクト: ljlu1504/EHPC
def uninstall(software):
    """Uninstall every listed software package from the NAS software home.

    Nothing is removed unless all listed packages are present.

    :param software: iterable of dicts with a "name" entry and an optional
        "uninstaller" entry
    :return: 0 on success, 54 when a listed package is not installed, or
        the first non-zero result returned by _uninstall
    """
    logger.info("uninstall software[%s]..", software)
    nas_mount_point = get_cluster_info()["nas_mount_point"]
    software_home = SOFTWARE_HOME_FMT.format(nas_mount_point)

    # Validate the whole request before touching anything.
    for s in software:
        if not os.path.exists("{}/{}".format(software_home, s["name"])):
            logger.error("The software[%s] not exist!", s["name"])
            return 54

    for s in software:
        ret = _uninstall(s["name"], software_home, s.get("uninstaller"))
        # Compare by value: "is not 0" tests object identity, which only
        # works by accident of small-int caching and warns on Python 3.8+.
        if ret != 0:
            return ret
    return 0
コード例 #10
0
ファイル: userctl.py プロジェクト: QingCloudAppcenter/EHPC
def add_admin_user():
    """Ensure the admin group and the admin user exist in LDAP.

    The group is created when no group with "admin_gid" exists, and the
    user is created when the "admin_user"/"admin_uid" pair does not exist.

    :return: None
    """
    ldap_client = new_ldap_client()
    try:
        cluster_info = get_cluster_info()

        gid = cluster_info["admin_gid"]
        if not ldap_client.group_exist(gid):
            # The admin group is named after the admin user.
            gname = cluster_info["admin_user"]
            ldap_client.create_group(gname, gid)

        if not ldap_client.user_exist(cluster_info["admin_user"],
                                      cluster_info["admin_uid"]):
            home_dir = get_home_dir("", is_admin=True)
            ldap_client.create_user(cluster_info["admin_user"],
                                    cluster_info["admin_uid"],
                                    cluster_info["admin_password"],
                                    home_dir, gid)
    finally:
        # Close the connection even when a lookup or create call raises,
        # matching the cleanup done in add() and delete().
        ldap_client.close()
コード例 #11
0
ファイル: softwarectl.py プロジェクト: ljlu1504/EHPC
def _install(software, source, software_home, installer=None):
    """
    Download, unpack and install one software package.

    :param software: software name (install dir: software_home/software)
    :param source: full url to download software, eg: root@xxx/aa/bb/cc.tar.gz
    :param software_home: the home that software would to be installed
    :param installer: install script, relative to the unpacked directory;
        when omitted, the package's install.sh is used if present,
        otherwise the unpacked directory is copied into software_home
    :return: 0 on success, 1 when any step raises
    """
    logger.info("Do install software[%s] from source[%s]..", software, source)

    package = source.split("/")[-1]
    nas_mount_point = get_cluster_info()["nas_mount_point"]
    workdir = SOFTWARE_WORKDIR_FMT.format(nas_mount_point, software)
    package_path = "{}/{}".format(workdir, package)

    # NOTE(review): the values below are interpolated into shell command
    # strings unquoted; confirm they never carry untrusted input.
    try:
        # download
        _download(source, workdir)

        # un-tar
        run_shell("tar -zxf {}".format(package_path), cwd=workdir, timeout=180)

        # The first directory found in the work dir is taken as the
        # unpacked package root.
        un_tar_dir = ""
        for entry in os.listdir(workdir):
            candidate = "{}/{}".format(workdir, entry)
            if os.path.isdir(candidate):
                un_tar_dir = candidate
                break

        if installer:
            installer = "bash {}/{}".format(un_tar_dir, installer)
        else:
            # Test the script path itself; testing the full "bash <path>"
            # command string never matches an existing file, which made the
            # bundled install.sh unreachable.
            default_script = "{}/install.sh".format(un_tar_dir)
            if os.path.exists(default_script):
                installer = "bash {}".format(default_script)
            else:
                installer = "cp -nrf {} {}/".format(un_tar_dir, software_home)

        run_shell("export SOFTWARE_HOME={} && {}".format(software_home, installer), timeout=120)

        # clean
        if os.path.exists(workdir):
            run_shell("rm -rf {}".format(workdir))
    except Exception:
        logger.error("Failed to install software[%s]: \n%s",
                     software, traceback.format_exc())
        return 1
    return 0
コード例 #12
0
ファイル: slurm_utils.py プロジェクト: ljlu1504/EHPC
def get_ctl_machine():
    """Build the SlurmctldHost lines for all controllers.

    Each entry of cluster_info["controllers"] is indexed as
    (sid, address); entries are emitted in ascending sid order, eg:

        SlurmctldHost=controller1(10.0.191.219)
        SlurmctldHost=controller2(10.0.3.169)

    :return: the SlurmctldHost lines joined with newlines (no trailing
        newline)
    """
    cluster_info = get_cluster_info()
    controllers = cluster_info["controllers"]
    controllers.sort(key=lambda x: x[0])
    # The lowest sid goes first (controller1 in a complete cluster).
    ctl_machine = "SlurmctldHost={}{}({})".format(CONTROLLER_HOSTNAME_PREFIX,
                                                  controllers[0][0],
                                                  controllers[0][1])
    # Append the remaining controllers by position. Filtering on "sid > 1"
    # instead would emit the first entry twice whenever the lowest sid is
    # not 1.
    for controller in controllers[1:]:
        ctl_machine = "{}\nSlurmctldHost={}{}({})".format(
            ctl_machine, CONTROLLER_HOSTNAME_PREFIX, controller[0],
            controller[1])
    return ctl_machine
コード例 #13
0
ファイル: appctl.py プロジェクト: ljlu1504/EHPC
def start():
    """Prepare the node, start the services of its role and run post steps.

    Controllers generate the slurm configuration before their services are
    started; other roles remove the files listed for them in clear_files.
    On the master controller the admin directories and admin user are
    created and the software initialisation is run afterwards.

    :return: 1 when the role has no entry in ROLE_SERVICES, the result of
        init_software() on the master controller, otherwise 0
    """
    role = get_role()
    nas_mount_point = get_nas_mount_point()
    cluster_name = get_cluster_name()
    # mkdir /nas_mount_point/opt/slurm/state_save_loc for StateSaveLocation
    run_shell("mkdir -p {}/opt/slurm/state_save_loc/{}/".format(nas_mount_point,
                                                                cluster_name))
    run_shell("ln -sf {}/opt/slurm/ /opt/slurm".format(nas_mount_point))

    # start before
    if role == ROLE_CONTROLLER:
        logger.info("Generating slurm configurations...")
        generate_slurm_conf()
    else:
        # A role without a clear_files entry has nothing to clean; looking
        # it up with [] would raise KeyError before the unsupported-role
        # check below gets a chance to report it.
        for f in clear_files.get(role, []):
            if path.exists(f):
                run_shell("rm {}".format(f))

    # start service
    if role in ROLE_SERVICES:
        for service in ROLE_SERVICES[role]:
            logger.info("Start service {}".format(service))
            run_shell("systemctl start {}".format(service))
    else:
        logger.error("Un-support role[%s].", role)
        return 1

    # start post (nas_mount_point fetched above is reused)
    cluster_info = get_cluster_info()
    if role == ROLE_CONTROLLER and \
            int(cluster_info["sid"]) == MASTER_CONTROLLER_SID:
        logger.info("create admin dirs..")
        run_shell("mkdir -p {}/opt".format(nas_mount_point))
        run_shell("mkdir -p {}/home/".format(nas_mount_point))
        run_shell("mkdir -p {}/data/".format(nas_mount_point))

        # create admin user
        add_admin_user()

        # install software
        return init_software()
    logger.info("%s started.", role)
    return 0
コード例 #14
0
ファイル: slurm_utils.py プロジェクト: ljlu1504/EHPC
def update_cmpnodes_res_conf(filename):
    """
    update compute node resource in slurm configuration

    Every line of the file that starts with "NodeName=node" is replaced by
    NODE_RESOURCE_FMT filled with the current compute node list and the
    compute resource line from RESOURCE_INFO_FILE; all other lines are kept.

    :param filename: path of the slurm configuration file to rewrite
    :return: 0 on success, -1 when filename is not an existing file
    """
    if not os.path.isfile(filename):
        logger.error("%s is not exists", filename)
        return -1

    cluster_info = get_cluster_info()

    # 1. get compute node resource: the last non-blank line of
    # resource.info, provided there is more than one (the first line is the
    # controller resource). Lines keep their trailing newline, so blank
    # lines must be filtered on the stripped text.
    with open(RESOURCE_INFO_FILE, "r") as info:
        resources = [line for line in info if line.strip()]
    cmp_resource = resources[-1] if len(resources) > 1 else ""

    # get modified compute node
    node_list = get_slurm_cmpnode_list(cluster_info)

    new_lines = []
    with open(filename, 'r') as f:
        for line in f:
            if line.startswith("NodeName=node"):
                # put Slurm Partition conf here
                line = NODE_RESOURCE_FMT.format(node_list, cmp_resource)
            # Normalise to exactly one trailing newline per entry. Writing
            # an extra "\n" in front of lines that already end with one
            # inserted new blank lines on every run.
            new_lines.append(line.rstrip("\n") + "\n")

    write_file(filename, "".join(new_lines))
    return 0
コード例 #15
0
ファイル: softwarectl.py プロジェクト: ljlu1504/EHPC
def install(software_list, ignore_exist=False):
    """Install every listed software package into the NAS software home.

    :param software_list: iterable of dicts with "name" and "source"
        entries and an optional "installer" entry
    :param ignore_exist: when true, packages that are already installed are
        skipped instead of aborting the whole request
    :return: 0 on success, 55 when a package already exists and
        ignore_exist is false, or the first non-zero result of _install
    """
    logger.info("install software[%s]..", software_list)
    nas_mount_point = get_cluster_info()["nas_mount_point"]
    software_home = SOFTWARE_HOME_FMT.format(nas_mount_point)
    if software_list:
        run_shell("mkdir -p {}".format(software_home))

    # First pass: record which packages are already present, aborting
    # before anything is installed unless existing ones may be ignored.
    exist_info = {}
    for s in software_list:
        if os.path.exists("{}/{}".format(software_home, s["name"])):
            exist_info[s["name"]] = True
            logger.error("The software[%s] already exist!", s["name"])
            if not ignore_exist:
                return 55

    for s in software_list:
        if not exist_info.get(s["name"], False):
            ret = _install(s["name"], s["source"], software_home,
                           s.get("installer"))
            # Compare by value: "is not 0" tests object identity, which
            # only works by accident of small-int caching and warns on
            # Python 3.8+.
            if ret != 0:
                return ret
    return 0
コード例 #16
0
ファイル: slurm_utils.py プロジェクト: ljlu1504/EHPC
def generate_slurm_conf():
    """Render slurm.conf from its template and move it into place.

    Reads the controller/compute resource lines from RESOURCE_INFO_FILE,
    obtains the compute node list and the controller host lines, backs up
    an existing SLURM_CONF and writes the rendered template to a temporary
    file that is then moved over SLURM_CONF.

    :return: None
    """
    cluster_info = get_cluster_info()
    cls_name = cluster_info["cluster_name"]

    # Lines read from a file keep their trailing newline, so a plain
    # truthiness test never skips blank lines; filter on the stripped text
    # so a trailing empty line cannot be taken as the compute resource.
    with open(RESOURCE_INFO_FILE, "r") as info:
        resources = [line for line in info if line.strip()]
    # The first non-blank line describes the controller, the last one (when
    # there is more than one) the compute nodes.
    ctl_resource = resources[0] if resources else ""
    cmp_resource = resources[-1] if len(resources) > 1 else ""

    node_list = get_slurm_cmpnode_list(cluster_info)
    ctl_machine = get_ctl_machine()

    # backup last slurm configuration
    if os_path.exists(SLURM_CONF):
        backup(BACKUP_SLURM_CONF_CMD)

    # Fill in the template line by line into a temporary file, then move it
    # over the live configuration.
    tmp_file = "{}/slurm.conf.tmp".format(APP_HOME)
    with open(tmp_file, "w") as conf:
        with open(SLURM_CONF_TMPL, "r") as tmpl:
            for line in tmpl:
                conf.write(line.format(CLUSTER_NAME=cls_name,
                                       CONTROL_MACHINE=ctl_machine,
                                       CONTROLLER_RESOURCE=ctl_resource,
                                       COMPUTE_RESOURCE=cmp_resource,
                                       DEFAULT_NODE_NAME=node_list))

    run_shell("mv {} {}".format(tmp_file, SLURM_CONF))