Esempio n. 1
0
def main(argv):
    """Dispatch a queue action; only the master controller node handles it.

    Args:
        argv: Command-line arguments handed to ``ArgsParser.parse``.

    On any node other than the master controller the call is logged and the
    function returns without exiting. Otherwise the process exits with the
    handler's return value, or with 40 when parsing fails, the action is not
    in ``ACTION_MAP``, or ``json_loads`` returns ``None`` for a string queue.
    """
    role = get_role()
    cluster_info = get_cluster_info()
    # Only execute this command on the single master controller node.
    on_master = role == ROLE_CONTROLLER and \
        int(cluster_info["sid"]) == MASTER_CONTROLLER_SID
    if not on_master:
        logger.info("queue action[%s] not handler on non-master [%s|%s].",
                    argv, role, cluster_info["sid"])
        return

    parser = ArgsParser()
    if not parser.parse(argv):
        sys.exit(40)

    if parser.action not in ACTION_MAP:
        logger.error("can not handle the action[%s].", parser.action)
        sys.exit(40)

    # The handler receives the queue only when a directive was supplied.
    handler_args = []
    if parser.directive:
        queue = parser.directive["queue"]
        if isinstance(queue, str):
            # A string payload is decoded before being passed on.
            queue = json_loads(queue)
            if queue is None:
                logger.error("Failed to load queue[%s] to json!",
                             parser.directive)
                sys.exit(40)
        handler_args.append(queue)

    sys.exit(ACTION_MAP[parser.action](*handler_args))
Esempio n. 2
0
def main(argv):
    """Dispatch a software action; only the master controller handles it.

    Args:
        argv: Command-line arguments handed to ``ArgsParser.parse``.

    On any node other than the master controller the function returns
    silently. Otherwise the process exits with the handler's return value,
    with 40 when parsing fails, the action is not in ``ACTION_MAP``, or
    ``json_loads`` returns ``None`` for a string payload, and with 1 when any
    other ``Exception`` is raised (its traceback is logged).
    """
    try:
        # Only execute this command on the single master controller node.
        role = get_role()
        cluster_info = get_cluster_info()
        on_master = role == ROLE_CONTROLLER and \
            int(cluster_info["sid"]) == MASTER_CONTROLLER_SID
        if not on_master:
            return

        parser = ArgsParser()
        if not parser.parse(argv):
            sys.exit(40)

        if parser.action not in ACTION_MAP:
            logger.error("can not handle the action[%s].", parser.action)
            sys.exit(40)

        # The handler receives the payload only when a directive was given.
        handler_args = []
        if parser.directive:
            software = parser.directive["software"]
            if isinstance(software, str):
                # A string payload is decoded before being passed on.
                software = json_loads(software)
                if software is None:
                    logger.error("Failed to load software[%s] to json!",
                                 parser.directive)
                    sys.exit(40)
            handler_args.append(software)

        sys.exit(ACTION_MAP[parser.action](*handler_args))
    except Exception:
        # SystemExit is not an Exception subclass, so the sys.exit() calls
        # above pass through this handler untouched.
        logger.error("Failed to update software: [%s]", traceback.format_exc())
        sys.exit(1)
Esempio n. 3
0
def health_check():
    """Check every service registered for this node's role.

    Returns:
        The first non-zero value returned by ``check_service_status``, or 0
        when every check returns 0 (including when the role has no entry in
        ``ROLE_SERVICES``).
    """
    service_names = ROLE_SERVICES.get(get_role(), "")
    # Lazy pipeline: checking stops at the first service reporting non-zero.
    failures = (
        status
        for status in map(check_service_status, service_names)
        if status != 0
    )
    return next(failures, 0)
Esempio n. 4
0
def stop():
    """Stop every systemd service registered for this node's role.

    Returns:
        1 when the role has no entry in ``ROLE_SERVICES``, otherwise 0.
    """
    role = get_role()
    if role not in ROLE_SERVICES:
        logger.error("Un-support role[%s].", role)
        return 1

    for unit in ROLE_SERVICES[role]:
        run_shell("systemctl stop {}".format(unit))
    return 0
Esempio n. 5
0
def restart():
    """Restart every systemd service registered for this node's role.

    Returns:
        1 when the role has no entry in ``ROLE_SERVICES``, otherwise 0.
    """
    role = get_role()
    if role not in ROLE_SERVICES:
        logger.error("Un-support role[%s].", role)
        return 1

    for unit in ROLE_SERVICES[role]:
        run_shell("systemctl restart {}".format(unit))

    logger.info("%s re-started.", role)
    return 0
Esempio n. 6
0
def metadata_reload():
    """Regenerate hosts and, on a controller, refresh the slurm configuration.

    Returns:
        Always 0.
    """
    logger.info("generate hosts for reloading..")
    generate_hosts()

    # Only controller nodes have further work to do.
    if get_role() != ROLE_CONTROLLER:
        return 0

    logger.info("update slurm conf for reloading metadata..")
    update_slurm_conf()

    # TODO: with multiple controller nodes, this command only needs to run
    # on one master node.
    logger.info("re-config slurm configuration for cluster..")
    run_shell("scontrol reconfigure")
    return 0
Esempio n. 7
0
def start():
    """Prepare the node and start the services registered for its role.

    Steps, in order:
      1. Create the slurm state-save directory under the NAS mount point and
         link ``/opt/slurm`` to the NAS copy.
      2. On a controller, generate the slurm configuration; on any other
         role, remove the files listed for that role in ``clear_files``.
      3. Start every service listed for the role in ``ROLE_SERVICES``.
      4. On the master controller, create the admin directories, add the
         admin user and initialise the software.

    Returns:
        1 when the role has no entry in ``ROLE_SERVICES``; the result of
        ``init_software()`` on the master controller; otherwise 0.
    """
    role = get_role()
    nas_mount_point = get_nas_mount_point()
    cluster_name = get_cluster_name()
    # mkdir /nas_mount_point/opt/slurm/state_save_loc for StateSaveLocation
    run_shell("mkdir -p {}/opt/slurm/state_save_loc/{}/".format(nas_mount_point,
                                                                cluster_name))
    # -n keeps ln from following an existing /opt/slurm symlink; without it a
    # second start would create a nested "slurm" link inside the NAS directory
    # instead of replacing the link itself.
    run_shell("ln -sfn {}/opt/slurm/ /opt/slurm".format(nas_mount_point))

    # start before
    if role == ROLE_CONTROLLER:
        logger.info("Generating slurm configurations...")
        generate_slurm_conf()
    else:
        # .get() lets a role that is missing from clear_files fall through to
        # the explicit unsupported-role error below instead of a KeyError.
        for f in clear_files.get(role, ()):
            if path.exists(f):
                run_shell("rm {}".format(f))

    # start service
    if role not in ROLE_SERVICES:
        logger.error("Un-support role[%s].", role)
        return 1
    for service in ROLE_SERVICES[role]:
        logger.info("Start service %s", service)
        run_shell("systemctl start {}".format(service))

    # start post: one-time cluster setup, done by the master controller only.
    # nas_mount_point from the top of the function is reused here.
    cluster_info = get_cluster_info()
    if role == ROLE_CONTROLLER and \
            int(cluster_info["sid"]) == MASTER_CONTROLLER_SID:
        logger.info("create admin dirs..")
        run_shell("mkdir -p {}/opt".format(nas_mount_point))
        run_shell("mkdir -p {}/home/".format(nas_mount_point))
        run_shell("mkdir -p {}/data/".format(nas_mount_point))

        # create admin user
        add_admin_user()

        # install software
        return init_software()
    logger.info("%s started.", role)
    return 0
Esempio n. 8
0
def service_in(hostname):
    """Restart the component for ``hostname``'s role, then put it in service.

    Args:
        hostname: Passed to ``get_role`` to select the component and then to
            the component's ``restart`` and ``service_in`` methods.
    """
    role = get_role(hostname)
    component = components[role]
    component.restart(hostname)
    component.service_in(hostname)
Esempio n. 9
0
def service_out(hostname):
    """Call ``service_out`` on the component for ``hostname``'s role.

    Args:
        hostname: Passed to ``get_role`` to select the component and then to
            the component's ``service_out`` method.
    """
    role = get_role(hostname)
    component = components[role]
    component.service_out(hostname)