Example #1
0
    def stop(self, host):
        """
        Stop the Collectd service on one or more agent hosts.

        The outcome of stopping the agents is handed to cmd_general.cmd_exit.
        :param host: the name of the agent host. Could be a list.
        """
        env = init_env(self._bac_config_fpath, self._bac_logdir,
                       self._bac_log_to_file)
        log, instance = env

        # Normalize the command line argument before parsing it as a list.
        host_arg = cmd_general.check_argument_str(log, "host", host)
        agent_hostnames = cmd_general.parse_list_string(log, host_arg)
        if agent_hostnames is None:
            log.cl_error("host list [%s] is invalid", host_arg)
            cmd_general.cmd_exit(log, -1)

        cmd_general.cmd_exit(log,
                             instance.bei_stop_agents(log, agent_hostnames))
Example #2
0
def _build_mkdir(log, local_host, directory):
    """
    Run "mkdir -p" for the directory on the local host.

    :param log: logger used to report a failure.
    :param local_host: host object on which the command is run.
    :param directory: path of the directory to create.
    :return: 0 on success, -1 if the command exited with non-zero status.
    """
    command = ("mkdir -p %s" % directory)
    retval = local_host.sh_run(log, command)
    if retval.cr_exit_status:
        log.cl_error(
            "failed to run command [%s] on host [%s], "
            "ret = [%d], stdout = [%s], stderr = [%s]", command,
            local_host.sh_hostname, retval.cr_exit_status, retval.cr_stdout,
            retval.cr_stderr)
        return -1
    return 0


def build(log,
          source_dir,
          workspace,
          cache=constant.CORAL_BUILD_CACHE,
          lustre_rpms_dir=None,
          e2fsprogs_rpms_dir=None,
          collectd=None,
          enable_zfs=False,
          enable_devel=False,
          disable_plugin=None,
          tsinghua_mirror=False):
    """
    Build the Coral ISO.

    :param log: logger used for all messages.
    :param source_dir: directory in which autogen.sh, configure and make
        are run.
    :param workspace: directory created for this build; the per-type cache
        lives under it.
    :param cache: path of the shared build cache. Trailing slashes are
        stripped.
    :param lustre_rpms_dir: value of --lustre. Only used here to warn when
        no enabled plugin needs Lustre.
    :param e2fsprogs_rpms_dir: value of --e2fsprogs. Only used here to warn
        when no enabled plugin needs Lustre.
    :param collectd: value of --collectd, passed to the cpt_build() of each
        plugin. When it is given and Collectd is needed, the local cache is
        not synced back to the shared cache.
    :param enable_zfs: add --enable-zfs to the configure command.
    :param enable_devel: also build the devel plugins and add --enable-devel
        to the configure command.
    :param disable_plugin: list string of the plugin names to disable.
    :param tsinghua_mirror: forwarded to the dependency installation and
        the pip3 package download.
    :return: 0 on success, -1 on failure.
    """
    # pylint: disable=too-many-locals,too-many-branches
    # pylint: disable=too-many-statements
    if disable_plugin is None:
        disabled_plugins = []
    else:
        disabled_plugins = cmd_general.parse_list_string(log, disable_plugin)
        if disabled_plugins is None:
            log.cl_error("invalid option [%s] of --disable_plugin",
                         disable_plugin)
            return -1

    plugins = list(build_common.CORAL_RELEASE_PLUGIN_DICT.values())
    if enable_devel:
        plugins += list(build_common.CORAL_DEVEL_PLUGIN_DICT.values())

    sync_cache_back = True
    disable_options = []
    for plugin_name in disabled_plugins:
        if plugin_name not in build_common.CORAL_PLUGIN_DICT:
            log.cl_error("unknown plugin [%s] of --disable_plugin",
                         plugin_name)
            log.cl_error("possible plugins are %s",
                         list(build_common.CORAL_PLUGIN_DICT.keys()))
            return -1

        if ((not enable_devel) and
                (plugin_name not in build_common.CORAL_RELEASE_PLUGIN_DICT)):
            log.cl_info(
                "plugin [%s] will not be included in release "
                "ISO anyway", plugin_name)
            continue

        # A build with disabled plugins leaves an incomplete local cache,
        # which must not be shared with other builds.
        sync_cache_back = False
        plugin = build_common.CORAL_PLUGIN_DICT[plugin_name]
        if plugin in plugins:
            plugins.remove(plugin)
        disable_options.append(" --disable-%s" % plugin_name)
    disable_plugins_str = "".join(disable_options)

    if not plugins:
        log.cl_error("everything has been disabled, nothing to build")
        return -1

    enabled_plugin_str = ", ".join(plugin.cpt_plugin_name
                                   for plugin in plugins)
    need_collectd = any(plugin.cpt_need_collectd for plugin in plugins)
    need_lustre = any(plugin.cpt_need_lustre_rpms or plugin.cpt_install_lustre
                      for plugin in plugins)

    type_fname = constant.CORAL_BUILD_CACHE_TYPE_OPEN
    log.cl_info("building ISO with %s", enabled_plugin_str)

    local_host = ssh_host.get_local_host(ssh=False)
    distro = local_host.sh_distro(log)
    if distro not in (ssh_host.DISTRO_RHEL7, ssh_host.DISTRO_RHEL8):
        log.cl_error("build on distro [%s] is not supported yet", distro)
        return -1

    shared_cache = cache.rstrip("/")
    # Shared cache for this build type
    shared_type_cache = shared_cache + "/" + type_fname
    # Extra RPMs to download
    extra_rpm_names = []
    # Extra RPM file names under package directory
    extra_package_fnames = []
    # Extra file names under ISO directory
    extra_iso_fnames = []

    type_cache = workspace + "/" + type_fname
    build_pip_dir = type_cache + "/" + constant.BUILD_PIP
    iso_cache = type_cache + "/" + constant.ISO_CACHE_FNAME
    # Directory path of package under ISO cache
    packages_dir = iso_cache + "/" + constant.BUILD_PACKAGES

    if collectd is not None:
        if need_collectd:
            # A special Collectd makes the local cache unsuitable for
            # other builds.
            sync_cache_back = False
        else:
            log.cl_warning(
                "option [--collectd %s] has been ignored since "
                "no need to have Collectd RPMs", collectd)

    if not need_lustre:
        if lustre_rpms_dir is not None:
            log.cl_warning(
                "option [--lustre %s] has been ignored since "
                "no need to have Lustre RPMs", lustre_rpms_dir)
        if e2fsprogs_rpms_dir is not None:
            log.cl_warning(
                "option [--e2fsprogs %s] has been ignored since "
                "no need to have Lustre RPMs", e2fsprogs_rpms_dir)

    if _build_mkdir(log, local_host, workspace):
        return -1

    ret = get_shared_build_cache(log, local_host, workspace, shared_type_cache)
    if ret:
        log.cl_error("failed to get shared build cache")
        return -1

    target_cpu = local_host.sh_target_cpu(log)
    if target_cpu is None:
        log.cl_error("failed to get the target cpu on host [%s]",
                     local_host.sh_hostname)
        return -1

    ret = install_build_dependency(log,
                                   workspace,
                                   local_host,
                                   distro,
                                   target_cpu,
                                   type_cache,
                                   plugins,
                                   build_pip_dir,
                                   tsinghua_mirror=tsinghua_mirror)
    if ret:
        log.cl_error("failed to install dependency for building")
        return -1

    if _build_mkdir(log, local_host, packages_dir):
        return -1

    for plugin in plugins:
        ret = plugin.cpt_build(log, workspace, local_host, source_dir,
                               target_cpu, type_cache, iso_cache, packages_dir,
                               extra_iso_fnames, extra_package_fnames,
                               extra_rpm_names, collectd)
        if ret:
            log.cl_error("failed to build plugin [%s]", plugin.cpt_plugin_name)
            return -1

    ret = download_dependent_rpms(log, workspace, local_host, distro,
                                  target_cpu, packages_dir, type_cache,
                                  extra_package_fnames, extra_rpm_names)
    if ret:
        log.cl_error("failed to download dependent rpms")
        return -1

    pip_dir = iso_cache + "/" + constant.BUILD_PIP
    if _build_mkdir(log, local_host, pip_dir):
        return -1

    ret = install_common.download_pip3_packages(
        log,
        local_host,
        pip_dir,
        constant.CORAL_DEPENDENT_PIPS,
        tsinghua_mirror=tsinghua_mirror)
    if ret:
        log.cl_error("failed to download pip3 packages")
        return -1

    # NOTE(review): nothing in this function ever provides a Lustre
    # distribution, so a build with any plugin that needs Lustre fails
    # here, after the plugins have already been built. Confirm whether
    # this is intended for this build type.
    lustre_distribution = None
    if need_lustre and lustre_distribution is None:
        log.cl_error("Lustre distribution is needed unexpectedly")
        return -1

    contents = ([constant.BUILD_PACKAGES, constant.BUILD_PIP] +
                extra_iso_fnames)
    ret = local_host.sh_check_dir_content(log,
                                          iso_cache,
                                          contents,
                                          cleanup=True)
    if ret:
        log.cl_error("directory [%s] does not have expected content",
                     iso_cache)
        return -1

    log.cl_info("generating Coral ISO")

    enable_zfs_string = ""
    if enable_zfs:
        enable_zfs_string = " --enable-zfs"
    enable_devel_string = ""
    if enable_devel:
        enable_devel_string = " --enable-devel"
    command = ("cd %s && rm coral-*.tar.bz2 coral-*.tar.gz -f && "
               "sh autogen.sh && "
               "./configure --with-iso-cache=%s%s%s%s && "
               "make -j8 && "
               "make iso" % (source_dir, iso_cache, enable_zfs_string,
                             enable_devel_string, disable_plugins_str))
    retval = local_host.sh_watched_run(log,
                                       command,
                                       None,
                                       None,
                                       return_stdout=False,
                                       return_stderr=False)
    if retval.cr_exit_status:
        log.cl_error("failed to run command [%s] on host [%s]", command,
                     local_host.sh_hostname)
        return -1

    # If there is any plugin disabled or Collectd is special, the local cache
    # might have some things missing thus should not be used by other build.
    if sync_cache_back:
        ret = sync_shared_build_cache(log, local_host, type_cache,
                                      shared_cache)
        if ret:
            log.cl_error("failed to sync to shared build cache")
            return -1

    log.cl_info("Built Coral ISO successfully")
    return 0
Example #3
0
def _brl_config_value_or_default(log, config, key, default, config_fpath):
    """
    Return the value of the key in the config, or the default if missing.

    A debug message is logged when the default is used.
    :param log: logger used for the debug message.
    :param config: config (or sub-config) to look the key up in.
    :param key: name of the config option.
    :param default: value returned when utils.config_value() gives None.
    :param config_fpath: config file path, only used in the message.
    """
    value = utils.config_value(config, key)
    if value is None:
        log.cl_debug(
            "no [%s] is configured in the config file [%s], "
            "using default value [%s]", key, config_fpath, default)
        return default
    return value


def barrele_init_instance(log, workspace, config, config_fpath, log_to_file,
                          logdir_is_default):
    """
    Parse the config and init the instance

    Options missing from the config fall back to their defaults. The agent
    list is mandatory, and each host may only be configured as an agent once.
    :param log: logger used for debug and error messages.
    :param workspace: passed through to BarreleInstance.
    :param config: parsed content of the config file.
    :param config_fpath: path of the config file, used in messages.
    :param log_to_file: passed through to BarreleInstance.
    :param logdir_is_default: passed through to BarreleInstance.
    :return: the BarreleInstance, or None if the config is invalid.
    """
    # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    collect_interval = _brl_config_value_or_default(
        log, config, barrele_constant.BRL_COLLECT_INTERVAL,
        BARRELE_COLLECT_INTERVAL, config_fpath)

    continuous_query_periods = _brl_config_value_or_default(
        log, config, barrele_constant.BRL_CONTINUOUS_QUERY_PERIODS,
        BARRELE_CONTINUOUS_QUERY_PERIODS, config_fpath)

    jobstat_pattern = _brl_config_value_or_default(
        log, config, barrele_constant.BRL_JOBSTAT_PATTERN,
        barrele_constant.BARRELE_JOBSTAT_PATTERN_UNKNOWN, config_fpath)
    if jobstat_pattern not in barrele_constant.BARRELE_JOBSTAT_PATTERNS:
        log.cl_error("unsupported jobstat_pattern [%s], supported: %s",
                     jobstat_pattern,
                     barrele_constant.BARRELE_JOBSTAT_PATTERNS)
        return None

    lustre_fallback_version_name = _brl_config_value_or_default(
        log, config, barrele_constant.BRL_LUSTRE_FALLBACK_VERSION,
        BARRELE_LUSTRE_FALLBACK_VERSION, config_fpath)
    if lustre_fallback_version_name not in lustre_version.LUSTRE_VERSION_DICT:
        log.cl_error(
            "unsupported Lustre version [%s] is configured in the "
            "config file [%s]", lustre_fallback_version_name, config_fpath)
        return None

    lustre_fallback_version = \
        lustre_version.LUSTRE_VERSION_DICT[lustre_fallback_version_name]

    enable_lustre_exp_mdt = _brl_config_value_or_default(
        log, config, barrele_constant.BRL_ENABLE_LUSTRE_EXP_MDT, False,
        config_fpath)

    enable_lustre_exp_ost = _brl_config_value_or_default(
        log, config, barrele_constant.BRL_ENABLE_LUSTRE_EXP_OST, False,
        config_fpath)

    agent_configs = utils.config_value(config, barrele_constant.BRL_AGENTS)
    if agent_configs is None:
        log.cl_error(
            "can NOT find [%s] in the config file, "
            "please correct file [%s]", barrele_constant.BRL_AGENTS,
            config_fpath)
        return None

    # The server is parsed first and shares host_dict with the agents.
    host_dict = {}
    barreleye_server = parse_server_config(log, config, config_fpath,
                                           host_dict)
    if barreleye_server is None:
        log.cl_error("failed to parse server config")
        return None

    agent_dict = {}
    for agent_config in agent_configs:
        hostname_config = utils.config_value(agent_config,
                                             barrele_constant.BRL_HOSTNAME)
        if hostname_config is None:
            # The host name is the missing value, so it can not be part of
            # the message.
            log.cl_error(
                "can NOT find [%s] in the config of an agent, "
                "please correct file [%s]",
                barrele_constant.BRL_HOSTNAME, config_fpath)
            return None

        hostnames = cmd_general.parse_list_string(log, hostname_config)
        if hostnames is None:
            log.cl_error("[%s] as [%s] is invalid in the config file [%s]",
                         hostname_config, barrele_constant.BRL_HOSTNAME,
                         config_fpath)
            return None

        ssh_identity_file = utils.config_value(
            agent_config, barrele_constant.BRL_SSH_IDENTITY_FILE)

        enable_disk = _brl_config_value_or_default(
            log, agent_config, barrele_constant.BRL_ENABLE_DISK, False,
            config_fpath)

        enable_infiniband = _brl_config_value_or_default(
            log, agent_config, barrele_constant.BRL_ENABLE_INFINIBAND, False,
            config_fpath)

        enable_lustre_client = _brl_config_value_or_default(
            log, agent_config, barrele_constant.BRL_ENABLE_LUSTRE_CLIENT,
            False, config_fpath)

        enable_lustre_mds = _brl_config_value_or_default(
            log, agent_config, barrele_constant.BRL_ENABLE_LUSTRE_MDS, True,
            config_fpath)

        enable_lustre_oss = _brl_config_value_or_default(
            log, agent_config, barrele_constant.BRL_ENABLE_LUSTRE_OSS, True,
            config_fpath)

        # One agent config may cover several hosts sharing the same options.
        for hostname in hostnames:
            if hostname in agent_dict:
                log.cl_error(
                    "agent of host [%s] is configured for multiple times",
                    hostname)
                return None
            host = ssh_host.get_or_add_host_to_dict(log, host_dict, hostname,
                                                    ssh_identity_file)
            if host is None:
                return None

            agent = barrele_agent.BarreleAgent(
                host,
                barreleye_server,
                enable_disk=enable_disk,
                enable_lustre_oss=enable_lustre_oss,
                enable_lustre_mds=enable_lustre_mds,
                enable_lustre_client=enable_lustre_client,
                enable_infiniband=enable_infiniband)
            agent_dict[hostname] = agent

    local_host = ssh_host.get_local_host()
    instance = BarreleInstance(workspace, config, config_fpath, log_to_file,
                               logdir_is_default, local_host, collect_interval,
                               continuous_query_periods, jobstat_pattern,
                               lustre_fallback_version, enable_lustre_exp_mdt,
                               enable_lustre_exp_ost, host_dict, agent_dict,
                               barreleye_server)
    return instance