Esempio n. 1
0
def execute(configurations={}, parameters={}, host_name=None):
    """
    Performs disk usage checks for the stack installation root.

    The stack root is resolved from the cluster-env stack name / stack root
    settings and handed to _get_disk_usage(); the thresholds are evaluated by
    _get_warnings_for_partition().
    NOTE(review): the original description said the check falls back to "/"
    when the installation directories do not exist -- that fallback is not
    visible in this function and presumably lives in _get_disk_usage().

    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """

    if configurations is None:
        return ('UNKNOWN',
                ['There were no configurations supplied to the script.'])

    # A missing stack setting is a configuration problem, so it is reported
    # with the UNKNOWN result code (not with a configuration key name).
    if STACK_NAME not in configurations or STACK_ROOT not in configurations:
        return ('UNKNOWN', [
            'cluster-env/stack_name and cluster-env/stack_root are required'
        ])

    path = stack_tools.get_stack_root(configurations[STACK_NAME],
                                      configurations[STACK_ROOT])

    try:
        disk_usage = _get_disk_usage(path)
        result_code, label = _get_warnings_for_partition(
            parameters, disk_usage)
    except NotImplementedError as platform_error:
        # Raised when disk usage cannot be determined on this platform.
        return 'CRITICAL', [str(platform_error)]

    # Same (code, [label]) shape as every other return path of this function.
    return result_code, [label]
Esempio n. 2
0
def get_check_command(oozie_url, host_name, configurations, parameters,
                      only_kinit):
    """
    Builds the shell command used to query the Oozie server status.

    When security is enabled a kinit is executed first (guarded by the global
    Kerberos lock) against a per-process credentials cache, and that cache is
    exposed through the returned environment.

    Arguments:
    oozie_url (string): the Oozie URL passed to "oozie admin -oozie"
    host_name (string): replaces "_HOST" in the Kerberos principal
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    only_kinit (boolean): when true, always kinit instead of first testing
      the existing cache with klist

    Returns a tuple (command, kerberos_env, user); kerberos_env is None when
    security is disabled, otherwise a dictionary holding KRB5CCNAME.
    """
    user = configurations[USER_KEY] if USER_KEY in configurations else USER_DEFAULT
    kerberos_env = None

    if is_security_enabled(configurations):
        # NOTE(review): user_keytab, user_principal, ccache_file and
        # kinit_path_local are referenced by name inside the format() template
        # below -- presumably format() resolves placeholders from the caller's
        # local variables, so these names must not be renamed.
        user_principal = USER_PRINCIPAL_DEFAULT
        user_keytab = USER_KEYTAB_DEFAULT

        # Script parameters are applied first and configurations last, so a
        # value present in configurations always takes precedence.
        principal_sources = ((parameters, USER_PRINCIPAL_SCRIPT_PARAM_KEY),
                             (configurations, USER_PRINCIPAL_KEY))
        for source, key in principal_sources:
            if key in source:
                user_principal = source[key].replace('_HOST',
                                                     host_name.lower())

        keytab_sources = ((parameters, USER_KEYTAB_SCRIPT_PARAM_KEY),
                          (configurations, USER_KEYTAB_KEY))
        for source, key in keytab_sources:
            if key in source:
                user_keytab = source[key]

        # Create the kerberos credentials cache (ccache) file and set it in
        # the environment to use when executing curl
        tmp_dir = Environment.get_instance().tmp_dir
        ccache_file = "{0}{1}oozie_alert_cc_{2}".format(
            tmp_dir, os.sep, os.getpid())
        kerberos_env = {'KRB5CCNAME': ccache_file}

        # Get the configured Kerberos executable search paths, if any
        if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
            search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]
        else:
            search_paths = None

        klist_binary = get_klist_path(search_paths)
        kinit_path_local = get_kinit_path(search_paths)
        kinit_only = format(
            "{kinit_path_local} -l 5m20s -c {ccache_file} -kt {user_keytab} {user_principal}; "
        )

        # Unless a kinit is forced, first test whether the cache exists and
        # holds non-expired tickets. Tickets are marked to expire after
        # 5 minutes to help reduce the number of kinits we do but recover
        # quickly when keytabs are regenerated.
        if only_kinit:
            ticket_check = ""
        else:
            ticket_check = "{0} -s {1} || ".format(klist_binary, ccache_file)
        kinit_command = ticket_check + kinit_only

        # prevent concurrent kinit
        kerberos_lock = global_lock.get_lock(global_lock.LOCK_TYPE_KERBEROS)
        kerberos_lock.acquire()
        try:
            Execute(kinit_command, environment=kerberos_env, user=user)
        finally:
            kerberos_lock.release()

    # Configure stack root
    if STACK_NAME_KEY in configurations and STACK_ROOT_KEY in configurations:
        resolved_root = stack_tools.get_stack_root(
            configurations[STACK_NAME_KEY], configurations[STACK_ROOT_KEY])
        stack_root = resolved_root.lower()
    else:
        stack_root = STACK_ROOT_DEFAULT

    # oozie configuration directory using a symlink; fall back to the legacy
    # location when it does not exist
    oozie_config_directory = OOZIE_CONF_DIR.replace(STACK_ROOT_PATTERN,
                                                    stack_root)
    if not os.path.exists(oozie_config_directory):
        oozie_config_directory = OOZIE_CONF_DIR_LEGACY

    command = "source {0}/oozie-env.sh ; oozie admin -oozie {1} -status".format(
        oozie_config_directory, oozie_url)

    return (command, kerberos_env, user)
Esempio n. 3
0
def execute(configurations={}, parameters={}, host_name=None):
    """
    Checks the state of the Hive LLAP application and reports it as an alert.

    Runs "hive --service llapstatus" as the Hive user (after a kinit when
    security is enabled), parses its output and maps the reported application
    state onto a result code:
      - RUNNING_ALL                      -> OK
      - RUNNING_PARTIAL, >= 80% live     -> OK
      - RUNNING_PARTIAL, otherwise       -> CRITICAL
      - any other state                  -> CRITICAL
      - unparsable output / any failure  -> UNKNOWN

    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """

    # NOTE(review): this local (like llap_app_name, kinit_path_local,
    # llap_keytab and llap_principal below) is referenced by name inside the
    # format() templates -- presumably format() resolves placeholders from the
    # caller's local variables, so these names must not be renamed or inlined.
    LLAP_APP_STATUS_CMD_TIMEOUT = 0

    if configurations is None:
        return ('UNKNOWN',
                ['There were no configurations supplied to the script.'])

    result_code = None

    try:
        security_enabled = False
        if SECURITY_ENABLED_KEY in configurations:
            security_enabled = str(
                configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

        # The timeout is a script parameter: test the same mapping it is read
        # from (previously the key was looked up in configurations, so the
        # override was either ignored or raised a KeyError).
        check_command_timeout = CHECK_COMMAND_TIMEOUT_DEFAULT
        if parameters is not None and CHECK_COMMAND_TIMEOUT_KEY in parameters:
            check_command_timeout = int(parameters[CHECK_COMMAND_TIMEOUT_KEY])

        hive_user = HIVE_USER_DEFAULT
        if HIVE_USER_KEY in configurations:
            hive_user = configurations[HIVE_USER_KEY]

        llap_app_name = LLAP_APP_NAME_DEFAULT
        if LLAP_APP_NAME_KEY in configurations:
            llap_app_name = configurations[LLAP_APP_NAME_KEY]

        if security_enabled:
            if HIVE_PRINCIPAL_KEY in configurations:
                llap_principal = configurations[HIVE_PRINCIPAL_KEY]
            else:
                llap_principal = HIVE_PRINCIPAL_DEFAULT
            llap_principal = llap_principal.replace('_HOST', host_name.lower())

            llap_keytab = HIVE_PRINCIPAL_KEYTAB_DEFAULT
            if HIVE_PRINCIPAL_KEYTAB_KEY in configurations:
                llap_keytab = configurations[HIVE_PRINCIPAL_KEYTAB_KEY]

            # Get the configured Kerberos executable search paths, if any
            if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
                kerberos_executable_search_paths = configurations[
                    KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]
            else:
                kerberos_executable_search_paths = None

            kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
            kinitcmd = format(
                "{kinit_path_local} -kt {llap_keytab} {llap_principal}; ")

            # prevent concurrent kinit
            kinit_lock = global_lock.get_lock(global_lock.LOCK_TYPE_KERBEROS)
            kinit_lock.acquire()
            try:
                Execute(kinitcmd,
                        user=hive_user,
                        path=[
                            "/bin/", "/usr/bin/", "/usr/lib/hive/bin/",
                            "/usr/sbin/"
                        ],
                        timeout=10)
            finally:
                kinit_lock.release()

        start_time = time.time()
        if STACK_NAME in configurations and STACK_ROOT in configurations:
            stack_root = stack_tools.get_stack_root(configurations[STACK_NAME],
                                                    configurations[STACK_ROOT])

            llap_status_cmd = stack_root + format(
                "/current/hive-server2-hive2/bin/hive --service llapstatus --name {llap_app_name}  --findAppTimeout {LLAP_APP_STATUS_CMD_TIMEOUT}"
            )
        else:
            llap_status_cmd = STACK_ROOT_DEFAULT + format(
                "/current/hive-server2-hive2/bin/hive --service llapstatus --name {llap_app_name} --findAppTimeout {LLAP_APP_STATUS_CMD_TIMEOUT}"
            )

        code, output, error = shell.checked_call(llap_status_cmd,
                                                 user=hive_user,
                                                 stderr=subprocess.PIPE,
                                                 timeout=check_command_timeout,
                                                 logoutput=False)
        # Call for getting JSON
        llap_app_info = make_valid_json(output)

        if llap_app_info is None or 'state' not in llap_app_info:
            # No exception is active here, so a traceback would only render as
            # "None"; report what actually went wrong instead.
            alert_label = ("Unable to determine the LLAP application state: "
                           "the llapstatus output did not contain a 'state' "
                           "field.")
            return (UKNOWN_STATUS_CODE, [alert_label])

        retrieved_llap_app_state = llap_app_info['state'].upper()
        # Human readable form of the state; falls back to the raw state.
        state_label = llap_app_state_dict.get(retrieved_llap_app_state,
                                              retrieved_llap_app_state)
        total_time = time.time() - start_time

        if retrieved_llap_app_state == 'RUNNING_ALL':
            result_code = OK_RESULT_CODE
            alert_label = OK_MESSAGE.format(state_label, total_time)
        elif retrieved_llap_app_state == 'RUNNING_PARTIAL':
            # Minimum percentage of desired instances that must be live for a
            # partially running application to still count as OK.
            percent_desired_instances_to_be_up = 80

            # Get 'live' and 'desired' instances
            if 'liveInstances' not in llap_app_info or 'desiredInstances' not in llap_app_info:
                alert_label = CRITICAL_MESSAGE_WITH_STATE.format(
                    state_label, total_time)
                return (CRITICAL_RESULT_CODE, [alert_label])

            live_instances = llap_app_info['liveInstances']
            desired_instances = llap_app_info['desiredInstances']
            # Also protects the division below against a zero divisor.
            if live_instances < 0 or desired_instances <= 0:
                alert_label = CRITICAL_MESSAGE_WITH_STATE.format(
                    state_label, total_time)
                return (CRITICAL_RESULT_CODE, [alert_label])

            percent_instances_up = float(
                live_instances) / desired_instances * 100
            if percent_instances_up >= percent_desired_instances_to_be_up:
                result_code = OK_RESULT_CODE
            else:
                result_code = CRITICAL_RESULT_CODE
            alert_label = MESSAGE_WITH_STATE_AND_INSTANCES.format(
                state_label, total_time, live_instances, desired_instances)
        else:
            result_code = CRITICAL_RESULT_CODE
            alert_label = CRITICAL_MESSAGE_WITH_STATE.format(
                state_label, total_time)
    except:
        # Deliberately broad: whatever fails above is reported to the caller
        # as UNKNOWN together with the traceback instead of propagating.
        alert_label = traceback.format_exc()
        result_code = UKNOWN_STATUS_CODE
    return (result_code, [alert_label])
Esempio n. 4
0
    StackFeature.RANGER_AUDIT_DB_SUPPORT, version_for_stack_feature_checks)
# Whether the stack version used for feature checks supports core-site for
# Ranger plugins.
stack_supports_core_site_for_ranger_plugin = check_stack_feature(
    StackFeature.CORE_SITE_FOR_RANGER_PLUGINS_SUPPORT,
    version_for_stack_feature_checks)

# This is the version whose state is CURRENT. During an RU, this is the source version.
# DO NOT format it since we need the build number too.
upgrade_from_version = upgrade_summary.get_source_version()

# Resolve the stack being upgraded from: prefer the value passed in the
# command parameters, otherwise ask the upgrade summary for the KNOX service.
source_stack = default("/commandParams/source_stack", None)
if source_stack is None:
    source_stack = upgrade_summary.get_source_stack("KNOX")
source_stack_name = get_stack_name(source_stack)
# Only compute a separate root when the source stack name is known and differs
# from the current stack name; otherwise reuse the current stack root.
if source_stack_name is not None and source_stack_name != stack_name:
    source_stack_root = get_stack_root(
        source_stack_name,
        default('/configurations/cluster-env/stack_root', None))
else:
    source_stack_root = stack_root

# server configurations
# Default value used in HDP 2.3.0.0 and earlier.
knox_data_dir = '/var/lib/knox/data'

# Important, it has to be strictly greater than 2.3.0.0!!!
# NOTE(review): format() presumably resolves {version_formatted} from a
# module-level variable defined outside this excerpt -- confirm.
Logger.info(format("Stack version to use is {version_formatted}"))
if version_formatted and check_stack_feature(
        StackFeature.KNOX_VERSIONED_DATA_DIR, version_formatted):
    # This is the current version. In the case of a Rolling Upgrade, it will be the newer version.
    # In the case of a Downgrade, it will be the version downgrading to.
    # This is always going to be a symlink to /var/lib/knox/data_${version}
def execute(configurations={}, parameters={}, host_name=None):
    """
    Checks that the Hive metastore on this host answers a simple query.

    Picks the metastore URI that contains this host's name, optionally runs a
    kinit as the smoke user (when security is enabled), and then executes
    "show databases;" through the hive command line against that URI.

    Result codes:
      - OK        the hive command succeeded
      - CRITICAL  the hive command failed or timed out
      - UNKNOWN   missing configuration, no metastore URI for this host, or
                  any failure while preparing the check

    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """

    if configurations is None:
        return ('UNKNOWN',
                ['There were no configurations supplied to the script.'])

    if HIVE_METASTORE_URIS_KEY not in configurations:
        return ('UNKNOWN',
                ['Hive metastore uris were not supplied to the script.'])

    metastore_uris = configurations[HIVE_METASTORE_URIS_KEY].split(',')

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(
            configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    check_command_timeout = CHECK_COMMAND_TIMEOUT_DEFAULT
    if CHECK_COMMAND_TIMEOUT_KEY in parameters:
        check_command_timeout = float(parameters[CHECK_COMMAND_TIMEOUT_KEY])

    # NOTE(review): smokeuser_keytab, smokeuser_principal, kinit_path_local,
    # conf_dir and metastore_uri are referenced by name inside the format()
    # templates below -- presumably format() resolves placeholders from the
    # caller's local variables, so these names must not be renamed.

    # defaults
    smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT
    smokeuser_principal = SMOKEUSER_PRINCIPAL_DEFAULT
    smokeuser = SMOKEUSER_DEFAULT

    # check script params
    if SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
        smokeuser_principal = parameters[SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY]

    if SMOKEUSER_SCRIPT_PARAM_KEY in parameters:
        smokeuser = parameters[SMOKEUSER_SCRIPT_PARAM_KEY]

    if SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
        smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY]

    # check configurations last as they should always take precedence
    if SMOKEUSER_PRINCIPAL_KEY in configurations:
        smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY]

    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    result_code = None

    try:
        if security_enabled:
            if SMOKEUSER_KEYTAB_KEY in configurations:
                smokeuser_keytab = configurations[SMOKEUSER_KEYTAB_KEY]

            # Get the configured Kerberos executable search paths, if any
            if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
                kerberos_executable_search_paths = configurations[
                    KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]
            else:
                kerberos_executable_search_paths = None

            kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
            kinitcmd = format(
                "{kinit_path_local} -kt {smokeuser_keytab} {smokeuser_principal}; "
            )

            # prevent concurrent kinit
            kinit_lock = global_lock.get_lock(global_lock.LOCK_TYPE_KERBEROS)
            kinit_lock.acquire()
            try:
                Execute(kinitcmd,
                        user=smokeuser,
                        path=[
                            "/bin/", "/usr/bin/", "/usr/lib/hive/bin/",
                            "/usr/sbin/"
                        ],
                        timeout=10)
            finally:
                kinit_lock.release()

        if host_name is None:
            host_name = socket.getfqdn()

        # Select the metastore URI that refers to this host. When several
        # URIs match, the last one wins (unchanged from the previous logic).
        metastore_uri = None
        for uri in metastore_uris:
            if host_name in uri:
                metastore_uri = uri

        if metastore_uri is None:
            # Previously this left metastore_uri unbound and surfaced as an
            # opaque traceback from the command formatting below.
            return ('UNKNOWN', [
                "None of the configured Hive metastore uris ({0}) refer to host {1}."
                .format(','.join(metastore_uris), host_name)
            ])

        conf_dir = HIVE_CONF_DIR_LEGACY
        bin_dir = HIVE_BIN_DIR_LEGACY

        # Prefer the stack's hive-metastore directories when they exist.
        if STACK_NAME in configurations and STACK_ROOT in configurations:
            stack_root = stack_tools.get_stack_root(configurations[STACK_NAME],
                                                    configurations[STACK_ROOT])
            hive_conf_dir = stack_root + format("/current/hive-metastore/conf")
            hive_bin_dir = stack_root + format("/current/hive-metastore/bin")

            if os.path.exists(hive_conf_dir):
                conf_dir = hive_conf_dir
                bin_dir = hive_bin_dir

        cmd = format("export HIVE_CONF_DIR='{conf_dir}' ; "
                     "hive --hiveconf hive.metastore.uris={metastore_uri}\
                 --hiveconf hive.metastore.client.connect.retry.delay=1\
                 --hiveconf hive.metastore.failure.retries=1\
                 --hiveconf hive.metastore.connect.retries=1\
                 --hiveconf hive.metastore.client.socket.timeout=14\
                 --hiveconf hive.execution.engine=mr -e 'show databases;'")

        start_time = time.time()

        try:
            Execute(
                cmd,
                user=smokeuser,
                path=["/bin/", "/usr/bin/", "/usr/sbin/", bin_dir],
                timeout=int(check_command_timeout),
                timeout_kill_strategy=TerminateStrategy.KILL_PROCESS_TREE,
            )

            total_time = time.time() - start_time

            result_code = 'OK'
            label = OK_MESSAGE.format(total_time)
        except:
            # Any failure of the hive command itself means the metastore
            # check failed.
            result_code = 'CRITICAL'
            label = CRITICAL_MESSAGE.format(host_name, traceback.format_exc())

    except:
        # Deliberately broad: a failure while preparing the check is reported
        # as UNKNOWN together with the traceback instead of propagating.
        label = traceback.format_exc()
        result_code = 'UNKNOWN'

    return (result_code, [label])