Beispiel #1
0
def falcon(type, action = None, upgrade_type=None):
  """
  Write the Falcon configuration files and/or control the Falcon Windows service.

  :param type: component kind; the Windows service is only touched for 'server'
  :param action: 'config' writes the configuration files; for the server
    component 'start' and 'stop' are forwarded to the Windows service
  :param upgrade_type: accepted but not read by this function
  """
  import params

  if action == 'config':
    # These 2 parameters are used in ../templates/client.properties.j2
    shared_params = Environment.get_instance().config.params
    shared_params["falcon_host"] = params.falcon_host
    shared_params["falcon_port"] = params.falcon_port

    env_sh_path = os.path.join(params.falcon_conf_dir, 'falcon-env.sh')
    File(env_sh_path, content = InlineTemplate(params.falcon_env_sh_template))

    runtime_path = os.path.join(params.falcon_conf_dir, 'runtime.properties')
    PropertiesFile(runtime_path, properties = params.falcon_runtime_properties)

    startup_path = os.path.join(params.falcon_conf_dir, 'startup.properties')
    PropertiesFile(startup_path, properties = params.falcon_startup_properties)

    client_path = os.path.join(params.falcon_conf_dir, 'client.properties')
    PropertiesFile(client_path, properties = params.falcon_client_properties)

  # Everything below only concerns the server component.
  if type != 'server':
    return

  # The service account is (re)applied on every call, whatever the action is.
  ServiceConfig(params.falcon_win_service_name,
    action = "change_user",
    username = params.falcon_user,
    password = Script.get_password(params.falcon_user))

  if action == 'start':
    Service(params.falcon_win_service_name, action = "start")
  elif action == 'stop':
    Service(params.falcon_win_service_name, action = "stop")
Beispiel #2
0
    def format(self, format_string, *args, **kwargs):
        """
        Format ``format_string`` using ``kwargs`` combined with the parameters
        of the current Environment.

        The two mappings are combined by ``checked_unite``; positional ``args``
        are handed to ``vformat`` unchanged.
        """
        env_params = Environment.get_instance().config.params
        merged = checked_unite(kwargs, env_params)
        return self.vformat(format_string, args, merged)
Beispiel #3
0
  def format(self, format_string, *args, **kwargs):
    """
    Render ``format_string`` with the protected and then the unprotected field
    converter and return the unprotected rendering.

    Values come from a copy of the environment parameters (only when an
    Environment instance exists) overlaid with ``kwargs``. When the two
    renderings differ, the protected text is stored in
    ``Logger.sensitive_strings`` under the unprotected text.
    """
    # don't use checked_unite for this as it would interfere with reload(module)
    # for things like params and status_params; instead, start out copying
    # the environment parameters and add in any locally declared variables to
    # override existing env parameters
    if Environment.has_instance():
      all_params = Environment.get_instance().config.params.copy()
    else:
      all_params = {}
    all_params.update(kwargs)

    # First pass: protected converter.
    self.convert_field = self.convert_field_protected
    masked = self.vformat(format_string, args, all_params)

    # Second pass: unprotected converter (left installed on the instance).
    self.convert_field = self.convert_field_unprotected
    plain = self.vformat(format_string, args, all_params)

    if masked != plain:
      Logger.sensitive_strings[plain] = masked

    return plain
Beispiel #4
0
    def format(self, format_string, *args, **kwargs):
        """
        Format ``format_string`` twice (protected, then unprotected field
        conversion) and return the unprotected result.

        The substitution values are the environment parameters, if an
        Environment instance exists, overridden by ``kwargs``. A differing
        pair of results is registered in ``Logger.sensitive_strings`` keyed by
        the unprotected text.
        """
        # don't use checked_unite for this as it would interfere with reload(module)
        # for things like params and status_params; instead, start out copying
        # the environment parameters and add in any locally declared variables to
        # override existing env parameters
        all_params = (Environment.get_instance().config.params.copy()
                      if Environment.has_instance() else {})
        all_params.update(kwargs)

        self.convert_field = self.convert_field_protected
        protected_text = self.vformat(format_string, args, all_params)

        self.convert_field = self.convert_field_unprotected
        unprotected_text = self.vformat(format_string, args, all_params)

        if protected_text != unprotected_text:
            Logger.sensitive_strings[unprotected_text] = protected_text

        return unprotected_text
Beispiel #5
0
def falcon(type, action = None):
  """
  Write the Falcon configuration files and/or control the Falcon Windows service.

  :param type: component kind; the Windows service is only touched for 'server'
  :param action: 'config' writes the configuration files; for the server
    component 'start' and 'stop' are forwarded to the Windows service
  """
  import params

  if action == 'config':
    # These 2 parameters are used in ../templates/client.properties.j2
    template_params = Environment.get_instance().config.params
    template_params["falcon_host"] = params.falcon_host
    template_params["falcon_port"] = params.falcon_port

    File(
      os.path.join(params.falcon_conf_dir, 'falcon-env.sh'),
      content = InlineTemplate(params.falcon_env_sh_template)
    )

    # client.properties is rendered from its template rather than a property map.
    File(
      os.path.join(params.falcon_conf_dir, 'client.properties'),
      content = Template('client.properties.j2')
    )

    PropertiesFile(
      os.path.join(params.falcon_conf_dir, 'runtime.properties'),
      properties = params.falcon_runtime_properties
    )

    PropertiesFile(
      os.path.join(params.falcon_conf_dir, 'startup.properties'),
      properties = params.falcon_startup_properties
    )

  # Nothing else to do unless this is the server component.
  if type != 'server':
    return

  # The service account is (re)applied on every call, whatever the action is.
  ServiceConfig(params.falcon_win_service_name,
    action = "change_user",
    username = params.falcon_user,
    password = Script.get_password(params.falcon_user))

  if action == 'start':
    Service(params.falcon_win_service_name, action = "start")
  elif action == 'stop':
    Service(params.falcon_win_service_name, action = "stop")
Beispiel #6
0
def get_check_command(oozie_url, host_name, configurations, parameters):
  """
  Build the shell command that queries the Oozie server status.

  When security is enabled in ``configurations`` this also runs a combined
  klist/kinit command against a per-process credentials cache before returning.

  oozie_url -- URL passed to ``oozie admin -oozie``
  host_name -- lower-cased and substituted for ``_HOST`` in the principal
  configurations -- mapping of configuration key to value
  parameters -- mapping of script parameter key to value

  Returns a tuple ``(command, kerberos_env, user)``; ``kerberos_env`` is None
  when security is not enabled.
  """
  kerberos_env = None

  # The user falls back to the default when it is not configured
  user = USER_DEFAULT
  if USER_KEY in configurations:
    user = configurations[USER_KEY]

  # Any value whose string form is not 'TRUE' (case-insensitive) counts as disabled
  security_enabled = False
  if SECURITY_ENABLED in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED]).upper() == 'TRUE'

  if security_enabled:
    # defaults
    user_keytab = USER_KEYTAB_DEFAULT
    user_principal = USER_PRINCIPAL_DEFAULT

    # check script params
    if USER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
      user_principal = parameters[USER_PRINCIPAL_SCRIPT_PARAM_KEY]
      user_principal = user_principal.replace('_HOST', host_name.lower())
    if USER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
      user_keytab = parameters[USER_KEYTAB_SCRIPT_PARAM_KEY]

    # check configurations last as they should always take precedence
    if USER_PRINCIPAL_KEY in configurations:
      user_principal = configurations[USER_PRINCIPAL_KEY]
      user_principal = user_principal.replace('_HOST', host_name.lower())
    if USER_KEYTAB_KEY in configurations:
      user_keytab = configurations[USER_KEYTAB_KEY]

    # Create the kerberos credentials cache (ccache) file and set it in the environment to use
    # when executing curl
    env = Environment.get_instance()
    ccache_file = "{0}{1}oozie_alert_cc_{2}".format(env.tmp_dir, os.sep, os.getpid())
    kerberos_env = {'KRB5CCNAME': ccache_file}

    # Get the configured Kerberos executable search paths, if any
    kerberos_executable_search_paths = None
    if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
      kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]

    klist_path_local = get_klist_path(kerberos_executable_search_paths)
    kinit_path_local = get_kinit_path(kerberos_executable_search_paths)

    # Determine if we need to kinit by testing to see if the relevant cache exists and has
    # non-expired tickets.  Tickets are marked to expire after 5 minutes to help reduce the number
    # of kinits we do but recover quickly when keytabs are regenerated

    # The shell "||" makes kinit run only when "klist -s" reports failure.
    # NOTE(review): the bare format(...) call receives no arguments, so its {placeholders}
    # are presumably resolved from this function's local variables - keep those names intact.
    kinit_command = "{0} -s {1} || ".format(klist_path_local, ccache_file) + format("{kinit_path_local} -l 5m20s -c {ccache_file} -kt {user_keytab} {user_principal}; ")
    Execute(kinit_command, environment=kerberos_env, user=user)

  # oozie configuration directory uses a symlink when > HDP 2.2
  oozie_config_directory = OOZIE_CONF_DIR_LEGACY
  if os.path.exists(OOZIE_CONF_DIR):
    oozie_config_directory = OOZIE_CONF_DIR

  command = "source {0}/oozie-env.sh ; oozie admin -oozie {1} -status".format(
    oozie_config_directory, oozie_url)

  return (command, kerberos_env, user)
Beispiel #7
0
    def __new__(cls, name, env=None, provider=None, **kwargs):
        """
        Create the resource registered under ``name``, or return the one that
        is already registered for this class in the environment.

        When ``name`` is a list, a resource is created for every entry except
        the last (the entries are popped from the list) and the last entry is
        handled by this call. An already-registered resource gets ``kwargs``
        applied through ``override``.

        Raises Fail when the registered resource has a different provider.
        """
        if isinstance(name, list):
            # Consume all leading names; the list is left holding only the last one.
            while len(name) != 1:
                cls(name.pop(0), env, provider, **kwargs)

            name = name[0]

        if not env:
            env = Environment.get_instance()
        if not provider:
            provider = getattr(cls, 'provider', None)

        r_type = cls.__name__
        if r_type not in env.resources:
            env.resources[r_type] = {}
        registered = env.resources[r_type]

        if name in registered:
            existing = registered[name]
            if existing.provider != provider:
                raise Fail(
                    "Duplicate resource %r with a different provider %r != %r" %
                    (existing, provider, existing.provider))

            existing.override(**kwargs)
            return existing

        # First time this name is seen for the class: create and register it.
        created = super(Resource, cls).__new__(cls)
        registered[name] = created
        env.resource_list.append(created)
        return created
Beispiel #8
0
    def action_delayed_for_nameservice(self, nameservice, action_name,
                                       main_resource):
        """
        Queue one resource description for a later batched execution.

        The description is a dict built from RESOURCE_TO_JSON_FIELDS and is
        appended to ``env.config['hdfs_files_sudo']`` when
        ``main_resource.create_as_root`` is set, otherwise to
        ``env.config['hdfs_files']``.

        nameservice -- stored in the description under 'nameservice'
        action_name -- stored under the JSON field mapped from 'action'
        main_resource -- object exposing ``resource``, ``create_as_root`` and
                         ``manage_if_exists``
        """
        resource = {}
        env = Environment.get_instance()
        env_dict_key = 'hdfs_files_sudo' if main_resource.create_as_root else 'hdfs_files'

        if main_resource.create_as_root:
            Logger.info("Will create {0} as root user".format(
                main_resource.resource.target))

        if env_dict_key not in env.config:
            env.config[env_dict_key] = []

        # Put values in dictionary-resource
        # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only).
        for field_name, json_field_name in RESOURCE_TO_JSON_FIELDS.items():
            if field_name == 'action':
                resource[json_field_name] = action_name
            elif field_name == 'mode' and main_resource.resource.mode:
                # "%o" gives the bare octal digits everywhere; oct(mode)[1:]
                # yields 'o755' on Python 3 and '755L' for a Python 2 long.
                resource[json_field_name] = "%o" % main_resource.resource.mode
            elif field_name == 'manage_if_exists':
                resource[json_field_name] = main_resource.manage_if_exists
            elif getattr(main_resource.resource, field_name):
                # Only fields with a truthy value are emitted.
                resource[json_field_name] = getattr(main_resource.resource,
                                                    field_name)

        resource['nameservice'] = nameservice

        # Add resource to create
        env.config[env_dict_key].append(resource)
def get_check_command(oozie_url, host_name, configurations):
  """
  Build the shell command that queries the Oozie server status.

  When security is enabled, a per-process credentials cache is checked with
  klist as the Oozie user and kinit is executed if that check fails.

  oozie_url -- URL passed to ``oozie admin -oozie``
  host_name -- substituted for ``_HOST`` in the Oozie principal
  configurations -- mapping of configuration key to value

  Returns a tuple ``(command, kerberos_env, oozie_user)``; ``kerberos_env`` is
  None when security is not enabled.

  Raises Exception when the Oozie user is not configured, and
  KerberosPropertiesNotFound when security is enabled but the keytab or the
  principal is missing.
  """
  if OOZIE_USER in configurations:
    oozie_user = configurations[OOZIE_USER]
  else:
    raise Exception("Oozie user is required")
    
  # Any value whose string form is not 'TRUE' (case-insensitive) counts as disabled
  security_enabled = False
  if SECURITY_ENABLED in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED]).upper() == 'TRUE'
  kerberos_env = None
  if security_enabled:
    if OOZIE_KEYTAB in configurations and OOZIE_PRINCIPAL in configurations:
      oozie_keytab = configurations[OOZIE_KEYTAB]
      oozie_principal = configurations[OOZIE_PRINCIPAL]

      # substitute _HOST in kerberos principal with actual fqdn
      oozie_principal = oozie_principal.replace('_HOST', host_name)
    else:
      raise KerberosPropertiesNotFound('The Oozie keytab and principal are required configurations when security is enabled.')

    # Create the kerberos credentials cache (ccache) file and set it in the environment to use
    # when executing curl
    env = Environment.get_instance()
    ccache_file = "{0}{1}oozie_alert_cc_{2}".format(env.tmp_dir, os.sep, os.getpid())
    kerberos_env = {'KRB5CCNAME': ccache_file}

    # Get the configured Kerberos executable search paths, if any
    if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
      kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]
    else:
      kerberos_executable_search_paths = None

    klist_path_local = get_klist_path(kerberos_executable_search_paths)
    # NOTE(review): format(...) receives no arguments, so its {placeholders} are presumably
    # resolved from this function's local variables - keep those names intact.
    klist_command = format("{klist_path_local} -s {ccache_file}")

    # Determine if we need to kinit by testing to see if the relevant cache exists and has
    # non-expired tickets.  Tickets are marked to expire after 5 minutes to help reduce the number
    # of kinits we do but recover quickly when keytabs are regenerated
    return_code, _ = call(klist_command, user=oozie_user)
    if return_code != 0:
      kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
      kinit_command = format("{kinit_path_local} -l 5m -kt {oozie_keytab} {oozie_principal}; ")

      # kinit
      Execute(kinit_command, 
              environment=kerberos_env,
              user=oozie_user,
      )

  # oozie configuration directory uses a symlink when > HDP 2.2
  oozie_config_directory = OOZIE_CONF_DIR_LEGACY
  if os.path.exists(OOZIE_CONF_DIR):
    oozie_config_directory = OOZIE_CONF_DIR

  command = "source {0}/oozie-env.sh ; oozie admin -oozie {1} -status".format(
    oozie_config_directory, oozie_url)

  return (command, kerberos_env, oozie_user)
def get_check_command(oozie_url, host_name, configurations):
    """
    Build the shell command that queries the Oozie server status.

    When security is enabled, a per-process credentials cache is checked with
    klist and kinit is executed if that check fails.

    oozie_url -- URL passed to ``oozie admin -oozie``
    host_name -- substituted for ``_HOST`` in the Oozie principal
    configurations -- mapping of configuration key to value

    Returns a tuple ``(command, kerberos_env)``; ``kerberos_env`` is None when
    security is not enabled.

    Raises KerberosPropertiesNotFound when security is enabled but the keytab
    or the principal is missing.
    """
    # Any value whose string form is not 'TRUE' (case-insensitive) counts as disabled
    security_enabled = False
    if SECURITY_ENABLED in configurations:
        security_enabled = str(
            configurations[SECURITY_ENABLED]).upper() == 'TRUE'
    kerberos_env = None
    if security_enabled:
        if OOZIE_KEYTAB in configurations and OOZIE_PRINCIPAL in configurations:
            oozie_keytab = configurations[OOZIE_KEYTAB]
            oozie_principal = configurations[OOZIE_PRINCIPAL]

            # substitute _HOST in kerberos principal with actual fqdn
            oozie_principal = oozie_principal.replace('_HOST', host_name)
        else:
            raise KerberosPropertiesNotFound(
                'The Oozie keytab and principal are required configurations when security is enabled.'
            )

        # Create the kerberos credentials cache (ccache) file and set it in the environment to use
        # when executing curl
        env = Environment.get_instance()
        ccache_file = "{0}{1}oozie_alert_cc_{2}".format(
            env.tmp_dir, os.sep, os.getpid())
        kerberos_env = {'KRB5CCNAME': ccache_file}

        # Get the configured Kerberos executable search paths, if any
        if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
            kerberos_executable_search_paths = configurations[
                KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]
        else:
            kerberos_executable_search_paths = None

        klist_path_local = get_klist_path(kerberos_executable_search_paths)
        # NOTE(review): format(...) receives no arguments, so its {placeholders} are presumably
        # resolved from this function's local variables - keep those names intact.
        klist_command = format("{klist_path_local} -s {ccache_file}")

        # Determine if we need to kinit by testing to see if the relevant cache exists and has
        # non-expired tickets.  Tickets are marked to expire after 5 minutes to help reduce the number
        # of kinits we do but recover quickly when keytabs are regenerated
        return_code, _ = call(klist_command)
        if return_code != 0:
            kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
            kinit_command = format(
                "{kinit_path_local} -l 5m -kt {oozie_keytab} {oozie_principal}; "
            )

            # kinit
            Execute(kinit_command, environment=kerberos_env)

    # oozie configuration directory uses a symlink when > HDP 2.2
    oozie_config_directory = OOZIE_CONF_DIR_LEGACY
    if os.path.exists(OOZIE_CONF_DIR):
        oozie_config_directory = OOZIE_CONF_DIR

    command = "source {0}/oozie-env.sh ; oozie admin -oozie {1} -status".format(
        oozie_config_directory, oozie_url)

    return (command, kerberos_env)
    def configure(self, env):
        """
        Set up the configuration directory for this component.

        For the 'monitor' component with monitor security enabled this also
        generates random keystore/truststore passwords, creates a key pair,
        exports its certificate and imports it into a truststore with keytool,
        then publishes keystore.jks and cacerts.jks via accumulo_StaticFile.
        Every other case only calls ``setup_conf_dir``.

        env -- execution environment; receives the params module via set_params
        """
        import params
        env.set_params(params)

        if params.monitor_security_enabled and self.component == 'monitor':
            import os
            import random
            import string

            basedir = Environment.get_instance().config.basedir
            keystore_file = os.path.join(basedir, "files", "keystore.jks")
            truststore_file = os.path.join(basedir, "files", "cacerts.jks")
            cert_file = os.path.join(basedir, "files", "server.cer")

            # Never overwrite existing certificate material.
            if os.path.exists(keystore_file) or os.path.exists(
                    truststore_file) or os.path.exists(cert_file):
                self.fail_with_error(
                    "trying to create monitor certs but they already existed")

            # ascii_* constants are locale-independent and exist on Python 2 and 3
            # (string.lowercase / string.uppercase are locale-dependent, Python 2 only).
            goodchars = string.ascii_lowercase + string.ascii_uppercase + string.digits + '#%+,-./:=?@^_'
            # Passwords are secrets: draw them from the OS entropy source instead
            # of the default, predictable Mersenne Twister generator.
            # NOTE(review): no extra local names are introduced on purpose -
            # format() below appears to resolve placeholders from local variables.
            keypass = ''.join(random.SystemRandom().choice(goodchars) for x in range(20))
            storepass = ''.join(random.SystemRandom().choice(goodchars) for x in range(20))

            https_params = {}
            https_params[params.keystore_property] = params.keystore_path
            https_params[params.truststore_property] = params.truststore_path
            https_params[params.keystore_password_property] = keypass
            https_params[params.truststore_password_property] = storepass

            setup_conf_dir(name=self.component, extra_params=https_params)

            # NOTE(review): the passwords are embedded in the keytool command
            # lines, where other local users may be able to see them - confirm
            # this is acceptable.
            Execute(format(
                "{java64_home}/bin/keytool -genkey -alias \"default\" -keyalg RSA -keypass {keypass} -storepass {storepass} -keystore {keystore_file} -dname \"CN=Unknown, OU=Unknown, O=Unknown, L=Unknown, ST=Unknown, C=Unknown\""
            ),
                    user=params.accumulo_user)
            Execute(format(
                "{java64_home}/bin/keytool -export -alias \"default\" -storepass {storepass} -file {cert_file} -keystore {keystore_file}"
            ),
                    user=params.accumulo_user)
            Execute(format(
                "echo \"yes\" | {java64_home}/bin/keytool -import -v -trustcacerts -alias \"default\" -file {cert_file} -keystore {truststore_file} -keypass {keypass} -storepass {storepass}"
            ),
                    user=params.accumulo_user)

            accumulo_StaticFile("keystore.jks")
            accumulo_StaticFile("cacerts.jks")

        else:
            setup_conf_dir(name=self.component)
Beispiel #12
0
  def __get_delegation_token(self, user, keytab, principal, kinit_path):
    """
    Gets the kerberos delegation token from name node

    Requests ``/webhdfs/v1/?op=GETDELEGATIONTOKEN`` on ``params.namenode_path``
    through ``curl_krb_request`` and returns the ``Token.urlString`` value of
    the JSON response.

    When the response cannot be parsed or carries no token, the problem is
    logged, ``self.checks_failed`` is incremented and None is returned
    (previously a missing key or unparsable body raised instead of being
    counted as a failed check).
    """
    import params
    url = params.namenode_path + "/webhdfs/v1/?op=GETDELEGATIONTOKEN"
    Logger.info("Getting delegation token from {0}".format(url))
    response, _, _  = curl_krb_request(Environment.get_instance().tmp_dir, keytab, principal,
        url, "get_delegation_token", kinit_path, False, "Delegation Token", user)
    try:
      json_response = json.loads(response)
    except (TypeError, ValueError):
      # Not JSON (or no body at all): keep the raw response for the error message.
      json_response = response

    # Use lookups that tolerate a missing 'Token' / 'urlString' key so the
    # failure path below is reached.
    token = json_response.get('Token') if isinstance(json_response, dict) else None
    if isinstance(token, dict) and token.get('urlString'):
      return token['urlString']

    error_msg = "Get Token: Unable to get kerberos delegation token from webhdfs: \nurl = {0}, user = {1}, keytab = {2}, principal = {3}, kinit-path = {4} \nresponse = {5}".format(url, user, keytab, principal, kinit_path, json_response)
    Logger.error(error_msg)
    self.checks_failed += 1
Beispiel #13
0
    def format(self, format_string, *args, **kwargs):
        """
        Format ``format_string`` twice (protected, then unprotected field
        conversion) and return the unprotected result.

        The substitution values are ``kwargs`` combined with the environment
        parameters by ``checked_unite``. When the two results differ, the
        protected text is stored in ``Logger.sensitive_strings`` under the
        unprotected text.
        """
        all_params = checked_unite(kwargs,
                                   Environment.get_instance().config.params)

        self.convert_field = self.convert_field_protected
        masked = self.vformat(format_string, args, all_params)

        # The unprotected converter stays installed after this call.
        self.convert_field = self.convert_field_unprotected
        plain = self.vformat(format_string, args, all_params)

        if masked != plain:
            Logger.sensitive_strings[plain] = masked

        return plain
Beispiel #14
0
    def format(self, format_string, *args, **kwargs):
        """
        Render ``format_string`` with the protected and then the unprotected
        field converter and return the unprotected rendering.

        Values are ``kwargs`` united with the current environment parameters
        via ``checked_unite``. A differing pair of renderings is recorded in
        ``Logger.sensitive_strings`` (unprotected text -> protected text).
        """
        env_params = Environment.get_instance().config.params
        values = checked_unite(kwargs, env_params)

        # Protected pass.
        self.convert_field = self.convert_field_protected
        protected_text = self.vformat(format_string, args, values)

        # Unprotected pass.
        self.convert_field = self.convert_field_unprotected
        unprotected_text = self.vformat(format_string, args, values)

        if protected_text == unprotected_text:
            return unprotected_text

        Logger.sensitive_strings[unprotected_text] = protected_text
        return unprotected_text
Beispiel #15
0
  def action_delayed(self, action_name, main_resource):
    """
    Queue one resource description for a later batched execution.

    The description is a dict built from RESOURCE_TO_JSON_FIELDS and is
    appended to ``env.config['hdfs_files']``.

    action_name -- stored under the JSON field mapped from 'action'
    main_resource -- object exposing the ``resource`` whose attributes are read
    """
    resource = {}
    env = Environment.get_instance()
    if 'hdfs_files' not in env.config:
      env.config['hdfs_files'] = []

    # Put values in dictionary-resource
    # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only).
    for field_name, json_field_name in RESOURCE_TO_JSON_FIELDS.items():
      if field_name == 'action':
        resource[json_field_name] = action_name
      elif field_name == 'mode' and main_resource.resource.mode:
        # "%o" gives the bare octal digits everywhere; oct(mode)[1:] yields
        # 'o755' on Python 3 and '755L' for a Python 2 long.
        resource[json_field_name] = "%o" % main_resource.resource.mode
      elif getattr(main_resource.resource, field_name):
        # Only fields with a truthy value are emitted.
        resource[json_field_name] = getattr(main_resource.resource, field_name)

    # Add resource to create
    env.config['hdfs_files'].append(resource)
Beispiel #16
0
    def action_execute(self, main_resource):
        """
        Flush the queued entries of ``env.config['hdfs_files']``.

        The queue is serialized as JSON into the file named by
        ``format(JSON_PATH)``, a ``hadoop ... jar`` command is executed with
        that file as argument, and the queue is emptied. Returns early (after
        an info log) when nothing is queued.

        main_resource -- object exposing ``resource``, ``has_core_configs``,
                         ``assert_parameter_is_set`` and ``kinit``
        """
        env = Environment.get_instance()

        # Check required parameters
        if main_resource.has_core_configs:
            main_resource.assert_parameter_is_set('user')

        if not 'hdfs_files' in env.config or not env.config['hdfs_files']:
            Logger.info(
                "No resources to create. 'create_on_execute' or 'delete_on_execute' or 'download_on_execute' wasn't triggered before this 'execute' action."
            )
            return

        # NOTE(review): format() below is called without arguments, so its
        # {placeholders} are presumably resolved from these local variables.
        # Locals that look unused here (e.g. timestamp) may be referenced by
        # JSON_PATH - confirm before renaming or removing any of them.
        hadoop_bin_dir = main_resource.resource.hadoop_bin_dir
        hadoop_conf_dir = main_resource.resource.hadoop_conf_dir
        user = main_resource.resource.user if main_resource.has_core_configs else None
        security_enabled = main_resource.resource.security_enabled
        keytab_file = main_resource.resource.keytab
        kinit_path = main_resource.resource.kinit_path_local
        logoutput = main_resource.resource.logoutput
        principal_name = main_resource.resource.principal_name
        jar_path = JAR_PATH
        timestamp = time.time()
        json_path = format(JSON_PATH)

        if security_enabled:
            main_resource.kinit()

        # Write json file to disk
        File(json_path,
             owner=user,
             content=json.dumps(env.config['hdfs_files']))

        # Execute jar to create/delete resources in hadoop
        Execute(
            format(
                "hadoop --config {hadoop_conf_dir} jar {jar_path} {json_path}"
            ),
            user=user,
            path=[hadoop_bin_dir],
            logoutput=logoutput,
        )

        # Clean
        env.config['hdfs_files'] = []
Beispiel #17
0
    def action_execute(self, main_resource, sudo=False):
        """
        Flush the queued entries of ``env.config['hdfs_files_sudo']`` (when
        ``sudo`` is true) or ``env.config['hdfs_files']``.

        The queue is serialized as JSON into the file named by
        ``format(JSON_PATH)``, a ``hadoop ... jar`` command is executed with
        that file as argument, and the queue is emptied. Returns silently when
        nothing is queued.

        main_resource -- object exposing ``resource``,
                         ``assert_parameter_is_set`` and ``kinit``
        sudo -- selects the queue and is forwarded to Execute; when true no
                user is required and ``user`` is None
        """
        env = Environment.get_instance()
        env_dict_key = 'hdfs_files_sudo' if sudo else 'hdfs_files'

        if not env_dict_key in env.config or not env.config[env_dict_key]:
            return

        # Check required parameters
        if not sudo:
            main_resource.assert_parameter_is_set('user')
            user = main_resource.resource.user
        else:
            user = None

        # NOTE(review): format(JSON_PATH) below is called without arguments, so
        # its {placeholders} are presumably resolved from these local
        # variables. Locals that look unused here (e.g. timestamp) may be
        # referenced by JSON_PATH - confirm before renaming or removing them.
        hadoop_bin_dir = main_resource.resource.hadoop_bin_dir
        hadoop_conf_dir = main_resource.resource.hadoop_conf_dir
        security_enabled = main_resource.resource.security_enabled
        keytab_file = main_resource.resource.keytab
        kinit_path = main_resource.resource.kinit_path_local
        logoutput = main_resource.resource.logoutput
        principal_name = main_resource.resource.principal_name
        jar_path = JAR_PATH
        timestamp = time.time()
        json_path = format(JSON_PATH)

        if security_enabled:
            main_resource.kinit()

        # Write json file to disk
        File(json_path,
             owner=user,
             content=json.dumps(env.config[env_dict_key]))

        # Execute jar to create/delete resources in hadoop
        # The command is passed as a tuple of arguments rather than one string.
        Execute(
            ('hadoop', '--config', hadoop_conf_dir, 'jar', jar_path,
             json_path),
            user=user,
            path=[hadoop_bin_dir],
            logoutput=logoutput,
            sudo=sudo,
        )

        # Clean
        env.config[env_dict_key] = []
def _get_delegation_token(namenode_address, user, keytab, principal,
                          kinit_path):
    """
    Gets the kerberos delegation token from name node

    Requests ``/webhdfs/v1/?op=GETDELEGATIONTOKEN`` on ``namenode_address``
    through ``curl_krb_request`` and returns the ``Token.urlString`` value of
    the JSON response.

    Raises Exception("Unable to get delegation token for PXF") when the
    response carries no token. A response without a 'Token' key now reaches
    this logged failure instead of raising a bare KeyError.
    """
    url = namenode_address + "/webhdfs/v1/?op=GETDELEGATIONTOKEN"
    logger.info("Getting delegation token from {0} for PXF".format(url))
    response, _, _ = curl_krb_request(Environment.get_instance().tmp_dir,
                                      keytab, principal, url,
                                      "get_delegation_token", kinit_path,
                                      False, "Delegation Token", user)
    json_response = json.loads(response)

    # Tolerate responses that have no 'Token' / 'urlString' entry so the
    # intended error below is raised.
    token = json_response.get('Token') if isinstance(json_response, dict) else None
    if isinstance(token, dict) and token.get('urlString'):
        return token['urlString']

    msg = "Unable to get delegation token for PXF"
    logger.error(msg)
    raise Exception(msg)
Beispiel #19
0
  def __new__(cls, name, env=None, provider=None, **kwargs):
    """
    Create a resource object and register it in the environment under the
    class name and ``name``.

    When ``name`` is a list, a resource is created for every entry except the
    last (the entries are popped from the list) and the last entry is handled
    by this call. A previously registered object with the same name is replaced.
    """
    if isinstance(name, list):
      # Consume all leading names; the list is left holding only the last one.
      while len(name) != 1:
        cls(name.pop(0), env, provider, **kwargs)

      name = name[0]

    if not env:
      env = Environment.get_instance()
    # Resolved as in the original, although nothing below reads it.
    if not provider:
      provider = getattr(cls, 'provider', None)

    r_type = cls.__name__
    if r_type not in env.resources:
      env.resources[r_type] = {}

    created = super(Resource, cls).__new__(cls)
    env.resources[r_type][name] = created
    env.resource_list.append(created)
    return created
Beispiel #20
0
    def __new__(cls, name, env=None, provider=None, **kwargs):
        """
        Allocate a new resource and record it in the environment, both in
        ``env.resources[<class name>][name]`` and in ``env.resource_list``.

        A list ``name`` is expanded: every entry but the last is popped and
        instantiated through ``cls(...)``; this call then continues with the
        last entry.
        """
        if isinstance(name, list):
            # Leaves exactly one element in the caller's list.
            while len(name) != 1:
                head = name.pop(0)
                cls(head, env, provider, **kwargs)

            name = name[0]

        env = env if env else Environment.get_instance()
        # Resolved as in the original, although nothing below reads it.
        provider = provider if provider else getattr(cls, 'provider', None)

        r_type = cls.__name__
        if r_type not in env.resources:
            env.resources[r_type] = {}

        instance = super(Resource, cls).__new__(cls)
        env.resources[r_type][name] = instance
        env.resource_list.append(instance)
        return instance
  def call_curl_request(self,user,keytab,principal, url, flag_http_response, request_method='GET',request_body='',header=''):
    """
    Perform a kerberized curl call by delegating to ``curl_krb_request``.

    :param user: service user for which call is to be made
    :param keytab: keytab of service user
    :param principal: principal of service user
    :param url: url with which call is to be made
    :param flag_http_response: flag to get only response-code or response string
    :param request_method: http method (GET / POST / PUT / DELETE)
    :param request_body: data to be send along with the request
    :param header: http header required for the call
    :return: tuple of (response, error_msg, time_millis)
    """
    tmp_dir = Environment.get_instance().tmp_dir
    response, error_msg, time_millis = curl_krb_request(
      tmp_dir, keytab, principal, url, 'ranger_admin_calls',
      None, flag_http_response, "Ranger-Admin API calls", user,
      kinit_timer_ms=0,
      method=request_method,
      body=request_body,
      header=header)

    return response, error_msg, time_millis
  def call_curl_request(self,user,keytab,principal, url, flag_http_response, request_method='GET',request_body='',header=''):
    """
    Issue one request through ``curl_krb_request`` and hand back its result.

    :param user: service user for which call is to be made
    :param keytab: keytab of service user
    :param principal: principal of service user
    :param url: url with which call is to be made
    :param flag_http_response: flag to get only response-code or response string
    :param request_method: http method (GET / POST / PUT / DELETE)
    :param request_body: data to be send along with the request
    :param header: http header required for the call
    :return: tuple of (response, error_msg, time_millis)
    """
    # Keyword options forwarded to curl_krb_request.
    extra_options = dict(kinit_timer_ms=0, method=request_method, body=request_body, header=header)
    response, error_msg, time_millis = curl_krb_request(
      Environment.get_instance().tmp_dir, keytab, principal, url,
      'ranger_admin_calls', None, flag_http_response,
      "Ranger-Admin API calls", user, **extra_options)

    return response, error_msg, time_millis
  def configure(self, env):
    """
    Set up the configuration directory for this component.

    For the 'monitor' component with monitor security enabled this also
    generates random keystore/truststore passwords, creates a key pair,
    exports its certificate and imports it into a truststore with keytool,
    then publishes keystore.jks and cacerts.jks via accumulo_StaticFile.
    Every other case only calls ``setup_conf_dir``.

    env -- execution environment; receives the params module via set_params
    """
    import params
    env.set_params(params)

    if params.monitor_security_enabled and self.component == 'monitor':
      import os
      import random
      import string

      basedir = Environment.get_instance().config.basedir
      keystore_file = os.path.join(basedir, "files", "keystore.jks")
      truststore_file = os.path.join(basedir, "files", "cacerts.jks")
      cert_file = os.path.join(basedir, "files", "server.cer")

      # Never overwrite existing certificate material.
      if os.path.exists(keystore_file) or os.path.exists(truststore_file) or os.path.exists(cert_file):
        self.fail_with_error("trying to create monitor certs but they already existed")

      # ascii_* constants are locale-independent and exist on Python 2 and 3
      # (string.lowercase / string.uppercase are locale-dependent, Python 2 only).
      goodchars = string.ascii_lowercase + string.ascii_uppercase + string.digits + '#%+,-./:=?@^_'
      # Passwords are secrets: draw them from the OS entropy source instead of
      # the default, predictable Mersenne Twister generator.
      # NOTE(review): no extra local names are introduced on purpose - format()
      # below appears to resolve placeholders from local variables.
      keypass = ''.join(random.SystemRandom().choice(goodchars) for x in range(20))
      storepass = ''.join(random.SystemRandom().choice(goodchars) for x in range(20))

      https_params = {}
      https_params[params.keystore_property] = params.keystore_path
      https_params[params.truststore_property] = params.truststore_path
      https_params[params.keystore_password_property] = keypass
      https_params[params.truststore_password_property] = storepass

      setup_conf_dir(name=self.component, extra_params=https_params)

      # NOTE(review): the passwords are embedded in the keytool command lines,
      # where other local users may be able to see them - confirm this is acceptable.
      Execute( format("{java64_home}/bin/keytool -genkey -alias \"default\" -keyalg RSA -keypass {keypass} -storepass {storepass} -keystore {keystore_file} -dname \"CN=Unknown, OU=Unknown, O=Unknown, L=Unknown, ST=Unknown, C=Unknown\""),
               user=params.accumulo_user)
      Execute( format("{java64_home}/bin/keytool -export -alias \"default\" -storepass {storepass} -file {cert_file} -keystore {keystore_file}"),
               user=params.accumulo_user)
      Execute( format("echo \"yes\" | {java64_home}/bin/keytool -import -v -trustcacerts -alias \"default\" -file {cert_file} -keystore {truststore_file} -keypass {keypass} -storepass {storepass}"),
               user=params.accumulo_user)

      accumulo_StaticFile("keystore.jks")
      accumulo_StaticFile("cacerts.jks")

    else:
      setup_conf_dir(name=self.component)
  def action_execute(self, main_resource):
    """
    Flush the queued entries of ``env.config['hdfs_files']``.

    The queue is serialized as JSON into the file named by
    ``format(JSON_PATH)``, a ``hadoop ... jar`` command is executed with that
    file as argument, and the queue is emptied. Returns early (after an info
    log) when nothing is queued.

    main_resource -- object exposing ``resource``, ``assert_parameter_is_set``
                     and ``kinit``
    """
    env = Environment.get_instance()

    # Check required parameters
    main_resource.assert_parameter_is_set('user')

    if not 'hdfs_files' in env.config or not env.config['hdfs_files']:
      Logger.info("No resources to create. 'create_on_execute' or 'delete_on_execute' wasn't triggered before this 'execute' action.")
      return

    # NOTE(review): format() below is called without arguments, so its
    # {placeholders} are presumably resolved from these local variables.
    # Locals that look unused here (e.g. timestamp) may be referenced by
    # JSON_PATH - confirm before renaming or removing any of them.
    hadoop_bin_dir = main_resource.resource.hadoop_bin_dir
    hadoop_conf_dir = main_resource.resource.hadoop_conf_dir
    user = main_resource.resource.user
    security_enabled = main_resource.resource.security_enabled
    keytab_file = main_resource.resource.keytab
    kinit_path = main_resource.resource.kinit_path_local
    logoutput = main_resource.resource.logoutput
    principal_name = main_resource.resource.principal_name
    jar_path=JAR_PATH
    timestamp = time.time()
    json_path=format(JSON_PATH)

    if security_enabled:
      main_resource.kinit()

    # Write json file to disk
    File(json_path,
         owner = user,
         content = json.dumps(env.config['hdfs_files'])
    )

    # Execute jar to create/delete resources in hadoop
    Execute(format("hadoop --config {hadoop_conf_dir} jar {jar_path} {json_path}"),
            user=user,
            path=[hadoop_bin_dir],
            logoutput=logoutput,
    )

    # Clean
    env.config['hdfs_files'] = []
Beispiel #25
0
  def __init__(self, name, env=None, provider=None, **kwargs):
    """
    Initialise the resource: store its environment, name and provider and
    validate every keyword argument against ``self._arguments``.

    A list ``name`` is reduced to its first element, which is popped from the
    list. An instance that already has a ``name`` attribute is left untouched.

    Raises Fail for a keyword that is not in ``self._arguments`` and
    InvalidArgument (prefixed with this resource) when validation fails.
    """
    if isinstance(name, list):
      name = name.pop(0)

    # Already initialised: do not reset its state.
    if hasattr(self, 'name'):
      return

    self.env = env or Environment.get_instance()
    self.name = name

    self.provider = provider or getattr(self, 'provider', None)

    self.arguments = {}
    for key, value in kwargs.items():
      try:
        arg = self._arguments[key]
      except KeyError:
        raise Fail("%s received unsupported argument %s" % (self, key))

      try:
        self.arguments[key] = arg.validate(value)
      # "except X as e" is valid from Python 2.6 on; the comma form is a
      # SyntaxError on Python 3.
      except InvalidArgument as exc:
        raise InvalidArgument("%s %s" % (self, exc))
Beispiel #26
0
    def __init__(self, name, env=None, provider=None, **kwargs):
        """
        Initialise the resource: store its environment, name and provider and
        validate every keyword argument against ``self._arguments``.

        A list ``name`` is reduced to its first element, which is popped from
        the list. An instance that already has a ``name`` attribute is left
        untouched.

        Raises Fail for a keyword that is not in ``self._arguments`` and
        InvalidArgument (prefixed with this resource) when validation fails.
        """
        if isinstance(name, list):
            name = name.pop(0)

        # Already initialised: do not reset its state.
        if hasattr(self, 'name'):
            return

        self.env = env or Environment.get_instance()
        self.name = name

        self.provider = provider or getattr(self, 'provider', None)

        self.arguments = {}
        for key, value in kwargs.items():
            try:
                arg = self._arguments[key]
            except KeyError:
                raise Fail("%s received unsupported argument %s" % (self, key))

            try:
                self.arguments[key] = arg.validate(value)
            # "except X as e" is valid from Python 2.6 on; the comma form is
            # a SyntaxError on Python 3.
            except InvalidArgument as exc:
                raise InvalidArgument("%s %s" % (self, exc))
Beispiel #27
0
def get_check_command(oozie_url, host_name, parameters):
  """
  Builds the shell command used to query the Oozie server status.

  When security is enabled in the parameters, a credentials cache is checked
  with klist and, if that check fails, kinit is executed first.

  Keyword arguments:
  oozie_url: the Oozie URL handed to 'oozie admin -oozie'
  host_name: replaces _HOST in the configured kerberos principal
  parameters (dictionary): a mapping of script parameter key to value

  Returns a (command, kerberos_env) tuple; kerberos_env is None when security
  is disabled, otherwise a dictionary holding KRB5CCNAME.

  Raises KerberosPropertiesNotFound when security is enabled but the keytab
  or principal parameter is absent.
  """
  # NOTE(review): the placeholders in the format() calls name local variables
  # of this function (presumably looked up from this frame - confirm), so the
  # locals below must keep their names.
  kerberos_env = None
  security_enabled = (SECURITY_ENABLED in parameters and
                      str(parameters[SECURITY_ENABLED]).upper() == 'TRUE')

  if security_enabled:
    if OOZIE_KEYTAB not in parameters or OOZIE_PRINCIPAL not in parameters:
      raise KerberosPropertiesNotFound('The Oozie keytab and principal are required parameters when security is enabled.')

    oozie_keytab = parameters[OOZIE_KEYTAB]

    # substitute _HOST in kerberos principal with actual fqdn
    oozie_principal = parameters[OOZIE_PRINCIPAL].replace('_HOST', host_name)

    # Create the kerberos credentials cache (ccache) file and set it in the
    # environment to use when executing curl
    env = Environment.get_instance()
    ccache_file = "{0}{1}oozie_alert_cc_{2}".format(env.tmp_dir, os.sep, os.getpid())
    kerberos_env = {'KRB5CCNAME': ccache_file}

    klist_path_local = get_klist_path()
    klist_command = format("{klist_path_local} -s {ccache_file}")

    # Only kinit when the cache is missing or holds no valid tickets. Tickets
    # are requested with a 5 minute lifetime to limit the number of kinits
    # while still recovering quickly when keytabs are regenerated
    return_code, _ = call(klist_command)
    if return_code != 0:
      kinit_path_local = get_kinit_path()
      kinit_command = format("{kinit_path_local} -l 5m -kt {oozie_keytab} {oozie_principal}; ")

      # kinit
      Execute(kinit_command, environment=kerberos_env)

  command = format("source /etc/oozie/conf/oozie-env.sh ; oozie admin -oozie {oozie_url} -status")
  return (command, kerberos_env)
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  Queries the JMX endpoint of every NameNode listed for the name service,
  sorts the NameNodes into active / standby / unknown, and reports on the
  HA topology.

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """
  if configurations is None:
    return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])

  # if not in HA mode, then SKIP
  if NAMESERVICE_KEY not in configurations:
    return (RESULT_STATE_SKIPPED, ['NameNode HA is not enabled'])

  # hdfs-site is required
  if HDFS_SITE_KEY not in configurations:
    return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

  # only used for kerberized requests; always bound so that a missing value
  # can be reported explicitly instead of surfacing as an UnboundLocalError
  smokeuser = None
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  executable_paths = None
  if EXECUTABLE_SEARCH_PATHS in configurations:
    executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  kerberos_keytab = None
  if KERBEROS_KEYTAB in configurations:
    kerberos_keytab = configurations[KERBEROS_KEYTAB]

  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
    kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  # determine whether or not SSL is enabled
  is_ssl_enabled = False
  if DFS_POLICY_KEY in configurations:
    is_ssl_enabled = configurations[DFS_POLICY_KEY] == "HTTPS_ONLY"

  name_service = configurations[NAMESERVICE_KEY]
  hdfs_site = configurations[HDFS_SITE_KEY]

  # look for dfs.ha.namenodes.foo
  nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
  if nn_unique_ids_key not in hdfs_site:
    return (RESULT_STATE_UNKNOWN, ['Unable to find unique namenode alias key {0}'.format(nn_unique_ids_key)])

  # the same for every NameNode, so decide once
  use_kerberos = (kerberos_principal is not None and kerberos_keytab is not None
                  and security_enabled)

  # a kerberized request cannot be issued without a user; say so instead of
  # silently marking every NameNode as unknown
  if use_kerberos and SMOKEUSER_KEY not in configurations:
    return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(SMOKEUSER_KEY)])

  if is_ssl_enabled:
    namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
    jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"
  else:
    namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
    jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

  active_namenodes = []
  standby_namenodes = []
  unknown_namenodes = []

  # now we have something like 'nn1,nn2,nn3,nn4'
  # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
  # ie dfs.namenode.http-address.hacluster.nn1
  nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
  for nn_unique_id in nn_unique_ids:
    key = namenode_http_fragment.format(name_service, nn_unique_id)
    if key not in hdfs_site:
      continue

    # use str() to ensure that unicode strings do not have the u' in them
    value = str(hdfs_site[key])

    try:
      jmx_uri = jmx_uri_fragment.format(value)
      if use_kerberos:
        env = Environment.get_instance()

        # curl requires an integer timeout
        curl_connection_timeout = int(connection_timeout)

        state_response, error_msg, time_millis = curl_krb_request(env.tmp_dir,
          kerberos_keytab, kerberos_principal, jmx_uri, "ha_nn_health", executable_paths, False,
          "NameNode High Availability Health", smokeuser, connection_timeout=curl_connection_timeout)
      else:
        state_response = get_jmx(jmx_uri, connection_timeout)

      state = _get_ha_state_from_json(state_response)

      if state == HDFS_NN_STATE_ACTIVE:
        active_namenodes.append(value)
      elif state == HDFS_NN_STATE_STANDBY:
        standby_namenodes.append(value)
      else:
        unknown_namenodes.append(value)
    except Exception:
      # best effort: a NameNode that cannot be queried is reported as unknown
      unknown_namenodes.append(value)

  # now that the request is done, determine if this host is the host that
  # should report the status of the HA topology
  is_active_namenode = any(active_namenode.startswith(host_name)
                           for active_namenode in active_namenodes)

  # there's only one scenario here; there is exactly 1 active and 1 standby
  is_topology_healthy = len(active_namenodes) == 1 and len(standby_namenodes) == 1

  result_label = 'Active{0}, Standby{1}, Unknown{2}'.format(str(active_namenodes),
    str(standby_namenodes), str(unknown_namenodes))

  # Healthy Topology:
  #   - Active NN reports the alert, standby does not
  #
  # Unhealthy Topology:
  #   - Report the alert if this is the first named host
  #   - Otherwise skip, leaving the report to the first named host
  if is_topology_healthy:
    if is_active_namenode:
      return (RESULT_STATE_OK, [result_label])

    return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])

  # dfs.namenode.rpc-address.service.alias is guaranteed in HA mode
  first_listed_host_key = 'dfs.namenode.rpc-address.{0}.{1}'.format(
    name_service, nn_unique_ids[0])

  first_listed_host = ''
  if first_listed_host_key in hdfs_site:
    first_listed_host = hdfs_site[first_listed_host_key]

  if first_listed_host.startswith(host_name):
    return (RESULT_STATE_CRITICAL, [result_label])

  # not the first listed host, but the first host might be in the unknown
  return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])
Beispiel #29
0
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Builds the WebHCat status URL for this host and, when security is
    enabled, obtains a kerberos ticket if needed and fetches the URL with
    curl: first only the HTTP status code, then the JSON content.

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """
    # NOTE(review): the placeholders in the format() calls below name local
    # variables of this function (presumably looked up from this frame -
    # confirm), so those locals must keep their names.

    result_code = RESULT_CODE_UNKNOWN

    if configurations is None:
        return (result_code,
                ['There were no configurations supplied to the script.'])

    webhcat_port = WEBHCAT_PORT_DEFAULT
    if TEMPLETON_PORT_KEY in configurations:
        webhcat_port = int(configurations[TEMPLETON_PORT_KEY])

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        # str() so that a boolean flag is handled as well as a string one
        security_enabled = str(
            configurations[SECURITY_ENABLED_KEY]).lower() == 'true'

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    curl_connection_timeout = CURL_CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])
        curl_connection_timeout = str(int(connection_timeout))

    # the alert will always run on the webhcat host
    if host_name is None:
        host_name = socket.getfqdn()

    smokeuser = SMOKEUSER_DEFAULT

    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    # a script parameter overrides the configured smoke user
    if SMOKEUSER_SCRIPT_PARAM_KEY in parameters:
        smokeuser = parameters[SMOKEUSER_SCRIPT_PARAM_KEY]

    # webhcat always uses http, never SSL
    query_url = "http://{0}:{1}/templeton/v1/status?user.name={2}".format(
        host_name, webhcat_port, smokeuser)

    # initialize
    total_time = 0
    json_response = {}

    if security_enabled:
        if WEBHCAT_KEYTAB_KEY not in configurations or WEBHCAT_PRINCIPAL_KEY not in configurations:
            # name what is missing rather than dumping every configuration
            # value into the alert text
            return (RESULT_CODE_UNKNOWN, [
                '{0} and {1} are required parameters when security is enabled.'.format(
                    WEBHCAT_KEYTAB_KEY, WEBHCAT_PRINCIPAL_KEY)
            ])

        try:
            webhcat_keytab = configurations[WEBHCAT_KEYTAB_KEY]
            webhcat_principal = configurations[WEBHCAT_PRINCIPAL_KEY]

            # substitute _HOST in kerberos principal with actual fqdn
            webhcat_principal = webhcat_principal.replace('_HOST', host_name)

            # Create the kerberos credentials cache (ccache) file and set it in the environment to use
            # when executing curl
            env = Environment.get_instance()
            ccache_file = "{0}{1}webhcat_alert_cc_{2}".format(
                env.tmp_dir, sep, getpid())
            kerberos_env = {'KRB5CCNAME': ccache_file}

            # Get the configured Kerberos executable search paths, if any
            kerberos_executable_search_paths = None
            if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
                kerberos_executable_search_paths = configurations[
                    KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]

            klist_path_local = get_klist_path(kerberos_executable_search_paths)
            klist_command = format("{klist_path_local} -s {ccache_file}")

            # Only kinit when the cache is missing or holds no valid tickets.
            # Tickets are requested with a 5 minute lifetime to limit the
            # number of kinits while still recovering quickly when keytabs
            # are regenerated
            return_code, _ = call(klist_command)
            if return_code != 0:
                kinit_path_local = get_kinit_path(
                    kerberos_executable_search_paths)
                kinit_command = format(
                    "{kinit_path_local} -l 5m -c {ccache_file} -kt {webhcat_keytab} {webhcat_principal}; "
                )

                # kinit so that curl will work with --negotiate
                Execute(kinit_command)

            # make a single curl call to get just the http code
            curl = subprocess.Popen([
                'curl', '--negotiate', '-u', ':', '-sL', '-w', '%{http_code}',
                '--connect-timeout', curl_connection_timeout, '-o',
                '/dev/null', query_url
            ],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    env=kerberos_env)

            stdout, stderr = curl.communicate()

            if stderr != '':
                raise Exception(stderr)

            # check the response code
            response_code = int(stdout)

            # 0 indicates no connection
            if response_code == 0:
                label = CRITICAL_CONNECTION_MESSAGE.format(query_url)
                return (RESULT_CODE_CRITICAL, [label])

            # any other response aside from 200 is a problem
            if response_code != 200:
                label = CRITICAL_HTTP_MESSAGE.format(response_code, query_url)
                return (RESULT_CODE_CRITICAL, [label])

            # now that we have the http status and it was 200, get the content
            start_time = time.time()
            curl = subprocess.Popen([
                'curl',
                '--negotiate',
                '-u',
                ':',
                '-sL',
                '--connect-timeout',
                curl_connection_timeout,
                query_url,
            ],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    env=kerberos_env)

            stdout, stderr = curl.communicate()
            total_time = time.time() - start_time

            if stderr != '':
                raise Exception(stderr)

            json_response = json.loads(stdout)
        except Exception as exception:
            return (RESULT_CODE_CRITICAL, [str(exception)])
Beispiel #30
0
 def __init__(self, env=None):
     """
     Stores the environment to work with.

     env: the environment to use; when it is not supplied (or is falsy) the
          result of Environment.get_instance() is stored instead
     """
     if not env:
         env = Environment.get_instance()
     self.env = env
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  Builds the WebHCat status URL for this host and, when security is enabled,
  requests it through curl_krb_request: first only the HTTP status code, then
  the JSON content.

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """

  result_code = RESULT_CODE_UNKNOWN

  if configurations is None:
    return (result_code, ['There were no configurations supplied to the script.'])

  webhcat_port = WEBHCAT_PORT_DEFAULT
  if TEMPLETON_PORT_KEY in configurations:
    webhcat_port = int(configurations[TEMPLETON_PORT_KEY])

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    # str() so that a boolean flag is handled as well as a string one
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).lower() == 'true'

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  curl_connection_timeout = CURL_CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])
    curl_connection_timeout = str(int(connection_timeout))

  # the alert will always run on the webhcat host
  if host_name is None:
    host_name = socket.getfqdn()

  smokeuser = SMOKEUSER_DEFAULT
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  # webhcat always uses http, never SSL
  query_url = "http://{0}:{1}/templeton/v1/status?user.name={2}".format(host_name, webhcat_port, smokeuser)

  # initialize
  total_time = 0
  json_response = {}

  if security_enabled:
    try:
      # defaults
      smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT
      smokeuser_principal = SMOKEUSER_PRINCIPAL_DEFAULT

      # check script params
      if SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
        smokeuser_principal = parameters[SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY]
      if SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
        smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY]

      # check configurations last as they should always take precedence
      if SMOKEUSER_PRINCIPAL_KEY in configurations:
        smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY]
      if SMOKEUSER_KEYTAB_KEY in configurations:
        smokeuser_keytab = configurations[SMOKEUSER_KEYTAB_KEY]

      # Get the configured Kerberos executable search paths, if any
      kerberos_executable_search_paths = None
      if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
        kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]

      env = Environment.get_instance()

      # first request: only the http status code is asked for
      stdout, stderr, time_millis = curl_krb_request(env.tmp_dir, smokeuser_keytab, smokeuser_principal,
                                                      query_url, "webhcat_alert_cc_", kerberos_executable_search_paths, True,
                                                      "WebHCat Server Status", smokeuser,
                                                      connection_timeout=curl_connection_timeout)

      # check the response code
      response_code = int(stdout)

      # 0 indicates no connection
      if response_code == 0:
        label = CRITICAL_CONNECTION_MESSAGE.format(query_url)
        return (RESULT_CODE_CRITICAL, [label])

      # any other response aside from 200 is a problem
      if response_code != 200:
        label = CRITICAL_HTTP_MESSAGE.format(response_code, query_url)
        return (RESULT_CODE_CRITICAL, [label])

      # now that we have the http status and it was 200, get the content
      stdout, stderr, total_time = curl_krb_request(env.tmp_dir, smokeuser_keytab, smokeuser_principal,
                                                      query_url, "webhcat_alert_cc_", kerberos_executable_search_paths,
                                                      False, "WebHCat Server Status", smokeuser,
                                                      connection_timeout=curl_connection_timeout)
      json_response = json.loads(stdout)
    except Exception as exception:
      return (RESULT_CODE_CRITICAL, [str(exception)])
Beispiel #32
0
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Reads the last checkpoint time and the journal transaction info from the
    NameNode JMX endpoint and derives a result code from the number of
    uncheckpointed transactions and the time elapsed since the last
    checkpoint.

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """

    if configurations is None:
        return ('UNKNOWN',
                ['There were no configurations supplied to the script.'])

    scheme = 'http'
    http_uri = None
    https_uri = None
    http_policy = 'HTTP_ONLY'
    checkpoint_tx = CHECKPOINT_TX_DEFAULT
    checkpoint_period = CHECKPOINT_PERIOD_DEFAULT

    if NN_HTTP_ADDRESS_KEY in configurations:
        http_uri = configurations[NN_HTTP_ADDRESS_KEY]

    if NN_HTTPS_ADDRESS_KEY in configurations:
        https_uri = configurations[NN_HTTPS_ADDRESS_KEY]

    if NN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[NN_HTTP_POLICY_KEY]

    if NN_CHECKPOINT_TX_KEY in configurations:
        checkpoint_tx = configurations[NN_CHECKPOINT_TX_KEY]

    if NN_CHECKPOINT_PERIOD_KEY in configurations:
        checkpoint_period = configurations[NN_CHECKPOINT_PERIOD_KEY]

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(
            configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    # the thresholds arrive as fractions and are compared as percentages
    percent_warning = PERCENT_WARNING_DEFAULT
    if PERCENT_WARNING_KEY in parameters:
        percent_warning = float(parameters[PERCENT_WARNING_KEY]) * 100

    percent_critical = PERCENT_CRITICAL_DEFAULT
    if PERCENT_CRITICAL_KEY in parameters:
        percent_critical = float(parameters[PERCENT_CRITICAL_KEY]) * 100

    # determine the right URI and whether to use SSL
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'

        if https_uri is not None:
            uri = https_uri

    # milliseconds, matching the unit delta is computed in below
    current_time = int(round(time.time() * 1000))

    last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format(
        scheme, uri)
    journal_transaction_info_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(
        scheme, uri)

    # start out assuming an OK status
    label = None
    result_code = "OK"

    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()
            last_checkpoint_time_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal,
                last_checkpoint_time_qry, "checkpoint_time_alert", None, False,
                "NameNode Last Checkpoint")
            last_checkpoint_time_response_json = json.loads(
                last_checkpoint_time_response)
            last_checkpoint_time = int(
                last_checkpoint_time_response_json["beans"][0]
                ["LastCheckpointTime"])

            journal_transaction_info_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal,
                journal_transaction_info_qry, "checkpoint_time_alert", None,
                False, "NameNode Last Checkpoint")
            journal_transaction_info_response_json = json.loads(
                journal_transaction_info_response)
            journal_transaction_info = journal_transaction_info_response_json[
                "beans"][0]["JournalTransactionInfo"]
        else:
            last_checkpoint_time = int(
                get_value_from_jmx(last_checkpoint_time_qry,
                                   "LastCheckpointTime", connection_timeout))

            journal_transaction_info = get_value_from_jmx(
                journal_transaction_info_qry, "JournalTransactionInfo",
                connection_timeout)

        # the bean value is itself JSON text, hence the second decode
        journal_transaction_info_dict = json.loads(journal_transaction_info)

        last_tx = int(
            journal_transaction_info_dict['LastAppliedOrWrittenTxId'])
        most_recent_tx = int(
            journal_transaction_info_dict['MostRecentCheckpointTxId'])
        transaction_difference = last_tx - most_recent_tx

        delta = (current_time - last_checkpoint_time) / 1000

        # NOTE(review): get_time() is called once here instead of twice;
        # this assumes it has no side effects - confirm
        elapsed = get_time(delta)
        label = LABEL.format(h=elapsed['h'],
                             m=elapsed['m'],
                             tx=transaction_difference)

        # the elapsed time only matters once the transaction threshold has
        # been crossed
        if transaction_difference > int(checkpoint_tx):
            percent_elapsed = float(delta) / int(checkpoint_period) * 100
            if percent_elapsed >= int(percent_critical):
                result_code = 'CRITICAL'
            elif percent_elapsed >= int(percent_warning):
                result_code = 'WARNING'

    except Exception as e:
        label = str(e)
        result_code = 'UNKNOWN'
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  Builds the NodeManager '/ws/v1/node/info' URL and requests it, through
  curl_krb_request when kerberos is configured and security is enabled and
  through urllib2 otherwise.

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """
  result_code = RESULT_CODE_UNKNOWN

  if configurations is None:
    return (result_code, ['There were no configurations supplied to the script.'])

  scheme = 'http'
  http_uri = None
  https_uri = None
  http_policy = 'HTTP_ONLY'

  # only used for kerberized requests; always bound so that a missing value
  # can be reported explicitly instead of surfacing as an UnboundLocalError
  smokeuser = None
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  kerberos_keytab = None
  if KERBEROS_KEYTAB in configurations:
    kerberos_keytab = configurations[KERBEROS_KEYTAB]

  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    # the _HOST substitution needs a real host name, so resolve it here
    # rather than failing on the None default
    if host_name is None:
      host_name = socket.getfqdn()

    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
    kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  if NODEMANAGER_HTTP_ADDRESS_KEY in configurations:
    http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY]

  if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations:
    https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY]

  if YARN_HTTP_POLICY_KEY in configurations:
    http_policy = configurations[YARN_HTTP_POLICY_KEY]

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  # determine the right URI and whether to use SSL
  uri = http_uri
  if http_policy == 'HTTPS_ONLY':
    scheme = 'https'

    if https_uri is not None:
      uri = https_uri

  label = ''
  url_response = None
  node_healthy = 'false'
  total_time = 0

  # some yarn-site structures don't have the web ui address
  if uri is None:
    if host_name is None:
      host_name = socket.getfqdn()

    uri = '{0}:{1}'.format(host_name, NODEMANAGER_DEFAULT_PORT)

  if OSCheck.is_windows_family():
    uri_host, uri_port = uri.split(':')
    # on windows 0.0.0.0 is invalid address to connect but on linux it resolved to 127.0.0.1
    uri_host = resolve_address(uri_host)
    uri = '{0}:{1}'.format(uri_host, uri_port)

  query = "{0}://{1}/ws/v1/node/info".format(scheme, uri)

  use_kerberos = (kerberos_principal is not None and kerberos_keytab is not None
                  and security_enabled)

  # a kerberized request cannot be issued without a user
  if use_kerberos and SMOKEUSER_KEY not in configurations:
    return (RESULT_CODE_UNKNOWN, ['{0} is a required parameter for the script'.format(SMOKEUSER_KEY)])

  try:
    if use_kerberos:
      env = Environment.get_instance()

      # curl requires an integer timeout
      curl_connection_timeout = int(connection_timeout)

      url_response, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
        query, "nm_health_alert", None, False, "NodeManager Health", smokeuser,
        connection_timeout=curl_connection_timeout)

      json_response = json.loads(url_response)
    else:
      # execute the query for the JSON that includes the node info
      url_response = urllib2.urlopen(query, timeout=connection_timeout)
      json_response = json.loads(url_response.read())
  except urllib2.HTTPError as httpError:
    label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(httpError.code), query,
      str(httpError))

    return (RESULT_CODE_CRITICAL, [label])
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  Requests the WebHCat status URL of this host, through curl_krb_request when
  security is enabled and through urllib2 otherwise, and compares the
  'status' field of the JSON response with WEBHCAT_OK_RESPONSE.

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """

  result_code = RESULT_CODE_UNKNOWN

  if configurations is None:
    return (result_code, ['There were no configurations supplied to the script.'])

  webhcat_port = WEBHCAT_PORT_DEFAULT
  if TEMPLETON_PORT_KEY in configurations:
    webhcat_port = int(configurations[TEMPLETON_PORT_KEY])

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    # str() so that a boolean flag is handled as well as a string one
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).lower() == 'true'

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  curl_connection_timeout = CURL_CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])
    curl_connection_timeout = str(int(connection_timeout))

  # the alert will always run on the webhcat host
  if host_name is None:
    host_name = socket.getfqdn()

  smokeuser = SMOKEUSER_DEFAULT
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  # webhcat always uses http, never SSL
  query_url = "http://{0}:{1}/templeton/v1/status?user.name={2}".format(host_name, webhcat_port, smokeuser)

  # initialize
  total_time = 0
  json_response = {}

  if security_enabled:
    try:
      # defaults
      smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT
      smokeuser_principal = SMOKEUSER_PRINCIPAL_DEFAULT

      # check script params
      if SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
        smokeuser_principal = parameters[SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY]
      if SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
        smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY]

      # check configurations last as they should always take precedence
      if SMOKEUSER_PRINCIPAL_KEY in configurations:
        smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY]
      if SMOKEUSER_KEYTAB_KEY in configurations:
        smokeuser_keytab = configurations[SMOKEUSER_KEYTAB_KEY]

      # Get the configured Kerberos executable search paths, if any
      kerberos_executable_search_paths = None
      if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
        kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]

      env = Environment.get_instance()

      # first request: only the http status code is asked for
      stdout, stderr, time_millis = curl_krb_request(env.tmp_dir, smokeuser_keytab, smokeuser_principal,
                                                      query_url, "webhcat_alert_cc_", kerberos_executable_search_paths, True,
                                                      "WebHCat Server Status", smokeuser,
                                                      connection_timeout=curl_connection_timeout)

      # check the response code
      response_code = int(stdout)

      # NOTE(review): no exception is being handled at the two returns below,
      # so format_exc() carries no traceback there - confirm that the message
      # templates are meant to receive it

      # 0 indicates no connection
      if response_code == 0:
        label = CRITICAL_CONNECTION_MESSAGE.format(query_url, traceback.format_exc())
        return (RESULT_CODE_CRITICAL, [label])

      # any other response aside from 200 is a problem
      if response_code != 200:
        label = CRITICAL_HTTP_MESSAGE.format(response_code, query_url, traceback.format_exc())
        return (RESULT_CODE_CRITICAL, [label])

      # now that we have the http status and it was 200, get the content
      stdout, stderr, total_time = curl_krb_request(env.tmp_dir, smokeuser_keytab, smokeuser_principal,
                                                      query_url, "webhcat_alert_cc_", kerberos_executable_search_paths,
                                                      False, "WebHCat Server Status", smokeuser,
                                                      connection_timeout=curl_connection_timeout)
      json_response = json.loads(stdout)
    except Exception:
      return (RESULT_CODE_CRITICAL, [traceback.format_exc()])
  else:
    url_response = None

    try:
      # execute the query for the JSON that includes WebHCat status
      start_time = time.time()
      url_response = urllib2.urlopen(query_url, timeout=connection_timeout)
      total_time = time.time() - start_time

      json_response = json.loads(url_response.read())
    except urllib2.HTTPError as httpError:
      label = CRITICAL_HTTP_MESSAGE.format(httpError.code, query_url, traceback.format_exc())
      return (RESULT_CODE_CRITICAL, [label])
    except Exception:
      label = CRITICAL_CONNECTION_MESSAGE.format(query_url, traceback.format_exc())
      return (RESULT_CODE_CRITICAL, [label])
    finally:
      if url_response is not None:
        try:
          url_response.close()
        except Exception:
          # best effort: a failure to close must not change the alert result
          pass

  # if status is not in the response, we can't do any check; return CRIT
  if 'status' not in json_response:
    return (RESULT_CODE_CRITICAL, [CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE + str(json_response)])

  # URL response received, parse it
  try:
    webhcat_status = json_response['status']
  except Exception:
    return (RESULT_CODE_CRITICAL, [CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE + "\n" + traceback.format_exc()])

  # proper JSON received, compare against known value
  if webhcat_status.lower() == WEBHCAT_OK_RESPONSE:
    result_code = RESULT_CODE_OK
    label = OK_MESSAGE.format(total_time, query_url)
  else:
    result_code = RESULT_CODE_CRITICAL
    label = CRITICAL_WEBHCAT_STATUS_MESSAGE.format(webhcat_status)

  return (result_code, [label])
Beispiel #35
0
def get_check_command(oozie_url, host_name, configurations, parameters,
                      only_kinit):
    """
    Builds the shell command that queries the Oozie server status.

    When security is enabled, kinit is executed as the configured user while
    the global Kerberos lock is held; unless only_kinit is set, the kinit is
    skipped when "klist -s" reports a usable credential cache.

    oozie_url : the URL handed to "oozie admin -oozie"
    host_name : the host name that replaces _HOST in the principal
    configurations : a mapping of configuration key to value
    parameters : a mapping of script parameter key to value
    only_kinit : when true, kinit unconditionally instead of testing the
      credential cache first

    Returns a tuple of (command, kerberos_env, user); kerberos_env is None
    when security is not enabled.
    """
    user = configurations[USER_KEY] if USER_KEY in configurations else USER_DEFAULT
    kerberos_env = None

    if is_security_enabled(configurations):
        # NOTE(review): format() below presumably resolves its placeholders
        # from this function's local variables - keep the local names in sync
        # with the template (confirm against the format helper)

        # principal: default first, then the script parameter, and finally the
        # configuration, which always takes precedence; _HOST is expanded only
        # in values that were explicitly supplied
        user_principal = USER_PRINCIPAL_DEFAULT
        if USER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
            user_principal = parameters[USER_PRINCIPAL_SCRIPT_PARAM_KEY].replace(
                '_HOST', host_name.lower())
        if USER_PRINCIPAL_KEY in configurations:
            user_principal = configurations[USER_PRINCIPAL_KEY].replace(
                '_HOST', host_name.lower())

        # keytab: same precedence as the principal
        user_keytab = USER_KEYTAB_DEFAULT
        if USER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
            user_keytab = parameters[USER_KEYTAB_SCRIPT_PARAM_KEY]
        if USER_KEYTAB_KEY in configurations:
            user_keytab = configurations[USER_KEYTAB_KEY]

        # the credentials cache (ccache) file lives in the environment's tmp
        # dir, carries the current process id in its name and is exported
        # through KRB5CCNAME for the commands executed with kerberos_env
        env = Environment.get_instance()
        ccache_file = "{0}{1}oozie_alert_cc_{2}".format(
            env.tmp_dir, os.sep, os.getpid())
        kerberos_env = {'KRB5CCNAME': ccache_file}

        # optional search paths for the Kerberos executables
        kerberos_executable_search_paths = (
            configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]
            if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations else None)

        klist_path_local = get_klist_path(kerberos_executable_search_paths)
        kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
        kinit_part_command = format(
            "{kinit_path_local} -l 5m20s -c {ccache_file} -kt {user_keytab} {user_principal}; ")

        # Unless a kinit is forced, "klist -s" tests the cache first and the
        # kinit only runs when that test fails. The short ticket lifetime
        # limits the number of kinits while still recovering quickly when
        # keytabs are regenerated.
        kinit_command = kinit_part_command
        if not only_kinit:
            klist_check = "{0} -s {1} || ".format(klist_path_local, ccache_file)
            kinit_command = klist_check + kinit_part_command

        # prevent concurrent kinit
        kerberos_lock = global_lock.get_lock(global_lock.LOCK_TYPE_KERBEROS)
        kerberos_lock.acquire()
        try:
            Execute(kinit_command, environment=kerberos_env, user=user)
        finally:
            kerberos_lock.release()

    # the stack root is looked up only when both the stack name and the stack
    # root are configured
    if STACK_NAME_KEY in configurations and STACK_ROOT_KEY in configurations:
        stack_root = stack_tools.get_stack_root(
            configurations[STACK_NAME_KEY],
            configurations[STACK_ROOT_KEY]).lower()
    else:
        stack_root = STACK_ROOT_DEFAULT

    # prefer the configuration directory below the stack root and fall back to
    # the legacy location when that path does not exist
    oozie_config_directory = OOZIE_CONF_DIR.replace(STACK_ROOT_PATTERN,
                                                    stack_root)
    if not os.path.exists(oozie_config_directory):
        oozie_config_directory = OOZIE_CONF_DIR_LEGACY

    command = "source {0}/oozie-env.sh ; oozie admin -oozie {1} -status".format(
        oozie_config_directory, oozie_url)

    return (command, kerberos_env, user)
Beispiel #36
0
 def __init__(self, env=None):
     """
     Stores the environment to work with; when no (or a falsy) env is
     supplied, the instance from Environment.get_instance() is used.
     """
     if env:
         self.env = env
     else:
         self.env = Environment.get_instance()
Beispiel #37
0
def execute(configurations={}, parameters={}, host_name=None):
    """
    Reports whether the HDFS upgrade has been finalized, based on the
    UpgradeFinalized value of the NameNodeInfo JMX bean of the NameNode
    address that belongs to this host.

    configurations : a mapping of configuration key to value
    parameters : a mapping of script parameter key to value
    host_name : the name of this host where the alert is running

    Returns a tuple containing the result code and a list holding the
    pre-formatted result label.
    """
    if configurations is None:
        return ('UNKNOWN',
                ['There were no configurations supplied to the script.'])

    # hdfs-site is required
    if HDFS_SITE_KEY not in configurations:
        message = '{0} is a required parameter for the script'.format(HDFS_SITE_KEY)
        return 'SKIPPED', [message]

    http_policy = (configurations[NN_HTTP_POLICY_KEY]
                   if NN_HTTP_POLICY_KEY in configurations else 'HTTP_ONLY')

    # smokeuser is only defined when it is configured
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = (configurations[EXECUTABLE_SEARCH_PATHS]
                        if EXECUTABLE_SEARCH_PATHS in configurations else None)

    # security counts as enabled only when the value upper-cases to 'TRUE'
    security_enabled = (SECURITY_ENABLED_KEY in configurations and
                        str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE')

    kerberos_keytab = (configurations[KERBEROS_KEYTAB]
                       if KERBEROS_KEYTAB in configurations else None)

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL].replace(
            '_HOST', host_name)

    kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER,
                                    DEFAULT_KERBEROS_KINIT_TIMER_MS)

    # SSL is used only when the policy demands it
    hdfs_site = configurations[HDFS_SITE_KEY]
    if http_policy == "HTTPS_ONLY":
        scheme = "https"
    else:
        scheme = "http"

    # pick the first NameNode address that refers to this host, with or
    # without a port
    nn_addresses = get_all_namenode_addresses(hdfs_site)
    uri = next(
        (candidate for candidate in nn_addresses
         if candidate.startswith(host_name + ":") or candidate == host_name),
        None)
    if not uri:
        return 'SKIPPED', [
            'NameNode on host {0} not found (namenode adresses = {1})'.format(
                host_name, ', '.join(nn_addresses))
        ]

    upgrade_finalized_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(
        scheme, uri)

    # start out assuming an OK status
    label = None
    result_code = "OK"

    try:
        if not security_enabled or kerberos_principal is None or kerberos_keytab is None:
            # unsecured access: read the value straight from JMX
            upgrade_finalized = bool(
                get_value_from_jmx(upgrade_finalized_qry, "UpgradeFinalized"))
        else:
            # secured access: issue the request through curl_krb_request
            env = Environment.get_instance()

            jmx_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir,
                kerberos_keytab,
                kerberos_principal,
                upgrade_finalized_qry,
                "upgrade_finalized_state",
                executable_paths,
                False,
                "HDFS Upgrade Finalized State",
                smokeuser,
                kinit_timer_ms=kinit_timer_ms)

            jmx_json = json.loads(jmx_response)
            first_bean = jmx_json["beans"][0]
            upgrade_finalized = bool(first_bean["UpgradeFinalized"])

        if upgrade_finalized:
            result_code, label = 'OK', "HDFS cluster is not in the upgrade state"
        else:
            result_code, label = 'CRITICAL', "HDFS cluster is not finalized"

    except:
        # any failure while querying is reported as UNKNOWN with the traceback
        label = traceback.format_exc()
        result_code = 'UNKNOWN'

    return (result_code, [label])
Beispiel #38
0
 def __init__(self, name):
     """
     Records the given name together with the instance returned by
     Environment.get_instance().
     """
     current_env = Environment.get_instance()
     self.env = current_env
     self.name = name
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    The alert reads LastCheckpointTime and JournalTransactionInfo from the
    NameNode JMX endpoint. It reports CRITICAL or WARNING when the number of
    transactions since the last checkpoint exceeds the configured threshold
    and the time since the last checkpoint has reached the critical or
    warning percentage of the checkpoint period. Any failure while collecting
    the data is reported as UNKNOWN with the exception text as label.

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """

    if configurations is None:
        return ("UNKNOWN", ["There were no configurations supplied to the script."])

    scheme = "http"
    http_uri = None
    https_uri = None
    http_policy = "HTTP_ONLY"
    checkpoint_tx = CHECKPOINT_TX_DEFAULT
    checkpoint_period = CHECKPOINT_PERIOD_DEFAULT

    if NN_HTTP_ADDRESS_KEY in configurations:
        http_uri = configurations[NN_HTTP_ADDRESS_KEY]

    if NN_HTTPS_ADDRESS_KEY in configurations:
        https_uri = configurations[NN_HTTPS_ADDRESS_KEY]

    if NN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[NN_HTTP_POLICY_KEY]

    if NN_CHECKPOINT_TX_KEY in configurations:
        checkpoint_tx = configurations[NN_CHECKPOINT_TX_KEY]

    if NN_CHECKPOINT_PERIOD_KEY in configurations:
        checkpoint_period = configurations[NN_CHECKPOINT_PERIOD_KEY]

    # NOTE(review): smokeuser is only bound when it is configured; on the
    # Kerberos path a missing value surfaces as an UNKNOWN result through the
    # exception handler below
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = None
    if EXECUTABLE_SEARCH_PATHS in configurations:
        executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

    # security counts as enabled only when the value upper-cases to "TRUE"
    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == "TRUE"

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        kerberos_principal = kerberos_principal.replace("_HOST", host_name)

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    # thresholds supplied as script parameters are scaled by 100 before use
    percent_warning = PERCENT_WARNING_DEFAULT
    if PERCENT_WARNING_KEY in parameters:
        percent_warning = float(parameters[PERCENT_WARNING_KEY]) * 100

    percent_critical = PERCENT_CRITICAL_DEFAULT
    if PERCENT_CRITICAL_KEY in parameters:
        percent_critical = float(parameters[PERCENT_CRITICAL_KEY]) * 100

    # determine the right URI and whether to use SSL; the HTTP address stays
    # in use when HTTPS is requested but no HTTPS address is configured
    uri = http_uri
    if http_policy == "HTTPS_ONLY":
        scheme = "https"

        if https_uri is not None:
            uri = https_uri

    # current time in milliseconds
    current_time = int(round(time.time() * 1000))

    last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format(scheme, uri)
    journal_transaction_info_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(scheme, uri)

    # start out assuming an OK status
    label = None
    result_code = "OK"

    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)

            last_checkpoint_time_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir,
                kerberos_keytab,
                kerberos_principal,
                last_checkpoint_time_qry,
                "checkpoint_time_alert",
                executable_paths,
                False,
                "NameNode Last Checkpoint",
                smokeuser,
                connection_timeout=curl_connection_timeout,
            )

            last_checkpoint_time_response_json = json.loads(last_checkpoint_time_response)
            last_checkpoint_time = int(last_checkpoint_time_response_json["beans"][0]["LastCheckpointTime"])

            journal_transaction_info_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir,
                kerberos_keytab,
                kerberos_principal,
                journal_transaction_info_qry,
                "checkpoint_time_alert",
                executable_paths,
                False,
                "NameNode Last Checkpoint",
                smokeuser,
                connection_timeout=curl_connection_timeout,
            )

            journal_transaction_info_response_json = json.loads(journal_transaction_info_response)
            journal_transaction_info = journal_transaction_info_response_json["beans"][0]["JournalTransactionInfo"]
        else:
            last_checkpoint_time = int(
                get_value_from_jmx(last_checkpoint_time_qry, "LastCheckpointTime", connection_timeout)
            )

            journal_transaction_info = get_value_from_jmx(
                journal_transaction_info_qry, "JournalTransactionInfo", connection_timeout
            )

        # JournalTransactionInfo is itself a JSON document embedded as a string
        journal_transaction_info_dict = json.loads(journal_transaction_info)

        last_tx = int(journal_transaction_info_dict["LastAppliedOrWrittenTxId"])
        most_recent_tx = int(journal_transaction_info_dict["MostRecentCheckpointTxId"])
        transaction_difference = last_tx - most_recent_tx

        # whole seconds since the last checkpoint; floor division keeps the
        # integer result on Python 2 and Python 3 alike
        delta = (current_time - last_checkpoint_time) // 1000

        elapsed = get_time(delta)
        label = LABEL.format(h=elapsed["h"], m=elapsed["m"], tx=transaction_difference)

        # the elapsed share of the checkpoint period only matters once the
        # transaction threshold has been exceeded
        if transaction_difference > int(checkpoint_tx):
            elapsed_percent = float(delta) / int(checkpoint_period) * 100

            if elapsed_percent >= int(percent_critical):
                result_code = "CRITICAL"
            elif elapsed_percent >= int(percent_warning):
                result_code = "WARNING"

    except Exception as e:
        label = str(e)
        result_code = "UNKNOWN"

    return (result_code, [label])
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """

  if configurations is None:
    return (('UNKNOWN', ['There were no configurations supplied to the script.']))

  # defaults used when the corresponding configuration keys are absent
  scheme = 'http'  
  http_uri = None
  https_uri = None
  http_policy = 'HTTP_ONLY'

  # security counts as enabled only when the value upper-cases to 'TRUE'
  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  executable_paths = None
  if EXECUTABLE_SEARCH_PATHS in configurations:
    executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

  kerberos_keytab = None
  if KERBEROS_KEYTAB in configurations:
    kerberos_keytab = configurations[KERBEROS_KEYTAB]

  # the _HOST placeholder of the principal is replaced with host_name
  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
    kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  if NODEMANAGER_HTTP_ADDRESS_KEY in configurations:
    http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY]

  if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations:
    https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY]

  if YARN_HTTP_POLICY_KEY in configurations:
    http_policy = configurations[YARN_HTTP_POLICY_KEY]
    
  # smokeuser is only bound when it is configured
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  # determine the right URI and whether to use SSL
  uri = http_uri
  if http_policy == 'HTTPS_ONLY':
    scheme = 'https'

    if https_uri is not None:
      uri = https_uri

  # only the port of the configured address is kept; it is paired with host_name
  # NOTE(review): uri is still None here when no address was configured, in
  # which case uri.split would raise
  uri = str(host_name) + ":" + uri.split(":")[1]
  live_nodemanagers_qry = "{0}://{1}/jmx?qry=Hadoop:service=ResourceManager,name=RMNMInfo".format(scheme, uri)
  convert_to_json_failed = False
  response_code = None
  try:
    # the Kerberos path is taken only when principal, keytab and the security
    # flag are all available
    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
      env = Environment.get_instance()

      # curl requires an integer timeout
      curl_connection_timeout = int(connection_timeout)

      url_response, error_msg, time_millis  = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
        live_nodemanagers_qry, "nm_health_summary_alert", executable_paths, False,
        "NodeManager Health Summary", smokeuser, connection_timeout=curl_connection_timeout)

      # LiveNodeManagers is decoded a second time, i.e. it is expected to be a
      # JSON document embedded as a string inside the JMX response
      try:
        url_response_json = json.loads(url_response)
        live_nodemanagers = json.loads(url_response_json["beans"][0]["LiveNodeManagers"])
      except ValueError, error:
        convert_to_json_failed = True
        logger.exception("[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}".
        format("NodeManager Health Summary", str(error)))

      # when decoding failed the request is repeated with the seventh argument
      # set to True and the first returned value is kept as response_code
      # (presumably that flag requests only the HTTP status code - TODO confirm)
      if convert_to_json_failed:
        response_code, error_msg, time_millis  = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
          live_nodemanagers_qry, "nm_health_summary_alert", executable_paths, True,
          "NodeManager Health Summary", smokeuser, connection_timeout=curl_connection_timeout)
    # NOTE(review): this snippet is cut off after the else below; the
    # unsecured branch and the handlers of the enclosing try are not visible
    else:
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """
  # UNKNOWN is the result reported when no configurations are supplied
  result_code = RESULT_CODE_UNKNOWN

  if configurations is None:
    return (result_code, ['There were no configurations supplied to the script.'])

  # defaults used when the corresponding configuration keys are absent
  scheme = 'http'
  http_uri = None
  https_uri = None
  http_policy = 'HTTP_ONLY'

  # smokeuser is only bound when it is configured
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  executable_paths = None
  if EXECUTABLE_SEARCH_PATHS in configurations:
    executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

  # security counts as enabled only when the value upper-cases to 'TRUE'
  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  kerberos_keytab = None
  if KERBEROS_KEYTAB in configurations:
    kerberos_keytab = configurations[KERBEROS_KEYTAB]

  # the _HOST placeholder of the principal is replaced with host_name
  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
    kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  if NODEMANAGER_HTTP_ADDRESS_KEY in configurations:
    http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY]

  if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations:
    https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY]

  if YARN_HTTP_POLICY_KEY in configurations:
    http_policy = configurations[YARN_HTTP_POLICY_KEY]


  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])


  # determine the right URI and whether to use SSL
  uri = http_uri
  if http_policy == 'HTTPS_ONLY':
    scheme = 'https'

    if https_uri is not None:
      uri = https_uri

  label = ''
  url_response = None
  node_healthy = 'false'
  total_time = 0

  # some yarn-site structures don't have the web ui address
  # in that case the address is built from the host name (the local FQDN when
  # none was passed in) and the default NodeManager port
  if uri is None:
    if host_name is None:
      host_name = socket.getfqdn()

    uri = '{0}:{1}'.format(host_name, NODEMANAGER_DEFAULT_PORT)
    
  if OSCheck.is_windows_family():
    # NOTE(review): the two-value unpacking requires exactly one ':' in uri
    uri_host, uri_port = uri.split(':')
    # on windows 0.0.0.0 is invalid address to connect but on linux it resolved to 127.0.0.1
    uri_host = resolve_address(uri_host)
    uri = '{0}:{1}'.format(uri_host, uri_port)

  query = "{0}://{1}/ws/v1/node/info".format(scheme,uri)

  try:
    # the Kerberos path is taken only when principal, keytab and the security
    # flag are all available
    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
      env = Environment.get_instance()

      # curl requires an integer timeout
      curl_connection_timeout = int(connection_timeout)

      url_response, error_msg, time_millis  = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
        query, "nm_health_alert", executable_paths, False, "NodeManager Health", smokeuser,
        connection_timeout=curl_connection_timeout)

      json_response = json.loads(url_response)
    else:
      # execute the query for the JSON that includes templeton status
      url_response = urllib2.urlopen(query, timeout=connection_timeout)
      json_response = json.loads(url_response.read())
  # NOTE(review): "except X, name" is Python 2-only syntax
  # NOTE(review): this snippet appears to be cut off after this handler; the
  # visible code neither evaluates json_response nor returns a result on the
  # success path
  except urllib2.HTTPError, httpError:
    # an HTTP error status is reported as CRITICAL together with the traceback
    label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(httpError.code), query,
      str(httpError), traceback.format_exc())

    return (RESULT_CODE_CRITICAL, [label])
def get_check_command(oozie_url, host_name, configurations, parameters):
  """
  Builds the shell command that queries the Oozie server status.

  When security is enabled, "klist -s" tests the alert's credential cache
  and kinit is executed as the smoke user only when that test fails.

  oozie_url : the URL handed to "oozie admin -oozie"
  host_name : not used by this function
  configurations : a mapping of configuration key to value
  parameters : a mapping of script parameter key to value

  Returns a tuple of (command, kerberos_env, smokeuser); kerberos_env is None
  when security is not enabled.
  """
  kerberos_env = None
  smokeuser = configurations[SMOKEUSER_KEY] if SMOKEUSER_KEY in configurations else SMOKEUSER_DEFAULT

  # security counts as enabled only when the value upper-cases to 'TRUE'
  security_enabled = (SECURITY_ENABLED in configurations and
    str(configurations[SECURITY_ENABLED]).upper() == 'TRUE')

  if security_enabled:
    # NOTE(review): format() below presumably resolves its placeholders from
    # this function's local variables - keep the local names in sync with the
    # templates (confirm against the format helper)

    # keytab: default first, then the script parameter, and finally the
    # configuration, which always takes precedence
    smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT
    if SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
      smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY]
    if SMOKEUSER_KEYTAB_KEY in configurations:
      smokeuser_keytab = configurations[SMOKEUSER_KEYTAB_KEY]

    # principal: same precedence as the keytab
    smokeuser_principal = SMOKEUSER_PRINCIPAL_DEFAULT
    if SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
      smokeuser_principal = parameters[SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY]
    if SMOKEUSER_PRINCIPAL_KEY in configurations:
      smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY]

    # the credentials cache (ccache) file lives in the environment's tmp dir,
    # carries the current process id in its name and is exported through
    # KRB5CCNAME for the kinit executed with kerberos_env
    env = Environment.get_instance()
    ccache_file = "{0}{1}oozie_alert_cc_{2}".format(env.tmp_dir, os.sep, os.getpid())
    kerberos_env = {'KRB5CCNAME': ccache_file}

    # optional search paths for the Kerberos executables
    kerberos_executable_search_paths = (
      configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]
      if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations else None)

    klist_path_local = get_klist_path(kerberos_executable_search_paths)
    klist_command = format("{klist_path_local} -s {ccache_file}")

    # A non-zero klist return code means a kinit is needed. The short ticket
    # lifetime limits the number of kinits while still recovering quickly
    # when keytabs are regenerated.
    return_code, _ = call(klist_command, user=smokeuser)
    if return_code != 0:
      kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
      kinit_command = format("{kinit_path_local} -l 5m -kt {smokeuser_keytab} {smokeuser_principal}; ")

      # kinit
      Execute(kinit_command, environment=kerberos_env, user=smokeuser)

  # use OOZIE_CONF_DIR when it exists (a symlink when > HDP 2.2), otherwise
  # fall back to the legacy directory
  oozie_config_directory = OOZIE_CONF_DIR if os.path.exists(OOZIE_CONF_DIR) else OOZIE_CONF_DIR_LEGACY

  command = "source {0}/oozie-env.sh ; oozie admin -oozie {1} -status".format(
    oozie_config_directory, oozie_url)

  return (command, kerberos_env, smokeuser)
def execute(configurations={}, parameters={}, host_name=None):
  """
  Reports whether the HDFS upgrade has been finalized, based on the
  UpgradeFinalized value of the NameNodeInfo JMX bean.

  configurations : a mapping of configuration key to value
  parameters : a mapping of script parameter key to value
  host_name : the name of this host where the alert is running

  Returns a tuple containing the result code and a list holding the
  pre-formatted result label.
  """
  if configurations is None:
    return ('UNKNOWN', ['There were no configurations supplied to the script.'])

  http_uri = configurations[NN_HTTP_ADDRESS_KEY] if NN_HTTP_ADDRESS_KEY in configurations else None
  https_uri = configurations[NN_HTTPS_ADDRESS_KEY] if NN_HTTPS_ADDRESS_KEY in configurations else None
  http_policy = configurations[NN_HTTP_POLICY_KEY] if NN_HTTP_POLICY_KEY in configurations else 'HTTP_ONLY'

  # smokeuser is only defined when it is configured
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  executable_paths = configurations[EXECUTABLE_SEARCH_PATHS] if EXECUTABLE_SEARCH_PATHS in configurations else None

  # security counts as enabled only when the value upper-cases to 'TRUE'
  security_enabled = (SECURITY_ENABLED_KEY in configurations and
    str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE')

  kerberos_keytab = configurations[KERBEROS_KEYTAB] if KERBEROS_KEYTAB in configurations else None

  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    kerberos_principal = configurations[KERBEROS_PRINCIPAL].replace('_HOST', host_name)

  # SSL is used only when the policy demands it; the HTTP address stays in
  # use when no HTTPS address is configured
  if http_policy == 'HTTPS_ONLY':
    scheme = 'https'
    uri = https_uri if https_uri is not None else http_uri
  else:
    scheme = 'http'
    uri = http_uri

  upgrade_finalized_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(scheme, uri)

  # start out assuming an OK status
  label = None
  result_code = "OK"

  try:
    if not security_enabled or kerberos_principal is None or kerberos_keytab is None:
      # unsecured access: read the value straight from JMX
      upgrade_finalized = bool(get_value_from_jmx(upgrade_finalized_qry, "UpgradeFinalized"))
    else:
      # secured access: issue the request through curl_krb_request
      env = Environment.get_instance()

      jmx_response, error_msg, time_millis = curl_krb_request(
        env.tmp_dir, kerberos_keytab, kerberos_principal,
        upgrade_finalized_qry, "upgrade_finalized_state",
        executable_paths, False, "HDFS Upgrade Finalized State", smokeuser)

      first_bean = json.loads(jmx_response)["beans"][0]
      upgrade_finalized = bool(first_bean["UpgradeFinalized"])

    if upgrade_finalized:
      result_code, label = 'OK', "HDFS cluster is not in the upgrade state"
    else:
      result_code, label = 'CRITICAL', "HDFS cluster is not finalized"

  except:
    # any failure while querying is reported as UNKNOWN with the traceback
    label = traceback.format_exc()
    result_code = 'UNKNOWN'

  return (result_code, [label])
Beispiel #44
0
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    The alert reads the UpgradeFinalized value of the NameNodeInfo JMX bean:
    OK when the upgrade is finalized, CRITICAL when it is not, and UNKNOWN
    (with the exception text as label) when the value cannot be obtained.

    Keyword arguments:
    configurations : a mapping of configuration key to value
    parameters : a mapping of script parameter key to value
    host_name : the name of this host where the alert is running

    :type configurations dict
    :type parameters dict
    :type host_name str
    """

    if configurations is None:
        return ('UNKNOWN',
                ['There were no configurations supplied to the script.'])

    scheme = 'http'
    http_uri = None
    https_uri = None
    http_policy = 'HTTP_ONLY'

    if NN_HTTP_ADDRESS_KEY in configurations:
        http_uri = configurations[NN_HTTP_ADDRESS_KEY]

    if NN_HTTPS_ADDRESS_KEY in configurations:
        https_uri = configurations[NN_HTTPS_ADDRESS_KEY]

    if NN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[NN_HTTP_POLICY_KEY]

    # NOTE(review): smokeuser is only bound when it is configured; on the
    # Kerberos path a missing value surfaces as an UNKNOWN result through the
    # exception handler below
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = None
    if EXECUTABLE_SEARCH_PATHS in configurations:
        executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

    # security counts as enabled only when the value upper-cases to 'TRUE'
    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(
            configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # determine the right URI and whether to use SSL; the HTTP address stays
    # in use when HTTPS is requested but no HTTPS address is configured
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'

        if https_uri is not None:
            uri = https_uri

    upgrade_finalized_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(
        scheme, uri)

    # start out assuming an OK status
    label = None
    result_code = "OK"

    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            # secured access: issue the request through curl_krb_request
            env = Environment.get_instance()

            upgrade_finalized_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal,
                upgrade_finalized_qry, "upgrade_finalized_state",
                executable_paths, False, "HDFS Upgrade Finalized State",
                smokeuser)

            upgrade_finalized_response_json = json.loads(
                upgrade_finalized_response)
            upgrade_finalized = bool(upgrade_finalized_response_json["beans"]
                                     [0]["UpgradeFinalized"])

        else:
            # unsecured access: read the value straight from JMX
            upgrade_finalized = bool(
                get_value_from_jmx(upgrade_finalized_qry, "UpgradeFinalized"))

        if upgrade_finalized:
            label = "HDFS cluster is not in the upgrade state"
            result_code = 'OK'
        else:
            label = "HDFS cluster is not finalized"
            result_code = 'CRITICAL'

    except Exception as e:
        label = str(e)
        result_code = 'UNKNOWN'

    return (result_code, [label])
def get_check_command(oozie_url, host_name, configurations, parameters, only_kinit):
  """
  Builds the shell command that queries the Oozie server status.

  When security is enabled, kinit is executed as the configured user; unless
  only_kinit is set, the kinit is skipped when "klist -s" reports a usable
  credential cache.

  oozie_url : the URL handed to "oozie admin -oozie"
  host_name : the host name that replaces _HOST in the principal
  configurations : a mapping of configuration key to value
  parameters : a mapping of script parameter key to value
  only_kinit : when true, kinit unconditionally instead of testing the
    credential cache first

  Returns a tuple of (command, kerberos_env, user); kerberos_env is None when
  security is not enabled.
  """
  user = configurations[USER_KEY] if USER_KEY in configurations else USER_DEFAULT
  kerberos_env = None

  if is_security_enabled(configurations):
    # NOTE(review): format() below presumably resolves its placeholders from
    # this function's local variables - keep the local names in sync with the
    # template (confirm against the format helper)

    # principal: default first, then the script parameter, and finally the
    # configuration, which always takes precedence; _HOST is expanded only in
    # values that were explicitly supplied
    user_principal = USER_PRINCIPAL_DEFAULT
    if USER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
      user_principal = parameters[USER_PRINCIPAL_SCRIPT_PARAM_KEY].replace('_HOST', host_name.lower())
    if USER_PRINCIPAL_KEY in configurations:
      user_principal = configurations[USER_PRINCIPAL_KEY].replace('_HOST', host_name.lower())

    # keytab: same precedence as the principal
    user_keytab = USER_KEYTAB_DEFAULT
    if USER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
      user_keytab = parameters[USER_KEYTAB_SCRIPT_PARAM_KEY]
    if USER_KEYTAB_KEY in configurations:
      user_keytab = configurations[USER_KEYTAB_KEY]

    # the credentials cache (ccache) file lives in the environment's tmp dir,
    # carries the current process id in its name and is exported through
    # KRB5CCNAME for the command executed with kerberos_env
    env = Environment.get_instance()
    ccache_file = "{0}{1}oozie_alert_cc_{2}".format(env.tmp_dir, os.sep, os.getpid())
    kerberos_env = {'KRB5CCNAME': ccache_file}

    # optional search paths for the Kerberos executables
    kerberos_executable_search_paths = (
      configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]
      if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations else None)

    klist_path_local = get_klist_path(kerberos_executable_search_paths)
    kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
    kinit_part_command = format("{kinit_path_local} -l 5m20s -c {ccache_file} -kt {user_keytab} {user_principal}; ")

    # Unless a kinit is forced, "klist -s" tests the cache first and the kinit
    # only runs when that test fails. The short ticket lifetime limits the
    # number of kinits while still recovering quickly when keytabs are
    # regenerated.
    kinit_command = kinit_part_command
    if not only_kinit:
      klist_check = "{0} -s {1} || ".format(klist_path_local, ccache_file)
      kinit_command = klist_check + kinit_part_command

    Execute(kinit_command, environment=kerberos_env, user=user)

  # use OOZIE_CONF_DIR when it exists (a symlink when > HDP 2.2), otherwise
  # fall back to the legacy directory
  oozie_config_directory = OOZIE_CONF_DIR if os.path.exists(OOZIE_CONF_DIR) else OOZIE_CONF_DIR_LEGACY

  command = "source {0}/oozie-env.sh ; oozie admin -oozie {1} -status".format(
    oozie_config_directory, oozie_url)

  return (command, kerberos_env, user)
def execute(configurations={}, parameters={}, host_name=None):
  """
  Checks the WebHCat status endpoint (/templeton/v1/status) on this host.

  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running;
    the local FQDN is used when it is None
  """

  result_code = RESULT_CODE_UNKNOWN

  if configurations is None:
    return (result_code, ['There were no configurations supplied to the script.'])

  webhcat_port = WEBHCAT_PORT_DEFAULT
  if TEMPLETON_PORT_KEY in configurations:
    webhcat_port = int(configurations[TEMPLETON_PORT_KEY])

  # str() first so that a boolean flag is accepted as well as its string form
  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).lower() == 'true'

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  curl_connection_timeout = CURL_CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])
    # curl is given the timeout as a whole number of seconds, in string form
    curl_connection_timeout = str(int(connection_timeout))

  # the alert will always run on the webhcat host
  if host_name is None:
    host_name = socket.getfqdn()

  # webhcat always uses http, never SSL
  query_url = "http://{0}:{1}/templeton/v1/status".format(host_name, webhcat_port)

  # initialize
  total_time = 0
  json_response = {}

  if security_enabled:
    if WEBHCAT_KEYTAB_KEY not in configurations or WEBHCAT_PRINCIPAL_KEY not in configurations:
      # report what is missing instead of dumping the whole configuration map
      return (RESULT_CODE_UNKNOWN, [
        'The WebHCat keytab and principal are required configurations when security is enabled.'])

    try:
      # NOTE(review): the bare format(...) calls below take no arguments, so
      # they presumably resolve their {placeholders} from these local
      # variables - keep the local names in sync with the templates.
      webhcat_keytab = configurations[WEBHCAT_KEYTAB_KEY]
      webhcat_principal = configurations[WEBHCAT_PRINCIPAL_KEY]

      # substitute _HOST in kerberos principal with actual fqdn
      webhcat_principal = webhcat_principal.replace('_HOST', host_name)

      # Create the kerberos credentials cache (ccache) file and set it in the environment to use
      # when executing curl
      env = Environment.get_instance()
      ccache_file = "{0}{1}webhcat_alert_cc_{2}".format(env.tmp_dir, sep, getpid())
      kerberos_env = {'KRB5CCNAME': ccache_file}

      # Get the configured Kerberos executable search paths, if any
      if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
        kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]
      else:
        kerberos_executable_search_paths = None

      klist_path_local = get_klist_path(kerberos_executable_search_paths)
      klist_command = format("{klist_path_local} -s {ccache_file}")

      # Determine if we need to kinit by testing to see if the relevant cache exists and has
      # non-expired tickets.  Tickets are requested with a 5 minute lifetime to reduce the
      # number of kinits we do while still recovering quickly when keytabs are regenerated
      return_code, _ = call(klist_command)
      if return_code != 0:
        kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
        kinit_command = format("{kinit_path_local} -l 5m -c {ccache_file} -kt {webhcat_keytab} {webhcat_principal}; ")

        # kinit so that curl will work with --negotiate
        Execute(kinit_command)

      # make a single curl call to get just the http code
      curl = subprocess.Popen(['curl', '--negotiate', '-u', ':', '-sL', '-w',
        '%{http_code}', '--connect-timeout', curl_connection_timeout,
        '-o', '/dev/null', query_url], stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=kerberos_env)

      stdout, stderr = curl.communicate()

      if stderr != '':
        raise Exception(stderr)

      # check the response code
      response_code = int(stdout)

      # 0 indicates no connection
      if response_code == 0:
        label = CRITICAL_CONNECTION_MESSAGE.format(query_url)
        return (RESULT_CODE_CRITICAL, [label])

      # any other response aside from 200 is a problem
      if response_code != 200:
        label = CRITICAL_HTTP_MESSAGE.format(response_code, query_url)
        return (RESULT_CODE_CRITICAL, [label])

      # now that we have the http status and it was 200, get the content
      start_time = time.time()
      curl = subprocess.Popen(['curl', '--negotiate', '-u', ':', '-sL',
        '--connect-timeout', curl_connection_timeout, query_url, ],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=kerberos_env)

      stdout, stderr = curl.communicate()
      total_time = time.time() - start_time

      if stderr != '':
        raise Exception(stderr)

      json_response = json.loads(stdout)
    except Exception as exception:
      # any failure while authenticating or querying is reported as CRITICAL
      return (RESULT_CODE_CRITICAL, [str(exception)])

  # NOTE(review): as written, nothing is returned when the secured request
  # succeeds or when security is disabled; json_response and total_time are
  # computed above but never turned into a result - confirm whether the rest
  # of this function is missing.
def execute(configurations={}, parameters={}, host_name=None):
    """
    Reports the health of the NameNode HA topology as seen from this host.

    Every NameNode listed for the configured nameservice is asked for its HA
    state over JMX; the topology is healthy only with exactly one active and
    one standby NameNode.

    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running;
      the local FQDN is used when it is None
    """
    if configurations is None:
        return (RESULT_STATE_UNKNOWN,
                ['There were no configurations supplied to the script.'])

    # if not in HA mode, then SKIP
    if NAMESERVICE_KEY not in configurations:
        return (RESULT_STATE_SKIPPED, ['NameNode HA is not enabled'])

    # hdfs-site is required
    if HDFS_SITE_KEY not in configurations:
        return (RESULT_STATE_UNKNOWN, [
            '{0} is a required parameter for the script'.format(HDFS_SITE_KEY)
        ])

    # host_name feeds the _HOST substitution and the "is this my NameNode"
    # checks below, all of which fail on None
    if host_name is None:
        import socket
        host_name = socket.getfqdn()

    # NOTE(review): smokeuser is only bound when SMOKEUSER_KEY is configured;
    # if it is missing, the kerberized request below raises NameError, which
    # the handler in the loop turns into an "unknown" NameNode.
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(
            configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # SSL is used only when the policy is exactly HTTPS_ONLY
    is_ssl_enabled = (DFS_POLICY_KEY in configurations
                      and configurations[DFS_POLICY_KEY] == "HTTPS_ONLY")

    name_service = configurations[NAMESERVICE_KEY]
    hdfs_site = configurations[HDFS_SITE_KEY]

    # look for dfs.ha.namenodes.foo
    nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
    if nn_unique_ids_key not in hdfs_site:
        return (RESULT_STATE_UNKNOWN, [
            'Unable to find unique namenode alias key {0}'.format(
                nn_unique_ids_key)
        ])

    if is_ssl_enabled:
        namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
        jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"
    else:
        namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
        jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

    use_kerberos = (kerberos_principal is not None
                    and kerberos_keytab is not None and security_enabled)

    active_namenodes = []
    standby_namenodes = []
    unknown_namenodes = []

    # now we have something like 'nn1,nn2,nn3,nn4'
    # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
    # ie dfs.namenode.http-address.hacluster.nn1
    nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
    for nn_unique_id in nn_unique_ids:
        key = namenode_http_fragment.format(name_service, nn_unique_id)

        # aliases without a configured address are ignored entirely
        if key not in hdfs_site:
            continue

        # use str() to ensure that unicode strings do not have the u' in them
        value = str(hdfs_site[key])

        try:
            jmx_uri = jmx_uri_fragment.format(value)
            if use_kerberos:
                env = Environment.get_instance()

                # curl requires an integer timeout
                curl_connection_timeout = int(connection_timeout)

                state_response, error_msg, time_millis = curl_krb_request(
                    env.tmp_dir,
                    kerberos_keytab,
                    kerberos_principal,
                    jmx_uri,
                    "ha_nn_health",
                    None,
                    False,
                    "NameNode High Availability Health",
                    smokeuser,
                    connection_timeout=curl_connection_timeout)
            else:
                state_response = get_jmx(jmx_uri, connection_timeout)

            state = _get_ha_state_from_json(state_response)

            if state == HDFS_NN_STATE_ACTIVE:
                active_namenodes.append(value)
            elif state == HDFS_NN_STATE_STANDBY:
                standby_namenodes.append(value)
            else:
                unknown_namenodes.append(value)
        except Exception:
            # a NameNode that cannot be queried or parsed counts as unknown;
            # KeyboardInterrupt/SystemExit are deliberately not swallowed
            unknown_namenodes.append(value)

    # now that the request is done, determine if this host is the host that
    # should report the status of the HA topology
    is_active_namenode = any(
        active_namenode.startswith(host_name)
        for active_namenode in active_namenodes)

    # there's only one scenario here; there is exactly 1 active and 1 standby
    is_topology_healthy = (len(active_namenodes) == 1
                           and len(standby_namenodes) == 1)

    result_label = 'Active{0}, Standby{1}, Unknown{2}'.format(
        str(active_namenodes), str(standby_namenodes), str(unknown_namenodes))

    # Healthy Topology:
    #   - Active NN reports the alert, standby does not
    #
    # Unhealthy Topology:
    #   - Report the alert if this is the first named host
    #   - Every other host skips, so the problem is reported only once
    if is_topology_healthy:
        if is_active_namenode:
            return (RESULT_STATE_OK, [result_label])

        return (RESULT_STATE_SKIPPED,
                ['Another host will report this alert'])

    # dfs.namenode.rpc-address.service.alias is guaranteed in HA mode
    first_listed_host_key = 'dfs.namenode.rpc-address.{0}.{1}'.format(
        name_service, nn_unique_ids[0])

    first_listed_host = ''
    if first_listed_host_key in hdfs_site:
        first_listed_host = hdfs_site[first_listed_host_key]

    if first_listed_host.startswith(host_name):
        return (RESULT_STATE_CRITICAL, [result_label])

    # not the first listed host, but the first host might be in the unknown
    return (RESULT_STATE_SKIPPED,
            ['Another host will report this alert'])
def execute(configs={}, parameters={}, host_name=None):
    """
    Checks that every Solr shard and replica reported by the collections
    CLUSTERSTATUS API on this host is in the 'active' state.

    Returns a tuple containing the result code and a list with one
    pre-formatted result label.

    Keyword arguments:
    configs (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running;
      the local FQDN is used when it is None
    """
    if configs is None:
        return 'UNKNOWN', [
            'There were no configurations supplied to the script.'
        ]

    if host_name is None:
        host_name = socket.getfqdn()

    env = Environment.get_instance()

    solr_user = configs[SMOKEUSER_KEY]

    security_enabled = False
    if SECURITY_ENABLED_KEY in configs:
        security_enabled = str(configs[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    # the name of the config key holding the SSL flag can be overridden by a
    # script parameter
    ui_ssl_enabled_key = UI_SSL_ENABLED_KEY_DEFAULT
    if UI_SSL_ENABLED_KEY in parameters:
        ui_ssl_enabled_key = parameters[UI_SSL_ENABLED_KEY]

    # read the flag from the same key that was just tested for presence
    ui_ssl_enabled = False
    if ui_ssl_enabled_key in configs:
        ui_ssl_enabled = str(configs[ui_ssl_enabled_key]).upper() == 'TRUE'

    solr_port = SOLR_PORT_DEFAULT
    if SOLR_PORT_KEY in parameters:
        solr_port = parameters[SOLR_PORT_KEY]

    connection_timeout = SOLR_CONNECTION_TIMEOUT_DEFAULT
    if SOLR_CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = parameters[SOLR_CONNECTION_TIMEOUT_KEY]

    if security_enabled:
        try:
            security_auth(configs, host_name, solr_user)
        except Exception as e:
            return RESULT_CODE_CRITICAL, ["kinit error: " + str(e)]

    scheme = "https" if ui_ssl_enabled else "http"

    # curl writes the cluster status JSON into a scratch file under tmp_dir
    state_file = "{}/solrstatus.json".format(env.tmp_dir)
    cmd = "curl -s -m {} -o {} --negotiate -u: -k '{}://{}:{}/solr/admin/collections?action=clusterstatus&wt=json'".format(
        connection_timeout, state_file, scheme, host_name, solr_port)
    try:
        Execute(cmd, tries=2, try_sleep=3, user=solr_user, logoutput=True)
    except Exception:
        return (RESULT_CODE_CRITICAL,
                ["curl cannot reach Solr, solr seems to be down"])

    try:
        # close the handle deterministically instead of leaking it
        with open(state_file) as state_handle:
            state = json.load(state_handle)
    except Exception:
        return (RESULT_CODE_CRITICAL,
                ["Get status failed, could not load state file"])

    os.remove(state_file)
    cluster_state = state['cluster']['collections']
    outdata = {'shards': [], 'replicas': []}

    # collect every replica and shard whose state is anything but 'active'
    for key in cluster_state:
        for shard, shard_data in cluster_state[key]['shards'].items():
            for replica, replica_data in shard_data['replicas'].items():
                if replica_data['state'] != 'active':
                    rname = '-'.join([key, shard, replica])
                    outdata['replicas'].append({rname: replica_data['state']})

            if shard_data['state'] != 'active':
                sname = '-'.join([key, shard])
                outdata['shards'].append({sname: shard_data['state']})

    if outdata['shards'] or outdata['replicas']:
        return (RESULT_CODE_CRITICAL, [
            "Replicas or Shards found not active. %s" % json.dumps(outdata)
        ])

    return (RESULT_CODE_OK, ["All Shards and replicas healthy"])
def execute(parameters=None, host_name=None):
  """
  Checks the Oozie server by running "oozie admin -status" against it.

  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  parameters (dictionary): a mapping of parameter key to value
  host_name (string): the name of this host where the alert is running;
    the local FQDN is used for the _HOST substitution when it is None
  """

  if parameters is None:
    return (RESULT_CODE_UNKNOWN, ['There were no parameters supplied to the script.'])

  if OOZIE_URL_KEY not in parameters:
    return (RESULT_CODE_UNKNOWN, ['The Oozie URL is a required parameter.'])

  # use localhost on Windows, 0.0.0.0 on others; 0.0.0.0 means bind to all
  # interfaces, which doesn't work on Windows
  localhost_address = 'localhost' if OSCheck.get_os_family() == OSConst.WINSRV_FAMILY else '0.0.0.0'

  # point the configured URL at the local address instead of its hostname
  oozie_url = parameters[OOZIE_URL_KEY]
  oozie_url = oozie_url.replace(urlparse(oozie_url).hostname,localhost_address)

  security_enabled = False
  if SECURITY_ENABLED in parameters:
    security_enabled = str(parameters[SECURITY_ENABLED]).upper() == 'TRUE'

  # NOTE(review): the bare format(...) calls take no arguments, so they
  # presumably resolve their {placeholders} from these local variables -
  # keep the local names in sync with the templates.
  command = format("source /etc/oozie/conf/oozie-env.sh ; oozie admin -oozie {oozie_url} -status")

  try:
    # kinit if security is enabled so that oozie-env.sh can make the web request
    kerberos_env = None

    if security_enabled:
      if OOZIE_KEYTAB not in parameters or OOZIE_PRINCIPAL not in parameters:
        return (RESULT_CODE_UNKNOWN, ['The Oozie keytab and principal are required parameters when security is enabled.'])

      oozie_keytab = parameters[OOZIE_KEYTAB]
      oozie_principal = parameters[OOZIE_PRINCIPAL]

      # substitute _HOST in kerberos principal with actual fqdn
      if host_name is None:
        import socket
        host_name = socket.getfqdn()
      oozie_principal = oozie_principal.replace('_HOST', host_name)

      # Create the kerberos credentials cache (ccache) file and set it in the environment to use
      # when executing the commands below
      env = Environment.get_instance()
      ccache_file = "{0}{1}oozie_alert_cc_{2}".format(env.tmp_dir, sep, getpid())
      kerberos_env = {'KRB5CCNAME': ccache_file}

      klist_path_local = get_klist_path()
      klist_command = format("{klist_path_local} -s {ccache_file}")

      # Determine if we need to kinit by testing to see if the relevant cache exists and has
      # non-expired tickets.  Tickets are requested with a 5 minute lifetime to reduce the
      # number of kinits we do while still recovering quickly when keytabs are regenerated
      return_code, _ = call(klist_command)
      if return_code != 0:
        kinit_path_local = get_kinit_path()
        # no -c flag here: the cache location comes from KRB5CCNAME in kerberos_env
        kinit_command = format("{kinit_path_local} -l 5m -kt {oozie_keytab} {oozie_principal}; ")

        # kinit
        Execute(kinit_command, environment=kerberos_env)

    # execute the command
    Execute(command, environment=kerberos_env)

    return (RESULT_CODE_OK, ["Successful connection to {0}".format(oozie_url)])

  except Exception as ex:
    # any failure to authenticate or to run the status command is CRITICAL
    return (RESULT_CODE_CRITICAL, [str(ex)])
Beispiel #50
0
def execute(configurations={}, parameters={}, host_name=None):
  """
  Checks how long ago, and how many transactions ago, the NameNode last
  checkpointed, using values read from the NameNode JMX endpoint.

  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running;
    the local FQDN is used for the _HOST substitution when it is None
  """

  if configurations is None:
    return ('UNKNOWN', ['There were no configurations supplied to the script.'])

  scheme = 'http'
  http_uri = None
  https_uri = None
  http_policy = 'HTTP_ONLY'
  checkpoint_tx = CHECKPOINT_TX_DEFAULT
  checkpoint_period = CHECKPOINT_PERIOD_DEFAULT

  if NN_HTTP_ADDRESS_KEY in configurations:
    http_uri = configurations[NN_HTTP_ADDRESS_KEY]

  if NN_HTTPS_ADDRESS_KEY in configurations:
    https_uri = configurations[NN_HTTPS_ADDRESS_KEY]

  if NN_HTTP_POLICY_KEY in configurations:
    http_policy = configurations[NN_HTTP_POLICY_KEY]

  if NN_CHECKPOINT_TX_KEY in configurations:
    checkpoint_tx = configurations[NN_CHECKPOINT_TX_KEY]

  if NN_CHECKPOINT_PERIOD_KEY in configurations:
    checkpoint_period = configurations[NN_CHECKPOINT_PERIOD_KEY]

  # NOTE(review): smokeuser is only bound when SMOKEUSER_KEY is configured;
  # if it is missing, the kerberized request below raises NameError, which the
  # handler at the bottom reports as UNKNOWN.
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  executable_paths = None
  if EXECUTABLE_SEARCH_PATHS in configurations:
    executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  kerberos_keytab = None
  if KERBEROS_KEYTAB in configurations:
    kerberos_keytab = configurations[KERBEROS_KEYTAB]

  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    # the _HOST substitution needs a real host name
    if host_name is None:
      import socket
      host_name = socket.getfqdn()

    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
    kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  percent_warning = PERCENT_WARNING_DEFAULT
  if PERCENT_WARNING_KEY in parameters:
    percent_warning = float(parameters[PERCENT_WARNING_KEY])

  percent_critical = PERCENT_CRITICAL_DEFAULT
  if PERCENT_CRITICAL_KEY in parameters:
    percent_critical = float(parameters[PERCENT_CRITICAL_KEY])

  checkpoint_txn_multiplier_warning = CHECKPOINT_TX_MULTIPLIER_WARNING_DEFAULT
  if CHECKPOINT_TX_MULTIPLIER_WARNING_KEY in parameters:
    checkpoint_txn_multiplier_warning = float(parameters[CHECKPOINT_TX_MULTIPLIER_WARNING_KEY])

  checkpoint_txn_multiplier_critical = CHECKPOINT_TX_MULTIPLIER_CRITICAL_DEFAULT
  if CHECKPOINT_TX_MULTIPLIER_CRITICAL_KEY in parameters:
    checkpoint_txn_multiplier_critical = float(parameters[CHECKPOINT_TX_MULTIPLIER_CRITICAL_KEY])

  kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS)

  # determine the right URI and whether to use SSL
  uri = http_uri
  if http_policy == 'HTTPS_ONLY':
    scheme = 'https'

    if https_uri is not None:
      uri = https_uri

  # current time in milliseconds, compared against LastCheckpointTime below
  current_time = int(round(time.time() * 1000))

  last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format(scheme,uri)
  journal_transaction_info_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(scheme,uri)

  # start out assuming an OK status
  label = None
  result_code = "OK"

  try:
    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
      env = Environment.get_instance()

      # curl requires an integer timeout
      curl_connection_timeout = int(connection_timeout)

      last_checkpoint_time_response, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab,
        kerberos_principal, last_checkpoint_time_qry,"checkpoint_time_alert", executable_paths, False,
        "NameNode Last Checkpoint", smokeuser, connection_timeout=curl_connection_timeout,
        kinit_timer_ms = kinit_timer_ms)

      last_checkpoint_time_response_json = json.loads(last_checkpoint_time_response)
      last_checkpoint_time = int(last_checkpoint_time_response_json["beans"][0]["LastCheckpointTime"])

      journal_transaction_info_response, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab,
        kerberos_principal, journal_transaction_info_qry,"checkpoint_time_alert", executable_paths,
        False, "NameNode Last Checkpoint", smokeuser, connection_timeout=curl_connection_timeout,
        kinit_timer_ms = kinit_timer_ms)

      journal_transaction_info_response_json = json.loads(journal_transaction_info_response)
      journal_transaction_info = journal_transaction_info_response_json["beans"][0]["JournalTransactionInfo"]
    else:
      last_checkpoint_time = int(get_value_from_jmx(last_checkpoint_time_qry,
        "LastCheckpointTime", connection_timeout))

      journal_transaction_info = get_value_from_jmx(journal_transaction_info_qry,
        "JournalTransactionInfo", connection_timeout)

    # JournalTransactionInfo is itself a JSON document embedded in the bean
    journal_transaction_info_dict = json.loads(journal_transaction_info)

    last_tx = int(journal_transaction_info_dict['LastAppliedOrWrittenTxId'])
    most_recent_tx = int(journal_transaction_info_dict['MostRecentCheckpointTxId'])
    transaction_difference = last_tx - most_recent_tx

    # milliseconds -> seconds since the last checkpoint
    delta = (current_time - last_checkpoint_time)/1000

    # presumably get_time is a pure conversion, so compute it only once
    elapsed = get_time(delta)
    label = LABEL.format(h=elapsed['h'], m=elapsed['m'], tx=transaction_difference)

    is_checkpoint_txn_warning = transaction_difference > checkpoint_txn_multiplier_warning * int(checkpoint_tx)
    is_checkpoint_txn_critical = transaction_difference > checkpoint_txn_multiplier_critical * int(checkpoint_tx)

    # Either too many uncommitted transactions or missed check-pointing for
    # long time decided by the thresholds
    if is_checkpoint_txn_critical or (float(delta) / int(checkpoint_period)*100 >= int(percent_critical)):
      logger.debug('Raising critical alert: transaction_difference = {0}, checkpoint_tx = {1}'.format(transaction_difference, checkpoint_tx))
      result_code = 'CRITICAL'
    elif is_checkpoint_txn_warning or (float(delta) / int(checkpoint_period)*100 >= int(percent_warning)):
      logger.debug('Raising warning alert: transaction_difference = {0}, checkpoint_tx = {1}'.format(transaction_difference, checkpoint_tx))
      result_code = 'WARNING'

  except Exception:
    # any failure to fetch or interpret the JMX data is reported as UNKNOWN
    # with the traceback as the label; KeyboardInterrupt/SystemExit propagate
    label = traceback.format_exc()
    result_code = 'UNKNOWN'

  return (result_code, [label])
Beispiel #51
0
 def __init__(self, name):
     """Bind this object to the current Environment instance and store its name."""
     current_environment = Environment.get_instance()
     self.env = current_environment
     self.name = name
def execute(configurations={}, parameters={}, host_name=None):
    """
  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """

    if configurations is None:
        return (('UNKNOWN',
                 ['There were no configurations supplied to the script.']))

    scheme = 'http'
    http_uri = None
    https_uri = None
    http_policy = 'HTTP_ONLY'

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(
            configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    if NODEMANAGER_HTTP_ADDRESS_KEY in configurations:
        http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY]

    if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations:
        https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY]

    if YARN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[YARN_HTTP_POLICY_KEY]

    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    # determine the right URI and whether to use SSL
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'

        if https_uri is not None:
            uri = https_uri

    uri = str(host_name) + ":" + uri.split(":")[1]
    live_nodemanagers_qry = "{0}://{1}/jmx?qry=Hadoop:service=ResourceManager,name=RMNMInfo".format(
        scheme, uri)
    convert_to_json_failed = False
    response_code = None
    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()
            url_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal,
                live_nodemanagers_qry, "nm_health_summary_alert", None, False,
                "NodeManager Health Summary", smokeuser)
            try:
                url_response_json = json.loads(url_response)
                live_nodemanagers = json.loads(
                    url_response_json["beans"][0]["LiveNodeManagers"])
            except ValueError, error:
                convert_to_json_failed = True
                if logger.isEnabledFor(logging.DEBUG):
                    logger.exception(
                        "[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}"
                        .format("NodeManager Health Summary", str(error)))

            if convert_to_json_failed:
                response_code, error_msg, time_millis = curl_krb_request(
                    env.tmp_dir, kerberos_keytab, kerberos_principal,
                    live_nodemanagers_qry, "nm_health_summary_alert", None,
                    True, "NodeManager Health Summary", smokeuser)
        else: