Ejemplo n.º 1
0
  def _load_jmx(self, ssl, host, port, jmx_metric):
    """
    Requests JMX data from ``host:port`` for every key of
    ``jmx_metric.property_map``.

    For each property key a ``/jmx?qry=<key>`` URL is built (https when
    ``ssl`` is truthy, http otherwise). The request is made with
    ``curl_krb_request`` when a kerberos principal and keytab are configured
    and security is enabled; otherwise it is made with ``urllib2``.

    NOTE(review): this excerpt ends at the ``finally`` clause, so how
    ``content`` is consumed, how ``value_list`` is filled and what the method
    returns are not visible here.
    """
    value_list = []
    kerberos_keytab = None
    kerberos_principal = None

    if logger.isEnabledFor(logging.DEBUG):
      logger.debug(str(jmx_metric.property_map))

    # security counts as enabled only when the configured value reads "true"
    # (compared case-insensitively after conversion to a string)
    security_enabled = str(self._get_configuration_value(SECURITY_ENABLED_KEY)).upper() == 'TRUE'

    if self.uri_property_keys.kerberos_principal is not None:
      kerberos_principal = self._get_configuration_value(
      self.uri_property_keys.kerberos_principal)

      if kerberos_principal is not None:
        # substitute _HOST in kerberos principal with actual fqdn
        kerberos_principal = kerberos_principal.replace('_HOST', self.host_name)

    if self.uri_property_keys.kerberos_keytab is not None:
      kerberos_keytab = self._get_configuration_value(self.uri_property_keys.kerberos_keytab)

    # a host containing 0.0.0.0 is replaced with this alert's own host name
    if "0.0.0.0" in str(host):
      host = self.host_name

    # one request is made per JMX property key
    for jmx_property_key, jmx_property_value in jmx_metric.property_map.iteritems():
      url = "{0}://{1}:{2}/jmx?qry={3}".format(
        "https" if ssl else "http", host, str(port), jmx_property_key)

      # the non-kerberos branch below uses a custom header processor that will
      # look for the non-standard "Refresh" header and attempt to follow the redirect
      response = None
      content = ''
      try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
          # fall back to the system temp directory when the agent has none
          tmp_dir = Constants.AGENT_TMP_DIR
          if tmp_dir is None:
            tmp_dir = gettempdir()

          kerberos_executable_search_paths = self._get_configuration_value('{{kerberos-env/executable_search_paths}}')
          smokeuser = self._get_configuration_value('{{cluster-env/smokeuser}}')

          response, error_msg, time_millis = curl_krb_request(tmp_dir, kerberos_keytab, kerberos_principal, url,
            "metric_alert", kerberos_executable_search_paths, False, self.get_name(), smokeuser,
            connection_timeout=self.curl_connection_timeout, kinit_timer_ms = self.kinit_timeout)

          # in this branch the first element returned by curl_krb_request is
          # used directly as the content
          content = response
        else:
          url_opener = urllib2.build_opener(RefreshHeaderProcessor())
          response = url_opener.open(url, timeout=self.connection_timeout)
          content = response.read()
      except Exception, exception:
        # request failures are only logged (at DEBUG level); they are not re-raised
        if logger.isEnabledFor(logging.DEBUG):
          logger.exception("[Alert][{0}] Unable to make a web request: {1}".format(self.get_name(), str(exception)))
      finally:
Ejemplo n.º 2
0
  def _make_web_request(self, url):
    """
    Makes an http(s) request to a web resource and wraps the outcome in a
    WebResponse.

    The request goes through curl_krb_request when a kerberos principal and
    keytab are configured and cluster security is enabled; otherwise it is
    delegated to self._make_web_request_urllib.

    :param url: the URL to request
    :return: a WebResponse holding the status code, time_millis and error
      message of the request. If making the request raised an exception, the
      status code and time_millis are 0 and the error message is the text of
      the exception.
    """
    try:
      kerberos_keytab = None
      kerberos_principal = None

      if self.uri_property_keys.kerberos_principal is not None:
        kerberos_principal = self._get_configuration_value(
          self.uri_property_keys.kerberos_principal)

        if kerberos_principal is not None:
          # substitute _HOST in kerberos principal with actual fqdn
          kerberos_principal = kerberos_principal.replace('_HOST', self.host_name)

      if self.uri_property_keys.kerberos_keytab is not None:
        kerberos_keytab = self._get_configuration_value(self.uri_property_keys.kerberos_keytab)

      # Coerce to str before comparing so that a missing (None) or non-string
      # (for example boolean) configuration value cannot raise AttributeError.
      security_enabled = str(self._get_configuration_value(
        '{{cluster-env/security_enabled}}')).lower() == "true"

      if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
        # fall back to the system temp directory when the agent has none
        tmp_dir = Constants.AGENT_TMP_DIR
        if tmp_dir is None:
          tmp_dir = gettempdir()

        # Get the configured Kerberos executables search paths, if any
        kerberos_executable_search_paths = self._get_configuration_value('{{kerberos-env/executable_search_paths}}')
        smokeuser = self._get_configuration_value('{{cluster-env/smokeuser}}')

        response_code, error_msg, time_millis = curl_krb_request(tmp_dir, kerberos_keytab, kerberos_principal, url,
          "web_alert", kerberos_executable_search_paths, True, self.get_name(), smokeuser,
          connection_timeout=self.curl_connection_timeout)
      else:
        # kerberos is not involved; use urllib2
        response_code, time_millis, error_msg = self._make_web_request_urllib(url)

      return WebResponse(status_code=response_code, time_millis=time_millis,
        error_msg=error_msg)

    except Exception as exception:
      # the failure is reported to the caller through the WebResponse; the
      # stack trace is only logged when DEBUG logging is enabled
      if logger.isEnabledFor(logging.DEBUG):
        logger.exception("[Alert][{0}] Unable to make a web request.".format(self.get_name()))

      return WebResponse(status_code=0, time_millis=0, error_msg=str(exception))
Ejemplo n.º 3
0
  def call_curl_request(self, user, keytab, principal, url, flag_http_response,
                        request_method='GET', request_body='', header=''):
    """
    Issues a request through curl_krb_request on behalf of a service user.

    :param user: service user for which the call is made
    :param keytab: keytab of the service user
    :param principal: principal of the service user
    :param url: url to call
    :param flag_http_response: flag selecting whether only the response code
      or the response string is returned
    :param request_method: http method (GET / POST / PUT / DELETE)
    :param request_body: data to send along with the request
    :param header: http header required for the call
    :return: the tuple (response, error_msg, time_millis) produced by
      curl_krb_request
    """
    tmp_dir = Environment.get_instance().tmp_dir

    result, failure, elapsed_millis = curl_krb_request(
      tmp_dir, keytab, principal, url, 'ranger_admin_calls', None,
      flag_http_response, "Ranger-Admin API calls", user,
      kinit_timer_ms=0,
      method=request_method,
      body=request_body,
      header=header)

    return result, failure, elapsed_millis
Ejemplo n.º 4
0
def get_jmx_data(nn_address, modeler_type, metric, encrypted=False, security_enabled=False):
  """
  Reads one metric from the JMX endpoint of a NameNode.

  :param nn_address: Namenode Address, e.g., host:port, ** MAY ** be preceded with "http://" or "https://" already.
  If not preceded, will use the encrypted param to determine.
  :param modeler_type: Modeler type to query using startswith function
  :param metric: Metric to return
  :param encrypted: when nn_address has no scheme, use "https://" if true and "http://" otherwise
  :param security_enabled: when true, the endpoint is requested through curl_krb_request with the
  smoke user values taken from the params module; otherwise it is requested with urllib2
  :return: Return an object representation of the metric, or None if it does not exist
  """
  if not nn_address or not modeler_type or not metric:
    return None

  nn_address = nn_address.strip()
  if not nn_address.startswith("http"):
    nn_address = ("https://" if encrypted else "http://") + nn_address
  if not nn_address.endswith("/"):
    nn_address = nn_address + "/"

  nn_address = nn_address + "jmx"
  Logger.info("Retrieve modeler: %s, metric: %s from JMX endpoint %s" % (modeler_type, metric, nn_address))

  if security_enabled:
    import params
    data, error_msg, time_millis = curl_krb_request(params.tmp_dir, params.smoke_user_keytab, params.smokeuser_principal, nn_address,
                            "jn_upgrade", params.kinit_path_local, False, None, params.smoke_user)
  else:
    # close the connection explicitly instead of leaving it to garbage collection
    response = urllib2.urlopen(nn_address)
    try:
      data = response.read()
    finally:
      response.close()

  my_data = None
  if not data:
    return my_data

  data_dict = json.loads(data)
  if not data_dict:
    return my_data

  # a payload without 'beans', or a bean without 'modelerType', is skipped
  # rather than raising KeyError
  for el in data_dict.get('beans') or []:
    if el is None:
      continue

    bean_type = el.get('modelerType')
    if bean_type is None or not bean_type.startswith(modeler_type):
      continue

    if metric in el:
      my_data = el[metric]
      # only a non-empty value ends the search; an empty one is kept as the
      # provisional result while later beans are still examined
      if my_data:
        my_data = json.loads(str(my_data))
        break
  return my_data
Ejemplo n.º 5
0
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    The JMX endpoint of every NameNode listed for the configured nameservice is
    queried and each NameNode is classified as active, standby or unknown. The
    result is OK only when exactly one NameNode is active and exactly one is
    standby; any other combination is CRITICAL.

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """
    if configurations is None:
        return (RESULT_STATE_UNKNOWN,
                ['There were no configurations supplied to the script.'])

    # if not in HA mode, then SKIP
    if NAMESERVICE_KEY not in configurations:
        return (RESULT_STATE_SKIPPED, ['NameNode HA is not enabled'])

    # hdfs-site is required
    if HDFS_SITE_KEY not in configurations:
        return (RESULT_STATE_UNKNOWN, [
            '{0} is a required parameter for the script'.format(HDFS_SITE_KEY)
        ])

    # Default to None so that a missing smoke user no longer leaves the name
    # unbound; previously the resulting error was swallowed by the per-NameNode
    # handler below and every NameNode was reported as unknown.
    smokeuser = configurations.get(SMOKEUSER_KEY)
    executable_paths = configurations.get(EXECUTABLE_SEARCH_PATHS)

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(
            configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = configurations.get(KERBEROS_KEYTAB)

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER,
                                    DEFAULT_KERBEROS_KINIT_TIMER_MS)

    # determine whether or not SSL is enabled
    is_ssl_enabled = configurations.get(DFS_POLICY_KEY) == "HTTPS_ONLY"

    name_service = configurations[NAMESERVICE_KEY]
    hdfs_site = configurations[HDFS_SITE_KEY]

    # look for dfs.ha.namenodes.foo
    nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
    if nn_unique_ids_key not in hdfs_site:
        return (RESULT_STATE_UNKNOWN, [
            'Unable to find unique namenode alias key {0}'.format(
                nn_unique_ids_key)
        ])

    if is_ssl_enabled:
        namenode_http_fragment = NAMENODE_HTTPS_FRAGMENT
        jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"
    else:
        namenode_http_fragment = NAMENODE_HTTP_FRAGMENT
        jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

    use_kerberos = (kerberos_principal is not None
                    and kerberos_keytab is not None and security_enabled)

    active_namenodes = []
    standby_namenodes = []
    unknown_namenodes = []

    # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
    # ie dfs.namenode.http-address.hacluster.nn1
    nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
    for nn_unique_id in nn_unique_ids:
        key = namenode_http_fragment.format(name_service, nn_unique_id)
        rpc_key = NAMENODE_RPC_FRAGMENT.format(name_service, nn_unique_id)

        if key not in hdfs_site:
            continue

        # use str() to ensure that unicode strings do not have the u' in them
        value = str(hdfs_site[key])

        # when the http address contains INADDR_ANY, borrow the host part of
        # the rpc address, provided that one does not contain it as well
        if INADDR_ANY in value and rpc_key in hdfs_site:
            rpc_value = str(hdfs_site[rpc_key])
            if INADDR_ANY not in rpc_value:
                rpc_host = rpc_value.split(":")[0]
                value = value.replace(INADDR_ANY, rpc_host)

        try:
            jmx_uri = jmx_uri_fragment.format(value)
            if use_kerberos:
                env = Environment.get_instance()

                # curl requires an integer timeout
                curl_connection_timeout = int(connection_timeout)

                state_response, error_msg, time_millis = curl_krb_request(
                    env.tmp_dir,
                    kerberos_keytab,
                    kerberos_principal,
                    jmx_uri,
                    "ha_nn_health",
                    executable_paths,
                    False,
                    "NameNode High Availability Health",
                    smokeuser,
                    connection_timeout=curl_connection_timeout,
                    kinit_timer_ms=kinit_timer_ms)
            else:
                state_response = get_jmx(jmx_uri, connection_timeout)

            state = _get_ha_state_from_json(state_response)

            if state == HDFS_NN_STATE_ACTIVE:
                active_namenodes.append(value)
            elif state == HDFS_NN_STATE_STANDBY:
                standby_namenodes.append(value)
            else:
                unknown_namenodes.append(value)
        except Exception:
            # a NameNode that cannot be queried is reported as unknown rather
            # than failing the whole alert; interpreter-exit signals such as
            # KeyboardInterrupt are no longer swallowed here
            logger.exception(LOGGER_EXCEPTION_MESSAGE.format(host_name))
            unknown_namenodes.append(value)

    # there's only one scenario here; there is exactly 1 active and 1 standby
    is_topology_healthy = len(active_namenodes) == 1 and len(
        standby_namenodes) == 1

    result_label = 'Active{0}, Standby{1}, Unknown{2}'.format(
        str(active_namenodes), str(standby_namenodes), str(unknown_namenodes))

    if is_topology_healthy:
        # if there is exactly 1 active and 1 standby NN
        return (RESULT_STATE_OK, [result_label])

    # other scenario
    return (RESULT_STATE_CRITICAL, [result_label])
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    The NameNode JMX endpoint is queried for the time of the last checkpoint
    and for the journal transaction info. The result is CRITICAL or WARNING
    when the number of transactions since the last checkpoint exceeds the
    configured checkpoint transaction count AND the time since the last
    checkpoint, as a percentage of the configured checkpoint period, reaches
    the critical or warning threshold. Any exception yields UNKNOWN with the
    exception text as the label.

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """

    if configurations is None:
        return ("UNKNOWN", ["There were no configurations supplied to the script."])

    scheme = "http"
    http_uri = None
    https_uri = None
    http_policy = "HTTP_ONLY"
    checkpoint_tx = CHECKPOINT_TX_DEFAULT
    checkpoint_period = CHECKPOINT_PERIOD_DEFAULT

    if NN_HTTP_ADDRESS_KEY in configurations:
        http_uri = configurations[NN_HTTP_ADDRESS_KEY]

    if NN_HTTPS_ADDRESS_KEY in configurations:
        https_uri = configurations[NN_HTTPS_ADDRESS_KEY]

    if NN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[NN_HTTP_POLICY_KEY]

    if NN_CHECKPOINT_TX_KEY in configurations:
        checkpoint_tx = configurations[NN_CHECKPOINT_TX_KEY]

    if NN_CHECKPOINT_PERIOD_KEY in configurations:
        checkpoint_period = configurations[NN_CHECKPOINT_PERIOD_KEY]

    # Default to None so that a missing smoke user does not leave the name
    # unbound and turn the kerberos branch into an UNKNOWN result.
    smokeuser = None
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = None
    if EXECUTABLE_SEARCH_PATHS in configurations:
        executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == "TRUE"

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        kerberos_principal = kerberos_principal.replace("_HOST", host_name)

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    # thresholds supplied as parameters are multiplied by 100 here; the
    # defaults are used as they are
    percent_warning = PERCENT_WARNING_DEFAULT
    if PERCENT_WARNING_KEY in parameters:
        percent_warning = float(parameters[PERCENT_WARNING_KEY]) * 100

    percent_critical = PERCENT_CRITICAL_DEFAULT
    if PERCENT_CRITICAL_KEY in parameters:
        percent_critical = float(parameters[PERCENT_CRITICAL_KEY]) * 100

    # determine the right URI and whether to use SSL
    uri = http_uri
    if http_policy == "HTTPS_ONLY":
        scheme = "https"

        if https_uri is not None:
            uri = https_uri

    current_time = int(round(time.time() * 1000))

    last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format(scheme, uri)
    journal_transaction_info_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(scheme, uri)

    # start out assuming an OK status
    label = None
    result_code = "OK"

    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)

            last_checkpoint_time_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir,
                kerberos_keytab,
                kerberos_principal,
                last_checkpoint_time_qry,
                "checkpoint_time_alert",
                executable_paths,
                False,
                "NameNode Last Checkpoint",
                smokeuser,
                connection_timeout=curl_connection_timeout,
            )

            last_checkpoint_time_response_json = json.loads(last_checkpoint_time_response)
            last_checkpoint_time = int(last_checkpoint_time_response_json["beans"][0]["LastCheckpointTime"])

            journal_transaction_info_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir,
                kerberos_keytab,
                kerberos_principal,
                journal_transaction_info_qry,
                "checkpoint_time_alert",
                executable_paths,
                False,
                "NameNode Last Checkpoint",
                smokeuser,
                connection_timeout=curl_connection_timeout,
            )

            journal_transaction_info_response_json = json.loads(journal_transaction_info_response)
            journal_transaction_info = journal_transaction_info_response_json["beans"][0]["JournalTransactionInfo"]
        else:
            last_checkpoint_time = int(
                get_value_from_jmx(last_checkpoint_time_qry, "LastCheckpointTime", connection_timeout)
            )

            journal_transaction_info = get_value_from_jmx(
                journal_transaction_info_qry, "JournalTransactionInfo", connection_timeout
            )

        # the journal transaction info is itself a JSON document in a string
        journal_transaction_info_dict = json.loads(journal_transaction_info)

        last_tx = int(journal_transaction_info_dict["LastAppliedOrWrittenTxId"])
        most_recent_tx = int(journal_transaction_info_dict["MostRecentCheckpointTxId"])
        transaction_difference = last_tx - most_recent_tx

        # both timestamps are in milliseconds; delta is in seconds
        delta = (current_time - last_checkpoint_time) / 1000

        elapsed = get_time(delta)
        label = LABEL.format(h=elapsed["h"], m=elapsed["m"], tx=transaction_difference)

        too_many_transactions = transaction_difference > int(checkpoint_tx)
        percent_of_period = float(delta) / int(checkpoint_period) * 100

        if too_many_transactions and percent_of_period >= int(percent_critical):
            result_code = "CRITICAL"
        elif too_many_transactions and percent_of_period >= int(percent_warning):
            result_code = "WARNING"

    except Exception as e:
        label = str(e)
        result_code = "UNKNOWN"

    # hand the computed state back to the caller, as the docstring promises
    return (result_code, [label])
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  When security is enabled, the WebHCat status URL is requested twice through
  curl_krb_request: first for the http status code only, then (if that was
  200) for the response body, which is parsed as JSON.

  NOTE(review): as visible here the function has no non-secure branch and
  does not return anything after the status has been fetched successfully;
  the rest of the evaluation appears to be missing from this excerpt.

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """

  result_code = RESULT_CODE_UNKNOWN

  if configurations is None:
    return (result_code, ['There were no configurations supplied to the script.'])

  webhcat_port = WEBHCAT_PORT_DEFAULT
  if TEMPLETON_PORT_KEY in configurations:
    webhcat_port = int(configurations[TEMPLETON_PORT_KEY])

  # coerce to str first so that a non-string value (for example a boolean)
  # cannot raise AttributeError
  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).lower() == 'true'

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  curl_connection_timeout = CURL_CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])
    curl_connection_timeout = str(int(connection_timeout))


  # the alert will always run on the webhcat host
  if host_name is None:
    host_name = socket.getfqdn()

  smokeuser = SMOKEUSER_DEFAULT

  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  # webhcat always uses http, never SSL
  query_url = "http://{0}:{1}/templeton/v1/status?user.name={2}".format(host_name, webhcat_port, smokeuser)

  # initialize
  total_time = 0
  json_response = {}

  if security_enabled:
    try:
      # defaults
      smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT
      smokeuser_principal = SMOKEUSER_PRINCIPAL_DEFAULT

      # check script params
      if SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
        smokeuser_principal = parameters[SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY]
      if SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
        smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY]

      # check configurations last as they should always take precedence
      if SMOKEUSER_PRINCIPAL_KEY in configurations:
        smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY]
      if SMOKEUSER_KEYTAB_KEY in configurations:
        smokeuser_keytab = configurations[SMOKEUSER_KEYTAB_KEY]

      # Get the configured Kerberos executable search paths, if any
      kerberos_executable_search_paths = None
      if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
        kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]

      env = Environment.get_instance()

      # first request: ask only for the http status code
      stdout, stderr, time_millis = curl_krb_request(env.tmp_dir, smokeuser_keytab, smokeuser_principal,
                                                      query_url, "webhcat_alert_cc_", kerberos_executable_search_paths, True,
                                                      "WebHCat Server Status", smokeuser,
                                                      connection_timeout=curl_connection_timeout)

      # check the response code
      response_code = int(stdout)

      # 0 indicates no connection
      if response_code == 0:
        label = CRITICAL_CONNECTION_MESSAGE.format(query_url)
        return (RESULT_CODE_CRITICAL, [label])

      # any other response aside from 200 is a problem
      if response_code != 200:
        label = CRITICAL_HTTP_MESSAGE.format(response_code, query_url)
        return (RESULT_CODE_CRITICAL, [label])

      # now that we have the http status and it was 200, get the content
      stdout, stderr, total_time = curl_krb_request(env.tmp_dir, smokeuser_keytab, smokeuser_principal,
                                                      query_url, "webhcat_alert_cc_", kerberos_executable_search_paths,
                                                      False, "WebHCat Server Status", smokeuser,
                                                      connection_timeout=curl_connection_timeout)
      json_response = json.loads(stdout)
    except Exception as exception:
      return (RESULT_CODE_CRITICAL, [str(exception)])
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  The JMX endpoint of every NameNode listed for the configured nameservice is
  queried and each NameNode is classified as active, standby or unknown. The
  topology is healthy when exactly one NameNode is active and exactly one is
  standby. Only one host reports the outcome: the active NameNode when the
  topology is healthy, the first listed NameNode otherwise; every other host
  returns SKIPPED.

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """
  if configurations is None:
    return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])

  # if not in HA mode, then SKIP
  if NAMESERVICE_KEY not in configurations:
    return (RESULT_STATE_SKIPPED, ['NameNode HA is not enabled'])

  # hdfs-site is required
  if HDFS_SITE_KEY not in configurations:
    return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

  # Default to None so that a missing smoke user no longer leaves the name
  # unbound; previously the resulting error was silently swallowed below and
  # every NameNode was reported as unknown.
  smokeuser = None
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  executable_paths = None
  if EXECUTABLE_SEARCH_PATHS in configurations:
    executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  kerberos_keytab = None
  if KERBEROS_KEYTAB in configurations:
    kerberos_keytab = configurations[KERBEROS_KEYTAB]

  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
    kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  # determine whether or not SSL is enabled
  is_ssl_enabled = False
  if DFS_POLICY_KEY in configurations:
    is_ssl_enabled = configurations[DFS_POLICY_KEY] == "HTTPS_ONLY"

  name_service = configurations[NAMESERVICE_KEY]
  hdfs_site = configurations[HDFS_SITE_KEY]

  # look for dfs.ha.namenodes.foo
  nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
  if nn_unique_ids_key not in hdfs_site:
    return (RESULT_STATE_UNKNOWN, ['Unable to find unique namenode alias key {0}'.format(nn_unique_ids_key)])

  if is_ssl_enabled:
    namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
    jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"
  else:
    namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
    jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

  use_kerberos = (kerberos_principal is not None and kerberos_keytab is not None
    and security_enabled)

  active_namenodes = []
  standby_namenodes = []
  unknown_namenodes = []

  # now we have something like 'nn1,nn2,nn3,nn4'
  # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
  # ie dfs.namenode.http-address.hacluster.nn1
  nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
  for nn_unique_id in nn_unique_ids:
    key = namenode_http_fragment.format(name_service, nn_unique_id)

    if key not in hdfs_site:
      continue

    # use str() to ensure that unicode strings do not have the u' in them
    value = str(hdfs_site[key])

    try:
      jmx_uri = jmx_uri_fragment.format(value)
      if use_kerberos:
        env = Environment.get_instance()

        # curl requires an integer timeout
        curl_connection_timeout = int(connection_timeout)

        state_response, error_msg, time_millis = curl_krb_request(env.tmp_dir,
          kerberos_keytab, kerberos_principal, jmx_uri, "ha_nn_health", executable_paths, False,
          "NameNode High Availability Health", smokeuser, connection_timeout=curl_connection_timeout)
      else:
        state_response = get_jmx(jmx_uri, connection_timeout)

      state = _get_ha_state_from_json(state_response)

      if state == HDFS_NN_STATE_ACTIVE:
        active_namenodes.append(value)
      elif state == HDFS_NN_STATE_STANDBY:
        standby_namenodes.append(value)
      else:
        unknown_namenodes.append(value)
    except Exception:
      # a NameNode that cannot be queried is reported as unknown rather than
      # failing the whole alert; interpreter-exit signals such as
      # KeyboardInterrupt are no longer swallowed here
      unknown_namenodes.append(value)

  # now that the request is done, determine if this host is the host that
  # should report the status of the HA topology
  is_active_namenode = any(
    active_namenode.startswith(host_name) for active_namenode in active_namenodes)

  # there's only one scenario here; there is exactly 1 active and 1 standby
  is_topology_healthy = len(active_namenodes) == 1 and len(standby_namenodes) == 1

  result_label = 'Active{0}, Standby{1}, Unknown{2}'.format(str(active_namenodes),
    str(standby_namenodes), str(unknown_namenodes))

  # Healthy Topology:
  #   - Active NN reports the alert, standby does not
  #
  # Unhealthy Topology:
  #   - Report the alert if this is the first named host
  #   - Skip the alert on every other host
  if is_topology_healthy:
    if is_active_namenode:
      return (RESULT_STATE_OK, [result_label])

    return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])

  # dfs.namenode.rpc-address.service.alias is guaranteed in HA mode
  first_listed_host_key = 'dfs.namenode.rpc-address.{0}.{1}'.format(
    name_service, nn_unique_ids[0])

  first_listed_host = ''
  if first_listed_host_key in hdfs_site:
    first_listed_host = hdfs_site[first_listed_host_key]

  if first_listed_host.startswith(host_name):
    return (RESULT_STATE_CRITICAL, [result_label])

  # not the first listed host, but the first host might be in the unknown
  return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  The NameNodeInfo JMX bean is queried for its "UpgradeFinalized" value. A
  truthy value yields OK, a falsy one CRITICAL; any exception yields UNKNOWN
  with the formatted traceback as the label.

  Keyword arguments:
  configurations : a mapping of configuration key to value
  parameters : a mapping of script parameter key to value
  host_name : the name of this host where the alert is running

  :type configurations dict
  :type parameters dict
  :type host_name str
  """

  if configurations is None:
    return (('UNKNOWN', ['There were no configurations supplied to the script.']))

  scheme = 'http'
  http_uri = None
  https_uri = None
  http_policy = 'HTTP_ONLY'

  if NN_HTTP_ADDRESS_KEY in configurations:
    http_uri = configurations[NN_HTTP_ADDRESS_KEY]

  if NN_HTTPS_ADDRESS_KEY in configurations:
    https_uri = configurations[NN_HTTPS_ADDRESS_KEY]

  if NN_HTTP_POLICY_KEY in configurations:
    http_policy = configurations[NN_HTTP_POLICY_KEY]

  # Default to None so that a missing smoke user does not leave the name
  # unbound and turn the kerberos branch into an UNKNOWN result.
  smokeuser = None
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  executable_paths = None
  if EXECUTABLE_SEARCH_PATHS in configurations:
    executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  kerberos_keytab = None
  if KERBEROS_KEYTAB in configurations:
    kerberos_keytab = configurations[KERBEROS_KEYTAB]

  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
    kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  # determine the right URI and whether to use SSL
  uri = http_uri
  if http_policy == 'HTTPS_ONLY':
    scheme = 'https'

    if https_uri is not None:
      uri = https_uri

  upgrade_finalized_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(scheme, uri)

  # start out assuming an OK status
  label = None
  result_code = "OK"

  try:
    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
      env = Environment.get_instance()

      upgrade_finalized_response, error_msg, time_millis = curl_krb_request(
        env.tmp_dir, kerberos_keytab,
        kerberos_principal, upgrade_finalized_qry, "upgrade_finalized_state", executable_paths, False,
        "HDFS Upgrade Finalized State", smokeuser
       )

      upgrade_finalized_response_json = json.loads(upgrade_finalized_response)
      upgrade_finalized = bool(upgrade_finalized_response_json["beans"][0]["UpgradeFinalized"])

    else:
      upgrade_finalized = bool(get_value_from_jmx(upgrade_finalized_qry,
                                                    "UpgradeFinalized"))

    if upgrade_finalized:
      label = "HDFS cluster is not in the upgrade state"
      result_code = 'OK'
    else:
      label = "HDFS cluster is not finalized"
      result_code = 'CRITICAL'

  except Exception:
    # report the failure to the caller instead of raising; interpreter-exit
    # signals such as KeyboardInterrupt are no longer swallowed here
    label = traceback.format_exc()
    result_code = 'UNKNOWN'

  return ((result_code, [label]))
Ejemplo n.º 10
0
def execute(configurations={}, parameters={}, host_name=None):
  """
  Queries the NodeManager "ws/v1/node/info" REST resource of this host.

  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """
  result_code = RESULT_CODE_UNKNOWN

  if configurations is None:
    return (result_code, ['There were no configurations supplied to the script.'])

  scheme = 'http'
  http_uri = None
  https_uri = None
  http_policy = 'HTTP_ONLY'

  # default to None so the kerberized request below never references an
  # unbound name when the smoke user is missing from the configurations
  smokeuser = None
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  executable_paths = None
  if EXECUTABLE_SEARCH_PATHS in configurations:
    executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  kerberos_keytab = None
  if KERBEROS_KEYTAB in configurations:
    kerberos_keytab = configurations[KERBEROS_KEYTAB]

  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    # str.replace() rejects None, so fall back to the local FQDN first (the
    # same fallback this function already applies when building the URI)
    if host_name is None:
      host_name = socket.getfqdn()

    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
    kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  if NODEMANAGER_HTTP_ADDRESS_KEY in configurations:
    http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY]

  if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations:
    https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY]

  if YARN_HTTP_POLICY_KEY in configurations:
    http_policy = configurations[YARN_HTTP_POLICY_KEY]

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  # determine the right URI and whether to use SSL
  uri = http_uri
  if http_policy == 'HTTPS_ONLY':
    scheme = 'https'

    if https_uri is not None:
      uri = https_uri

  label = ''
  url_response = None
  node_healthy = 'false'
  total_time = 0

  # some yarn-site structures don't have the web ui address
  if uri is None:
    if host_name is None:
      host_name = socket.getfqdn()

    uri = '{0}:{1}'.format(host_name, NODEMANAGER_DEFAULT_PORT)

  if OSCheck.is_windows_family():
    uri_host, uri_port = uri.split(':')
    # on windows 0.0.0.0 is invalid address to connect but on linux it resolved to 127.0.0.1
    uri_host = resolve_address(uri_host)
    uri = '{0}:{1}'.format(uri_host, uri_port)

  query = "{0}://{1}/ws/v1/node/info".format(scheme, uri)

  try:
    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
      env = Environment.get_instance()

      # curl requires an integer timeout
      curl_connection_timeout = int(connection_timeout)

      url_response, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
        query, "nm_health_alert", executable_paths, False, "NodeManager Health", smokeuser,
        connection_timeout=curl_connection_timeout)

      json_response = json.loads(url_response)
    else:
      # execute the query for the JSON that includes the node information
      url_response = urllib2.urlopen(query, timeout=connection_timeout)
      json_response = json.loads(url_response.read())
  except urllib2.HTTPError as httpError:
    label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(httpError.code), query,
      str(httpError), traceback.format_exc())

    return (RESULT_CODE_CRITICAL, [label])

  # NOTE(review): nothing is returned on the success path and json_response,
  # node_healthy and total_time are never evaluated in the code visible here --
  # confirm the health evaluation that is expected to follow.
Ejemplo n.º 11
0
def execute(configurations={}, parameters={}, host_name=None):
    """
    Checks the WebHCat (Templeton) status resource and reports its health.

    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """

    result_code = RESULT_CODE_UNKNOWN

    if configurations is None:
        return (result_code,
                ['There were no configurations supplied to the script.'])

    webhcat_port = WEBHCAT_PORT_DEFAULT
    if TEMPLETON_PORT_KEY in configurations:
        webhcat_port = int(configurations[TEMPLETON_PORT_KEY])

    # str() keeps this working when the flag arrives as a real boolean
    # instead of the string 'true'/'false'
    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(
            configurations[SECURITY_ENABLED_KEY]).lower() == 'true'

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    curl_connection_timeout = CURL_CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])
        curl_connection_timeout = str(int(connection_timeout))

    # the alert will always run on the webhcat host
    if host_name is None:
        host_name = socket.getfqdn()

    smokeuser = SMOKEUSER_DEFAULT
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    # webhcat always uses http, never SSL
    query_url = "http://{0}:{1}/templeton/v1/status?user.name={2}".format(
        host_name, webhcat_port, smokeuser)

    # initialize
    total_time = 0
    json_response = {}

    if security_enabled:
        try:
            # defaults
            smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT
            smokeuser_principal = SMOKEUSER_PRINCIPAL_DEFAULT

            # check script params
            if SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
                smokeuser_principal = parameters[
                    SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY]
            if SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
                smokeuser_keytab = parameters[
                    SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY]

            # check configurations last as they should always take precedence
            if SMOKEUSER_PRINCIPAL_KEY in configurations:
                smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY]
            if SMOKEUSER_KEYTAB_KEY in configurations:
                smokeuser_keytab = configurations[SMOKEUSER_KEYTAB_KEY]

            # Get the configured Kerberos executable search paths, if any
            kerberos_executable_search_paths = None
            if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
                kerberos_executable_search_paths = configurations[
                    KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]

            kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER,
                                            DEFAULT_KERBEROS_KINIT_TIMER_MS)

            env = Environment.get_instance()

            # first request: the True flag asks for the HTTP status code only
            stdout, stderr, time_millis = curl_krb_request(
                env.tmp_dir,
                smokeuser_keytab,
                smokeuser_principal,
                query_url,
                "webhcat_alert_cc_",
                kerberos_executable_search_paths,
                True,
                "WebHCat Server Status",
                smokeuser,
                connection_timeout=curl_connection_timeout,
                kinit_timer_ms=kinit_timer_ms)

            # check the response code
            response_code = int(stdout)

            # No exception is active in the two checks below, so
            # traceback.format_exc() would only render a placeholder; report
            # the error output returned by curl_krb_request instead.

            # 0 indicates no connection
            if response_code == 0:
                label = CRITICAL_CONNECTION_MESSAGE.format(query_url, stderr)
                return (RESULT_CODE_CRITICAL, [label])

            # any other response aside from 200 is a problem
            if response_code != 200:
                label = CRITICAL_HTTP_MESSAGE.format(response_code, query_url,
                                                     stderr)
                return (RESULT_CODE_CRITICAL, [label])

            # now that we have the http status and it was 200, get the content
            stdout, stderr, total_time = curl_krb_request(
                env.tmp_dir,
                smokeuser_keytab,
                smokeuser_principal,
                query_url,
                "webhcat_alert_cc_",
                kerberos_executable_search_paths,
                False,
                "WebHCat Server Status",
                smokeuser,
                connection_timeout=curl_connection_timeout,
                kinit_timer_ms=kinit_timer_ms)

            json_response = json.loads(stdout)
        except Exception:
            return (RESULT_CODE_CRITICAL, [traceback.format_exc()])
    else:
        url_response = None

        try:
            # execute the query for the JSON that includes WebHCat status
            start_time = time.time()
            url_response = urllib2.urlopen(query_url,
                                           timeout=connection_timeout)
            total_time = time.time() - start_time

            json_response = json.loads(url_response.read())
        except urllib2.HTTPError as httpError:
            label = CRITICAL_HTTP_MESSAGE.format(httpError.code, query_url,
                                                 traceback.format_exc())
            return (RESULT_CODE_CRITICAL, [label])
        except Exception:
            label = CRITICAL_CONNECTION_MESSAGE.format(query_url,
                                                       traceback.format_exc())
            return (RESULT_CODE_CRITICAL, [label])
        finally:
            # closing is best effort; a failure here must not mask the result
            if url_response is not None:
                try:
                    url_response.close()
                except Exception:
                    pass

    # if status is not in the response, we can't do any check; return CRIT
    if 'status' not in json_response:
        return (RESULT_CODE_CRITICAL,
                [CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE + str(json_response)])

    # URL response received, parse it; the lookup can still fail when the
    # payload is not a mapping (e.g. a JSON list containing 'status')
    try:
        webhcat_status = json_response['status']
    except Exception:
        return (RESULT_CODE_CRITICAL, [
            CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE + "\n" +
            traceback.format_exc()
        ])

    # proper JSON received, compare against known value; str() guards against
    # a non-string status value
    if str(webhcat_status).lower() == WEBHCAT_OK_RESPONSE:
        result_code = RESULT_CODE_OK
        label = OK_MESSAGE.format(total_time, query_url)
    else:
        result_code = RESULT_CODE_CRITICAL
        label = CRITICAL_WEBHCAT_STATUS_MESSAGE.format(webhcat_status)

    return (result_code, [label])
Ejemplo n.º 12
0
    def _make_web_request(self, url):
        """
        Makes an http(s) request to a web resource and returns the http code.
        If there was an error making the request, return 0 for the status code.

        :param url: the URL to request
        :return: a WebResponse built from the status code, time_millis and
          error message of the request; on any exception a WebResponse with
          status_code=0, time_millis=0 and the exception text as error_msg
        """
        error_msg = None
        try:
            response_code = 0
            kerberos_keytab = None
            kerberos_principal = None

            if self.uri_property_keys.kerberos_principal is not None:
                kerberos_principal = self._get_configuration_value(
                    self.uri_property_keys.kerberos_principal)

                if kerberos_principal is not None:
                    # substitute _HOST in kerberos principal with actual fqdn
                    kerberos_principal = kerberos_principal.replace(
                        '_HOST', self.host_name)

            if self.uri_property_keys.kerberos_keytab is not None:
                kerberos_keytab = self._get_configuration_value(
                    self.uri_property_keys.kerberos_keytab)

            # str() tolerates both a missing value (None) and a real boolean,
            # neither of which has a usable .lower()
            security_enabled = str(
                self._get_configuration_value(
                    '{{cluster-env/security_enabled}}')).lower() == "true"

            use_kerberos = (kerberos_principal is not None
                            and kerberos_keytab is not None
                            and security_enabled)

            if use_kerberos:
                tmp_dir = Constants.AGENT_TMP_DIR
                if tmp_dir is None:
                    tmp_dir = gettempdir()

                # Get the configured Kerberos executables search paths, if any
                kerberos_executable_search_paths = self._get_configuration_value(
                    '{{kerberos-env/executable_search_paths}}')
                smokeuser = self._get_configuration_value(
                    '{{cluster-env/smokeuser}}')

                # the True flag requests the HTTP status code only
                response_code, error_msg, time_millis = curl_krb_request(
                    tmp_dir,
                    kerberos_keytab,
                    kerberos_principal,
                    url,
                    "web_alert",
                    kerberos_executable_search_paths,
                    True,
                    self.get_name(),
                    smokeuser,
                    connection_timeout=self.curl_connection_timeout,
                    kinit_timer_ms=self.kinit_timeout)
            else:
                # kerberos is not involved; use urllib2
                response_code, time_millis, error_msg = self._make_web_request_urllib(
                    url)

            return WebResponse(status_code=response_code,
                               time_millis=time_millis,
                               error_msg=error_msg)

        except Exception as exception:
            # the stack trace is only logged when debug logging is enabled
            if logger.isEnabledFor(logging.DEBUG):
                logger.exception(
                    "[Alert][{0}] Unable to make a web request.".format(
                        self.get_name()))

            return WebResponse(status_code=0,
                               time_millis=0,
                               error_msg=str(exception))
Ejemplo n.º 13
0
def call_curl_krb_request(tmp_dir,
                          user_keytab,
                          user_princ,
                          uri,
                          kinit_path,
                          user,
                          connection_timeout,
                          method='GET',
                          metric_json='',
                          header='',
                          tries=1,
                          current_time=0,
                          random_value=0):
    """
    Sends a request through curl_krb_request, retrying up to ``tries`` times.

    For a GET the response is parsed as JSON and checked for the expected
    values: ``random_value`` stored under the key ``current_time`` and
    ``current_time`` stored under the key ``current_time + 1000``. Any other
    method stops after the first request that does not raise.

    :param tmp_dir: forwarded to curl_krb_request
    :param user_keytab: forwarded to curl_krb_request
    :param user_princ: forwarded to curl_krb_request
    :param uri: the URL to request
    :param kinit_path: forwarded to curl_krb_request
    :param user: forwarded to curl_krb_request
    :param connection_timeout: forwarded to curl_krb_request and also used as
      the number of seconds to sleep between attempts
    :param method: request method; 'GET' triggers the response validation
    :param metric_json: request body (logged for a POST)
    :param header: forwarded to curl_krb_request
    :param tries: maximum number of attempts
    :param current_time: timestamp key expected in a GET response
    :param random_value: value expected under ``current_time``
    :raises Fail: when the last attempt raises, or when the expected values
      are still missing from the GET response after the last attempt
    """

    def floats_eq(f1, f2, delta):
        return abs(f1 - f2) < delta

    if method == 'POST':
        Logger.info("Generated metrics for %s:\n%s" % (uri, metric_json))

    for i in range(0, tries):
        # bind these before anything can raise so the finally clause never
        # sees an unbound name or a value left over from a previous attempt
        response = None
        errmsg = None

        try:
            Logger.info("Connecting (%s) to %s" % (method, uri))

            response, errmsg, _time_millis = curl_krb_request(
                tmp_dir,
                user_keytab,
                user_princ,
                uri,
                'ams_service_check',
                kinit_path,
                False,
                "AMS Service Check",
                user,
                connection_timeout=connection_timeout,
                kinit_timer_ms=0,
                method=method,
                body=metric_json,
                header=header)
        except Exception as exception:
            if i < tries - 1:  # range returns items from start to end-1
                # announce the retry before waiting, not after
                Logger.info(
                    "Connection failed for %s. Next retry in %s seconds." %
                    (uri, connection_timeout))
                time.sleep(connection_timeout)
                continue
            else:
                raise Fail(
                    "Unable to {0} metrics on: {1}. Exception: {2}".format(
                        method, uri, str(exception)))
        finally:
            if not response:
                Logger.error(
                    "Unable to {0} metrics on: {1}.  Error: {2}".format(
                        method, uri, errmsg))
            else:
                Logger.info("%s response from %s: %s, errmsg: %s" %
                            (method, uri, response, errmsg))
                try:
                    response.close()
                except Exception:
                    Logger.debug(
                        "Unable to close {0} connection to {1}".format(
                            method, uri))

        if method != 'GET':
            break

        data_json = json.loads(response)

        value_key = str(current_time)
        time_key = str(current_time + 1000)

        values_are_present = False
        for metrics_data in data_json["metrics"]:
            datapoints = metrics_data["metrics"]
            if (value_key in datapoints
                    and time_key in datapoints
                    and floats_eq(datapoints[value_key], random_value,
                                  0.0000001)
                    and floats_eq(datapoints[time_key], current_time, 1)):
                # arguments follow the placeholder order: values, then uri
                Logger.info(
                    "Values %s and %s were found in the response from %s."
                    % (random_value, current_time, uri))
                values_are_present = True
                break

        if values_are_present:
            break

        if i < tries - 1:  # range returns items from start to end-1
            # report the actual wait, which is connection_timeout seconds
            Logger.info(
                "Values weren't stored yet. Retrying in %s seconds." %
                (connection_timeout))
            time.sleep(connection_timeout)
        else:
            raise Fail(
                "Values %s and %s were not found in the response." %
                (random_value, current_time))
Ejemplo n.º 14
0
def execute(configurations={}, parameters={}, host_name=None):
    """
    Evaluates how long ago, and how many transactions ago, the NameNode last
    wrote a checkpoint.

    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """

    if configurations is None:
        return (('UNKNOWN',
                 ['There were no configurations supplied to the script.']))

    scheme = 'http'
    http_uri = None
    https_uri = None
    http_policy = 'HTTP_ONLY'
    checkpoint_tx = CHECKPOINT_TX_DEFAULT
    checkpoint_period = CHECKPOINT_PERIOD_DEFAULT

    if NN_HTTP_ADDRESS_KEY in configurations:
        http_uri = configurations[NN_HTTP_ADDRESS_KEY]

    if NN_HTTPS_ADDRESS_KEY in configurations:
        https_uri = configurations[NN_HTTPS_ADDRESS_KEY]

    if NN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[NN_HTTP_POLICY_KEY]

    if NN_CHECKPOINT_TX_KEY in configurations:
        checkpoint_tx = configurations[NN_CHECKPOINT_TX_KEY]

    if NN_CHECKPOINT_PERIOD_KEY in configurations:
        checkpoint_period = configurations[NN_CHECKPOINT_PERIOD_KEY]

    # default to None so the kerberized requests below never reference an
    # unbound name when the smoke user is missing from the configurations
    smokeuser = None
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(
            configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    # thresholds supplied as parameters are fractions; scale to percentages
    percent_warning = PERCENT_WARNING_DEFAULT
    if PERCENT_WARNING_KEY in parameters:
        percent_warning = float(parameters[PERCENT_WARNING_KEY]) * 100

    percent_critical = PERCENT_CRITICAL_DEFAULT
    if PERCENT_CRITICAL_KEY in parameters:
        percent_critical = float(parameters[PERCENT_CRITICAL_KEY]) * 100

    # determine the right URI and whether to use SSL
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'

        if https_uri is not None:
            uri = https_uri

    # wall-clock time in milliseconds
    current_time = int(round(time.time() * 1000))

    last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format(
        scheme, uri)
    journal_transaction_info_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(
        scheme, uri)

    # start out assuming an OK status
    label = None
    result_code = "OK"

    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)

            last_checkpoint_time_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir,
                kerberos_keytab,
                kerberos_principal,
                last_checkpoint_time_qry,
                "checkpoint_time_alert",
                None,
                False,
                "NameNode Last Checkpoint",
                smokeuser,
                connection_timeout=curl_connection_timeout)

            last_checkpoint_time_response_json = json.loads(
                last_checkpoint_time_response)
            last_checkpoint_time = int(
                last_checkpoint_time_response_json["beans"][0]
                ["LastCheckpointTime"])

            journal_transaction_info_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir,
                kerberos_keytab,
                kerberos_principal,
                journal_transaction_info_qry,
                "checkpoint_time_alert",
                None,
                False,
                "NameNode Last Checkpoint",
                smokeuser,
                connection_timeout=curl_connection_timeout)

            journal_transaction_info_response_json = json.loads(
                journal_transaction_info_response)
            journal_transaction_info = journal_transaction_info_response_json[
                "beans"][0]["JournalTransactionInfo"]
        else:
            last_checkpoint_time = int(
                get_value_from_jmx(last_checkpoint_time_qry,
                                   "LastCheckpointTime", connection_timeout))

            journal_transaction_info = get_value_from_jmx(
                journal_transaction_info_qry, "JournalTransactionInfo",
                connection_timeout)

        # the JournalTransactionInfo attribute is itself a JSON document
        journal_transaction_info_dict = json.loads(journal_transaction_info)

        last_tx = int(
            journal_transaction_info_dict['LastAppliedOrWrittenTxId'])
        most_recent_tx = int(
            journal_transaction_info_dict['MostRecentCheckpointTxId'])
        transaction_difference = last_tx - most_recent_tx

        # milliseconds -> seconds since the last checkpoint
        delta = (current_time - last_checkpoint_time) / 1000

        elapsed = get_time(delta)
        label = LABEL.format(h=elapsed['h'],
                             m=elapsed['m'],
                             tx=transaction_difference)

        # an alert needs BOTH too many uncheckpointed transactions AND enough
        # of the checkpoint period elapsed
        too_many_tx = transaction_difference > int(checkpoint_tx)
        percent_elapsed = float(delta) / int(checkpoint_period) * 100

        if too_many_tx and percent_elapsed >= int(percent_critical):
            result_code = 'CRITICAL'
        elif too_many_tx and percent_elapsed >= int(percent_warning):
            result_code = 'WARNING'

    except Exception as e:
        label = str(e)
        result_code = 'UNKNOWN'

    # hand the outcome back to the caller, as the docstring promises
    return (result_code, [label])
Ejemplo n.º 15
0
def execute(configurations={}, parameters={}, host_name=None):
    """
    Queries a JMX endpoint for the "LiveNodeManagers" attribute.

    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """

    if configurations is None:
        return (('UNKNOWN',
                 ['There were no configurations supplied to the script.']))

    scheme = 'http'
    http_uri = None
    https_uri = None
    http_policy = 'HTTP_ONLY'

    # the flag is compared as text, so both 'true' and a boolean True enable it
    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(
            configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    executable_paths = None
    if EXECUTABLE_SEARCH_PATHS in configurations:
        executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        # substitute the _HOST placeholder with the name of this host
        # NOTE(review): raises TypeError if host_name is None -- confirm the
        # caller always supplies it
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    if NODEMANAGER_HTTP_ADDRESS_KEY in configurations:
        http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY]

    if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations:
        https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY]

    if YARN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[YARN_HTTP_POLICY_KEY]

    # NOTE(review): smokeuser stays unbound when the key is absent, yet the
    # kerberized requests below pass it unconditionally
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER,
                                    DEFAULT_KERBEROS_KINIT_TIMER_MS)

    # determine the right URI and whether to use SSL
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'

        if https_uri is not None:
            uri = https_uri

    # keep only the port of the configured address and always target host_name
    # NOTE(review): fails if uri is None or contains no ':' -- confirm the
    # address keys are always configured as host:port
    uri = str(host_name) + ":" + uri.split(":")[1]
    live_nodemanagers_qry = "{0}://{1}/jmx?qry={2}".format(scheme, uri, QRY)
    convert_to_json_failed = False
    response_code = None
    try:
        # kerberos is used only when principal, keytab and the security flag
        # are all present
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)

            url_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir,
                kerberos_keytab,
                kerberos_principal,
                live_nodemanagers_qry,
                "nm_health_summary_alert",
                executable_paths,
                False,
                "NodeManager Health Summary",
                smokeuser,
                connection_timeout=curl_connection_timeout,
                kinit_timer_ms=kinit_timer_ms)

            # the response is JSON, and the LiveNodeManagers value extracted
            # from it is decoded as JSON a second time
            try:
                url_response_json = json.loads(url_response)
                live_nodemanagers = json.loads(
                    find_value_in_jmx(url_response_json, "LiveNodeManagers",
                                      live_nodemanagers_qry))
            except ValueError, error:
                convert_to_json_failed = True
                logger.exception(
                    "[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}"
                    .format("NodeManager Health Summary", str(error)))

            # the payload was unusable: repeat the request with the 7th
            # argument set to True and keep the first returned value as
            # response_code (presumably the HTTP status code only -- confirm
            # against curl_krb_request)
            if convert_to_json_failed:
                response_code, error_msg, time_millis = curl_krb_request(
                    env.tmp_dir,
                    kerberos_keytab,
                    kerberos_principal,
                    live_nodemanagers_qry,
                    "nm_health_summary_alert",
                    executable_paths,
                    True,
                    "NodeManager Health Summary",
                    smokeuser,
                    connection_timeout=curl_connection_timeout,
                    kinit_timer_ms=kinit_timer_ms)
        else:
Ejemplo n.º 16
0
                # every requested attribute must be present in json_data; a
                # missing one aborts with an exception naming it and the URL
                for attr in jmx_property_value:
                    if attr not in json_data:
                        raise Exception(
                            "Unable to find {0} in JSON from {1} ".format(
                                attr, url))

                    value_list.append(json_data[attr])

            http_response_code = None
            # When the JSON was not valid and kerberos is configured, the
            # request is repeated with the 7th argument set to True and the
            # first returned value is kept as the HTTP response code.
            # NOTE(review): presumably True makes curl_krb_request return only
            # the status code -- confirm against its signature.
            if not json_is_valid and security_enabled and kerberos_principal is not None and kerberos_keytab is not None:
                http_response_code, error_msg, time_millis = curl_krb_request(
                    tmp_dir,
                    kerberos_keytab,
                    kerberos_principal,
                    url,
                    "metric_alert",
                    kerberos_executable_search_paths,
                    True,
                    self.get_name(),
                    smokeuser,
                    connection_timeout=self.curl_connection_timeout)

        # the collected attribute values plus the status code (None unless the
        # fallback request above was made)
        return (value_list, http_response_code)

    def _get_reporting_text(self, state):
        '''
    Always returns {0} since the result of the script alert is a rendered string.
    This will ensure that the base class takes the result string and just uses
    it directly.

    :param state: the state of the alert in uppercase (such as OK, WARNING, etc)
Ejemplo n.º 17
0
    def _load_jmx(self, ssl, host, port, jmx_metric):
        """
        Requests every JMX query listed in jmx_metric.property_map from
        "<scheme>://<host>:<port>/jmx" and reads the raw response content.

        :param ssl: when truthy the URL uses https, otherwise http
        :param host: the host to query; replaced by self.host_name when it
          contains "0.0.0.0"
        :param port: the port to query
        :param jmx_metric: object whose property_map maps a JMX query string
          to a value used later in the method (not visible in this excerpt)
        """
        value_list = []
        kerberos_keytab = None
        kerberos_principal = None

        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(str(jmx_metric.property_map))

        # compared as text, so both 'true' and a boolean True enable security
        security_enabled = str(
            self._get_configuration_value(
                SECURITY_ENABLED_KEY)).upper() == 'TRUE'

        if self.uri_property_keys.kerberos_principal is not None:
            kerberos_principal = self._get_configuration_value(
                self.uri_property_keys.kerberos_principal)

            if kerberos_principal is not None:
                # substitute _HOST in kerberos principal with actual fqdn
                kerberos_principal = kerberos_principal.replace(
                    '_HOST', self.host_name)

        if self.uri_property_keys.kerberos_keytab is not None:
            kerberos_keytab = self._get_configuration_value(
                self.uri_property_keys.kerberos_keytab)

        # a wildcard bind address is not a usable target; query this host
        if "0.0.0.0" in str(host):
            host = self.host_name

        # one HTTP request per JMX query in the property map
        for jmx_property_key, jmx_property_value in jmx_metric.property_map.iteritems(
        ):
            url = "{0}://{1}:{2}/jmx?qry={3}".format(
                "https" if ssl else "http", host, str(port), jmx_property_key)

            # use a customer header processor that will look for the non-standard
            # "Refresh" header and attempt to follow the redirect
            response = None
            content = ''
            try:
                # kerberos is used only when principal, keytab and the
                # security flag are all present
                if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
                    tmp_dir = self.config.get('agent', 'tmp_dir')
                    if tmp_dir is None:
                        tmp_dir = gettempdir()

                    kerberos_executable_search_paths = self._get_configuration_value(
                        '{{kerberos-env/executable_search_paths}}')
                    smokeuser = self._get_configuration_value(
                        '{{cluster-env/smokeuser}}')

                    response, error_msg, time_millis = curl_krb_request(
                        tmp_dir,
                        kerberos_keytab,
                        kerberos_principal,
                        url,
                        "metric_alert",
                        kerberos_executable_search_paths,
                        False,
                        self.get_name(),
                        smokeuser,
                        connection_timeout=self.curl_connection_timeout)

                    # the first value returned by curl_krb_request is used
                    # directly as the content
                    content = response
                else:
                    url_opener = urllib2.build_opener(RefreshHeaderProcessor())
                    response = url_opener.open(url,
                                               timeout=self.connection_timeout)
                    content = response.read()
            except Exception, exception:
                # failures are not re-raised here; they are only logged, and
                # only when debug logging is enabled
                if logger.isEnabledFor(logging.DEBUG):
                    logger.exception(
                        "[Alert][{0}] Unable to make a web request: {1}".
                        format(self.get_name(), str(exception)))
            finally:
Ejemplo n.º 18
0
def execute(configurations={}, parameters={}, host_name=None):
    """
    Queries the WebHCat (Templeton) status URL of this host through a
    Kerberos-authenticated curl request.

    Returns a tuple containing the result code and a pre-formatted result
    label when no configurations were supplied or when the secured request
    fails (no connection, non-200 status, or any exception).

    NOTE(review): only the Kerberos request is present in this function. When
    security is disabled, or when the secured request succeeds, control
    reaches the end of the function and None is returned.

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """

    result_code = RESULT_CODE_UNKNOWN

    if configurations is None:
        return (result_code,
                ['There were no configurations supplied to the script.'])

    # accept an explicit None the same way as the (never mutated) empty default
    if parameters is None:
        parameters = {}

    webhcat_port = WEBHCAT_PORT_DEFAULT
    if TEMPLETON_PORT_KEY in configurations:
        webhcat_port = int(configurations[TEMPLETON_PORT_KEY])

    # str() lets a real boolean through as well as the strings "true"/"True"
    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(
            configurations[SECURITY_ENABLED_KEY]).lower() == 'true'

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    curl_connection_timeout = CURL_CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])
        # curl is handed the integer part of the timeout, rendered as text
        curl_connection_timeout = str(int(connection_timeout))

    # the alert will always run on the webhcat host
    if host_name is None:
        host_name = socket.getfqdn()

    smokeuser = SMOKEUSER_DEFAULT

    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    # webhcat always uses http, never SSL
    query_url = "http://{0}:{1}/templeton/v1/status?user.name={2}".format(
        host_name, webhcat_port, smokeuser)

    # initialize
    total_time = 0
    json_response = {}

    if security_enabled:
        try:
            # defaults
            smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT
            smokeuser_principal = SMOKEUSER_PRINCIPAL_DEFAULT

            # check script params
            if SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
                smokeuser_principal = parameters[
                    SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY]
            if SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
                smokeuser_keytab = parameters[
                    SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY]

            # check configurations last as they should always take precedence
            if SMOKEUSER_PRINCIPAL_KEY in configurations:
                smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY]
            if SMOKEUSER_KEYTAB_KEY in configurations:
                smokeuser_keytab = configurations[SMOKEUSER_KEYTAB_KEY]

            # Get the configured Kerberos executable search paths, if any
            kerberos_executable_search_paths = None
            if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
                kerberos_executable_search_paths = configurations[
                    KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]

            env = Environment.get_instance()

            # first request: the 7th argument is True and stdout is parsed as
            # the HTTP status code below
            stdout, stderr, time_millis = curl_krb_request(
                env.tmp_dir,
                smokeuser_keytab,
                smokeuser_principal,
                query_url,
                "webhcat_alert_cc_",
                kerberos_executable_search_paths,
                True,
                "WebHCat Server Status",
                smokeuser,
                connection_timeout=curl_connection_timeout)

            # check the response code
            response_code = int(stdout)

            # 0 indicates no connection
            if response_code == 0:
                label = CRITICAL_CONNECTION_MESSAGE.format(query_url)
                return (RESULT_CODE_CRITICAL, [label])

            # any other response aside from 200 is a problem
            if response_code != 200:
                label = CRITICAL_HTTP_MESSAGE.format(response_code, query_url)
                return (RESULT_CODE_CRITICAL, [label])

            # now that we have the http status and it was 200, get the content
            stdout, stderr, total_time = curl_krb_request(
                env.tmp_dir,
                smokeuser_keytab,
                smokeuser_principal,
                query_url,
                "webhcat_alert_cc_",
                kerberos_executable_search_paths,
                False,
                "WebHCat Server Status",
                smokeuser,
                connection_timeout=curl_connection_timeout)
            json_response = json.loads(stdout)
        except Exception as exception:
            # "as" form is valid on Python 2.6+ and Python 3 alike
            return (RESULT_CODE_CRITICAL, [str(exception)])
Ejemplo n.º 19
0
  def service_check(self, env):
    """
    Runs the HDFS service check.

    Steps, in order: optional kinit as the HDFS user, a "safemode get | grep OFF"
    probe, creation of /tmp and upload of /etc/passwd to a file named after
    get_unique_id_and_date() through HdfsResource, a probe of every JournalNode
    web UI (when JournalNode hosts exist) and, on a NameNode master with ZKFC
    hosts, a check that the ZKFC pid file points at a running process.

    NOTE(review): format() is called with a template only, so it presumably
    resolves {placeholders} from local variables and from params; local names
    that appear in templates must therefore not be renamed. Confirm against
    the helper.

    :param env: object on which the params module is registered via set_params
    :return: 1 when a JournalNode web UI gives no response in the Kerberos
             branch, otherwise None
    """
    import params

    env.set_params(params)
    unique = functions.get_unique_id_and_date()
    # "dir" and "unique" are referenced by name in the template below
    dir = '/tmp'
    tmp_file = format("{dir}/{unique}")

    # the command only succeeds when grep finds "OFF" in the safemode report
    safemode_command = format("dfsadmin -fs {namenode_address} -safemode get | grep OFF")

    if params.security_enabled:
      # obtain a Kerberos ticket for the HDFS user before touching HDFS
      Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
        user=params.hdfs_user
      )
    # passed tries=20 / try_sleep=3, i.e. the probe is presumably retried
    ExecuteHadoop(safemode_command,
                  user=params.hdfs_user,
                  logoutput=True,
                  conf_dir=params.hadoop_conf_dir,
                  try_sleep=3,
                  tries=20,
                  bin_dir=params.hadoop_bin_dir
    )
    # the three *_on_execute resources below are followed by an explicit
    # action="execute" call; presumably they are queued until then
    params.HdfsResource(dir,
                        type="directory",
                        action="create_on_execute",
                        mode=0777
    )
    # remove any previous file of the same name before uploading
    params.HdfsResource(tmp_file,
                        type="file",
                        action="delete_on_execute",
    )

    params.HdfsResource(tmp_file,
                        type="file",
                        source="/etc/passwd",
                        action="create_on_execute"
    )
    params.HdfsResource(None, action="execute")

    if params.has_journalnode_hosts:
      if params.security_enabled:
        # Kerberos: request each JournalNode web UI through curl_krb_request
        for host in params.journalnode_hosts:
          # journalnode_port is not a local here; format() presumably takes it
          # from params
          if params.https_only:
            uri = format("https://{host}:{journalnode_port}")
          else:
            uri = format("http://{host}:{journalnode_port}")
          response, errmsg, time_millis = curl_krb_request(params.tmp_dir, params.smoke_user_keytab,
                                                           params.smokeuser_principal, uri, "jn_service_check",
                                                           params.kinit_path_local, False, None, params.smoke_user)
          if not response:
            Logger.error("Cannot access WEB UI on: {0}. Error : {1}", uri, errmsg)
            # NOTE(review): failure is signalled by a return value, not an
            # exception; confirm that the caller inspects it
            return 1
      else:
        # no Kerberos: write the checkWebUI.py helper to tmp_dir and run it
        # against all JournalNode hosts at once
        journalnode_port = params.journalnode_port
        checkWebUIFileName = "checkWebUI.py"
        checkWebUIFilePath = format("{tmp_dir}/{checkWebUIFileName}")
        comma_sep_jn_hosts = ",".join(params.journalnode_hosts)
        checkWebUICmd = format("python {checkWebUIFilePath} -m {comma_sep_jn_hosts} -p {journalnode_port} -s {https_only}")
        File(checkWebUIFilePath,
             content=StaticFile(checkWebUIFileName),
             mode=0775)

        Execute(checkWebUICmd,
                logoutput=True,
                try_sleep=3,
                tries=5,
                user=params.smoke_user
        )

    if params.is_namenode_master:
      if params.has_zkfc_hosts:
        # the command succeeds only if the pid file exists and "ps -p" finds
        # the process id stored in it
        pid_dir = format("{hadoop_pid_dir_prefix}/{hdfs_user}")
        pid_file = format("{pid_dir}/hadoop-{hdfs_user}-zkfc.pid")
        check_zkfc_process_cmd = as_user(format(
          "ls {pid_file} >/dev/null 2>&1 && ps -p `cat {pid_file}` >/dev/null 2>&1"), user=params.hdfs_user)
        Execute(check_zkfc_process_cmd,
                logoutput=True,
                try_sleep=3,
                tries=5
        )
Ejemplo n.º 20
0
def execute(configurations={}, parameters={}, host_name=None):
    """
    Reports whether the NameNode on this host states that the HDFS upgrade is
    finalized, using the "UpgradeFinalized" attribute of its NameNodeInfo JMX
    bean.

    Returns a tuple containing the result code and a pre-formatted result
    label: 'OK' when finalized, 'CRITICAL' when not, 'SKIPPED' when hdfs-site
    or a NameNode address for this host is missing, and 'UNKNOWN' when no
    configurations were supplied, the smoke user is missing for a Kerberos
    request, or the query raised.

    Keyword arguments:
    configurations : a mapping of configuration key to value
    parameters : a mapping of script parameter key to value
    host_name : the name of this host where the alert is running

    :type configurations dict
    :type parameters dict
    :type host_name str
    """

    if configurations is None:
        return ('UNKNOWN',
                ['There were no configurations supplied to the script.'])

    # accept an explicit None the same way as the (never mutated) empty default
    if parameters is None:
        parameters = {}

    # host_name is needed for the _HOST substitution and for matching the
    # NameNode address; without a fallback both raise TypeError on None
    if host_name is None:
        import socket
        host_name = socket.getfqdn()

    # hdfs-site is required
    if HDFS_SITE_KEY not in configurations:
        return 'SKIPPED', [
            '{0} is a required parameter for the script'.format(HDFS_SITE_KEY)
        ]

    http_policy = 'HTTP_ONLY'
    if NN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[NN_HTTP_POLICY_KEY]

    # always bound, so a missing key can be reported instead of surfacing as
    # an UnboundLocalError further down
    smokeuser = None
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = None
    if EXECUTABLE_SEARCH_PATHS in configurations:
        executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(
            configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        # substitute the _HOST placeholder with this host's name
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER,
                                    DEFAULT_KERBEROS_KINIT_TIMER_MS)

    # determine the right URI and whether to use SSL
    hdfs_site = configurations[HDFS_SITE_KEY]

    scheme = "https" if http_policy == "HTTPS_ONLY" else "http"

    # pick the first NameNode address that belongs to this host
    uri = None
    nn_addresses = get_all_namenode_addresses(hdfs_site)
    for nn_address in nn_addresses:
        if nn_address.startswith(host_name + ":") or nn_address == host_name:
            uri = nn_address
            break
    if not uri:
        return 'SKIPPED', [
            'NameNode on host {0} not found (namenode adresses = {1})'.format(
                host_name, ', '.join(nn_addresses))
        ]

    upgrade_finalized_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(
        scheme, uri)

    # start out assuming an OK status
    label = None
    result_code = "OK"

    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            if smokeuser is None:
                return ('UNKNOWN', [
                    '{0} is a required parameter for the script when Kerberos is enabled'
                    .format(SMOKEUSER_KEY)
                ])

            env = Environment.get_instance()

            last_checkpoint_time_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir,
                kerberos_keytab,
                kerberos_principal,
                upgrade_finalized_qry,
                "upgrade_finalized_state",
                executable_paths,
                False,
                "HDFS Upgrade Finalized State",
                smokeuser,
                kinit_timer_ms=kinit_timer_ms)

            upgrade_finalized_response_json = json.loads(
                last_checkpoint_time_response)
            upgrade_finalized = bool(upgrade_finalized_response_json["beans"]
                                     [0]["UpgradeFinalized"])

        else:
            upgrade_finalized = bool(
                get_value_from_jmx(upgrade_finalized_qry, "UpgradeFinalized"))

        if upgrade_finalized:
            label = "HDFS cluster is not in the upgrade state"
            result_code = 'OK'
        else:
            label = "HDFS cluster is not finalized"
            result_code = 'CRITICAL'

    except Exception:
        # any failure while querying or parsing is reported with its traceback;
        # KeyboardInterrupt/SystemExit are no longer swallowed
        label = traceback.format_exc()
        result_code = 'UNKNOWN'

    return (result_code, [label])
def execute(configurations={}, parameters={}, host_name=None):
  """
  Requests the NodeManager "/ws/v1/node/info" resource and parses the JSON reply.

  Returns a tuple containing the result code and a pre-formatted result label
  when no configurations were supplied, when the smoke user is missing for a
  Kerberos request, or when the request fails with an HTTP error.

  NOTE(review): nothing after the HTTPError handler is present in this
  function, so a successful request falls off the end and returns None.

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """
  result_code = RESULT_CODE_UNKNOWN

  if configurations is None:
    return (result_code, ['There were no configurations supplied to the script.'])

  # accept an explicit None the same way as the (never mutated) empty default
  if parameters is None:
    parameters = {}

  # resolve the host up front: it is needed for the _HOST substitution below
  # as well as for the default NodeManager address
  if host_name is None:
    host_name = socket.getfqdn()

  scheme = 'http'
  http_uri = None
  https_uri = None
  http_policy = 'HTTP_ONLY'

  # always bound, so a missing key can be reported instead of surfacing as an
  # UnboundLocalError at the curl call
  smokeuser = None
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  executable_paths = None
  if EXECUTABLE_SEARCH_PATHS in configurations:
    executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  kerberos_keytab = None
  if KERBEROS_KEYTAB in configurations:
    kerberos_keytab = configurations[KERBEROS_KEYTAB]

  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
    kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  if NODEMANAGER_HTTP_ADDRESS_KEY in configurations:
    http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY]

  if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations:
    https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY]

  if YARN_HTTP_POLICY_KEY in configurations:
    http_policy = configurations[YARN_HTTP_POLICY_KEY]

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  # determine the right URI and whether to use SSL
  uri = http_uri
  if http_policy == 'HTTPS_ONLY':
    scheme = 'https'

    # the http address is kept when no https address is configured
    if https_uri is not None:
      uri = https_uri

  label = ''
  url_response = None
  node_healthy = 'false'
  total_time = 0

  # some yarn-site structures don't have the web ui address
  if uri is None:
    uri = '{0}:{1}'.format(host_name, NODEMANAGER_DEFAULT_PORT)

  if OSCheck.is_windows_family():
    uri_host, uri_port = uri.split(':')
    # on windows 0.0.0.0 is invalid address to connect but on linux it resolved to 127.0.0.1
    uri_host = resolve_address(uri_host)
    uri = '{0}:{1}'.format(uri_host, uri_port)

  query = "{0}://{1}/ws/v1/node/info".format(scheme, uri)

  try:
    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
      if smokeuser is None:
        return (RESULT_CODE_UNKNOWN,
          ['{0} is a required parameter for the script when Kerberos is enabled'.format(SMOKEUSER_KEY)])

      env = Environment.get_instance()

      # curl requires an integer timeout
      curl_connection_timeout = int(connection_timeout)

      url_response, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
        query, "nm_health_alert", executable_paths, False, "NodeManager Health", smokeuser,
        connection_timeout=curl_connection_timeout)

      json_response = json.loads(url_response)
    else:
      # execute the query for the JSON that describes the node
      url_response = urllib2.urlopen(query, timeout=connection_timeout)
      json_response = json.loads(url_response.read())
  except urllib2.HTTPError as httpError:
    # "as" form is valid on Python 2.6+ and Python 3 alike
    label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(httpError.code), query,
      str(httpError), traceback.format_exc())

    return (RESULT_CODE_CRITICAL, [label])
Ejemplo n.º 22
0
  def service_check(self, env):
    """
    Runs the HDFS service check.

    Steps, in order: optional kinit as the HDFS user, creation of
    params.hdfs_tmp_dir and upload of /etc/passwd to a file named after
    get_unique_id_and_date() through HdfsResource, a probe of every JournalNode
    web UI (when JournalNode hosts exist) and, on a NameNode master with ZKFC
    hosts, a check that the ZKFC pid file points at a running process.

    NOTE(review): format() is called with a template only, so it presumably
    resolves {placeholders} from local variables and from params; local names
    that appear in templates must therefore not be renamed. Confirm against
    the helper.

    :param env: object on which the params module is registered via set_params
    :return: 1 when a JournalNode web UI gives no response in the Kerberos
             branch, otherwise None
    """
    import params

    env.set_params(params)
    unique = functions.get_unique_id_and_date()
    # "dir" and "unique" are referenced by name in the template below
    dir = params.hdfs_tmp_dir
    tmp_file = format("{dir}/{unique}")

    """
    Ignore checking safemode, because this command is unable to get safemode state
    when 1 namenode is down in an HA setup (see more in HDFS-8277). Directly
    test HDFS availability by file system operations is consistent in both HA and
    non-HA environment.
    """
    # safemode_command = format("dfsadmin -fs {namenode_address} -safemode get | grep OFF")

    if params.security_enabled:
      # obtain a Kerberos ticket for the HDFS user before touching HDFS
      Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
        user=params.hdfs_user
      )
    #ExecuteHadoop(safemode_command,
    #              user=params.hdfs_user,
    #              logoutput=True,
    #              conf_dir=params.hadoop_conf_dir,
    #              try_sleep=3,
    #              tries=20,
    #              bin_dir=params.hadoop_bin_dir
    #)
    # the three *_on_execute resources below are followed by an explicit
    # action="execute" call; presumably they are queued until then
    params.HdfsResource(dir,
                        type="directory",
                        action="create_on_execute",
                        mode=0777
    )
    # remove any previous file of the same name before uploading
    params.HdfsResource(tmp_file,
                        type="file",
                        action="delete_on_execute",
    )

    params.HdfsResource(tmp_file,
                        type="file",
                        source="/etc/passwd",
                        action="create_on_execute"
    )
    params.HdfsResource(None, action="execute")

    if params.has_journalnode_hosts:
      if params.security_enabled:
        # Kerberos: request each JournalNode web UI through curl_krb_request
        for host in params.journalnode_hosts:
          # journalnode_port is not a local here; format() presumably takes it
          # from params
          if params.https_only:
            uri = format("https://{host}:{journalnode_port}")
          else:
            uri = format("http://{host}:{journalnode_port}")
          response, errmsg, time_millis = curl_krb_request(params.tmp_dir, params.smoke_user_keytab,
                                                           params.smokeuser_principal, uri, "jn_service_check",
                                                           params.kinit_path_local, False, None, params.smoke_user)
          if not response:
            Logger.error("Cannot access WEB UI on: {0}. Error : {1}", uri, errmsg)
            # NOTE(review): failure is signalled by a return value, not an
            # exception; confirm that the caller inspects it
            return 1
      else:
        # no Kerberos: write the checkWebUI.py helper to tmp_dir and run it
        # against all JournalNode hosts at once
        journalnode_port = params.journalnode_port
        checkWebUIFileName = "checkWebUI.py"
        checkWebUIFilePath = format("{tmp_dir}/{checkWebUIFileName}")
        comma_sep_jn_hosts = ",".join(params.journalnode_hosts)
        checkWebUICmd = format("ambari-python-wrap {checkWebUIFilePath} -m {comma_sep_jn_hosts} -p {journalnode_port} -s {https_only}")
        File(checkWebUIFilePath,
             content=StaticFile(checkWebUIFileName),
             mode=0775)

        Execute(checkWebUICmd,
                logoutput=True,
                try_sleep=3,
                tries=5,
                user=params.smoke_user
        )

    if params.is_namenode_master:
      if params.has_zkfc_hosts:
        # the command succeeds only if the pid file exists and "ps -p" finds
        # the process id stored in it
        pid_dir = format("{hadoop_pid_dir_prefix}/{hdfs_user}")
        pid_file = format("{pid_dir}/hadoop-{hdfs_user}-zkfc.pid")
        check_zkfc_process_cmd = as_user(format(
          "ls {pid_file} >/dev/null 2>&1 && ps -p `cat {pid_file}` >/dev/null 2>&1"), user=params.hdfs_user)
        Execute(check_zkfc_process_cmd,
                logoutput=True,
                try_sleep=3,
                tries=5
        )
def execute(configurations={}, parameters={}, host_name=None):
  """
  Checks the status reported by the WebHCat (Templeton) server on this host.

  The status URL is requested through curl_krb_request when security is
  enabled and through urllib2 otherwise; the "status" field of the JSON reply
  is then compared against WEBHCAT_OK_RESPONSE.

  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """

  result_code = RESULT_CODE_UNKNOWN

  if configurations is None:
    return (result_code, ['There were no configurations supplied to the script.'])

  # accept an explicit None the same way as the (never mutated) empty default
  if parameters is None:
    parameters = {}

  webhcat_port = WEBHCAT_PORT_DEFAULT
  if TEMPLETON_PORT_KEY in configurations:
    webhcat_port = int(configurations[TEMPLETON_PORT_KEY])

  # str() lets a real boolean through as well as the strings "true"/"True"
  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).lower() == 'true'

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  curl_connection_timeout = CURL_CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])
    # curl is handed the integer part of the timeout, rendered as text
    curl_connection_timeout = str(int(connection_timeout))

  # the alert will always run on the webhcat host
  if host_name is None:
    host_name = socket.getfqdn()

  smokeuser = SMOKEUSER_DEFAULT

  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  # webhcat always uses http, never SSL
  query_url = "http://{0}:{1}/templeton/v1/status?user.name={2}".format(host_name, webhcat_port, smokeuser)

  # initialize
  total_time = 0
  json_response = {}

  if security_enabled:
    try:
      # defaults
      smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT
      smokeuser_principal = SMOKEUSER_PRINCIPAL_DEFAULT

      # check script params
      if SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
        smokeuser_principal = parameters[SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY]
      if SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
        smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY]

      # check configurations last as they should always take precedence
      if SMOKEUSER_PRINCIPAL_KEY in configurations:
        smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY]
      if SMOKEUSER_KEYTAB_KEY in configurations:
        smokeuser_keytab = configurations[SMOKEUSER_KEYTAB_KEY]

      # Get the configured Kerberos executable search paths, if any
      kerberos_executable_search_paths = None
      if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
        kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]

      env = Environment.get_instance()

      # first request: the 7th argument is True and stdout is parsed as the
      # HTTP status code below
      stdout, stderr, time_millis = curl_krb_request(env.tmp_dir, smokeuser_keytab, smokeuser_principal,
                                                      query_url, "webhcat_alert_cc_", kerberos_executable_search_paths, True,
                                                      "WebHCat Server Status", smokeuser,
                                                      connection_timeout=curl_connection_timeout)

      # check the response code
      response_code = int(stdout)

      # No exception is active at this point, so traceback.format_exc() would
      # only contribute the text "None" to the label; the second value returned
      # by curl_krb_request is passed as the detail instead.
      # NOTE(review): assumes that value carries curl's error output - confirm.

      # 0 indicates no connection
      if response_code == 0:
        label = CRITICAL_CONNECTION_MESSAGE.format(query_url, stderr)
        return (RESULT_CODE_CRITICAL, [label])

      # any other response aside from 200 is a problem
      if response_code != 200:
        label = CRITICAL_HTTP_MESSAGE.format(response_code, query_url, stderr)
        return (RESULT_CODE_CRITICAL, [label])

      # now that we have the http status and it was 200, get the content
      stdout, stderr, total_time = curl_krb_request(env.tmp_dir, smokeuser_keytab, smokeuser_principal,
                                                      query_url, "webhcat_alert_cc_", kerberos_executable_search_paths,
                                                      False, "WebHCat Server Status", smokeuser,
                                                      connection_timeout=curl_connection_timeout)
      json_response = json.loads(stdout)
    except Exception:
      return (RESULT_CODE_CRITICAL, [traceback.format_exc()])
  else:
    url_response = None

    try:
      # execute the query for the JSON that includes WebHCat status
      start_time = time.time()
      url_response = urllib2.urlopen(query_url, timeout=connection_timeout)
      total_time = time.time() - start_time

      json_response = json.loads(url_response.read())
    except urllib2.HTTPError as httpError:
      label = CRITICAL_HTTP_MESSAGE.format(httpError.code, query_url, traceback.format_exc())
      return (RESULT_CODE_CRITICAL, [label])
    except Exception:
      label = CRITICAL_CONNECTION_MESSAGE.format(query_url, traceback.format_exc())
      return (RESULT_CODE_CRITICAL, [label])
    finally:
      if url_response is not None:
        try:
          url_response.close()
        except Exception:
          # closing is best effort; the result has already been decided
          pass

  # if status is not in the response, we can't do any check; return CRIT
  # (a reply that is not a JSON object is treated the same way instead of
  # raising on the membership test)
  if not isinstance(json_response, dict) or 'status' not in json_response:
    return (RESULT_CODE_CRITICAL, [CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE + str(json_response)])

  # URL response received, parse it
  webhcat_status = json_response['status']

  # proper JSON received, compare against known value; a status that is not a
  # string (e.g. null) cannot be the OK value
  try:
    status_is_ok = webhcat_status.lower() == WEBHCAT_OK_RESPONSE
  except AttributeError:
    status_is_ok = False

  if status_is_ok:
    result_code = RESULT_CODE_OK
    label = OK_MESSAGE.format(total_time, query_url)
  else:
    result_code = RESULT_CODE_CRITICAL
    label = CRITICAL_WEBHCAT_STATUS_MESSAGE.format(webhcat_status)

  return (result_code, [label])
Ejemplo n.º 24
0
def execute(configurations={}, parameters={}, host_name=None):
    """
  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations : a mapping of configuration key to value
  parameters : a mapping of script parameter key to value
  host_name : the name of this host where the alert is running

  :type configurations dict
  :type parameters dict
  :type host_name str
  """
    hostnames = host_name
    current_time = int(time.time()) * 1000

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    merge_ha_metrics = MERGE_HA_METRICS_PARAM_DEFAULT
    if MERGE_HA_METRICS_PARAM_KEY in parameters:
        merge_ha_metrics = parameters[MERGE_HA_METRICS_PARAM_KEY].lower(
        ) == 'true'

    metric_name = METRIC_NAME_PARAM_DEFAULT
    if METRIC_NAME_PARAM_KEY in parameters:
        metric_name = parameters[METRIC_NAME_PARAM_KEY]

    metric_units = METRIC_UNITS_DEFAULT
    if METRIC_UNITS_PARAM_KEY in parameters:
        metric_units = parameters[METRIC_UNITS_PARAM_KEY]

    app_id = APP_ID_PARAM_DEFAULT
    if APP_ID_PARAM_KEY in parameters:
        app_id = parameters[APP_ID_PARAM_KEY]

    interval = INTERVAL_PARAM_DEFAULT
    if INTERVAL_PARAM_KEY in parameters:
        interval = _coerce_to_integer(parameters[INTERVAL_PARAM_KEY])

    warning_threshold = DEVIATION_WARNING_THRESHOLD_DEFAULT
    if DEVIATION_WARNING_THRESHOLD_KEY in parameters:
        warning_threshold = _coerce_to_integer(
            parameters[DEVIATION_WARNING_THRESHOLD_KEY])

    critical_threshold = DEVIATION_CRITICAL_THRESHOLD_DEFAULT
    if DEVIATION_CRITICAL_THRESHOLD_KEY in parameters:
        critical_threshold = _coerce_to_integer(
            parameters[DEVIATION_CRITICAL_THRESHOLD_KEY])

    minimum_value_threshold = None
    if MINIMUM_VALUE_THRESHOLD_KEY in parameters:
        minimum_value_threshold = _coerce_to_integer(
            parameters[MINIMUM_VALUE_THRESHOLD_KEY])

    #parse configuration
    if configurations is None:
        return (RESULT_STATE_UNKNOWN,
                ['There were no configurations supplied to the script.'])

    # hdfs-site is required
    if not HDFS_SITE_KEY in configurations:
        return (RESULT_STATE_UNKNOWN, [
            '{0} is a required parameter for the script'.format(HDFS_SITE_KEY)
        ])

    if METRICS_COLLECTOR_VIP_HOST_KEY in configurations and METRICS_COLLECTOR_VIP_PORT_KEY in configurations:
        collector_host = configurations[METRICS_COLLECTOR_VIP_HOST_KEY]
        collector_port = int(configurations[METRICS_COLLECTOR_VIP_PORT_KEY])
    else:
        # ams-site/timeline.metrics.service.webapp.address is required
        if not METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY in configurations:
            return (RESULT_STATE_UNKNOWN, [
                '{0} is a required parameter for the script'.format(
                    METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY)
            ])
        else:
            collector_webapp_address = configurations[
                METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY].split(":")
            if valid_collector_webapp_address(collector_webapp_address):
                collector_host = select_metric_collector_for_sink(
                    app_id.lower()).split(":")[0]
                collector_port = int(collector_webapp_address[1])
            else:
                return (RESULT_STATE_UNKNOWN, [
                    '{0} value should be set as "fqdn_hostname:port", but set to {1}'
                    .format(
                        METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY,
                        configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY])
                ])

    namenode_service_rpc_address = None
    # hdfs-site is required
    if not HDFS_SITE_KEY in configurations:
        return (RESULT_STATE_UNKNOWN, [
            '{0} is a required parameter for the script'.format(HDFS_SITE_KEY)
        ])

    hdfs_site = configurations[HDFS_SITE_KEY]

    if 'dfs.namenode.servicerpc-address' in hdfs_site:
        namenode_service_rpc_address = hdfs_site[
            'dfs.namenode.servicerpc-address']

    # if namenode alert and HA mode
    if NAMESERVICE_KEY in configurations and app_id.lower() == 'namenode':
        # hdfs-site is required
        if not HDFS_SITE_KEY in configurations:
            return (RESULT_STATE_UNKNOWN, [
                '{0} is a required parameter for the script'.format(
                    HDFS_SITE_KEY)
            ])

        if SMOKEUSER_KEY in configurations:
            smokeuser = configurations[SMOKEUSER_KEY]

        executable_paths = None
        if EXECUTABLE_SEARCH_PATHS in configurations:
            executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

        # parse script arguments
        security_enabled = False
        if SECURITY_ENABLED_KEY in configurations:
            security_enabled = str(
                configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

        kerberos_keytab = None
        if KERBEROS_KEYTAB in configurations:
            kerberos_keytab = configurations[KERBEROS_KEYTAB]

        kerberos_principal = None
        if KERBEROS_PRINCIPAL in configurations:
            kerberos_principal = configurations[KERBEROS_PRINCIPAL]
            kerberos_principal = kerberos_principal.replace('_HOST', host_name)

        # determine whether or not SSL is enabled
        is_ssl_enabled = False
        if DFS_POLICY_KEY in configurations:
            dfs_policy = configurations[DFS_POLICY_KEY]
            if dfs_policy == "HTTPS_ONLY":
                is_ssl_enabled = True

        kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER,
                                        DEFAULT_KERBEROS_KINIT_TIMER_MS)

        name_service = configurations[NAMESERVICE_KEY]

        # look for dfs.ha.namenodes.foo
        nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
        if not nn_unique_ids_key in hdfs_site:
            return (RESULT_STATE_UNKNOWN, [
                'Unable to find unique NameNode alias key {0}'.format(
                    nn_unique_ids_key)
            ])

        namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
        jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

        if is_ssl_enabled:
            namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
            jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

        # now we have something like 'nn1,nn2,nn3,nn4'
        # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
        # ie dfs.namenode.http-address.hacluster.nn1
        namenodes = []
        active_namenodes = []
        nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
        for nn_unique_id in nn_unique_ids:
            key = namenode_http_fragment.format(name_service, nn_unique_id)

            if key in hdfs_site:
                # use str() to ensure that unicode strings do not have the u' in them
                value = str(hdfs_site[key])
                namenode = str(hdfs_site[key]).split(":")[0]

                namenodes.append(namenode)
                try:
                    jmx_uri = jmx_uri_fragment.format(value)
                    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
                        env = Environment.get_instance()

                        # curl requires an integer timeout
                        curl_connection_timeout = int(connection_timeout)
                        state_response, error_msg, time_millis = curl_krb_request(
                            env.tmp_dir,
                            kerberos_keytab,
                            kerberos_principal,
                            jmx_uri,
                            "ha_nn_health",
                            executable_paths,
                            False,
                            "NameNode High Availability Health",
                            smokeuser,
                            connection_timeout=curl_connection_timeout,
                            kinit_timer_ms=kinit_timer_ms)

                        state = _get_ha_state_from_json(state_response)
                    else:
                        state_response = get_jmx(jmx_uri, connection_timeout)
                        state = _get_ha_state_from_json(state_response)

                    if state == HDFS_NN_STATE_ACTIVE:
                        active_namenodes.append(namenode)

                        # Only check active NN
                        nn_service_rpc_address_key = 'dfs.namenode.servicerpc-address.{0}.{1}'.format(
                            name_service, nn_unique_id)
                        if nn_service_rpc_address_key in hdfs_site:
                            namenode_service_rpc_address = hdfs_site[
                                nn_service_rpc_address_key]
                    pass
                except:
                    logger.exception("Unable to determine the active NameNode")
        pass

        if merge_ha_metrics:
            hostnames = ",".join(namenodes)
            # run only on active NN, no need to run the same requests from the standby
            if host_name not in active_namenodes:
                return (RESULT_STATE_SKIPPED,
                        ['This alert will be reported by another host.'])
        pass

    # Skip service rpc alert if port is not enabled
    if not namenode_service_rpc_address and 'rpc.rpc.datanode' in metric_name:
        return (RESULT_STATE_SKIPPED, ['Service RPC port is not enabled.'])

    get_metrics_parameters = {
        "metricNames": metric_name,
        "appId": app_id,
        "hostname": hostnames,
        "startTime": current_time - interval * 60 * 1000,
        "endTime": current_time,
        "grouped": "true",
    }

    encoded_get_metrics_parameters = urllib.urlencode(get_metrics_parameters)

    try:
        conn = httplib.HTTPConnection(collector_host,
                                      int(collector_port),
                                      timeout=connection_timeout)
        conn.request("GET",
                     AMS_METRICS_GET_URL % encoded_get_metrics_parameters)
        response = conn.getresponse()
        data = response.read()
        conn.close()
    except Exception:
        return (RESULT_STATE_UNKNOWN, [
            "Unable to retrieve metrics from the Ambari Metrics service."
        ])

    if response.status != 200:
        return (RESULT_STATE_UNKNOWN, [
            "Unable to retrieve metrics from the Ambari Metrics service."
        ])

    data_json = json.loads(data)
    metrics = []
    # will get large standard deviation for multiple hosts,
    # if host1 reports small local values, but host2 reports large local values
    for metrics_data in data_json["metrics"]:
        metrics += metrics_data["metrics"].values()
    pass

    if not metrics or len(metrics) < 2:
        number_of_data_points = len(metrics) if metrics else 0
        return (RESULT_STATE_SKIPPED, [
            "There are not enough data points to calculate the standard deviation ({0} sampled)"
            .format(number_of_data_points)
        ])

    minimum_value_multiplier = 1
    if 'dfs.FSNamesystem.CapacityUsed' in metric_name:
        minimum_value_multiplier = 1024 * 1024  # MB to bytes
    elif 'rpc.rpc.datanode' in metric_name or 'rpc.rpc.client' in metric_name:
        minimum_value_multiplier = 1000  # seconds to millis

    if minimum_value_threshold:
        # Filter out points below min threshold
        metrics = [
            metric for metric in metrics
            if metric > (minimum_value_threshold * minimum_value_multiplier)
        ]
        if len(metrics) < 2:
            return (RESULT_STATE_OK, [
                'There were no data points above the minimum threshold of {0} seconds'
                .format(minimum_value_threshold)
            ])

    mean_value = mean(metrics)
    stddev = sample_standard_deviation(metrics)

    try:
        deviation_percent = stddev / float(mean_value) * 100
    except ZeroDivisionError:
        # should not be a case for this alert
        return (RESULT_STATE_SKIPPED, [
            "Unable to calculate the standard deviation because the mean value is 0"
        ])

    # log the AMS request
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug("""
    AMS request parameters - {0}
    AMS response - {1}
    Mean - {2}
    Standard deviation - {3}
    Percentage standard deviation - {4}
    """.format(encoded_get_metrics_parameters, data_json, mean_value, stddev,
               deviation_percent))

    mean_value_localized = locale.format("%.0f", mean_value, grouping=True)

    variance_value = (deviation_percent / 100.0) * mean_value
    variance_value_localized = locale.format("%.0f",
                                             variance_value,
                                             grouping=True)

    # check for CRITICAL status
    if deviation_percent > critical_threshold:
        threshold_value = ((critical_threshold / 100.0) * mean_value)
        threshold_value_localized = locale.format("%.0f",
                                                  threshold_value,
                                                  grouping=True)

        message = DEVIATION_THRESHOLD_MESSAGE.format(
            variance_value_localized, metric_units, deviation_percent,
            mean_value_localized, metric_units, threshold_value_localized,
            metric_units)

        return (RESULT_STATE_CRITICAL, [message])

    # check for WARNING status
    if deviation_percent > warning_threshold:
        threshold_value = ((warning_threshold / 100.0) * mean_value)
        threshold_value_localized = locale.format("%.0f",
                                                  threshold_value,
                                                  grouping=True)

        message = DEVIATION_THRESHOLD_MESSAGE.format(
            variance_value_localized, metric_units, deviation_percent,
            mean_value_localized, metric_units, threshold_value_localized,
            metric_units)

        return (RESULT_STATE_WARNING, [message])

    # return OK status; use the warning threshold as the value to compare against
    threshold_value = ((warning_threshold / 100.0) * mean_value)
    threshold_value_localized = locale.format("%.0f",
                                              threshold_value,
                                              grouping=True)

    message = DEVIATION_OK_MESSAGE.format(variance_value_localized,
                                          metric_units, warning_threshold,
                                          mean_value_localized, metric_units,
                                          threshold_value_localized,
                                          metric_units)

    return (RESULT_STATE_OK, [message])
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  The visible part of this function builds a JMX query for the
  ResourceManager RMNMInfo bean addressed through host_name and, when a
  Kerberos principal and keytab are configured and security is enabled,
  fetches it with curl_krb_request and parses the LiveNodeManagers attribute.

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """

  if configurations is None:
    return (('UNKNOWN', ['There were no configurations supplied to the script.']))

  # defaults used when the corresponding configuration keys are absent
  scheme = 'http'  
  http_uri = None
  https_uri = None
  http_policy = 'HTTP_ONLY'

  # security is considered enabled only when the value, as a string, equals
  # 'TRUE' ignoring case
  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  executable_paths = None
  if EXECUTABLE_SEARCH_PATHS in configurations:
    executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

  kerberos_keytab = None
  if KERBEROS_KEYTAB in configurations:
    kerberos_keytab = configurations[KERBEROS_KEYTAB]

  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
    # substitute the _HOST placeholder in the principal with this host's name
    # NOTE(review): host_name defaults to None, which str.replace rejects
    kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  if NODEMANAGER_HTTP_ADDRESS_KEY in configurations:
    http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY]

  if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations:
    https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY]

  if YARN_HTTP_POLICY_KEY in configurations:
    http_policy = configurations[YARN_HTTP_POLICY_KEY]
    
  # NOTE(review): smokeuser is bound only when SMOKEUSER_KEY is present, yet
  # it is passed to curl_krb_request below - confirm the key is always supplied
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  # determine the right URI and whether to use SSL
  uri = http_uri
  if http_policy == 'HTTPS_ONLY':
    scheme = 'https'

    # fall back to the HTTP address when no HTTPS address was configured
    if https_uri is not None:
      uri = https_uri

  # keep only the second ':'-separated field of the configured address (the
  # port for a "host:port" value) and pair it with this host's name
  # NOTE(review): uri is still None here when no address key was present, in
  # which case .split raises AttributeError
  uri = str(host_name) + ":" + uri.split(":")[1]
  live_nodemanagers_qry = "{0}://{1}/jmx?qry=Hadoop:service=ResourceManager,name=RMNMInfo".format(scheme, uri)
  convert_to_json_failed = False
  response_code = None
  try:
    # the Kerberos path is taken only when principal, keytab and the security
    # flag are all available
    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
      env = Environment.get_instance()

      # curl requires an integer timeout
      curl_connection_timeout = int(connection_timeout)

      # first request: the seventh positional argument is False here and True
      # in the retry below, where the first returned value is stored as
      # response_code - presumably it selects response body vs. HTTP status
      # code; confirm against curl_krb_request
      url_response, error_msg, time_millis  = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
        live_nodemanagers_qry, "nm_health_summary_alert", executable_paths, False,
        "NodeManager Health Summary", smokeuser, connection_timeout=curl_connection_timeout)

      try:
        url_response_json = json.loads(url_response)
        # LiveNodeManagers is itself decoded with json.loads, i.e. the bean
        # attribute is expected to hold a JSON-encoded string
        live_nodemanagers = json.loads(url_response_json["beans"][0]["LiveNodeManagers"])
      # Python 2 "except <type>, <name>" syntax; only ValueError is handled
      # here, so a missing key or an empty "beans" list is not caught by this
      # handler
      except ValueError, error:
        convert_to_json_failed = True
        logger.exception("[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}".
        format("NodeManager Health Summary", str(error)))

      # the body could not be decoded: repeat the request with the seventh
      # argument set to True and keep the first returned value as response_code
      if convert_to_json_failed:
        response_code, error_msg, time_millis  = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
          live_nodemanagers_qry, "nm_health_summary_alert", executable_paths, True,
          "NodeManager Health Summary", smokeuser, connection_timeout=curl_connection_timeout)
    else:
    # NOTE(review): this excerpt is cut off here - the body of this else
    # branch, the handlers of the enclosing try and the rest of the function
    # are not visible
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  The visible part of this function parses the alert parameters, resolves the
  metrics collector host and port, works out which NameNodes are active when
  the alert runs for a NameNode with a nameservice configured, and issues a
  GET request for the metric to the collector.

  Keyword arguments:
  configurations : a mapping of configuration key to value
  parameters : a mapping of script parameter key to value
  host_name : the name of this host where the alert is running

  :type configurations dict
  :type parameters dict
  :type host_name str
  """
  # by default metrics are requested for this host only; replaced further
  # down by all NameNode hosts when HA metrics are merged
  hostnames = host_name
  # whole seconds since the epoch, scaled to milliseconds
  current_time = int(time.time()) * 1000

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  # NOTE(review): .lower() assumes the parameter value is a string
  merge_ha_metrics = MERGE_HA_METRICS_PARAM_DEFAULT
  if MERGE_HA_METRICS_PARAM_KEY in parameters:
    merge_ha_metrics = parameters[MERGE_HA_METRICS_PARAM_KEY].lower() == 'true'

  metric_name = METRIC_NAME_PARAM_DEFAULT
  if METRIC_NAME_PARAM_KEY in parameters:
    metric_name = parameters[METRIC_NAME_PARAM_KEY]

  metric_units = METRIC_UNITS_DEFAULT
  if METRIC_UNITS_PARAM_KEY in parameters:
    metric_units = parameters[METRIC_UNITS_PARAM_KEY]

  app_id = APP_ID_PARAM_DEFAULT
  if APP_ID_PARAM_KEY in parameters:
    app_id = parameters[APP_ID_PARAM_KEY]

  interval = INTERVAL_PARAM_DEFAULT
  if INTERVAL_PARAM_KEY in parameters:
    interval = _coerce_to_integer(parameters[INTERVAL_PARAM_KEY])

  warning_threshold = DEVIATION_WARNING_THRESHOLD_DEFAULT
  if DEVIATION_WARNING_THRESHOLD_KEY in parameters:
    warning_threshold = _coerce_to_integer(parameters[DEVIATION_WARNING_THRESHOLD_KEY])

  critical_threshold = DEVIATION_CRITICAL_THRESHOLD_DEFAULT
  if DEVIATION_CRITICAL_THRESHOLD_KEY in parameters:
    critical_threshold = _coerce_to_integer(parameters[DEVIATION_CRITICAL_THRESHOLD_KEY])

  minimum_value_threshold = None
  if MINIMUM_VALUE_THRESHOLD_KEY in parameters:
    minimum_value_threshold = _coerce_to_integer(parameters[MINIMUM_VALUE_THRESHOLD_KEY])

  #parse configuration
  if configurations is None:
    return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])

  # hdfs-site is required
  if not HDFS_SITE_KEY in configurations:
    return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

  # prefer an explicitly configured VIP host/port; only the first entry of a
  # comma separated host value is used
  if METRICS_COLLECTOR_VIP_HOST_KEY in configurations and METRICS_COLLECTOR_VIP_PORT_KEY in configurations:
    collector_host = configurations[METRICS_COLLECTOR_VIP_HOST_KEY].split(',')[0]
    collector_port = int(configurations[METRICS_COLLECTOR_VIP_PORT_KEY])
  else:
    # ams-site/timeline.metrics.service.webapp.address is required
    if not METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY in configurations:
      return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY)])
    else:
      # the address is split on ':'; only the port (second field) is taken
      # from it, the host comes from select_metric_collector_for_sink
      collector_webapp_address = configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY].split(":")
      if valid_collector_webapp_address(collector_webapp_address):
        collector_host = select_metric_collector_for_sink(app_id.lower())
        collector_port = int(collector_webapp_address[1])
      else:
        return (RESULT_STATE_UNKNOWN, ['{0} value should be set as "fqdn_hostname:port", but set to {1}'.format(
          METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY, configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY])])

  namenode_service_rpc_address = None
  # hdfs-site is required
  # NOTE(review): redundant - the same check already returned above
  if not HDFS_SITE_KEY in configurations:
    return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

  hdfs_site = configurations[HDFS_SITE_KEY]

  # non-HA service RPC address; may be overridden by the per-NameNode key of
  # an active NameNode in the HA branch below
  if 'dfs.namenode.servicerpc-address' in hdfs_site:
    namenode_service_rpc_address = hdfs_site['dfs.namenode.servicerpc-address']

  # if namenode alert and HA mode
  if NAMESERVICE_KEY in configurations and app_id.lower() == 'namenode':
    # hdfs-site is required
    # NOTE(review): redundant - hdfs_site was already read above
    if not HDFS_SITE_KEY in configurations:
      return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

    # NOTE(review): smokeuser is bound only when SMOKEUSER_KEY is present, yet
    # it is passed to curl_krb_request below - confirm the key is always supplied
    if SMOKEUSER_KEY in configurations:
      smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = None
    if EXECUTABLE_SEARCH_PATHS in configurations:
      executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

    # parse script arguments
    # security is considered enabled only when the value, as a string, equals
    # 'TRUE' ignoring case
    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
      security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
      kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
      kerberos_principal = configurations[KERBEROS_PRINCIPAL]
      # substitute the _HOST placeholder in the principal with this host's name
      kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # determine whether or not SSL is enabled
    is_ssl_enabled = False
    if DFS_POLICY_KEY in configurations:
      dfs_policy = configurations[DFS_POLICY_KEY]
      if dfs_policy == "HTTPS_ONLY":
        is_ssl_enabled = True

    kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS)

    # NOTE(review): presumably the nameservice id this host belongs to; the
    # string concatenation below raises TypeError if the helper returns None -
    # confirm against get_name_service_by_hostname
    name_service = get_name_service_by_hostname(hdfs_site, host_name)

    # look for dfs.ha.namenodes.foo
    nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
    if not nn_unique_ids_key in hdfs_site:
      return (RESULT_STATE_UNKNOWN, ['Unable to find unique NameNode alias key {0}'.format(nn_unique_ids_key)])

    namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
    jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

    # switch both the hdfs-site key template and the JMX URL to https
    if is_ssl_enabled:
      namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
      jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

    # now we have something like 'nn1,nn2,nn3,nn4'
    # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
    # ie dfs.namenode.http-address.hacluster.nn1
    namenodes = []
    active_namenodes = []
    nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
    for nn_unique_id in nn_unique_ids:
      key = namenode_http_fragment.format(name_service, nn_unique_id)

      # NameNode ids without a matching address key are silently ignored
      if key in hdfs_site:
        # use str() to ensure that unicode strings do not have the u' in them
        value = str(hdfs_site[key])
        # host part of the address: the text before the first ':'
        namenode = str(hdfs_site[key]).split(":")[0]

        namenodes.append(namenode)
        try:
          jmx_uri = jmx_uri_fragment.format(value)
          # the Kerberos path is taken only when principal, keytab and the
          # security flag are all available
          if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)
            state_response, error_msg, time_millis = curl_krb_request(env.tmp_dir,
              kerberos_keytab, kerberos_principal, jmx_uri,"ha_nn_health", executable_paths, False,
              "NameNode High Availability Health", smokeuser, connection_timeout=curl_connection_timeout,
              kinit_timer_ms = kinit_timer_ms)

            state = _get_ha_state_from_json(state_response)
          else:
            state = _get_state_from_jmx(jmx_uri, connection_timeout)

          if state == HDFS_NN_STATE_ACTIVE:
            active_namenodes.append(namenode)

            # Only check active NN
            nn_service_rpc_address_key = 'dfs.namenode.servicerpc-address.{0}.{1}'.format(name_service, nn_unique_id)
            if nn_service_rpc_address_key in hdfs_site:
              namenode_service_rpc_address = hdfs_site[nn_service_rpc_address_key]
          pass
        # bare except: any failure for one NameNode is logged with its
        # traceback and the loop continues with the next id
        except:
          logger.exception("Unable to determine the active NameNode")
    pass

    if merge_ha_metrics:
      # request the metric for every NameNode host collected above
      hostnames = ",".join(namenodes)
      # run only on active NN, no need to run the same requests from the standby
      if host_name not in active_namenodes:
        return (RESULT_STATE_SKIPPED, ['This alert will be reported by another host.'])
    pass

  # Skip service rpc alert if port is not enabled
  if not namenode_service_rpc_address and 'rpc.rpc.datanode' in metric_name:
    return (RESULT_STATE_SKIPPED, ['Service RPC port is not enabled.'])

  # query window: from `interval` before now until now; interval * 60 * 1000
  # is subtracted from a millisecond timestamp, so interval is presumably
  # expressed in minutes
  get_metrics_parameters = {
    "metricNames": metric_name,
    "appId": app_id,
    "hostname": hostnames,
    "startTime": current_time - interval * 60 * 1000,
    "endTime": current_time,
    "grouped": "true",
    }

  encoded_get_metrics_parameters = urllib.urlencode(get_metrics_parameters)

  # CA bundle handed to the collector connection helper
  ams_monitor_conf_dir = "/etc/ambari-metrics-monitor/conf"
  metric_truststore_ca_certs='ca.pem'
  ca_certs = os.path.join(ams_monitor_conf_dir,
                          metric_truststore_ca_certs)
  # NOTE(review): direct indexing - raises KeyError for a dict without
  # AMS_HTTP_POLICY; confirm the key is always supplied
  metric_collector_https_enabled = str(configurations[AMS_HTTP_POLICY]) == "HTTPS_ONLY"

  _ssl_version = _get_ssl_version()
  try:
    conn = network.get_http_connection(
      collector_host,
      int(collector_port),
      metric_collector_https_enabled,
      ca_certs,
      ssl_version=_ssl_version
    )
    conn.request("GET", AMS_METRICS_GET_URL % encoded_get_metrics_parameters)
    response = conn.getresponse()
    data = response.read()
    # NOTE(review): conn.close() is skipped when request/getresponse/read raise
    conn.close()
  # Python 2 "except <type>, <name>" syntax; the error text is logged at INFO
  # level and the alert is reported as UNKNOWN
  except Exception, e:
    logger.info(str(e))
    return (RESULT_STATE_UNKNOWN, ["Unable to retrieve metrics from the Ambari Metrics service."])
  # NOTE(review): this excerpt ends here - `response` and `data` are not used
  # in the visible code, so the remainder of the function appears to be missing
Ejemplo n.º 27
0
      if json_is_valid:
        for attr in jmx_property_value:
          if attr not in json_data:
            beans = json_response['beans']
            for jmx_prop_list_item in beans:
              if "name" in jmx_prop_list_item and jmx_prop_list_item["name"] == jmx_property_key:
                if attr not in jmx_prop_list_item:
                  raise Exception("Unable to find {0} in JSON from {1} ".format(attr, url))
                json_data = jmx_prop_list_item

          value_list.append(json_data[attr])

      http_response_code = None
      if not json_is_valid and security_enabled and kerberos_principal is not None and kerberos_keytab is not None:
        http_response_code, error_msg, time_millis = curl_krb_request(tmp_dir, kerberos_keytab,
          kerberos_principal, url, "metric_alert", kerberos_executable_search_paths, True,
          self.get_name(), smokeuser, connection_timeout=self.curl_connection_timeout,
          kinit_timer_ms = self.kinit_timeout)

    return (value_list, http_response_code)

  def _get_reporting_text(self, state):
    '''
    Always returns {0} since the result of the script alert is a rendered string.
    This will ensure that the base class takes the result string and just uses
    it directly.

    :param state: the state of the alert in uppercase (such as OK, WARNING, etc)
    :return:  the parameterized text
    '''
    return '{0}'
Ejemplo n.º 28
0
def execute(configurations={}, parameters={}, host_name=None):
    """
    Report how stale the most recent NameNode checkpoint is.

    Reads LastCheckpointTime and JournalTransactionInfo from the JMX endpoint
    of the NameNode whose address starts with ``host_name`` and compares both
    the time elapsed since the last checkpoint and the number of transactions
    written since then against the configured thresholds.

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running

    Returns a tuple containing the result code and a pre-formatted result
    label wrapped in a single-element list.  Errors raised while querying or
    evaluating the metrics are not propagated: they are reported as 'UNKNOWN'
    with the formatted traceback as the label.
    """
    if configurations is None:
        return ('UNKNOWN',
                ['There were no configurations supplied to the script.'])

    # Tolerate callers that pass None explicitly instead of omitting it.
    if parameters is None:
        parameters = {}

    # hdfs-site is required
    if HDFS_SITE_KEY not in configurations:
        return (RESULT_STATE_UNKNOWN, [
            '{0} is a required parameter for the script'.format(HDFS_SITE_KEY)
        ])

    http_policy = configurations.get(NN_HTTP_POLICY_KEY, 'HTTP_ONLY')
    checkpoint_tx = configurations.get(NN_CHECKPOINT_TX_KEY,
                                       CHECKPOINT_TX_DEFAULT)
    checkpoint_period = configurations.get(NN_CHECKPOINT_PERIOD_KEY,
                                           CHECKPOINT_PERIOD_DEFAULT)

    # Every optional setting gets an explicit None default.  In particular
    # smokeuser used to be bound only when its key was present, so the
    # Kerberos branch below failed with an unbound-variable error instead of
    # attempting the request.
    smokeuser = configurations.get(SMOKEUSER_KEY)
    executable_paths = configurations.get(EXECUTABLE_SEARCH_PATHS)
    kerberos_keytab = configurations.get(KERBEROS_KEYTAB)

    # security is enabled only when the value, as a string, is 'TRUE'
    # (case-insensitive)
    security_enabled = (
        SECURITY_ENABLED_KEY in configurations and
        str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE')

    kerberos_principal = configurations.get(KERBEROS_PRINCIPAL)
    if kerberos_principal is not None and host_name is not None:
        # substitute _HOST in the kerberos principal with this host's name
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    percent_warning = PERCENT_WARNING_DEFAULT
    if PERCENT_WARNING_KEY in parameters:
        percent_warning = float(parameters[PERCENT_WARNING_KEY])

    percent_critical = PERCENT_CRITICAL_DEFAULT
    if PERCENT_CRITICAL_KEY in parameters:
        percent_critical = float(parameters[PERCENT_CRITICAL_KEY])

    checkpoint_txn_multiplier_warning = CHECKPOINT_TX_MULTIPLIER_WARNING_DEFAULT
    if CHECKPOINT_TX_MULTIPLIER_WARNING_KEY in parameters:
        checkpoint_txn_multiplier_warning = float(
            parameters[CHECKPOINT_TX_MULTIPLIER_WARNING_KEY])

    checkpoint_txn_multiplier_critical = CHECKPOINT_TX_MULTIPLIER_CRITICAL_DEFAULT
    if CHECKPOINT_TX_MULTIPLIER_CRITICAL_KEY in parameters:
        checkpoint_txn_multiplier_critical = float(
            parameters[CHECKPOINT_TX_MULTIPLIER_CRITICAL_KEY])

    kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER,
                                    DEFAULT_KERBEROS_KINIT_TIMER_MS)

    # determine the right URI and whether to use SSL
    hdfs_site = configurations[HDFS_SITE_KEY]
    scheme = "https" if http_policy == "HTTPS_ONLY" else "http"

    # Pick the NameNode address that belongs to this host.  Without a host
    # name nothing can match, so the alert is skipped below instead of
    # failing on None + ":".
    uri = None
    nn_addresses = get_all_namenode_addresses(hdfs_site)
    if host_name is not None:
        host_prefix = host_name + ":"
        for nn_address in nn_addresses:
            if nn_address.startswith(host_prefix):
                uri = nn_address
                break
    if not uri:
        return (RESULT_STATE_SKIPPED, [
            'NameNode on host {0} not found (namenode adresses = {1})'.format(
                host_name, ', '.join(nn_addresses))
        ])

    # current time in milliseconds
    current_time = int(round(time.time() * 1000))

    last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format(
        scheme, uri)
    journal_transaction_info_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(
        scheme, uri)

    # start out assuming an OK status
    label = None
    result_code = "OK"

    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)

            def _fetch_first_bean(jmx_query):
                # Run one kerberized JMX request and return its first bean.
                response, _error_msg, _time_millis = curl_krb_request(
                    env.tmp_dir,
                    kerberos_keytab,
                    kerberos_principal,
                    jmx_query,
                    "checkpoint_time_alert",
                    executable_paths,
                    False,
                    "NameNode Last Checkpoint",
                    smokeuser,
                    connection_timeout=curl_connection_timeout,
                    kinit_timer_ms=kinit_timer_ms)
                return json.loads(response)["beans"][0]

            last_checkpoint_time = int(
                _fetch_first_bean(last_checkpoint_time_qry)
                ["LastCheckpointTime"])
            journal_transaction_info = _fetch_first_bean(
                journal_transaction_info_qry)["JournalTransactionInfo"]
        else:
            last_checkpoint_time = int(
                get_value_from_jmx(last_checkpoint_time_qry,
                                   "LastCheckpointTime", connection_timeout))

            journal_transaction_info = get_value_from_jmx(
                journal_transaction_info_qry, "JournalTransactionInfo",
                connection_timeout)

        # the bean attribute is itself a JSON-encoded string
        journal_transaction_info_dict = json.loads(journal_transaction_info)

        last_tx = int(
            journal_transaction_info_dict['LastAppliedOrWrittenTxId'])
        most_recent_tx = int(
            journal_transaction_info_dict['MostRecentCheckpointTxId'])
        transaction_difference = last_tx - most_recent_tx

        # both timestamps are in milliseconds; delta is in seconds
        delta = (current_time - last_checkpoint_time) / 1000

        elapsed = get_time(delta)
        label = LABEL.format(h=elapsed['h'],
                             m=elapsed['m'],
                             tx=transaction_difference)

        is_checkpoint_txn_warning = transaction_difference > checkpoint_txn_multiplier_warning * int(
            checkpoint_tx)
        is_checkpoint_txn_critical = transaction_difference > checkpoint_txn_multiplier_critical * int(
            checkpoint_tx)

        # Either too many uncommitted transactions or missed check-pointing for
        # long time decided by the thresholds.  The percentage is evaluated
        # lazily, only when the transaction check did not already decide.
        if is_checkpoint_txn_critical or (float(delta) / int(checkpoint_period)
                                          * 100 >= int(percent_critical)):
            logger.debug(
                'Raising critical alert: transaction_difference = {0}, checkpoint_tx = {1}'
                .format(transaction_difference, checkpoint_tx))
            result_code = 'CRITICAL'
        elif is_checkpoint_txn_warning or (
                float(delta) / int(checkpoint_period) * 100 >=
                int(percent_warning)):
            logger.debug(
                'Raising warning alert: transaction_difference = {0}, checkpoint_tx = {1}'
                .format(transaction_difference, checkpoint_tx))
            result_code = 'WARNING'

    except Exception:
        # Report every ordinary failure as UNKNOWN with its traceback, but let
        # SystemExit / KeyboardInterrupt propagate.
        label = traceback.format_exc()
        result_code = 'UNKNOWN'

    return (result_code, [label])
Ejemplo n.º 29
0
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  The visible part of this function builds a JMX query for the
  ResourceManager RMNMInfo bean addressed through host_name and, when a
  Kerberos principal and keytab are configured and security is enabled,
  fetches it with curl_krb_request and parses the LiveNodeManagers attribute.

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """

  if configurations is None:
    return (('UNKNOWN', ['There were no configurations supplied to the script.']))

  # defaults used when the corresponding configuration keys are absent
  scheme = 'http'  
  http_uri = None
  https_uri = None
  http_policy = 'HTTP_ONLY'

  # security is considered enabled only when the value, as a string, equals
  # 'TRUE' ignoring case
  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  kerberos_keytab = None
  if KERBEROS_KEYTAB in configurations:
    kerberos_keytab = configurations[KERBEROS_KEYTAB]

  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
    # substitute the _HOST placeholder in the principal with this host's name
    # NOTE(review): host_name defaults to None, which str.replace rejects
    kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  if NODEMANAGER_HTTP_ADDRESS_KEY in configurations:
    http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY]

  if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations:
    https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY]

  if YARN_HTTP_POLICY_KEY in configurations:
    http_policy = configurations[YARN_HTTP_POLICY_KEY]

  # parse script arguments
  # NOTE(review): connection_timeout is not passed to curl_krb_request in the
  # visible code
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  # determine the right URI and whether to use SSL
  uri = http_uri
  if http_policy == 'HTTPS_ONLY':
    scheme = 'https'

    # fall back to the HTTP address when no HTTPS address was configured
    if https_uri is not None:
      uri = https_uri

  # keep only the second ':'-separated field of the configured address (the
  # port for a "host:port" value) and pair it with this host's name
  # NOTE(review): uri is still None here when no address key was present, in
  # which case .split raises AttributeError
  uri = str(host_name) + ":" + uri.split(":")[1]
  live_nodemanagers_qry = "{0}://{1}/jmx?qry=Hadoop:service=ResourceManager,name=RMNMInfo".format(scheme, uri)
  convert_to_json_failed = False
  response_code = None
  try:
    # the Kerberos path is taken only when principal, keytab and the security
    # flag are all available
    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
      env = Environment.get_instance()
      # first request: the seventh positional argument is False here and True
      # in the retry below, where the first returned value is stored as
      # response_code - presumably it selects response body vs. HTTP status
      # code; confirm against curl_krb_request.  The sixth positional argument
      # is None in both calls.
      url_response, error_msg, time_millis  = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
                                              live_nodemanagers_qry, "nm_health_summary_alert", None, False,
                                              "NodeManager Health Summary")
      try:
        url_response_json = json.loads(url_response)
        # LiveNodeManagers is itself decoded with json.loads, i.e. the bean
        # attribute is expected to hold a JSON-encoded string
        live_nodemanagers = json.loads(url_response_json["beans"][0]["LiveNodeManagers"])
      # Python 2 "except <type>, <name>" syntax; only ValueError is handled
      # here, so a missing key or an empty "beans" list is not caught by this
      # handler
      except ValueError, error:
        convert_to_json_failed = True
        # the failure is logged only when DEBUG logging is enabled
        if logger.isEnabledFor(logging.DEBUG):
          logger.exception("[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}".
          format("NodeManager Health Summary", str(error)))

      # the body could not be decoded: repeat the request with the seventh
      # argument set to True and keep the first returned value as response_code
      if convert_to_json_failed:
        response_code, error_msg, time_millis  = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
                                                    live_nodemanagers_qry, "nm_health_summary_alert", None, True,
                                                    "NodeManager Health Summary")
    else:
    # NOTE(review): this excerpt is cut off here - the body of this else
    # branch, the handlers of the enclosing try and the rest of the function
    # are not visible
def execute(configurations={}, parameters={}, host_name=None):
  """
  Checks one field of a Storm topology, read from the Storm UI REST API,
  against a warning and a critical threshold.

  Returns a tuple containing the result code and a pre-formatted result label.
  The result code is one of 'OK', 'WARNING', 'CRITICAL' or 'UNKNOWN'; 'UNKNOWN'
  is returned for missing or invalid input and for any error raised while
  querying or evaluating the topology.

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """

  if configurations is None:
    return (('UNKNOWN', ['There were no configurations supplied to the script.']))

  if parameters is None:
    parameters = {}

  # Set configuration settings; anything absent stays None and is validated below
  stormuiport = configurations.get(STORM_UI_PORT)
  smokeuser = configurations.get(SMOKEUSER_KEY)
  executable_paths = configurations.get(EXECUTABLE_SEARCH_PATHS)
  kerberos_keytab = configurations.get(KERBEROS_KEYTAB)

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  kerberos_principal = configurations.get(KERBEROS_PRINCIPAL)
  if kerberos_principal is not None and host_name is not None:
    # substitute _HOST in kerberos principal with the supplied host name
    kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  warning_val = parameters.get(WARNING_KEY)
  critical_val = parameters.get(CRITICAL_KEY)
  comparison_val = parameters.get(COMPARISON_KEY)
  field_type_val = parameters.get(FIELD_TYPE_KEY)
  field_name_val = parameters.get(FIELD_NAME_KEY)
  topology_id_val = parameters.get(TOPOLOGY_ID_KEY)

  # When HTTPS is enabled the port must come from the parameters and it
  # overrides the configured Storm UI port
  if HTTPS_ENABLED_KEY in parameters and str(parameters[HTTPS_ENABLED_KEY]).lower() == 'true':
    if HTTPS_PORT_KEY not in parameters:
      return (('UNKNOWN', ['Please provide a port number as parameter: '+HTTPS_PORT_KEY]))
    stormuiport = str(parameters[HTTPS_PORT_KEY])
    protocol = 'https'
  else:
    protocol = 'http'

  use_kerberos = kerberos_principal is not None and kerberos_keytab is not None and security_enabled

  # Report absent inputs as UNKNOWN instead of failing later on an unbound name
  required_values = [
    (STORM_UI_PORT, stormuiport),
    (WARNING_KEY, warning_val),
    (CRITICAL_KEY, critical_val),
    (COMPARISON_KEY, comparison_val),
    (FIELD_TYPE_KEY, field_type_val),
    (FIELD_NAME_KEY, field_name_val),
    (TOPOLOGY_ID_KEY, topology_id_val),
  ]
  if use_kerberos:
    # the smoke user is only passed to the kerberized curl request
    required_values.append((SMOKEUSER_KEY, smokeuser))

  missing = [str(key) for key, value in required_values if value is None]
  if missing:
    return (('UNKNOWN', ['Missing required configuration or parameter: '+', '.join(missing)]))

  # Check comparison and field type combination
  if field_type_val not in ALLOWED_COMPARISON_VALUES:
    return (('UNKNOWN', ['Field type error, must be one of: '+','.join(ALLOWED_COMPARISON_VALUES.keys())]))

  if comparison_val not in ALLOWED_COMPARISON_VALUES[field_type_val]:
    return (('UNKNOWN', ['Comparison error, must be one of: '+','.join(ALLOWED_COMPARISON_VALUES[field_type_val])+' for given field type: '+field_type_val+'. Type not valid: '+comparison_val]))

  if host_name is None:
    return (('UNKNOWN', ['No host name was supplied to the script.']))

  def fetch(url):
    """ Returns the response body for url, using kerberized curl when security is enabled """
    if use_kerberos:
      # curl requires an integer timeout
      # NOTE(review): the first argument is hard-coded to '/tmp/' - presumably the
      # directory used for the kerberos ticket cache; confirm against curl_krb_request
      body, error_msg, time_millis = curl_krb_request('/tmp/', kerberos_keytab, kerberos_principal, url, "storm_topology", executable_paths, False, "Storm Topology Rest API", smokeuser, connection_timeout=int(connection_timeout))
      return body

    # Non-kerberos request; honour the configured timeout and always release the connection
    response = urllib2.urlopen(urllib2.Request(url), timeout=connection_timeout)
    try:
      return response.read()
    finally:
      response.close()

  label = None
  result_code = "OK"

  try:
    topology_api = protocol+'://'+host_name+':'+str(stormuiport)+'/api/v1/topology/'

    # Get summary to check if the topology is in there
    summary = json.loads(fetch(topology_api+'summary'))

    # A topology matches when the requested value equals its id or its name;
    # more than one match is ambiguous regardless of the order they are listed in
    matching_ids = [top['id'] for top in summary['topologies']
                    if topology_id_val == top['id'] or topology_id_val == top['name']]

    if len(matching_ids) > 1:
      return (('UNKNOWN', ['Multiple topologies for with id or name: '+topology_id_val]))

    topology_id = matching_ids[0] if matching_ids else None
    if not topology_id:
      return (('UNKNOWN', ['No topology found with id or name: '+topology_id_val]))

    # Get topology information
    topology_response = fetch(topology_api+topology_id)
    field_val = json.loads(topology_response)

    # Retrieve value by walking the dot-separated field path
    for field in field_name_val.split('.'):
      if not isinstance(field_val, dict) or field not in field_val:
        return (('UNKNOWN', ['Could not find field: '+field_name_val+' in response: '+topology_response]))

      field_val = field_val[field]
      if isinstance(field_val, list):
        # pick the list entry whose 'window' equals DEFAULT_WINDOW_VALUE, if any
        for elem in field_val:
          if isinstance(elem, dict) and 'window' in elem and elem['window'] == DEFAULT_WINDOW_VALUE:
            field_val = elem
            break

    # Cast all three values to appropriate type (fixed order keeps error reporting deterministic)
    field_values = dict()
    for name, raw_value in (('field', field_val), ('WARNING', warning_val), ('CRITICAL', critical_val)):
      success, value = try_cast(raw_value, field_type_val)
      if not success:
        return (('UNKNOWN', [name+' error: '+value]))
      field_values[name] = value

    # Assume correct
    label = 'The current value is {c}. Warning threshold is {o} {w} and critical threshold is {o} {t}.'.format(c=field_values['field'],o=comparison_val,w=field_values['WARNING'],t=field_values['CRITICAL'])

    # Perform comparison for each level; CRITICAL is evaluated last so it takes precedence
    for level in ['WARNING', 'CRITICAL']:
      if comparison(field_values['field'],field_values[level],comparison_val):
        result_code = level
        label = 'The current value is {c}, the threshold is {o} {t}'.format(c=field_values['field'],o=comparison_val,t=field_values[level])

  # Catch any errors raised while querying or evaluating the topology,
  # but let KeyboardInterrupt/SystemExit propagate
  except Exception:
    label = traceback.format_exc()
    result_code = 'UNKNOWN'

  label = 'Topology: '+topology_id_val+', '+label
  return ((result_code, [label]))