def _load_jmx(self, ssl, host, port, jmx_metric):
    """ creates a JmxMetric object that holds info about jmx-based metrics """
    # NOTE(review): this view of the method is truncated after the trailing
    # "finally:" below — the cleanup body and whatever is done with
    # value_list/content are not visible here; confirm against the full file.
    value_list = []
    kerberos_keytab = None
    kerberos_principal = None

    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(str(jmx_metric.property_map))

    # cluster-wide kerberos flag; string compare is case-insensitive
    security_enabled = str(self._get_configuration_value(SECURITY_ENABLED_KEY)).upper() == 'TRUE'

    if self.uri_property_keys.kerberos_principal is not None:
        kerberos_principal = self._get_configuration_value(
            self.uri_property_keys.kerberos_principal)

        if kerberos_principal is not None:
            # substitute _HOST in kerberos principal with actual fqdn
            kerberos_principal = kerberos_principal.replace('_HOST', self.host_name)

    if self.uri_property_keys.kerberos_keytab is not None:
        kerberos_keytab = self._get_configuration_value(self.uri_property_keys.kerberos_keytab)

    # a wildcard bind address is not connectable; use this agent's host instead
    if "0.0.0.0" in str(host):
        host = self.host_name

    # one JMX query per configured property key
    for jmx_property_key, jmx_property_value in jmx_metric.property_map.iteritems():
        url = "{0}://{1}:{2}/jmx?qry={3}".format(
            "https" if ssl else "http", host, str(port), jmx_property_key)

        # use a customer header processor that will look for the non-standard
        # "Refresh" header and attempt to follow the redirect
        response = None
        content = ''
        try:
            # kerberized clusters go through curl with a ccache; otherwise
            # a plain urllib2 opener (with Refresh-header redirect support)
            if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
                tmp_dir = Constants.AGENT_TMP_DIR
                if tmp_dir is None:
                    tmp_dir = gettempdir()

                kerberos_executable_search_paths = self._get_configuration_value('{{kerberos-env/executable_search_paths}}')
                smokeuser = self._get_configuration_value('{{cluster-env/smokeuser}}')

                response, error_msg, time_millis = curl_krb_request(tmp_dir, kerberos_keytab, kerberos_principal, url,
                    "metric_alert", kerberos_executable_search_paths, False, self.get_name(), smokeuser,
                    connection_timeout=self.curl_connection_timeout, kinit_timer_ms = self.kinit_timeout)

                content = response
            else:
                url_opener = urllib2.build_opener(RefreshHeaderProcessor())
                response = url_opener.open(url, timeout=self.connection_timeout)
                content = response.read()
        except Exception, exception:
            # failures are only logged at DEBUG; the alert proceeds best-effort
            if logger.isEnabledFor(logging.DEBUG):
                logger.exception("[Alert][{0}] Unable to make a web request: {1}".format(self.get_name(), str(exception)))
        finally:
def _make_web_request(self, url): """ Makes an http(s) request to a web resource and returns the http code. If there was an error making the request, return 0 for the status code. """ error_msg = None try: response_code = 0 kerberos_keytab = None kerberos_principal = None if self.uri_property_keys.kerberos_principal is not None: kerberos_principal = self._get_configuration_value( self.uri_property_keys.kerberos_principal) if kerberos_principal is not None: # substitute _HOST in kerberos principal with actual fqdn kerberos_principal = kerberos_principal.replace('_HOST', self.host_name) if self.uri_property_keys.kerberos_keytab is not None: kerberos_keytab = self._get_configuration_value(self.uri_property_keys.kerberos_keytab) security_enabled = self._get_configuration_value('{{cluster-env/security_enabled}}') if kerberos_principal is not None and kerberos_keytab is not None \ and security_enabled is not None and security_enabled.lower() == "true": # Create the kerberos credentials cache (ccache) file and set it in the environment to use # when executing curl. Use the md5 hash of the combination of the principal and keytab file # to generate a (relatively) unique cache filename so that we can use it as needed. 
tmp_dir = Constants.AGENT_TMP_DIR if tmp_dir is None: tmp_dir = gettempdir() # Get the configured Kerberos executables search paths, if any kerberos_executable_search_paths = self._get_configuration_value('{{kerberos-env/executable_search_paths}}') smokeuser = self._get_configuration_value('{{cluster-env/smokeuser}}') response_code, error_msg, time_millis = curl_krb_request(tmp_dir, kerberos_keytab, kerberos_principal, url, "web_alert", kerberos_executable_search_paths, True, self.get_name(), smokeuser, connection_timeout=self.curl_connection_timeout) else: # kerberos is not involved; use urllib2 response_code, time_millis, error_msg = self._make_web_request_urllib(url) return WebResponse(status_code=response_code, time_millis=time_millis, error_msg=error_msg) except Exception, exception: if logger.isEnabledFor(logging.DEBUG): logger.exception("[Alert][{0}] Unable to make a web request.".format(self.get_name())) return WebResponse(status_code=0, time_millis=0, error_msg=str(exception))
def call_curl_request(self,user,keytab,principal, url, flag_http_response, request_method='GET',request_body='',header=''):
    """
    Execute a kerberized curl call against a Ranger Admin endpoint.

    :param user: service user for which call is to be made
    :param keytab: keytab of service user
    :param principal: principal of service user
    :param url: url with which call is to be made
    :param flag_http_response: flag to get only response-code or response string
    :param request_method: http method (GET / POST / PUT / DELETE)
    :param request_body: data to be send along with the request
    :param header: http header required for the call
    :return: tuple of (response, error_msg, time_millis)
    """
    tmp_dir = Environment.get_instance().tmp_dir

    # Delegate to the shared kerberized-curl helper; kinit_timer_ms=0 means
    # the ticket cache freshness is checked on every call.
    return curl_krb_request(tmp_dir, keytab, principal, url,
        'ranger_admin_calls', None, flag_http_response,
        "Ranger-Admin API calls", user, kinit_timer_ms=0,
        method=request_method, body=request_body, header=header)
def get_jmx_data(nn_address, modeler_type, metric, encrypted=False, security_enabled=False):
    """
    Fetch a single metric from a NameNode's /jmx endpoint.

    :param nn_address: Namenode Address, e.g., host:port, ** MAY ** be preceded with
      "http://" or "https://" already. If not preceded, will use the encrypted param
      to determine.
    :param modeler_type: Modeler type to query using startswith function
    :param metric: Metric to return
    :return: Return an object representation of the metric, or None if it does not exist
    """
    # all three pieces of information are required to build and match a query
    if not nn_address or not modeler_type or not metric:
        return None

    # normalize into a full scheme://host:port/jmx endpoint URL
    endpoint = nn_address.strip()
    if not endpoint.startswith("http"):
        endpoint = ("https://" if encrypted else "http://") + endpoint
    if not endpoint.endswith("/"):
        endpoint = endpoint + "/"
    endpoint = endpoint + "jmx"

    Logger.info("Retrieve modeler: %s, metric: %s from JMX endpoint %s" % (modeler_type, metric, endpoint))

    if security_enabled:
        # kerberized cluster: fetch via curl with the smoke user's credentials
        import params
        data, error_msg, time_millis = curl_krb_request(params.tmp_dir,
            params.smoke_user_keytab, params.smokeuser_principal, endpoint,
            "jn_upgrade", params.kinit_path_local, False, None, params.smoke_user)
    else:
        data = urllib2.urlopen(endpoint).read()

    result = None
    if data:
        parsed = json.loads(data)
        if parsed:
            for bean in parsed['beans']:
                if bean is None or bean['modelerType'] is None:
                    continue
                if not bean['modelerType'].startswith(modeler_type):
                    continue
                if metric in bean:
                    result = bean[metric]
                    if result:
                        # metric payloads arrive as JSON-encoded strings
                        result = json.loads(str(result))
                        break

    return result
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """
    if configurations is None:
        return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])

    # no nameservice configured means HA is off; nothing to check
    if NAMESERVICE_KEY not in configurations:
        return (RESULT_STATE_SKIPPED, ['NameNode HA is not enabled'])

    # hdfs-site is required
    if HDFS_SITE_KEY not in configurations:
        return (RESULT_STATE_UNKNOWN,
            ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = configurations.get(EXECUTABLE_SEARCH_PATHS)

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    security_enabled = str(configurations.get(SECURITY_ENABLED_KEY, '')).upper() == 'TRUE'

    kerberos_keytab = configurations.get(KERBEROS_KEYTAB)

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        # substitute _HOST with this agent's host name
        kerberos_principal = configurations[KERBEROS_PRINCIPAL].replace('_HOST', host_name)

    kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER,
        DEFAULT_KERBEROS_KINIT_TIMER_MS)

    # HTTPS_ONLY means the NameNode web endpoints are SSL-only
    is_ssl_enabled = configurations.get(DFS_POLICY_KEY) == "HTTPS_ONLY"

    name_service = configurations[NAMESERVICE_KEY]
    hdfs_site = configurations[HDFS_SITE_KEY]

    # look for dfs.ha.namenodes.foo
    nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
    if nn_unique_ids_key not in hdfs_site:
        return (RESULT_STATE_UNKNOWN,
            ['Unable to find unique namenode alias key {0}'.format(nn_unique_ids_key)])

    if is_ssl_enabled:
        namenode_http_fragment = NAMENODE_HTTPS_FRAGMENT
        jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"
    else:
        namenode_http_fragment = NAMENODE_HTTP_FRAGMENT
        jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

    active_namenodes = []
    standby_namenodes = []
    unknown_namenodes = []

    # keys look like dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
    # ie dfs.namenode.http-address.hacluster.nn1
    for nn_unique_id in hdfs_site[nn_unique_ids_key].split(','):
        key = namenode_http_fragment.format(name_service, nn_unique_id)
        rpc_key = NAMENODE_RPC_FRAGMENT.format(name_service, nn_unique_id)

        if key not in hdfs_site:
            continue

        # use str() to ensure that unicode strings do not have the u' in them
        value = str(hdfs_site[key])

        # a wildcard http address cannot be polled; borrow the host portion
        # of the RPC address when that names a concrete host
        if INADDR_ANY in value and rpc_key in hdfs_site:
            rpc_value = str(hdfs_site[rpc_key])
            if INADDR_ANY not in rpc_value:
                value = value.replace(INADDR_ANY, rpc_value.split(":")[0])

        try:
            jmx_uri = jmx_uri_fragment.format(value)

            if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
                env = Environment.get_instance()

                # curl requires an integer timeout
                curl_connection_timeout = int(connection_timeout)

                state_response, error_msg, time_millis = curl_krb_request(
                    env.tmp_dir, kerberos_keytab, kerberos_principal, jmx_uri,
                    "ha_nn_health", executable_paths, False,
                    "NameNode High Availability Health", smokeuser,
                    connection_timeout=curl_connection_timeout,
                    kinit_timer_ms=kinit_timer_ms)
            else:
                state_response = get_jmx(jmx_uri, connection_timeout)

            state = _get_ha_state_from_json(state_response)

            if state == HDFS_NN_STATE_ACTIVE:
                active_namenodes.append(value)
            elif state == HDFS_NN_STATE_STANDBY:
                standby_namenodes.append(value)
            else:
                unknown_namenodes.append(value)
        except:
            logger.exception(LOGGER_EXCEPTION_MESSAGE.format(host_name))
            unknown_namenodes.append(value)

    result_label = 'Active{0}, Standby{1}, Unknown{2}'.format(
        str(active_namenodes), str(standby_namenodes), str(unknown_namenodes))

    # there's only one scenario here; there is exactly 1 active and 1 standby
    if len(active_namenodes) == 1 and len(standby_namenodes) == 1:
        return (RESULT_STATE_OK, [result_label])

    return (RESULT_STATE_CRITICAL, [result_label])
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """
    # NOTE(review): this view of the function is truncated after the except
    # block below — the final return of (result_code, [label]) is not visible
    # here; confirm against the full file.
    if configurations is None:
        return ("UNKNOWN", ["There were no configurations supplied to the script."])

    uri = None
    scheme = "http"
    http_uri = None
    https_uri = None
    http_policy = "HTTP_ONLY"
    checkpoint_tx = CHECKPOINT_TX_DEFAULT
    checkpoint_period = CHECKPOINT_PERIOD_DEFAULT

    if NN_HTTP_ADDRESS_KEY in configurations:
        http_uri = configurations[NN_HTTP_ADDRESS_KEY]

    if NN_HTTPS_ADDRESS_KEY in configurations:
        https_uri = configurations[NN_HTTPS_ADDRESS_KEY]

    if NN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[NN_HTTP_POLICY_KEY]

    if NN_CHECKPOINT_TX_KEY in configurations:
        checkpoint_tx = configurations[NN_CHECKPOINT_TX_KEY]

    if NN_CHECKPOINT_PERIOD_KEY in configurations:
        checkpoint_period = configurations[NN_CHECKPOINT_PERIOD_KEY]

    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = None
    if EXECUTABLE_SEARCH_PATHS in configurations:
        executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == "TRUE"

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        # substitute _HOST in the principal with this agent's host name
        kerberos_principal = kerberos_principal.replace("_HOST", host_name)

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    # warning/critical thresholds arrive as fractions and are scaled to percent
    percent_warning = PERCENT_WARNING_DEFAULT
    if PERCENT_WARNING_KEY in parameters:
        percent_warning = float(parameters[PERCENT_WARNING_KEY]) * 100

    percent_critical = PERCENT_CRITICAL_DEFAULT
    if PERCENT_CRITICAL_KEY in parameters:
        percent_critical = float(parameters[PERCENT_CRITICAL_KEY]) * 100

    # determine the right URI and whether to use SSL
    uri = http_uri
    if http_policy == "HTTPS_ONLY":
        scheme = "https"

        if https_uri is not None:
            uri = https_uri

    # current time in milliseconds since the epoch
    current_time = int(round(time.time() * 1000))

    last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format(scheme, uri)
    journal_transaction_info_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(scheme, uri)

    # start out assuming an OK status
    label = None
    result_code = "OK"

    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)

            last_checkpoint_time_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal, last_checkpoint_time_qry,
                "checkpoint_time_alert", executable_paths, False, "NameNode Last Checkpoint",
                smokeuser, connection_timeout=curl_connection_timeout,
            )

            last_checkpoint_time_response_json = json.loads(last_checkpoint_time_response)
            last_checkpoint_time = int(last_checkpoint_time_response_json["beans"][0]["LastCheckpointTime"])

            journal_transaction_info_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal, journal_transaction_info_qry,
                "checkpoint_time_alert", executable_paths, False, "NameNode Last Checkpoint",
                smokeuser, connection_timeout=curl_connection_timeout,
            )

            journal_transaction_info_response_json = json.loads(journal_transaction_info_response)
            journal_transaction_info = journal_transaction_info_response_json["beans"][0]["JournalTransactionInfo"]
        else:
            last_checkpoint_time = int(
                get_value_from_jmx(last_checkpoint_time_qry, "LastCheckpointTime", connection_timeout)
            )

            journal_transaction_info = get_value_from_jmx(
                journal_transaction_info_qry, "JournalTransactionInfo", connection_timeout
            )

        # JournalTransactionInfo is itself a JSON-encoded string
        journal_transaction_info_dict = json.loads(journal_transaction_info)

        # transactions written since the last checkpoint
        last_tx = int(journal_transaction_info_dict["LastAppliedOrWrittenTxId"])
        most_recent_tx = int(journal_transaction_info_dict["MostRecentCheckpointTxId"])
        transaction_difference = last_tx - most_recent_tx

        # seconds elapsed since the last checkpoint
        delta = (current_time - last_checkpoint_time) / 1000

        label = LABEL.format(h=get_time(delta)["h"], m=get_time(delta)["m"], tx=transaction_difference)

        # alert only when BOTH the transaction count and the elapsed-time
        # percentage of the checkpoint period exceed their thresholds
        if (transaction_difference > int(checkpoint_tx)) and (
            float(delta) / int(checkpoint_period) * 100 >= int(percent_critical)
        ):
            result_code = "CRITICAL"
        elif (transaction_difference > int(checkpoint_tx)) and (
            float(delta) / int(checkpoint_period) * 100 >= int(percent_warning)
        ):
            result_code = "WARNING"
    except Exception, e:
        label = str(e)
        result_code = "UNKNOWN"
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """
    # NOTE(review): this view of the function is truncated after the
    # security_enabled branch below — the non-kerberized request path and the
    # final result handling are not visible here; confirm against the full file.
    result_code = RESULT_CODE_UNKNOWN

    if configurations is None:
        return (result_code, ['There were no configurations supplied to the script.'])

    webhcat_port = WEBHCAT_PORT_DEFAULT
    if TEMPLETON_PORT_KEY in configurations:
        webhcat_port = int(configurations[TEMPLETON_PORT_KEY])

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = configurations[SECURITY_ENABLED_KEY].lower() == 'true'

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    curl_connection_timeout = CURL_CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])
        # curl expects its timeout as a whole-second string
        curl_connection_timeout = str(int(connection_timeout))

    # the alert will always run on the webhcat host
    if host_name is None:
        host_name = socket.getfqdn()

    smokeuser = SMOKEUSER_DEFAULT

    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    # webhcat always uses http, never SSL
    query_url = "http://{0}:{1}/templeton/v1/status?user.name={2}".format(host_name, webhcat_port, smokeuser)

    # initialize
    total_time = 0
    json_response = {}

    if security_enabled:
        try:
            # defaults
            smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT
            smokeuser_principal = SMOKEUSER_PRINCIPAL_DEFAULT

            # check script params
            if SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
                smokeuser_principal = parameters[SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY]

            if SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
                smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY]

            # check configurations last as they should always take precedence
            if SMOKEUSER_PRINCIPAL_KEY in configurations:
                smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY]

            if SMOKEUSER_KEYTAB_KEY in configurations:
                smokeuser_keytab = configurations[SMOKEUSER_KEYTAB_KEY]

            # Get the configured Kerberos executable search paths, if any
            kerberos_executable_search_paths = None
            if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
                kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]

            env = Environment.get_instance()

            # first request (True flag) fetches only the http status code
            stdout, stderr, time_millis = curl_krb_request(env.tmp_dir, smokeuser_keytab, smokeuser_principal,
                query_url, "webhcat_alert_cc_", kerberos_executable_search_paths, True,
                "WebHCat Server Status", smokeuser, connection_timeout=curl_connection_timeout)

            # check the response code
            response_code = int(stdout)

            # 0 indicates no connection
            if response_code == 0:
                label = CRITICAL_CONNECTION_MESSAGE.format(query_url)
                return (RESULT_CODE_CRITICAL, [label])

            # any other response aside from 200 is a problem
            if response_code != 200:
                label = CRITICAL_HTTP_MESSAGE.format(response_code, query_url)
                return (RESULT_CODE_CRITICAL, [label])

            # now that we have the http status and it was 200, get the content
            stdout, stderr, total_time = curl_krb_request(env.tmp_dir, smokeuser_keytab, smokeuser_principal,
                query_url, "webhcat_alert_cc_", kerberos_executable_search_paths, False,
                "WebHCat Server Status", smokeuser, connection_timeout=curl_connection_timeout)

            json_response = json.loads(stdout)
        except Exception, exception:
            return (RESULT_CODE_CRITICAL, [str(exception)])
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """
    if configurations is None:
        return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])

    # no nameservice configured means HA is off; nothing to check
    if NAMESERVICE_KEY not in configurations:
        return (RESULT_STATE_SKIPPED, ['NameNode HA is not enabled'])

    # hdfs-site is required
    if HDFS_SITE_KEY not in configurations:
        return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = configurations.get(EXECUTABLE_SEARCH_PATHS)

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    security_enabled = str(configurations.get(SECURITY_ENABLED_KEY, '')).upper() == 'TRUE'

    kerberos_keytab = configurations.get(KERBEROS_KEYTAB)

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        # substitute _HOST with this agent's host name
        kerberos_principal = configurations[KERBEROS_PRINCIPAL].replace('_HOST', host_name)

    # HTTPS_ONLY means the NameNode web endpoints are SSL-only
    is_ssl_enabled = configurations.get(DFS_POLICY_KEY) == "HTTPS_ONLY"

    name_service = configurations[NAMESERVICE_KEY]
    hdfs_site = configurations[HDFS_SITE_KEY]

    # look for dfs.ha.namenodes.foo
    nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
    if nn_unique_ids_key not in hdfs_site:
        return (RESULT_STATE_UNKNOWN, ['Unable to find unique namenode alias key {0}'.format(nn_unique_ids_key)])

    if is_ssl_enabled:
        namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
        jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"
    else:
        namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
        jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

    active_namenodes = []
    standby_namenodes = []
    unknown_namenodes = []

    # now we have something like 'nn1,nn2,nn3,nn4'
    # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
    # ie dfs.namenode.http-address.hacluster.nn1
    nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
    for nn_unique_id in nn_unique_ids:
        key = namenode_http_fragment.format(name_service, nn_unique_id)

        if key not in hdfs_site:
            continue

        # use str() to ensure that unicode strings do not have the u' in them
        value = str(hdfs_site[key])

        try:
            jmx_uri = jmx_uri_fragment.format(value)

            if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
                env = Environment.get_instance()

                # curl requires an integer timeout
                curl_connection_timeout = int(connection_timeout)

                state_response, error_msg, time_millis = curl_krb_request(
                    env.tmp_dir, kerberos_keytab, kerberos_principal, jmx_uri,
                    "ha_nn_health", executable_paths, False,
                    "NameNode High Availability Health", smokeuser,
                    connection_timeout=curl_connection_timeout)
            else:
                state_response = get_jmx(jmx_uri, connection_timeout)

            state = _get_ha_state_from_json(state_response)

            if state == HDFS_NN_STATE_ACTIVE:
                active_namenodes.append(value)
            elif state == HDFS_NN_STATE_STANDBY:
                standby_namenodes.append(value)
            else:
                unknown_namenodes.append(value)
        except:
            unknown_namenodes.append(value)

    # now that the request is done, determine if this host is the host that
    # should report the status of the HA topology
    is_active_namenode = any(nn.startswith(host_name) for nn in active_namenodes)

    # there's only one scenario here; there is exactly 1 active and 1 standby
    is_topology_healthy = len(active_namenodes) == 1 and len(standby_namenodes) == 1

    result_label = 'Active{0}, Standby{1}, Unknown{2}'.format(
        str(active_namenodes), str(standby_namenodes), str(unknown_namenodes))

    # Healthy Topology:
    #   - Active NN reports the alert, standby does not
    #
    # Unhealthy Topology:
    #   - Report the alert if this is the first named host
    #   - Report the alert if not the first named host, but the other host
    #     could not report its status
    if is_topology_healthy:
        if is_active_namenode:
            return (RESULT_STATE_OK, [result_label])
        return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])

    # dfs.namenode.rpc-address.service.alias is guaranteed in HA mode
    first_listed_host_key = 'dfs.namenode.rpc-address.{0}.{1}'.format(
        name_service, nn_unique_ids[0])

    first_listed_host = ''
    if first_listed_host_key in hdfs_site:
        first_listed_host = hdfs_site[first_listed_host_key]

    if first_listed_host.startswith(host_name):
        return (RESULT_STATE_CRITICAL, [result_label])

    # not the first listed host, but the first host might be in the unknown
    return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations : a mapping of configuration key to value
    parameters : a mapping of script parameter key to value
    host_name : the name of this host where the alert is running

    :type configurations dict
    :type parameters dict
    :type host_name str
    """
    if configurations is None:
        return ('UNKNOWN', ['There were no configurations supplied to the script.'])

    http_address = configurations.get(NN_HTTP_ADDRESS_KEY)
    https_address = configurations.get(NN_HTTPS_ADDRESS_KEY)
    policy = configurations.get(NN_HTTP_POLICY_KEY, 'HTTP_ONLY')

    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = configurations.get(EXECUTABLE_SEARCH_PATHS)

    security_enabled = str(configurations.get(SECURITY_ENABLED_KEY, '')).upper() == 'TRUE'

    kerberos_keytab = configurations.get(KERBEROS_KEYTAB)

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        # substitute _HOST with this agent's host name
        kerberos_principal = configurations[KERBEROS_PRINCIPAL].replace('_HOST', host_name)

    # pick the SSL address when the policy demands it, otherwise plain http
    scheme = 'http'
    uri = http_address
    if policy == 'HTTPS_ONLY':
        scheme = 'https'
        if https_address is not None:
            uri = https_address

    upgrade_finalized_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(scheme, uri)

    try:
        use_kerberos = (kerberos_principal is not None
            and kerberos_keytab is not None and security_enabled)

        if use_kerberos:
            env = Environment.get_instance()

            raw_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal, upgrade_finalized_qry,
                "upgrade_finalized_state", executable_paths, False,
                "HDFS Upgrade Finalized State", smokeuser
            )

            upgrade_finalized = bool(json.loads(raw_response)["beans"][0]["UpgradeFinalized"])
        else:
            upgrade_finalized = bool(get_value_from_jmx(upgrade_finalized_qry, "UpgradeFinalized"))

        if upgrade_finalized:
            result_code = 'OK'
            label = "HDFS cluster is not in the upgrade state"
        else:
            result_code = 'CRITICAL'
            label = "HDFS cluster is not finalized"
    except:
        # any failure (request, parse, missing bean) downgrades to UNKNOWN
        label = traceback.format_exc()
        result_code = 'UNKNOWN'

    return (result_code, [label])
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """
    # NOTE(review): this view of the function is truncated after the
    # urllib2.HTTPError handler below — any further exception handling and the
    # evaluation of json_response/node_healthy are not visible here; confirm
    # against the full file.
    result_code = RESULT_CODE_UNKNOWN

    if configurations is None:
        return (result_code, ['There were no configurations supplied to the script.'])

    scheme = 'http'
    http_uri = None
    https_uri = None
    http_policy = 'HTTP_ONLY'

    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = None
    if EXECUTABLE_SEARCH_PATHS in configurations:
        executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        # substitute _HOST in the principal with this agent's host name
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    if NODEMANAGER_HTTP_ADDRESS_KEY in configurations:
        http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY]

    if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations:
        https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY]

    if YARN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[YARN_HTTP_POLICY_KEY]

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    # determine the right URI and whether to use SSL
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'

        if https_uri is not None:
            uri = https_uri

    label = ''
    url_response = None
    node_healthy = 'false'
    total_time = 0

    # some yarn-site structures don't have the web ui address
    if uri is None:
        if host_name is None:
            host_name = socket.getfqdn()

        uri = '{0}:{1}'.format(host_name, NODEMANAGER_DEFAULT_PORT)

    if OSCheck.is_windows_family():
        uri_host, uri_port = uri.split(':')
        # on windows 0.0.0.0 is invalid address to connect but on linux it resolved to 127.0.0.1
        uri_host = resolve_address(uri_host)
        uri = '{0}:{1}'.format(uri_host, uri_port)

    query = "{0}://{1}/ws/v1/node/info".format(scheme,uri)

    try:
        # kerberized clusters use curl with the smoke user's credentials;
        # otherwise a plain urllib2 request
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)

            url_response, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
                query, "nm_health_alert", executable_paths, False, "NodeManager Health", smokeuser,
                connection_timeout=curl_connection_timeout)

            json_response = json.loads(url_response)
        else:
            # execute the query for the JSON that includes templeton status
            url_response = urllib2.urlopen(query, timeout=connection_timeout)
            json_response = json.loads(url_response.read())
    except urllib2.HTTPError, httpError:
        label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(httpError.code), query,
            str(httpError), traceback.format_exc())

        return (RESULT_CODE_CRITICAL, [label])
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """
  result_code = RESULT_CODE_UNKNOWN

  if configurations is None:
    return (result_code, ['There were no configurations supplied to the script.'])

  webhcat_port = WEBHCAT_PORT_DEFAULT
  if TEMPLETON_PORT_KEY in configurations:
    webhcat_port = int(configurations[TEMPLETON_PORT_KEY])

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = configurations[SECURITY_ENABLED_KEY].lower() == 'true'

  # parse script arguments; curl needs an integer timeout (as a string)
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  curl_connection_timeout = CURL_CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])
    curl_connection_timeout = str(int(connection_timeout))

  # the alert will always run on the webhcat host
  if host_name is None:
    host_name = socket.getfqdn()

  smokeuser = SMOKEUSER_DEFAULT
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  # webhcat always uses http, never SSL
  query_url = "http://{0}:{1}/templeton/v1/status?user.name={2}".format(host_name, webhcat_port, smokeuser)

  # initialize
  total_time = 0
  json_response = {}

  if security_enabled:
    try:
      # defaults
      smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT
      smokeuser_principal = SMOKEUSER_PRINCIPAL_DEFAULT

      # check script params
      if SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
        smokeuser_principal = parameters[SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY]
      if SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
        smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY]

      # check configurations last as they should always take precedence
      if SMOKEUSER_PRINCIPAL_KEY in configurations:
        smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY]
      if SMOKEUSER_KEYTAB_KEY in configurations:
        smokeuser_keytab = configurations[SMOKEUSER_KEYTAB_KEY]

      # Get the configured Kerberos executable search paths, if any
      kerberos_executable_search_paths = None
      if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
        kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]

      kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS)

      env = Environment.get_instance()

      # first request (True): fetch only the HTTP status code via curl
      stdout, stderr, time_millis = curl_krb_request(env.tmp_dir, smokeuser_keytab,
        smokeuser_principal, query_url, "webhcat_alert_cc_", kerberos_executable_search_paths,
        True, "WebHCat Server Status", smokeuser, connection_timeout=curl_connection_timeout,
        kinit_timer_ms=kinit_timer_ms)

      # check the response code
      response_code = int(stdout)

      # 0 indicates no connection
      if response_code == 0:
        label = CRITICAL_CONNECTION_MESSAGE.format(query_url, traceback.format_exc())
        return (RESULT_CODE_CRITICAL, [label])

      # any other response aside from 200 is a problem
      if response_code != 200:
        label = CRITICAL_HTTP_MESSAGE.format(response_code, query_url, traceback.format_exc())
        return (RESULT_CODE_CRITICAL, [label])

      # now that we have the http status and it was 200, get the content
      stdout, stderr, total_time = curl_krb_request(env.tmp_dir, smokeuser_keytab,
        smokeuser_principal, query_url, "webhcat_alert_cc_", kerberos_executable_search_paths,
        False, "WebHCat Server Status", smokeuser, connection_timeout=curl_connection_timeout,
        kinit_timer_ms=kinit_timer_ms)

      json_response = json.loads(stdout)
    except:
      return (RESULT_CODE_CRITICAL, [traceback.format_exc()])
  else:
    url_response = None

    try:
      # execute the query for the JSON that includes WebHCat status
      start_time = time.time()
      url_response = urllib2.urlopen(query_url, timeout=connection_timeout)
      total_time = time.time() - start_time
      json_response = json.loads(url_response.read())
    except urllib2.HTTPError as httpError:
      label = CRITICAL_HTTP_MESSAGE.format(httpError.code, query_url, traceback.format_exc())
      return (RESULT_CODE_CRITICAL, [label])
    except:
      label = CRITICAL_CONNECTION_MESSAGE.format(query_url, traceback.format_exc())
      return (RESULT_CODE_CRITICAL, [label])
    finally:
      # always release the connection, ignoring errors on close
      if url_response is not None:
        try:
          url_response.close()
        except:
          pass

  # if status is not in the response, we can't do any check; return CRIT
  if 'status' not in json_response:
    return (RESULT_CODE_CRITICAL, [CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE + str(json_response)])

  # URL response received, parse it
  try:
    webhcat_status = json_response['status']
  except:
    return (RESULT_CODE_CRITICAL, [CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE + "\n" + traceback.format_exc()])

  # proper JSON received, compare against known value
  if webhcat_status.lower() == WEBHCAT_OK_RESPONSE:
    result_code = RESULT_CODE_OK
    label = OK_MESSAGE.format(total_time, query_url)
  else:
    result_code = RESULT_CODE_CRITICAL
    label = CRITICAL_WEBHCAT_STATUS_MESSAGE.format(webhcat_status)

  return (result_code, [label])
def _make_web_request(self, url):
  """
  Makes an http(s) request to a web resource and returns the http code. If
  there was an error making the request, return 0 for the status code.
  """
  error_msg = None
  try:
    response_code = 0
    kerberos_keytab = None
    kerberos_principal = None

    if self.uri_property_keys.kerberos_principal is not None:
      kerberos_principal = self._get_configuration_value(self.uri_property_keys.kerberos_principal)

      if kerberos_principal is not None:
        # substitute _HOST in kerberos principal with actual fqdn
        kerberos_principal = kerberos_principal.replace('_HOST', self.host_name)

    if self.uri_property_keys.kerberos_keytab is not None:
      kerberos_keytab = self._get_configuration_value(self.uri_property_keys.kerberos_keytab)

    # security flag is a config string ("true"/"false"), possibly absent (None)
    security_enabled = self._get_configuration_value('{{cluster-env/security_enabled}}')

    if kerberos_principal is not None and kerberos_keytab is not None \
        and security_enabled is not None and security_enabled.lower() == "true":
      # Create the kerberos credentials cache (ccache) file and set it in the environment to use
      # when executing curl. Use the md5 hash of the combination of the principal and keytab file
      # to generate a (relatively) unique cache filename so that we can use it as needed.
      tmp_dir = Constants.AGENT_TMP_DIR
      if tmp_dir is None:
        tmp_dir = gettempdir()

      # Get the configured Kerberos executables search paths, if any
      kerberos_executable_search_paths = self._get_configuration_value('{{kerberos-env/executable_search_paths}}')
      smokeuser = self._get_configuration_value('{{cluster-env/smokeuser}}')

      # True => curl returns only the HTTP status code
      response_code, error_msg, time_millis = curl_krb_request(tmp_dir, kerberos_keytab,
        kerberos_principal, url, "web_alert", kerberos_executable_search_paths, True,
        self.get_name(), smokeuser, connection_timeout=self.curl_connection_timeout,
        kinit_timer_ms=self.kinit_timeout)
    else:
      # kerberos is not involved; use urllib2
      response_code, time_millis, error_msg = self._make_web_request_urllib(url)

    return WebResponse(status_code=response_code, time_millis=time_millis, error_msg=error_msg)
  except Exception, exception:
    # any failure is reported as status 0 rather than raised to the caller
    if logger.isEnabledFor(logging.DEBUG):
      logger.exception("[Alert][{0}] Unable to make a web request.".format(self.get_name()))

    return WebResponse(status_code=0, time_millis=0, error_msg=str(exception))
def call_curl_krb_request(tmp_dir, user_keytab, user_princ, uri, kinit_path, user,
                          connection_timeout, method='GET', metric_json='', header='',
                          tries=1, current_time=0, random_value=0):
  """
  Issues a kerberized curl request against the AMS collector, retrying up to
  `tries` times, and (for GET) verifies that the previously-POSTed sentinel
  values appear in the JSON response.

  tmp_dir -- scratch dir for the kerberos ccache used by curl_krb_request
  user_keytab / user_princ / user -- kerberos identity to kinit with
  uri -- collector endpoint to hit
  kinit_path -- kinit executable search path(s) passed through to curl_krb_request
  connection_timeout -- per-attempt timeout, also used as the sleep between retries
  method -- 'GET' to read metrics back, 'POST' to publish metric_json
  current_time / random_value -- sentinel timestamp/value pair expected in a GET response

  Raises Fail when the request cannot be completed or (GET) the sentinel values
  never show up within the allotted retries.
  """
  if method == 'POST':
    Logger.info("Generated metrics for %s:\n%s" % (uri, metric_json))

  for i in xrange(0, tries):
    try:
      Logger.info("Connecting (%s) to %s" % (method, uri))

      response = None
      errmsg = None
      time_millis = 0

      response, errmsg, time_millis = curl_krb_request(tmp_dir, user_keytab, user_princ,
        uri, 'ams_service_check', kinit_path, False, "AMS Service Check", user,
        connection_timeout=connection_timeout, kinit_timer_ms=0,
        method=method, body=metric_json, header=header)
    except Exception as exception:
      if i < tries - 1:  # range/xrange returns items from start to end-1
        time.sleep(connection_timeout)
        Logger.info("Connection failed for %s. Next retry in %s seconds." % (uri, connection_timeout))
        continue
      else:
        raise Fail("Unable to {0} metrics on: {1}. Exception: {2}".format(method, uri, str(exception)))
    finally:
      if not response:
        Logger.error("Unable to {0} metrics on: {1}. Error: {2}".format(method, uri, errmsg))
      else:
        Logger.info("%s response from %s: %s, errmsg: %s" % (method, uri, response, errmsg))
        try:
          response.close()
        except:
          Logger.debug("Unable to close {0} connection to {1}".format(method, uri))

    if method == 'GET':
      data_json = json.loads(response)

      def floats_eq(f1, f2, delta):
        # tolerant float comparison; metric values come back as JSON numbers
        return abs(f1 - f2) < delta

      values_are_present = False
      for metrics_data in data_json["metrics"]:
        if (str(current_time) in metrics_data["metrics"]
            and str(current_time + 1000) in metrics_data["metrics"]
            and floats_eq(metrics_data["metrics"][str(current_time)], random_value, 0.0000001)
            and floats_eq(metrics_data["metrics"][str(current_time + 1000)], current_time, 1)):
          # FIX: arguments were previously (uri, random_value, current_time),
          # which scrambled the log message; values first, then the uri.
          Logger.info("Values %s and %s were found in the response from %s." % (random_value, current_time, uri))
          values_are_present = True
          break

      if not values_are_present:
        if i < tries - 1:  # range/xrange returns items from start to end-1
          # FIX: message previously interpolated `tries` (the attempt count);
          # the actual sleep below is connection_timeout seconds.
          Logger.info("Values weren't stored yet. Retrying in %s seconds." % (connection_timeout))
          time.sleep(connection_timeout)
        else:
          raise Fail("Values %s and %s were not found in the response." % (random_value, current_time))
      else:
        break
    else:
      break
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """
  if configurations is None:
    return (('UNKNOWN', ['There were no configurations supplied to the script.']))

  uri = None
  scheme = 'http'
  http_uri = None
  https_uri = None
  http_policy = 'HTTP_ONLY'
  checkpoint_tx = CHECKPOINT_TX_DEFAULT
  checkpoint_period = CHECKPOINT_PERIOD_DEFAULT

  if NN_HTTP_ADDRESS_KEY in configurations:
    http_uri = configurations[NN_HTTP_ADDRESS_KEY]

  if NN_HTTPS_ADDRESS_KEY in configurations:
    https_uri = configurations[NN_HTTPS_ADDRESS_KEY]

  if NN_HTTP_POLICY_KEY in configurations:
    http_policy = configurations[NN_HTTP_POLICY_KEY]

  if NN_CHECKPOINT_TX_KEY in configurations:
    checkpoint_tx = configurations[NN_CHECKPOINT_TX_KEY]

  if NN_CHECKPOINT_PERIOD_KEY in configurations:
    checkpoint_period = configurations[NN_CHECKPOINT_PERIOD_KEY]

  # NOTE(review): no default for smokeuser; a missing SMOKEUSER_KEY plus the
  # kerberos branch below would raise NameError — confirm the alert definition
  # always supplies it.
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  kerberos_keytab = None
  if KERBEROS_KEYTAB in configurations:
    kerberos_keytab = configurations[KERBEROS_KEYTAB]

  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
    # substitute the _HOST token with this host's name
    kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  # warning/critical thresholds arrive as fractions and are scaled to percent
  percent_warning = PERCENT_WARNING_DEFAULT
  if PERCENT_WARNING_KEY in parameters:
    percent_warning = float(parameters[PERCENT_WARNING_KEY]) * 100

  percent_critical = PERCENT_CRITICAL_DEFAULT
  if PERCENT_CRITICAL_KEY in parameters:
    percent_critical = float(parameters[PERCENT_CRITICAL_KEY]) * 100

  # determine the right URI and whether to use SSL
  uri = http_uri
  if http_policy == 'HTTPS_ONLY':
    scheme = 'https'

    if https_uri is not None:
      uri = https_uri

  current_time = int(round(time.time() * 1000))

  # JMX queries for the last checkpoint time and the journal transaction state
  last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format(scheme, uri)
  journal_transaction_info_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(scheme, uri)

  # start out assuming an OK status
  label = None
  result_code = "OK"

  try:
    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
      env = Environment.get_instance()

      # curl requires an integer timeout
      curl_connection_timeout = int(connection_timeout)

      last_checkpoint_time_response, error_msg, time_millis = curl_krb_request(env.tmp_dir,
        kerberos_keytab, kerberos_principal, last_checkpoint_time_qry,
        "checkpoint_time_alert", None, False, "NameNode Last Checkpoint",
        smokeuser, connection_timeout=curl_connection_timeout)

      last_checkpoint_time_response_json = json.loads(last_checkpoint_time_response)
      last_checkpoint_time = int(last_checkpoint_time_response_json["beans"][0]["LastCheckpointTime"])

      journal_transaction_info_response, error_msg, time_millis = curl_krb_request(env.tmp_dir,
        kerberos_keytab, kerberos_principal, journal_transaction_info_qry,
        "checkpoint_time_alert", None, False, "NameNode Last Checkpoint",
        smokeuser, connection_timeout=curl_connection_timeout)

      journal_transaction_info_response_json = json.loads(journal_transaction_info_response)
      journal_transaction_info = journal_transaction_info_response_json["beans"][0]["JournalTransactionInfo"]
    else:
      last_checkpoint_time = int(get_value_from_jmx(last_checkpoint_time_qry,
        "LastCheckpointTime", connection_timeout))

      journal_transaction_info = get_value_from_jmx(journal_transaction_info_qry,
        "JournalTransactionInfo", connection_timeout)

    # JournalTransactionInfo is itself a JSON string embedded in the JMX payload
    journal_transaction_info_dict = json.loads(journal_transaction_info)

    last_tx = int(journal_transaction_info_dict['LastAppliedOrWrittenTxId'])
    most_recent_tx = int(journal_transaction_info_dict['MostRecentCheckpointTxId'])
    transaction_difference = last_tx - most_recent_tx

    # seconds since the last checkpoint
    delta = (current_time - last_checkpoint_time) / 1000

    label = LABEL.format(h=get_time(delta)['h'], m=get_time(delta)['m'], tx=transaction_difference)

    # alert only when BOTH the uncheckpointed-tx count and the elapsed-time
    # percentage of the checkpoint period exceed their thresholds
    if (transaction_difference > int(checkpoint_tx)) and (float(delta) / int(checkpoint_period) * 100 >= int(percent_critical)):
      result_code = 'CRITICAL'
    elif (transaction_difference > int(checkpoint_tx)) and (float(delta) / int(checkpoint_period) * 100 >= int(percent_warning)):
      result_code = 'WARNING'

  except Exception, e:
    label = str(e)
    result_code = 'UNKNOWN'
  # NOTE(review): source view is truncated here — the final
  # `return ((result_code, [label]))` is not visible in this chunk.
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """
  if configurations is None:
    return (('UNKNOWN', ['There were no configurations supplied to the script.']))

  scheme = 'http'
  http_uri = None
  https_uri = None
  http_policy = 'HTTP_ONLY'

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  executable_paths = None
  if EXECUTABLE_SEARCH_PATHS in configurations:
    executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

  kerberos_keytab = None
  if KERBEROS_KEYTAB in configurations:
    kerberos_keytab = configurations[KERBEROS_KEYTAB]

  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
    # substitute the _HOST token with this host's name
    kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  if NODEMANAGER_HTTP_ADDRESS_KEY in configurations:
    http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY]

  if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations:
    https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY]

  if YARN_HTTP_POLICY_KEY in configurations:
    http_policy = configurations[YARN_HTTP_POLICY_KEY]

  # NOTE(review): no default for smokeuser if SMOKEUSER_KEY is absent — confirm
  # the alert definition always supplies it.
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS)

  # determine the right URI and whether to use SSL
  uri = http_uri
  if http_policy == 'HTTPS_ONLY':
    scheme = 'https'

    if https_uri is not None:
      uri = https_uri

  # keep the configured port, but target this host explicitly
  uri = str(host_name) + ":" + uri.split(":")[1]

  live_nodemanagers_qry = "{0}://{1}/jmx?qry={2}".format(scheme, uri, QRY)
  convert_to_json_failed = False
  response_code = None

  try:
    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
      env = Environment.get_instance()

      # curl requires an integer timeout
      curl_connection_timeout = int(connection_timeout)

      url_response, error_msg, time_millis = curl_krb_request(env.tmp_dir,
        kerberos_keytab, kerberos_principal, live_nodemanagers_qry,
        "nm_health_summary_alert", executable_paths, False,
        "NodeManager Health Summary", smokeuser,
        connection_timeout=curl_connection_timeout,
        kinit_timer_ms=kinit_timer_ms)

      try:
        url_response_json = json.loads(url_response)
        live_nodemanagers = json.loads(find_value_in_jmx(url_response_json,
          "LiveNodeManagers", live_nodemanagers_qry))
      except ValueError, error:
        convert_to_json_failed = True
        logger.exception("[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}".format("NodeManager Health Summary", str(error)))

      if convert_to_json_failed:
        # re-issue the request asking curl for the HTTP status code only,
        # so the failure can be reported with a concrete response code
        response_code, error_msg, time_millis = curl_krb_request(env.tmp_dir,
          kerberos_keytab, kerberos_principal, live_nodemanagers_qry,
          "nm_health_summary_alert", executable_paths, True,
          "NodeManager Health Summary", smokeuser,
          connection_timeout=curl_connection_timeout,
          kinit_timer_ms=kinit_timer_ms)
    # NOTE(review): source view is truncated here — the non-kerberos branch and
    # the remainder of this function are not visible in this chunk.
    else:
for attr in jmx_property_value: if attr not in json_data: raise Exception( "Unable to find {0} in JSON from {1} ".format( attr, url)) value_list.append(json_data[attr]) http_response_code = None if not json_is_valid and security_enabled and kerberos_principal is not None and kerberos_keytab is not None: http_response_code, error_msg, time_millis = curl_krb_request( tmp_dir, kerberos_keytab, kerberos_principal, url, "metric_alert", kerberos_executable_search_paths, True, self.get_name(), smokeuser, connection_timeout=self.curl_connection_timeout) return (value_list, http_response_code) def _get_reporting_text(self, state): ''' Always returns {0} since the result of the script alert is a rendered string. This will ensure that the base class takes the result string and just uses it directly. :param state: the state of the alert in uppercase (such as OK, WARNING, etc)
def _load_jmx(self, ssl, host, port, jmx_metric):
  """ creates a JmxMetric object that holds info about jmx-based metrics """
  value_list = []
  kerberos_keytab = None
  kerberos_principal = None

  if logger.isEnabledFor(logging.DEBUG):
    logger.debug(str(jmx_metric.property_map))

  security_enabled = str(self._get_configuration_value(SECURITY_ENABLED_KEY)).upper() == 'TRUE'

  if self.uri_property_keys.kerberos_principal is not None:
    kerberos_principal = self._get_configuration_value(self.uri_property_keys.kerberos_principal)

    if kerberos_principal is not None:
      # substitute _HOST in kerberos principal with actual fqdn
      kerberos_principal = kerberos_principal.replace('_HOST', self.host_name)

  if self.uri_property_keys.kerberos_keytab is not None:
    kerberos_keytab = self._get_configuration_value(self.uri_property_keys.kerberos_keytab)

  # a wildcard bind address is not connectable; use this host's name instead
  if "0.0.0.0" in str(host):
    host = self.host_name

  for jmx_property_key, jmx_property_value in jmx_metric.property_map.iteritems():
    url = "{0}://{1}:{2}/jmx?qry={3}".format("https" if ssl else "http", host, str(port), jmx_property_key)

    # use a customer header processor that will look for the non-standard
    # "Refresh" header and attempt to follow the redirect
    response = None
    content = ''
    try:
      if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
        tmp_dir = self.config.get('agent', 'tmp_dir')
        if tmp_dir is None:
          tmp_dir = gettempdir()

        kerberos_executable_search_paths = self._get_configuration_value('{{kerberos-env/executable_search_paths}}')
        smokeuser = self._get_configuration_value('{{cluster-env/smokeuser}}')

        # False => curl returns the response body (the JMX JSON), not a status code
        response, error_msg, time_millis = curl_krb_request(tmp_dir, kerberos_keytab,
          kerberos_principal, url, "metric_alert", kerberos_executable_search_paths,
          False, self.get_name(), smokeuser,
          connection_timeout=self.curl_connection_timeout)

        content = response
      else:
        url_opener = urllib2.build_opener(RefreshHeaderProcessor())
        response = url_opener.open(url, timeout=self.connection_timeout)
        content = response.read()
    except Exception, exception:
      # best-effort: a failed fetch is only logged at DEBUG, not raised
      if logger.isEnabledFor(logging.DEBUG):
        logger.exception("[Alert][{0}] Unable to make a web request: {1}".format(self.get_name(), str(exception)))
    # NOTE(review): source view is truncated here — the finally suite (closing
    # the response) and the JSON parsing of `content` are not visible in this chunk.
    finally:
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """
  result_code = RESULT_CODE_UNKNOWN

  if configurations is None:
    return (result_code, ['There were no configurations supplied to the script.'])

  webhcat_port = WEBHCAT_PORT_DEFAULT
  if TEMPLETON_PORT_KEY in configurations:
    webhcat_port = int(configurations[TEMPLETON_PORT_KEY])

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = configurations[SECURITY_ENABLED_KEY].lower() == 'true'

  # parse script arguments; curl needs an integer timeout (as a string)
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  curl_connection_timeout = CURL_CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])
    curl_connection_timeout = str(int(connection_timeout))

  # the alert will always run on the webhcat host
  if host_name is None:
    host_name = socket.getfqdn()

  smokeuser = SMOKEUSER_DEFAULT
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  # webhcat always uses http, never SSL
  query_url = "http://{0}:{1}/templeton/v1/status?user.name={2}".format(host_name, webhcat_port, smokeuser)

  # initialize
  total_time = 0
  json_response = {}

  if security_enabled:
    try:
      # defaults
      smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT
      smokeuser_principal = SMOKEUSER_PRINCIPAL_DEFAULT

      # check script params
      if SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
        smokeuser_principal = parameters[SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY]

      if SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
        smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY]

      # check configurations last as they should always take precedence
      if SMOKEUSER_PRINCIPAL_KEY in configurations:
        smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY]

      if SMOKEUSER_KEYTAB_KEY in configurations:
        smokeuser_keytab = configurations[SMOKEUSER_KEYTAB_KEY]

      # Get the configured Kerberos executable search paths, if any
      kerberos_executable_search_paths = None
      if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
        kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]

      env = Environment.get_instance()

      # first request (True): fetch only the HTTP status code via curl
      stdout, stderr, time_millis = curl_krb_request(env.tmp_dir, smokeuser_keytab,
        smokeuser_principal, query_url, "webhcat_alert_cc_",
        kerberos_executable_search_paths, True, "WebHCat Server Status",
        smokeuser, connection_timeout=curl_connection_timeout)

      # check the response code
      response_code = int(stdout)

      # 0 indicates no connection
      if response_code == 0:
        label = CRITICAL_CONNECTION_MESSAGE.format(query_url)
        return (RESULT_CODE_CRITICAL, [label])

      # any other response aside from 200 is a problem
      if response_code != 200:
        label = CRITICAL_HTTP_MESSAGE.format(response_code, query_url)
        return (RESULT_CODE_CRITICAL, [label])

      # now that we have the http status and it was 200, get the content
      stdout, stderr, total_time = curl_krb_request(env.tmp_dir, smokeuser_keytab,
        smokeuser_principal, query_url, "webhcat_alert_cc_",
        kerberos_executable_search_paths, False, "WebHCat Server Status",
        smokeuser, connection_timeout=curl_connection_timeout)

      json_response = json.loads(stdout)
    except Exception, exception:
      return (RESULT_CODE_CRITICAL, [str(exception)])
  # NOTE(review): source view is truncated here — the non-secure (urllib2) branch
  # and the final status-comparison/return logic are not visible in this chunk.
def service_check(self, env):
  """
  HDFS service check: verifies the NameNode is out of safemode, exercises basic
  HDFS file operations, probes JournalNode web UIs, and checks the ZKFC process.
  """
  import params

  env.set_params(params)
  unique = functions.get_unique_id_and_date()
  # NOTE(review): `dir` shadows the builtin of the same name (local scope only)
  dir = '/tmp'
  tmp_file = format("{dir}/{unique}")

  safemode_command = format("dfsadmin -fs {namenode_address} -safemode get | grep OFF")

  if params.security_enabled:
    # obtain a kerberos ticket for the hdfs user before any hadoop calls
    Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
      user=params.hdfs_user
    )
  # fails (after retries) unless safemode reports OFF
  ExecuteHadoop(safemode_command,
                user=params.hdfs_user,
                logoutput=True,
                conf_dir=params.hadoop_conf_dir,
                try_sleep=3,
                tries=20,
                bin_dir=params.hadoop_bin_dir
  )
  # exercise create-dir / delete-file / create-file round trip on HDFS
  params.HdfsResource(dir,
                      type="directory",
                      action="create_on_execute",
                      mode=0777
  )
  params.HdfsResource(tmp_file,
                      type="file",
                      action="delete_on_execute",
  )

  params.HdfsResource(tmp_file,
                      type="file",
                      source="/etc/passwd",
                      action="create_on_execute"
  )
  # flush the queued HdfsResource operations
  params.HdfsResource(None, action="execute")

  if params.has_journalnode_hosts:
    if params.security_enabled:
      for host in params.journalnode_hosts:
        if params.https_only:
          uri = format("https://{host}:{journalnode_port}")
        else:
          uri = format("http://{host}:{journalnode_port}")
        response, errmsg, time_millis = curl_krb_request(params.tmp_dir, params.smoke_user_keytab,
                                                         params.smokeuser_principal, uri, "jn_service_check",
                                                         params.kinit_path_local, False, None, params.smoke_user)
        if not response:
          # NOTE(review): Logger.error is passed format args positionally; confirm this
          # Logger actually substitutes {0}/{1} — it may log the template verbatim.
          # Also note the bare `return 1` from a service_check method — verify callers
          # treat a return value (rather than a raised Fail) as a failure.
          Logger.error("Cannot access WEB UI on: {0}. Error : {1}", uri, errmsg)
          return 1
    else:
      journalnode_port = params.journalnode_port
      checkWebUIFileName = "checkWebUI.py"
      checkWebUIFilePath = format("{tmp_dir}/{checkWebUIFileName}")
      comma_sep_jn_hosts = ",".join(params.journalnode_hosts)
      # delegate the (non-kerberized) web UI probe to a helper script
      checkWebUICmd = format("python {checkWebUIFilePath} -m {comma_sep_jn_hosts} -p {journalnode_port} -s {https_only}")
      File(checkWebUIFilePath,
           content=StaticFile(checkWebUIFileName),
           mode=0775)

      Execute(checkWebUICmd,
              logoutput=True,
              try_sleep=3,
              tries=5,
              user=params.smoke_user
      )

  if params.is_namenode_master:
    if params.has_zkfc_hosts:
      pid_dir = format("{hadoop_pid_dir_prefix}/{hdfs_user}")
      pid_file = format("{pid_dir}/hadoop-{hdfs_user}-zkfc.pid")
      # verify the ZKFC pid file exists and the process is alive
      check_zkfc_process_cmd = as_user(format(
        "ls {pid_file} >/dev/null 2>&1 && ps -p `cat {pid_file}` >/dev/null 2>&1"), user=params.hdfs_user)
      Execute(check_zkfc_process_cmd,
              logoutput=True,
              try_sleep=3,
              tries=5
      )
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations : a mapping of configuration key to value
  parameters : a mapping of script parameter key to value
  host_name : the name of this host where the alert is running

  :type configurations dict
  :type parameters dict
  :type host_name str
  """
  if configurations is None:
    return (('UNKNOWN', ['There were no configurations supplied to the script.']))

  uri = None
  http_policy = 'HTTP_ONLY'

  # hdfs-site is required
  if not HDFS_SITE_KEY in configurations:
    return 'SKIPPED', ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)]

  if NN_HTTP_POLICY_KEY in configurations:
    http_policy = configurations[NN_HTTP_POLICY_KEY]

  # NOTE(review): no default for smokeuser if SMOKEUSER_KEY is absent — confirm
  # the alert definition always supplies it.
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  executable_paths = None
  if EXECUTABLE_SEARCH_PATHS in configurations:
    executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  kerberos_keytab = None
  if KERBEROS_KEYTAB in configurations:
    kerberos_keytab = configurations[KERBEROS_KEYTAB]

  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
    # substitute the _HOST token with this host's name
    kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS)

  # determine the right URI and whether to use SSL
  hdfs_site = configurations[HDFS_SITE_KEY]

  scheme = "https" if http_policy == "HTTPS_ONLY" else "http"

  # pick the NameNode address that belongs to this host (HA-aware)
  nn_addresses = get_all_namenode_addresses(hdfs_site)
  for nn_address in nn_addresses:
    if nn_address.startswith(host_name + ":") or nn_address == host_name:
      uri = nn_address
      break
  if not uri:
    return 'SKIPPED', ['NameNode on host {0} not found (namenode adresses = {1})'.format(host_name, ', '.join(nn_addresses))]

  upgrade_finalized_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(scheme, uri)

  # start out assuming an OK status
  label = None
  result_code = "OK"

  try:
    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
      env = Environment.get_instance()

      last_checkpoint_time_response, error_msg, time_millis = curl_krb_request(
        env.tmp_dir, kerberos_keytab, kerberos_principal, upgrade_finalized_qry,
        "upgrade_finalized_state", executable_paths, False,
        "HDFS Upgrade Finalized State", smokeuser,
        kinit_timer_ms = kinit_timer_ms)

      upgrade_finalized_response_json = json.loads(last_checkpoint_time_response)
      # NOTE(review): bool() over the raw JMX value — if the value ever arrives as
      # the string "false" (rather than a JSON boolean) this would be truthy; confirm
      # the bean always serializes UpgradeFinalized as a boolean.
      upgrade_finalized = bool(upgrade_finalized_response_json["beans"][0]["UpgradeFinalized"])

    else:
      upgrade_finalized = bool(get_value_from_jmx(upgrade_finalized_qry, "UpgradeFinalized"))

    if upgrade_finalized:
      label = "HDFS cluster is not in the upgrade state"
      result_code = 'OK'
    else:
      label = "HDFS cluster is not finalized"
      result_code = 'CRITICAL'

  except:
    # any fetch/parse failure leaves the alert in UNKNOWN with the traceback as label
    label = traceback.format_exc()
    result_code = 'UNKNOWN'

  return ((result_code, [label]))
def service_check(self, env):
  """
  HDFS service check: exercises basic HDFS file operations, probes JournalNode
  web UIs, and checks the ZKFC process. The explicit safemode check was removed
  (see comment below) in favor of letting the file operations themselves prove
  availability.
  """
  import params

  env.set_params(params)
  unique = functions.get_unique_id_and_date()
  # NOTE(review): `dir` shadows the builtin of the same name (local scope only)
  dir = params.hdfs_tmp_dir
  tmp_file = format("{dir}/{unique}")

  """
  Ignore checking safemode, because this command is unable to get safemode state
  when 1 namenode is down in an HA setup (see more in HDFS-8277). Directly
  test HDFS availability by file system operations is consistent in both HA and
  non-HA environment.
  """
  # safemode_command = format("dfsadmin -fs {namenode_address} -safemode get | grep OFF")

  if params.security_enabled:
    # obtain a kerberos ticket for the hdfs user before any hadoop calls
    Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
      user=params.hdfs_user
    )

  #ExecuteHadoop(safemode_command,
  #              user=params.hdfs_user,
  #              logoutput=True,
  #              conf_dir=params.hadoop_conf_dir,
  #              try_sleep=3,
  #              tries=20,
  #              bin_dir=params.hadoop_bin_dir
  #)
  # exercise create-dir / delete-file / create-file round trip on HDFS
  params.HdfsResource(dir,
                      type="directory",
                      action="create_on_execute",
                      mode=0777
  )
  params.HdfsResource(tmp_file,
                      type="file",
                      action="delete_on_execute",
  )

  params.HdfsResource(tmp_file,
                      type="file",
                      source="/etc/passwd",
                      action="create_on_execute"
  )
  # flush the queued HdfsResource operations
  params.HdfsResource(None, action="execute")

  if params.has_journalnode_hosts:
    if params.security_enabled:
      for host in params.journalnode_hosts:
        if params.https_only:
          uri = format("https://{host}:{journalnode_port}")
        else:
          uri = format("http://{host}:{journalnode_port}")
        response, errmsg, time_millis = curl_krb_request(params.tmp_dir, params.smoke_user_keytab,
                                                         params.smokeuser_principal, uri, "jn_service_check",
                                                         params.kinit_path_local, False, None, params.smoke_user)
        if not response:
          # NOTE(review): Logger.error is passed format args positionally; confirm this
          # Logger actually substitutes {0}/{1} — it may log the template verbatim.
          # Also note the bare `return 1` from a service_check method — verify callers
          # treat a return value (rather than a raised Fail) as a failure.
          Logger.error("Cannot access WEB UI on: {0}. Error : {1}", uri, errmsg)
          return 1
    else:
      journalnode_port = params.journalnode_port
      checkWebUIFileName = "checkWebUI.py"
      checkWebUIFilePath = format("{tmp_dir}/{checkWebUIFileName}")
      comma_sep_jn_hosts = ",".join(params.journalnode_hosts)
      # delegate the (non-kerberized) web UI probe to a helper script
      checkWebUICmd = format("ambari-python-wrap {checkWebUIFilePath} -m {comma_sep_jn_hosts} -p {journalnode_port} -s {https_only}")
      File(checkWebUIFilePath,
           content=StaticFile(checkWebUIFileName),
           mode=0775)

      Execute(checkWebUICmd,
              logoutput=True,
              try_sleep=3,
              tries=5,
              user=params.smoke_user
      )

  if params.is_namenode_master:
    if params.has_zkfc_hosts:
      pid_dir = format("{hadoop_pid_dir_prefix}/{hdfs_user}")
      pid_file = format("{pid_dir}/hadoop-{hdfs_user}-zkfc.pid")
      # verify the ZKFC pid file exists and the process is alive
      check_zkfc_process_cmd = as_user(format(
        "ls {pid_file} >/dev/null 2>&1 && ps -p `cat {pid_file}` >/dev/null 2>&1"), user=params.hdfs_user)
      Execute(check_zkfc_process_cmd,
              logoutput=True,
              try_sleep=3,
              tries=5
      )
def execute(configurations={}, parameters={}, host_name=None):
  """
  Check the WebHCat (Templeton) status endpoint and report the alert state.

  Returns a tuple containing the result code and a pre-formatted result label.

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running

  NOTE: the mutable default arguments are part of the alert framework's calling
  convention; they are never mutated here and are kept for compatibility.
  """
  result_code = RESULT_CODE_UNKNOWN

  if configurations is None:
    return (result_code, ['There were no configurations supplied to the script.'])

  webhcat_port = WEBHCAT_PORT_DEFAULT
  if TEMPLETON_PORT_KEY in configurations:
    webhcat_port = int(configurations[TEMPLETON_PORT_KEY])

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = configurations[SECURITY_ENABLED_KEY].lower() == 'true'

  # parse script arguments; curl takes its timeout as a whole-second string
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  curl_connection_timeout = CURL_CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])
    curl_connection_timeout = str(int(connection_timeout))

  # the alert will always run on the webhcat host
  if host_name is None:
    host_name = socket.getfqdn()

  smokeuser = SMOKEUSER_DEFAULT
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  # webhcat always uses http, never SSL
  query_url = "http://{0}:{1}/templeton/v1/status?user.name={2}".format(host_name, webhcat_port, smokeuser)

  # initialize
  total_time = 0
  json_response = {}

  if security_enabled:
    try:
      # defaults
      smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT
      smokeuser_principal = SMOKEUSER_PRINCIPAL_DEFAULT

      # check script params
      if SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
        smokeuser_principal = parameters[SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY]
      if SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
        smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY]

      # check configurations last as they should always take precedence
      if SMOKEUSER_PRINCIPAL_KEY in configurations:
        smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY]
      if SMOKEUSER_KEYTAB_KEY in configurations:
        smokeuser_keytab = configurations[SMOKEUSER_KEYTAB_KEY]

      # Get the configured Kerberos executable search paths, if any
      kerberos_executable_search_paths = None
      if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
        kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]

      env = Environment.get_instance()

      # first kerberized call asks curl only for the HTTP status code
      stdout, stderr, time_millis = curl_krb_request(env.tmp_dir, smokeuser_keytab, smokeuser_principal,
        query_url, "webhcat_alert_cc_", kerberos_executable_search_paths, True,
        "WebHCat Server Status", smokeuser, connection_timeout=curl_connection_timeout)

      # check the response code
      response_code = int(stdout)

      # 0 indicates no connection
      if response_code == 0:
        label = CRITICAL_CONNECTION_MESSAGE.format(query_url, traceback.format_exc())
        return (RESULT_CODE_CRITICAL, [label])

      # any other response aside from 200 is a problem
      if response_code != 200:
        label = CRITICAL_HTTP_MESSAGE.format(response_code, query_url, traceback.format_exc())
        return (RESULT_CODE_CRITICAL, [label])

      # now that we have the http status and it was 200, get the content
      stdout, stderr, total_time = curl_krb_request(env.tmp_dir, smokeuser_keytab, smokeuser_principal,
        query_url, "webhcat_alert_cc_", kerberos_executable_search_paths, False,
        "WebHCat Server Status", smokeuser, connection_timeout=curl_connection_timeout)

      json_response = json.loads(stdout)
    except Exception:
      # FIX: was a bare "except:", which also swallowed SystemExit and
      # KeyboardInterrupt; narrowed to Exception, behavior otherwise unchanged
      return (RESULT_CODE_CRITICAL, [traceback.format_exc()])
  else:
    url_response = None

    try:
      # execute the query for the JSON that includes WebHCat status
      start_time = time.time()
      url_response = urllib2.urlopen(query_url, timeout=connection_timeout)
      total_time = time.time() - start_time

      json_response = json.loads(url_response.read())
    except urllib2.HTTPError as httpError:
      label = CRITICAL_HTTP_MESSAGE.format(httpError.code, query_url, traceback.format_exc())
      return (RESULT_CODE_CRITICAL, [label])
    except Exception:
      # FIX: narrowed from a bare "except:" (see above)
      label = CRITICAL_CONNECTION_MESSAGE.format(query_url, traceback.format_exc())
      return (RESULT_CODE_CRITICAL, [label])
    finally:
      if url_response is not None:
        try:
          url_response.close()
        except Exception:
          # best-effort close; failures here must not mask the alert result
          pass

  # if status is not in the response, we can't do any check; return CRIT
  if 'status' not in json_response:
    return (RESULT_CODE_CRITICAL, [CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE + str(json_response)])

  # URL response received, parse it
  try:
    webhcat_status = json_response['status']
  except Exception:
    return (RESULT_CODE_CRITICAL, [CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE + "\n" + traceback.format_exc()])

  # proper JSON received, compare against known value
  if webhcat_status.lower() == WEBHCAT_OK_RESPONSE:
    result_code = RESULT_CODE_OK
    label = OK_MESSAGE.format(total_time, query_url)
  else:
    result_code = RESULT_CODE_CRITICAL
    label = CRITICAL_WEBHCAT_STATUS_MESSAGE.format(webhcat_status)

  return (result_code, [label])
def execute(configurations={}, parameters={}, host_name=None):
  """
  Alert on abnormal deviation of an HDFS metric collected by Ambari Metrics.

  Fetches the metric time series for the last `interval` minutes from the AMS
  collector, computes the sample standard deviation as a percentage of the
  mean, and compares it against the warning/critical thresholds.  For NameNode
  alerts in an HA cluster it first resolves the active NameNode(s) via JMX and
  skips the run on standby hosts when metrics are merged.

  Returns a tuple containing the result code and a pre-formatted result label.

  Keyword arguments:
  configurations : a mapping of configuration key to value
  parameters : a mapping of script parameter key to value
  host_name : the name of this host where the alert is running

  :type configurations dict
  :type parameters dict
  :type host_name str
  """
  hostnames = host_name
  # AMS expects epoch milliseconds
  current_time = int(time.time()) * 1000

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  merge_ha_metrics = MERGE_HA_METRICS_PARAM_DEFAULT
  if MERGE_HA_METRICS_PARAM_KEY in parameters:
    merge_ha_metrics = parameters[MERGE_HA_METRICS_PARAM_KEY].lower() == 'true'

  metric_name = METRIC_NAME_PARAM_DEFAULT
  if METRIC_NAME_PARAM_KEY in parameters:
    metric_name = parameters[METRIC_NAME_PARAM_KEY]

  metric_units = METRIC_UNITS_DEFAULT
  if METRIC_UNITS_PARAM_KEY in parameters:
    metric_units = parameters[METRIC_UNITS_PARAM_KEY]

  app_id = APP_ID_PARAM_DEFAULT
  if APP_ID_PARAM_KEY in parameters:
    app_id = parameters[APP_ID_PARAM_KEY]

  interval = INTERVAL_PARAM_DEFAULT
  if INTERVAL_PARAM_KEY in parameters:
    interval = _coerce_to_integer(parameters[INTERVAL_PARAM_KEY])

  warning_threshold = DEVIATION_WARNING_THRESHOLD_DEFAULT
  if DEVIATION_WARNING_THRESHOLD_KEY in parameters:
    warning_threshold = _coerce_to_integer(parameters[DEVIATION_WARNING_THRESHOLD_KEY])

  critical_threshold = DEVIATION_CRITICAL_THRESHOLD_DEFAULT
  if DEVIATION_CRITICAL_THRESHOLD_KEY in parameters:
    critical_threshold = _coerce_to_integer(parameters[DEVIATION_CRITICAL_THRESHOLD_KEY])

  minimum_value_threshold = None
  if MINIMUM_VALUE_THRESHOLD_KEY in parameters:
    minimum_value_threshold = _coerce_to_integer(parameters[MINIMUM_VALUE_THRESHOLD_KEY])

  #parse configuration
  if configurations is None:
    return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])

  # hdfs-site is required
  if not HDFS_SITE_KEY in configurations:
    return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

  # prefer an explicitly configured collector VIP; otherwise derive the
  # collector from the webapp address and the sink selector
  if METRICS_COLLECTOR_VIP_HOST_KEY in configurations and METRICS_COLLECTOR_VIP_PORT_KEY in configurations:
    collector_host = configurations[METRICS_COLLECTOR_VIP_HOST_KEY]
    collector_port = int(configurations[METRICS_COLLECTOR_VIP_PORT_KEY])
  else:
    # ams-site/timeline.metrics.service.webapp.address is required
    if not METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY in configurations:
      return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY)])
    else:
      collector_webapp_address = configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY].split(":")
      if valid_collector_webapp_address(collector_webapp_address):
        collector_host = select_metric_collector_for_sink(app_id.lower()).split(":")[0]
        collector_port = int(collector_webapp_address[1])
      else:
        return (RESULT_STATE_UNKNOWN, ['{0} value should be set as "fqdn_hostname:port", but set to {1}'.format(
          METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY, configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY])])

  namenode_service_rpc_address = None
  # hdfs-site is required
  if not HDFS_SITE_KEY in configurations:
    return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

  hdfs_site = configurations[HDFS_SITE_KEY]

  if 'dfs.namenode.servicerpc-address' in hdfs_site:
    namenode_service_rpc_address = hdfs_site['dfs.namenode.servicerpc-address']

  # if namenode alert and HA mode
  if NAMESERVICE_KEY in configurations and app_id.lower() == 'namenode':
    # hdfs-site is required
    if not HDFS_SITE_KEY in configurations:
      return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

    if SMOKEUSER_KEY in configurations:
      smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = None
    if EXECUTABLE_SEARCH_PATHS in configurations:
      executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

    # parse script arguments
    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
      security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
      kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
      kerberos_principal = configurations[KERBEROS_PRINCIPAL]
      kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # determine whether or not SSL is enabled
    is_ssl_enabled = False
    if DFS_POLICY_KEY in configurations:
      dfs_policy = configurations[DFS_POLICY_KEY]
      if dfs_policy == "HTTPS_ONLY":
        is_ssl_enabled = True

    kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS)

    name_service = configurations[NAMESERVICE_KEY]

    # look for dfs.ha.namenodes.foo
    nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
    if not nn_unique_ids_key in hdfs_site:
      return (RESULT_STATE_UNKNOWN, ['Unable to find unique NameNode alias key {0}'.format(nn_unique_ids_key)])

    namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
    jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

    if is_ssl_enabled:
      namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
      jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

    # now we have something like 'nn1,nn2,nn3,nn4'
    # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
    # ie dfs.namenode.http-address.hacluster.nn1
    namenodes = []
    active_namenodes = []
    nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
    for nn_unique_id in nn_unique_ids:
      key = namenode_http_fragment.format(name_service, nn_unique_id)

      if key in hdfs_site:
        # use str() to ensure that unicode strings do not have the u' in them
        value = str(hdfs_site[key])
        namenode = str(hdfs_site[key]).split(":")[0]

        namenodes.append(namenode)
        try:
          # query each NameNode's JMX to find which one is active
          jmx_uri = jmx_uri_fragment.format(value)
          if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)
            state_response, error_msg, time_millis = curl_krb_request(env.tmp_dir,
              kerberos_keytab, kerberos_principal, jmx_uri, "ha_nn_health", executable_paths, False,
              "NameNode High Availability Health", smokeuser, connection_timeout=curl_connection_timeout,
              kinit_timer_ms=kinit_timer_ms)

            state = _get_ha_state_from_json(state_response)
          else:
            state_response = get_jmx(jmx_uri, connection_timeout)
            state = _get_ha_state_from_json(state_response)

          if state == HDFS_NN_STATE_ACTIVE:
            active_namenodes.append(namenode)

            # Only check active NN
            nn_service_rpc_address_key = 'dfs.namenode.servicerpc-address.{0}.{1}'.format(name_service, nn_unique_id)
            if nn_service_rpc_address_key in hdfs_site:
              namenode_service_rpc_address = hdfs_site[nn_service_rpc_address_key]
          pass
        except:
          # an unreachable NameNode is tolerated; it simply is not "active"
          logger.exception("Unable to determine the active NameNode")
    pass

    if merge_ha_metrics:
      hostnames = ",".join(namenodes)
      # run only on active NN, no need to run the same requests from the standby
      if host_name not in active_namenodes:
        return (RESULT_STATE_SKIPPED, ['This alert will be reported by another host.'])
    pass

  # Skip service rpc alert if port is not enabled
  if not namenode_service_rpc_address and 'rpc.rpc.datanode' in metric_name:
    return (RESULT_STATE_SKIPPED, ['Service RPC port is not enabled.'])

  get_metrics_parameters = {
    "metricNames": metric_name,
    "appId": app_id,
    "hostname": hostnames,
    "startTime": current_time - interval * 60 * 1000,
    "endTime": current_time,
    "grouped": "true",
  }

  encoded_get_metrics_parameters = urllib.urlencode(get_metrics_parameters)

  try:
    conn = httplib.HTTPConnection(collector_host, int(collector_port),
                                  timeout=connection_timeout)
    conn.request("GET", AMS_METRICS_GET_URL % encoded_get_metrics_parameters)
    response = conn.getresponse()
    data = response.read()
    conn.close()
  except Exception:
    return (RESULT_STATE_UNKNOWN, ["Unable to retrieve metrics from the Ambari Metrics service."])

  if response.status != 200:
    return (RESULT_STATE_UNKNOWN, ["Unable to retrieve metrics from the Ambari Metrics service."])

  data_json = json.loads(data)
  metrics = []
  # will get large standard deviation for multiple hosts,
  # if host1 reports small local values, but host2 reports large local values
  for metrics_data in data_json["metrics"]:
    metrics += metrics_data["metrics"].values()
  pass

  # at least two samples are needed for a standard deviation
  if not metrics or len(metrics) < 2:
    number_of_data_points = len(metrics) if metrics else 0
    return (RESULT_STATE_SKIPPED, ["There are not enough data points to calculate the standard deviation ({0} sampled)".format(number_of_data_points)])

  minimum_value_multiplier = 1
  if 'dfs.FSNamesystem.CapacityUsed' in metric_name:
    minimum_value_multiplier = 1024 * 1024  # MB to bytes
  elif 'rpc.rpc.datanode' in metric_name or 'rpc.rpc.client' in metric_name:
    minimum_value_multiplier = 1000  # seconds to millis

  if minimum_value_threshold:
    # Filter out points below min threshold
    metrics = [metric for metric in metrics if metric > (minimum_value_threshold * minimum_value_multiplier)]
    if len(metrics) < 2:
      return (RESULT_STATE_OK, ['There were no data points above the minimum threshold of {0} seconds'.format(minimum_value_threshold)])

  mean_value = mean(metrics)
  stddev = sample_standard_deviation(metrics)

  try:
    # deviation expressed as a percentage of the mean
    deviation_percent = stddev / float(mean_value) * 100
  except ZeroDivisionError:
    # should not be a case for this alert
    return (RESULT_STATE_SKIPPED, ["Unable to calculate the standard deviation because the mean value is 0"])

  # log the AMS request
  if logger.isEnabledFor(logging.DEBUG):
    logger.debug("""
    AMS request parameters - {0}
    AMS response - {1}
    Mean - {2}
    Standard deviation - {3}
    Percentage standard deviation - {4}
    """.format(encoded_get_metrics_parameters, data_json, mean_value, stddev, deviation_percent))

  mean_value_localized = locale.format("%.0f", mean_value, grouping=True)

  variance_value = (deviation_percent / 100.0) * mean_value
  variance_value_localized = locale.format("%.0f", variance_value, grouping=True)

  # check for CRITICAL status
  if deviation_percent > critical_threshold:
    threshold_value = ((critical_threshold / 100.0) * mean_value)
    threshold_value_localized = locale.format("%.0f", threshold_value, grouping=True)

    message = DEVIATION_THRESHOLD_MESSAGE.format(variance_value_localized, metric_units, deviation_percent,
      mean_value_localized, metric_units, threshold_value_localized, metric_units)

    return (RESULT_STATE_CRITICAL, [message])

  # check for WARNING status
  if deviation_percent > warning_threshold:
    threshold_value = ((warning_threshold / 100.0) * mean_value)
    threshold_value_localized = locale.format("%.0f", threshold_value, grouping=True)

    message = DEVIATION_THRESHOLD_MESSAGE.format(variance_value_localized, metric_units, deviation_percent,
      mean_value_localized, metric_units, threshold_value_localized, metric_units)

    return (RESULT_STATE_WARNING, [message])

  # return OK status; use the warning threshold as the value to compare against
  threshold_value = ((warning_threshold / 100.0) * mean_value)
  threshold_value_localized = locale.format("%.0f", threshold_value, grouping=True)

  message = DEVIATION_OK_MESSAGE.format(variance_value_localized, metric_units, warning_threshold,
    mean_value_localized, metric_units, threshold_value_localized, metric_units)

  return (RESULT_STATE_OK, [message])
def execute(configurations={}, parameters={}, host_name=None):
  """
  Query the ResourceManager's RMNMInfo JMX bean to summarize NodeManager health.

  Returns a tuple containing the result code and a pre-formatted result label.

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running

  NOTE(review): this function is truncated in this chunk -- it ends at the
  non-kerberos "else:" branch; the remainder is not visible here.
  """
  if configurations is None:
    return (('UNKNOWN', ['There were no configurations supplied to the script.']))

  scheme = 'http'
  http_uri = None
  https_uri = None
  http_policy = 'HTTP_ONLY'

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  executable_paths = None
  if EXECUTABLE_SEARCH_PATHS in configurations:
    executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

  kerberos_keytab = None
  if KERBEROS_KEYTAB in configurations:
    kerberos_keytab = configurations[KERBEROS_KEYTAB]

  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
    # substitute the _HOST token in the principal with this host's name
    kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  if NODEMANAGER_HTTP_ADDRESS_KEY in configurations:
    http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY]

  if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations:
    https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY]

  if YARN_HTTP_POLICY_KEY in configurations:
    http_policy = configurations[YARN_HTTP_POLICY_KEY]

  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  # determine the right URI and whether to use SSL
  uri = http_uri
  if http_policy == 'HTTPS_ONLY':
    scheme = 'https'

    if https_uri is not None:
      uri = https_uri

  # keep the configured port but target this host explicitly
  uri = str(host_name) + ":" + uri.split(":")[1]
  live_nodemanagers_qry = "{0}://{1}/jmx?qry=Hadoop:service=ResourceManager,name=RMNMInfo".format(scheme, uri)
  convert_to_json_failed = False
  response_code = None
  try:
    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
      env = Environment.get_instance()

      # curl requires an integer timeout
      curl_connection_timeout = int(connection_timeout)
      url_response, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
        live_nodemanagers_qry, "nm_health_summary_alert", executable_paths, False,
        "NodeManager Health Summary", smokeuser, connection_timeout=curl_connection_timeout)

      try:
        url_response_json = json.loads(url_response)
        live_nodemanagers = json.loads(url_response_json["beans"][0]["LiveNodeManagers"])
      except ValueError, error:
        convert_to_json_failed = True
        logger.exception("[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}".
          format("NodeManager Health Summary", str(error)))

      if convert_to_json_failed:
        # re-issue the curl asking only for the HTTP response code so the
        # failure can be reported meaningfully
        response_code, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
          live_nodemanagers_qry, "nm_health_summary_alert", executable_paths, True,
          "NodeManager Health Summary", smokeuser, connection_timeout=curl_connection_timeout)
    else:
      # NOTE(review): source chunk ends here; the unsecured-cluster branch is
      # missing from this view
def execute(configurations={}, parameters={}, host_name=None):
  """
  Alert on abnormal deviation of an HDFS metric collected by Ambari Metrics
  (variant that talks to the collector through network.get_http_connection,
  supporting HTTPS with a truststore CA bundle).

  Returns a tuple containing the result code and a pre-formatted result label.

  Keyword arguments:
  configurations : a mapping of configuration key to value
  parameters : a mapping of script parameter key to value
  host_name : the name of this host where the alert is running

  :type configurations dict
  :type parameters dict
  :type host_name str

  NOTE(review): this function is truncated in this chunk -- it ends right
  after the AMS request try/except; the response handling is not visible here.
  """
  hostnames = host_name
  # AMS expects epoch milliseconds
  current_time = int(time.time()) * 1000

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  merge_ha_metrics = MERGE_HA_METRICS_PARAM_DEFAULT
  if MERGE_HA_METRICS_PARAM_KEY in parameters:
    merge_ha_metrics = parameters[MERGE_HA_METRICS_PARAM_KEY].lower() == 'true'

  metric_name = METRIC_NAME_PARAM_DEFAULT
  if METRIC_NAME_PARAM_KEY in parameters:
    metric_name = parameters[METRIC_NAME_PARAM_KEY]

  metric_units = METRIC_UNITS_DEFAULT
  if METRIC_UNITS_PARAM_KEY in parameters:
    metric_units = parameters[METRIC_UNITS_PARAM_KEY]

  app_id = APP_ID_PARAM_DEFAULT
  if APP_ID_PARAM_KEY in parameters:
    app_id = parameters[APP_ID_PARAM_KEY]

  interval = INTERVAL_PARAM_DEFAULT
  if INTERVAL_PARAM_KEY in parameters:
    interval = _coerce_to_integer(parameters[INTERVAL_PARAM_KEY])

  warning_threshold = DEVIATION_WARNING_THRESHOLD_DEFAULT
  if DEVIATION_WARNING_THRESHOLD_KEY in parameters:
    warning_threshold = _coerce_to_integer(parameters[DEVIATION_WARNING_THRESHOLD_KEY])

  critical_threshold = DEVIATION_CRITICAL_THRESHOLD_DEFAULT
  if DEVIATION_CRITICAL_THRESHOLD_KEY in parameters:
    critical_threshold = _coerce_to_integer(parameters[DEVIATION_CRITICAL_THRESHOLD_KEY])

  minimum_value_threshold = None
  if MINIMUM_VALUE_THRESHOLD_KEY in parameters:
    minimum_value_threshold = _coerce_to_integer(parameters[MINIMUM_VALUE_THRESHOLD_KEY])

  #parse configuration
  if configurations is None:
    return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])

  # hdfs-site is required
  if not HDFS_SITE_KEY in configurations:
    return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

  # prefer an explicitly configured collector VIP; otherwise derive the
  # collector from the webapp address and the sink selector
  if METRICS_COLLECTOR_VIP_HOST_KEY in configurations and METRICS_COLLECTOR_VIP_PORT_KEY in configurations:
    collector_host = configurations[METRICS_COLLECTOR_VIP_HOST_KEY].split(',')[0]
    collector_port = int(configurations[METRICS_COLLECTOR_VIP_PORT_KEY])
  else:
    # ams-site/timeline.metrics.service.webapp.address is required
    if not METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY in configurations:
      return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY)])
    else:
      collector_webapp_address = configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY].split(":")
      if valid_collector_webapp_address(collector_webapp_address):
        collector_host = select_metric_collector_for_sink(app_id.lower())
        collector_port = int(collector_webapp_address[1])
      else:
        return (RESULT_STATE_UNKNOWN, ['{0} value should be set as "fqdn_hostname:port", but set to {1}'.format(
          METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY, configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY])])

  namenode_service_rpc_address = None
  # hdfs-site is required
  if not HDFS_SITE_KEY in configurations:
    return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

  hdfs_site = configurations[HDFS_SITE_KEY]

  if 'dfs.namenode.servicerpc-address' in hdfs_site:
    namenode_service_rpc_address = hdfs_site['dfs.namenode.servicerpc-address']

  # if namenode alert and HA mode
  if NAMESERVICE_KEY in configurations and app_id.lower() == 'namenode':
    # hdfs-site is required
    if not HDFS_SITE_KEY in configurations:
      return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

    if SMOKEUSER_KEY in configurations:
      smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = None
    if EXECUTABLE_SEARCH_PATHS in configurations:
      executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

    # parse script arguments
    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
      security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
      kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
      kerberos_principal = configurations[KERBEROS_PRINCIPAL]
      kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # determine whether or not SSL is enabled
    is_ssl_enabled = False
    if DFS_POLICY_KEY in configurations:
      dfs_policy = configurations[DFS_POLICY_KEY]
      if dfs_policy == "HTTPS_ONLY":
        is_ssl_enabled = True

    kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS)

    name_service = get_name_service_by_hostname(hdfs_site, host_name)

    # look for dfs.ha.namenodes.foo
    nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
    if not nn_unique_ids_key in hdfs_site:
      return (RESULT_STATE_UNKNOWN, ['Unable to find unique NameNode alias key {0}'.format(nn_unique_ids_key)])

    namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
    jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

    if is_ssl_enabled:
      namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
      jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

    # now we have something like 'nn1,nn2,nn3,nn4'
    # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
    # ie dfs.namenode.http-address.hacluster.nn1
    namenodes = []
    active_namenodes = []
    nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
    for nn_unique_id in nn_unique_ids:
      key = namenode_http_fragment.format(name_service, nn_unique_id)

      if key in hdfs_site:
        # use str() to ensure that unicode strings do not have the u' in them
        value = str(hdfs_site[key])
        namenode = str(hdfs_site[key]).split(":")[0]

        namenodes.append(namenode)
        try:
          # query each NameNode's JMX to find which one is active
          jmx_uri = jmx_uri_fragment.format(value)
          if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)
            state_response, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
              jmx_uri,"ha_nn_health", executable_paths, False,
              "NameNode High Availability Health", smokeuser, connection_timeout=curl_connection_timeout,
              kinit_timer_ms = kinit_timer_ms)

            state = _get_ha_state_from_json(state_response)
          else:
            state = _get_state_from_jmx(jmx_uri, connection_timeout)

          if state == HDFS_NN_STATE_ACTIVE:
            active_namenodes.append(namenode)

            # Only check active NN
            nn_service_rpc_address_key = 'dfs.namenode.servicerpc-address.{0}.{1}'.format(name_service, nn_unique_id)
            if nn_service_rpc_address_key in hdfs_site:
              namenode_service_rpc_address = hdfs_site[nn_service_rpc_address_key]
          pass
        except:
          # an unreachable NameNode is tolerated; it simply is not "active"
          logger.exception("Unable to determine the active NameNode")
    pass

    if merge_ha_metrics:
      hostnames = ",".join(namenodes)
      # run only on active NN, no need to run the same requests from the standby
      if host_name not in active_namenodes:
        return (RESULT_STATE_SKIPPED, ['This alert will be reported by another host.'])
    pass

  # Skip service rpc alert if port is not enabled
  if not namenode_service_rpc_address and 'rpc.rpc.datanode' in metric_name:
    return (RESULT_STATE_SKIPPED, ['Service RPC port is not enabled.'])

  get_metrics_parameters = {
    "metricNames": metric_name,
    "appId": app_id,
    "hostname": hostnames,
    "startTime": current_time - interval * 60 * 1000,
    "endTime": current_time,
    "grouped": "true",
  }

  encoded_get_metrics_parameters = urllib.urlencode(get_metrics_parameters)

  # the monitor's truststore CA bundle is used when the collector is HTTPS-only
  ams_monitor_conf_dir = "/etc/ambari-metrics-monitor/conf"
  metric_truststore_ca_certs='ca.pem'
  ca_certs = os.path.join(ams_monitor_conf_dir, metric_truststore_ca_certs)
  metric_collector_https_enabled = str(configurations[AMS_HTTP_POLICY]) == "HTTPS_ONLY"

  _ssl_version = _get_ssl_version()
  try:
    conn = network.get_http_connection(
      collector_host,
      int(collector_port),
      metric_collector_https_enabled,
      ca_certs,
      ssl_version=_ssl_version
    )
    conn.request("GET", AMS_METRICS_GET_URL % encoded_get_metrics_parameters)
    response = conn.getresponse()
    data = response.read()
    conn.close()
  except Exception, e:
    logger.info(str(e))
    return (RESULT_STATE_UNKNOWN, ["Unable to retrieve metrics from the Ambari Metrics service."])
  # NOTE(review): the remainder of this function (response handling and
  # deviation math) is missing from this chunk
# (continuation of _load_jmx, whose header is above this chunk: extract the
# requested attribute values from the parsed JMX JSON response)
if json_is_valid:
  for attr in jmx_property_value:
    if attr not in json_data:
      # attribute is not at the current level; search the "beans" array for
      # the bean whose "name" matches the JMX query property
      beans = json_response['beans']
      for jmx_prop_list_item in beans:
        if "name" in jmx_prop_list_item and jmx_prop_list_item["name"] == jmx_property_key:
          if attr not in jmx_prop_list_item:
            raise Exception("Unable to find {0} in JSON from {1} ".format(attr, url))
          json_data = jmx_prop_list_item

    value_list.append(json_data[attr])

http_response_code = None
if not json_is_valid and security_enabled and kerberos_principal is not None and kerberos_keytab is not None:
  # JSON retrieval failed on a secured cluster: re-issue the kerberized curl
  # asking only for the HTTP response code so the caller can report it
  http_response_code, error_msg, time_millis = curl_krb_request(tmp_dir, kerberos_keytab, kerberos_principal,
    url, "metric_alert", kerberos_executable_search_paths, True, self.get_name(), smokeuser,
    connection_timeout=self.curl_connection_timeout, kinit_timer_ms = self.kinit_timeout)

return (value_list, http_response_code)

def _get_reporting_text(self, state):
  '''
  Always returns {0} since the result of the script alert is a rendered string.
  This will ensure that the base class takes the result string and just uses it
  directly.

  :param state: the state of the alert in uppercase (such as OK, WARNING, etc)
  :return: the parameterized text
  '''
  return '{0}'
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label
  for the NameNode "last checkpoint" alert: raises WARNING/CRITICAL when the
  time since the last checkpoint or the number of uncommitted transactions
  exceeds the configured thresholds.

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """
  # NOTE(review): original indentation was lost in this paste; the nesting
  # below is reconstructed from the statement sequence.
  if configurations is None:
    return (('UNKNOWN', ['There were no configurations supplied to the script.']))

  uri = None
  scheme = 'http'
  http_uri = None
  https_uri = None
  http_policy = 'HTTP_ONLY'
  checkpoint_tx = CHECKPOINT_TX_DEFAULT
  checkpoint_period = CHECKPOINT_PERIOD_DEFAULT

  # hdfs-site is required
  if not HDFS_SITE_KEY in configurations:
    return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

  if NN_HTTP_POLICY_KEY in configurations:
    http_policy = configurations[NN_HTTP_POLICY_KEY]

  if NN_CHECKPOINT_TX_KEY in configurations:
    checkpoint_tx = configurations[NN_CHECKPOINT_TX_KEY]

  if NN_CHECKPOINT_PERIOD_KEY in configurations:
    checkpoint_period = configurations[NN_CHECKPOINT_PERIOD_KEY]

  # NOTE(review): smokeuser is only bound when the key is present; the
  # kerberos branch below reads it unconditionally -- presumably the key is
  # always supplied on secured clusters; confirm against the alert definition.
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  executable_paths = None
  if EXECUTABLE_SEARCH_PATHS in configurations:
    executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  kerberos_keytab = None
  if KERBEROS_KEYTAB in configurations:
    kerberos_keytab = configurations[KERBEROS_KEYTAB]

  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
    # substitute _HOST in the principal with this host's FQDN
    kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  percent_warning = PERCENT_WARNING_DEFAULT
  if PERCENT_WARNING_KEY in parameters:
    percent_warning = float(parameters[PERCENT_WARNING_KEY])

  percent_critical = PERCENT_CRITICAL_DEFAULT
  if PERCENT_CRITICAL_KEY in parameters:
    percent_critical = float(parameters[PERCENT_CRITICAL_KEY])

  checkpoint_txn_multiplier_warning = CHECKPOINT_TX_MULTIPLIER_WARNING_DEFAULT
  if CHECKPOINT_TX_MULTIPLIER_WARNING_KEY in parameters:
    checkpoint_txn_multiplier_warning = float(parameters[CHECKPOINT_TX_MULTIPLIER_WARNING_KEY])

  checkpoint_txn_multiplier_critical = CHECKPOINT_TX_MULTIPLIER_CRITICAL_DEFAULT
  if CHECKPOINT_TX_MULTIPLIER_CRITICAL_KEY in parameters:
    checkpoint_txn_multiplier_critical = float(parameters[CHECKPOINT_TX_MULTIPLIER_CRITICAL_KEY])

  kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS)

  # determine the right URI and whether to use SSL
  hdfs_site = configurations[HDFS_SITE_KEY]

  scheme = "https" if http_policy == "HTTPS_ONLY" else "http"

  # pick the NameNode address that belongs to this host; skip the alert when
  # this host is not a NameNode
  nn_addresses = get_all_namenode_addresses(hdfs_site)
  for nn_address in nn_addresses:
    if nn_address.startswith(host_name + ":"):
      uri = nn_address
      break
  if not uri:
    return (RESULT_STATE_SKIPPED, ['NameNode on host {0} not found (namenode adresses = {1})'.format(host_name, ', '.join(nn_addresses))])

  current_time = int(round(time.time() * 1000))

  last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format(scheme, uri)
  journal_transaction_info_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(scheme, uri)

  # start out assuming an OK status
  label = None
  result_code = "OK"

  try:
    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
      env = Environment.get_instance()

      # curl requires an integer timeout
      curl_connection_timeout = int(connection_timeout)

      # fetch LastCheckpointTime via kinit + curl on secured clusters
      last_checkpoint_time_response, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal, last_checkpoint_time_qry, "checkpoint_time_alert", executable_paths, False, "NameNode Last Checkpoint", smokeuser, connection_timeout=curl_connection_timeout, kinit_timer_ms=kinit_timer_ms)

      last_checkpoint_time_response_json = json.loads(last_checkpoint_time_response)
      last_checkpoint_time = int(last_checkpoint_time_response_json["beans"][0]["LastCheckpointTime"])

      journal_transaction_info_response, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal, journal_transaction_info_qry, "checkpoint_time_alert", executable_paths, False, "NameNode Last Checkpoint", smokeuser, connection_timeout=curl_connection_timeout, kinit_timer_ms=kinit_timer_ms)

      journal_transaction_info_response_json = json.loads(journal_transaction_info_response)
      journal_transaction_info = journal_transaction_info_response_json["beans"][0]["JournalTransactionInfo"]
    else:
      # unsecured cluster: plain JMX reads
      last_checkpoint_time = int(get_value_from_jmx(last_checkpoint_time_qry, "LastCheckpointTime", connection_timeout))
      journal_transaction_info = get_value_from_jmx(journal_transaction_info_qry, "JournalTransactionInfo", connection_timeout)

    # JournalTransactionInfo is itself a JSON string inside the JMX response
    journal_transaction_info_dict = json.loads(journal_transaction_info)

    last_tx = int(journal_transaction_info_dict['LastAppliedOrWrittenTxId'])
    most_recent_tx = int(journal_transaction_info_dict['MostRecentCheckpointTxId'])
    transaction_difference = last_tx - most_recent_tx

    # seconds since the last checkpoint
    delta = (current_time - last_checkpoint_time) / 1000

    label = LABEL.format(h=get_time(delta)['h'], m=get_time(delta)['m'], tx=transaction_difference)

    # uncommitted-transaction thresholds: multiplier * configured checkpoint txn count
    is_checkpoint_txn_warning = transaction_difference > checkpoint_txn_multiplier_warning * int(checkpoint_tx)
    is_checkpoint_txn_critical = transaction_difference > checkpoint_txn_multiplier_critical * int(checkpoint_tx)

    # Either too many uncommitted transactions or missed check-pointing for
    # long time decided by the thresholds
    if is_checkpoint_txn_critical or (float(delta) / int(checkpoint_period) * 100 >= int(percent_critical)):
      logger.debug('Raising critical alert: transaction_difference = {0}, checkpoint_tx = {1}'.format(transaction_difference, checkpoint_tx))
      result_code = 'CRITICAL'
    elif is_checkpoint_txn_warning or (float(delta) / int(checkpoint_period) * 100 >= int(percent_warning)):
      logger.debug('Raising warning alert: transaction_difference = {0}, checkpoint_tx = {1}'.format(transaction_difference, checkpoint_tx))
      result_code = 'WARNING'

  except:
    # alert scripts report any failure as UNKNOWN with the traceback as label
    label = traceback.format_exc()
    result_code = 'UNKNOWN'

  return ((result_code, [label]))
def execute(configurations={}, parameters={}, host_name=None): """ Returns a tuple containing the result code and a pre-formatted result label Keyword arguments: configurations (dictionary): a mapping of configuration key to value parameters (dictionary): a mapping of script parameter key to value host_name (string): the name of this host where the alert is running """ if configurations is None: return (('UNKNOWN', ['There were no configurations supplied to the script.'])) scheme = 'http' http_uri = None https_uri = None http_policy = 'HTTP_ONLY' security_enabled = False if SECURITY_ENABLED_KEY in configurations: security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE' kerberos_keytab = None if KERBEROS_KEYTAB in configurations: kerberos_keytab = configurations[KERBEROS_KEYTAB] kerberos_principal = None if KERBEROS_PRINCIPAL in configurations: kerberos_principal = configurations[KERBEROS_PRINCIPAL] kerberos_principal = kerberos_principal.replace('_HOST', host_name) if NODEMANAGER_HTTP_ADDRESS_KEY in configurations: http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY] if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations: https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY] if YARN_HTTP_POLICY_KEY in configurations: http_policy = configurations[YARN_HTTP_POLICY_KEY] # parse script arguments connection_timeout = CONNECTION_TIMEOUT_DEFAULT if CONNECTION_TIMEOUT_KEY in parameters: connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY]) # determine the right URI and whether to use SSL uri = http_uri if http_policy == 'HTTPS_ONLY': scheme = 'https' if https_uri is not None: uri = https_uri uri = str(host_name) + ":" + uri.split(":")[1] live_nodemanagers_qry = "{0}://{1}/jmx?qry=Hadoop:service=ResourceManager,name=RMNMInfo".format(scheme, uri) convert_to_json_failed = False response_code = None try: if kerberos_principal is not None and kerberos_keytab is not None and security_enabled: env = Environment.get_instance() url_response, 
error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal, live_nodemanagers_qry, "nm_health_summary_alert", None, False, "NodeManager Health Summary") try: url_response_json = json.loads(url_response) live_nodemanagers = json.loads(url_response_json["beans"][0]["LiveNodeManagers"]) except ValueError, error: convert_to_json_failed = True if logger.isEnabledFor(logging.DEBUG): logger.exception("[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}". format("NodeManager Health Summary", str(error))) if convert_to_json_failed: response_code, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal, live_nodemanagers_qry, "nm_health_summary_alert", None, True, "NodeManager Health Summary") else:
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label
  for the Storm topology alert: reads a field from the Storm UI REST API for
  a given topology and compares it against warning/critical thresholds.

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """
  # NOTE(review): original indentation was lost in this paste; the nesting
  # below is reconstructed from the statement sequence.
  if configurations is None:
    return (('UNKNOWN', ['There were no configurations supplied to the script.']))

  # Set configuration settings
  if STORM_UI_PORT in configurations:
    stormuiport = configurations[STORM_UI_PORT]

  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  executable_paths = None
  if EXECUTABLE_SEARCH_PATHS in configurations:
    executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  kerberos_keytab = None
  if KERBEROS_KEYTAB in configurations:
    kerberos_keytab = configurations[KERBEROS_KEYTAB]

  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
    # substitute _HOST in the principal with this host's FQDN
    kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  if WARNING_KEY in parameters:
    warning_val = parameters[WARNING_KEY]

  if CRITICAL_KEY in parameters:
    critical_val = parameters[CRITICAL_KEY]

  if COMPARISON_KEY in parameters:
    comparison_val = parameters[COMPARISON_KEY]

  if FIELD_TYPE_KEY in parameters:
    field_type_val = parameters[FIELD_TYPE_KEY]

  if FIELD_NAME_KEY in parameters:
    field_name_val = parameters[FIELD_NAME_KEY]

  if TOPOLOGY_ID_KEY in parameters:
    topology_id_val = parameters[TOPOLOGY_ID_KEY]

  # FIX: the original called lower(...), which is not a Python builtin and is
  # not defined in this file -- use the equivalent str.lower() method instead.
  if HTTPS_ENABLED_KEY in parameters and str(parameters[HTTPS_ENABLED_KEY]).lower() == 'true':
    if HTTPS_PORT_KEY in parameters:
      stormuiport = str(parameters[HTTPS_PORT_KEY])
      protocol = 'https'
    else:
      return (('UNKNOWN', ['Please provide a port number as parameter: '+HTTPS_PORT_KEY]))
  else:
    protocol = 'http'

  # Check comparison and field type combination
  if field_type_val not in ALLOWED_COMPARISON_VALUES:
    return (('UNKNOWN', ['Field type error, must be one of: '+','.join(ALLOWED_COMPARISON_VALUES.keys())]))
  if comparison_val not in ALLOWED_COMPARISON_VALUES[field_type_val]:
    return (('UNKNOWN', ['Comparison error, must be one of: '+','.join(ALLOWED_COMPARISON_VALUES[field_type_val])+' for given field type: '+field_type_val+'. Type not valid: '+comparison_val]))

  # start out assuming an OK status
  label = None
  result_code = "OK"

  try:
    # Set up url to query
    rest_api_request_summary = protocol+'://'+host_name+':'+stormuiport+'/api/v1/topology/summary'

    # Kerberos curl
    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
      # curl requires an integer timeout
      curl_connection_timeout = int(connection_timeout)
      summary_response, error_msg, time_millis = curl_krb_request('/tmp/', kerberos_keytab, kerberos_principal, rest_api_request_summary, "storm_topology", executable_paths, False, "Storm Topology Rest API", smokeuser, connection_timeout=curl_connection_timeout)
    # Non-kerberos curl
    else:
      req = urllib2.Request(rest_api_request_summary)
      response = urllib2.urlopen(req)
      summary_response = response.read()

    # Get summary to check if the topology is in there; the requested value
    # may match either a topology id or a topology name.
    summary = json.loads(summary_response)
    topology_id = None
    for top in summary['topologies']:
      if topology_id_val == top['id']:
        topology_id = top['id']
      elif topology_id_val == top['name']:
        # a second match (a previous iteration already set topology_id) is
        # ambiguous.  NOTE(review): the original also tested a vestigial
        # topology_name local that was never assigned (always None), so this
        # check is behaviorally identical.
        if topology_id:
          return (('UNKNOWN', ['Multiple topologies for with id or name: '+topology_id_val]))
        topology_id = top['id']
    if not topology_id:
      return (('UNKNOWN', ['No topology found with id or name: '+topology_id_val]))

    # Get topology information
    rest_api_request_topology = protocol+'://'+host_name+':'+stormuiport+'/api/v1/topology/'+topology_id

    # Kerberos curl
    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
      topology_response, error_msg, time_millis = curl_krb_request('/tmp/', kerberos_keytab, kerberos_principal, rest_api_request_topology, "storm_topology", executable_paths, False, "Storm Topology Rest API", smokeuser, connection_timeout=curl_connection_timeout)
    # Non-kerberos curl
    else:
      req = urllib2.Request(rest_api_request_topology)
      response = urllib2.urlopen(req)
      topology_response = response.read()

    # Load response
    json_response = json.loads(topology_response)
    field_val = json_response

    # Retrieve the value by walking the dotted field path through the JSON
    for field in field_name_val.split('.'):
      if field not in field_val:
        return (('UNKNOWN', ['Could not find field: '+field_name_val+' in response: '+topology_response]))
      else:
        field_val = field_val[field]

    # When the field resolves to a list of windowed stats, pick the entry for
    # the default window.
    if isinstance(field_val, list):
      for elem in field_val:
        if 'window' in elem and elem['window'] == DEFAULT_WINDOW_VALUE:
          field_val = elem
          break

    # Cast all three values to appropriate type
    raw_field_values = { 'field':field_val, 'WARNING':warning_val, 'CRITICAL':critical_val }
    field_values = dict()
    for field, raw_value in raw_field_values.items():
      success, value = try_cast(raw_value, field_type_val)
      if success:
        field_values[field] = value
      else:
        return (('UNKNOWN', [field+' error: '+value]))

    # Assume correct
    label = 'The current value is {c}. Warning threshold is {o} {w} and critical threshold is {o} {t}.'.format(c=field_values['field'],o=comparison_val,w=field_values['WARNING'],t=field_values['CRITICAL'])

    # Perform comparison for each type; CRITICAL is checked last so it wins
    # when both thresholds are exceeded.
    for level in ['WARNING', 'CRITICAL']:
      if comparison(field_values['field'],field_values[level],comparison_val):
        result_code = level
        label = 'The current value is {c}, the threshold is {o} {t}'.format(c=field_values['field'],o=comparison_val,t=field_values[level])

  # Catch any exceptions during the curls
  except:
    label = traceback.format_exc()
    result_code = 'UNKNOWN'

  label = 'Topology: '+topology_id_val+', '+label
  return ((result_code, [label]))