def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations : a mapping of configuration key to value
  parameters : a mapping of script parameter key to value
  host_name : the name of this host where the alert is running

  :type configurations dict
  :type parameters dict
  :type host_name str
  """
  hostnames = host_name
  current_time = int(time.time()) * 1000

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  merge_ha_metrics = MERGE_HA_METRICS_PARAM_DEFAULT
  if MERGE_HA_METRICS_PARAM_KEY in parameters:
    merge_ha_metrics = parameters[MERGE_HA_METRICS_PARAM_KEY].lower() == 'true'

  metric_name = METRIC_NAME_PARAM_DEFAULT
  if METRIC_NAME_PARAM_KEY in parameters:
    metric_name = parameters[METRIC_NAME_PARAM_KEY]

  app_id = APP_ID_PARAM_DEFAULT
  if APP_ID_PARAM_KEY in parameters:
    app_id = parameters[APP_ID_PARAM_KEY]

  interval = INTERVAL_PARAM_DEFAULT
  if INTERVAL_PARAM_KEY in parameters:
    interval = _coerce_to_integer(parameters[INTERVAL_PARAM_KEY])

  warning_threshold = DEVIATION_WARNING_THRESHOLD_DEFAULT
  if DEVIATION_WARNING_THRESHOLD_KEY in parameters:
    warning_threshold = _coerce_to_integer(parameters[DEVIATION_WARNING_THRESHOLD_KEY])

  critical_threshold = DEVIATION_CRITICAL_THRESHOLD_DEFAULT
  if DEVIATION_CRITICAL_THRESHOLD_KEY in parameters:
    critical_threshold = _coerce_to_integer(parameters[DEVIATION_CRITICAL_THRESHOLD_KEY])

  minimum_value_threshold = None
  if MINIMUM_VALUE_THRESHOLD_KEY in parameters:
    minimum_value_threshold = _coerce_to_integer(parameters[MINIMUM_VALUE_THRESHOLD_KEY])

  # parse configuration
  if configurations is None:
    return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])

  # hdfs-site is required (checked once here rather than re-checked below)
  if HDFS_SITE_KEY not in configurations:
    return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

  # ams-site/timeline.metrics.service.webapp.address is required
  if METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY not in configurations:
    return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY)])

  collector_webapp_address = configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY].split(":")
  if valid_collector_webapp_address(collector_webapp_address):
    collector_host = collector_webapp_address[0]
    collector_port = int(collector_webapp_address[1])
  else:
    return (RESULT_STATE_UNKNOWN, ['{0} value should be set as "fqdn_hostname:port", but set to {1}'.format(
      METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY, configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY])])

  namenode_service_rpc_address = None
  hdfs_site = configurations[HDFS_SITE_KEY]

  if 'dfs.namenode.servicerpc-address' in hdfs_site:
    namenode_service_rpc_address = hdfs_site['dfs.namenode.servicerpc-address']

  # if this is a NameNode alert running in HA mode
  if NAMESERVICE_KEY in configurations and app_id.lower() == 'namenode':
    # initialize to None so a missing smoke user cannot leave the name unbound
    # when it is passed to curl_krb_request() below
    smokeuser = None
    if SMOKEUSER_KEY in configurations:
      smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = None
    if EXECUTABLE_SEARCH_PATHS in configurations:
      executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]
    # parse security-related configuration
    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
      security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
      kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
      kerberos_principal = configurations[KERBEROS_PRINCIPAL]
      kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # determine whether or not SSL is enabled
    is_ssl_enabled = False
    if DFS_POLICY_KEY in configurations:
      dfs_policy = configurations[DFS_POLICY_KEY]
      if dfs_policy == "HTTPS_ONLY":
        is_ssl_enabled = True

    kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS)

    name_service = configurations[NAMESERVICE_KEY]

    # look for dfs.ha.namenodes.<nameservice>
    nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
    if nn_unique_ids_key not in hdfs_site:
      return (RESULT_STATE_UNKNOWN, ['Unable to find unique NameNode alias key {0}'.format(nn_unique_ids_key)])

    namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
    jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

    if is_ssl_enabled:
      namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
      jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

    # now we have something like 'nn1,nn2,nn3,nn4'
    # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
    # ie dfs.namenode.http-address.hacluster.nn1
    namenodes = []
    active_namenodes = []
    nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
    for nn_unique_id in nn_unique_ids:
      key = namenode_http_fragment.format(name_service, nn_unique_id)

      if key in hdfs_site:
        # use str() to ensure that unicode strings do not have the u' in them
        value = str(hdfs_site[key])
        namenode = value.split(":")[0]
        namenodes.append(namenode)

        try:
          jmx_uri = jmx_uri_fragment.format(value)

          if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)
            state_response, error_msg, time_millis = curl_krb_request(env.tmp_dir,
              kerberos_keytab, kerberos_principal, jmx_uri, "ha_nn_health", executable_paths,
              False, "NameNode High Availability Health", smokeuser,
              connection_timeout=curl_connection_timeout, kinit_timer_ms=kinit_timer_ms)

            state = _get_ha_state_from_json(state_response)
          else:
            state_response = get_jmx(jmx_uri, connection_timeout)
            state = _get_ha_state_from_json(state_response)

          if state == HDFS_NN_STATE_ACTIVE:
            active_namenodes.append(namenode)

            # only check the service RPC address of the active NameNode
            nn_service_rpc_address_key = 'dfs.namenode.servicerpc-address.{0}.{1}'.format(name_service, nn_unique_id)
            if nn_service_rpc_address_key in hdfs_site:
              namenode_service_rpc_address = hdfs_site[nn_service_rpc_address_key]
        except Exception:
          logger.exception("Unable to determine the active NameNode")

    if merge_ha_metrics:
      hostnames = ",".join(namenodes)
      # run only on the active NameNode; no need to run the same requests from the standby
      if host_name not in active_namenodes:
        return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])

  # skip the service RPC alert if the port is not enabled
  if not namenode_service_rpc_address and 'rpc.rpc.datanode' in metric_name:
    return (RESULT_STATE_SKIPPED, ['Service RPC port is not enabled.'])

  get_metrics_parameters = {
    "metricNames": metric_name,
    "appId": app_id,
    "hostname": hostnames,
    "startTime": current_time - interval * 60 * 1000,
    "endTime": current_time,
    "grouped": "true",
  }
  encoded_get_metrics_parameters = urllib.urlencode(get_metrics_parameters)

  try:
    conn = httplib.HTTPConnection(collector_host, int(collector_port),
                                  timeout=connection_timeout)
    conn.request("GET", AMS_METRICS_GET_URL % encoded_get_metrics_parameters)
    response = conn.getresponse()
    data = response.read()
    conn.close()
  except Exception:
    return (RESULT_STATE_UNKNOWN, ["Unable to retrieve metrics from AMS."])

  if response.status != 200:
    return (RESULT_STATE_UNKNOWN, ["Unable to retrieve metrics from AMS."])

  data_json = json.loads(data)
  metrics = []

  # merging metrics from multiple hosts will produce a large standard deviation
  # if host1 reports small local values but host2 reports large local values
  for metrics_data in data_json["metrics"]:
    metrics += metrics_data["metrics"].values()

  if not metrics or len(metrics) < 2:
    return (RESULT_STATE_SKIPPED, ["Unable to calculate the standard deviation for {0} datapoints".format(len(metrics))])

  # filter out points below the minimum threshold
  if minimum_value_threshold:
    metrics = [metric for metric in metrics if metric > (minimum_value_threshold * 1000)]
    if len(metrics) < 2:
      return (RESULT_STATE_OK, ['No datapoints found above the minimum threshold of {0} seconds'.format(minimum_value_threshold)])

  mean_value = mean(metrics)
  stddev = sample_standard_deviation(metrics)
  max_value = max(metrics) / 1000

  try:
    # coerce to float so Python 2 integer division cannot skew the percentage
    deviation_percent = stddev / float(mean_value) * 100
  except ZeroDivisionError:
    # should not be a case for this alert
    return (RESULT_STATE_SKIPPED, ["Unable to calculate the standard deviation percentage. The mean value is 0"])

  # log the AMS request and the statistics derived from its response
  if logger.isEnabledFor(logging.DEBUG):
    logger.debug("""
    AMS request parameters - {0}
    AMS response - {1}
    Mean - {2}
    Standard deviation - {3}
    Percentage standard deviation - {4}
    """.format(encoded_get_metrics_parameters, data_json, mean_value, stddev, deviation_percent))

  if deviation_percent > critical_threshold:
    return (RESULT_STATE_CRITICAL, ['CRITICAL. Percentage standard deviation value {0}% is beyond the critical threshold of {1}% (growing {2} seconds to {3} seconds)'.format(
      "%.2f" % deviation_percent, "%.2f" % critical_threshold, minimum_value_threshold, "%.2f" % max_value)])

  if deviation_percent > warning_threshold:
    return (RESULT_STATE_WARNING, ['WARNING. Percentage standard deviation value {0}% is beyond the warning threshold of {1}% (growing {2} seconds to {3} seconds)'.format(
      "%.2f" % deviation_percent, "%.2f" % warning_threshold, minimum_value_threshold, "%.2f" % max_value)])

  return (RESULT_STATE_OK, ['OK. Percentage standard deviation value is {0}%'.format("%.2f" % deviation_percent)])
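
# ---------------------------------------------------------------------------
# Reference sketch (an assumption, not part of this excerpt): mean() and
# sample_standard_deviation() are helpers imported elsewhere in the module,
# presumably from ambari_commons. The minimal implementations below show the
# math the alert above relies on; the names are illustrative only.
import math

def _reference_mean(values):
  # arithmetic mean of a non-empty sequence
  return sum(values) / float(len(values))

def _reference_sample_standard_deviation(values):
  # sample (n - 1) standard deviation; the alert divides this by the mean to
  # obtain the "percentage standard deviation" (coefficient of variation)
  avg = _reference_mean(values)
  variance = sum((value - avg) ** 2 for value in values) / float(len(values) - 1)
  return math.sqrt(variance)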
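
# ---------------------------------------------------------------------------
# Hypothetical local-test harness (an assumption: Ambari's alert framework
# normally imports this module and invokes execute() itself). The host names,
# addresses, and metric name below are illustrative placeholders only.
if __name__ == '__main__':
  sample_configurations = {
    HDFS_SITE_KEY: {
      'dfs.namenode.servicerpc-address': 'nn1.example.com:8021',
    },
    METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY: 'ams.example.com:6188',
  }
  sample_parameters = {
    METRIC_NAME_PARAM_KEY: 'rpc.rpc.client.RpcQueueTimeAvgTime',
    INTERVAL_PARAM_KEY: '60',
  }
  result_code, result_label = execute(sample_configurations, sample_parameters, 'nn1.example.com')
  print("{0}: {1}".format(result_code, result_label[0]))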