def perform_grafana_put_call(url, id, payload, server):
    """
    Send an HTTP PUT with Basic authentication to Grafana, retrying on
    connection-level errors.

    The request path is ``url + "/" + str(id)``. At most
    ``GRAFANA_CONNECT_TRIES`` attempts are made; after a failed attempt that
    is not the last one, the function waits ``GRAFANA_CONNECT_TIMEOUT``
    seconds before trying again.

    :param url: request path that the resource id is appended to
    :param id: resource identifier appended to ``url`` (the name shadows the
        builtin but is kept so existing keyword callers keep working)
    :param payload: request body, sent with a JSON content type header
    :param server: object providing ``protocol``, ``host``, ``port``,
        ``user`` and ``password``
    :return: ``(response, data)`` - the response object and the body read
        from it; both stay ``None`` if no attempt is made
    :raises Fail: if the last attempt raises ``httplib.HTTPException`` or
        ``socket.error``
    """
    response = None
    data = None

    userAndPass = b64encode('{0}:{1}'.format(server.user, server.password))
    headers = {
        "Content-Type": "application/json",
        'Authorization': 'Basic %s' % userAndPass,
    }
    grafana_https_enabled = server.protocol.lower() == 'https'

    ca_certs = None
    if grafana_https_enabled:
        # params is only needed (and only imported) for the HTTPS case.
        import params
        ca_certs = params.ams_grafana_cert_file

    for i in xrange(0, GRAFANA_CONNECT_TRIES):
        is_last_attempt = i >= GRAFANA_CONNECT_TRIES - 1
        conn = None
        try:
            conn = network.get_http_connection(server.host,
                                               int(server.port),
                                               grafana_https_enabled,
                                               ca_certs)
            conn.request("PUT", url + "/" + str(id), payload, headers)
            response = conn.getresponse()
            data = response.read()
            Logger.info("Http data: %s" % data)
            break
        except (httplib.HTTPException, socket.error) as ex:
            if is_last_attempt:
                raise Fail("Ambari Metrics Grafana update failed due to: %s" % str(ex))
        finally:
            # Close on every path so a failed attempt does not leave its
            # connection open while the next one is being made.
            if conn is not None:
                conn.close()

        # Only reached after a failed attempt that will be retried. The
        # message is logged before sleeping so it describes the upcoming wait.
        Logger.info("Connection to Grafana failed. Next retry in %s seconds."
                    % (GRAFANA_CONNECT_TIMEOUT))
        time.sleep(GRAFANA_CONNECT_TIMEOUT)

    return (response, data)
def perform_grafana_get_call(url, server):
    """
    Send an HTTP GET with Basic authentication to Grafana, retrying on
    connection-level errors.

    At most ``GRAFANA_CONNECT_TRIES`` attempts are made; after a failed
    attempt that is not the last one, the function waits
    ``GRAFANA_CONNECT_TIMEOUT`` seconds before trying again.

    :param url: request path
    :param server: object providing ``protocol``, ``host``, ``port``,
        ``user`` and ``password``
    :return: the response object of the successful attempt, with its body
        not yet read; ``None`` if no attempt is made
    :raises Fail: if the last attempt raises ``httplib.HTTPException`` or
        ``socket.error``
    """
    grafana_https_enabled = server.protocol.lower() == 'https'
    response = None

    ca_certs = None
    if grafana_https_enabled:
        # params is only needed (and only imported) for the HTTPS case.
        import params
        ca_certs = params.ams_grafana_cert_file

    # The credentials do not change between attempts, so build them once.
    userAndPass = b64encode('{0}:{1}'.format(server.user, server.password))
    headers = {'Authorization': 'Basic %s' % userAndPass}

    for i in xrange(0, GRAFANA_CONNECT_TRIES):
        conn = None
        try:
            conn = network.get_http_connection(server.host,
                                               int(server.port),
                                               grafana_https_enabled,
                                               ca_certs)
            Logger.info("Connecting (GET) to %s:%s%s" % (server.host, server.port, url))
            conn.request("GET", url, headers=headers)
            response = conn.getresponse()
            Logger.info("Http response: %s %s" % (response.status, response.reason))
            # The connection is deliberately left open on success: the
            # response is handed back to the caller unread.
            break
        except (httplib.HTTPException, socket.error) as ex:
            # A failed attempt's connection is of no further use.
            if conn is not None:
                conn.close()
            if i >= GRAFANA_CONNECT_TRIES - 1:
                raise Fail("Ambari Metrics Grafana update failed due to: %s" % str(ex))

        # Only reached after a failed attempt that will be retried. The
        # message is logged before sleeping so it describes the upcoming wait.
        Logger.info("Connection to Grafana failed. Next retry in %s seconds."
                    % (GRAFANA_CONNECT_TIMEOUT))
        time.sleep(GRAFANA_CONNECT_TIMEOUT)

    return response
def perform_grafana_post_call(url, payload, server):
    """
    Send an HTTP POST with Basic authentication to Grafana, retrying on
    connection-level errors and on HTTP 401 responses.

    At most ``params.grafana_connect_attempts`` attempts are made, waiting
    ``params.grafana_connect_retry_delay`` seconds between them. A 401
    response is retried like a connection error (the original code marks it
    as an intermittent Grafana error); on the last attempt a 401 response is
    returned to the caller like any other response.

    :param url: request path
    :param payload: request body; its ``len()`` is sent as Content-Length
    :param server: object providing ``protocol``, ``host``, ``port``,
        ``user`` and ``password``
    :return: ``(response, data)`` - the response object and the body read
        from it; both stay ``None`` if no attempt is made
    :raises Fail: if the last attempt raises ``httplib.HTTPException`` or
        ``socket.error``
    """
    import params

    response = None
    data = None

    userAndPass = b64encode('{0}:{1}'.format(server.user, server.password))
    Logger.debug('POST payload: %s' % payload)
    headers = {
        "Content-Type": "application/json",
        "Content-Length": len(payload),
        'Authorization': 'Basic %s' % userAndPass,
    }
    grafana_https_enabled = server.protocol.lower() == 'https'

    ca_certs = None
    if grafana_https_enabled:
        ca_certs = params.ams_grafana_ca_cert

    for i in xrange(0, params.grafana_connect_attempts):
        is_last_attempt = i >= params.grafana_connect_attempts - 1
        conn = None
        try:
            Logger.info("Connecting (POST) to %s:%s%s" % (server.host, server.port, url))
            conn = network.get_http_connection(
                server.host,
                int(server.port),
                grafana_https_enabled,
                ca_certs,
                ssl_version=Script.get_force_https_protocol_value())
            conn.request("POST", url, payload, headers)

            response = conn.getresponse()
            Logger.info("Http response: %s %s" % (response.status, response.reason))
            # Intermittent error thrown from Grafana: retry a 401 unless
            # this was the last attempt.
            if response.status != 401 or is_last_attempt:
                data = response.read()
                Logger.info("Http data: %s" % data)
                break
        except (httplib.HTTPException, socket.error) as ex:
            if is_last_attempt:
                raise Fail("Ambari Metrics Grafana update failed due to: %s" % str(ex))
        finally:
            # Close on every path; previously the 401 retry and the
            # exception path both abandoned the connection without closing.
            if conn is not None:
                conn.close()

        # Only reached when another attempt follows.
        Logger.info("Connection to Grafana failed. Next retry in %s seconds."
                    % (params.grafana_connect_retry_delay))
        time.sleep(params.grafana_connect_retry_delay)

    return (response, data)
def post_metrics_to_collector(ams_metrics_post_url, metric_collector_host, metric_collector_port,
                              metric_collector_https_enabled, metric_json, headers, ca_certs,
                              tries=1, connect_timeout=10):
    """
    POST a metrics document to a Metrics Collector, retrying until it answers
    with HTTP 200.

    Both connection-level errors and non-200 responses are retried, waiting
    ``connect_timeout`` seconds between attempts.

    :param ams_metrics_post_url: request path of the POST
    :param metric_collector_host: collector host name
    :param metric_collector_port: collector port (converted with ``int``)
    :param metric_collector_https_enabled: passed to
        ``network.get_http_connection`` to select HTTPS
    :param metric_json: request body
    :param headers: request headers
    :param ca_certs: CA certificate path passed to
        ``network.get_http_connection``
    :param tries: maximum number of attempts
    :param connect_timeout: seconds to wait between attempts
    :raises Fail: if the last attempt fails to connect, or returns a status
        other than 200
    """
    for i in xrange(0, tries):
        # range/xrange returns items from start to end-1
        is_last_attempt = i >= tries - 1
        try:
            Logger.info("Generated metrics for host %s :\n%s" % (metric_collector_host, metric_json))

            Logger.info("Connecting (POST) to %s:%s%s" % (metric_collector_host,
                                                          metric_collector_port,
                                                          ams_metrics_post_url))
            conn = network.get_http_connection(
                metric_collector_host,
                int(metric_collector_port),
                metric_collector_https_enabled,
                ca_certs,
                ssl_version=Script.get_force_https_protocol_value())
            conn.request("POST", ams_metrics_post_url, metric_json, headers)

            response = conn.getresponse()
            Logger.info("Http response for host %s: %s %s" % (metric_collector_host,
                                                             response.status,
                                                             response.reason))
        except (httplib.HTTPException, socket.error):
            if is_last_attempt:
                raise Fail("Metrics were not saved. Connection failed.")
            # Log before sleeping so the message describes the upcoming wait.
            Logger.info("Connection failed for host %s. Next retry in %s seconds."
                        % (metric_collector_host, connect_timeout))
            time.sleep(connect_timeout)
            continue

        data = response.read()
        Logger.info("Http data: %s" % data)
        conn.close()

        if response.status == 200:
            Logger.info("Metrics were saved.")
            break

        Logger.info("Metrics were not saved.")
        if is_last_attempt:
            raise Fail("Metrics were not saved. POST request status: %s %s \n%s" %
                       (response.status, response.reason, data))
        # The delay is connect_timeout; the attempt count `tries` was
        # previously used here by mistake, both for the sleep and the message.
        Logger.info("Next retry in %s seconds." % (connect_timeout))
        time.sleep(connect_timeout)
def perform_grafana_post_call(url, payload, server):
    """
    Send an HTTP POST with Basic authentication to Grafana, retrying on
    connection-level errors and on HTTP 401 responses.

    At most ``GRAFANA_CONNECT_TRIES`` attempts are made, waiting
    ``GRAFANA_CONNECT_TIMEOUT`` seconds between them. A 401 response is
    retried like a connection error (the original code marks it as an
    intermittent Grafana error); on the last attempt a 401 response is
    returned to the caller like any other response.

    :param url: request path
    :param payload: request body; its ``len()`` is sent as Content-Length
    :param server: object providing ``protocol``, ``host``, ``port``,
        ``user`` and ``password``
    :return: ``(response, data)`` - the response object and the body read
        from it; both stay ``None`` if no attempt is made
    :raises Fail: if the last attempt raises ``httplib.HTTPException`` or
        ``socket.error``
    """
    response = None
    data = None

    userAndPass = b64encode('{0}:{1}'.format(server.user, server.password))
    Logger.debug('POST payload: %s' % payload)
    headers = {
        "Content-Type": "application/json",
        "Content-Length": len(payload),
        'Authorization': 'Basic %s' % userAndPass,
    }
    grafana_https_enabled = server.protocol.lower() == 'https'

    for i in xrange(0, GRAFANA_CONNECT_TRIES):
        is_last_attempt = i >= GRAFANA_CONNECT_TRIES - 1
        conn = None
        try:
            Logger.info("Connecting (POST) to %s:%s%s" % (server.host, server.port, url))
            conn = network.get_http_connection(server.host,
                                               int(server.port),
                                               grafana_https_enabled)
            conn.request("POST", url, payload, headers)

            response = conn.getresponse()
            Logger.info("Http response: %s %s" % (response.status, response.reason))
            # Intermittent error thrown from Grafana: retry a 401 unless
            # this was the last attempt.
            if response.status != 401 or is_last_attempt:
                data = response.read()
                Logger.info("Http data: %s" % data)
                break
        except (httplib.HTTPException, socket.error) as ex:
            if is_last_attempt:
                raise Fail("Ambari Metrics Grafana update failed due to: %s" % str(ex))
        finally:
            # Close on every path; previously the 401 retry and the
            # exception path both abandoned the connection without closing.
            if conn is not None:
                conn.close()

        # Only reached when another attempt follows. The message is logged
        # before sleeping so it describes the upcoming wait.
        Logger.info("Connection to Grafana failed. Next retry in %s seconds."
                    % (GRAFANA_CONNECT_TIMEOUT))
        time.sleep(GRAFANA_CONNECT_TIMEOUT)

    return (response, data)
def perform_grafana_delete_call(url, server):
    """
    Send an HTTP DELETE with Basic authentication to Grafana.

    Connection-level errors are retried up to
    ``params.grafana_connect_attempts`` times, with
    ``params.grafana_connect_retry_delay`` seconds between attempts.

    :param url: request path
    :param server: object providing ``protocol``, ``host``, ``port``,
        ``user`` and ``password``
    :return: the response object of the successful attempt, with its body
        not yet read; ``None`` if no attempt is made
    :raises Fail: if the last attempt raises ``httplib.HTTPException`` or
        ``socket.error``
    """
    import params

    use_https = server.protocol.lower() == 'https'
    response = None
    ca_certs = params.ams_grafana_ca_cert if use_https else None

    max_attempts = params.grafana_connect_attempts
    for attempt in xrange(0, max_attempts):
        try:
            connection = network.get_http_connection(
                server.host,
                int(server.port),
                use_https,
                ca_certs,
                ssl_version=Script.get_force_https_protocol_value())

            credentials = b64encode('{0}:{1}'.format(server.user, server.password))
            request_headers = {'Authorization': 'Basic %s' % credentials}

            Logger.info("Connecting (DELETE) to %s:%s%s" % (server.host, server.port, url))
            connection.request("DELETE", url, headers=request_headers)

            response = connection.getresponse()
            Logger.info("Http response: %s %s" % (response.status, response.reason))
            break
        except (httplib.HTTPException, socket.error) as ex:
            # Out of attempts: surface the failure to the caller.
            if attempt >= max_attempts - 1:
                raise Fail("Ambari Metrics Grafana update failed due to: %s" % str(ex))

            Logger.info("Connection to Grafana failed. Next retry in %s seconds."
                        % (params.grafana_connect_retry_delay))
            time.sleep(params.grafana_connect_retry_delay)

    return response
def _load_metric(self, ams_collector_host, ams_metric, host_filter):
    """
    Request one metric from the Ambari Metrics collector over HTTP GET.

    The query is built from ``ams_metric``, ``self.ams_app_id`` and
    ``host_filter`` and sent to ``ams_collector_host`` on
    ``self.ams_collector_port``, using SSL when ``self.use_ssl`` is set.

    :param ams_collector_host: collector host to connect to
    :param ams_metric: value sent as the ``metricNames`` query parameter
    :param host_filter: value sent as the ``hostname`` query parameter
    :return: on failure, ``(None, status)`` where ``status`` is the HTTP
        status if a response had been received, else ``None``

    NOTE(review): the original indentation of this method was lost. The
    trailing ``status``/``return`` statements are assumed to belong to the
    ``except`` handler, which leaves the success path without an explicit
    return and the body read into ``data`` unused - confirm whether the rest
    of the method is missing from this view.
    """
    get_metrics_parameters = {
        "metricNames": ams_metric,
        "appId": self.ams_app_id,
        "hostname": host_filter,
        "precision": "seconds",
        "grouped": "true",
    }
    encoded_get_metrics_parameters = urllib.urlencode(get_metrics_parameters)
    url = AMS_METRICS_GET_URL % encoded_get_metrics_parameters

    _ssl_version = AmbariConfig.get_resolved_config().get_force_https_protocol_value()

    ams_monitor_conf_dir = "/etc/ambari-metrics-monitor/conf"
    metric_truststore_ca_certs = 'ca.pem'
    ca_certs = os.path.join(ams_monitor_conf_dir, metric_truststore_ca_certs)

    conn = None
    response = None
    data = None
    try:
        conn = network.get_http_connection(ams_collector_host,
                                           int(self.ams_collector_port),
                                           self.use_ssl,
                                           ca_certs,
                                           ssl_version=_ssl_version)
        conn.request("GET", url)
        response = conn.getresponse()
        data = response.read()
    except Exception as exception:
        if logger.isEnabledFor(logging.DEBUG):
            logger.exception(
                "[Alert][{0}] Unable to retrieve metrics from AMS: {1}".format(
                    self.alert_id, str(exception)))
        # A response may already exist if the failure happened while
        # reading the body.
        status = response.status if response else None
        return None, status
    finally:
        # Release the connection on both the success and the failure path;
        # it was previously never closed.
        if conn is not None:
            conn.close()
def service_check_for_single_host(self, metric_collector_host, params):
    """
    Smoke-test one Metrics Collector host: post a generated metric, then
    read it back.

    A document rendered from ``smoketest_metrics.json.j2`` (carrying a
    random value and the current time) is posted with
    ``post_metrics_to_collector``. The collector is then queried up to
    ``self.AMS_READ_TRIES`` times, ``self.AMS_READ_TIMEOUT`` seconds apart,
    until both expected data points appear in the response.

    :param metric_collector_host: collector host to check
    :param params: object providing ``ams_monitor_conf_dir``,
        ``metric_truststore_ca_certs``, ``hostname``,
        ``metric_collector_port`` and ``metric_collector_https_enabled``
    :raises Fail: if the GET returns a status other than 200, if the values
        are still missing after the last read attempt, or if the post step
        raises ``Fail``; the failure is logged and re-raised with the
        collector host in the message
    """
    random_value1 = random.random()
    headers = {"Content-type": "application/json"}
    ca_certs = os.path.join(params.ams_monitor_conf_dir,
                            params.metric_truststore_ca_certs)

    current_time = int(time.time()) * 1000
    metric_json = Template('smoketest_metrics.json.j2',
                           hostname=params.hostname,
                           random1=random_value1,
                           current_time=current_time).get_content()

    # Defined once instead of being re-created on every read attempt.
    def floats_eq(f1, f2, delta):
        return abs(f1 - f2) < delta

    try:
        post_metrics_to_collector(
            self.AMS_METRICS_POST_URL, metric_collector_host,
            params.metric_collector_port,
            params.metric_collector_https_enabled, metric_json, headers,
            ca_certs, self.AMS_CONNECT_TRIES, self.AMS_CONNECT_TIMEOUT)

        get_metrics_parameters = {
            "metricNames": "AMBARI_METRICS.SmokeTest.FakeMetric",
            "appId": "amssmoketestfake",
            "hostname": params.hostname,
            "startTime": current_time - 60000,
            "endTime": current_time + 61000,
            "precision": "seconds",
            "grouped": "false",
        }
        encoded_get_metrics_parameters = urllib.urlencode(get_metrics_parameters)
        # The URL is the same for every read attempt.
        get_url = self.AMS_METRICS_GET_URL % encoded_get_metrics_parameters

        Logger.info("Connecting (GET) to %s:%s%s" % (metric_collector_host,
                                                     params.metric_collector_port,
                                                     get_url))
        for i in xrange(0, self.AMS_READ_TRIES):
            conn = network.get_http_connection(
                metric_collector_host,
                int(params.metric_collector_port),
                params.metric_collector_https_enabled,
                ca_certs,
                ssl_version=Script.get_force_https_protocol_value())
            conn.request("GET", get_url)
            response = conn.getresponse()
            Logger.info("Http response for host %s : %s %s" % (metric_collector_host,
                                                              response.status,
                                                              response.reason))

            data = response.read()
            Logger.info("Http data: %s" % data)
            conn.close()

            if response.status == 200:
                Logger.info("Metrics were retrieved from host %s" % metric_collector_host)
            else:
                raise Fail("Metrics were not retrieved from host %s. GET request status: %s %s \n%s" %
                           (metric_collector_host, response.status, response.reason, data))
            data_json = json.loads(data)

            values_are_present = False
            for metrics_data in data_json["metrics"]:
                points = metrics_data["metrics"]
                # Two points are expected: the random value at current_time
                # and current_time itself one second later.
                if (str(current_time) in points
                        and str(current_time + 1000) in points
                        and floats_eq(points[str(current_time)], random_value1, 0.0000001)
                        and floats_eq(points[str(current_time + 1000)], current_time, 1)):
                    # Argument order now matches the placeholders (value,
                    # value, host); the host used to be passed first.
                    Logger.info("Values %s and %s were found in the response from host %s." %
                                (random_value1, current_time, metric_collector_host))
                    values_are_present = True
                    break

            if values_are_present:
                break

            # range/xrange returns items from start to end-1
            if i < self.AMS_READ_TRIES - 1:
                Logger.info("Values weren't stored yet. Retrying in %s seconds."
                            % (self.AMS_READ_TIMEOUT))
                time.sleep(self.AMS_READ_TIMEOUT)
            else:
                raise Fail("Values %s and %s were not found in the response." %
                           (random_value1, current_time))
    except Fail as ex:
        Logger.warning("Ambari Metrics service check failed on collector host %s. Reason : %s" %
                       (metric_collector_host, str(ex)))
        raise Fail("Ambari Metrics service check failed on collector host %s. Reason : %s" %
                   (metric_collector_host, str(ex)))
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    The alert fetches the data points of one metric from the Ambari Metrics
    collector for the last ``interval`` minutes, computes the sample standard
    deviation as a percentage of the mean, and compares that percentage
    against the warning and critical thresholds. In NameNode HA mode the
    active NameNode is determined through JMX first.

    NOTE(review): the original indentation of this function was lost; the
    nesting shown here was reconstructed from the statements and their
    comments and should be confirmed against the real file.

    Keyword arguments:
    configurations : a mapping of configuration key to value
    parameters : a mapping of script parameter key to value
    host_name : the name of this host where the alert is running

    :type configurations dict
    :type parameters dict
    :type host_name str

    :return: ``(state, [label])`` where ``state`` is one of the
        ``RESULT_STATE_*`` constants used below (UNKNOWN, SKIPPED, OK,
        WARNING, CRITICAL)
    """
    hostnames = host_name
    current_time = int(time.time()) * 1000

    # parse script arguments
    # Each parameter falls back to its module-level default when absent.
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    merge_ha_metrics = MERGE_HA_METRICS_PARAM_DEFAULT
    if MERGE_HA_METRICS_PARAM_KEY in parameters:
        merge_ha_metrics = parameters[MERGE_HA_METRICS_PARAM_KEY].lower() == 'true'

    metric_name = METRIC_NAME_PARAM_DEFAULT
    if METRIC_NAME_PARAM_KEY in parameters:
        metric_name = parameters[METRIC_NAME_PARAM_KEY]

    metric_units = METRIC_UNITS_DEFAULT
    if METRIC_UNITS_PARAM_KEY in parameters:
        metric_units = parameters[METRIC_UNITS_PARAM_KEY]

    app_id = APP_ID_PARAM_DEFAULT
    if APP_ID_PARAM_KEY in parameters:
        app_id = parameters[APP_ID_PARAM_KEY]

    interval = INTERVAL_PARAM_DEFAULT
    if INTERVAL_PARAM_KEY in parameters:
        interval = _coerce_to_integer(parameters[INTERVAL_PARAM_KEY])

    warning_threshold = DEVIATION_WARNING_THRESHOLD_DEFAULT
    if DEVIATION_WARNING_THRESHOLD_KEY in parameters:
        warning_threshold = _coerce_to_integer(parameters[DEVIATION_WARNING_THRESHOLD_KEY])

    critical_threshold = DEVIATION_CRITICAL_THRESHOLD_DEFAULT
    if DEVIATION_CRITICAL_THRESHOLD_KEY in parameters:
        critical_threshold = _coerce_to_integer(parameters[DEVIATION_CRITICAL_THRESHOLD_KEY])

    minimum_value_threshold = None
    if MINIMUM_VALUE_THRESHOLD_KEY in parameters:
        minimum_value_threshold = _coerce_to_integer(parameters[MINIMUM_VALUE_THRESHOLD_KEY])

    #parse configuration
    if configurations is None:
        return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])

    # hdfs-site is required
    if not HDFS_SITE_KEY in configurations:
        return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

    # Collector endpoint: an explicitly configured VIP host/port pair is
    # used when both keys are present; otherwise the host comes from
    # select_metric_collector_for_sink and the port from the webapp address.
    if METRICS_COLLECTOR_VIP_HOST_KEY in configurations and METRICS_COLLECTOR_VIP_PORT_KEY in configurations:
        collector_host = configurations[METRICS_COLLECTOR_VIP_HOST_KEY]
        collector_port = int(configurations[METRICS_COLLECTOR_VIP_PORT_KEY])
    else:
        # ams-site/timeline.metrics.service.webapp.address is required
        if not METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY in configurations:
            return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY)])
        else:
            collector_webapp_address = configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY].split(":")
            if valid_collector_webapp_address(collector_webapp_address):
                collector_host = select_metric_collector_for_sink(app_id.lower())
                collector_port = int(collector_webapp_address[1])
            else:
                return (RESULT_STATE_UNKNOWN, ['{0} value should be set as "fqdn_hostname:port", but set to {1}'.format(
                    METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY, configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY])])

    namenode_service_rpc_address = None
    # hdfs-site is required
    # (this repeats the check made above, so it cannot trigger here)
    if not HDFS_SITE_KEY in configurations:
        return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

    hdfs_site = configurations[HDFS_SITE_KEY]

    if 'dfs.namenode.servicerpc-address' in hdfs_site:
        namenode_service_rpc_address = hdfs_site['dfs.namenode.servicerpc-address']

    # if namenode alert and HA mode
    if NAMESERVICE_KEY in configurations and app_id.lower() == 'namenode':
        # hdfs-site is required
        # (another repeat of the check above)
        if not HDFS_SITE_KEY in configurations:
            return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

        # NOTE(review): smokeuser is bound only when SMOKEUSER_KEY is
        # configured, but the Kerberos branch below reads it
        # unconditionally - confirm whether a default is needed.
        if SMOKEUSER_KEY in configurations:
            smokeuser = configurations[SMOKEUSER_KEY]

        executable_paths = None
        if EXECUTABLE_SEARCH_PATHS in configurations:
            executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

        # parse script arguments
        security_enabled = False
        if SECURITY_ENABLED_KEY in configurations:
            security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

        kerberos_keytab = None
        if KERBEROS_KEYTAB in configurations:
            kerberos_keytab = configurations[KERBEROS_KEYTAB]

        kerberos_principal = None
        if KERBEROS_PRINCIPAL in configurations:
            kerberos_principal = configurations[KERBEROS_PRINCIPAL]
            # Substitute this host's name for the _HOST placeholder.
            kerberos_principal = kerberos_principal.replace('_HOST', host_name)

        # determine whether or not SSL is enabled
        is_ssl_enabled = False
        if DFS_POLICY_KEY in configurations:
            dfs_policy = configurations[DFS_POLICY_KEY]
            if dfs_policy == "HTTPS_ONLY":
                is_ssl_enabled = True

        kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS)

        name_service = configurations[NAMESERVICE_KEY]

        # look for dfs.ha.namenodes.foo
        nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
        if not nn_unique_ids_key in hdfs_site:
            return (RESULT_STATE_UNKNOWN, ['Unable to find unique NameNode alias key {0}'.format(nn_unique_ids_key)])

        namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
        jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

        if is_ssl_enabled:
            namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
            jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

        # now we have something like 'nn1,nn2,nn3,nn4'
        # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
        # ie dfs.namenode.http-address.hacluster.nn1
        namenodes = []
        active_namenodes = []
        nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
        for nn_unique_id in nn_unique_ids:
            key = namenode_http_fragment.format(name_service, nn_unique_id)

            if key in hdfs_site:
                # use str() to ensure that unicode strings do not have the u' in them
                value = str(hdfs_site[key])
                namenode = str(hdfs_site[key]).split(":")[0]

                namenodes.append(namenode)
                try:
                    jmx_uri = jmx_uri_fragment.format(value)
                    # Kerberized clusters query JMX through curl_krb_request;
                    # otherwise get_jmx is used directly.
                    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
                        env = Environment.get_instance()

                        # curl requires an integer timeout
                        curl_connection_timeout = int(connection_timeout)
                        state_response, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal, jmx_uri,"ha_nn_health", executable_paths, False, "NameNode High Availability Health", smokeuser, connection_timeout=curl_connection_timeout, kinit_timer_ms = kinit_timer_ms)

                        state = _get_ha_state_from_json(state_response)
                    else:
                        state_response = get_jmx(jmx_uri, connection_timeout)
                        state = _get_ha_state_from_json(state_response)

                    if state == HDFS_NN_STATE_ACTIVE:
                        active_namenodes.append(namenode)

                        # Only check active NN
                        nn_service_rpc_address_key = 'dfs.namenode.servicerpc-address.{0}.{1}'.format(name_service, nn_unique_id)
                        if nn_service_rpc_address_key in hdfs_site:
                            namenode_service_rpc_address = hdfs_site[nn_service_rpc_address_key]
                    pass
                # NOTE(review): the bare except swallows every failure for
                # this NameNode (only logging it), so that NameNode is
                # simply never recorded as active.
                except:
                    logger.exception("Unable to determine the active NameNode")
        pass

        if merge_ha_metrics:
            hostnames = ",".join(namenodes)
            # run only on active NN, no need to run the same requests from the standby
            if host_name not in active_namenodes:
                return (RESULT_STATE_SKIPPED, ['This alert will be reported by another host.'])
        pass

    # Skip service rpc alert if port is not enabled
    if not namenode_service_rpc_address and 'rpc.rpc.datanode' in metric_name:
        return (RESULT_STATE_SKIPPED, ['Service RPC port is not enabled.'])

    # Query window: the last `interval` minutes, expressed in milliseconds.
    get_metrics_parameters = {
        "metricNames": metric_name,
        "appId": app_id,
        "hostname": hostnames,
        "startTime": current_time - interval * 60 * 1000,
        "endTime": current_time,
        "grouped": "true",
    }

    encoded_get_metrics_parameters = urllib.urlencode(get_metrics_parameters)

    ams_monitor_conf_dir = "/etc/ambari-metrics-monitor/conf"
    metric_truststore_ca_certs='ca.pem'
    ca_certs = os.path.join(ams_monitor_conf_dir, metric_truststore_ca_certs)
    # NOTE(review): AMS_HTTP_POLICY is indexed without a presence check;
    # presumably it is always supplied - verify against the alert definition.
    metric_collector_https_enabled = str(configurations[AMS_HTTP_POLICY]) == "HTTPS_ONLY"

    try:
        conn = network.get_http_connection(collector_host, int(collector_port), metric_collector_https_enabled, ca_certs)
        conn.request("GET", AMS_METRICS_GET_URL % encoded_get_metrics_parameters)
        response = conn.getresponse()
        data = response.read()
        conn.close()
    # Any failure while talking to the collector is reported as UNKNOWN;
    # the exception itself is not logged here.
    except Exception:
        return (RESULT_STATE_UNKNOWN, ["Unable to retrieve metrics from the Ambari Metrics service."])

    if response.status != 200:
        return (RESULT_STATE_UNKNOWN, ["Unable to retrieve metrics from the Ambari Metrics service."])

    data_json = json.loads(data)
    metrics = []
    # will get large standard deviation for multiple hosts,
    # if host1 reports small local values, but host2 reports large local values
    for metrics_data in data_json["metrics"]:
        metrics += metrics_data["metrics"].values()
    pass

    if not metrics or len(metrics) < 2:
        number_of_data_points = len(metrics) if metrics else 0
        return (RESULT_STATE_SKIPPED, ["There are not enough data points to calculate the standard deviation ({0} sampled)".format(
            number_of_data_points)])

    # The minimum threshold is scaled to the metric's native units before
    # it is compared with the data points.
    minimum_value_multiplier = 1
    if 'dfs.FSNamesystem.CapacityUsed' in metric_name:
        minimum_value_multiplier = 1024 * 1024  # MB to bytes
    elif 'rpc.rpc.datanode' in metric_name or 'rpc.rpc.client' in metric_name:
        minimum_value_multiplier = 1000  # seconds to millis

    if minimum_value_threshold:
        # Filter out points below min threshold
        metrics = [metric for metric in metrics if metric > (minimum_value_threshold * minimum_value_multiplier)]
        if len(metrics) < 2:
            # The word "seconds" in this label is fixed text; it does not
            # follow metric_units.
            return (RESULT_STATE_OK, ['There were no data points above the minimum threshold of {0} seconds'.format(minimum_value_threshold)])

    mean_value = mean(metrics)
    stddev = sample_standard_deviation(metrics)

    try:
        deviation_percent = stddev / float(mean_value) * 100
    except ZeroDivisionError:
        # should not be a case for this alert
        return (RESULT_STATE_SKIPPED, ["Unable to calculate the standard deviation because the mean value is 0"])

    # log the AMS request
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(""" AMS request parameters - {0} AMS response - {1} Mean - {2} Standard deviation - {3} Percentage standard deviation - {4} """.format(encoded_get_metrics_parameters, data_json, mean_value, stddev, deviation_percent))

    mean_value_localized = locale.format("%.0f", mean_value, grouping=True)

    # (deviation_percent / 100) * mean_value converts the percentage back
    # into the metric's units, i.e. it reproduces the standard deviation.
    variance_value = (deviation_percent / 100.0) * mean_value
    variance_value_localized = locale.format("%.0f", variance_value, grouping=True)

    # check for CRITICAL status
    if deviation_percent > critical_threshold:
        threshold_value = ((critical_threshold / 100.0) * mean_value)
        threshold_value_localized = locale.format("%.0f", threshold_value, grouping=True)

        message = DEVIATION_THRESHOLD_MESSAGE.format(variance_value_localized, metric_units, deviation_percent, mean_value_localized, metric_units, threshold_value_localized, metric_units)
        return (RESULT_STATE_CRITICAL,[message])

    # check for WARNING status
    if deviation_percent > warning_threshold:
        threshold_value = ((warning_threshold / 100.0) * mean_value)
        threshold_value_localized = locale.format("%.0f", threshold_value, grouping = True)

        message = DEVIATION_THRESHOLD_MESSAGE.format(variance_value_localized, metric_units, deviation_percent, mean_value_localized, metric_units, threshold_value_localized, metric_units)
        return (RESULT_STATE_WARNING, [message])

    # return OK status; use the warning threshold as the value to compare against
    threshold_value = ((warning_threshold / 100.0) * mean_value)
    threshold_value_localized = locale.format("%.0f", threshold_value, grouping = True)

    message = DEVIATION_OK_MESSAGE.format(variance_value_localized, metric_units, warning_threshold, mean_value_localized, metric_units, threshold_value_localized, metric_units)
    return (RESULT_STATE_OK,[message])
def execute(configurations={}, parameters={}, host_name=None): """ Returns a tuple containing the result code and a pre-formatted result label Keyword arguments: configurations : a mapping of configuration key to value parameters : a mapping of script parameter key to value host_name : the name of this host where the alert is running :type configurations dict :type parameters dict :type host_name str """ hostnames = host_name current_time = int(time.time()) * 1000 # parse script arguments connection_timeout = CONNECTION_TIMEOUT_DEFAULT if CONNECTION_TIMEOUT_KEY in parameters: connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY]) merge_ha_metrics = MERGE_HA_METRICS_PARAM_DEFAULT if MERGE_HA_METRICS_PARAM_KEY in parameters: merge_ha_metrics = parameters[MERGE_HA_METRICS_PARAM_KEY].lower( ) == 'true' metric_name = METRIC_NAME_PARAM_DEFAULT if METRIC_NAME_PARAM_KEY in parameters: metric_name = parameters[METRIC_NAME_PARAM_KEY] metric_units = METRIC_UNITS_DEFAULT if METRIC_UNITS_PARAM_KEY in parameters: metric_units = parameters[METRIC_UNITS_PARAM_KEY] app_id = APP_ID_PARAM_DEFAULT if APP_ID_PARAM_KEY in parameters: app_id = parameters[APP_ID_PARAM_KEY] interval = INTERVAL_PARAM_DEFAULT if INTERVAL_PARAM_KEY in parameters: interval = _coerce_to_integer(parameters[INTERVAL_PARAM_KEY]) warning_threshold = DEVIATION_WARNING_THRESHOLD_DEFAULT if DEVIATION_WARNING_THRESHOLD_KEY in parameters: warning_threshold = _coerce_to_integer( parameters[DEVIATION_WARNING_THRESHOLD_KEY]) critical_threshold = DEVIATION_CRITICAL_THRESHOLD_DEFAULT if DEVIATION_CRITICAL_THRESHOLD_KEY in parameters: critical_threshold = _coerce_to_integer( parameters[DEVIATION_CRITICAL_THRESHOLD_KEY]) minimum_value_threshold = None if MINIMUM_VALUE_THRESHOLD_KEY in parameters: minimum_value_threshold = _coerce_to_integer( parameters[MINIMUM_VALUE_THRESHOLD_KEY]) #parse configuration if configurations is None: return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.']) # 
hdfs-site is required if not HDFS_SITE_KEY in configurations: return (RESULT_STATE_UNKNOWN, [ '{0} is a required parameter for the script'.format(HDFS_SITE_KEY) ]) if METRICS_COLLECTOR_VIP_HOST_KEY in configurations and METRICS_COLLECTOR_VIP_PORT_KEY in configurations: collector_host = configurations[METRICS_COLLECTOR_VIP_HOST_KEY] collector_port = int(configurations[METRICS_COLLECTOR_VIP_PORT_KEY]) else: # ams-site/timeline.metrics.service.webapp.address is required if not METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY in configurations: return (RESULT_STATE_UNKNOWN, [ '{0} is a required parameter for the script'.format( METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY) ]) else: collector_webapp_address = configurations[ METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY].split(":") if valid_collector_webapp_address(collector_webapp_address): collector_host = select_metric_collector_for_sink( app_id.lower()) collector_port = int(collector_webapp_address[1]) else: return (RESULT_STATE_UNKNOWN, [ '{0} value should be set as "fqdn_hostname:port", but set to {1}' .format( METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY, configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY]) ]) namenode_service_rpc_address = None # hdfs-site is required if not HDFS_SITE_KEY in configurations: return (RESULT_STATE_UNKNOWN, [ '{0} is a required parameter for the script'.format(HDFS_SITE_KEY) ]) hdfs_site = configurations[HDFS_SITE_KEY] if 'dfs.namenode.servicerpc-address' in hdfs_site: namenode_service_rpc_address = hdfs_site[ 'dfs.namenode.servicerpc-address'] # if namenode alert and HA mode if NAMESERVICE_KEY in configurations and app_id.lower() == 'namenode': # hdfs-site is required if not HDFS_SITE_KEY in configurations: return (RESULT_STATE_UNKNOWN, [ '{0} is a required parameter for the script'.format( HDFS_SITE_KEY) ]) if SMOKEUSER_KEY in configurations: smokeuser = configurations[SMOKEUSER_KEY] executable_paths = None if EXECUTABLE_SEARCH_PATHS in configurations: executable_paths = configurations[EXECUTABLE_SEARCH_PATHS] # 
parse script arguments security_enabled = False if SECURITY_ENABLED_KEY in configurations: security_enabled = str( configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE' kerberos_keytab = None if KERBEROS_KEYTAB in configurations: kerberos_keytab = configurations[KERBEROS_KEYTAB] kerberos_principal = None if KERBEROS_PRINCIPAL in configurations: kerberos_principal = configurations[KERBEROS_PRINCIPAL] kerberos_principal = kerberos_principal.replace('_HOST', host_name) # determine whether or not SSL is enabled is_ssl_enabled = False if DFS_POLICY_KEY in configurations: dfs_policy = configurations[DFS_POLICY_KEY] if dfs_policy == "HTTPS_ONLY": is_ssl_enabled = True kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS) name_service = get_name_service_by_hostname(hdfs_site, host_name) # look for dfs.ha.namenodes.foo nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service if not nn_unique_ids_key in hdfs_site: return (RESULT_STATE_UNKNOWN, [ 'Unable to find unique NameNode alias key {0}'.format( nn_unique_ids_key) ]) namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}' jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*" if is_ssl_enabled: namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}' jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*" # now we have something like 'nn1,nn2,nn3,nn4' # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id] # ie dfs.namenode.http-address.hacluster.nn1 namenodes = [] active_namenodes = [] nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',') for nn_unique_id in nn_unique_ids: key = namenode_http_fragment.format(name_service, nn_unique_id) if key in hdfs_site: # use str() to ensure that unicode strings do not have the u' in them value = str(hdfs_site[key]) namenode = str(hdfs_site[key]).split(":")[0] namenodes.append(namenode) try: jmx_uri = jmx_uri_fragment.format(value) if kerberos_principal is not None and 
kerberos_keytab is not None and security_enabled: env = Environment.get_instance() # curl requires an integer timeout curl_connection_timeout = int(connection_timeout) state_response, error_msg, time_millis = curl_krb_request( env.tmp_dir, kerberos_keytab, kerberos_principal, jmx_uri, "ha_nn_health", executable_paths, False, "NameNode High Availability Health", smokeuser, connection_timeout=curl_connection_timeout, kinit_timer_ms=kinit_timer_ms) state = _get_ha_state_from_json(state_response) else: state = _get_state_from_jmx(jmx_uri, connection_timeout) if state == HDFS_NN_STATE_ACTIVE: active_namenodes.append(namenode) # Only check active NN nn_service_rpc_address_key = 'dfs.namenode.servicerpc-address.{0}.{1}'.format( name_service, nn_unique_id) if nn_service_rpc_address_key in hdfs_site: namenode_service_rpc_address = hdfs_site[ nn_service_rpc_address_key] pass except: logger.exception("Unable to determine the active NameNode") pass if merge_ha_metrics: hostnames = ",".join(namenodes) # run only on active NN, no need to run the same requests from the standby if host_name not in active_namenodes: return (RESULT_STATE_SKIPPED, ['This alert will be reported by another host.']) pass # Skip service rpc alert if port is not enabled if not namenode_service_rpc_address and 'rpc.rpc.datanode' in metric_name: return (RESULT_STATE_SKIPPED, ['Service RPC port is not enabled.']) get_metrics_parameters = { "metricNames": metric_name, "appId": app_id, "hostname": hostnames, "startTime": current_time - interval * 60 * 1000, "endTime": current_time, "grouped": "true", } encoded_get_metrics_parameters = urllib.urlencode(get_metrics_parameters) ams_monitor_conf_dir = "/etc/ambari-metrics-monitor/conf" metric_truststore_ca_certs = 'ca.pem' ca_certs = os.path.join(ams_monitor_conf_dir, metric_truststore_ca_certs) metric_collector_https_enabled = str( configurations[AMS_HTTP_POLICY]) == "HTTPS_ONLY" _ssl_version = _get_ssl_version() try: conn = 
network.get_http_connection(collector_host, int(collector_port), metric_collector_https_enabled, ca_certs, ssl_version=_ssl_version) conn.request("GET", AMS_METRICS_GET_URL % encoded_get_metrics_parameters) response = conn.getresponse() data = response.read() conn.close() except Exception, e: logger.info(str(e)) return (RESULT_STATE_UNKNOWN, [ "Unable to retrieve metrics from the Ambari Metrics service." ])