def check(self, opts, args):
    """Compare the cluster's reported health colour with ``self.health``.

    Both ``opts.host`` and ``opts.port`` are mandatory; ``UsageError`` is
    raised when either is missing.  The method fetches ``/_cluster/health``
    through ``get_json`` and always finishes by raising ``Status``:

    * ``critical`` when the reported colour ranks below ``self.health``
      in the ``HEALTH`` table;
    * otherwise the status that ``HEALTH_MAP`` associates with
      ``self.health``.
    """
    if any(value is None for value in (opts.host, opts.port)):
        raise UsageError("Hostname and port must be specified")

    health_url = r'http://%s:%d/_cluster/health' % (opts.host,
                                                    int(opts.port))
    reply = get_json(health_url)

    summary = "Monitoring cluster '%s'" % reply['cluster_name']
    # Nothing is ever added to the notes; they only feed the verbose line.
    notes = []

    # Cluster health colour (green, yellow, red) mapped to its rank.
    reported_rank = HEALTH[reply['status'].lower()]
    perfdata = [
        PerformanceMetric(label='cluster_health', value=reply['status']),
    ]

    if reported_rank < self.health:
        degraded = ("Elasticsearch cluster reports degraded health: '%s'"
                    % reply['status'])
        raise Status('critical', (degraded, ), perfdata)

    verbose = "%s\n\n%s" % (summary, "\n".join(notes))
    raise Status(HEALTH_MAP[self.health], (summary, None, verbose), perfdata)
class ESSplitBrainCheck(NagiosCheck):
    """Detect a split brain by asking each listed node who its master is."""

    def __init__(self):
        NagiosCheck.__init__(self)
        self.add_option('N', 'nodes', 'nodes', 'Cluster nodes')
        self.add_option('P', 'port', 'port', 'The ES port - defaults to 9200')

    def check(self, opts, args):
        """Query every node in ``opts.nodes`` and compare elected masters.

        ``opts.nodes`` is a comma-separated host list; ``opts.port``
        defaults to 9200.  Always exits by raising ``Status``:

        * ``Unknown``  - no node answered;
        * ``Critical`` - the answering nodes do not agree on one master;
        * ``Warning``  - one master, but some nodes did not answer;
        * ``OK``       - one master and every node answered.
        """
        nodes = opts.nodes.split(",")
        port = int(opts.port or '9200')

        masters = []
        responding_nodes = []
        failed_nodes = []

        for node in nodes:
            try:
                response = urllib2.urlopen(
                    r'http://%s:%d/_cluster/state/nodes,master_node/'
                    % (node, port))
                response_body = response.read()
            except (urllib2.HTTPError, urllib2.URLError) as e:
                failed_nodes.append("%s - %s" % (node, e.reason))
                continue

            try:
                state = json.loads(response_body)
            except ValueError:
                # A node answering with garbage is treated like a node
                # that did not answer, instead of aborting the whole check.
                failed_nodes.append("%s - %s" % (node, "invalid JSON"))
                continue

            # Non-object JSON carries no master information; such a node is
            # counted neither as responding nor as failed.
            if isinstance(state, dict):
                cluster_name = str(state['cluster_name'])
                master = str(state['nodes'][state['master_node']]['name'])
                responding_nodes.append(node)
                if master not in masters:
                    masters.append(master)

        if not responding_nodes:
            raise Status('Unknown',
                         "All cluster nodes unresponsive:\r\n"
                         "%s" % (str("\r\n".join(failed_nodes))))

        if len(masters) != 1:
            # cluster_name is bound here: at least one node responded.
            raise Status('Critical',
                         "%d masters (%s) found in %s cluster"
                         % (len(masters), str(", ".join(masters)),
                            cluster_name))

        if not failed_nodes:
            raise Status('OK',
                         "%d/%d nodes have same master"
                         % (len(responding_nodes), len(nodes)))

        raise Status('Warning',
                     "%d/%d nodes have same master\r\n"
                     "%d unresponsive nodes:\r\n%s"
                     % (len(responding_nodes), len(nodes),
                        len(failed_nodes),
                        str("\r\n".join(failed_nodes))))
class ESJVMHealthCheck(NagiosCheck):
    """Nagios check that requests ``/_nodes/stats/jvm`` from an ES node.

    NOTE(review): only option setup and the HTTP request are present in
    this block; the parsed thresholds are not applied to anything here.
    """

    def __init__(self):
        NagiosCheck.__init__(self)
        self.add_option('H', 'host', 'host', 'The cluster to check')
        self.add_option('P', 'port', 'port', 'The ES port - defaults to 9200')
        self.add_option(
            'C', 'critical_threshold', 'critical_threshold',
            'The level at which we throw a CRITICAL alert'
            ' - defaults to 97% of the JVM setting')
        self.add_option(
            'W', 'warning_threshold', 'warning_threshold',
            'The level at which we throw a WARNING alert'
            ' - defaults to 90% of the JVM setting')

    def check(self, opts, args):
        """Fetch JVM statistics from ``opts.host``.

        Port defaults to 9200, critical threshold to 97 and warning
        threshold to 90.

        Raises:
            Status: ``unknown`` when the server answers with an HTTP error,
                ``critical`` when the server cannot be reached.
        """
        host = opts.host
        port = int(opts.port or '9200')
        critical = int(opts.critical_threshold or '97')
        warning = int(opts.warning_threshold or '90')

        try:
            response = urllib2.urlopen(r'http://%s:%d/_nodes/stats/jvm'
                                       % (host, port))
        except urllib2.HTTPError as e:
            raise Status('unknown', ("API failure", None,
                                     "API failure:\n\n%s" % str(e)))
        except urllib2.URLError as e:
            # e.reason may be a socket.error rather than text, and
            # "(e.reason)" is not a tuple; pass a one-element tuple of str.
            raise Status('critical', (str(e.reason), ))
class ESNodesCheck(NagiosCheck):
    """Nagios check that requests ``/_cluster/health`` with optional auth.

    NOTE(review): only option setup and the HTTP request are present in
    this block; the expected node count is parsed but not compared here.
    """

    def __init__(self):
        NagiosCheck.__init__(self)
        self.add_option('E', 'expected_nodes_in_cluster', 'nodes_in_cluster',
                        'This is the expected number of nodes in the cluster')
        self.add_option('H', 'host', 'host', 'The cluster to check')
        self.add_option('P', 'port', 'port', 'The ES port - defaults to 9200')
        self.add_option('u', 'username', 'username',
                        'username to login into ES port')
        self.add_option('p', 'password', 'password',
                        'password to login into ES port')

    def check(self, opts, args):
        """Fetch cluster health from ``opts.host`` (port defaults to 9200).

        HTTP basic authentication is sent only when both a username and a
        password were supplied.

        Raises:
            Status: ``unknown`` when the server answers with an HTTP error,
                ``critical`` when the server cannot be reached.
        """
        host = opts.host
        port = int(opts.port or '9200')
        nodes_in_cluster = int(opts.nodes_in_cluster)
        username = opts.username
        password = opts.password

        try:
            request = urllib2.Request(r'http://%s:%d/_cluster/health'
                                      % (host, port))
            if username and password:
                # b64encode never inserts line breaks, so no newline
                # stripping is needed (unlike encodestring).
                credentials = base64.b64encode(
                    '%s:%s' % (username, password))
                request.add_header("Authorization",
                                   "Basic %s" % credentials)
            response = urllib2.urlopen(request)
        except urllib2.HTTPError as e:
            raise Status('unknown', ("API failure", None,
                                     "API failure:\n\n%s" % str(e)))
        except urllib2.URLError as e:
            # "(e.reason)" is not a tuple; pass a one-element tuple of str.
            raise Status('critical', (str(e.reason), ))
def get_json(uri):
    """Fetch *uri* and return the decoded JSON document.

    Previously the response was opened but never read or returned, so
    callers indexing the result received ``None``.

    Raises:
        Status: ``unknown`` on an HTTP error or an undecodable body,
            ``critical`` when the server cannot be reached.
    """
    import json

    try:
        f = urllib2.urlopen(uri)
    except urllib2.HTTPError as e:
        raise Status(
            'unknown',
            ("API failure: %s" % uri, None, "API failure:\n\n%s" % str(e)))
    except urllib2.URLError as e:
        # The server could be down; make this CRITICAL.
        raise Status('critical', (str(e.reason), ))

    try:
        body = f.read()
    finally:
        f.close()

    try:
        return json.loads(body)
    except ValueError:
        raise Status('unknown', ("API returned nonsense", ))
class ESShardsCheck(NagiosCheck):
    """Nagios check that requests ``/_cluster/health`` with optional auth.

    NOTE(review): only option setup and the HTTP request are present in
    this block; the response is not evaluated here.
    """

    def __init__(self):
        NagiosCheck.__init__(self)
        self.add_option('H', 'host', 'host', 'The cluster to check')
        self.add_option('P', 'port', 'port', 'The ES port - defaults to 9200')
        self.add_option('u', 'username', 'username',
                        'Username for authentication')
        self.add_option('p', 'password', 'password',
                        'password for authentication')

    def check(self, opts, args):
        """Fetch cluster health from ``opts.host`` (port defaults to 9200).

        HTTP basic authentication is sent when neither the username nor
        the password is ``None``.

        Raises:
            Status: ``unknown`` when the server answers with an HTTP error,
                ``critical`` when the server cannot be reached.
        """
        host = opts.host
        port = int(opts.port or '9200')
        username = opts.username
        password = opts.password

        try:
            url = r'http://%s:%d/_cluster/health' % (host, port)
            request = urllib2.Request(url)
            if username is not None and password is not None:
                # b64encode never inserts line breaks, so no newline
                # stripping is needed (unlike encodestring).
                credentials = base64.b64encode(
                    '%s:%s' % (username, password))
                request.add_header("Authorization",
                                   "Basic %s" % credentials)
            response = urllib2.urlopen(request)
        except urllib2.HTTPError as e:
            raise Status('unknown', ("API failure", None,
                                     "API failure:\n\n%s" % str(e)))
        except urllib2.URLError as e:
            # "(e.reason)" is not a tuple; pass a one-element tuple of str.
            raise Status('critical', (str(e.reason), ))
def check(self, opts, args):
    """Alert when the ES ``_count`` for ``ES_QUERY_STRING`` exceeds 3.

    The previous implementation shelled out to curl/grep/cut with a command
    string that was never interpolated (``{host}``, ``{port}`` and
    ``{ES_QUERY_STRING}`` were sent literally), was broken by unescaped
    double quotes, and compared the command's text output with an integer.
    The count is now fetched over HTTP and parsed as JSON.

    Raises:
        Status: ``CRITICAL`` when the count is greater than 3, ``unknown``
            on an HTTP error or an unusable reply, ``critical`` when the
            server cannot be reached.

    NOTE(review): as before, nothing is raised when the count is 3 or less.
    """
    import json
    import urllib
    import urllib2

    host = opts.host
    port = int(opts.port or '9200')

    # '%' stays unescaped so an already percent-encoded query is not
    # encoded twice -- presumably ES_QUERY_STRING is a Lucene query string;
    # verify against its definition.
    query = urllib.quote(str(ES_QUERY_STRING), safe='%:*')
    url = 'http://%s:%d/_count?q=%s' % (host, port, query)

    try:
        response = urllib2.urlopen(url)
    except urllib2.HTTPError as e:
        raise Status('unknown', ("API failure", None,
                                 "API failure:\n\n%s" % str(e)))
    except urllib2.URLError as e:
        raise Status('critical', (str(e.reason), ))

    try:
        body = response.read()
    finally:
        response.close()

    try:
        error_count = int(json.loads(body)['count'])
    except (ValueError, KeyError, TypeError):
        raise Status('unknown', ("API returned nonsense", ))

    if error_count > 3:
        raise Status("CRITICAL",
                     "Occurrences of a string[Handbill not printed] in ES "
                     "reached greater than 3")
def check(self, opts, args):
    """Fetch the cluster list and each cluster's open alerts from Ambari.

    Changes from the previous version: the ``timeout`` option is actually
    passed to ``requests`` (it was read and then ignored, so a hung server
    blocked the check forever), and missing credentials are reported
    explicitly instead of leaving ``cluster_response`` undefined.

    Raises:
        Status: ``Unknown`` for missing credentials, an invalid timeout, a
            transport error, or a reply that is not valid JSON.

    NOTE(review): this block ends after the per-cluster alert fetch; the
    fetched alerts are not evaluated here.
    """
    # Ignore SSL Cert warnings
    requests.packages.urllib3.disable_warnings()

    url = opts.url
    username = opts.username
    password = opts.password

    try:
        # 15 seconds is the default the option's help text advertises.
        timeout = float(opts.timeout) if opts.timeout else 15.0
    except ValueError:
        raise Status('Unknown', ('Invalid timeout: %s' % opts.timeout, ))

    if not (username and password):
        raise Status('Unknown', ('Username and password are required', ))

    cluster_url = r'%s/api/v1/clusters' % (url)
    try:
        cluster_response = requests.get(cluster_url,
                                        auth=(username, password),
                                        verify=False,
                                        timeout=timeout)
    except requests.exceptions.RequestException as e:
        raise Status('Unknown', ('requests error: %s' % e, ))

    try:
        clusters_data = json.loads(cluster_response.content)
    except ValueError:
        raise Status('Unknown', ("API returned nonsense", ))

    clusters = clusters_data['items']
    for cluster in clusters:
        cluster_name = cluster['Clusters']['cluster_name']
        try:
            alerts_response = requests.get(
                '%s/api/v1/clusters/%s/alerts?fields=*&Alert/state.in(WARNING,CRITICAL,UNKNOWN)&sortBy=Alert/state'
                % (url, cluster_name),
                auth=(username, password),
                verify=False,
                timeout=timeout)
        except requests.exceptions.RequestException as e:
            raise Status('Unknown', ('requests error: %s' % e, ))

        try:
            alerts_data = json.loads(alerts_response.content)
        except ValueError:
            raise Status('Unknown', ("API returned nonsense", ))

        alerts = alerts_data['items']
def check(self, opts, args):
    """Request ``/_cluster/health`` from ``opts.host`` (port default 9200).

    Raises:
        Status: ``unknown`` when the server answers with an HTTP error,
            ``critical`` when the server cannot be reached.

    NOTE(review): this block ends after the request; the response is not
    evaluated here.
    """
    host = opts.host
    port = int(opts.port or '9200')

    try:
        response = urllib2.urlopen(r'http://%s:%d/_cluster/health'
                                   % (host, port))
    except urllib2.HTTPError as e:
        raise Status('unknown', ("API failure", None,
                                 "API failure:\n\n%s" % str(e)))
    except urllib2.URLError as e:
        # Connection-level failures previously escaped as a raw URLError;
        # report them the way the other checks in this file do.
        raise Status('critical', (str(e.reason), ))
def check(self, opts, args):
    """Request ``/_nodes/stats/jvm`` from ``opts.host`` (port default 9200).

    The critical threshold defaults to 97 and the warning threshold to 90.

    Raises:
        Status: ``unknown`` when the server answers with an HTTP error,
            ``critical`` when the server cannot be reached.

    NOTE(review): this block ends after the request; the thresholds and
    the response are not used here.
    """
    host = opts.host
    port = int(opts.port or '9200')
    critical = int(opts.critical_threshold or '97')
    warning = int(opts.warning_threshold or '90')

    try:
        response = urllib2.urlopen(r'http://%s:%d/_nodes/stats/jvm'
                                   % (host, port))
    except urllib2.HTTPError as e:
        raise Status('unknown', ("API failure", None,
                                 "API failure:\n\n%s" % str(e)))
    except urllib2.URLError as e:
        # Connection-level failures previously escaped as a raw URLError;
        # report them the way the other checks in this file do.
        raise Status('critical', (str(e.reason), ))
def check(self, opts, args):
    """Request ``/_cluster/health`` from ``opts.host`` with optional auth.

    The port defaults to 9200.  HTTP basic authentication is sent when
    neither the username nor the password is ``None``.

    Raises:
        Status: ``unknown`` when the server answers with an HTTP error,
            ``critical`` when the server cannot be reached.

    NOTE(review): this block ends after the request; the response is not
    evaluated here.
    """
    host = opts.host
    port = int(opts.port or '9200')
    username = opts.username
    password = opts.password

    try:
        url = r'http://%s:%d/_cluster/health' % (host, port)
        request = urllib2.Request(url)
        if username is not None and password is not None:
            # b64encode never inserts line breaks, so no newline stripping
            # is needed (unlike encodestring).
            credentials = base64.b64encode('%s:%s' % (username, password))
            request.add_header("Authorization", "Basic %s" % credentials)
        response = urllib2.urlopen(request)
    except urllib2.HTTPError as e:
        raise Status('unknown', ("API failure", None,
                                 "API failure:\n\n%s" % str(e)))
    except urllib2.URLError as e:
        # Connection-level failures previously escaped as a raw URLError;
        # report them the way the other checks in this file do.
        raise Status('critical', (str(e.reason), ))
class ESJVMHealthCheck(NagiosCheck):
    """Nagios check that requests ``/_nodes/stats/jvm`` with optional auth.

    NOTE(review): only option setup and the HTTP request are present in
    this block; the parsed thresholds are not applied to anything here.
    """

    def __init__(self):
        NagiosCheck.__init__(self)
        self.add_option('H', 'host', 'host', 'The cluster to check')
        self.add_option('P', 'port', 'port', 'The ES port - defaults to 9200')
        self.add_option(
            'C', 'critical_threshold', 'critical_threshold',
            'The level at which we throw a CRITICAL alert'
            ' - defaults to 97% of the JVM setting')
        self.add_option(
            'W', 'warning_threshold', 'warning_threshold',
            'The level at which we throw a WARNING alert'
            ' - defaults to 90% of the JVM setting')
        self.add_option('u', 'username', 'username',
                        'username to login into ES port')
        self.add_option('p', 'password', 'password',
                        'password to login into ES port')

    def check(self, opts, args):
        """Fetch JVM statistics from ``opts.host`` (port defaults to 9200).

        The critical threshold defaults to 97 and the warning threshold to
        90.  HTTP basic authentication is sent only when both a username
        and a password were supplied.

        Raises:
            Status: ``unknown`` when the server answers with an HTTP error,
                ``critical`` when the server cannot be reached.
        """
        host = opts.host
        port = int(opts.port or '9200')
        critical = int(opts.critical_threshold or '97')
        warning = int(opts.warning_threshold or '90')
        username = opts.username
        password = opts.password

        try:
            request = urllib2.Request(r'http://%s:%d/_nodes/stats/jvm'
                                      % (host, port))
            if username and password:
                # b64encode never inserts line breaks, so no newline
                # stripping is needed (unlike encodestring).
                credentials = base64.b64encode(
                    '%s:%s' % (username, password))
                request.add_header("Authorization",
                                   "Basic %s" % credentials)
            response = urllib2.urlopen(request)
        except urllib2.HTTPError as e:
            raise Status('unknown', ("API failure", None,
                                     "API failure: %s" % str(e)))
        except urllib2.URLError as e:
            # "(e.reason)" is not a tuple; pass a one-element tuple of str.
            raise Status('critical', (str(e.reason), ))
def check(self, opts, args):
    """Request ``/_nodes/stats/jvm`` from ``opts.host`` with optional auth.

    The port defaults to 9200, the critical threshold to 97 and the warning
    threshold to 90.  HTTP basic authentication is sent only when both a
    username and a password were supplied.

    Raises:
        Status: ``unknown`` when the server answers with an HTTP error,
            ``critical`` when the server cannot be reached.

    NOTE(review): this block ends after the request; the thresholds and
    the response are not used here.
    """
    host = opts.host
    port = int(opts.port or '9200')
    critical = int(opts.critical_threshold or '97')
    warning = int(opts.warning_threshold or '90')
    username = opts.username
    password = opts.password

    try:
        request = urllib2.Request(r'http://%s:%d/_nodes/stats/jvm'
                                  % (host, port))
        if username and password:
            # b64encode never inserts line breaks, so no newline stripping
            # is needed (unlike encodestring).
            credentials = base64.b64encode('%s:%s' % (username, password))
            request.add_header("Authorization", "Basic %s" % credentials)
        response = urllib2.urlopen(request)
    except urllib2.HTTPError as e:
        raise Status('unknown', ("API failure", None,
                                 "API failure: %s" % str(e)))
    except urllib2.URLError as e:
        # Connection-level failures previously escaped as a raw URLError;
        # report them the way the other checks in this file do.
        raise Status('critical', (str(e.reason), ))
class ESShardsCheck(NagiosCheck):
    """Nagios check that requests ``/_cluster/health`` from an ES node.

    NOTE(review): only option setup and the HTTP request are present in
    this block; the response is not evaluated here.
    """

    def __init__(self):
        NagiosCheck.__init__(self)
        self.add_option('H', 'host', 'host', 'The cluster to check')
        self.add_option('P', 'port', 'port', 'The ES port - defaults to 9200')

    def check(self, opts, args):
        """Fetch cluster health from ``opts.host`` (port defaults to 9200).

        Raises:
            Status: ``unknown`` when the server answers with an HTTP error,
                ``critical`` when the server cannot be reached.
        """
        host = opts.host
        port = int(opts.port or '9200')

        try:
            response = urllib2.urlopen(r'http://%s:%d/_cluster/health'
                                       % (host, port))
        except urllib2.HTTPError as e:
            raise Status('unknown', ("API failure", None,
                                     "API failure:\n\n%s" % str(e)))
        except urllib2.URLError as e:
            # e.reason may be a socket.error rather than text, and
            # "(e.reason)" is not a tuple; pass a one-element tuple of str.
            raise Status('critical', (str(e.reason), ))
def check(self, opts, args):
    """Report the cluster colour together with a master/slave summary.

    ``opts.host`` is mandatory (CRITICAL otherwise); the port defaults to
    9200.  ``/_cluster/health`` and ``/_cluster/state`` are fetched with
    ``requests``; any exception while doing so is reported as CRITICAL.

    With two or more data nodes the master entry is removed from the
    state's node table and every remaining node name is appended to
    ``self.slave``; with fewer, ``self.slave`` is set to ``None``.

    The method always exits by raising ``Status``: CRITICAL for red,
    WARNING for yellow, OK for green, UNKNOWN for anything else.
    """
    if not opts.host:
        raise Status(Status.EXIT_CRITICAL, 'plz set host')
    host = opts.host
    port = int(opts.port or '9200')

    try:
        cluster_status = requests.get(
            'http://%s:%d/_cluster/health' % (host, port)).json()
        cluster_info = requests.get(
            'http://%s:%d/_cluster/state' % (host, port)).json()
    except Exception:
        raise Status(Status.EXIT_CRITICAL, "can't connect to ES server")

    multi_node = cluster_status['number_of_data_nodes'] >= 2
    if multi_node:
        node_count = int(cluster_status['number_of_data_nodes'])
        master_key = cluster_info['master_node']
        master_name = cluster_info['nodes'][master_key]['name']
        unassigned = cluster_status['unassigned_shards']
        # Drop the master so that only the other nodes remain to be listed.
        cluster_info['nodes'].pop(master_key)
        name = cluster_status['cluster_name']
        for other in cluster_info['nodes'].values():
            self.slave.append(other['name'])
    else:
        node_count = 1
        master_key = cluster_info['master_node']
        unassigned = cluster_status['unassigned_shards']
        master_name = cluster_info['nodes'][master_key]['name']
        self.slave = None
        name = cluster_status['cluster_name']

    info_master = (
        "cluster name: {} nodes : {}, master: {} slave: {} "
        "unassigned_shards: {}"
    ).format(name, node_count, master_name, self.slave, unassigned)

    colour = cluster_status['status'].lower()
    if colour == 'red':
        raise Status(Status.EXIT_CRITICAL, "RED {}".format(info_master))
    if colour == 'yellow':
        raise Status(Status.EXIT_WARNING, "yellow {}".format(info_master))
    if colour == 'green':
        raise Status(Status.EXIT_OK, "green {}".format(info_master))
    raise Status(Status.EXIT_UNKNOWN, ("no info",))
try: response = urllib2.urlopen(r'http://%s:%d/_cluster/health' % (host, port)) except urllib2.HTTPError, e: raise Status('unknown', ("API failure", None, "API failure:\n\n%s" % str(e))) except urllib2.URLError, e: raise Status('critical', (e.reason)) response_body = response.read() try: es_cluster_health = json.loads(response_body) except ValueError: raise Status('unknown', ("API returned nonsense", )) unassigned_shards = es_cluster_health['unassigned_shards'] if es_cluster_health['unassigned_shards'] != unassigned_shards: raise Status( 'CRITICAL', "There are '%s' unassigned shards in the cluster" % (unassigned_shards)) else: raise Status('OK', "All shards in the cluster are currently assigned") if __name__ == "__main__": ESShardsCheck().run()
node_esid_map[node] = n node_shard_map[node] = [] if len(failure_domain) > 0: node_location_map[node] = tuple() try: node_location_map[node] = (tuple( map(lambda a: attrs[a], failure_domain))) except KeyError, e: # Nodes that do not store shards (e.g.: 'client' # nodes) cannot be expected to have been configured # with locational attributes. if 'data' not in attrs or booleanise(attrs['data']): missing_attr = e.args[0] raise Status('warning', ("Node '%s' missing location " "attribute '%s'" % (name, missing_attr), )) # Build index maps: # # - name_index_map # indices = es_state['metadata']['indices'] n_indices = len(indices) n_closed_indices = 0 for i in indices: if indices[i]["state"] == "close": n_closed_indices += 1 continue idx_stns = indices[i]['settings'] if version(es_version) < version("1.0.0"):
def check(self, opts, args):
    """Summarise cluster health and export shard/node counts as perfdata.

    Host defaults to ``localhost`` and port to 9200.  ``opts.master_nodes``,
    when given, must parse as an integer of at least 1, otherwise
    ``UsageError`` is raised.  The root document and ``/_cluster/health``
    are fetched through ``get_json``; the health colour is stored on
    ``self.health`` via the ``HEALTH`` table.

    The method always exits by raising ``Status`` with the status that
    ``HEALTH_MAP`` gives for ``self.health``, a one-line summary and the
    collected metrics.
    """
    host = opts.host or "localhost"
    port = int(opts.port or '9200')

    # Parsed for parity with the options; not consulted further below.
    failure_domain = []
    domain_spec = opts.failure_domain
    if isinstance(domain_spec, str) and len(domain_spec) > 0:
        failure_domain.extend(domain_spec.split(","))

    if opts.master_nodes is not None:
        try:
            master_count_ok = int(opts.master_nodes) >= 1
        except ValueError:
            master_count_ok = False
        if not master_count_ok:
            raise UsageError("Argument to -m/--master-nodes must "
                             "be a natural number")

    #
    # Data retrieval
    #

    # The "about" document carries the ES version, which allows for
    # version-specific API handling.
    es_about = get_json(r'http://%s:%d/' % (host, port))
    es_version = es_about['version']['number']

    # /_cluster/health is a short digest of the cluster state; the main
    # item of interest is the health colour ES assigns to itself.
    es_health = get_json(r'http://%s:%d/_cluster/health' % (host, port))
    self.health = HEALTH[es_health['status'].lower()]

    # The full cluster state and per-node stats requests are disabled in
    # this version of the check, so only health-derived figures follow.
    n_nodes = es_health['number_of_nodes']
    n_dnodes = es_health['number_of_data_nodes']
    n_active_shards = es_health['active_shards']
    n_relocating_shards = es_health['relocating_shards']
    n_initialising_shards = es_health['initializing_shards']
    n_unassigned_shards = es_health['unassigned_shards']
    n_shards = sum((n_active_shards, n_relocating_shards,
                    n_initialising_shards, n_unassigned_shards))

    # Cluster-wide metrics: every node monitored with this plugin should
    # report the same figures under these labels.
    metrics = [
        [label, value] for label, value in (
            ("cluster_nodes", n_nodes),
            ("cluster_data_nodes", n_dnodes),
            ("cluster_active_shards", n_active_shards),
            ("cluster_relocating_shards", n_relocating_shards),
            ("cluster_initialising_shards", n_initialising_shards),
            ("cluster_unassigned_shards", n_unassigned_shards),
            ("cluster_total_shards", n_shards),
        )
    ]

    #
    # Assertions
    #
    detail = []  # Collect error messages into this list
    msg = "Monitoring cluster '%s', status %s" % (
        es_health['cluster_name'], es_health['status'])

    # No assertion of our own fired, so the reported status is whatever ES
    # says about itself; there is no further detail to give the operator.
    verbose = "%s %s" % (msg, " ".join(detail))
    raise Status(HEALTH_MAP[self.health], (msg, None, verbose), metrics)
try: response = urllib2.urlopen(r'http://%s:%d/_nodes/stats/jvm' % (host, port)) except urllib2.HTTPError, e: raise Status('unknown', ("API failure", None, "API failure:\n\n%s" % str(e))) except urllib2.URLError, e: raise Status('critical', (e.reason)) response_body = response.read() try: nodes_jvm_data = json.loads(response_body) except ValueError: raise Status('unknown', ("API returned nonsense", )) criticals = 0 critical_details = [] warnings = 0 warning_details = [] nodes = nodes_jvm_data['nodes'] for node in nodes: jvm_percentage = nodes[node]['jvm']['mem']['heap_used_percent'] node_name = nodes[node]['host'] if int(jvm_percentage) >= critical: criticals = criticals + 1 critical_details.append( "%s currently running at %s%% JVM mem " % (node_name, jvm_percentage))
try: response = urllib2.urlopen(r'http://%s:%d/_cluster/health' % (host, port)) except urllib2.HTTPError, e: raise Status('unknown', ("API failure", None, "API failure:\n\n%s" % str(e))) except urllib2.URLError, e: raise Status('critical', (e.reason)) response_body = response.read() try: es_cluster_health = json.loads(response_body) except ValueError: raise Status('unknown', ("API returned nonsense", )) cluster_status = es_cluster_health['status'].lower() if cluster_status == 'red': raise Status("CRITICAL", "Cluster status is currently reporting as " "Red") elif cluster_status == 'yellow': raise Status("WARNING", "Cluster status is currently reporting as " "Yellow") else: raise Status("OK", "Cluster status is currently reporting as Green")
response = urllib2.urlopen(r'http://%s:%d/_nodes/stats/jvm' % (host, port)) else: response = urllib2.urlopen( r'http://%s:%d/_nodes/%s/stats/jvm' % (host, port, node)) except urllib2.HTTPError, e: raise Status('unknown', ("API failure", None, "API failure:\n\n%s" % str(e))) except urllib2.URLError, e: raise Status('critical', (e.reason)) response_body = response.read() try: nodes_jvm_data = json.loads(response_body) except ValueError: raise Status('unknown', ("API returned nonsense", )) criticals = 0 critical_details = [] critical_message = [] warnings = 0 warning_details = [] warning_message = [] message = [] nodes = nodes_jvm_data['nodes'] if len(nodes) == 0: raise Status('unknown', ("The Node %s is not a cluster member" % node, )) for node in nodes:
class AmbariAlertsHealthCheck(NagiosCheck):
    """Nagios check that summarises open Ambari alerts over all clusters."""

    def __init__(self):
        NagiosCheck.__init__(self)
        self.useragent = ''
        self.add_option('U', 'url', 'url', 'URL to Ambari')
        self.add_option('t', 'timeout', 'timeout',
                        'Timeout in seconds (default 15s)')
        self.add_option('u', 'username', 'username', 'Username')
        self.add_option('p', 'password', 'password', 'Password')

    def _get_json(self, url, auth, timeout):
        """GET *url* and decode the body as JSON, mapping errors to Status."""
        try:
            response = requests.get(url, auth=auth, verify=False,
                                    timeout=timeout)
        except requests.exceptions.RequestException as e:
            raise Status('Unknown', ('requests error: %s' % e, ))
        try:
            return json.loads(response.content)
        except ValueError:
            raise Status('Unknown', ("API returned nonsense", ))

    def check(self, opts, args):
        """Collect WARNING/CRITICAL/UNKNOWN alerts and report the worst.

        Changes from the previous version:

        * alerts of *every* cluster are gathered before a verdict is made
          (only a single cluster's alerts were ever evaluated);
        * the ``timeout`` option is honoured (default 15 seconds, as the
          option help states) instead of being read and ignored;
        * missing credentials produce an explicit ``Unknown`` status.

        The method always exits by raising ``Status``: ``Critical`` if any
        critical alert exists, else ``Warning``, else ``Unknown``, else
        ``OK``.
        """
        # Ignore SSL Cert warnings
        requests.packages.urllib3.disable_warnings()

        url = opts.url
        username = opts.username
        password = opts.password

        try:
            timeout = float(opts.timeout) if opts.timeout else 15.0
        except ValueError:
            raise Status('Unknown',
                         ('Invalid timeout: %s' % opts.timeout, ))

        if not (username and password):
            raise Status('Unknown',
                         ('Username and password are required', ))
        auth = (username, password)

        clusters_data = self._get_json(r'%s/api/v1/clusters' % (url),
                                       auth, timeout)

        alerts = []
        for cluster in clusters_data['items']:
            cluster_name = cluster['Clusters']['cluster_name']
            alerts_data = self._get_json(
                '%s/api/v1/clusters/%s/alerts?fields=*&Alert/state.in(WARNING,CRITICAL,UNKNOWN)&sortBy=Alert/state'
                % (url, cluster_name),
                auth, timeout)
            alerts.extend(alerts_data['items'])

        # One list of formatted lines per alert state of interest.
        details = {'CRITICAL': [], 'WARNING': [], 'UNKNOWN': []}
        for alert_data in alerts:
            alert = alert_data['Alert']
            bucket = details.get(alert['state'])
            if bucket is not None:
                bucket.append(
                    "Cluster %s [%s]: %s"
                    % (alert['cluster_name'], alert['component_name'],
                       alert['text']))

        if details['CRITICAL']:
            raise Status(
                "Critical", "There are critical errors: \r\n%s"
                % (str("\r\n".join(details['CRITICAL']))))
        if details['WARNING']:
            raise Status(
                "Warning", "There are warnings: \r\n%s"
                % (str("\r\n".join(details['WARNING']))))
        if details['UNKNOWN']:
            raise Status(
                "Unknown", "There are unknowns: \r\n%s"
                % (str("\r\n".join(details['UNKNOWN']))))
        raise Status("OK", "No alerts.")
class ESSplitBrainCheck(NagiosCheck):
    """Detect a split brain by asking each listed node who its master is."""

    def __init__(self):
        NagiosCheck.__init__(self)
        self.add_option('N', 'nodes', 'nodes', 'Cluster nodes')
        self.add_option('P', 'port', 'port', 'The ES port - defaults to 9200')
        self.add_option('u', 'username', 'username',
                        'Username for authentication')
        self.add_option('p', 'password', 'password',
                        'password for authentication')

    def check(self, opts, args):
        """Query every node in ``opts.nodes`` and compare elected masters.

        ``opts.nodes`` is a comma-separated host list; ``opts.port``
        defaults to 9200.  HTTP basic authentication is sent when neither
        the username nor the password is ``None``.  Always exits by
        raising ``Status``:

        * ``Unknown``  - no node answered;
        * ``Critical`` - the answering nodes do not agree on one master;
        * ``OK``       - one master (unresponsive nodes, if any, are
          listed in the message).
        """
        nodes = opts.nodes.split(",")
        port = int(opts.port or '9200')
        username = opts.username
        password = opts.password

        masters = []
        responding_nodes = []
        failed_nodes = []

        for node in nodes:
            try:
                url = r'http://%s:%d/_cluster/state/nodes,master_node/' % (
                    node, port)
                request = urllib2.Request(url)
                if username is not None and password is not None:
                    # b64encode never inserts line breaks, so no newline
                    # stripping is needed (unlike encodestring).
                    credentials = base64.b64encode(
                        '%s:%s' % (username, password))
                    request.add_header("Authorization",
                                       "Basic %s" % credentials)
                response = urllib2.urlopen(request)
                response_body = response.read()
            except (urllib2.HTTPError, urllib2.URLError) as e:
                failed_nodes.append("%s - %s" % (node, e.reason))
                continue

            try:
                state = json.loads(response_body)
            except ValueError:
                # A node answering with garbage is treated like a node
                # that did not answer, instead of aborting the whole check.
                failed_nodes.append("%s - %s" % (node, "invalid JSON"))
                continue

            # Non-object JSON carries no master information; such a node is
            # counted neither as responding nor as failed.
            if isinstance(state, dict):
                cluster_name = str(state['cluster_name'])
                master = str(state['nodes'][state['master_node']]['name'])
                responding_nodes.append(node)
                if master not in masters:
                    masters.append(master)

        if not responding_nodes:
            raise Status(
                'Unknown', "All cluster nodes unresponsive:\r\n"
                "%s" % (str("\r\n".join(failed_nodes))))

        if len(masters) != 1:
            # cluster_name is bound here: at least one node responded.
            raise Status(
                'Critical', "%d masters (%s) found in %s cluster" %
                (len(masters), str(", ".join(masters)), cluster_name))

        if not failed_nodes:
            raise Status(
                'OK', "%d/%d nodes have same master" %
                (len(responding_nodes), len(nodes)))

        raise Status(
            'OK', "%d/%d nodes have same master\r\n"
            "%d unresponsive nodes:\r\n%s" %
            (len(responding_nodes), len(nodes), len(failed_nodes),
             str("\r\n".join(failed_nodes))))
if opts.warn and heap_usage_percent >= float(opts.warn): raise Status( "warning", "JVM Heap Usage: %d%% %s/%s" % (heap_usage_percent, str(heap_used), str(heap_max)), perfdata) raise Status( "ok", "JVM Heap Usage: %d%% %s/%s" % (heap_usage_percent, str(heap_used), str(heap_max)), perfdata) def get_json(uri): try: f = urllib2.urlopen(uri) except urllib2.HTTPError, e: raise Status('unknown', ("API failure", None, "API failure:\n\n%s" % str(e))) except urllib2.URLError, e: # The server could be down; make this CRITICAL. raise Status('critical', (e.reason, )) body = f.read() try: j = json.loads(body) except ValueError: raise Status('unknown', ("API returned nonsense", )) return j if __name__ == '__main__': ElasticSearchJvmCheck().run()
def check(self, opts, args):
    """Report JVM heap usage of the local ES node, with memory/GC perfdata.

    Host defaults to ``localhost`` and port to 9200.  Node info, node
    stats and JVM stats are fetched through ``get_json``.  The method
    always exits by raising ``Status``: ``critical`` when heap usage is at
    or above ``opts.crit``, ``warning`` when at or above ``opts.warn``,
    otherwise ``ok``.

    Fix: when ES does not supply the human-readable ``heap_used`` /
    ``heap_max`` fields, the fallback label divided the byte count by
    1048576 (mebibytes) while labelling the result ``gb``; it now divides
    by 1024**3.
    """
    bytes_per_gb = 1073741824.0

    host = opts.host or "localhost"
    port = int(opts.port or '9200')

    #
    # Data retrieval
    #
    es_node = get_json(r'http://%s:%d/_nodes/_local/?all=true' %
                       (host, port))
    es_stats = get_json(r'http://%s:%d/_nodes/_local/'
                        'stats?all=true' % (host, port))
    # list(...) instead of .keys()[0] so this also works where keys() is
    # a view rather than a list.
    myid = list(es_stats['nodes'])[0]
    es_node_jvm = get_json(r'http://%s:%d/_nodes/%s/stats?jvm=true' %
                           (host, port, myid))

    mem_stats = es_stats['nodes'][myid]['jvm']['mem']
    mem_info = es_node['nodes'][myid]['jvm']['mem']
    gc_stats = es_node_jvm["nodes"][myid]["jvm"]["gc"]

    #
    # Perfdata
    #
    perfdata = []

    def dict2perfdata(base, metrics):
        """Append one PerformanceMetric per resolvable dotted path."""
        for metric in metrics:
            if len(metric) == 2:
                label, path = metric
                unit = ""
            elif len(metric) > 2:
                label, path, unit = metric
            else:
                continue

            value = base
            for key in path.split("."):
                if value is None:
                    break
                try:
                    value = value[key]
                except KeyError:
                    # Path not present in this ES version; skip the metric.
                    value = None
                    break

            if value is not None:
                perfdata.append(
                    PerformanceMetric(label=label, value=value, unit=unit))

    dict2perfdata(mem_stats, [
        ["heap_committed", 'heap_committed_in_bytes', "B"],
        ["heap_used", 'heap_used_in_bytes', "B"],
        ["non_heap_committed", 'non_heap_committed_in_bytes', "B"],
        ["non_heap_used", 'non_heap_used_in_bytes', "B"],
        ["old_gen_used", 'pools.CMS Old Gen.used_in_bytes', "B"],
        ["perm_gen_used", 'pools.CMS Perm Gen.used_in_bytes', "B"],
        ["eden_used", 'pools.Par Eden Space.used_in_bytes', "B"],
        ["survivor_used", 'pools.Par Survivor Space.used_in_bytes', "B"],
        ["code_cache", 'pools.Code Cache.used_in_bytes', "B"],
    ])

    dict2perfdata(mem_info, [
        ["heap_max", 'heap_max_in_bytes', "B"],
        ["non_heap_max", 'non_heap_max_in_bytes', "B"],
        ["direct_max", 'direct_max_in_bytes', "B"],
    ])

    for collector in gc_stats["collectors"].keys():
        # Full collector names are too long for RRD; contract them
        # to initialisms.
        collector_slug = "".join(
            [c for c in collector if 'A' <= c <= 'Z']).lower()
        dict2perfdata(gc_stats, [
            ["%s_collections" % collector_slug,
             "collectors.%s.collection_count" % collector, "c"],
            ["%s_time_ms" % collector_slug,
             "collectors.%s.time_in_millis" % collector, "c"],
        ])

    dict2perfdata(gc_stats, [
        ["collections", "collection_count", "c"],
        ["collection_time_ms", "collection_time_in_millis", "c"],
    ])

    heap_used_b = int(mem_stats['heap_used_in_bytes'])
    heap_max_b = int(mem_info['heap_max_in_bytes'])

    # ES 1.0 changed their keys, dropping heap_used & heap_max. If they
    # don't exist, recreate them from the byte values in the form '1.3gb'.
    try:
        heap_used = mem_stats['heap_used']
        heap_max = mem_info['heap_max']
    except KeyError:
        heap_used = str(
            round(mem_stats['heap_used_in_bytes'] / bytes_per_gb, 2)) + 'gb'
        heap_max = str(
            round(mem_info['heap_max_in_bytes'] / bytes_per_gb, 2)) + 'gb'

    heap_usage_percent = (float(heap_used_b) / float(heap_max_b)) * 100
    summary = "JVM Heap Usage: %d%% %s/%s" % (
        heap_usage_percent, str(heap_used), str(heap_max))

    if opts.crit and heap_usage_percent >= float(opts.crit):
        raise Status("critical", summary, perfdata)

    if opts.warn and heap_usage_percent >= float(opts.warn):
        raise Status("warning", summary, perfdata)

    raise Status("ok", summary, perfdata)
def check(self, opts, args):
    """Unconditionally raise a ``critical`` Status carrying ``self.msg``."""
    outcome = Status('critical', self.msg)
    raise outcome
def check(self, opts, args):
    """Unconditionally raise an ``ok`` Status carrying ``self.msg``."""
    outcome = Status('ok', self.msg)
    raise outcome
def check(self, opts, args):
    """Unconditionally raise a ``critical`` Status with tiered messages.

    The message tuple holds ``self.msg`` as the first entry, ``None`` as
    the second, and ``self.msg`` followed by a newline and ``self.lines()``
    as the third.  A single ``flibs`` performance metric with value 10 is
    attached.
    """
    headline = self.msg
    expanded = self.msg + "\n" + self.lines()
    messages = (headline, None, expanded)
    raise Status('critical', messages, PerformanceMetric('flibs', 10))
try: response = urllib2.urlopen(r'http://%s:%d/_cluster/health' % (host, port)) except urllib2.HTTPError, e: raise Status('unknown', ("API failure", None, "API failure:\n\n%s" % str(e))) except urllib2.URLError, e: raise Status('critical', (e.reason)) response_body = response.read() try: es_cluster_health = json.loads(response_body) except ValueError: raise Status('unknown', ("API returned nonsense", )) active_cluster_nodes = es_cluster_health['number_of_nodes'] if active_cluster_nodes != nodes_in_cluster: raise Status( 'CRITICAL', "Number of nodes in the cluster is " "reporting as '%s' but we expected '%s'" % (active_cluster_nodes, nodes_in_cluster)) else: raise Status( 'OK', "Number of nodes in the cluster is '%s' as " "expected" % (nodes_in_cluster)) if __name__ == "__main__":