コード例 #1
0
    def check(self, opts, args):
        """Compare the cluster's reported health with the expected level.

        Fetches ``/_cluster/health`` from ``opts.host``/``opts.port`` through
        ``get_json`` and always finishes by raising:

        * ``UsageError`` when the host or the port option is missing;
        * a 'critical' ``Status`` when the reported health ranks below
          ``self.health``;
        * otherwise a ``Status`` whose level is ``HEALTH_MAP[self.health]``.
        """
        if any(value is None for value in (opts.host, opts.port)):
            raise UsageError("Hostname and port must be specified")

        health = get_json(
            r'http://%s:%d/_cluster/health' % (opts.host, int(opts.port)))

        summary = "Monitoring cluster '%s'" % health['cluster_name']
        # No detail lines are gathered in this check, so the long output
        # below ends with an empty section.
        details = []

        ## Cluster Health Status (green, yellow, red)
        reported = health['status']
        rank = HEALTH[reported.lower()]
        perfdata = [PerformanceMetric(label='cluster_health', value=reported)]

        if rank < self.health:
            degraded = ("Elasticsearch cluster reports degraded health: '%s'"
                        % reported)
            raise Status('critical', (degraded, ), perfdata)

        long_output = "%s\n\n%s" % (summary, "\n".join(details))
        raise Status(HEALTH_MAP[self.health],
                     (summary, None, long_output),
                     perfdata)
コード例 #2
0
class ESSplitBrainCheck(NagiosCheck):
    """Detect a split brain by asking every listed node who its master is."""

    def __init__(self):
        """Register the node list and port command-line options."""
        NagiosCheck.__init__(self)

        self.add_option('N', 'nodes', 'nodes', 'Cluster nodes')
        self.add_option('P', 'port', 'port', 'The ES port - defaults to 9200')

    def check(self, opts, args):
        """Query each node's cluster state and compare the elected masters.

        ``opts.nodes`` is a comma-separated list of node addresses and
        ``opts.port`` the HTTP port (9200 when unset).  The method always
        ends by raising a ``Status``:

        * 'Unknown'  - no node returned a usable answer;
        * 'Critical' - the answering nodes do not agree on one master;
        * 'Warning'  - one master, but some nodes could not be queried;
        * 'OK'       - one master and every node answered.
        """
        nodes = opts.nodes.split(",")
        port = int(opts.port or '9200')
        masters = []
        responding_nodes = []
        failed_nodes = []

        for node in nodes:
            try:
                response = urllib2.urlopen(
                    r'http://%s:%d/_cluster/state/nodes,master_node/'
                    % (node, port))
                state = json.loads(response.read())
            except (urllib2.HTTPError, urllib2.URLError) as e:
                failed_nodes.append("%s - %s" % (node, e.reason))
                continue
            except ValueError as e:
                # A body that is not JSON means this node gave no usable
                # answer; record it instead of aborting the whole check.
                failed_nodes.append("%s - %s" % (node, e))
                continue

            if isinstance(state, dict):
                cluster_name = str(state['cluster_name'])
                master = str(state['nodes'][state['master_node']]['name'])
                responding_nodes.append(node)
                if master not in masters:
                    masters.append(master)

        if not responding_nodes:
            raise Status('Unknown',
                         "All cluster nodes unresponsive:\r\n"
                         "%s" % (str("\r\n".join(failed_nodes))))

        if len(masters) != 1:
            # cluster_name is bound here: at least one node responded.
            raise Status('Critical', "%d masters (%s) found in %s cluster"
                         % (len(masters), str(", ".join(masters)),
                            cluster_name))

        if not failed_nodes:
            raise Status('OK', "%d/%d nodes have same master"
                         % (len(responding_nodes), len(nodes)))

        raise Status('Warning', "%d/%d nodes have same master\r\n"
                                "%d unresponsive nodes:\r\n%s"
                                % (len(responding_nodes),
                                   len(nodes),
                                   len(failed_nodes),
                                   str("\r\n".join(failed_nodes))))
コード例 #3
0
class ESJVMHealthCheck(NagiosCheck):
    """Nagios check that requests per-node JVM statistics from Elasticsearch."""

    def __init__(self):
        """Register the host, port and threshold command-line options."""
        NagiosCheck.__init__(self)

        option_specs = (
            ('H', 'host', 'host', 'The cluster to check'),
            ('P', 'port', 'port', 'The ES port - defaults to 9200'),
            ('C', 'critical_threshold', 'critical_threshold',
             'The level at which we throw a CRITICAL alert'
             ' - defaults to 97% of the JVM setting'),
            ('W', 'warning_threshold', 'warning_threshold',
             'The level at which we throw a WARNING alert'
             ' - defaults to 90% of the JVM setting'),
        )
        for spec in option_specs:
            self.add_option(*spec)

    def check(self, opts, args):
        """Request ``/_nodes/stats/jvm`` from the configured node.

        An HTTP error response is raised as an 'unknown' ``Status``; any
        other URL-level failure is raised as 'critical' with the failure
        reason.  Port and thresholds default to 9200, 97 and 90.
        """
        host = opts.host
        port = int(opts.port or '9200')
        critical = int(opts.critical_threshold or '97')
        warning = int(opts.warning_threshold or '90')

        stats_url = r'http://%s:%d/_nodes/stats/jvm' % (host, port)
        try:
            response = urllib2.urlopen(stats_url)
        except urllib2.HTTPError as err:
            long_output = "API failure:\n\n%s" % str(err)
            raise Status('unknown', ("API failure", None, long_output))
        except urllib2.URLError as err:
            raise Status('critical', err.reason)
コード例 #4
0
class ESNodesCheck(NagiosCheck):
    """Nagios check that requests cluster health for a node-count comparison."""

    def __init__(self):
        """Register expected-node-count, host, port and login options."""
        NagiosCheck.__init__(self)

        option_specs = (
            ('E', 'expected_nodes_in_cluster', 'nodes_in_cluster',
             'This is the expected number of nodes in the cluster'),
            ('H', 'host', 'host', 'The cluster to check'),
            ('P', 'port', 'port', 'The ES port - defaults to 9200'),
            ('u', 'username', 'username', 'username to login into ES port'),
            ('p', 'password', 'password', 'password to login into ES port'),
        )
        for spec in option_specs:
            self.add_option(*spec)

    def check(self, opts, args):
        """Request ``/_cluster/health``, using Basic auth when configured.

        The Authorization header is only sent when both a username and a
        password are non-empty.  An HTTP error response is raised as an
        'unknown' ``Status``; any other URL-level failure as 'critical'.
        """
        host = opts.host
        port = int(opts.port or '9200')
        nodes_in_cluster = int(opts.nodes_in_cluster)
        username = opts.username
        password = opts.password

        url = urllib2.Request(r'http://%s:%d/_cluster/health' % (host, port))
        if username and password:
            credentials = '%s:%s' % (username, password)
            # encodestring appends (and may insert) newlines; strip them so
            # the header value stays on a single line.
            base64string = base64.encodestring(credentials).replace('\n', '')
            url.add_header("Authorization", "Basic %s" % base64string)

        try:
            response = urllib2.urlopen(url)
        except urllib2.HTTPError as err:
            long_output = "API failure:\n\n%s" % str(err)
            raise Status('unknown', ("API failure", None, long_output))
        except urllib2.URLError as err:
            raise Status('critical', err.reason)
コード例 #5
0
def get_json(uri):
    """Open *uri*; an HTTP error response becomes an 'unknown' ``Status``.

    NOTE(review): only the request step is present in this copy of the
    function - nothing is read from the response or returned here.
    """
    try:
        f = urllib2.urlopen(uri)
    except urllib2.HTTPError as err:
        raise Status('unknown',
                     ("API failure: %s" % uri,
                      None,
                      "API failure:\n\n%s" % str(err)))
class ESShardsCheck(NagiosCheck):
    """Nagios check that requests cluster health, optionally authenticated."""

    def __init__(self):
        """Register host, port and authentication options."""
        NagiosCheck.__init__(self)

        option_specs = (
            ('H', 'host', 'host', 'The cluster to check'),
            ('P', 'port', 'port', 'The ES port - defaults to 9200'),
            ('u', 'username', 'username', 'Username for authentication'),
            ('p', 'password', 'password', 'password for authentication'),
        )
        for spec in option_specs:
            self.add_option(*spec)

    def check(self, opts, args):
        """Request ``/_cluster/health`` from the configured node.

        A Basic Authorization header is added when both username and
        password were given (neither is ``None``).  An HTTP error response
        is raised as an 'unknown' ``Status``; any other URL-level failure
        as 'critical'.
        """
        host = opts.host
        port = int(opts.port or '9200')
        username = opts.username
        password = opts.password

        request = urllib2.Request(
            r'http://%s:%d/_cluster/health' % (host, port))
        have_credentials = username is not None and password is not None
        if have_credentials:
            credentials = '%s:%s' % (username, password)
            base64string = base64.encodestring(credentials).replace('\n', '')
            request.add_header("Authorization", "Basic %s" % base64string)

        try:
            response = urllib2.urlopen(request)
        except urllib2.HTTPError as err:
            long_output = "API failure:\n\n%s" % str(err)
            raise Status('unknown', ("API failure", None, long_output))
        except urllib2.URLError as err:
            raise Status('critical', err.reason)
コード例 #7
0
    def check(self, opts, args):
        """Alert when the query matches more than three documents.

        Runs ``curl`` against the ``/_count`` endpoint of ``opts.host`` on
        ``opts.port`` (9200 when unset), extracts the ``count`` field with
        grep/cut and raises a 'CRITICAL' ``Status`` when it exceeds 3.  An
        'UNKNOWN' ``Status`` is raised when no count could be read.
        """
        host = opts.host
        port = int(opts.port or '9200')

        # NOTE(review): ES_QUERY_STRING is the placeholder name used in the
        # command template; assumed to be a module-level constant holding
        # the query - confirm it is defined in this file.
        command = (
            "curl -XGET 'http://{host}:{port}/_count?q={query}' 2>/dev/null"
            # -E is required for '+' to mean "one or more digits".
            " | /usr/bin/grep -oE '\"count\":[0-9]+'"
            " | /usr/bin/cut -d\":\" -f2"
        ).format(host=host, port=port, query=ES_QUERY_STRING)

        pipe = os.popen(command)
        try:
            raw_count = pipe.read().strip()
        finally:
            pipe.close()

        # The pipeline yields text; compare numerically, not as a string.
        try:
            error_count = int(raw_count)
        except ValueError:
            raise Status("UNKNOWN",
                         "Could not read a document count from Elasticsearch")

        if error_count > 3:
            raise Status("CRITICAL", "Occurrences of a string[Handbill not printed] in ES reached greater than 3")
コード例 #8
0
    def check(self, opts, args):
        """Collect the non-OK alerts of every cluster known to Ambari.

        Reads the cluster list from ``<url>/api/v1/clusters`` and then each
        cluster's WARNING/CRITICAL/UNKNOWN alerts.  Request failures and
        undecodable responses are raised as an 'Unknown' ``Status``.
        Credentials are sent only when both username and password are set;
        ``opts.timeout`` (seconds, default 15) bounds every request.
        """
        # Ignore SSL Cert warnings
        requests.packages.urllib3.disable_warnings()

        url = opts.url
        username = opts.username
        password = opts.password
        # The option help promises a 15 second default.
        timeout = float(opts.timeout or 15)

        # Always perform the request; without credentials it is sent
        # unauthenticated instead of leaving the response unbound.
        auth = (username, password) if username and password else None

        cluster_url = r'%s/api/v1/clusters' % (url)

        try:
            cluster_response = requests.get(cluster_url,
                                            auth=auth,
                                            verify=False,
                                            timeout=timeout)
        except requests.exceptions.RequestException as e:
            raise Status('Unknown', ('requests error: %s' % e, ))

        try:
            clusters_data = json.loads(cluster_response.content)
        except ValueError:
            raise Status('Unknown', ("API returned nonsense", ))

        clusters = clusters_data['items']

        # Gather alerts from all clusters rather than keeping only those of
        # the last one; this also leaves an empty list when Ambari reports
        # no clusters.
        alerts = []
        for cluster in clusters:
            cluster_name = cluster['Clusters']['cluster_name']

            try:
                alerts_response = requests.get(
                    '%s/api/v1/clusters/%s/alerts?fields=*&Alert/state.in(WARNING,CRITICAL,UNKNOWN)&sortBy=Alert/state'
                    % (url, cluster_name),
                    auth=auth,
                    verify=False,
                    timeout=timeout)
            except requests.exceptions.RequestException as e:
                raise Status('Unknown', ('requests error: %s' % e, ))

            try:
                alerts_data = json.loads(alerts_response.content)
            except ValueError:
                raise Status('Unknown', ("API returned nonsense", ))

            alerts.extend(alerts_data['items'])
コード例 #9
0
    def check(self, opts, args):
        """Request ``/_cluster/health`` from ``opts.host`` (port 9200 by default).

        An HTTP error response is raised as an 'unknown' ``Status``.
        """
        host = opts.host
        port = int(opts.port or '9200')

        health_url = r'http://%s:%d/_cluster/health' % (host, port)
        try:
            response = urllib2.urlopen(health_url)
        except urllib2.HTTPError as err:
            long_output = "API failure:\n\n%s" % str(err)
            raise Status('unknown', ("API failure", None, long_output))
コード例 #10
0
    def check(self, opts, args):
        """Request ``/_nodes/stats/jvm`` from the configured node.

        Port, critical and warning thresholds default to 9200, 97 and 90.
        An HTTP error response is raised as an 'unknown' ``Status``.
        """
        host = opts.host
        port = int(opts.port or '9200')
        critical = int(opts.critical_threshold or '97')
        warning = int(opts.warning_threshold or '90')

        stats_url = r'http://%s:%d/_nodes/stats/jvm' % (host, port)
        try:
            response = urllib2.urlopen(stats_url)
        except urllib2.HTTPError as err:
            long_output = "API failure:\n\n%s" % str(err)
            raise Status('unknown', ("API failure", None, long_output))
コード例 #11
0
    def check(self, opts, args):
        """Request ``/_cluster/health``, with Basic auth when configured.

        The Authorization header is added when neither username nor
        password is ``None``.  An HTTP error response is raised as an
        'unknown' ``Status``.
        """
        host = opts.host
        port = int(opts.port or '9200')
        username = opts.username
        password = opts.password

        request = urllib2.Request(
            r'http://%s:%d/_cluster/health' % (host, port))
        have_credentials = username is not None and password is not None
        if have_credentials:
            credentials = '%s:%s' % (username, password)
            base64string = base64.encodestring(credentials).replace('\n', '')
            request.add_header("Authorization", "Basic %s" % base64string)

        try:
            response = urllib2.urlopen(request)
        except urllib2.HTTPError as err:
            long_output = "API failure:\n\n%s" % str(err)
            raise Status('unknown', ("API failure", None, long_output))
コード例 #12
0
class ESJVMHealthCheck(NagiosCheck):
    """Nagios check that requests JVM statistics, optionally authenticated."""

    def __init__(self):
        """Register host, port, threshold and login options."""
        NagiosCheck.__init__(self)

        option_specs = (
            ('H', 'host', 'host', 'The cluster to check'),
            ('P', 'port', 'port', 'The ES port - defaults to 9200'),
            ('C', 'critical_threshold', 'critical_threshold',
             'The level at which we throw a CRITICAL alert'
             ' - defaults to 97% of the JVM setting'),
            ('W', 'warning_threshold', 'warning_threshold',
             'The level at which we throw a WARNING alert'
             ' - defaults to 90% of the JVM setting'),
            ('u', 'username', 'username', 'username to login into ES port'),
            ('p', 'password', 'password', 'password to login into ES port'),
        )
        for spec in option_specs:
            self.add_option(*spec)

    def check(self, opts, args):
        """Request ``/_nodes/stats/jvm``, using Basic auth when configured.

        The Authorization header is only sent when both username and
        password are non-empty.  An HTTP error response is raised as an
        'unknown' ``Status``; any other URL-level failure as 'critical'.
        """
        host = opts.host
        port = int(opts.port or '9200')
        critical = int(opts.critical_threshold or '97')
        warning = int(opts.warning_threshold or '90')
        username = opts.username
        password = opts.password

        url = urllib2.Request(r'http://%s:%d/_nodes/stats/jvm' % (host, port))
        if username and password:
            credentials = '%s:%s' % (username, password)
            base64string = base64.encodestring(credentials).replace('\n', '')
            url.add_header("Authorization", "Basic %s" % base64string)

        try:
            response = urllib2.urlopen(url)
        except urllib2.HTTPError as err:
            long_output = "API failure: %s" % str(err)
            raise Status('unknown', ("API failure", None, long_output))
        except urllib2.URLError as err:
            raise Status('critical', err.reason)
コード例 #13
0
    def check(self, opts, args):
        """Request ``/_nodes/stats/jvm``, using Basic auth when configured.

        Port, critical and warning thresholds default to 9200, 97 and 90.
        The Authorization header is only sent when both username and
        password are non-empty.  An HTTP error response is raised as an
        'unknown' ``Status``.
        """
        host = opts.host
        port = int(opts.port or '9200')
        critical = int(opts.critical_threshold or '97')
        warning = int(opts.warning_threshold or '90')
        username = opts.username
        password = opts.password

        url = urllib2.Request(r'http://%s:%d/_nodes/stats/jvm' % (host, port))
        if username and password:
            credentials = '%s:%s' % (username, password)
            base64string = base64.encodestring(credentials).replace('\n', '')
            url.add_header("Authorization", "Basic %s" % base64string)

        try:
            response = urllib2.urlopen(url)
        except urllib2.HTTPError as err:
            long_output = "API failure: %s" % str(err)
            raise Status('unknown', ("API failure", None, long_output))
コード例 #14
0
class ESShardsCheck(NagiosCheck):
    """Nagios check that requests cluster health from Elasticsearch."""

    def __init__(self):
        """Register the host and port command-line options."""
        NagiosCheck.__init__(self)

        for spec in (
                ('H', 'host', 'host', 'The cluster to check'),
                ('P', 'port', 'port', 'The ES port - defaults to 9200')):
            self.add_option(*spec)

    def check(self, opts, args):
        """Request ``/_cluster/health`` from the configured node.

        An HTTP error response is raised as an 'unknown' ``Status``; any
        other URL-level failure as 'critical' with the failure reason.
        """
        host = opts.host
        port = int(opts.port or '9200')

        health_url = r'http://%s:%d/_cluster/health' % (host, port)
        try:
            response = urllib2.urlopen(health_url)
        except urllib2.HTTPError as err:
            long_output = "API failure:\n\n%s" % str(err)
            raise Status('unknown', ("API failure", None, long_output))
        except urllib2.URLError as err:
            raise Status('critical', err.reason)
コード例 #15
0
    def check(self, opts, args):
        """Summarise cluster colour, master, other nodes and unassigned shards.

        Reads ``/_cluster/health`` and ``/_cluster/state`` from
        ``opts.host`` (port 9200 by default) and always ends by raising a
        ``Status``: critical when no host is set, when either request
        fails, or when the cluster is red; warning for yellow; OK for
        green; unknown for any other colour.
        """
        if not opts.host:
            raise Status(Status.EXIT_CRITICAL, 'plz set host')
        host = opts.host
        port = int(opts.port or '9200')

        try:
            cluster_status = requests.get(
                'http://%s:%d/_cluster/health' % (host, port)).json()
            cluster_info = requests.get(
                'http://%s:%d/_cluster/state' % (host, port)).json()
        except Exception:
            raise Status(Status.EXIT_CRITICAL, "can't connect to ES server")

        data_nodes = cluster_status['number_of_data_nodes']
        master_id = cluster_info['master_node']
        master = cluster_info['nodes'][master_id]['name']
        unassigned_shards = cluster_status['unassigned_shards']
        cluster_name = cluster_status['cluster_name']

        if data_nodes >= 2:
            nbr_node = int(data_nodes)
            # Drop the master so that only the remaining nodes get listed.
            cluster_info['nodes'].pop(master_id)
            for node in cluster_info['nodes'].values():
                self.slave.append(node['name'])
        else:
            nbr_node = 1
            self.slave = None

        info_master = (
            "cluster name: {} nodes : {}, master: {} slave: {} unassigned_shards: {}"
            .format(cluster_name, nbr_node, master, self.slave,
                    unassigned_shards))

        colour = cluster_status['status'].lower()
        outcomes = {
            'red': (Status.EXIT_CRITICAL, "RED {}"),
            'yellow': (Status.EXIT_WARNING, "yellow {}"),
            'green': (Status.EXIT_OK, "green {}"),
        }
        if colour not in outcomes:
            raise Status(Status.EXIT_UNKNOWN, ("no info",))

        level, template = outcomes[colour]
        raise Status(level, template.format(info_master))
コード例 #16
0
        # Fetch cluster health; an HTTP error response is reported as
        # 'unknown', any other URL-level failure as 'critical'.
        try:
            response = urllib2.urlopen(r'http://%s:%d/_cluster/health' %
                                       (host, port))
        except urllib2.HTTPError as e:
            raise Status('unknown',
                         ("API failure", None, "API failure:\n\n%s" % str(e)))
        except urllib2.URLError as e:
            raise Status('critical', (e.reason))

        response_body = response.read()

        try:
            es_cluster_health = json.loads(response_body)
        except ValueError:
            raise Status('unknown', ("API returned nonsense", ))

        unassigned_shards = es_cluster_health['unassigned_shards']

        # Compare against zero: any unassigned shard must trigger the alert.
        if unassigned_shards != 0:
            raise Status(
                'CRITICAL', "There are '%s' unassigned shards in the cluster" %
                (unassigned_shards))
        else:
            raise Status('OK',
                         "All shards in the cluster are currently assigned")


if __name__ == "__main__":
    ESShardsCheck().run()
コード例 #17
0
            node_esid_map[node] = n
            node_shard_map[node] = []

            if len(failure_domain) > 0:
                node_location_map[node] = tuple()
                try:
                    node_location_map[node] = (tuple(
                        map(lambda a: attrs[a], failure_domain)))
                except KeyError, e:
                    # Nodes that do not store shards (e.g.: 'client'
                    # nodes) cannot be expected to have been configured
                    # with locational attributes.
                    if 'data' not in attrs or booleanise(attrs['data']):
                        missing_attr = e.args[0]
                        raise Status('warning', ("Node '%s' missing location "
                                                 "attribute '%s'" %
                                                 (name, missing_attr), ))

        # Build index maps:
        #
        # - name_index_map
        #
        indices = es_state['metadata']['indices']
        n_indices = len(indices)
        n_closed_indices = 0
        for i in indices:
            if indices[i]["state"] == "close":
                n_closed_indices += 1
                continue
            idx_stns = indices[i]['settings']
            if version(es_version) < version("1.0.0"):
コード例 #18
0
    def check(self, opts, args):
        """Report the cluster's health colour together with cluster metrics.

        Queries the root endpoint and ``/_cluster/health`` on ``opts.host``
        (default ``localhost``) and ``opts.port`` (default 9200) through
        ``get_json``, stores the mapped health level on ``self.health`` and
        always finishes by raising a ``Status`` that carries the node and
        shard counts as metrics.  ``UsageError`` is raised when
        ``opts.master_nodes`` is given but is not an integer of at least 1.
        """
        host = opts.host or "localhost"
        port = int(opts.port or '9200')

        failure_domain = []
        raw_domain = opts.failure_domain
        if isinstance(raw_domain, str) and raw_domain:
            failure_domain.extend(raw_domain.split(","))

        if opts.master_nodes is not None:
            try:
                master_nodes_ok = int(opts.master_nodes) >= 1
            except ValueError:
                master_nodes_ok = False
            if not master_nodes_ok:
                raise UsageError("Argument to -m/--master-nodes must "
                                 "be a natural number")

        #
        # Data retrieval
        #

        base_url = r'http://%s:%d/' % (host, port)

        # The "about" document carries the ES version number.
        es_about = get_json(base_url)
        es_version = es_about['version']['number']

        # /_cluster/health supplies the health colour and the node and
        # shard counters used below.  (Requests for /_cluster/state and
        # node stats are disabled in this version of the check.)
        es_health = get_json(base_url + '_cluster/health')

        self.health = HEALTH[es_health['status'].lower()]

        n_nodes = es_health['number_of_nodes']
        n_dnodes = es_health['number_of_data_nodes']

        n_active_shards = es_health['active_shards']
        n_relocating_shards = es_health['relocating_shards']
        n_initialising_shards = es_health['initializing_shards']
        n_unassigned_shards = es_health['unassigned_shards']
        n_shards = (n_active_shards + n_relocating_shards +
                    n_initialising_shards + n_unassigned_shards)

        # Cluster-wide metrics: every node monitored with this plugin is
        # expected to report the same figures for these labels.
        labels = ("cluster_nodes",
                  "cluster_data_nodes",
                  "cluster_active_shards",
                  "cluster_relocating_shards",
                  "cluster_initialising_shards",
                  "cluster_unassigned_shards",
                  "cluster_total_shards")
        values = (n_nodes, n_dnodes, n_active_shards, n_relocating_shards,
                  n_initialising_shards, n_unassigned_shards, n_shards)
        metrics = [[label, value] for label, value in zip(labels, values)]

        #
        # Assertions
        #

        detail = []  # Collect error messages into this list

        msg = "Monitoring cluster '%s', status %s" % (
            es_health['cluster_name'], es_health['status'])

        # No assertion is evaluated here, so the reported level is simply
        # the health colour Elasticsearch gave itself.
        long_output = "%s %s" % (msg, " ".join(detail))
        raise Status(HEALTH_MAP[self.health],
                     (msg, None, long_output), metrics)
コード例 #19
0
        try:
            response = urllib2.urlopen(r'http://%s:%d/_nodes/stats/jvm' %
                                       (host, port))
        except urllib2.HTTPError, e:
            raise Status('unknown',
                         ("API failure", None, "API failure:\n\n%s" % str(e)))
        except urllib2.URLError, e:
            raise Status('critical', (e.reason))

        response_body = response.read()

        try:
            nodes_jvm_data = json.loads(response_body)
        except ValueError:
            raise Status('unknown', ("API returned nonsense", ))

        criticals = 0
        critical_details = []
        warnings = 0
        warning_details = []

        nodes = nodes_jvm_data['nodes']
        for node in nodes:
            jvm_percentage = nodes[node]['jvm']['mem']['heap_used_percent']
            node_name = nodes[node]['host']
            if int(jvm_percentage) >= critical:
                criticals = criticals + 1
                critical_details.append(
                    "%s currently running at %s%% JVM mem " %
                    (node_name, jvm_percentage))
コード例 #20
0
        # Fetch cluster health; an HTTP error response is reported as
        # 'unknown', any other URL-level failure as 'critical'.
        health_url = r'http://%s:%d/_cluster/health' % (host, port)
        try:
            response = urllib2.urlopen(health_url)
        except urllib2.HTTPError as err:
            long_output = "API failure:\n\n%s" % str(err)
            raise Status('unknown', ("API failure", None, long_output))
        except urllib2.URLError as err:
            raise Status('critical', err.reason)

        response_body = response.read()

        try:
            es_cluster_health = json.loads(response_body)
        except ValueError:
            raise Status('unknown', ("API returned nonsense", ))

        cluster_status = es_cluster_health['status'].lower()

        # Red and yellow map to alerts; every other value is reported as OK.
        verdicts = {
            'red': ("CRITICAL",
                    "Cluster status is currently reporting as Red"),
            'yellow': ("WARNING",
                       "Cluster status is currently reporting as Yellow"),
        }
        level, text = verdicts.get(
            cluster_status,
            ("OK", "Cluster status is currently reporting as Green"))
        raise Status(level, text)

コード例 #21
0
                response = urllib2.urlopen(r'http://%s:%d/_nodes/stats/jvm' %
                                           (host, port))
            else:
                response = urllib2.urlopen(
                    r'http://%s:%d/_nodes/%s/stats/jvm' % (host, port, node))
        except urllib2.HTTPError, e:
            raise Status('unknown',
                         ("API failure", None, "API failure:\n\n%s" % str(e)))
        except urllib2.URLError, e:
            raise Status('critical', (e.reason))

        response_body = response.read()
        try:
            nodes_jvm_data = json.loads(response_body)
        except ValueError:
            raise Status('unknown', ("API returned nonsense", ))

        criticals = 0
        critical_details = []
        critical_message = []
        warnings = 0
        warning_details = []
        warning_message = []
        message = []

        nodes = nodes_jvm_data['nodes']
        if len(nodes) == 0:
            raise Status('unknown',
                         ("The Node %s is not a cluster member" % node, ))

        for node in nodes:
コード例 #22
0
class AmbariAlertsHealthCheck(NagiosCheck):
    """Nagios check that summarises the outstanding alerts held by Ambari."""

    def __init__(self):
        """Register the URL, timeout and credential command-line options."""
        NagiosCheck.__init__(self)
        self.useragent = ''

        self.add_option('U', 'url', 'url', 'URL to Ambari')
        self.add_option('t', 'timeout', 'timeout',
                        'Timeout in seconds (default 15s)')
        self.add_option('u', 'username', 'username', 'Username')
        self.add_option('p', 'password', 'password', 'Password')

    def check(self, opts, args):
        """Raise a ``Status`` reflecting the worst alert over all clusters.

        Reads the cluster list from ``<url>/api/v1/clusters`` and then each
        cluster's WARNING/CRITICAL/UNKNOWN alerts.  Precedence of the
        result is critical, warning, unknown, OK.  Request failures and
        undecodable responses are raised as an 'Unknown' ``Status``.
        Credentials are sent only when both username and password are set;
        ``opts.timeout`` (seconds, default 15) bounds every request.
        """
        # Ignore SSL Cert warnings
        requests.packages.urllib3.disable_warnings()

        url = opts.url
        username = opts.username
        password = opts.password
        # The option help promises a 15 second default.
        timeout = float(opts.timeout or 15)

        # Always perform the request; without credentials it is sent
        # unauthenticated instead of leaving the response unbound.
        auth = (username, password) if username and password else None

        cluster_url = r'%s/api/v1/clusters' % (url)

        try:
            cluster_response = requests.get(cluster_url,
                                            auth=auth,
                                            verify=False,
                                            timeout=timeout)
        except requests.exceptions.RequestException as e:
            raise Status('Unknown', ('requests error: %s' % e, ))

        try:
            clusters_data = json.loads(cluster_response.content)
        except ValueError:
            raise Status('Unknown', ("API returned nonsense", ))

        clusters = clusters_data['items']

        # Gather alerts from all clusters rather than keeping only those of
        # the last one; this also leaves an empty list when Ambari reports
        # no clusters.
        alerts = []
        for cluster in clusters:
            cluster_name = cluster['Clusters']['cluster_name']

            try:
                alerts_response = requests.get(
                    '%s/api/v1/clusters/%s/alerts?fields=*&Alert/state.in(WARNING,CRITICAL,UNKNOWN)&sortBy=Alert/state'
                    % (url, cluster_name),
                    auth=auth,
                    verify=False,
                    timeout=timeout)
            except requests.exceptions.RequestException as e:
                raise Status('Unknown', ('requests error: %s' % e, ))

            try:
                alerts_data = json.loads(alerts_response.content)
            except ValueError:
                raise Status('Unknown', ("API returned nonsense", ))

            alerts.extend(alerts_data['items'])

        critical_details = []
        warning_details = []
        unknown_details = []
        details_by_state = {
            'CRITICAL': critical_details,
            'WARNING': warning_details,
            'UNKNOWN': unknown_details,
        }

        for alert_data in alerts:
            alert = alert_data['Alert']
            bucket = details_by_state.get(alert['state'])
            if bucket is None:
                # Alerts in any other state are not reported.
                continue
            bucket.append(
                "Cluster %s [%s]: %s" %
                (alert['cluster_name'], alert['component_name'],
                 alert['text']))

        if critical_details:
            raise Status(
                "Critical", "There are critical errors: \r\n%s" %
                (str("\r\n".join(critical_details))))
        elif warning_details:
            raise Status(
                "Warning", "There are warnings: \r\n%s" %
                (str("\r\n".join(warning_details))))
        elif unknown_details:
            raise Status(
                "Unknown", "There are unknowns: \r\n%s" %
                (str("\r\n".join(unknown_details))))
        else:
            raise Status("OK", "No alerts.")
コード例 #23
0
class ESSplitBrainCheck(NagiosCheck):
    """Detect a split brain by asking every listed node who its master is."""

    def __init__(self):
        """Register node list, port and authentication options."""
        NagiosCheck.__init__(self)

        option_specs = (
            ('N', 'nodes', 'nodes', 'Cluster nodes'),
            ('P', 'port', 'port', 'The ES port - defaults to 9200'),
            ('u', 'username', 'username', 'Username for authentication'),
            ('p', 'password', 'password', 'password for authentication'),
        )
        for spec in option_specs:
            self.add_option(*spec)

    def check(self, opts, args):
        """Query each node's cluster state and compare the elected masters.

        ``opts.nodes`` is a comma-separated node list; the port defaults to
        9200 and Basic auth is used when neither username nor password is
        ``None``.  Always ends by raising a ``Status``: 'Unknown' when no
        node responded, 'Critical' when the responding nodes do not name
        exactly one master, otherwise 'OK' (listing unreachable nodes, if
        any).
        """
        nodes = opts.nodes.split(",")
        port = int(opts.port or '9200')
        username = opts.username
        password = opts.password
        masters = []
        responding_nodes = []
        failed_nodes = []

        # The header value is the same for every node, so build it once.
        auth_value = None
        if username is not None and password is not None:
            credentials = '%s:%s' % (username, password)
            auth_value = "Basic %s" % (
                base64.encodestring(credentials).replace('\n', ''))

        for node in nodes:
            state_url = r'http://%s:%d/_cluster/state/nodes,master_node/' % (
                node, port)
            try:
                request = urllib2.Request(state_url)
                if auth_value is not None:
                    request.add_header("Authorization", auth_value)
                reply = urllib2.urlopen(request)
                response = json.loads(reply.read())
            except (urllib2.HTTPError, urllib2.URLError) as err:
                failed_nodes.append("%s - %s" % (node, err.reason))
                continue

            if not isinstance(response, dict):
                continue

            cluster_name = str(response['cluster_name'])
            master = str(response['nodes'][response['master_node']]['name'])
            responding_nodes.append(node)
            if master not in masters:
                masters.append(master)

        if not responding_nodes:
            raise Status(
                'Unknown', "All cluster nodes unresponsive:\r\n"
                "%s" % (str("\r\n".join(failed_nodes))))

        if len(masters) != 1:
            raise Status(
                'Critical', "%d masters (%s) found in %s cluster" %
                (len(masters), str(", ".join(masters)), cluster_name))

        agreement = "%d/%d nodes have same master" % (
            len(responding_nodes), len(nodes))
        if not failed_nodes:
            raise Status('OK', agreement)

        unreachable = "%d unresponsive nodes:\r\n%s" % (
            len(failed_nodes), str("\r\n".join(failed_nodes)))
        raise Status('OK', agreement + "\r\n" + unreachable)
コード例 #24
0
ファイル: checkEsJvm.py プロジェクト: diegugawa/nagiosChecks
        if opts.warn and heap_usage_percent >= float(opts.warn):
            raise Status(
                "warning", "JVM Heap Usage: %d%% %s/%s" %
                (heap_usage_percent, str(heap_used), str(heap_max)), perfdata)
        raise Status(
            "ok", "JVM Heap Usage: %d%% %s/%s" %
            (heap_usage_percent, str(heap_used), str(heap_max)), perfdata)


def get_json(uri):
    """Fetch *uri* and return its body decoded from JSON.

    Args:
        uri: URL handed unchanged to ``urllib2.urlopen``.

    Returns:
        Whatever ``json.loads`` produces for the response body.

    Raises:
        Status: 'unknown' if the server answers with an HTTP error or the
            body is not valid JSON; 'critical' if the request fails with a
            ``URLError``.
    """
    # HTTPError is a subclass of URLError, so it has to be caught first.
    try:
        f = urllib2.urlopen(uri)
    except urllib2.HTTPError as e:
        raise Status('unknown',
                     ("API failure", None, "API failure:\n\n%s" % str(e)))
    except urllib2.URLError as e:
        # The server could be down; make this CRITICAL.
        raise Status('critical', (e.reason, ))

    # Always release the connection, even if reading the body fails.
    try:
        body = f.read()
    finally:
        f.close()

    try:
        return json.loads(body)
    except ValueError:
        raise Status('unknown', ("API returned nonsense", ))


if __name__ == '__main__':
    # Script entry point: build the JVM check, then invoke its run() method.
    jvm_check = ElasticSearchJvmCheck()
    jvm_check.run()
コード例 #25
0
ファイル: checkEsJvm.py プロジェクト: diegugawa/nagiosChecks
    def check(self, opts, args):
        """Report JVM heap usage of the local Elasticsearch node.

        Queries the node info and node stats endpoints on ``opts.host`` /
        ``opts.port`` (defaults: localhost, 9200), collects JVM memory and
        garbage-collector counters as perfdata, and compares the heap usage
        percentage with ``opts.crit`` and ``opts.warn``.

        Args:
            opts: parsed options; ``host``, ``port``, ``crit`` and ``warn``
                are read.
            args: positional arguments (unused here).

        Raises:
            Status: "critical" or "warning" when the matching threshold is
                set and reached, "ok" otherwise.  ``get_json`` may raise
                its own Status earlier if a request fails.
        """
        host = opts.host or "localhost"
        port = int(opts.port or '9200')

        #
        # Data retrieval
        #

        # Node info: the heap_max_* / non_heap_max_* limits are read from it.
        es_node = get_json(r'http://%s:%d/_nodes/_local/?all=true' %
                           (host, port))

        # Node stats: the current heap / pool usage counters are read from it.
        es_stats = get_json(r'http://%s:%d/_nodes/_local/'
                            'stats?all=true' % (host, port))

        # The responses are keyed by node id; take the first (presumably
        # only) one.  list() keeps this working where keys() is a view.
        myid = list(es_stats['nodes'])[0]

        es_node_jvm = get_json(r'http://%s:%d/_nodes/%s/stats?jvm=true' %
                               (host, port, myid))

        #
        # Perfdata
        #

        perfdata = []

        def dict2perfdata(base, metrics):
            """Append a PerformanceMetric for every metric found in *base*.

            Each entry of *metrics* is ``[label, dotted.path]`` or
            ``[label, dotted.path, unit]``; the path is walked key by key
            through *base*.  Entries whose path cannot be resolved are
            skipped silently.
            """
            for metric in metrics:
                if len(metric) == 2:
                    label, path = metric
                    unit = ""
                elif len(metric) > 2:
                    label, path, unit = metric
                else:
                    continue

                value = base
                for key in path.split("."):
                    if value is None:
                        break
                    try:
                        value = value[key]
                    except (KeyError, TypeError):
                        # Missing key, or the path runs through a value
                        # that is not a mapping: treat as "not available".
                        value = None
                        break

                if value is not None:
                    perfdata.append(
                        PerformanceMetric(label=label,
                                          value=value,
                                          unit=unit))

        # Current JVM memory usage of this node (from the stats response).
        metrics = [["heap_committed", 'heap_committed_in_bytes', "B"],
                   ["heap_used", 'heap_used_in_bytes', "B"],
                   ["non_heap_committed", 'non_heap_committed_in_bytes', "B"],
                   ["non_heap_used", 'non_heap_used_in_bytes', "B"],
                   ["old_gen_used", 'pools.CMS Old Gen.used_in_bytes', "B"],
                   ["perm_gen_used", 'pools.CMS Perm Gen.used_in_bytes', "B"],
                   ["eden_used", 'pools.Par Eden Space.used_in_bytes', "B"],
                   [
                       "survivor_used",
                       'pools.Par Survivor Space.used_in_bytes', "B"
                   ], ["code_cache", 'pools.Code Cache.used_in_bytes', "B"]]

        dict2perfdata(es_stats['nodes'][myid]['jvm']['mem'], metrics)

        # Configured JVM memory limits of this node (from the info response).
        metrics = [["heap_max", 'heap_max_in_bytes', "B"],
                   ["non_heap_max", 'non_heap_max_in_bytes', "B"],
                   ["direct_max", 'direct_max_in_bytes', "B"]]

        dict2perfdata(es_node['nodes'][myid]['jvm']['mem'], metrics)

        # Per-collector garbage collection counters.
        gc_stats = es_node_jvm["nodes"][myid]["jvm"]["gc"]
        for collector in gc_stats["collectors"]:
            # Full collector names are too long for RRD; contract them
            # to initialisms.
            collector_slug = "".join(
                [c for c in collector if 'A' <= c <= 'Z']).lower()
            metrics = [
                [
                    "%s_collections" % collector_slug,
                    "collectors.%s.collection_count" % collector, "c"
                ],
                [
                    "%s_time_ms" % collector_slug,
                    "collectors.%s.time_in_millis" % collector, "c"
                ],
            ]
            dict2perfdata(gc_stats, metrics)

        # Aggregate GC counters, exported only if the response has them.
        metrics = [
            ["collections", "collection_count", "c"],
            ["collection_time_ms", "collection_time_in_millis", "c"],
        ]
        dict2perfdata(gc_stats, metrics)

        #
        # Threshold evaluation
        #

        stats_mem = es_stats['nodes'][myid]['jvm']['mem']
        info_mem = es_node['nodes'][myid]['jvm']['mem']

        heap_used_b = int(stats_mem['heap_used_in_bytes'])
        heap_max_b = int(info_mem['heap_max_in_bytes'])

        # ES 1.0 changed their keys, dropping heap_used & heap_max.  If they
        # don't exist, recreate them from the byte values in the form
        # '1.3gb'.  The divisor is bytes-per-GiB so the number matches the
        # 'gb' suffix.
        try:
            heap_used = stats_mem['heap_used']
            heap_max = info_mem['heap_max']
        except KeyError:
            bytes_per_gb = 1073741824.0
            heap_used = str(round(heap_used_b / bytes_per_gb, 2)) + 'gb'
            heap_max = str(round(heap_max_b / bytes_per_gb, 2)) + 'gb'

        heap_usage_percent = (float(heap_used_b) / float(heap_max_b)) * 100

        # Critical takes precedence over warning; an unset threshold is
        # never triggered.
        if opts.crit and heap_usage_percent >= float(opts.crit):
            state = "critical"
        elif opts.warn and heap_usage_percent >= float(opts.warn):
            state = "warning"
        else:
            state = "ok"

        raise Status(
            state, "JVM Heap Usage: %d%% %s/%s" %
            (heap_usage_percent, str(heap_used), str(heap_max)), perfdata)
コード例 #26
0
ファイル: test_critical.py プロジェクト: n2bh/pynagioscheck
 def check(self, opts, args):
     """Raise a 'critical' Status carrying ``self.msg``."""
     message = self.msg
     raise Status('critical', message)
コード例 #27
0
 def check(self, opts, args):
     """Raise an 'ok' Status carrying ``self.msg``."""
     message = self.msg
     raise Status('ok', message)
コード例 #28
0
ファイル: test_perfdata.py プロジェクト: n2bh/pynagioscheck
 def check(self, opts, args):
     """Raise a 'critical' Status with a three-part message and one metric.

     The message tuple is ``(msg, None, msg + newline + lines())`` and the
     perfdata argument is a single 'flibs' metric with value 10.
     """
     headline = self.msg
     detail = self.msg + "\n" + self.lines()
     message = (headline, None, detail)
     metric = PerformanceMetric('flibs', 10)
     raise Status('critical', message, metric)
コード例 #29
0
        try:
            response = urllib2.urlopen(r'http://%s:%d/_cluster/health' %
                                       (host, port))
        except urllib2.HTTPError, e:
            raise Status('unknown',
                         ("API failure", None, "API failure:\n\n%s" % str(e)))
        except urllib2.URLError, e:
            raise Status('critical', (e.reason))

        response_body = response.read()

        try:
            es_cluster_health = json.loads(response_body)
        except ValueError:
            raise Status('unknown', ("API returned nonsense", ))

        active_cluster_nodes = es_cluster_health['number_of_nodes']

        if active_cluster_nodes != nodes_in_cluster:
            raise Status(
                'CRITICAL', "Number of nodes in the cluster is "
                "reporting as '%s' but we expected '%s'" %
                (active_cluster_nodes, nodes_in_cluster))
        else:
            raise Status(
                'OK', "Number of nodes in the cluster is '%s' as "
                "expected" % (nodes_in_cluster))


if __name__ == "__main__":