def cmd_orphans(args=False):
    """Check for scheduled checks that haven't been run in too long a time.

    Arguments (each a string in *args*):
      --warning=X / --critical=X  thresholds (a trailing '%' is accepted)
      --maxage=N / --max-age=N    seconds since next_check to count as orphaned
      host | service              object type to inspect (default: service)

    Exits with a Nagios plugin state code after printing status + perfdata.

    NOTE(review): a later duplicate definition of cmd_orphans in this file
    shadows this one at import time.
    """
    state = nplug.OK
    otype = 'service'
    unit = ''
    warning = critical = 0
    max_age = 1800
    for arg in args:
        if arg.startswith('--warning='):
            val = arg.split('=', 1)[1]
            if '%' in val:
                unit = '%'
                val = val.replace('%', '')
            warning = float(val)
        elif arg.startswith('--critical='):
            val = arg.split('=', 1)[1]
            if '%' in val:
                unit = '%'
                val = val.replace('%', '')
            critical = float(val)
        elif arg.startswith('--maxage=') or arg.startswith('--max-age='):
            max_age = int(arg.split('=', 1)[1])
        elif arg == 'host' or arg == 'service':
            otype = arg
    # NOTE(review): 'unit' records that a '%' threshold was given but is
    # never applied — percentage thresholds are treated as absolute counts.
    # TODO: confirm intended semantics before wiring it up.
    now = time.time()
    query = ('GET %ss\nFilter: should_be_scheduled = 1\n'
             'Filter: in_check_period = 1\nFilter: next_check < %s\n'
             'Stats: state != 999')
    try:
        orphans = int(lsc.query_value(query % (otype, now - max_age)))
        # BUGFIX: 'total' is used unconditionally in the output below, so it
        # must be fetched even when explicit thresholds were supplied
        # (previously this raised NameError for --warning/--critical users).
        total = int(lsc.query_value(
            'GET %ss\nFilter: should_be_scheduled = 1\n'
            'Filter: in_check_period = 1\nStats: state != 999' % (otype,)))
        if not warning and not critical:
            # default thresholds: 1% critical, 0.5% warning of all checks
            critical = total * 0.01
            warning = total * 0.005
    except livestatus.livestatus.MKLivestatusSocketError:
        print("UNKNOWN: Error asking livestatus for info, bailing out")
        sys.exit(nplug.UNKNOWN)
    if not warning and not critical:
        # tiny installations: any orphan at all is worth flagging
        warning = critical = 1
    if orphans > critical:
        state = nplug.CRITICAL
    elif orphans > warning:
        state = nplug.WARNING
    sys.stdout.write("%s: Orphaned %s checks: %d / %d" %
                     (nplug.state_name(state), otype, orphans, total))
    print("|'orphans'=%d;%d;%d;0;%d" % (orphans, warning, critical, total))
    sys.exit(state)
def cmd_orphans(args=False):
    """Check for scheduled checks that have not been run in too long a time.

    Arguments (each a string in *args*):
      --warning=X / --critical=X  thresholds (a trailing '%' is accepted)
      --maxage=N / --max-age=N    seconds since next_check to count as orphaned
      host | service              object type to inspect (default: service)

    Exits with a Nagios plugin state code after printing status + perfdata.
    """
    state = nplug.OK
    otype = 'service'
    unit = ''
    warning = critical = 0
    max_age = 1800
    for arg in args:
        if arg.startswith('--warning='):
            val = arg.split('=', 1)[1]
            if '%' in val:
                unit = '%'
                val = val.replace('%', '')
            warning = float(val)
        elif arg.startswith('--critical='):
            val = arg.split('=', 1)[1]
            if '%' in val:
                unit = '%'
                val = val.replace('%', '')
            critical = float(val)
        elif arg.startswith('--maxage=') or arg.startswith('--max-age='):
            max_age = int(arg.split('=', 1)[1])
        elif arg == 'host' or arg == 'service':
            otype = arg
    # NOTE(review): 'unit' records that a '%' threshold was given but is
    # never applied — percentage thresholds are treated as absolute counts.
    # TODO: confirm intended semantics before wiring it up.
    now = time.time()
    query = ('GET %ss\nFilter: should_be_scheduled = 1\n'
             'Filter: in_check_period = 1\nFilter: next_check < %s\n'
             'Stats: state != 999')
    try:
        orphans = int(lsc.query_value(query % (otype, now - max_age)))
        # BUGFIX: 'total' is used unconditionally in the output below, so it
        # must be fetched even when explicit thresholds were supplied
        # (previously this raised NameError for --warning/--critical users).
        total = int(lsc.query_value(
            'GET %ss\nFilter: should_be_scheduled = 1\n'
            'Filter: in_check_period = 1\nStats: state != 999' % (otype,)))
        if not warning and not critical:
            # default thresholds: 1% critical, 0.5% warning of all checks
            critical = total * 0.01
            warning = total * 0.005
    except livestatus.livestatus.MKLivestatusSocketError:
        print("UNKNOWN: Error asking livestatus for info, bailing out")
        sys.exit(nplug.UNKNOWN)
    if not warning and not critical:
        # tiny installations: any orphan at all is worth flagging
        warning = critical = 1
    if orphans > critical:
        state = nplug.CRITICAL
    elif orphans > warning:
        state = nplug.WARNING
    sys.stdout.write("%s: Orphaned %s checks: %d / %d" %
                     (nplug.state_name(state), otype, orphans, total))
    print("|'orphans'=%d;%d;%d;0;%d" % (orphans, warning, critical, total))
    sys.exit(state)
def cmd_cores(args=False):
    """--warning=X --critical=X [--dir=]
    Checks for memory dumps resulting from segmentation violation from
    core parts of op5 Monitor. Detected core-files are moved to
    /tmp/mon-cores in order to keep working directories clean.
      --warning  default is 0
      --critical default is 1 (any corefile results in a critical alert)
      --dir lets you specify more paths to search for corefiles. This
        option can be given multiple times.
      --delete deletes corefiles not coming from 'merlind' or 'monitor'
    """
    warn = 0
    crit = 1
    dirs = ['/opt/monitor', '/opt/monitor/op5/merlin']
    delete = False
    debug = False
    for arg in args:
        if arg.startswith('--warning='):
            warn = int(arg.split('=', 1)[1])
        elif arg.startswith('--critical='):
            crit = int(arg.split('=', 1)[1])
        elif arg.startswith('--dir='):
            dirs.append(arg.split('=', 1)[1])
        elif arg == '--delete' or arg == '-D':
            delete = True
        elif arg == '--debug' or arg == '-d':
            debug = True
        else:
            nplug.unknown("Unknown argument: %s" % arg)
    # BUGFIX: raw string — '\.' in a plain string is an invalid escape
    # sequence (SyntaxWarning on modern Python); the pattern itself is
    # unchanged.
    core_pattern = r'^core\..*'
    result = []
    for d in dirs:
        get_files(d, core_pattern, result)
    cores = 0
    for corefile in result:
        core = coredump(corefile)
        core.examine()
        if core.invalid:
            if debug:
                print("Core is invalid: %s" % core.invalid_str())
            elif delete:
                # best effort removal; a vanished file is not an error
                try:
                    os.unlink(corefile)
                except OSError:
                    pass
            continue
        cores += 1
    if not cores:
        valid = ''
        if len(result):
            valid = '(valid) '
        nplug.ok("No %scorefiles found|cores=0;%d;%d;;" % (valid, warn, crit))
    state = nplug.STATE_OK
    if cores >= crit:
        state = nplug.STATE_CRITICAL
    elif cores >= warn:
        state = nplug.STATE_WARNING
    print("%s: %d corefiles found" % (nplug.state_name(state), cores))
    sys.exit(state)
def check_min_avg_max(args, col, defaults=False, filter=False):
    """Check min/avg/max of a livestatus column against thresholds.

    args     -- iterable of CLI-style strings:
                --warning=a,b,c / --critical=a,b,c (min,avg,max) and
                'host' or 'service' to pick the object type.
    col      -- livestatus column name to aggregate.
    defaults -- optional dict with 'warning'/'critical' 3-tuples used
                when not given on the command line.
    filter   -- livestatus filter block; a sensible default is used when
                False. (NOTE: shadows the builtin 'filter'; name kept for
                interface compatibility.)

    Prints status + perfdata and exits with a Nagios plugin state code.
    """
    order = ['min', 'avg', 'max']
    thresh = {}
    otype = False
    mst = merlin_status(lsc, qh)
    if filter == False:
        filter = 'Filter: should_be_scheduled = 1\nFilter: active_checks_enabled = 1\nAnd: 2\n'
    for arg in args:
        # BUGFIX: '--critical' lacked the trailing '=', so malformed
        # arguments like '--criticalfoo' crashed on unpacking instead of
        # reaching the 'Unknown argument' branch below.
        if arg.startswith('--warning=') or arg.startswith('--critical='):
            (what, thvals) = arg[2:].split('=', 1)
            thvals = thvals.split(',')
            if len(thvals) != 3:
                nplug.unknown("Bad argument: %s" % arg)
            thresh[what] = [float(th) for th in thvals]
        elif arg == 'host' or arg == 'service':
            otype = arg
        else:
            nplug.unknown("Unknown argument: %s" % arg)
    for t in ('critical', 'warning'):
        # BUGFIX: guard 'defaults' — it defaults to False, and
        # False.get(...) raised AttributeError when no threshold was given.
        if not thresh.get(t, False) and defaults and defaults.get(t, False) != False:
            thresh[t] = defaults[t]
    if not otype:
        nplug.unknown("Need 'host' or 'service' as argument")
    # BUGFIX: missing thresholds used to crash with KeyError below; exit
    # UNKNOWN with a usable message instead.
    if 'critical' not in thresh or 'warning' not in thresh:
        nplug.unknown("Need --warning and --critical thresholds")
    try:
        values = mst.min_avg_max(otype, col, filter)
    except livestatus.livestatus.MKLivestatusSocketError:
        print("UNKNOWN: Error asking livestatus for info, bailing out")
        sys.exit(nplug.STATE_UNKNOWN)
    # NOTE: the original had a first threshold loop here whose computed
    # state was unconditionally reset to STATE_OK before this point —
    # dead code, removed. The loop below determines the final state.
    state = nplug.STATE_OK
    perfdata_prefix = "%s_%s_" % (otype, col)
    perfdata = ''
    for i, o in enumerate(order):
        cval = thresh['critical'][i]
        wval = thresh['warning'][i]
        value = values[o]
        perfdata = "%s '%s%s'=%.3f;%.3f;%.3f;0;" % (
            perfdata, perfdata_prefix, o, value, wval, cval)
        if value >= cval:
            state = nplug.STATE_CRITICAL
        elif value >= wval and state != nplug.STATE_CRITICAL:
            state = nplug.STATE_WARNING
    print("%s: %s %s min/avg/max = %.2f/%.2f/%.2f|%s" %
          (nplug.state_name(state), otype, col,
           values['min'], values['avg'], values['max'], perfdata))
    sys.exit(state)
def cmd_distribution(args):
    """[--no-perfdata]
    Checks to make sure work distribution works ok. Note that it's
    not expected to work properly the first couple of minutes after
    a new machine has been brought online or taken offline.

    Compares the number of checks each node actually executed against the
    range it should handle given its peers/pollers, prints status (and
    perfdata unless --no-perfdata) and exits with a plugin state code.

    NOTE(review): a later duplicate definition of cmd_distribution in this
    file shadows this one at import time.
    """
    print_perfdata = True
    for arg in args:
        if arg == '--no-perfdata':
            print_perfdata = False
    total_checks = {
        'host': mst.num_entries('host'),
        'service': mst.num_entries('service'),
    }
    nodes = mst.status()
    state_str = ""
    should = {}
    if not nodes:
        print("UNKNOWN: No hosts found at all")
        sys.exit(nplug.UNKNOWN)
    masters = [n for n in nodes if n['type'] == 'master']
    peers = [n for n in nodes if n['type'] in ('peer', 'local')]
    pollers = [n for n in nodes if n['type'] == 'poller']

    class check_objs(object):
        """Accumulates perfdata, offending nodes and the worst state seen."""
        # BUGFIX: these were mutable class-level attributes, shared by all
        # instances of check_objs; moved into __init__ so each instance
        # gets its own state.
        def __init__(self):
            self.pdata = ''
            self.bad = {}
            self.state = nplug.OK

        def is_bad(self, actual, exp):
            # exp is an inclusive (low, high) range of acceptable counts
            return actual < exp[0] or actual > exp[1]

        def verify_executed_checks(self, info, exp):
            for ctype, num in exp.items():
                actual = int(info[ctype + '_checks_executed'])
                ok_str = "%d:%d" % (num[0], num[1])
                self.pdata += (" '%s_%ss'=%d;%s;%s;0;%d" %
                               (info['name'], ctype, actual, ok_str, ok_str,
                                total_checks[ctype]))
                if self.is_bad(actual, num):
                    self.state = nplug.STATE_CRITICAL
                    self.bad[info['name']] = {
                        'host': info['host_checks_executed'],
                        'service': info['service_checks_executed'],
                    }

    def _expected_str(rng):
        # BUGFIX: replaces the 'cond and a or b' idiom, which mis-rendered
        # an expected count of exactly 0 (falsy) as "0-0" instead of "0".
        if rng[0] == rng[1]:
            return str(rng[0])
        return "%s-%s" % (rng[0], rng[1])

    o = check_objs()
    # masters should never execute checks themselves
    for info in masters:
        exp = should[info['name']] = {'host': (0, 0), 'service': (0, 0)}
        o.verify_executed_checks(info, exp)
    try:
        host_dis = lsc.query_value('GET hosts\nFilter: active_checks_enabled = 0\nStats: state != 999')
        svc_dis = lsc.query_value('GET services\nFilter: active_checks_enabled = 0\nStats: state != 999')
    except livestatus.livestatus.MKLivestatusSocketError:
        print("UNKNOWN: Error asking livestatus for info, bailing out")
        sys.exit(nplug.UNKNOWN)
    host_on_poller = 0
    svc_on_poller = 0
    for info in pollers:
        # '//' keeps the original Python 2 integer-division semantics:
        # each node's share is its handled total split among its peers
        hhandled = int(info.get('host_checks_handled', 0)) // (int(info.get('configured_peers', 0)) + 1)
        shandled = int(info.get('service_checks_handled', 0)) // (int(info.get('configured_peers', 0)) + 1)
        exp = should[info['name']] = {
            'host': (hhandled - host_dis, hhandled),
            'service': (shandled - svc_dis, shandled),
        }
        host_on_poller += exp['host'][1]
        svc_on_poller += exp['service'][1]
        o.verify_executed_checks(info, exp)
    for info in peers:
        hhandled = int(info.get('host_checks_handled', 0)) // (int(info.get('configured_peers', 0)) + 1)
        shandled = int(info.get('service_checks_handled', 0)) // (int(info.get('configured_peers', 0)) + 1)
        exp = should[info['name']] = {
            'host': (hhandled - host_dis - host_on_poller, hhandled),
            'service': (shandled - svc_dis - svc_on_poller, shandled),
        }
        o.verify_executed_checks(info, exp)
    for name, b in o.bad.items():
        if not len(b):
            continue
        state_str += ("%s runs %d / %d checks (should be %s / %s). " %
                      (name, b['host'], b['service'],
                       _expected_str(should[name]['host']),
                       _expected_str(should[name]['service'])))
    sys.stdout.write("%s: " % nplug.state_name(o.state))
    if not state_str:
        state_str = "All %d nodes run their assigned checks." % len(nodes)
    sys.stdout.write("%s" % state_str.rstrip())
    if print_perfdata:
        print("|%s" % o.pdata.lstrip())
    else:
        sys.stdout.write("\n")
    sys.exit(o.state)
def cmd_distribution(args):
    """[--no-perfdata]
    Checks to make sure work distribution works ok. Note that it's
    not expected to work properly the first couple of minutes after
    a new machine has been brought online or taken offline.

    Compares the number of checks each node actually executed against the
    range it should handle given its peers/pollers, prints status (and
    perfdata unless --no-perfdata) and exits with a plugin state code.
    """
    print_perfdata = True
    for arg in args:
        if arg == '--no-perfdata':
            print_perfdata = False
    total_checks = {
        'host': mst.num_entries('host'),
        'service': mst.num_entries('service'),
    }
    nodes = mst.status()
    state_str = ""
    should = {}
    if not nodes:
        print("UNKNOWN: No hosts found at all")
        sys.exit(nplug.UNKNOWN)
    masters = [n for n in nodes if n['type'] == 'master']
    peers = [n for n in nodes if n['type'] in ('peer', 'local')]
    pollers = [n for n in nodes if n['type'] == 'poller']

    class check_objs(object):
        """Accumulates perfdata, offending nodes and the worst state seen."""
        # BUGFIX: these were mutable class-level attributes, shared by all
        # instances of check_objs; moved into __init__ so each instance
        # gets its own state.
        def __init__(self):
            self.pdata = ''
            self.bad = {}
            self.state = nplug.OK

        def is_bad(self, actual, exp):
            # exp is an inclusive (low, high) range of acceptable counts
            return actual < exp[0] or actual > exp[1]

        def verify_executed_checks(self, info, exp):
            for ctype, num in exp.items():
                actual = int(info[ctype + '_checks_executed'])
                ok_str = "%d:%d" % (num[0], num[1])
                self.pdata += (" '%s_%ss'=%d;%s;%s;0;%d" %
                               (info['name'], ctype, actual, ok_str, ok_str,
                                total_checks[ctype]))
                if self.is_bad(actual, num):
                    self.state = nplug.STATE_CRITICAL
                    self.bad[info['name']] = {
                        'host': info['host_checks_executed'],
                        'service': info['service_checks_executed'],
                    }

    def _expected_str(rng):
        # BUGFIX: replaces the 'cond and a or b' idiom, which mis-rendered
        # an expected count of exactly 0 (falsy) as "0-0" instead of "0".
        if rng[0] == rng[1]:
            return str(rng[0])
        return "%s-%s" % (rng[0], rng[1])

    o = check_objs()
    # masters should never execute checks themselves
    for info in masters:
        exp = should[info['name']] = {'host': (0, 0), 'service': (0, 0)}
        o.verify_executed_checks(info, exp)
    try:
        host_dis = lsc.query_value('GET hosts\nFilter: active_checks_enabled = 0\nStats: state != 999')
        svc_dis = lsc.query_value('GET services\nFilter: active_checks_enabled = 0\nStats: state != 999')
    except livestatus.livestatus.MKLivestatusSocketError:
        print("UNKNOWN: Error asking livestatus for info, bailing out")
        sys.exit(nplug.UNKNOWN)
    host_on_poller = 0
    svc_on_poller = 0
    for info in pollers:
        # '//' keeps the original Python 2 integer-division semantics:
        # each node's share is its handled total split among its peers
        hhandled = int(info.get('host_checks_handled', 0)) // (int(info.get('configured_peers', 0)) + 1)
        shandled = int(info.get('service_checks_handled', 0)) // (int(info.get('configured_peers', 0)) + 1)
        exp = should[info['name']] = {
            'host': (hhandled - host_dis, hhandled),
            'service': (shandled - svc_dis, shandled),
        }
        host_on_poller += exp['host'][1]
        svc_on_poller += exp['service'][1]
        o.verify_executed_checks(info, exp)
    for info in peers:
        hhandled = int(info.get('host_checks_handled', 0)) // (int(info.get('configured_peers', 0)) + 1)
        shandled = int(info.get('service_checks_handled', 0)) // (int(info.get('configured_peers', 0)) + 1)
        exp = should[info['name']] = {
            'host': (hhandled - host_dis - host_on_poller, hhandled),
            'service': (shandled - svc_dis - svc_on_poller, shandled),
        }
        o.verify_executed_checks(info, exp)
    for name, b in o.bad.items():
        if not len(b):
            continue
        state_str += ("%s runs %d / %d checks (should be %s / %s). " %
                      (name, b['host'], b['service'],
                       _expected_str(should[name]['host']),
                       _expected_str(should[name]['service'])))
    sys.stdout.write("%s: " % nplug.state_name(o.state))
    if not state_str:
        state_str = "All %d nodes run their assigned checks." % len(nodes)
    sys.stdout.write("%s" % state_str.rstrip())
    if print_perfdata:
        print("|%s" % o.pdata.lstrip())
    else:
        sys.stdout.write("\n")
    sys.exit(o.state)