Beispiel #1
0
def cmd_orphans(args=False):
    """[host|service] [--warning=X[%]] [--critical=X[%]] [--maxage=X]
    Checks for checks that haven't been run in too long a time.
      host|service  object type to look at. Default is 'service'
      --warning     WARNING when more than X checks are orphaned
      --critical    CRITICAL when more than X checks are orphaned
      --maxage      seconds past next_check before a check counts as
                    orphaned. Default is 1800. Alias: --max-age
    A threshold with a '%' suffix is a percentage of all checks that
    should be scheduled and are in their check period. Without any
    thresholds, warning is 0.5% and critical is 1% of that total.
    Prints status and perfdata, then exits with the resulting state.
    """
    state = nplug.OK
    otype = 'service'
    warning = critical = 0
    warning_is_pct = critical_is_pct = False
    max_age = 1800
    # 'args or ()' keeps the args=False default from raising TypeError.
    for arg in args or ():
        if arg.startswith('--warning='):
            val = arg.split('=', 1)[1]
            warning_is_pct = '%' in val
            warning = float(val.replace('%', ''))
        elif arg.startswith('--critical='):
            val = arg.split('=', 1)[1]
            critical_is_pct = '%' in val
            critical = float(val.replace('%', ''))
        elif arg.startswith(('--maxage=', '--max-age=')):
            max_age = int(arg.split('=', 1)[1])
        elif arg in ('host', 'service'):
            otype = arg

    now = time.time()
    query = 'GET %ss\nFilter: should_be_scheduled = 1\nFilter: in_check_period = 1\nFilter: next_check < %s\nStats: state != 999'
    try:
        orphans = int(lsc.query_value(query % (otype, now - max_age)))
        # The total is always needed: it is printed below and it is the
        # base for both percentage and default thresholds.
        total = int(
            lsc.query_value(
                'GET %ss\nFilter: should_be_scheduled = 1\nFilter: in_check_period = 1\nStats: state != 999'
                % (otype, )))
    except livestatus.livestatus.MKLivestatusSocketError:
        print("UNKNOWN: Error asking livestatus for info, bailing out")
        sys.exit(nplug.UNKNOWN)

    if not warning and not critical:
        critical = total * 0.01
        warning = total * 0.005
    else:
        if warning_is_pct:
            warning = total * warning / 100.0
        if critical_is_pct:
            critical = total * critical / 100.0
    # Still zero here means total was 0 (or 0% was requested); fall back
    # to a fixed threshold so a single orphan is not yet an alert.
    if not warning and not critical:
        warning = critical = 1

    if orphans > critical:
        state = nplug.CRITICAL
    elif orphans > warning:
        state = nplug.WARNING

    sys.stdout.write("%s: Orphaned %s checks: %d / %d" %
                     (nplug.state_name(state), otype, orphans, total))
    print("|'orphans'=%d;%d;%d;0;%d" % (orphans, warning, critical, total))
    sys.exit(state)
Beispiel #2
0
def cmd_orphans(args=False):
	"""[host|service] [--warning=X[%]] [--critical=X[%]] [--maxage=X]
	Checks for checks that have not been run in too long a time.
	  host|service  object type to look at. Default is 'service'
	  --warning     WARNING when more than X checks are orphaned
	  --critical    CRITICAL when more than X checks are orphaned
	  --maxage      seconds past next_check before a check counts as
	                orphaned. Default is 1800. Alias: --max-age
	A threshold with a '%' suffix is a percentage of all checks that
	should be scheduled and are in their check period. Without any
	thresholds, warning is 0.5% and critical is 1% of that total.
	Prints status and perfdata, then exits with the resulting state.
	"""
	state = nplug.OK
	otype = 'service'
	warning = critical = 0
	warning_is_pct = critical_is_pct = False
	max_age = 1800
	# 'args or ()' keeps the args=False default from raising TypeError.
	for arg in args or ():
		if arg.startswith('--warning='):
			val = arg.split('=', 1)[1]
			warning_is_pct = '%' in val
			warning = float(val.replace('%', ''))
		elif arg.startswith('--critical='):
			val = arg.split('=', 1)[1]
			critical_is_pct = '%' in val
			critical = float(val.replace('%', ''))
		elif arg.startswith(('--maxage=', '--max-age=')):
			max_age = int(arg.split('=', 1)[1])
		elif arg in ('host', 'service'):
			otype = arg

	now = time.time()
	query = 'GET %ss\nFilter: should_be_scheduled = 1\nFilter: in_check_period = 1\nFilter: next_check < %s\nStats: state != 999'
	try:
		orphans = int(lsc.query_value(query % (otype, now - max_age)))
		# The total is always needed: it is printed below and it is the
		# base for both percentage and default thresholds.
		total = int(lsc.query_value('GET %ss\nFilter: should_be_scheduled = 1\nFilter: in_check_period = 1\nStats: state != 999' % (otype,)))
	except livestatus.livestatus.MKLivestatusSocketError:
		print("UNKNOWN: Error asking livestatus for info, bailing out")
		sys.exit(nplug.UNKNOWN)

	if not warning and not critical:
		critical = total * 0.01
		warning = total * 0.005
	else:
		if warning_is_pct:
			warning = total * warning / 100.0
		if critical_is_pct:
			critical = total * critical / 100.0
	# Still zero here means total was 0 (or 0% was requested); fall back
	# to a fixed threshold so a single orphan is not yet an alert.
	if not warning and not critical:
		warning = critical = 1

	if orphans > critical:
		state = nplug.CRITICAL
	elif orphans > warning:
		state = nplug.WARNING

	sys.stdout.write("%s: Orphaned %s checks: %d / %d" % (nplug.state_name(state), otype, orphans, total))
	print("|'orphans'=%d;%d;%d;0;%d" % (orphans, warning, critical, total))
	sys.exit(state)
Beispiel #3
0
def cmd_cores(args=False):
	"""--warning=X --critical=X [--dir=]
	Checks for memory dumps resulting from segmentation violation from
	core parts of op5 Monitor. Detected core-files are moved to
	/tmp/mon-cores in order to keep working directories clean.
	  --warning  default is 0
	  --critical default is 1 (any corefile results in a critical alert)
	  --dir      lets you specify more paths to search for corefiles. This
	             option can be given multiple times.
	  --delete   deletes corefiles not coming from 'merlind' or 'monitor'
	             (short form: -D)
	  --debug    prints why a corefile was considered invalid instead of
	             deleting it (short form: -d)
	"""
	warn = 0
	crit = 1
	dirs = ['/opt/monitor', '/opt/monitor/op5/merlin']
	delete = False
	debug = False
	# 'args or ()' keeps the args=False default from raising TypeError.
	for arg in args or ():
		if arg.startswith('--warning='):
			warn = int(arg.split('=', 1)[1])
		elif arg.startswith('--critical='):
			crit = int(arg.split('=', 1)[1])
		elif arg.startswith('--dir='):
			dirs.append(arg.split('=', 1)[1])
		elif arg in ('--delete', '-D'):
			delete = True
		elif arg in ('--debug', '-d'):
			debug = True
		else:
			nplug.unknown("Unknown argument: %s" % arg)

	# Raw string: '\.' is a regex escape, not a Python string escape.
	core_pattern = r'^core\..*'
	result = []
	for d in dirs:
		get_files(d, core_pattern, result)

	# NOTE(review): nothing in this function moves files to
	# /tmp/mon-cores; presumably coredump.examine() does -- confirm.
	cores = 0
	for corefile in result:
		core = coredump(corefile)
		core.examine()
		if core.invalid:
			# Invalid cores never count. In debug mode they are reported
			# and kept, otherwise --delete removes them.
			if debug:
				print("Core is invalid: %s" % core.invalid_str())
			elif delete:
				try:
					os.unlink(corefile)
				except OSError:
					# best effort: an undeletable core must not fail the check
					pass
			continue
		cores += 1

	if not cores:
		# '(valid) ' signals that files matched but none was a valid core
		valid = '(valid) ' if result else ''
		# NOTE(review): nplug.ok() presumably prints and exits -- confirm.
		nplug.ok("No %scorefiles found|cores=0;%d;%d;;" % (valid, warn, crit))

	state = nplug.STATE_OK
	if cores >= crit:
		state = nplug.STATE_CRITICAL
	elif cores >= warn:
		state = nplug.STATE_WARNING
	print("%s: %d corefiles found" % (nplug.state_name(state), cores))
	sys.exit(state)
Beispiel #4
0
def check_min_avg_max(args, col, defaults=False, filter=False):
	"""Check the min/avg/max of a column against three-part thresholds.

	Prints a status line with perfdata and exits with the resulting
	state; it does not return.

	args      list of command line arguments. Accepted are 'host' or
	          'service' (one is required) and --warning=MIN,AVG,MAX /
	          --critical=MIN,AVG,MAX.
	col       column name, passed to merlin_status.min_avg_max() and
	          used in the output and perfdata labels.
	defaults  optional dict with 'warning' and/or 'critical' keys, used
	          for thresholds not given in args. Values are indexed in
	          min, avg, max order.
	filter    livestatus filter text passed to min_avg_max(). When left
	          at False, objects that should be scheduled and have
	          active checks enabled are selected.
	"""
	order = ['min', 'avg', 'max']
	thresh = {}
	otype = False
	# defaults=False is the "no defaults" marker; normalise it so the
	# lookups below do not fail on a bool.
	if not defaults:
		defaults = {}

	mst = merlin_status(lsc, qh)
	if filter is False:
		filter = 'Filter: should_be_scheduled = 1\nFilter: active_checks_enabled = 1\nAnd: 2\n'

	for arg in args:
		if arg.startswith(('--warning=', '--critical=')):
			what, thvals = arg[2:].split('=', 1)
			thvals = thvals.split(',')
			if len(thvals) != 3:
				nplug.unknown("Bad argument: %s" % arg)
			try:
				thresh[what] = [float(th) for th in thvals]
			except ValueError:
				# non-numeric threshold value
				nplug.unknown("Bad argument: %s" % arg)
		elif arg in ('host', 'service'):
			otype = arg
		else:
			nplug.unknown("Unknown argument: %s" % arg)

	for t in ('critical', 'warning'):
		if not thresh.get(t) and defaults.get(t):
			thresh[t] = defaults[t]

	if not otype:
		nplug.unknown("Need 'host' or 'service' as argument")

	# Without both threshold sets the comparison below cannot run.
	for t in ('critical', 'warning'):
		if not thresh.get(t):
			nplug.unknown("No %s thresholds given" % t)

	try:
		values = mst.min_avg_max(otype, col, filter)
	except livestatus.livestatus.MKLivestatusSocketError:
		print("UNKNOWN: Error asking livestatus for info, bailing out")
		sys.exit(nplug.STATE_UNKNOWN)

	state = nplug.STATE_OK
	perfdata_prefix = "%s_%s_" % (otype, col)
	perf = []
	for i, o in enumerate(order):
		cval = thresh['critical'][i]
		wval = thresh['warning'][i]
		value = values[o]
		# each entry keeps its leading space, as the output always had
		perf.append(" '%s%s'=%.3f;%.3f;%.3f;0;" % (
			perfdata_prefix, o, value, wval, cval))
		if value >= cval:
			state = nplug.STATE_CRITICAL
		elif value >= wval and state != nplug.STATE_CRITICAL:
			# a warning must never downgrade an earlier critical
			state = nplug.STATE_WARNING
	perfdata = ''.join(perf)
	print("%s: %s %s min/avg/max = %.2f/%.2f/%.2f|%s" %
		(nplug.state_name(state), otype, col, values['min'], values['avg'], values['max'], perfdata))
	sys.exit(state)
Beispiel #5
0
def cmd_distribution(args):
    """[--no-perfdata]
    Checks to make sure work distribution works ok. Note that it's
    not expected to work properly the first couple of minutes after
    a new machine has been brought online or taken offline
      --no-perfdata  leaves the performance data out of the output
    """

    print_perfdata = '--no-perfdata' not in args

    total_checks = {
        'host': mst.num_entries('host'),
        'service': mst.num_entries('service'),
    }

    nodes = mst.status()
    if not nodes:
        print("UNKNOWN: No hosts found at all")
        sys.exit(nplug.UNKNOWN)
    masters = [n for n in nodes if n['type'] == 'master']
    peers = [n for n in nodes if n['type'] in ('peer', 'local')]
    pollers = [n for n in nodes if n['type'] == 'poller']

    # node name -> {'host': (low, high), 'service': (low, high)}
    should = {}

    class check_objs(object):
        """Collects perfdata, overall state and misbehaving nodes."""

        def __init__(self):
            # per-instance state instead of shared class attributes
            self.pdata = ''
            self.bad = {}
            self.state = nplug.OK

        def is_bad(self, actual, exp):
            return actual < exp[0] or actual > exp[1]

        def verify_executed_checks(self, info, exp):
            for ctype, num in exp.items():
                actual = int(info[ctype + '_checks_executed'])
                ok_str = "%d:%d" % (num[0], num[1])
                self.pdata += (" '%s_%ss'=%d;%s;%s;0;%d" %
                               (info['name'], ctype, actual, ok_str, ok_str,
                                total_checks[ctype]))
                if self.is_bad(actual, num):
                    self.state = nplug.STATE_CRITICAL
                    # stored as ints: they are formatted with %d below
                    self.bad[info['name']] = {
                        'host': int(info['host_checks_executed']),
                        'service': int(info['service_checks_executed']),
                    }

    def handled_share(info, key):
        # handled checks divided by (configured peers + 1)
        return int(info.get(key, 0)) // (
            int(info.get('configured_peers', 0)) + 1)

    def span(bounds):
        # a single number when both bounds agree (including 0),
        # otherwise "low-high"
        low, high = bounds
        if low == high:
            return low
        return "%s-%s" % (low, high)

    o = check_objs()

    # masters are expected to run no checks at all
    for info in masters:
        exp = should[info['name']] = {'host': (0, 0), 'service': (0, 0)}
        o.verify_executed_checks(info, exp)

    try:
        # objects with active checks disabled widen the accepted range
        host_dis = int(lsc.query_value(
            'GET hosts\nFilter: active_checks_enabled = 0\nStats: state != 999'
        ))
        svc_dis = int(lsc.query_value(
            'GET services\nFilter: active_checks_enabled = 0\nStats: state != 999'
        ))
    except livestatus.livestatus.MKLivestatusSocketError:
        print("UNKNOWN: Error asking livestatus for info, bailing out")
        sys.exit(nplug.UNKNOWN)

    host_on_poller = 0
    svc_on_poller = 0

    for info in pollers:
        hhandled = handled_share(info, 'host_checks_handled')
        shandled = handled_share(info, 'service_checks_handled')
        exp = should[info['name']] = {
            'host': (hhandled - host_dis, hhandled),
            'service': (shandled - svc_dis, shandled)
        }
        host_on_poller += exp['host'][1]
        svc_on_poller += exp['service'][1]
        o.verify_executed_checks(info, exp)

    # pollers must be summed up first: their share lowers the minimum
    # accepted for peers
    for info in peers:
        hhandled = handled_share(info, 'host_checks_handled')
        shandled = handled_share(info, 'service_checks_handled')
        exp = should[info['name']] = {
            'host': (hhandled - host_dis - host_on_poller, hhandled),
            'service': (shandled - svc_dis - svc_on_poller, shandled),
        }
        o.verify_executed_checks(info, exp)

    state_str = ""
    for name, b in o.bad.items():
        state_str += (
            "%s runs %d / %d checks (should be %s / %s). " %
            (name, b['host'], b['service'],
             span(should[name]['host']), span(should[name]['service'])))

    sys.stdout.write("%s: " % nplug.state_name(o.state))
    if not state_str:
        state_str = "All %d nodes run their assigned checks." % len(nodes)
    sys.stdout.write("%s" % state_str.rstrip())
    if print_perfdata:
        print("|%s" % o.pdata.lstrip())
    else:
        sys.stdout.write("\n")
    sys.exit(o.state)
Beispiel #6
0
def cmd_distribution(args):
	"""[--no-perfdata]
	Checks to make sure work distribution works ok. Note that it's
	not expected to work properly the first couple of minutes after
	a new machine has been brought online or taken offline
	  --no-perfdata  leaves the performance data out of the output
	"""

	print_perfdata = '--no-perfdata' not in args

	total_checks = {
		'host': mst.num_entries('host'),
		'service': mst.num_entries('service'),
	}

	nodes = mst.status()
	if not nodes:
		print("UNKNOWN: No hosts found at all")
		sys.exit(nplug.UNKNOWN)
	masters = [n for n in nodes if n['type'] == 'master']
	peers = [n for n in nodes if n['type'] in ('peer', 'local')]
	pollers = [n for n in nodes if n['type'] == 'poller']

	# node name -> {'host': (low, high), 'service': (low, high)}
	should = {}

	class check_objs(object):
		"""Collects perfdata, overall state and misbehaving nodes."""

		def __init__(self):
			# per-instance state instead of shared class attributes
			self.pdata = ''
			self.bad = {}
			self.state = nplug.OK

		def is_bad(self, actual, exp):
			return actual < exp[0] or actual > exp[1]

		def verify_executed_checks(self, info, exp):
			for ctype, num in exp.items():
				actual = int(info[ctype + '_checks_executed'])
				ok_str = "%d:%d" % (num[0], num[1])
				self.pdata += (" '%s_%ss'=%d;%s;%s;0;%d" %
					(info['name'], ctype, actual, ok_str, ok_str, total_checks[ctype]))
				if self.is_bad(actual, num):
					self.state = nplug.STATE_CRITICAL
					# stored as ints: they are formatted with %d below
					self.bad[info['name']] = {
						'host': int(info['host_checks_executed']),
						'service': int(info['service_checks_executed']),
					}

	def handled_share(info, key):
		# handled checks divided by (configured peers + 1)
		return int(info.get(key, 0)) // (int(info.get('configured_peers', 0)) + 1)

	def span(bounds):
		# a single number when both bounds agree (including 0),
		# otherwise "low-high"
		low, high = bounds
		if low == high:
			return low
		return "%s-%s" % (low, high)

	o = check_objs()

	# masters are expected to run no checks at all
	for info in masters:
		exp = should[info['name']] = {'host': (0,0), 'service': (0,0)}
		o.verify_executed_checks(info, exp)

	try:
		# objects with active checks disabled widen the accepted range
		host_dis = int(lsc.query_value('GET hosts\nFilter: active_checks_enabled = 0\nStats: state != 999'))
		svc_dis = int(lsc.query_value('GET services\nFilter: active_checks_enabled = 0\nStats: state != 999'))
	except livestatus.livestatus.MKLivestatusSocketError:
		print("UNKNOWN: Error asking livestatus for info, bailing out")
		sys.exit(nplug.UNKNOWN)

	host_on_poller = 0
	svc_on_poller = 0

	for info in pollers:
		hhandled = handled_share(info, 'host_checks_handled')
		shandled = handled_share(info, 'service_checks_handled')
		exp = should[info['name']] = {
			'host': (hhandled - host_dis, hhandled),
			'service': (shandled - svc_dis, shandled)
		}
		host_on_poller += exp['host'][1]
		svc_on_poller += exp['service'][1]
		o.verify_executed_checks(info, exp)

	# pollers must be summed up first: their share lowers the minimum
	# accepted for peers
	for info in peers:
		hhandled = handled_share(info, 'host_checks_handled')
		shandled = handled_share(info, 'service_checks_handled')
		exp = should[info['name']] = {
			'host': (hhandled - host_dis - host_on_poller, hhandled),
			'service': (shandled - svc_dis - svc_on_poller, shandled),
		}
		o.verify_executed_checks(info, exp)

	state_str = ""
	for name, b in o.bad.items():
		state_str += ("%s runs %d / %d checks (should be %s / %s). " %
			(name, b['host'], b['service'],
			span(should[name]['host']), span(should[name]['service'])))

	sys.stdout.write("%s: " % nplug.state_name(o.state))
	if not state_str:
		state_str = "All %d nodes run their assigned checks." % len(nodes)
	sys.stdout.write("%s" % state_str.rstrip())
	if print_perfdata:
		print("|%s" % o.pdata.lstrip())
	else:
		sys.stdout.write("\n")
	sys.exit(o.state)