def daemonize(pid_file, cwd="/"):
    """Background the current process.

    Logs a debug message and opens ``pid_file`` for writing.  If the file
    cannot be opened the failure is logged and the process exits with
    status 1.

    ``cwd`` is accepted but not used by the code shown here.
    """
    log.debug("daemonizing process")

    # BROKEN: the pid file may already have been created by write_pid.
    # Daemon mode is not in use at the moment, so the "already running"
    # check below stays disabled.  It also has a major race condition.
    #
    #try:
    #    # A trivial check to see if we are already running
    #    pidfd = open(pid_file)
    #    pid = int(pidfd.readline().strip())
    #    pidfd.close()
    #    os.kill(pid, 0)
    #except (IOError, OSError):
    #    pass # Assume all is well if the test raised errors
    #else:
    #    log.error("PID file exists and process %s is running!" % pid)
    #    sys.exit(1)

    try:
        pidfd = open(pid_file, 'w')
    except IOError as err:
        # Two separate log lines: first the path, then the reason.
        log.error("Failed to open PID file %s" % pid_file)
        log.error("Error: %s" % (err,))
        sys.exit(1)
def init(options):
    """Prepare to start up NagCat.

    Applies the process settings from ``options``, optionally writes the
    PID file, initialises logging, parses the coil config and loads the
    plugins, then constructs either the single-test or the Nagios-driven
    NagCat object.  Initialisation and coil errors raised while
    constructing that object are logged and end the process with status 1.
    """
    # Set uid/gid/file_limit
    util.setup(options.user, options.group,
               options.file_limit, options.core_dumps)

    # Write out the pid to make the verify script happy
    if options.pidfile:
        util.write_pid(options.pidfile)

    log.init(options.logfile, options.loglevel)
    config = coil.parse_file(options.config, expand=False)
    init_plugins(options)

    try:
        # Keyword arguments common to both front ends.
        shared = {
            'rradir': options.rradir,
            'rrdcache': options.rrdcache,
            'monitor_port': options.status_port,
        }
        if options.test:
            nagcat = simple.NagcatSimple(
                config,
                test_name=options.test,
                host=options.host,
                port=options.port,
                **shared)
        else:
            nagcat = nagios.NagcatNagios(
                config,
                nagios_cfg=options.nagios,
                tag=options.tag,
                **shared)
    except (errors.InitError, coil.errors.CoilError) as err:
        log.error(str(err))
        sys.exit(1)
def _set_peer_id_and_timestamp(self):
    """Read this node's peer_id from the Merlin database and timestamp it.

    Connects using the ``merlin_db_user``, ``merlin_db_host``,
    ``merlin_db_pass`` and ``merlin_db_name`` entries of
    ``self._merlin_db_info`` and selects every ``merlin_peers`` row with
    ``state=3``.

    Side effects:
      * ``self._num_peers`` is set to the row count reported by the query.
      * For a row whose first column is ``"localhost"``, ``self._peer_id``
        is set from the sixth column and ``self._peer_id_timestamp`` is
        set to the current time.

    ``MySQLdb.Error`` is logged and swallowed; attributes that had not
    been assigned before the error keep their previous values.  The
    connection is always closed once it has been opened.
    """
    try:
        db = MySQLdb.connect(
            user=self._merlin_db_info['merlin_db_user'],
            host=self._merlin_db_info['merlin_db_host'],
            passwd=self._merlin_db_info['merlin_db_pass'],
            db=self._merlin_db_info['merlin_db_name'])
        try:
            curs = db.cursor()
            num_rows = curs.execute(
                """select * from merlin_peers where state=3;""")
            self._num_peers = num_rows
            log.debug("Setting self._num_peers = %s", self._num_peers)
            # NOTE(review): columns are addressed by position because the
            # query is "select *"; column 0 is compared against the host
            # name and column 5 is used as the peer id.
            for row in curs.fetchall():
                if row[0] == "localhost":
                    self._peer_id = row[5]
                    self._peer_id_timestamp = time.time()
                    # Pass one format string plus its arguments; the old
                    # code added two tuples together and logged the tuple.
                    log.debug(
                        "Setting self._peer_id = %s "
                        "and self._peer_id_timestamp = %s",
                        self._peer_id, self._peer_id_timestamp)
        finally:
            # Do not leak a connection on every refresh.
            db.close()
    except MySQLdb.Error as e:
        # Not every MySQLdb error carries an (errno, message) pair.
        if len(e.args) >= 2:
            log.error("Error reading merlin db %s: %s"
                      % (e.args[0], e.args[1]))
        else:
            log.error("Error reading merlin db: %s" % (e,))
def _status(self, object_types=(), object_select=()):
    """Parse the Nagios status file at ``self._status_file``.

    ``object_types`` and ``object_select`` are handed to
    ``nagios_objects.ObjectParser`` unchanged.  If parsing fails with
    ``errors.InitError`` the problem is logged and an ``xmlrpc.Fault``
    with code 1 is raised instead.
    """
    try:
        stat = nagios_objects.ObjectParser(
            self._status_file,
            object_types,
            object_select,
        )
    except errors.InitError as err:
        message = "Failed to parse Nagios status file: %s" % err
        log.error(message)
        # The fault text is deliberately generic; details go to the log.
        raise xmlrpc.Fault(1, "Failed to read Nagios status")
def _cleanup_spool(self):
    """Periodically clean up old things in the spool dir.

    This shouldn't normally be required but if things get screwed up
    we don't want the directory to get so huge that it keeps things
    slow after nagios is handling results again.

    Every entry of ``self.spool_dir`` whose mtime is more than five
    minutes old is unlinked.  Entries that cannot be stat'ed are skipped;
    entries that cannot be removed are logged and left in place.
    """
    # Note: It is entirely possible that the command to submit
    # this file is still in the writer queue, if that's the case
    # nagios will also log an error when it gets around to
    # reading from the queue.

    # Set the threshold to 5 minutes ago, if nagios hasn't been
    # able to keep up for the past 5 minutes we have problems.
    threshold = time.time() - 300
    # Number of entries removed so far.
    # NOTE(review): nothing in this block reads the counter afterwards.
    count = 0
    for item in os.listdir(self.spool_dir):
        path = "%s/%s" % (self.spool_dir, item)
        try:
            info = os.stat(path)
        except OSError:
            # The entry may disappear between listdir() and stat().  Only
            # filesystem errors are skipped; a bare except here would also
            # swallow KeyboardInterrupt and programming errors.
            continue
        if info.st_mtime < threshold:
            try:
                os.unlink(path)
            except OSError as ex:
                log.error("Failed to remove %s: %s" % (path, ex))
            else:
                count += 1
def do_shutdown():
    """Cancel the pending timer and run one final forced command.

    Any exception raised by the final ``do_cmd(force=True)`` call is
    captured as a ``Failure`` and logged rather than propagated.
    """
    calls['timer'].cancel()
    # This should not abort the daemon shutdown, so everything is caught
    # here on purpose.
    try:
        do_cmd(force=True)
    except:
        # Failure() with no arguments captures the exception currently
        # being handled.  The class is ``Failure``; the lowercase name
        # used before does not exist and raised AttributeError from inside
        # this handler.
        fail = failure.Failure()
        log.error(fail)
def write_pid(pid_file):
    """Write out the current PID.

    ``pid_file`` is truncated and rewritten with the PID of the running
    process followed by a newline.  If the file cannot be opened the
    failure is logged and the process exits with status 1.
    """
    import os

    try:
        pidfd = open(pid_file, 'w')
    except IOError as ex:
        log.error("Failed to open PID file %s" % pid_file)
        log.error("Error: %s" % (ex,))
        sys.exit(1)

    # Actually record the PID and release the handle; previously the file
    # was opened (and truncated) but nothing was written to it.
    try:
        pidfd.write("%s\n" % os.getpid())
    finally:
        pidfd.close()
def setup(user=None, group=None, file_limit=None, core_dumps=None):
    """Set the process's user, group, and file limits.

    When ``file_limit`` is truthy it is applied as both the soft and the
    hard ``RLIMIT_NOFILE`` value.  A ``ValueError`` from ``setrlimit`` is
    logged and ends the process with status 1.

    ``user``, ``group`` and ``core_dumps`` are accepted but not acted on
    by the code shown here.
    """
    if file_limit:
        # Same value for the soft and the hard limit.
        limits = (file_limit, file_limit)
        try:
            resource.setrlimit(resource.RLIMIT_NOFILE, limits)
        except ValueError as err:
            log.error("Failed to set limit on open files: %s" % err)
            sys.exit(1)
def stop(result):
    """Stop the reactor and record the exit code for ``result``.

    A ``Failure`` result is logged and sets ``exit_code[0]`` to 1; any
    other result sets it to 0.  For a ``NotificationError`` only the
    error's own text is logged, otherwise the whole failure is.
    """
    reactor.stop()

    failed = isinstance(result, failure.Failure)
    if failed:
        if isinstance(result.value, NotificationError):
            message = str(result.value)
        else:
            message = str(result)
        log.error(message)

    exit_code[0] = 1 if failed else 0
def latency(self, last):
    """Record how late this once-a-second callback fired.

    ``last`` is the ``time.time()`` value captured by the previous run.
    The method re-arms itself for one second later, appends the measured
    delay (time elapsed beyond the expected one second) to
    ``self._latency`` and logs it: as an error above 5 seconds, as a
    warning above 1.5 seconds.
    """
    now = time.time()
    # Re-arm before measuring, handing the current time to the next run.
    self._latency_call = reactor.callLater(1.0, self.latency, now)

    lag = now - last - 1.0
    self._latency.append(lag)

    if lag > 5.0:
        report = log.error
    elif lag > 1.5:
        report = log.warn
    else:
        report = None

    if report is not None:
        report("Callback latency: %s" % lag)
def schedule(self, runnable, delay=None):
    """(re)schedule a top level runnable.

    ``delay`` defaults to ``runnable.repeat``.  With a falsy delay nothing
    is scheduled and an error is logged.  Otherwise ``runnable.start`` is
    deferred by ``delay`` seconds and, whether it succeeds or fails, the
    runnable is scheduled again using its own repeat value.
    """
    if delay is None:
        delay = runnable.repeat

    if delay:
        log.debug("Scheduling %s in %s seconds.", runnable, delay)

        def reschedule(_result):
            # The outcome of the run is ignored; just queue the next one.
            return self.schedule(runnable)

        pending = task.deferLater(reactor, delay, runnable.start)
        pending.addBoth(reschedule)
    else:
        log.error("Task %s has no repeat value.", runnable)
def _done(self, result):
    """Save the result, log unhandled errors.

    Stores ``result`` on the instance, stamps ``self.lastrun`` with the
    current time and clears ``self.deferred``.  A ``Failure`` that does
    not wrap a ``TestError`` is logged as an unhandled error; one that
    does is only logged (as a warning) when it carries a traceback.
    """
    log.debug("Stopping %s", self)
    log.debug("Result: %s", result)

    self.result = result
    self.lastrun = time.time()
    self.deferred = None

    if isinstance(result, failure.Failure):
        if not isinstance(result.value, errors.TestError):
            log.error("Unhandled error in %s:\n%s" %
                      (self, result.getTraceback()))
        elif result.tb is not None:
            log.warn("TestError with a traceback in %s:\n%s" %
                     (self, result.getTraceback()))
def _failure_tcp(self, result):
    """Catch common TCP failures and convert them to a TestError.

    Timeouts, refused connections and lost connections raise
    ``errors.TestCritical`` with a fixed message.  Any other
    ``ConnectError`` raises ``TestCritical`` with the error text, except
    for ``EMFILE`` which is logged and raised as ``errors.TestAbort``.
    Everything else is returned unchanged.
    """
    error = result.value

    # Checked in this order, before the generic ConnectError case below.
    specific = (
        (neterror.TimeoutError, "TCP handshake timeout"),
        (neterror.ConnectionRefusedError, "TCP connection refused"),
        (neterror.ConnectionLost, "TCP connection lost unexpectedly"),
    )
    for error_type, message in specific:
        if isinstance(error, error_type):
            raise errors.TestCritical(message)

    if isinstance(error, neterror.ConnectError):
        if error.osError == errno.EMFILE:
            log.error("Too many open files! Restart with a new ulimit -n")
            raise errors.TestAbort("NAGCAT ERROR: %s" % error)
        raise errors.TestCritical("TCP error: %s" % error)

    return result
def main():
    """Start up the NagCat monitor/API site.

    Parses the options, initialises logging, optionally writes the PID
    file and builds a ``MonitorSite`` with a ``nagios`` status resource.
    Unless ``options.read_only`` is set, an XML-RPC resource is mounted
    at ``RPC2`` as well; if it fails to initialise the error is logged
    and the process exits with status 1.
    """
    options = parse_options()
    log.init(options.logfile, options.loglevel)

    if options.pidfile:
        util.write_pid(options.pidfile)

    site = monitor_api.MonitorSite()
    status_resource = nagios_api.NagiosStatus(options.nagios)
    site.root.putChild("nagios", status_resource)

    if not options.read_only:
        try:
            rpc = nagios_api.NagiosXMLRPC(options.nagios)
        except errors.InitError as err:
            log.error(str(err))
            sys.exit(1)
        site.root.putChild("RPC2", rpc)
def build_tests(self, templates, tag=None):
    """Setup tests based on the loaded Nagios config.

    For every (defaults, overrides) pair produced by
    ``self._parse_tests(tag)`` the template named by ``overrides['test']``
    is looked up in ``templates``, copied, filled in with the defaults
    (without replacing existing keys) and then with the overrides (which
    always win), and passed to ``self.new_test``.

    Raises ``errors.InitError`` when the template is missing or when
    building the test fails for any reason.
    """
    skels = self._parse_tests(tag)
    tests = []

    for test_defaults, test_overrides in skels:
        template_name = test_overrides['test']
        template = templates.get(template_name, None)
        if template is None:
            raise errors.InitError(
                "Test template '%s' not found in config!" % template_name)

        # Copy the config so we can add instance specific values
        # such as host, port, etc.
        testconf = template.copy()

        for name, value in test_defaults.iteritems():
            testconf.setdefault(name, value)

        for name, value in test_overrides.iteritems():
            testconf[name] = value

        try:
            testobj = self.new_test(testconf)
        except (errors.InitError, CoilError) as err:
            raise errors.InitError(
                "Error in test %s: %s" % (test_overrides['test'], err))
        except Exception:
            # Anything unexpected: dump as much context as possible to
            # the log before turning it into an InitError.
            log.error("Unknown error while loading test.")
            log.error("Test config: %s" % repr(testconf))
            log.error(str(errors.Failure()))
            raise errors.InitError(
                "Error in test %s" % test_overrides['test'])
def setup(user=None, group=None, file_limit=None, core_dumps=None): """Set the processes user, group, and file limits""" if file_limit: try: resource.setrlimit(resource.RLIMIT_NOFILE, (file_limit, file_limit)) except ValueError, ex: log.error("Failed to set limit on open files: %s" % ex) sys.exit(1) if group: if not group.isdigit(): try: group = grp.getgrnam(group)[2] except KeyError: log.error("Unknown group '%s'" % group) sys.exit(1) else: group = int(group) try: os.setregid(group, group) except OSError, ex: log.error("Failed to set gid: %s" % ex) sys.exit(1) if user: if not user.isdigit(): try: user = pwd.getpwnam(user)[2] except KeyError:
log.init_stdio()

# Build the effective configuration.  Sources are merged in this order:
# the built-in DEFAULT_CONFIG, then the notification method's defaults,
# then the config file named on the command line (if any).
try:
    config = coil.parse(DEFAULT_CONFIG)
    if method.defaults:
        # Defaults given as a string are parsed as coil text; anything
        # else is wrapped in a coil Struct directly.
        if isinstance(method.defaults, str):
            config.merge(coil.parse(method.defaults))
        else:
            config.merge(coil.struct.Struct(method.defaults))
    if options.config:
        config.merge(coil.parse_file(options.config))
except coil.errors.CoilError, ex:
    log.error("Error parsing config: %s" % ex)
    sys.exit(1)
except IOError, ex:
    log.error("Error reading config file: %s" % ex)
    sys.exit(1)

# With the dump option, print the merged configuration and stop.
if options.dump:
    print str(config)
    sys.exit(0)

# Macros are built from the process environment; an empty result is
# treated as a fatal error.
macros = Macros(os.environ)
if not macros:
    log.error("No Nagios environment variables found.")
    sys.exit(1)

# NOTE(review): event_type is left unassigned when neither options.host
# nor options.service is set -- confirm this is handled elsewhere.
if options.host:
    event_type = "host"
elif options.service:
    event_type = "service"
def _report(self, result):
    """Generate a report of the final result, pass that report off
    to all registered report callbacks. (ie nagios reporting)

    ``result`` is either a ``failure.Failure`` or the successful output
    of the test.  The returned ``report`` dict carries the keys listed in
    the literal below plus ``'text'``, which is the OK or BAD template
    rendered with that same dict.

    Callbacks in ``self._report_callbacks`` are only invoked while the
    reactor is running; an exception raised by a callback is logged and
    does not prevent the remaining callbacks from running.
    """

    def indent(string, prefix=" "):
        # Prefix every non-blank line; blank lines are kept but left
        # without a prefix.  The result always ends with a newline per
        # input line.
        ret = ""
        for line in string.splitlines():
            if line.strip():
                line = prefix+line
            ret += line+'\n'
        return ret

    # Choose what to report at the main result
    if isinstance(result, failure.Failure):
        if isinstance(result.value, ChildError):
            # A child failed, find the worst failure
            level = -1
            failed = None
            for subtest in self._subtests.itervalues():
                if isinstance(subtest.result, failure.Failure):
                    if isinstance(subtest.result.value, errors.TestError):
                        # UNKNOWN beats CRITICAL
                        # CRITICAL beats WARNING
                        # but an OK assertion beats everything.
                        if (subtest.result.value.index > level or
                                subtest.result.value.state == "OK"):
                            level = subtest.result.value.index
                            failed = subtest.result
                    else:
                        # Unknown error, just use it
                        failed = subtest.result
                        # (the trailing semicolon is harmless)
                        break;
            # A ChildError implies at least one failed subtest was found.
            assert failed is not None
        else:
            failed = result

        # Only the project's own Failure type carries a ``result``
        # attribute; NO_RESULT marks "nothing was produced".
        if (isinstance(failed, errors.Failure) and
                failed.result is not errors.NO_RESULT):
            output = failed.result
        else:
            output = ""

        if isinstance(failed.value, errors.TestError):
            state = failed.value.state
        else:
            state = "UNKNOWN"

        # NOTE(review): assumed to apply to every failure state, not just
        # to the UNKNOWN fallback above -- confirm.
        state = self._apply_time_limit(state)
        error = str(failed.value)
        summary = error
    else:
        output = result
        state = "OK"
        error = ""
        summary = result

    # Grab the first 40 characters of the first line
    if summary:
        summary = summary.split('\n', 1)[0][:40]

    # NOTE(review): assumed to be independent of the summary check above
    # -- confirm.
    if state == "OK" and self.label:
        summary = "%s %s" % (summary, self.label)

    # Fill in the Extra Output area and all valid values
    extra = ""
    results = {}
    for subname, subtest in self._subtests.iteritems():
        subextra = ""
        for savedname, savedval in subtest.saved.iteritems():
            subextra += " %s:\n" % savedname
            subextra += indent(str(savedval), " "*8)

        if isinstance(subtest.result, failure.Failure):
            # Failed subtests contribute an empty value to 'results'.
            results[subname] = ""

            if (isinstance(subtest.result, errors.Failure) and
                    subtest.result.result is not errors.NO_RESULT):
                subout = str(subtest.result.result)
            else:
                subout = ""

            if isinstance(subtest.result.value, errors.TestError):
                suberr = str(subtest.result.value)
            else:
                suberr = str(subtest.result)
        else:
            results[subname] = subtest.result
            subout = str(subtest.result)
            suberr = ""

        # Skip output/error text that merely repeats the main report.
        if subout and subout != output:
            subextra += " Output:\n"
            subextra += indent(subout, " "*8)

        if suberr and suberr != error:
            subextra += " Error:\n"
            subextra += indent(suberr, " "*8)

        if subextra:
            extra += indent("%s:\n%s" % (subname, subextra))

    assert state in STATES

    if state == "OK":
        text = TEMPLATE_OK
    else:
        text = TEMPLATE_BAD

    report = {
        'test': self._test,
        'state': state,
        'state_id': STATES.index(state),
        'summary': summary,
        'output': output,
        'error': error,
        'extra': extra,
        'host': self.host,
        'addr': self.addr,
        'port': self._port,
        'time': self._now,
        'documentation': self._documentation,
        'investigation': self._investigation,
        'priority': self._priority,
        'url': self._url,
        'results': results,
    }

    # The template is rendered from the dict above, so 'text' itself is
    # not available to the template.
    text = text % report
    report['text'] = text

    # Don't fire callbacks (which write out to stuff) during shutdown
    if reactor.running:
        for (func, args, kwargs) in self._report_callbacks:
            try:
                func(report, *args, **kwargs)
            except:
                # Failure() captures the exception being handled.
                log.error("Report callback failed: %s" % failure.Failure())

    return report
def main():
    """Entry point: parse options, optionally detach, load the config.

    Unless a config dump was requested, ``options.daemonize`` makes the
    process fork, change directory to ``/``, start a new session and fork
    again; each parent exits immediately with status 0.  The configuration
    is then assembled from ``DEFAULT_CONFIG``, the method's defaults and
    the optional config file.  A coil error is logged and ends the process
    with status 1.
    """
    options, method = parse_options()
    log.init(options.logfile, options.loglevel)

    def fork_and_leave_parent():
        # Only the child returns from this call.
        if os.fork() > 0:
            os._exit(0)

    if not options.dump and options.daemonize:
        fork_and_leave_parent()
        os.chdir("/")
        os.setsid()
        fork_and_leave_parent()
        # NOTE(review): assumed to belong to the daemonize branch --
        # confirm.
        log.init_stdio()

    try:
        config = coil.parse(DEFAULT_CONFIG)

        defaults = method.defaults
        if defaults:
            # String defaults are coil text; anything else is wrapped in
            # a Struct as-is.
            if isinstance(defaults, str):
                overlay = coil.parse(defaults)
            else:
                overlay = coil.struct.Struct(defaults)
            config.merge(overlay)

        if options.config:
            config.merge(coil.parse_file(options.config))
    except coil.errors.CoilError as err:
        log.error("Error parsing config: %s" % err)
        sys.exit(1)