Example #1
0
def daemonize(pid_file, cwd="/"):
    """Background the current process.

    The visible part of this function only opens the PID file for
    writing; if that fails the error is logged and the process exits
    with status 1.

    pid_file -- path of the PID file to open.
    cwd -- not referenced in the visible part of this function.
    """

    log.debug("daemonizing process")

    # BROKEN: the pid file may have already been created by write_pid
    # however, I'm not even using nagcat in daemon mode right now so
    # I'll just leave this commented out for now...
    # Also, this has a major race condition...
    #try:
    #    # A trivial check to see if we are already running
    #    pidfd = open(pid_file)
    #    pid = int(pidfd.readline().strip())
    #    pidfd.close()
    #    os.kill(pid, 0)
    #except (IOError, OSError):
    #    pass # Assume all is well if the test raised errors
    #else:
    #    log.error("PID file exists and process %s is running!" % pid)
    #    sys.exit(1)

    try:
        pidfd = open(pid_file, 'w')
    except IOError as ex:
        # "as" form parses on Python 2.6+ and 3.x alike
        log.error("Failed to open PID file %s" % pid_file)
        log.error("Error: %s" % (ex,))
        sys.exit(1)
Example #2
0
def init(options):
    """Prepare to start up NagCat.

    In order: apply the process settings through util.setup, write the
    PID file when options.pidfile is set, initialize logging, parse
    the coil config file, load plugins and build the Nagcat object
    (NagcatSimple when options.test is set, NagcatNagios otherwise).

    An InitError or CoilError raised while building the Nagcat object
    is logged and the process exits with status 1.
    """

    # Set uid/gid/file_limit
    util.setup(options.user, options.group,
               options.file_limit,
               options.core_dumps)

    # Write out the pid to make the verify script happy
    if options.pidfile:
        util.write_pid(options.pidfile)

    log.init(options.logfile, options.loglevel)
    config = coil.parse_file(options.config, expand=False)

    init_plugins(options)

    try:
        if options.test:
            # Single test mode
            nagcat = simple.NagcatSimple(config,
                    rradir=options.rradir,
                    rrdcache=options.rrdcache,
                    monitor_port=options.status_port,
                    test_name=options.test,
                    host=options.host, port=options.port)
        else:
            # Regular mode driven by the Nagios configuration
            nagcat = nagios.NagcatNagios(config,
                    rradir=options.rradir,
                    rrdcache=options.rrdcache,
                    monitor_port=options.status_port,
                    nagios_cfg=options.nagios, tag=options.tag)
    except (errors.InitError, coil.errors.CoilError) as ex:
        # "as" form parses on Python 2.6+ and 3.x alike
        log.error(str(ex))
        sys.exit(1)
Example #3
0
 def _set_peer_id_and_timestamp(self):
     """Gets a peer_id and sets a timestamp for when it acquired the peer_id.

     The peer_id comes from merlin, and is obtained by reading a database,
     which Merlin outputs data to.

     Sets self._num_peers to the number of rows in merlin_peers with
     state=3.  For a row whose first column is "localhost", sets
     self._peer_id from the sixth column and self._peer_id_timestamp to
     the current time.  Database errors are logged, not raised.
     """
     try:
         db = MySQLdb.connect(
             user=self._merlin_db_info['merlin_db_user'],
             host=self._merlin_db_info['merlin_db_host'],
             passwd=self._merlin_db_info['merlin_db_pass'],
             db=self._merlin_db_info['merlin_db_name'])
         try:
             curs = db.cursor()
             num_rows = curs.execute(
                 """select * from merlin_peers where state=3;""")
             self._num_peers = num_rows
             log.debug("Setting self._num_peers = %s", self._num_peers)
             # NOTE(review): columns 0 and 5 are presumably the peer name
             # and peer id; "select *" makes this depend on the table
             # layout -- confirm against the merlin schema.
             for row in curs.fetchall():
                 if row[0] == "localhost":
                     self._peer_id = row[5]
                     self._peer_id_timestamp = time.time()
                     # One format string plus arguments; the previous code
                     # passed a concatenation of two tuples as the message.
                     log.debug("Setting self._peer_id = %s "
                         "and self._peer_id_timestamp = %s",
                         self._peer_id, self._peer_id_timestamp)
         finally:
             # Do not leave the connection open until garbage collection
             db.close()
     except MySQLdb.Error as e:
         # Not every MySQLdb error carries an (errno, message) pair
         if len(e.args) >= 2:
             log.error("Error reading merlin db %d: %s" % (e.args[0], e.args[1]))
         else:
             log.error("Error reading merlin db: %s" % (e,))
Example #4
0
 def _status(self, object_types=(), object_select=()):
     """Parse self._status_file with nagios_objects.ObjectParser.

     object_types, object_select -- passed straight to ObjectParser.

     Raises xmlrpc.Fault(1, ...) when the parser raises InitError; the
     underlying error is logged first.
     """
     try:
         stat = nagios_objects.ObjectParser(
                 self._status_file, object_types, object_select)
     except errors.InitError as ex:
         # "as" form parses on Python 2.6+ and 3.x alike
         log.error("Failed to parse Nagios status file: %s" % ex)
         raise xmlrpc.Fault(1, "Failed to read Nagios status")
Example #5
0
    def _cleanup_spool(self):
        """Periodically clean up old things in the spool dir.

        This shouldn't normally be required but if things get screwed
        up we don't want the directory to get so huge that it keeps
        things slow after nagios is handling results again.

        Every entry of self.spool_dir whose mtime is more than 5
        minutes old is unlinked.  Entries that cannot be stat'ed are
        skipped; failed removals are logged.
        """

        # Note: It is entirely possible that the command to submit
        # this file is still in the writer queue, if that's the case
        # nagios will also log an error when it gets around to
        # reading from the queue.

        # Set the threshold to 5 minutes ago, if nagios hasn't been
        # able to keep up for the past 5 minutes we have problems.
        threshold = time.time() - 300
        count = 0
        for item in os.listdir(self.spool_dir):
            path = "%s/%s" % (self.spool_dir, item)
            try:
                info = os.stat(path)
            except OSError:
                # Only filesystem errors are expected here; a bare
                # except would also swallow KeyboardInterrupt/SystemExit.
                continue
            if info.st_mtime < threshold:
                try:
                    os.unlink(path)
                except OSError as ex:
                    log.error("Failed to remove %s: %s" % (path, ex))
                else:
                    count += 1
Example #6
0
 def do_shutdown():
     """Cancel the pending timer and run one final forced do_cmd.

     Any error raised by do_cmd is logged and otherwise ignored.
     """
     calls['timer'].cancel()
     # This should not abort the daemon shutdown
     try:
         do_cmd(force=True)
     except:
         # Deliberate catch-all, see the comment above.
         # NOTE(review): assumes `failure` is twisted.python.failure, whose
         # class is spelled Failure; the lower-case spelling would raise
         # AttributeError here and abort the shutdown after all.
         fail = failure.Failure()
         log.error(fail)
Example #7
0
def write_pid(pid_file):
    """Write out the current PID.

    The visible part of this function opens pid_file for writing; if
    that fails the error is logged and the process exits with status 1.
    """

    try:
        pidfd = open(pid_file, 'w')
    except IOError as ex:
        # "as" form parses on Python 2.6+ and 3.x alike
        log.error("Failed to open PID file %s" % pid_file)
        log.error("Error: %s" % (ex,))
        sys.exit(1)
Example #8
0
def setup(user=None, group=None, file_limit=None, core_dumps=None):
    """Set the process's user, group, and file limits.

    file_limit -- when truthy, used as both the soft and the hard limit
        on open files.  Failure to apply it is logged and the process
        exits with status 1.
    user, group, core_dumps -- not referenced in the visible part of
        this function.
    """

    if file_limit:
        try:
            resource.setrlimit(resource.RLIMIT_NOFILE, (file_limit, file_limit))
        except (ValueError, resource.error) as ex:
            # setrlimit reports a failed system call as resource.error,
            # which previously escaped as an unhandled traceback.
            log.error("Failed to set limit on open files: %s" % ex)
            sys.exit(1)
Example #9
0
 def stop(result):
     """Stop the reactor and record the exit code for the given result.

     A Failure is logged and gives exit code 1, anything else gives 0.
     """
     reactor.stop()

     failed = isinstance(result, failure.Failure)
     if failed:
         # NotificationErrors are reported by message only, everything
         # else is logged as the full failure.
         if isinstance(result.value, NotificationError):
             message = str(result.value)
         else:
             message = str(result)
         log.error(message)

     exit_code[0] = int(failed)
Example #10
0
    def latency(self, last):
        """Record how late this once-per-second callback fired.

        last -- the time.time() value taken by the previous invocation.
        """
        now = time.time()
        # Re-arm for one second from now, handing over the current time
        self._latency_call = reactor.callLater(1.0, self.latency, now)

        lag = now - last - 1.0
        self._latency.append(lag)

        # Pick the log level by how late we were; small lags stay quiet
        if lag > 5.0:
            report = log.error
        elif lag > 1.5:
            report = log.warn
        else:
            report = None

        if report is not None:
            report("Callback latency: %s" % lag)
Example #11
0
    def schedule(self, runnable, delay=None):
        """Arrange for a top level runnable to be started (again).

        delay -- seconds to wait; defaults to runnable.repeat.  A falsy
            delay is logged as an error and nothing is scheduled.
        """
        wait = delay
        if wait is None:
            wait = runnable.repeat

        if wait:
            log.debug("Scheduling %s in %s seconds.", runnable, wait)

            def again(_result):
                # Whatever the outcome, queue the next run
                return self.schedule(runnable)

            pending = task.deferLater(reactor, wait, runnable.start)
            pending.addBoth(again)
        else:
            log.error("Task %s has no repeat value.", runnable)
Example #12
0
    def _done(self, result):
        """Record the outcome of a run and log unexpected failures"""

        log.debug("Stopping %s", self)
        log.debug("Result: %s", result)
        self.result = result
        self.lastrun = time.time()
        self.deferred = None

        if isinstance(result, failure.Failure):
            expected = isinstance(result.value, errors.TestError)
            if not expected:
                # Anything that is not a TestError is always reported
                trace = result.getTraceback()
                log.error("Unhandled error in %s:\n%s" % (self, trace))
            elif result.tb is not None:
                # A TestError is only worth a warning when it carries
                # a traceback
                trace = result.getTraceback()
                log.warn("TestError with a traceback in %s:\n%s" %
                        (self, trace))
Example #13
0
    def _failure_tcp(self, result):
        """Translate well-known TCP failures into TestError exceptions.

        A failure that matches none of the known types is returned
        untouched.
        """

        error = result.value

        # Checked in this order, the first match wins
        known = (
            (neterror.TimeoutError, "TCP handshake timeout"),
            (neterror.ConnectionRefusedError, "TCP connection refused"),
            (neterror.ConnectionLost, "TCP connection lost unexpectedly"),
        )
        for kind, message in known:
            if isinstance(error, kind):
                raise errors.TestCritical(message)

        if isinstance(error, neterror.ConnectError):
            if error.osError == errno.EMFILE:
                # Out of file descriptors: abort rather than go critical
                log.error("Too many open files! Restart with a new ulimit -n")
                raise errors.TestAbort("NAGCAT ERROR: %s" % error)
            raise errors.TestCritical("TCP error: %s" % error)

        return result
Example #14
0
def main():
    """Start up the monitor web site that exposes Nagios status.

    Parses the options, initializes logging, writes the PID file when
    options.pidfile is set and registers the "nagios" status resource.
    Unless options.read_only is set, the XML-RPC resource is added as
    "RPC2"; an InitError while creating it is logged and the process
    exits with status 1.
    """

    options = parse_options()
    log.init(options.logfile, options.loglevel)

    if options.pidfile:
        util.write_pid(options.pidfile)

    site = monitor_api.MonitorSite()
    site.root.putChild("nagios", nagios_api.NagiosStatus(options.nagios))

    if not options.read_only:
        try:
            rpc = nagios_api.NagiosXMLRPC(options.nagios)
        except errors.InitError as ex:
            # "as" form parses on Python 2.6+ and 3.x alike
            log.error(str(ex))
            sys.exit(1)
        site.root.putChild("RPC2", rpc)
Example #15
0
File: nagios.py Project: mv/nagcat
    def build_tests(self, templates, tag=None):
        """Setup tests based on the loaded Nagios config.

        templates -- mapping of test name to test config template.
        tag -- passed on to self._parse_tests.

        Raises errors.InitError when a referenced template is missing
        or when a test cannot be created from its config.
        """

        skels = self._parse_tests(tag)
        tests = []

        for test_defaults, test_overrides in skels:
            testconf = templates.get(test_overrides['test'], None)
            if testconf is None:
                raise errors.InitError(
                        "Test template '%s' not found in config!"
                        % test_overrides['test'])

            # Copy the config so we can add instance specific values
            # such as host, port, etc.
            testconf = testconf.copy()

            # Defaults only fill in keys the template does not set...
            for key, val in test_defaults.iteritems():
                testconf.setdefault(key, val)

            # ...while overrides always win.
            for key, val in test_overrides.iteritems():
                testconf[key] = val

            try:
                testobj = self.new_test(testconf)
            except (errors.InitError, CoilError) as ex:
                # "as" form parses on Python 2.6+ and 3.x alike
                raise errors.InitError(
                        "Error in test %s: %s" % (test_overrides['test'], ex))
            except Exception:
                # Unexpected error: log the details, then report it as
                # an InitError like any other bad test.
                log.error("Unknown error while loading test.")
                log.error("Test config: %s" % repr(testconf))
                log.error(str(errors.Failure()))
                raise errors.InitError(
                        "Error in test %s" % test_overrides['test'])
Example #16
0
def setup(user=None, group=None, file_limit=None, core_dumps=None):
    """Set the process's user, group, and file limits

    file_limit -- when truthy, used as both the soft and the hard limit
        on open files.
    group -- group name or string of digits; applied with os.setregid.
    user -- user name or string of digits; names are resolved through
        pwd.getpwnam.
    core_dumps -- not referenced in the visible part of this function.

    Every failure handled here is logged and followed by sys.exit(1).
    """

    if file_limit:
        try:
            # Same value for the soft and the hard limit
            resource.setrlimit(resource.RLIMIT_NOFILE, (file_limit, file_limit))
        except ValueError, ex:
            log.error("Failed to set limit on open files: %s" % ex)
            sys.exit(1)

    if group:
        # A group given as digits is taken as a numeric gid, anything
        # else is looked up by name.
        if not group.isdigit():
            try:
                # Index 2 of the group database entry is the gid
                group = grp.getgrnam(group)[2]
            except KeyError:
                log.error("Unknown group '%s'" % group)
                sys.exit(1)
        else:
            group = int(group)

        try:
            # Sets both the real and the effective gid
            os.setregid(group, group)
        except OSError, ex:
            log.error("Failed to set gid: %s" % ex)
            sys.exit(1)

    if user:
        if not user.isdigit():
            try:
                # Index 2 of the password database entry is the uid
                user = pwd.getpwnam(user)[2]
            except KeyError:
Example #17
0
        log.init_stdio()

    # Build the config by merging, in this order: the built-in
    # defaults, the method's own defaults (coil text or a plain
    # mapping) and finally the config file named on the command line.
    try:
        config = coil.parse(DEFAULT_CONFIG)
        if method.defaults:
            if isinstance(method.defaults, str):
                config.merge(coil.parse(method.defaults))
            else:
                config.merge(coil.struct.Struct(method.defaults))
        if options.config:
            config.merge(coil.parse_file(options.config))
    except coil.errors.CoilError, ex:
        log.error("Error parsing config: %s" % ex)
        sys.exit(1)
    except IOError, ex:
        log.error("Error reading config file: %s" % ex)
        sys.exit(1)

    # With the dump option, print the merged config and stop
    if options.dump:
        print str(config)
        sys.exit(0)

    # The macros are built from the process environment; an empty
    # result is treated as a fatal error.
    macros = Macros(os.environ)
    if not macros:
        log.error("No Nagios environment variables found.")
        sys.exit(1)

    if options.host:
        event_type = "host"
    elif options.service:
        event_type = "service"
Example #18
0
    def _report(self, result):
        """Generate a report of the final result, pass that report off
        to all registered report callbacks. (ie nagios reporting)

        result -- the final value of the test, or a failure.Failure.

        Returns the report dict.  Its 'text' entry is TEMPLATE_OK or
        TEMPLATE_BAD filled in with the other entries.  Callbacks are
        only invoked while the reactor is running; a callback that
        raises is logged and does not stop the remaining callbacks.
        """

        def indent(string, prefix="    "):
            """Prefix each non-blank line, newline-terminate every line"""
            lines = []
            for line in string.splitlines():
                if line.strip():
                    line = prefix+line
                lines.append(line+'\n')
            return "".join(lines)

        # Choose what to report at the main result
        if isinstance(result, failure.Failure):
            if isinstance(result.value, ChildError):
                # A child failed, find the worst failure
                level = -1
                failed = None

                for subtest in self._subtests.itervalues():
                    if isinstance(subtest.result, failure.Failure):
                        if isinstance(subtest.result.value, errors.TestError):
                            # UNKNOWN beats CRITICAL
                            # CRITICAL beats WARNING
                            # but an OK assertion beats everything.
                            if (subtest.result.value.index > level or
                                    subtest.result.value.state == "OK"):
                                level = subtest.result.value.index
                                failed = subtest.result
                        else:
                            # Unknown error, just use it
                            failed = subtest.result
                            break

                assert failed is not None
            else:
                failed = result

            # Only the project's own Failure type carries a result
            if (isinstance(failed, errors.Failure) and
                    failed.result is not errors.NO_RESULT):
                output = failed.result
            else:
                output = ""

            if isinstance(failed.value, errors.TestError):
                state = failed.value.state
            else:
                state = "UNKNOWN"

            state = self._apply_time_limit(state)
            error = str(failed.value)
            summary = error
        else:
            output = result
            state = "OK"
            error = ""
            summary = result

        # Grab the first 40 characters of the first line
        if summary:
            summary = summary.split('\n', 1)[0][:40]
            if state == "OK" and self.label:
                summary = "%s %s" % (summary, self.label)

        # Fill in the Extra Output area and all valid values
        extra = ""
        results = {}
        for subname, subtest in self._subtests.iteritems():
            subextra = ""
            for savedname, savedval in subtest.saved.iteritems():
                subextra += "    %s:\n" % savedname
                subextra += indent(str(savedval), " "*8)

            if isinstance(subtest.result, failure.Failure):
                results[subname] = ""

                if (isinstance(subtest.result, errors.Failure) and
                        subtest.result.result is not errors.NO_RESULT):
                    subout = str(subtest.result.result)
                else:
                    subout = ""

                if isinstance(subtest.result.value, errors.TestError):
                    suberr = str(subtest.result.value)
                else:
                    suberr = str(subtest.result)
            else:
                results[subname] = subtest.result
                subout = str(subtest.result)
                suberr = ""

            # Skip what merely repeats the main output/error
            if subout and subout != output:
                subextra += "    Output:\n"
                subextra += indent(subout, " "*8)

            if suberr and suberr != error:
                subextra += "    Error:\n"
                subextra += indent(suberr, " "*8)

            if subextra:
                extra += indent("%s:\n%s" % (subname, subextra))

        assert state in STATES

        if state == "OK":
            text = TEMPLATE_OK
        else:
            text = TEMPLATE_BAD

        report = {
                'test': self._test,
                'state': state,
                'state_id': STATES.index(state),
                'summary': summary,
                'output': output,
                'error': error,
                'extra': extra,
                'host': self.host,
                'addr': self.addr,
                'port': self._port,
                'time': self._now,
                'documentation': self._documentation,
                'investigation': self._investigation,
                'priority': self._priority,
                'url': self._url,
                'results': results,
                }

        text = text % report
        report['text'] = text

        # Don't fire callbacks (which write out to stuff) during shutdown
        if reactor.running:
            for (func, args, kwargs) in self._report_callbacks:
                try:
                    func(report, *args, **kwargs)
                except Exception:
                    # One broken callback must not keep the others from
                    # running; SystemExit and KeyboardInterrupt are no
                    # longer swallowed.
                    log.error("Report callback failed: %s" % failure.Failure())

        return report
Example #19
-1
def main():
    """Entry point: parse options, optionally daemonize, load the config.

    When options.daemonize is set (and options.dump is not) the process
    detaches with the usual double fork before the config is read.  A
    CoilError while building the config is logged and the process exits
    with status 1.
    """
    options, method = parse_options()

    log.init(options.logfile, options.loglevel)

    if not options.dump and options.daemonize:
        # First fork: the parent exits, the child carries on
        if os.fork() > 0:
            os._exit(0)
        os.chdir("/")
        # Start a new session, then fork once more
        os.setsid()
        if os.fork() > 0:
            os._exit(0)
        log.init_stdio()

    # Merge, in this order: the built-in defaults, the method's own
    # defaults (coil text or a plain mapping) and the config file
    # named on the command line.
    try:
        config = coil.parse(DEFAULT_CONFIG)
        if method.defaults:
            if isinstance(method.defaults, str):
                config.merge(coil.parse(method.defaults))
            else:
                config.merge(coil.struct.Struct(method.defaults))
        if options.config:
            config.merge(coil.parse_file(options.config))
    except coil.errors.CoilError as ex:
        # "as" form parses on Python 2.6+ and 3.x alike
        log.error("Error parsing config: %s" % ex)
        sys.exit(1)