def _send_report(self, report, host_name, service_description):
    log.debug("Submitting report for %s %s to Nagios",
              host_name, service_description)
    self._nagios_cmd.command(report['time'],
            'PROCESS_SERVICE_CHECK_RESULT',
            host_name, service_description,
            report['state_id'], report['text'])
def _set_peer_id_and_timestamp(self):
    """Get a peer_id and set a timestamp for when it was acquired.

    The peer_id comes from Merlin and is obtained by reading the
    database that Merlin outputs data to.
    """
    try:
        db = MySQLdb.connect(
                user=self._merlin_db_info['merlin_db_user'],
                host=self._merlin_db_info['merlin_db_host'],
                passwd=self._merlin_db_info['merlin_db_pass'],
                db=self._merlin_db_info['merlin_db_name'])
        curs = db.cursor()
        num_rows = curs.execute(
                """select * from merlin_peers where state=3;""")
        self._num_peers = num_rows
        log.debug("Setting self._num_peers = %s", self._num_peers)
        for i in range(num_rows):
            row = curs.fetchone()
            if row[0] == "localhost":
                self._peer_id = row[5]
                self._peer_id_timestamp = time.time()
                log.debug("Setting self._peer_id = %s "
                          "and self._peer_id_timestamp = %s",
                          self._peer_id, self._peer_id_timestamp)
    except MySQLdb.Error, e:
        log.error("Error reading merlin db %d: %s" % (e.args[0], e.args[1]))
def _apply_time_limit(self, state):
    if not self._warning_time_limit or state != "WARNING":
        return state

    status = self._nagcat.nagios_status()
    found = None
    for service in status['service']:
        if (service['service_description'] == self._description
                and service['host_name'] == self.host):
            found = service
            break

    if not found:
        return state

    if found['last_hard_state'] != '1': # WARNING
        return state

    limit = (int(found['last_hard_state_change'])
             + self._warning_time_limit)
    if self._now > limit:
        log.debug("Warning time limit of %s exceeded for %s",
                  self._warning_time_limit, self)
        state = "CRITICAL"

    return state
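# A minimal standalone sketch of the escalation rule above, using
# made-up numbers (the real values come from the Nagios status data):
# a service whose last hard state change was at t=1000 with a warning
# time limit of 300 seconds escalates once "now" passes t=1300.
def _example_apply_time_limit():
    last_hard_state_change = 1000
    warning_time_limit = 300
    now = 1400
    state = "WARNING"
    if now > last_hard_state_change + warning_time_limit:
        state = "CRITICAL"
    assert state == "CRITICAL"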
def daemonize(pid_file, cwd="/"):
    """Background the current process"""
    log.debug("daemonizing process")

    # BROKEN: the pid file may have already been created by write_pid
    # however, I'm not even using nagcat in daemon mode right now so
    # I'll just leave this commented out for now...
    # Also, this has a major race condition...
    #try:
    #    # A trivial check to see if we are already running
    #    pidfd = open(pid_file)
    #    pid = int(pidfd.readline().strip())
    #    pidfd.close()
    #    os.kill(pid, 0)
    #except (IOError, OSError):
    #    pass # Assume all is well if the test raised errors
    #else:
    #    log.error("PID file exists and process %s is running!" % pid)
    #    sys.exit(1)

    try:
        pidfd = open(pid_file, 'w')
    except IOError, ex:
        log.error("Failed to open PID file %s" % pid_file)
        log.error("Error: %s" % (ex,))
        sys.exit(1)
def start(self):
    """Decides whether or not to start the test, based on _should_run."""
    if self._should_run():
        log.debug("Running test %s", self)
        return super(MerlinTest, self).start()
    else:
        log.debug("Skipping start of %s", self)
        return defer.succeed(None)
def _start_dependencies(self):
    if self.__depends:
        log.debug("Starting dependencies for %s", self)
        deferlist = []
        for dep in self.__depends:
            deferlist.append(dep.start())
        return defer.DeferredList(deferlist)
    else:
        return defer.succeed(None)
def _filter_without_default(self, result):
    log.debug("Fetching cell %s,%s from table", self.row, self.col)

    try:
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(result)
        reader = csv.reader(StringIO(result), dialect)
        table = list(reader)
    except csv.Error, ex:
        raise errors.TestCritical("Failed to parse table: %s" % ex)
def filter(self, result): log.debug("Converting date using format '%s'", self.arguments) try: return str(time.mktime(time.strptime(result, self.arguments))) except ValueError: if self.default is not None: return self.default else: raise errors.TestCritical( "Failed to parse date with format '%s'" % self.arguments)
def _should_run(self):
    """Decides whether or not a test should be run, based on its task
    index and the scheduler's peer_id. Returns True if it should run,
    False if it should not."""
    peer_id, num_peers = self._nagcat.get_peer_id_num_peers()
    log.debug("Running should_run, test_index=%s, num_peers=%s, peer_id=%s",
              self._test_index, num_peers, peer_id)
    if peer_id and num_peers:
        if self._test_index % num_peers != peer_id:
            return False
    return True
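# A standalone sketch of the distribution rule above, with assumed
# values: given 3 peers, the peer with id 1 only runs the tests whose
# index is congruent to 1 modulo 3.
def _example_should_run():
    num_peers = 3
    peer_id = 1
    mine = [i for i in range(9) if i % num_peers == peer_id]
    assert mine == [1, 4, 7]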
def schedule(self, runnable, delay=None):
    """(re)schedule a top level runnable"""
    if delay is None:
        delay = runnable.repeat

    if not delay:
        log.error("Task %s has no repeat value.", runnable)
    else:
        log.debug("Scheduling %s in %s seconds.", runnable, delay)
        deferred = task.deferLater(reactor, delay, runnable.start)
        deferred.addBoth(lambda x: self.schedule(runnable))
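# A minimal sketch of the reschedule pattern used above: deferLater
# fires the callable after the delay, and the addBoth callback
# re-enters the scheduling function so the task repeats whether the
# previous run succeeded or failed. The tick() callable and the
# 60 second delay are illustrative only.
def _example_schedule_forever(delay=60):
    from twisted.internet import reactor, task
    def tick():
        log.debug("tick")
    d = task.deferLater(reactor, delay, tick)
    d.addBoth(lambda result: _example_schedule_forever(delay))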
def _update_peer_id(self):
    log.debug("Updating peer_id with _merlin_db_info=%s",
              self._merlin_db_info)
    if self._peer_id and self._peer_id_timestamp:
        if time.time() - self._peer_id_timestamp >= 60:
            # peer_id should be refreshed.
            self._set_peer_id_and_timestamp()
        else:
            # peer_id is still valid, return.
            return
    else:
        # We are missing peer_id or peer_id_timestamp...
        if self._merlin_db_info:
            self._set_peer_id_and_timestamp()
def filter(self, result):
    def format(data):
        if etree.iselement(data):
            ret = etree.tostring(data, pretty_print=True)
        else:
            ret = str(data)
        return ret.strip()

    log.debug("Fetching XML element %s", self.arguments)

    try:
        root = etree.fromstring(result)
    except etree.XMLSyntaxError, ex:
        raise errors.TestCritical("Invalid XML: %s" % ex)
def filter(self, result): log.debug("Matching regex '%s'", self.arguments) match = self.regex.search(result) if match: if match.groups(): return match.group(1) else: return match.group(0) elif self.default is not None: return self.default else: raise errors.TestCritical( "Failed to match regex '%s'" % self.arguments)
def _done(self, result):
    """Save the result, log unhandled errors"""
    log.debug("Stopping %s", self)
    log.debug("Result: %s", result)
    self.result = result
    self.lastrun = time.time()
    self.deferred = None

    if isinstance(result, failure.Failure):
        if isinstance(result.value, errors.TestError):
            if result.tb is not None:
                log.warn("TestError with a traceback in %s:\n%s"
                         % (self, result.getTraceback()))
        else:
            log.error("Unhandled error in %s:\n%s"
                      % (self, result.getTraceback()))
def start(self): """Start up the scheduler!""" assert self._startup and not self._shutdown self._startup = False self._shutdown = deferred = defer.Deferred() del self._group_index if not self._registered: self.stop() return deferred if self.monitor: reactor.listenTCP(self._monitor_port, self.monitor) self._log_stats() # Collect runnables that query the same host so that we can # avoid hitting a host with many queries at once host_groups = {} for runnable in self._registered: runnable.finalize() if runnable.host in host_groups: host_groups[runnable.host].append(runnable) else: host_groups[runnable.host] = [runnable] for host_name, host_group in host_groups.iteritems(): log.debug("Scheduling host %s", host_name) # The first runnable in the group will start between now and # the end of the slot time period. Any remaining runnables will # start after the number of seconds in the slot. This should # evenly distribute queries that are sent to the same host. slot = 60.0 / len(host_group) assert slot delay = random.random() * slot for runnable in host_group: self.schedule(runnable, delay) delay += slot # Start latency self-checker self._latency_call = reactor.callLater(1.0, self.latency, time.time()) log.info("Startup complete, running...") return deferred
def maybe_read(key, private=False):
    filetype = self.conf[key+'_type']
    path = self.conf[key]
    if not path:
        return None

    log.debug("Loading %s from %s", key, path)

    try:
        fd = open(path)
        try:
            data = fd.read()
        finally:
            fd.close()
    except IOError, ex:
        self.init_errors.append("Failed to read %s file %s: %s"
                                % (key, path, ex.strerror))
        return None
def _startProcess(self, command):
    command = [str(x) for x in command]
    log.debug("Running process: %s", command)

    proto = SubprocessProtocol()
    proto.factory = self

    # Setup timeout
    call_id = reactor.callLater(self.conf['timeout'], proto.timeout)
    self.deferred.addBoth(self._cancelTimeout, call_id)

    # Setup shutdown cleanup
    call_id = reactor.addSystemEventTrigger('after', 'shutdown',
                                            proto.timeout)
    self.deferred.addBoth(self._cancelCleanup, call_id)

    process.Process.__init__(self, reactor,
            command[0], command, self.conf['environment'],
            path=None, proto=proto)
def filter(self, result): log.debug("Grepping regex '%s'", self.arguments) output = "" for line in result.splitlines(True): if self.regex.search(line): if not self.invert: output += line else: if self.invert: output += line if output: return output elif self.default is not None: return self.default else: raise errors.TestCritical( "Failed to match regex '%s'" % self.arguments)
def start(self): """Start a Runnable object""" # Don't start again if we are already running if self.deferred is not None: return self.deferred # Reuse old results if our time isn't up yet elif self.lastrun + self.repeat.seconds > time.time(): log.debug("Skipping start of %s", self) return defer.succeed(None) else: # use deferred instead of self.deferred because # __done could have been called already self.deferred = deferred = self._start_dependencies() deferred.addBoth(lambda x: self._start_self()) deferred.addBoth(self._done) return deferred
def maybe_read(key, private=False):
    # Only support PEM for now
    filetype = crypto.FILETYPE_PEM
    path = self.conf[key]
    filetype = self.conf[key+'_type']
    if not path:
        return None

    log.debug("Loading %s from %s", key, path)

    try:
        fd = open(path)
        try:
            data = fd.read()
        finally:
            fd.close()
    except IOError, ex:
        raise errors.InitError("Failed to read %s file %s: %s"
                               % (key, path, ex.strerror))
def _computeReturn(self):
    if self._compound:
        data = {'NOW': util.MathString(self._now)}
        for name, subtest in self._subtests.iteritems():
            if isinstance(subtest.result, failure.Failure):
                raise ChildError()
            data[name] = util.MathString(subtest.result)

        log.debug("Evaluating return '%s' with data = %s",
                  self._return, data)
        result = str(eval(self._return, {'data': data}))
    else:
        subtest = self._subtests['query']
        if isinstance(subtest.result, failure.Failure):
            raise ChildError()
        else:
            result = subtest.result

    return result
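# A minimal sketch of the compound-return evaluation above: subtest
# results are exposed through a 'data' dict and the configured return
# expression is evaluated against it. The expression and values here
# are illustrative; the real code wraps values in util.MathString,
# presumably so string results can take part in arithmetic.
def _example_compute_return():
    data = {'a': 2, 'b': 3}
    expression = "data['a'] + data['b']"
    return str(eval(expression, {'data': data}))  # "5"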
def new_query(self, conf, qcls=None):
    """Create a new query and register it or return an existing one"""

    # Find the correct Query class for this type
    if not qcls:
        qtype = conf.get('type')
        qcls = plugin.search(IQuery, qtype, None)
        if not qcls:
            raise errors.ConfigError(conf,
                    "Unknown query type '%s'" % qtype)

    qobj = qcls(self._nagcat, conf)
    key = str(qobj)
    if key in self._queries:
        log.debug("Reusing query '%s'", key)
        qobj = self._queries[key]
        qobj.update(conf)
    else:
        log.debug("Adding query '%s'", key)
        self._queries[key] = qobj

    return qobj
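# Sketch of the registry pattern used above: keying on str(qobj)
# lets equivalent queries collapse into a single shared instance so
# the same check is never issued twice. The names here are
# hypothetical.
_example_registry = {}

def _example_intern(qobj):
    key = str(qobj)
    if key in _example_registry:
        return _example_registry[key]  # reuse the equivalent query
    _example_registry[key] = qobj
    return qobj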
def _parse_tests(self, tag):
    """Get the list of NagCat services in the object cache"""
    parser = nagios_objects.ObjectParser(
            self._nagios_obj, ('host', 'service'))
    hosts = {}
    tests = []

    for host in parser['host']:
        hosts[host['host_name']] = host

    for service in parser['service']:
        host = hosts[service['host_name']]
        if "_TEST" not in service:
            continue
        elif tag and service.get("_TAG", host.get('_TAG', None)) != tag:
            continue

        test_defaults = {
                'host': service['host_name'],
                'addr': host['address'],
                'description': service['service_description']}

        test_overrides = {}
        for key in service:
            if len(key) < 2 or key[0] != "_":
                continue

            # save all vars that start with '_'
            # coil is normally in lower case and Nagios is case insensitive
            test_overrides[key[1:].lower()] = service[key]

        log.debug("Found Nagios service: %s", test_defaults)
        log.debug("Service overrides: %s", test_overrides)
        tests.append((test_defaults, test_overrides))

    return tests
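# Sketch of the custom-variable handling above with an assumed
# service dict: keys starting with "_" become lowercased overrides,
# so a Nagios "_PORT 8080" custom variable would turn into the coil
# key "port".
def _example_overrides():
    service = {'host_name': 'web01', '_TEST': 'http', '_PORT': '8080'}
    overrides = {}
    for key in service:
        if len(key) < 2 or key[0] != "_":
            continue
        overrides[key[1:].lower()] = service[key]
    assert overrides == {'test': 'http', 'port': '8080'}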
def _start_self(self):
    log.debug("Starting %s", self)
    return task.deferLater(reactor, 0, self._start)