def _connectedEb(self, f):
    """
    Errback for a failed connection to the manager.

    A failure matching errors.ConnectionFailedError is reported through
    util.unknown and replaced by a failure wrapping util.NagiosUnknown; one
    matching errors.ConnectionRefusedError is reported through util.critical
    and replaced by a failure wrapping util.NagiosCritical.  The resulting
    failure (or the untouched original for any other error) is handed to
    the errback of self.managerDeferred.
    """
    # (error to look for, replacement exception, reporter, message), tried
    # in this order; each check sees the failure left by the previous one
    handlers = (
        (errors.ConnectionFailedError, util.NagiosUnknown, util.unknown,
         "Unable to connect to manager."),
        (errors.ConnectionRefusedError, util.NagiosCritical, util.critical,
         "Manager refused connection."),
    )
    for errorType, nagiosError, report, text in handlers:
        if not f.check(errorType):
            continue
        # switch the failure, then report the matching status
        f = failure.Failure(nagiosError(text))
        report(text)

    # all other failures get forwarded to the managerDeferred errback as-is
    self.managerDeferred.errback(f)
def do(self, args):
    """
    Check for components that run as more than one job under one worker.

    Processes returned by getProcesses(prefix='flumotion') whose command
    starts with 'flumotion-job' are grouped by (parent pid, component name).
    Any group holding more than one pid is reported.

    args is not used by this check.

    Returns the result of util.ok when no group has more than one pid,
    otherwise the result of util.critical with a summary of the groups.
    """
    processes = getProcesses(prefix='flumotion')

    # map of (worker pid, component name) -> list of job pids (as strings)
    components = {}
    for p in processes.values():
        if not p.cmd.startswith('flumotion-job'):
            continue
        # ignore workerPid 1, which is init - see orphaned for that
        if p.ppid == 1:
            continue
        # setdefault does a single hash lookup instead of scanning keys()
        components.setdefault((p.ppid, p.component), []).append(str(p.pid))

    # keep only the groups with more than one job running
    which = [(t, pids) for t, pids in components.items() if len(pids) > 1]
    if not which:
        return util.ok('No multiple component jobs running.')

    l = ['worker %d: component %s (%s)' % (
            workerPid, component, ", ".join(pids))
         for (workerPid, component), pids in which]
    return util.critical('%d multiple component job(s) running (%s).' % (
        len(which), ", ".join(l)))
def _detect_flipflops(self, component_state):
    """
    Start a FlipFlopDetector for component_state and hook up the result.

    The detector is built from self.timeout, self.flipflops, self.mood_a
    and self.mood_b.  util.ok is attached as callback, and an errback that
    passes the failure's error message to util.critical.

    Returns the value of addCallbacks() on the object returned by the
    detector's wait() (presumably a Twisted Deferred - TODO confirm).
    """
    detector = FlipFlopDetector(self.timeout, self.flipflops,
                                self.mood_a, self.mood_b, component_state)
    detector.start()

    def reportCritical(reason):
        # turn the failure into a critical result carrying its message
        return util.critical(reason.getErrorMessage())

    waiting = detector.wait()
    return waiting.addCallbacks(util.ok, reportCritical)
def do(self, args):
    """
    Report components that have more than one job under the same worker.

    Every process from getProcesses(prefix='flumotion') whose command starts
    with 'flumotion-job' is filed under the key (parent pid, component
    name); keys that collect more than one pid are reported.

    args is not used by this check.

    Returns the result of util.ok if no key collected more than one pid,
    otherwise the result of util.critical listing each such key with its
    pids.
    """
    processes = getProcesses(prefix='flumotion')

    # convert to a dict of (worker pid, component name) -> component pids
    components = {}
    for p in processes.values():
        if not p.cmd.startswith('flumotion-job'):
            continue
        # ignore workerPid 1, which is init - see orphaned for that
        if p.ppid == 1:
            continue
        key = (p.ppid, p.component)
        # one hashed lookup; no list scan over components.keys()
        components.setdefault(key, []).append(str(p.pid))

    # select the tuples with more than one component job running
    which = [(t, pids) for t, pids in components.items() if len(pids) > 1]
    if not which:
        return util.ok('No multiple component jobs running.')

    l = ['worker %d: component %s (%s)' % (
            workerPid, component, ", ".join(pids))
         for (workerPid, component), pids in which]
    return util.critical('%d multiple component job(s) running (%s).' % (
        len(which), ", ".join(l)))
def do(self, args):
    """
    Check whether any worker service is running more than once.

    args is not used by this check.

    Returns the result of util.ok when getMultiple() yields nothing,
    otherwise the result of util.critical listing what it returned.
    """
    # NOTE(review): 'flumotion-worke' looks deliberately cut short,
    # presumably because process names are truncated to 15 characters -
    # confirm before "fixing" the spelling
    which = getMultiple('flumotion-worke')
    if not which:
        return util.ok('no worker services running more than once')

    summary = ", ".join(which)
    return util.critical(
        '%d worker service(s) running more than once (%s)' % (
            len(which), summary))
def do(self, args):
    """
    Report worker services that getMultiple() finds running more than once.

    args is not used by this check.

    Returns the result of util.critical with the count and the joined
    entries when there are any, otherwise the result of util.ok.
    """
    # NOTE(review): the prefix is 'flumotion-worke', not '...-worker';
    # presumably it matches a process name truncated to 15 characters -
    # TODO confirm
    which = getMultiple('flumotion-worke')
    if not which:
        return util.ok('no worker services running more than once')

    count = len(which)
    names = ", ".join(which)
    return util.critical(
        '%d worker service(s) running more than once (%s)' % (count, names))
def do(self, args):
    """
    Check for job processes whose parent pid is 1.

    args is not used by this check.

    Returns the result of util.ok when no process from
    getProcesses(prefix='flumotion-job') has ppid 1, otherwise the result
    of util.critical listing the pids that do.
    """
    processes = getProcesses(prefix='flumotion-job')

    # collect the pids (as strings) of every job reparented to pid 1
    orphaned = []
    for pid, p in processes.items():
        if p.ppid == 1:
            orphaned.append(str(pid))

    if orphaned:
        return util.critical('%d orphaned job process(es) running (%s).' % (
            len(orphaned), ", ".join(orphaned)))
    return util.ok('No orphaned job processes running.')
def do(self, args):
    """
    Report 'flumotion-job' processes that have pid 1 as their parent.

    args is not used by this check.

    Returns the result of util.critical with the number and list of such
    pids, or the result of util.ok when there are none.
    """
    processes = getProcesses(prefix='flumotion-job')

    # pids, as strings, of the jobs whose parent pid is 1
    orphaned = []
    for pid, p in processes.items():
        if p.ppid != 1:
            continue
        orphaned.append(str(pid))

    if not orphaned:
        return util.ok('No orphaned job processes running.')

    pidList = ", ".join(orphaned)
    return util.critical('%d orphaned job process(es) running (%s).' % (
        len(orphaned), pidList))
def do(self, args):
    """
    Check the age of the last line in a log file matching a search string.

    args must contain exactly one log file name.  The last line matching
    self.options.string is extracted with grep and tail, its timestamp is
    read from fixed columns, and its age in seconds is compared against
    int(self.options.critical) and int(self.options.warning).

    Returns the result of util.unknown when the arguments are wrong, when
    nothing matches, when the line does not start with a known log level or
    when its timestamp cannot be parsed; otherwise the result of
    util.critical, util.warning or util.ok depending on the age.
    """
    if not args:
        return util.unknown('Please specify a log file to check.')
    if len(args) > 1:
        return util.unknown('Please specify only one log file to check.')

    # NOTE(review): the search string and file name are interpolated into a
    # shell command without quoting/escaping; only safe while both come
    # from a trusted operator - confirm
    command = "grep '%s' %s | tail -n 1" % (
        self.options.string, " ".join(args))
    self.debug('executing %s' % command)
    output = commands.getoutput(command)
    self.debug('output: %s' % output)

    if not output:
        return util.unknown('Could not find string %s in log file' %
            self.options.string)

    level = output[:5].strip()
    if level not in ['ERROR', 'WARN', 'INFO', 'DEBUG', 'LOG']:
        return util.unknown("Last line is not a log line: '%s'" % output)

    # matches flumotion.extern.log.log
    # level   pid     object   cat      time
    # 5 + 1 + 7 + 1 + 32 + 1 + 17 + 1 + 15 == 80
    position = 5 + 1 + 7 + 1 + 32 + 1 + 17 + 1

    # log timestrings are currently in local time, which might be a mistake
    timestring = output[position:position + 15]

    now = time.time()
    nowtuple = time.localtime(now)
    thisYear = nowtuple[0]

    # since the year does not get logged, assume the log line is from this
    # year.  The year is passed to strptime explicitly: without one it
    # defaults to 1900, which rejects "Feb 29".  If the date does not exist
    # in this year, try last year.
    timelist = None
    for year in (thisYear, thisYear - 1):
        try:
            parsed = time.strptime('%d %s' % (year, timestring),
                '%Y %b %d %H:%M:%S')
        except ValueError:
            continue
        timelist = list(parsed)
        break

    if timelist is None:
        # short or malformed line: report it instead of raising
        return util.unknown(
            "Could not parse timestamp '%s' in log line" % timestring)

    # ... or last year if the delta becomes negative
    if time.mktime(tuple(timelist)) > time.mktime(nowtuple):
        self.debug('timestamp is past now, so assume it is from last year')
        timelist[0] = thisYear - 1

    # mktime also works in local time, which hopefully matches the log's
    # local time
    timestamp = time.mktime(tuple(timelist))
    delta = now - int(timestamp)

    if self.options.string:
        qualifier = " with '%s'" % self.options.string
    else:
        qualifier = ''
    msg = 'Last log line%s is %s old.' % (
        qualifier, formatting.formatTime(delta, fractional=2))

    if delta > int(self.options.critical):
        return util.critical(msg)
    elif delta > int(self.options.warning):
        return util.warning(msg)
    else:
        return util.ok(msg)
def do(self, args):
    """
    Report how old the last matching line of a log file is.

    args must contain exactly one log file name.  grep and tail pick the
    last line containing self.options.string; the timestamp is cut from
    fixed columns of that line and its age in seconds is compared against
    int(self.options.critical) and int(self.options.warning).

    Returns the result of util.unknown for bad arguments, no matching line,
    a line without a known log level, or an unparseable timestamp;
    otherwise the result of util.critical, util.warning or util.ok.
    """
    if not args:
        return util.unknown('Please specify a log file to check.')
    if len(args) > 1:
        return util.unknown('Please specify only one log file to check.')

    # NOTE(review): neither the search string nor the file name is shell
    # quoted before being placed in the command line - acceptable only for
    # trusted input; confirm
    command = "grep '%s' %s | tail -n 1" % (self.options.string,
        " ".join(args))
    self.debug('executing %s' % command)
    output = commands.getoutput(command)
    self.debug('output: %s' % output)

    if not output:
        return util.unknown('Could not find string %s in log file' %
            self.options.string)

    level = output[:5].strip()
    if level not in ['ERROR', 'WARN', 'INFO', 'DEBUG', 'LOG']:
        return util.unknown("Last line is not a log line: '%s'" % output)

    # matches flumotion.extern.log.log
    # level   pid     object   cat      time
    # 5 + 1 + 7 + 1 + 32 + 1 + 17 + 1 + 15 == 80
    position = 5 + 1 + 7 + 1 + 32 + 1 + 17 + 1

    # log timestrings are currently in local time, which might be a mistake
    timestring = output[position:position + 15]

    now = time.time()
    nowtuple = time.localtime(now)
    thisYear = nowtuple[0]

    # the year does not get logged, so assume this year first.  strptime is
    # given the year explicitly because its default of 1900 makes "Feb 29"
    # unparseable; a date that does not exist this year is retried with
    # last year.
    timelist = None
    for year in (thisYear, thisYear - 1):
        try:
            parsed = time.strptime('%d %s' % (year, timestring),
                '%Y %b %d %H:%M:%S')
        except ValueError:
            continue
        timelist = list(parsed)
        break

    if timelist is None:
        # nothing parseable in the timestamp columns: report, don't raise
        return util.unknown(
            "Could not parse timestamp '%s' in log line" % timestring)

    # a timestamp in the future must really be from last year
    if time.mktime(tuple(timelist)) > time.mktime(nowtuple):
        self.debug('timestamp is past now, so assume it is from last year')
        timelist[0] = thisYear - 1

    # mktime also works in local time, which hopefully matches the log's
    # local time
    timestamp = time.mktime(tuple(timelist))
    delta = now - int(timestamp)

    if self.options.string:
        qualifier = " with '%s'" % self.options.string
    else:
        qualifier = ''
    msg = 'Last log line%s is %s old.' % (
        qualifier, formatting.formatTime(delta, fractional=2))

    if delta > int(self.options.critical):
        return util.critical(msg)
    elif delta > int(self.options.warning):
        return util.warning(msg)
    else:
        return util.ok(msg)
def gotPlanetStateCb(result):
    """
    Report the mood of the component named by self._component.

    The component is looked up in result with util.findComponent.  Its mood
    name is checked against self._critical first and then self._warning.

    Returns the result of util.unknown if the component is not found,
    otherwise the result of util.critical, util.warning or util.ok.
    """
    self.debug("gotPlanetStateCb")

    component = util.findComponent(result, self._component)
    if not component:
        return util.unknown("Could not find component %s" %
            self._component)

    # translate the stored mood value into its name
    mood = planet.moods.get(component.get("mood"))
    moodName = mood.name
    message = "Component %s is %s" % (self._component, moodName)

    if moodName in self._critical:
        return util.critical(message)
    if moodName in self._warning:
        return util.warning(message)
    return util.ok(message)
def gotPlanetStateCb(result):
    """
    Turn the mood of self._component into a check result.

    result is searched with util.findComponent; the component's mood name
    decides the outcome, with self._critical taking precedence over
    self._warning.

    Returns the result of util.unknown when the component cannot be found,
    otherwise the result of util.critical, util.warning or util.ok.
    """
    self.debug('gotPlanetStateCb')

    found = util.findComponent(result, self._component)
    if not found:
        return util.unknown('Could not find component %s' %
            self._component)

    # the state stores a mood value; look up the matching name
    moodName = planet.moods.get(found.get('mood')).name
    text = 'Component %s is %s' % (self._component, moodName)

    if moodName in self._critical:
        report = util.critical
    elif moodName in self._warning:
        report = util.warning
    else:
        report = util.ok
    return report(text)
def do(self, args):
    """
    Check the vsize of the processes matching self.prefix.

    The largest vsize is compared against the thresholds obtained from
    parseSize(self.options.critical) and parseSize(self.options.warning).

    args is not used by this check.

    Returns the result of util.ok when there are no processes or none
    reaches the warning threshold; otherwise the result of util.critical or
    util.warning, with the number of processes at or above that threshold.
    """
    processes = getProcesses(prefix=self.prefix)
    if not processes:
        return util.ok('No job processes running.')

    # (pid, vsize) pairs; sorted ascending on vsize and then reversed, so
    # the largest comes first
    ranked = sorted(
        [(process.pid, process.vsize) for process in processes.values()],
        key=lambda entry: entry[1])
    ranked.reverse()

    # check the one with the mostest
    pid, vsize = ranked[0]
    warning = parseSize(self.options.warning)
    critical = parseSize(self.options.critical)

    def countAtLeast(threshold):
        # number of processes whose vsize reaches the threshold
        return len([entry for entry in ranked if entry[1] >= threshold])

    if vsize >= critical:
        return util.critical(
            '%d %s(s) above critical level - highest is %d at %s' % (
                countAtLeast(critical), self.process_type, pid,
                formatting.formatStorage(vsize)))

    if vsize >= warning:
        return util.warning(
            '%d %s(s) above warning level - highest is %d at %s' % (
                countAtLeast(warning), self.process_type, pid,
                formatting.formatStorage(vsize)))

    return util.ok(
        'No %s processes above warning level '
        '(highest is %d at %s)' % (
            self.process_type, pid, formatting.formatStorage(vsize)))
def do(self, args):
    """
    Compare the largest process vsize against warning/critical levels.

    Processes come from getProcesses(prefix=self.prefix); the levels come
    from parseSize() applied to self.options.warning and
    self.options.critical.

    args is not used by this check.

    Returns the result of util.ok when no process is found or the largest
    vsize is below the warning level; otherwise the result of util.critical
    or util.warning, including how many processes reach that level.
    """
    processes = getProcesses(prefix=self.prefix)
    if not processes:
        return util.ok('No job processes running.')

    # list of (pid, vsize), ordered ascending on vsize and then flipped so
    # that index 0 holds the largest
    sizes = [(process.pid, process.vsize) for process in processes.values()]
    sizes.sort(key=lambda pair: pair[1])
    sizes = sizes[::-1]

    # check the one with the mostest
    pid, vsize = sizes[0]
    warning = parseSize(self.options.warning)
    critical = parseSize(self.options.critical)
    highest = formatting.formatStorage

    if vsize >= critical:
        # count number of critical jobs
        offenders = [pair for pair in sizes if pair[1] >= critical]
        return util.critical(
            '%d %s(s) above critical level - highest is %d at %s' % (
                len(offenders), self.process_type, pid, highest(vsize)))

    if vsize >= warning:
        # count number of warning jobs
        offenders = [pair for pair in sizes if pair[1] >= warning]
        return util.warning(
            '%d %s(s) above warning level - highest is %d at %s' % (
                len(offenders), self.process_type, pid, highest(vsize)))

    return util.ok('No %s processes above warning level '
        '(highest is %d at %s)' % (
            self.process_type, pid, highest(vsize)))
def failure(result):
    """
    Report result through util.critical, then stop the reactor.

    Nothing is returned explicitly.
    """
    message = 'Error: %s' % result
    util.critical(message)
    reactor.stop()
def noauthenticate(result):
    """
    Report result as an error via util.critical and stop the reactor.

    Nothing is returned explicitly.
    """
    text = 'Error: %s' % (result, )
    util.critical(text)
    reactor.stop()
def critical(self, message):
    """
    Report message through util.critical, adding the URL and dump location.

    The text is prefixed with self._url and suffixed with self._tmpfile.

    Returns whatever util.critical returns.
    """
    details = (self._url, message, self._tmpfile)
    return util.critical('%s: %s [dump at %s]' % details)
def _connectedEb(self, failure):
    """
    Errback for a failed connection to the manager.

    A failure matching errors.ConnectionFailedError is reported with
    util.unknown and one matching errors.ConnectionRefusedError with
    util.critical.  In every case the failure is then passed on, unchanged,
    to the errback of self.managerDeferred.
    """
    # (error to look for, reporter, message), checked in this order
    reporters = (
        (errors.ConnectionFailedError, util.unknown,
         "Unable to connect to manager."),
        (errors.ConnectionRefusedError, util.critical,
         "Manager refused connection."),
    )
    for errorType, report, text in reporters:
        if failure.check(errorType):
            report(text)

    self.managerDeferred.errback(failure)