def doCollection(driver):
    self.log.debug("doCollection(): starting collection cycle")
    reactor.callLater(self.options.dataCollectInterval, self.runCollection)
    if not self.options.cycle:
        self.stop()
    if self.running:
        self.log.error("last appengine collection is still running")
        return
    self.running = True
    jobs = NJobs(200, self.collectAppEngine, self.datasourceMap.keys())
    yield jobs.start()
    driver.next()
    self.log.debug("doCollection(): exiting collection cycle")
    self.sendEvents(
        self.rrdStats.gauge('instances',
                            self.options.dataCollectInterval,
                            len(self.datasourceMap)) +
        self.rrdStats.counter('dataPoints',
                              self.options.dataCollectInterval,
                              self.rrd.dataPoints) +
        self.rrdStats.gauge('cyclePoints',
                            self.options.dataCollectInterval,
                            self.rrd.endCycle())
    )
def _collectCallback(self):
    jobs = NJobs(self._preferences.options.parallel,
                 self._collectJMX,
                 self._taskConfig.jmxDataSourceConfigs.values())
    deferred = jobs.start()
    return deferred
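# Sketch (not part of the collector above): the deferred returned by
# NJobs.start() fires with a list that mixes per-job results and
# twisted.python.failure.Failure objects, which is how the discovery snippets
# below filter good and bad IPs. `parallel`, `collectOne` and `configs` are
# hypothetical placeholders.
from twisted.python.failure import Failure

def _splitResults(results):
    """Separate successful job results from Failures."""
    good = [r for r in results if not isinstance(r, Failure)]
    bad = [r for r in results if isinstance(r, Failure)]
    return good, bad

# deferred = NJobs(parallel, collectOne, configs).start()
# deferred.addCallback(_splitResults)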
def doPeriodic(driver):
    """
    Generator function to create deferred jobs.
    """
    yield self.getDevicePingIssues()
    self.downDevices = Set([d[0] for d in driver.next()])

    self.scanning = NJobs(self.parallelJobs,
                          self.oneDevice,
                          self.devices().values())
    yield self.scanning.start()
    driver.next()
def discoverDevices(self, ips, devicepath="/Discovered", prodState=1000):
    """
    Discover devices by active ips that are not associated with a device.

    @param ips: list of IP addresses
    @type ips: list of strings
    @param devicepath: where in the DMD to put any discovered devices
    @type devicepath: string
    @param prodState: production state (see Admin Guide for a description)
    @type prodState: integer
    @return: Twisted/Zenoss Python iterable
    @rtype: Python iterable
    """
    def discoverDevice(ip):
        """
        Discover a particular device
        NB: Wrapper around self.discoverDevice()

        @param ip: IP address
        @type ip: string
        @return: Twisted/Zenoss Python iterable
        @rtype: Python iterable
        """
        return self.discoverDevice(ip, devicepath, prodState)

    return NJobs(self.options.parallel, discoverDevice, ips).start()
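# Illustrative only: kicking off discoverDevices() standalone and logging how
# many jobs completed. `zendisc` is a hypothetical instance of the class that
# defines discoverDevices(); the IP list is made up.
def _report(results):
    # results aggregates one entry per IP once every NJobs job has finished
    print "discovery finished, %d results" % len(results)
    return results

# d = zendisc.discoverDevices(["10.0.0.5", "10.0.0.6"])
# d.addCallback(_report)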
def discoverRanges(self, driver):
    """
    Ping all IPs in the range and create devices for the ones that come back.

    @param ranges: list of ranges to discover
    @type ranges: list
    """
    if isinstance(self.options.range, basestring):
        self.options.range = [self.options.range]

    # in case someone uses 10.0.0.0-5,192.168.0.1-5 instead of
    # --range 10.0.0.0-5 --range 192.168.0.1-5
    if (isinstance(self.options.range, list) and
            self.options.range[0].find(",") > -1):
        self.options.range = [
            n.strip() for n in self.options.range[0].split(',')
        ]

    ips = []
    goodCount = 0
    for iprange in self.options.range:
        # Parse to find ips included
        ips.extend(parse_iprange(iprange))

    yield NJobs(self.options.chunkSize, self.ping, ips).start()
    results = driver.next()

    goodips = [v.ipaddr for v in results if not isinstance(v, Failure)]
    badips = [v.value.ipaddr for v in results if isinstance(v, Failure)]
    goodCount += len(goodips)
    self.log.debug("Got %d good IPs and %d bad IPs",
                   len(goodips), len(badips))

    yield self.discoverDevices(goodips)
    yield succeed("Discovered %d active IPs" % goodCount)
    driver.next()
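# Worked example of the comma-handling branch above (opt_range stands in for
# self.options.range; parse_iprange is assumed to accept "a.b.c.d-e" style
# strings, as the loop implies). A single "--range 10.0.0.0-5,192.168.0.1-5"
# option is normalised into the same list two separate --range flags produce:
opt_range = ["10.0.0.0-5,192.168.0.1-5"]
if isinstance(opt_range, list) and opt_range[0].find(",") > -1:
    opt_range = [n.strip() for n in opt_range[0].split(',')]
# opt_range is now ['10.0.0.0-5', '192.168.0.1-5']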
def inner(driver):
    """
    Twisted driver class to iterate through devices

    @param driver: Zenoss driver
    @type driver: Zenoss driver
    @return: successful result is a list of IPs that were added
    @rtype: Twisted deferred
    """
    ips = []
    goodCount = 0
    # it would be nice to interleave ping/discover
    for net in nets:
        if self.options.subnets and len(net.children()) > 0:
            continue
        if not getattr(net, "zAutoDiscover", False):
            self.log.info(
                "Skipping network %s because zAutoDiscover is False"
                % net.getNetworkName())
            continue
        self.log.info("Discover network '%s'", net.getNetworkName())

        yield NJobs(self.options.chunkSize,
                    self.ping,
                    net.fullIpList()).start()
        results = driver.next()

        goodips = [
            v.ipaddr for v in results if not isinstance(v, Failure)
        ]
        badips = [
            v.value.ipaddr for v in results if isinstance(v, Failure)
        ]
        goodCount += len(goodips)
        self.log.debug("Got %d good IPs and %d bad IPs",
                       len(goodips), len(badips))

        yield self.config().callRemote('pingStatus',
                                       net,
                                       goodips,
                                       badips,
                                       self.options.resetPtr,
                                       self.options.addInactive)
        ips += driver.next()
        self.log.info("Discovered %s active ips", goodCount)

    # make sure this is the return result for the driver
    yield succeed(ips)
    driver.next()
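# Hedged sketch of the generator/driver idiom used by inner() above and by
# doPeriodic() elsewhere in these snippets. In Zenoss this is provided by
# drive() from Products.ZenUtils.Driver; this toy version is an illustration
# of the semantics only, under that assumption: each `yield someDeferred`
# suspends the generator until the deferred fires, and driver.next() then
# returns that deferred's result.
from twisted.internet import defer

def toyDrive(genFunc):
    class _Driver(object):
        result = None
        def next(self):
            return self.result

    driver = _Driver()
    gen = genFunc(driver)
    done = defer.Deferred()

    def step(result=None):
        driver.result = result
        try:
            d = gen.next()          # run the generator up to its next yield
        except StopIteration:
            done.callback(driver.result)
            return result
        d.addCallbacks(step, done.errback)
        return result

    step()
    return done

# usage: toyDrive(inner) returns a deferred that fires with the final `ips`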
class zensunprocess(SnmpDaemon):
    """
    Daemon class to connect to an SNMP agent and determine the processes
    that are running on that server.
    """
    statusEvent = {'eventClass': Status_OSProcess,
                   'eventGroup': 'Process'}
    initialServices = SnmpDaemon.initialServices + [
        'ZenPacks.community.SunMibMonitor.services.SunMibProcessConfig']
    processConfigInterval = 20 * 60
    processCycleInterval = 5 * 60
    properties = SnmpDaemon.properties + ('processCycleInterval',)
    missing = 0
    restarted = 0
    parallelJobs = DEFAULT_PARALLEL_JOBS

    def __init__(self, noopts=False):
        SnmpDaemon.__init__(self, 'zensunprocess', noopts)
        self._devices = {}
        self.scanning = None
        self.downDevices = Set()

    def devices(self):
        """
        Return the list of devices that are available

        @return: device list
        @rtype: dictionary of device name, device object
        """
        return dict([(k, v) for k, v in self._devices.items()
                     if k not in self.downDevices])

    def fetchConfig(self):
        """
        Get configuration values from zenhub

        @return: Twisted deferred
        @rtype: Twisted deferred
        """
        def doFetchConfig(driver):
            now = time.time()

            yield self.model().callRemote('getDefaultRRDCreateCommand')
            createCommand = driver.next()

            yield self.model().callRemote('getZenProcessParallelJobs')
            self.parallelJobs = int(driver.next())

            yield self.model().callRemote('propertyItems')
            self.setPropertyItems(driver.next())

            self.rrd = RRDUtil(createCommand, self.processCycleInterval)

            yield self.model().callRemote('getThresholdClasses')
            self.remote_updateThresholdClasses(driver.next())

            yield self.model().callRemote('getCollectorThresholds')
            self.rrdStats.config(self.options.monitor,
                                 self.name,
                                 driver.next(),
                                 createCommand)

            devices = []
            if self.options.device:
                devices = [self.options.device]
            yield self.model().callRemote('getSunMibProcessConf', devices)
            driver.next()

            self.sendEvents(
                self.rrdStats.gauge('configTime',
                                    self.processConfigInterval,
                                    time.time() - now)
            )

        return drive(doFetchConfig)

    def remote_deleteDevice(self, doomed):
        """
        Called from zenhub to remove a device from our configuration

        @parameter doomed: device to delete
        @type doomed: string
        """
        self.log.debug("zenhub asks us to delete device %s" % doomed)
        if doomed in self._devices:
            del self._devices[doomed]
        self.clearSnmpError(doomed,
                            "Device %s removed from SNMP collection" % doomed)

    def remote_updateDeviceList(self, devices):
        """
        Called from zenhub to update the devices to monitor

        @parameter devices: devices to monitor
        @type devices: list of (device, changetime) tuples
        """
        self.log.debug("Received updated device list from zenhub %s" % devices)
        doomed = Set(self._devices.keys())
        updated = []
        for device, lastChange in devices:
            # Ignore updates for devices if we've only asked for one device
            if self.options.device and \
               device != self.options.device:
                self.log.debug("Ignoring update for %s as we only want %s",
                               device, self.options.device)
                continue
            cfg = self._devices.get(device, None)
            if not cfg or self._devices[device].lastChange < lastChange:
                updated.append(device)
            doomed.discard(device)
        if updated:
            log.info("Fetching the config for %s", updated)
            d = self.model().callRemote('getSunMibProcessConf', devices)
            d.addCallback(self.updateDevices, updated)
            d.addErrback(self.error)
        if doomed:
            log.info("Removing %s", doomed)
            for device in doomed:
                del self._devices[device]
                self.clearSnmpError(device, "device %s removed" % device)

    def clearSnmpError(self, name, message):
        """
        Send an event to clear other events.

        @parameter name: device for which the event applies
        @type name: string
        @parameter message: clear text
        @type message: string
        """
        if name in self._devices:
            if self._devices[name].snmpStatus > 0:
                self._devices[name].snmpStatus = 0
                self.sendEvent(self.statusEvent,
                               eventClass=Status_Snmp,
                               component="process",
                               device=name,
                               summary=message,
                               agent='zensunprocess',
                               severity=Event.Clear)

    def remote_updateDevice(self, cfg):
        """
        Twisted remote callback, to allow zenhub to remotely update
        this daemon.

        @parameter cfg: configuration information returned from zenhub
        @type cfg: object
        """
        self.log.debug("Configuration update from zenhub for %s", cfg.name)
        self.updateDevices([cfg], [])

    def updateDevices(self, cfgs, fetched):
        """
        Called when the zenhub service getSnmpStatus completes.

        @parameter cfgs: configuration information returned from zenhub
        @type cfgs: list of objects
        @parameter fetched: names we want zenhub to return information about
        @type fetched: list of strings
        """
        received = Set()
        log.debug("Fetched configs from Zenhub using string %s" % fetched)
        log.debug("Configs: %s" % cfgs)
        for cfg in cfgs:
            received.add(cfg.name)
            d = self._devices.setdefault(cfg.name, cfg)
            d.updateConfig(cfg)
            self.thresholds.updateForDevice(cfg.name, cfg.thresholds)
        for doomed in Set(fetched) - received:
            if doomed in self._devices:
                del self._devices[doomed]

    def start(self, driver):
        """
        Read the basic config needed to do anything, and to reread
        the configuration information on a periodic basis.
        """
        log.debug("Fetching configuration from zenhub")
        devices = self._devices.keys()
        yield self.fetchConfig()
        self.updateDevices(driver.next(), devices)

        yield self.model().callRemote('getSnmpStatus', self.options.device)
        self.updateSnmpStatus(driver.next())

        yield self.model().callRemote('getProcessStatus', self.options.device)
        self.updateProcessStatus(driver.next())

        driveLater(self.configCycleInterval * 60, self.start)

    def updateSnmpStatus(self, updates):
        """
        Called when the zenhub service getSnmpStatus completes.

        @parameter updates: List of names and error counts
        @type updates: list of (string, int)
        """
        for name, count in updates:
            d = self._devices.get(name)
            if d:
                d.snmpStatus = count

    def updateProcessStatus(self, status):
        """
        Called when the zenhub service getProcessStatus completes.

        @parameter status: List of names, component names and error counts
        @type status: list of (string, string, int)
        """
        down = {}
        for device, component, count in status:
            down[(device, component)] = count
        for name, device in self._devices.items():
            for p in device.processes.values():
                p.status = down.get((name, p.originalName), 0)

    def oneDevice(self, device):
        """
        Contact one device and return a deferred which gathers data from
        the device.

        @parameter device: proxy object to the remote computer
        @type device: Device object
        @return: job to scan a device
        @rtype: Twisted deferred object
        """
        def go(driver):
            """
            Generator object to gather information from a device.
            """
            try:
                device.open()
                yield self.scanDevice(device)
                driver.next()

                # Only fetch performance data if status data was found.
                if device.snmpStatus == 0:
                    yield self.fetchPerf(device)
                    driver.next()
                else:
                    log.warn("Failed to find performance data for %s",
                             device.name)
            except:
                log.debug('Failed to scan device %s' % device.name)

        def close(res):
            """
            Twisted closeBack and errBack function which closes any
            open connections.
            """
            try:
                device.close()
            except:
                log.debug("Failed to close device %s" % device.name)

        d = drive(go)
        d.addBoth(close)
        return d

    def scanDevice(self, device):
        """
        Fetch all the process info for a device using SNMP table gets

        @parameter device: proxy connection object
        @type device: Device object
        @return: Twisted deferred
        @rtype: Twisted deferred
        """
        device.lastScan = time.time()
        tables = [NAMETABLE]
        d = device.getTables(tables)
        d.addCallback(self.storeProcessNames, device)
        d.addErrback(self.deviceFailure, device)
        return d

    def deviceFailure(self, reason, device):
        """
        Twisted errBack to log the exception for a single device.

        @parameter reason: explanation of the failure
        @type reason: Twisted error instance
        @parameter device: proxy connection object
        @type device: Device object
        """
        self.sendEvent(self.statusEvent,
                       eventClass=Status_Snmp,
                       component="process",
                       device=device.name,
                       summary='Unable to read processes on device %s'
                               % device.name,
                       severity=Event.Error)
        device.snmpStatus += 1
        if isinstance(reason.value, error.TimeoutError):
            self.log.debug('Timeout on device %s' % device.name)
        else:
            self.logError('Error on device %s' % device.name, reason.value)

    def mapResultsToDicts(self, results):
        """
        Parse the process tables and reconstruct the list of processes
        that are on the device.

        @parameter results: results of SNMP table gets ie (OID + pid, value)
        @type results: dictionary of dictionaries
        @return: maps relating names and pids to each other
        @rtype: dictionary, list of tuples
        """
        def extract(dictionary, oid, value):
            """
            Helper function to extract SNMP table data.
            """
            pid = int(oid.split('.')[-1])
            dictionary[pid] = value

        names = {}
        if self.options.showrawtables:
            log.info("NAMETABLE = %r", results[NAMETABLE])
        for row in results[NAMETABLE].items():
            extract(names, *row)

        procs = []
        for pid, name in names.items():
            # path is never populated here (only NAMETABLE is polled), so the
            # name from NAMETABLE is used as-is.
            path = ''
            if path and path.find('\\') == -1:
                name = path
            procs.append((pid, (name, '')))

        return names, procs

    def showProcessList(self, device_name, procs):
        """
        Display the processes in a sane manner.

        @parameter device_name: name of the device
        @type device_name: string
        @parameter procs: list of (pid, (name))
        @type procs: list of tuples
        """
        proc_list = ['%s %s' % (pid, name) for pid, name
                     in sorted(procs)]
        proc_list.append('')
        log.info("#===== Processes on %s:\n%s",
                 device_name, '\n'.join(proc_list))

    def storeProcessNames(self, results, device):
        """
        Parse the process tables and reconstruct the list of processes
        that are on the device.

        @parameter results: results of SNMP table gets
        @type results: dictionary of dictionaries
        @parameter device: proxy connection object
        @type device: Device object
        """
        if not results or not results[NAMETABLE]:
            summary = 'Device %s does not publish Sun MIB' % device.name
            resolution = "Verify with snmpwalk -v1 -c community %s %s" % (
                device.name, NAMETABLE)
            self.sendEvent(self.statusEvent,
                           device=device.name,
                           summary=summary,
                           resolution=resolution,
                           severity=Event.Error)
            log.info(summary)
            return

        if device.snmpStatus > 0:
            summary = 'Process table up for device %s' % device.name
            self.clearSnmpError(device.name, summary)

        names, procs = self.mapResultsToDicts(results)
        if self.options.showprocs:
            self.showProcessList(device.name, procs)

        # look for changes in processes
        before = Set(device.pids.keys())
        after = {}
        for p in device.processes.values():
            for pid, name in procs:
                #log.debug('Checking process %s with %s is %s'%(name,pid,p.match(name[0])))
                if p.match(name[0]):
                    log.debug("Found process %d on %s" % (pid, p.name))
                    after[pid] = p
        afterSet = Set(after.keys())
        afterByConfig = reverseDict(after)
        new = afterSet - before
        dead = before - afterSet

        # report pid restarts
        restarted = {}
        for p in dead:
            config = device.pids[p]
            config.discardPid(p)
            if config in afterByConfig:
                self.restarted += 1
                if config.restart:
                    restarted[config] = True
                    summary = 'Process restarted: %s' % config.originalName
                    self.sendEvent(self.statusEvent,
                                   device=device.name,
                                   summary=summary,
                                   component=config.originalName,
                                   severity=config.severity)
                    log.info(summary)

        # report alive processes
        for config, pids in afterByConfig.items():
            if config in restarted:
                continue
            summary = "Process up: %s" % config.originalName
            self.sendEvent(self.statusEvent,
                           device=device.name,
                           summary=summary,
                           component=config.originalName,
                           severity=Event.Clear)
            config.status = 0
            log.debug(summary)

        for p in new:
            log.debug("Found new %s pid %d on %s" % (
                after[p].originalName, p, device.name))

        device.pids = after

        # Look for missing processes
        for config in device.processes.values():
            if config not in afterByConfig:
                self.missing += 1
                config.status += 1
                summary = 'Process not running: %s' % config.originalName
                self.sendEvent(self.statusEvent,
                               device=device.name,
                               summary=summary,
                               component=config.originalName,
                               severity=config.severity)
                log.warning(summary)

        # Store per-device, per-process statistics
        pidCounts = dict([(p, 0) for p in device.processes])
        for pid, pidConfig in device.pids.items():
            pidCounts[pidConfig.name] += 1
        for name, count in pidCounts.items():
            self.save(device.name, name, 'count_count', count, 'GAUGE')

    def periodic(self, unused=None):
        """
        Main loop that drives all other processing.
        """
        reactor.callLater(self.processCycleInterval, self.periodic)

        if self.scanning:
            running, unstarted, finished = self.scanning.status()
            runningDevices = [d.name for d in self.devices().values()
                              if d.proxy is not None]
            if runningDevices or unstarted > 0:
                log.warning("Process scan not finishing: "
                            "%d running, %d waiting, %d finished" % (
                                running, unstarted, finished))
                log.warning("Problem devices: %r", runningDevices)
                return

        start = time.time()

        def doPeriodic(driver):
            """
            Generator function to create deferred jobs.
            """
            yield self.getDevicePingIssues()
            self.downDevices = Set([d[0] for d in driver.next()])

            self.scanning = NJobs(self.parallelJobs,
                                  self.oneDevice,
                                  self.devices().values())
            yield self.scanning.start()
            driver.next()

        def checkResults(results):
            """
            Process the results from all deferred objects.
            """
            for result in results:
                if isinstance(result, Exception):
                    log.error("Error scanning device: %s", result)
                    break
            self.cycleTime = time.time() - start
            self.heartbeat()

        drive(doPeriodic).addCallback(checkResults)

    def fetchPerf(self, device):
        """
        Get performance data for all the monitored processes on a device

        @parameter device: proxy object to the remote computer
        @type device: Device object
        """
        oids = []
        for pid, pidConf in device.pids.items():
            oids.extend([CPU + str(pid), MEM + str(pid)])
        if not oids:
            return defer.succeed(([], device))

        d = Chain(device.get, iter(chunk(oids, device.maxOidsPerRequest))).run()
        d.addCallback(self.storePerfStats, device)
        d.addErrback(self.deviceFailure, device)
        return d

    def storePerfStats(self, results, device):
        """
        Save the process performance data in RRD files

        @parameter results: results of SNMP table gets
        @type results: list of (success, result) tuples
        @parameter device: proxy object to the remote computer
        @type device: Device object
        """
        for success, result in results:
            if not success:
                self.deviceFailure(result, device)
                return results

        self.clearSnmpError(device.name,
                            'Process table up for device %s' % device.name)
        parts = {}
        for success, values in results:
            if success:
                parts.update(values)
        results = parts

        byConf = reverseDict(device.pids)
        for pidConf, pids in byConf.items():
            if len(pids) != 1:
                log.info("There are %d pids by the name %s",
                         len(pids), pidConf.name)
            pidName = pidConf.name
            for pid in pids:
                cpu = results.get(CPU + str(pid), None)
                mem = results.get(MEM + str(pid), None)
                pidConf.updateCpu(pid, cpu)
                pidConf.updateMemory(pid, mem)
            self.save(device.name, pidName, 'cpu_cpu',
                      pidConf.getCpu(), 'DERIVE', min=0)
            self.save(device.name, pidName, 'mem_mem',
                      pidConf.getMemory() * 1024, 'GAUGE')

    def save(self, deviceName, pidName, statName, value, rrdType,
             min='U', max='U'):
        """
        Save a value into an RRD file

        @param deviceName: name of the remote device (ie a hostname)
        @type deviceName: string
        @param pidName: process id of the monitored process
        @type pidName: string
        @param statName: metric name
        @type statName: string
        @param value: data to be stored
        @type value: number
        @param rrdType: RRD data type (eg ABSOLUTE, DERIVE, COUNTER)
        @type rrdType: string
        @param min: minimum value acceptable for this metric
        @type min: number
        @param max: maximum value acceptable for this metric
        @type max: number
        """
        path = 'Devices/%s/os/processes/%s/%s' % (deviceName, pidName,
                                                  statName)
        try:
            value = self.rrd.save(path, value, rrdType, min=min, max=max)
        except Exception, ex:
            summary = "Unable to save data for process-monitor RRD %s" % path
            self.log.critical(summary)

            message = "Data was value= %s, type=%s, min=%s, max=%s" % (
                value, rrdType, min, max)
            self.log.critical(message)
            self.log.exception(ex)

            import traceback
            trace_info = traceback.format_exc()

            evid = self.sendEvent(dict(
                dedupid="%s|%s" % (self.options.monitor, 'RRD write failure'),
                severity=Critical,
                device=self.options.monitor,
                eventClass=Status_Perf,
                component="RRD",
                pidName=pidName,
                statName=statName,
                path=path,
                message=message,
                traceback=trace_info,
                summary=summary))

            # Skip thresholds
            return

        for ev in self.thresholds.check(path, time.time(), value):
            self.sendThresholdEvent(**ev)
self.log.exception("Error performing net discovery on %s", ex) def discoverDevice(ip): """ Discover a particular device NB: Wrapper around self.discoverDevice() @param ip: IP address @type ip: string @return: Twisted/Zenoss Python iterable @rtype: Python iterable """ return self.discoverDevice(ip, self.options.deviceclass, self.options.productionState) yield NJobs(self.options.parallel, discoverDevice, devices).start() yield succeed("Discovered %d devices" % count) driver.next() def printResults(self, results): """ Display the results that we've obtained @param results: what we've discovered @type results: string """ if isinstance(results, Failure): if results.type is NoIPAddress: self.log.error("Error: %s", results.value) else: self.log.error("Error: %s", results)