def _collectCallback(self):
        """
        Launch the JMX collection jobs and hand back the result of starting
        them.

        One job is created per value in the task's jmxDataSourceConfigs and
        each job calls self._collectJMX.  The 'parallel' option is passed as
        the first NJobs argument (presumably the concurrency limit -- confirm
        against NJobs).

        @return: whatever NJobs.start() returns (presumably a Twisted
                 deferred -- confirm against NJobs)
        """
        maxParallel = self._preferences.options.parallel
        collector = self._collectJMX
        dataSourceConfigs = self._taskConfig.jmxDataSourceConfigs.values()
        return NJobs(maxParallel, collector, dataSourceConfigs).start()
 def doCollection(driver):
     """
     Generator that runs one collection cycle.

     Schedules the next run first, stops the daemon when not cycling,
     refuses to overlap with a cycle that is still flagged as running,
     then collects every datasource and publishes cycle statistics.

     @parameter driver: object whose next() yields the result of the
                        previously yielded job
     """
     self.log.debug("doCollection(): starting collection cycle")
     # The next cycle is scheduled up front, before any work is done.
     reactor.callLater(self.options.dataCollectInterval, self.runCollection)
     if not self.options.cycle:
         self.stop()
     if self.running:
         self.log.error("last appengine collection is still running")
         return
     self.running = True

     collectionJobs = NJobs(200, self.collectAppEngine,
                            self.datasourceMap.keys())
     yield collectionJobs.start()
     driver.next()
     self.log.debug("doCollection(): exiting collection cycle")

     # Build the three statistics event lists in order, then send them
     # in a single call.
     instanceEvents = self.rrdStats.gauge('instances',
                                          self.options.dataCollectInterval,
                                          len(self.datasourceMap))
     dataPointEvents = self.rrdStats.counter('dataPoints',
                                             self.options.dataCollectInterval,
                                             self.rrd.dataPoints)
     cyclePointEvents = self.rrdStats.gauge('cyclePoints',
                                            self.options.dataCollectInterval,
                                            self.rrd.endCycle())
     self.sendEvents(instanceEvents + dataPointEvents + cyclePointEvents)
Beispiel #3
0
    def _collectCallback(self):
        """
        Create one JMX collection job per configured data source, start
        them, and return the result of starting them.

        The 'parallel' option is handed to NJobs as its first argument
        (presumably the number of simultaneous jobs -- confirm against
        NJobs).

        @return: the value returned by NJobs.start()
        """
        options = self._preferences.options
        jmxJobs = NJobs(options.parallel,
                        self._collectJMX,
                        self._taskConfig.jmxDataSourceConfigs.values())
        return jmxJobs.start()
Beispiel #4
0
 def doCollection(driver):
     """
     Generator that performs a single collection cycle.

     The next cycle is scheduled before anything else; the daemon is
     stopped when the 'cycle' option is off; a cycle that finds the
     running flag already set logs an error and exits immediately.

     @parameter driver: object whose next() yields the result of the
                        previously yielded job
     """
     self.log.debug("doCollection(): starting collection cycle")
     reactor.callLater(self.options.dataCollectInterval,
                       self.runCollection)
     if not self.options.cycle:
         self.stop()
     if self.running:
         self.log.error("last appengine collection is still running")
         return
     self.running = True

     datasourceKeys = self.datasourceMap.keys()
     yield NJobs(200, self.collectAppEngine, datasourceKeys).start()
     driver.next()
     self.log.debug("doCollection(): exiting collection cycle")

     # Gather the statistics events one at a time (same order as they are
     # concatenated) and publish them together.
     statEvents = self.rrdStats.gauge('instances',
                                      self.options.dataCollectInterval,
                                      len(self.datasourceMap))
     statEvents = statEvents + self.rrdStats.counter(
         'dataPoints', self.options.dataCollectInterval,
         self.rrd.dataPoints)
     statEvents = statEvents + self.rrdStats.gauge(
         'cyclePoints', self.options.dataCollectInterval,
         self.rrd.endCycle())
     self.sendEvents(statEvents)
Beispiel #5
0
class zensunprocess(SnmpDaemon):
    """
    Daemon class to connect to an SNMP agent and determine the processes
    that are running on that server.
    """
    # Base fields passed as the first argument of sendEvent() for
    # process status events.
    statusEvent = { 'eventClass' : Status_OSProcess,
                    'eventGroup' : 'Process' }
    # Extends the services inherited from SnmpDaemon with the Sun MIB
    # process configuration service.
    initialServices = SnmpDaemon.initialServices + ['ZenPacks.community.SunMibMonitor.services.SunMibProcessConfig']
    # Interval reported with the 'configTime' statistic in fetchConfig().
    processConfigInterval = 20*60
    # Delay between periodic() runs; also handed to RRDUtil in
    # fetchConfig().
    processCycleInterval = 5*60
    properties = SnmpDaemon.properties + ('processCycleInterval',)
    # Running totals incremented in storeProcessNames().
    missing = 0
    restarted = 0
    # Default job count for NJobs in periodic(); replaced in fetchConfig()
    # by the value fetched via 'getZenProcessParallelJobs'.
    parallelJobs = DEFAULT_PARALLEL_JOBS

    def __init__(self, noopts=False):
        """
        Initialise the base daemon under the name 'zensunprocess' and
        start with no known devices, no scan in progress and no devices
        marked as down.

        @parameter noopts: passed straight through to SnmpDaemon.__init__
        """
        SnmpDaemon.__init__(self, 'zensunprocess', noopts)
        # Names of devices excluded by devices(); refreshed in periodic().
        self.downDevices = Set()
        # Holds the NJobs object of the current/last scan, if any.
        self.scanning = None
        # Device name -> device configuration object.
        self._devices = {}

    def devices(self):
        """
        Return the known devices that are not currently listed in
        self.downDevices.

        @return: a new dictionary; the internal device table is not modified
        @rtype: dictionary of device name, device object
        """
        available = {}
        for name, cfg in self._devices.items():
            if name in self.downDevices:
                continue
            available[name] = cfg
        return available

    def fetchConfig(self):
        """
        Get configuration values from zenhub

        Runs a sequence of remote calls one after another: RRD create
        command, parallel job count, property items, threshold classes,
        collector thresholds and finally the Sun MIB process configuration.
        Afterwards a 'configTime' statistic with the elapsed time is sent.

        @return: result of drive() on the fetch generator
        @rtype: Twisted deferred
        """
        def doFetchConfig(driver):
            # Each 'yield' hands a remote call to the driver; the following
            # driver.next() retrieves that call's result.
            now = time.time()

            yield self.model().callRemote('getDefaultRRDCreateCommand')
            createCommand = driver.next()

            yield self.model().callRemote('getZenProcessParallelJobs')
            self.parallelJobs = int(driver.next())

            yield self.model().callRemote('propertyItems')
            self.setPropertyItems(driver.next())

            # Created after the property items are applied, so a
            # processCycleInterval delivered through them is presumably
            # picked up here -- TODO confirm setPropertyItems behaviour.
            self.rrd = RRDUtil(createCommand, self.processCycleInterval)

            yield self.model().callRemote('getThresholdClasses')
            self.remote_updateThresholdClasses(driver.next())

            yield self.model().callRemote('getCollectorThresholds')
            self.rrdStats.config(self.options.monitor,
                                 self.name,
                                 driver.next(),
                                 createCommand)

            # An empty list is sent unless a single device was requested
            # through the 'device' option.
            devices = []
            if self.options.device:
                devices = [self.options.device]
            yield self.model().callRemote('getSunMibProcessConf', devices)
            # The result is not used here.  NOTE(review): start() passes the
            # result of this method's deferred to updateDevices(), so the
            # driver presumably delivers this last result to the caller --
            # confirm against the drive() implementation.
            driver.next()
            self.sendEvents(
                self.rrdStats.gauge('configTime',
                                    self.processConfigInterval,
                                    time.time() - now)
                )

        return drive(doFetchConfig)

    def remote_deleteDevice(self, doomed):
        """
        Called from zenhub to remove a device from our configuration

        Any outstanding SNMP error for the device is cleared first and the
        device is then dropped from the device table.  Unknown device names
        are ignored.

        @parameter doomed: device to delete
        @type doomed: string
        """
        self.log.debug("zenhub asks us to delete device %s", doomed)
        # clearSnmpError() only acts on devices still present in
        # self._devices, so it has to run before the entry is removed.
        self.clearSnmpError(doomed,
                            "Device %s removed from SNMP collection" % doomed)
        if doomed in self._devices:
            del self._devices[doomed]

    def remote_updateDeviceList(self, devices):
        """
        Called from zenhub to update the devices to monitor

        Devices that are new, or whose change time is newer than the
        stored configuration, have their configuration re-fetched.  Known
        devices that are absent from the list are removed, after any
        outstanding SNMP error for them has been cleared.

        @parameter devices: devices to monitor
        @type devices: list of (device, changetime) tuples
        """
        self.log.debug("Received updated device list from zenhub %s", devices)
        # Start by assuming every known device is gone; names seen in the
        # update are discarded from this set below.
        doomed = Set(self._devices.keys())
        updated = []
        for device, lastChange in devices:
            # Ignore updates for devices if we've only asked for one device
            if self.options.device and \
               device != self.options.device:
                self.log.debug("Ignoring update for %s as we only want %s",
                               device, self.options.device)
                continue

            cfg = self._devices.get(device, None)
            if not cfg or cfg.lastChange < lastChange:
                updated.append(device)
            doomed.discard(device)

        if updated:
            log.info("Fetching the config for %s", updated)
            # NOTE(review): the full (device, changetime) list is sent to
            # the service rather than the 'updated' names -- confirm which
            # one the getSunMibProcessConf service expects.
            d = self.model().callRemote('getSunMibProcessConf', devices)
            d.addCallback(self.updateDevices, updated)
            d.addErrback(self.error)

        if doomed:
            log.info("Removing %s", doomed)
            for device in doomed:
                # clearSnmpError() ignores devices that are no longer in
                # self._devices, so clear before deleting.
                self.clearSnmpError(device, "device %s removed" % device)
                del self._devices[device]

    def clearSnmpError(self, name, message):
        """
        Reset a device's SNMP error count and send a clear event for it.

        Nothing happens when the device is unknown or when its snmpStatus
        is not greater than zero.

        @parameter name: device for which the event applies
        @type name: string
        @parameter message: clear text
        @type message: string
        """
        if name not in self._devices:
            return
        cfg = self._devices[name]
        if not cfg.snmpStatus > 0:
            return

        cfg.snmpStatus = 0
        clearFields = dict(eventClass=Status_Snmp,
                           component="process",
                           device=name,
                           summary=message,
                           agent='zensunprocess',
                           severity=Event.Clear)
        self.sendEvent(self.statusEvent, **clearFields)


    def remote_updateDevice(self, cfg):
        """
        Remote callback that lets zenhub push a single device
        configuration to this daemon.

        The configuration is applied through updateDevices() with an empty
        'fetched' list, so no other device is removed as a side effect.

        @parameter cfg: configuration information returned from zenhub
        @type cfg: object
        """
        self.log.debug("Configuration update from zenhub for %s", cfg.name)
        pushedConfigs = [cfg]
        nothingFetched = []
        self.updateDevices(pushedConfigs, nothingFetched)


    def updateDevices(self, cfgs, fetched):
        """
        Apply device configurations received from zenhub.

        Every received configuration is merged into the device table and
        its thresholds are registered.  Devices that were asked for in
        'fetched' but did not come back are removed from the device table.

        @parameter cfgs: configuration information returned from zenhub
        @type cfgs: list of objects
        @parameter fetched: names we want zenhub to return information about
        @type fetched: list of strings
        """
        received = Set()
        log.debug("Fetched configs from Zenhub using string %s", fetched)
        log.debug("Configs: %s", cfgs)
        for cfg in cfgs:
            received.add(cfg.name)
            # Keep an already known device object and update it in place;
            # a new device is stored as-is and updated from itself.
            d = self._devices.setdefault(cfg.name, cfg)
            d.updateConfig(cfg)
            self.thresholds.updateForDevice(cfg.name, cfg.thresholds)

        # Requested but not returned: the device no longer has a config.
        for doomed in Set(fetched) - received:
            if doomed in self._devices:
                del self._devices[doomed]

    def start(self, driver):
        """
        Read the basic config needed to do anything, and to reread
        the configuration information on a periodic basis.

        Generator: each 'yield' hands a deferred to the driver and the
        following driver.next() retrieves its result.

        @parameter driver: object whose next() returns the result of the
                           previously yielded deferred
        """
        log.debug("Fetching configuration from zenhub")
        # Snapshot of the currently known names; any of these missing from
        # the fetched configs is removed by updateDevices().
        devices = self._devices.keys()
        yield self.fetchConfig()
        self.updateDevices(driver.next(), devices)

        yield self.model().callRemote('getSnmpStatus', self.options.device)
        self.updateSnmpStatus(driver.next())

        yield self.model().callRemote('getProcessStatus', self.options.device)
        self.updateProcessStatus(driver.next())

        # Re-run this method later.  NOTE(review): configCycleInterval is
        # not defined in the visible part of the class and is multiplied by
        # 60 here, so it is presumably expressed in minutes -- confirm.
        driveLater(self.configCycleInterval * 60, self.start)


    def updateSnmpStatus(self, updates):
        """
        Record the SNMP error counts reported by zenhub's getSnmpStatus.

        Entries for devices that are not in the device table are skipped.

        @parameter updates: List of names and error counts
        @type updates: list of (string, int)
        """
        for name, count in updates:
            cfg = self._devices.get(name)
            if not cfg:
                continue
            cfg.snmpStatus = count


    def updateProcessStatus(self, status):
        """
        Record the process error counts reported by zenhub's
        getProcessStatus.

        Every process of every known device gets the reported count, or 0
        when zenhub reported nothing for it.

        @parameter status: List of names, component names and error counts
        @type status: list of (string, string, int)
        """
        # (device name, component name) -> error count
        downCounts = dict([((deviceName, component), count)
                           for deviceName, component, count in status])
        for deviceName, cfg in self._devices.items():
            for proc in cfg.processes.values():
                key = (deviceName, proc.originalName)
                proc.status = downCounts.get(key, 0)


    def oneDevice(self, device):
        """
        Contact one device and return a deferred which gathers data from
        the device.

        Failures while scanning or closing are logged at debug level
        (with traceback) and are not propagated to the caller.

        @parameter device: proxy object to the remote computer
        @type device: Device object
        @return: job to scan a device
        @rtype: Twisted deferred object
        """
        def go(driver):
            """
            Generator object to gather information from a device.
            """
            try:
                device.open()
                yield self.scanDevice(device)
                driver.next()

                # Only fetch performance data if status data was found.
                if device.snmpStatus == 0:
                    yield self.fetchPerf(device)
                    driver.next()
                else:
                    log.warn("Failed to find performance data for %s",
                             device.name)
            except Exception:
                # Best effort: one failing device must not abort the scan
                # of the others.  Interpreter-exit exceptions are no
                # longer swallowed, and the traceback is kept for
                # diagnosis.
                log.debug('Failed to scan device %s', device.name,
                          exc_info=True)

        def close(res):
            """
            Twisted closeBack and errBack function which closes any
            open connections.

            Returns None, so the deferred's result (or failure) is
            replaced by None once the connection has been closed.
            """
            try:
                device.close()
            except Exception:
                log.debug("Failed to close device %s", device.name,
                          exc_info=True)

        d = drive(go)
        d.addBoth(close)
        return d


    def scanDevice(self, device):
        """
        Request the process name table from a device and arrange for the
        reply to be stored, or for the failure to be reported.

        The device's lastScan timestamp is updated before the request is
        issued.

        @parameter device: proxy connection object
        @type device: Device object
        @return: Twisted deferred
        @rtype: Twisted deferred
        """
        device.lastScan = time.time()
        tableRequest = device.getTables([NAMETABLE])
        tableRequest.addCallback(self.storeProcessNames, device)
        tableRequest.addErrback(self.deviceFailure, device)
        return tableRequest


    def deviceFailure(self, reason, device):
        """
        Twisted errBack for a single device: send an error event, bump the
        device's SNMP error count and log the cause.

        Timeouts are logged at debug level only; any other error goes
        through logError().

        @parameter reason: explanation of the failure
        @type reason: Twisted error instance
        @parameter device: proxy connection object
        @type device: Device object
        """
        eventFields = dict(
            eventClass=Status_Snmp,
            component="process",
            device=device.name,
            summary='Unable to read processes on device %s' % device.name,
            severity=Event.Error)
        self.sendEvent(self.statusEvent, **eventFields)
        device.snmpStatus += 1

        timedOut = isinstance(reason.value, error.TimeoutError)
        if not timedOut:
            self.logError('Error on device %s' % device.name, reason.value)
        else:
            self.log.debug('Timeout on device %s' % device.name)

    def mapResultsToDicts(self, results):
        """
        Parse the process name table and reconstruct the list of processes
        that are on the device.

        The pid of each row is the last dotted component of its OID.

        @parameter results: results of SNMP table gets ie (OID + pid, value)
        @type results: dictionary of dictionaries
        @return: (names, procs) -- names maps pid to process name, procs
                 holds one (pid, (name, '')) entry per pid
        @rtype: dictionary, list of tuples
        """
        if self.options.showrawtables:
            log.info("NAMETABLE = %r", results[NAMETABLE])

        names = {}
        for oid, value in results[NAMETABLE].items():
            pid = int(oid.split('.')[-1])
            names[pid] = value

        # The second element of the inner tuple is always empty here;
        # callers match on the first element only.
        procs = [(pid, (name, '')) for pid, name in names.items()]

        return names, procs

    def showProcessList(self, device_name, procs):
        """
        Log the processes of a device, one per line, sorted.

        @parameter device_name: name of the device
        @type device_name: string
        @parameter procs: list of (pid, (name))
        @type procs: list of tuples
        """
        lines = []
        for pid, name in sorted(procs):
            lines.append('%s %s' % (pid, name))
        # Trailing empty entry so the joined text ends with a newline.
        lines.append('')
        listing = '\n'.join(lines)
        log.info("#===== Processes on %s:\n%s", device_name, listing)

    def storeProcessNames(self, results, device):
        """
        Parse the process tables and reconstruct the list of processes
        that are on the device.

        Compares the pids found now with the pids recorded on the device
        from the previous scan, sends restart / up / not-running events
        accordingly, replaces device.pids with the new mapping and saves a
        per-process pid count.

        @parameter results: results of SNMP table gets
        @type results: dictionary of dictionaries
        @parameter device: proxy connection object
        @type device: Device object
        """
        # No name table at all: report it and leave the device untouched.
        if not results or not results[NAMETABLE]:
            summary = 'Device %s does not publish Sun MIB' % device.name
            resolution="Verify with snmpwalk -v1 -c community %s %s" % (
                device.name, NAMETABLE )
            self.sendEvent(self.statusEvent,
                           device=device.name,
                           summary=summary,
                           resolution=resolution,
                           severity=Event.Error)
            log.info(summary)
            return
        # The table was readable, so clear any earlier SNMP error.
        if device.snmpStatus > 0:
            summary = 'Process table up for device %s' % device.name
            self.clearSnmpError(device.name, summary)

        names, procs = self.mapResultsToDicts(results)
        if self.options.showprocs:
            self.showProcessList(device.name, procs)

        # look for changes in processes
        before = Set(device.pids.keys())
        # pid -> process config; when several configs match the same pid,
        # the config iterated last wins.
        after = {}
        for p in device.processes.values():
            for pid, name in procs:
                # name is a (process name, '') tuple; only the first
                # element is matched.
                if p.match(name[0]):
                    log.debug("Found process %d on %s" % (pid, p.name))
                    after[pid] = p
        afterSet = Set(after.keys())
        # NOTE(review): used below as process config -> pids; confirm
        # against reverseDict.
        afterByConfig = reverseDict(after)
        new =  afterSet - before
        dead = before - afterSet

        # report pid restarts
        restarted = {}
        for p in dead:
            config = device.pids[p]
            config.discardPid(p)
            # An old pid vanished but the same config still has a pid now:
            # count it as a restart.
            if config in afterByConfig:
                self.restarted += 1
                if config.restart:
                    restarted[config] = True
                    summary = 'Process restarted: %s' % config.originalName
                    self.sendEvent(self.statusEvent,
                                   device=device.name,
                                   summary=summary,
                                   component=config.originalName,
                                   severity=config.severity)
                    log.info(summary)

        # report alive processes
        for config, pids in afterByConfig.items():
            # A restart event was just sent for this config; do not clear it.
            if config in restarted: continue
            summary = "Process up: %s" % config.originalName
            self.sendEvent(self.statusEvent,
                           device=device.name,
                           summary=summary,
                           component=config.originalName,
                           severity=Event.Clear)
            config.status = 0
            log.debug(summary)

        for p in new:
            log.debug("Found new %s pid %d on %s" % (
                after[p].originalName, p, device.name))
        device.pids = after

        # Look for missing processes
        for config in device.processes.values():
            if config not in afterByConfig:
                self.missing += 1
                config.status += 1
                summary = 'Process not running: %s' % config.originalName
                self.sendEvent(self.statusEvent,
                               device=device.name,
                               summary=summary,
                               component=config.originalName,
                               severity=config.severity)
                log.warning(summary)

        # Store per-device, per-process statistics
        # NOTE(review): counts are keyed by the keys of device.processes
        # and incremented by pidConfig.name, so those keys are presumably
        # the config names -- confirm.
        pidCounts = dict([(p, 0) for p in device.processes])
        for pids, pidConfig in device.pids.items():
            pidCounts[pidConfig.name] += 1
        for name, count in pidCounts.items():
            self.save(device.name, name, 'count_count', count, 'GAUGE')


    def periodic(self, unused=None):
        """
        Main loop that drives all other processing.

        Reschedules itself, skips this cycle if the previous scan still
        has running or unstarted work, and otherwise starts a new scan of
        all available devices.

        @parameter unused: ignored
        """
        # Schedule the next run first, so the cycle continues even when
        # this run returns early below.
        reactor.callLater(self.processCycleInterval, self.periodic)

        if self.scanning:
            running, unstarted, finished = self.scanning.status()
            # Devices whose proxy is still set are treated as still being
            # scanned.
            runningDevices = [ d.name for d in self.devices().values() \
                    if d.proxy is not None]

            if runningDevices or unstarted > 0:
                log.warning("Process scan not finishing: "
                    "%d running, %d waiting, %d finished" % (
                        running, unstarted, finished))
                log.warning("Problem devices: %r", runningDevices)
                return

        start = time.time()

        def doPeriodic(driver):
            """
            Generator function to create deferred jobs.
            """
            # The first element of each ping issue is used as the device
            # name; those devices are excluded by devices() below.
            yield self.getDevicePingIssues()
            self.downDevices = Set([d[0] for d in driver.next()])

            self.scanning = NJobs(self.parallelJobs,
                                  self.oneDevice,
                                  self.devices().values())
            yield self.scanning.start()
            driver.next()

        def checkResults(results):
            """
            Process the results from all deferred objects.
            """
            # Only the first exception found is logged.
            for result in results:
                if isinstance(result , Exception):
                    log.error("Error scanning device: %s", result)
                    break
            self.cycleTime = time.time() - start
            self.heartbeat()

        drive(doPeriodic).addCallback(checkResults)


    def fetchPerf(self, device):
        """
        Request the CPU and memory values for every pid recorded on the
        device.

        The OIDs are requested in chunks of device.maxOidsPerRequest.  With
        no pids on record, an already-fired deferred is returned instead.

        @parameter device: proxy object to the remote computer
        @type device: Device object
        @return: deferred for the request chain
        """
        oids = []
        for pid in device.pids.keys():
            oidSuffix = str(pid)
            oids.append(CPU + oidSuffix)
            oids.append(MEM + oidSuffix)

        if oids:
            oidChunks = iter(chunk(oids, device.maxOidsPerRequest))
            request = Chain(device.get, oidChunks).run()
            request.addCallback(self.storePerfStats, device)
            request.addErrback(self.deviceFailure, device)
            return request

        return defer.succeed(([], device))


    def storePerfStats(self, results, device):
        """
        Save the process performance data in RRD files

        If any chunk failed, the failure is reported through
        deviceFailure() and the untouched results are returned without
        saving anything.  Otherwise the chunks are merged and the CPU and
        memory values of every monitored process are saved.

        @parameter results: results of SNMP table gets
        @type results: list of (success, result) tuples
        @parameter device: proxy object to the remote computer
        @type device: Device object
        """
        for ok, outcome in results:
            if not ok:
                self.deviceFailure(outcome, device)
                return results

        upSummary = 'Process table up for device %s' % device.name
        self.clearSnmpError(device.name, upSummary)

        # Merge the per-chunk value dictionaries into one lookup table.
        merged = {}
        for ok, values in results:
            if ok:
                merged.update(values)

        for pidConf, pids in reverseDict(device.pids).items():
            pidCount = len(pids)
            if pidCount != 1:
                log.info("There are %d pids by the name %s",
                         pidCount, pidConf.name)
            pidName = pidConf.name
            for pid in pids:
                oidSuffix = str(pid)
                cpu = merged.get(CPU + oidSuffix, None)
                mem = merged.get(MEM + oidSuffix, None)
                pidConf.updateCpu(pid, cpu)
                pidConf.updateMemory(pid, mem)
            self.save(device.name, pidName, 'cpu_cpu', pidConf.getCpu(),
                      'DERIVE', min=0)
            scaledMemory = pidConf.getMemory() * 1024
            self.save(device.name, pidName, 'mem_mem', scaledMemory,
                      'GAUGE')


    def save(self, deviceName, pidName, statName, value, rrdType,
             min='U', max='U'):
        """
        Save a value into an RRD file

        @param deviceName: name of the remote device (ie a hostname)
        @type deviceName: string
        @param pidName: process id of the monitored process
        @type pidName: string
        @param statName: metric name
        @type statName: string
        @param value: data to be stored
        @type value: number
        @param rrdType: RRD data type (eg ABSOLUTE, DERIVE, COUNTER)
        @type rrdType: string
        @param min: minimum value acceptable for this metric
        @type min: number
        @param max: maximum value acceptable for this metric
        @type max: number
        """
        path = 'Devices/%s/os/processes/%s/%s' % (deviceName, pidName, statName)
        try:
            value = self.rrd.save(path, value, rrdType, min=min, max=max)

        except Exception, ex:
            summary= "Unable to save data for process-monitor RRD %s" % \
                              path
            self.log.critical( summary )

            message= "Data was value= %s, type=%s, min=%s, max=%s" % \
                     ( value, rrdType, min, max, )
            self.log.critical( message )
            self.log.exception( ex )

            import traceback
            trace_info= traceback.format_exc()

            evid= self.sendEvent(dict(
                dedupid="%s|%s" % (self.options.monitor, 'RRD write failure'),
                severity=Critical,
                device=self.options.monitor,
                eventClass=Status_Perf,
                component="RRD",
                pidName=pidName,
                statName=statName,
                path=path,
                message=message,
                traceback=trace_info,
                summary=summary))

            # Skip thresholds
            return

        for ev in self.thresholds.check(path, time.time(), value):
            self.sendThresholdEvent(**ev)