def doFetchConfig(driver):
    """
    Driver-style generator: each ``yield`` hands a remote zenhub call to
    the driver, and the following ``driver.next()`` returns that call's
    result.  Statement order is significant -- results are consumed in
    exactly the order the calls are yielded.
    """
    # Remember when the fetch started so we can report its duration below.
    now = time.time()
    yield self.model().callRemote('getDefaultRRDCreateCommand')
    createCommand = driver.next()
    yield self.model().callRemote('getZenProcessParallelJobs')
    self.parallelJobs = int(driver.next())
    yield self.model().callRemote('propertyItems')
    self.setPropertyItems(driver.next())
    # RRD writer stepped at the process-collection cycle interval.
    self.rrd = RRDUtil(createCommand, self.processCycleInterval)
    yield self.model().callRemote('getThresholdClasses')
    self.remote_updateThresholdClasses(driver.next())
    yield self.model().callRemote('getCollectorThresholds')
    self.rrdStats.config(self.options.monitor, self.name, driver.next(),
                         createCommand)
    # Restrict the config request to one device when --device was given.
    devices = []
    if self.options.device:
        devices = [self.options.device]
    yield self.model().callRemote('getSunMibProcessConf', devices)
    driver.next()
    # Record how long this configuration fetch took.
    self.sendEvents(
        self.rrdStats.gauge('configTime',
                            self.processConfigInterval,
                            time.time() - now)
    )
def _writeRRD(self, rrdCmd):
    """
    Store the collector's current total-power reading into the
    'totalPower' RRD file.

    rrd.save() parameters used below:
      - name:     RRD name (string), here "totalPower"
      - value:    data value to be stored (number)
      - rrd_type: RRD data type, e.g. GAUGE, COUNTER, DERIVE (string)

    On failure an error event is sent to Zenoss.

    @param rrdCmd: RRD create command used if the file must be created
    @type rrdCmd: string
    """
    try:
        # Local imports keep zenhub-side modules out of module scope.
        from Products.ZenModel.PerformanceConf import performancePath
        from Products.ZenRRD.RRDUtil import RRDUtil
        log.info('Writing into RRD File in %s... | Value: %r (%r)' % (
            performancePath('totalPower.rrd'),
            self._preferences.totalPower,
            type(self._preferences.totalPower)))
        # 1st param: RRD create command; 2nd param: step (seconds).
        rrd = RRDUtil(rrdCmd, 300)
        # This command will write to zenoss/perf/
        value2 = rrd.save("totalPower",
                          self._preferences.totalPower,
                          "GAUGE",
                          min=0,
                          max=None)
        log.info("Finished Writing. Return Value: %s" % (value2))
    # 'except Exception, ex' is invalid in Python 3; 'as' works in 2.6+.
    except Exception as ex:
        summary = "Unable to save data value into RRD file %s . (Exception: \'%s\')" % \
            ("totalPower.rrd", ex)
        log.error(summary)
        # Send Error Event to Zenoss.
        # NOTE(review): 'Error' is assumed to be a severity constant
        # imported elsewhere in this module -- confirm.
        self._eventService.sendEvent(dict(
            summary=summary,
            message='Error Test',
            component=self._preferences.collectorName,
            eventClass='/Perf/Snmp',
            device=None,
            severity=Error,
            agent=self._preferences.collectorName))
def afterSetUp(self):
    """Create a fake test device and record paths for tearDown()."""
    super(TestRRDImpl, self).afterSetUp()
    # The class name doubles as the fake device's name.
    device_name = str(self.__class__.__name__)
    self.name = device_name
    self.createFakeDevice(device_name)
    self.zem = self.dmd.ZenEventManager
    # We're not connected to zenhub, so getPerformanceServer() always
    # returns None here and the fallback create command is used.
    server = self.dev.getPerformanceServer()
    if server:
        self.defrrdcmd = server.getDefaultRRDCreateCommand()
    else:
        # We will always use this :(
        self.defrrdcmd = 'RRA:AVERAGE:0.5:1:600\nRRA:AVERAGE:0.5:6:600\nRRA:AVERAGE:0.5:24:600\nRRA:AVERAGE:0.5:288:600\nRRA:MAX:0.5:6:600\nRRA:MAX:0.5:24:600\nRRA:MAX:0.5:288:600'
    # Default RRD create command, 60-second cycle interval.
    util = RRDUtil(self.defrrdcmd, 60)
    # Remember these so tearDown() can clean up after us.
    self.perfpath = util.performancePath("tests")
    self.dev.rrdPath = lambda: "tests"
def inner(driver):
    """
    Driver-style generator executed by drive(): each ``yield`` issues a
    remote zenhub call and the following ``driver.next()`` returns its
    result, so statement order is significant.
    """
    self.log.debug("fetchConfig(): Fetching config from zenhub")
    yield self.model().callRemote('getDefaultRRDCreateCommand')
    createCommand = driver.next()
    yield self.model().callRemote('propertyItems')
    self.setPropertyItems(driver.next())
    # RRD writer using the collector-wide default create command.
    self.rrd = RRDUtil(createCommand, DEFAULT_HEARTBEAT_TIME)
    yield self.model().callRemote('getThresholdClasses')
    self.remote_updateThresholdClasses(driver.next())
    yield self.model().callRemote('getCollectorThresholds')
    self.rrdStats.config(self.options.monitor, self.name, driver.next(),
                         createCommand)
    # Optional --device / --instance filters narrow the config request.
    devices = self.getDevices()
    instances = self.getInstances()
    yield self.model().callRemote('getDeviceConfigs', instances, devices)
    configs = driver.next()
    self.log.debug('Fetched %i configs' % len(configs))
    if len(configs) == 0:
        self.log.info("fetchConfig(): No configs returned from zenhub")
    else:
        # configs maps instance -> list of datasource configs.
        for instance in configs.keys():
            deviceConfigs = configs[instance]
            self.updateConfig(instance, deviceConfigs)
    self.log.debug("fetchConfig(): Done fetching config from zenhub")
def afterSetUp(self):
    """Build a fake device and remember the paths tearDown() must remove."""
    super(TestRRDImpl, self).afterSetUp()
    dev_name = str(self.__class__.__name__)
    self.name = dev_name
    self.createFakeDevice(dev_name)
    self.zem = self.dmd.ZenEventManager
    # There is no zenhub connection in tests: getPerformanceServer()
    # always returns None, so the hard-coded fallback command is used.
    perf_server = self.dev.getPerformanceServer()
    if perf_server:
        self.defrrdcmd = perf_server.getDefaultRRDCreateCommand()
    else:
        self.defrrdcmd = 'RRA:AVERAGE:0.5:1:600\nRRA:AVERAGE:0.5:6:600\nRRA:AVERAGE:0.5:24:600\nRRA:AVERAGE:0.5:288:600\nRRA:MAX:0.5:6:600\nRRA:MAX:0.5:24:600\nRRA:MAX:0.5:288:600'
    # Default RRD create command with a 60-second cycle interval.
    writer = RRDUtil(self.defrrdcmd, 60)
    # Stash cleanup info for the tearDown() script.
    self.perfpath = writer.performancePath("tests")
    self.dev.rrdPath = lambda: "tests"
def testGoodSave(self):
    """
    Sanity check to make sure that RRD stores work
    """
    util = RRDUtil(self.createcmd, 60)
    # Unique file name so the create path is exercised.
    target = os.path.join(self.path, "%f" % random())
    # First save creates the file; the second updates it.
    for _ in range(2):
        self.assertEquals(util.save(target, 666.0, 'COUNTER'), None)
def testGoodSave(self):
    """
    Sanity check to make sure that RRD stores work
    """
    rrd_util = RRDUtil(self.createcmd, 60)
    # Fresh file so both creation and update code paths run.
    fname = os.path.join(self.path, "%f" % random())
    first = rrd_util.save(fname, 666.0, 'COUNTER')
    second = rrd_util.save(fname, 666.0, 'COUNTER')
    self.assertEquals(first, None)
    self.assertEquals(second, None)
def testNotWritableRRD(self):
    """
    Can't write to a file.

    Forces every performance path to "/" (unwritable for non-root) and
    checks that save() raises.
    """
    # Verify that we're not root first -- root can write anywhere.
    if os.geteuid() == 0:
        # print() call works in both Python 2 and 3 (the original
        # print statement is a Py3 syntax error).
        print("Can't run testNotWritableRRD check if running as root")
        return
    rrd = RRDUtil(self.createcmd, 60)
    # Tuple-parameter 'lambda(x)' is invalid in Python 3; plain
    # single-argument lambda is equivalent here.
    rrd.performancePath = lambda x: "/"
    self.assertRaises(Exception, rrd.save, "/", 666.0, 'COUNTER')
def testNotWritableRRD(self):
    """
    Can't write to a file.

    Redirects all performance paths to "/" and verifies save() raises
    for a non-root user.
    """
    # Verify that we're not root first...
    if os.geteuid() == 0:
        # print() form is valid in Python 2 and 3; the original print
        # statement breaks under Python 3.
        print("Can't run testNotWritableRRD check if running as root")
        return
    rrd = RRDUtil(self.createcmd, 60)
    # 'lambda (x):' uses tuple unpacking, removed in Python 3.
    rrd.performancePath = lambda x: "/"
    self.assertRaises(Exception, rrd.save, "/", 666.0, 'COUNTER')
def testBadDefaultCreateCmd(self):
    """
    Bad default command
    """
    rrd = RRDUtil('', 60)
    # A fresh path is needed each time: once a file exists, the create
    # command is no longer consulted.
    target = os.path.join(self.path, "%f" % random())
    self.assertRaises(Exception, rrd.save, target, 666.0, 'COUNTER')
    target = os.path.join(self.path, "%f" % random())
    self.assertRaises(Exception, rrd.save, target, 666.0, 'COUNTER',
                      rrdCommand='')
    # A valid per-call create command overrides the bad default.
    target = os.path.join(self.path, "%f" % random())
    self.assertEquals(
        rrd.save(target, 666.0, 'COUNTER', rrdCommand=self.createcmd),
        None)
def testBadMinmax(self):
    """
    Illegal values for min, max
    """
    rrd = RRDUtil(self.createcmd, 60)
    # Each odd "min" value should be tolerated: save() still returns None.
    for bad_min in (-100, None, 'U', []):
        target = os.path.join(self.path, "%f" % random())
        self.assertEquals(
            rrd.save(target, 666.0, 'COUNTER', min=bad_min), None)
def inner(driver):
    """
    Driver-style generator executed by drive(): each ``yield`` issues a
    remote zenhub call and the following ``driver.next()`` returns its
    result, so statement order is significant.
    """
    self.log.debug("fetchConfig(): Fetching config from zenhub")
    yield self.model().callRemote('getDefaultRRDCreateCommand')
    createCommand = driver.next()
    yield self.model().callRemote('propertyItems')
    self.setPropertyItems(driver.next())
    # RRD writer using the collector-wide default create command.
    self.rrd = RRDUtil(createCommand, DEFAULT_HEARTBEAT_TIME)
    yield self.model().callRemote('getThresholdClasses')
    self.remote_updateThresholdClasses(driver.next())
    yield self.model().callRemote('getCollectorThresholds')
    self.rrdStats.config(self.options.monitor, self.name, driver.next(),
                         createCommand)
    # Optional --device / --instance filters narrow the config request.
    devices = self.getDevices()
    instances = self.getInstances()
    yield self.model().callRemote('getDeviceConfigs', instances, devices)
    configs = driver.next()
    self.log.debug('Fetched %i configs' % len(configs))
    if len(configs) == 0:
        self.log.info("fetchConfig(): No configs returned from zenhub")
    else:
        # configs maps instance -> list of datasource configs.
        for instance in configs.keys():
            deviceConfigs = configs[instance]
            self.updateConfig(instance, deviceConfigs)
    self.log.debug("fetchConfig(): Done fetching config from zenhub")
def run(args, processId):
    """
    Simulate one RRD collection cycle for a synthetic device population.

    Writes one GAUGE datapoint per device datapoint and one DERIVE
    datapoint per component datapoint, fetching each component RRD back
    to exercise the read path.

    @param args: parsed options (perf_path, cycle_time, device_count,
                 device_datapoints, component_count, component_datapoints)
    @param processId: worker index, used to isolate this worker's files
    @return: (cycle_duration_seconds, datapoints_written); (0, 0) if
             interrupted
    """
    try:
        perfPath = "/".join([args.perf_path, str(processId)])
        create_cmd = 'RRA:AVERAGE:0.5:1:600\nRRA:AVERAGE:0.5:6:600\nRRA:AVERAGE:0.5:24:600\nRRA:AVERAGE:0.5:288:600\nRRA:MAX:0.5:6:600\nRRA:MAX:0.5:24:600\nRRA:MAX:0.5:288:600'
        rrd = RRDUtil(create_cmd, args.cycle_time)
        cycle_begin = time()
        dp_count = 0
        for device in map(str, range(args.device_count)):
            # (removed unused per-device 'begin = time()' local)
            for device_dp in map(str, range(args.device_datapoints)):
                dp_count += 1
                rrd.save(os.path.join(perfPath, device, device_dp),
                         42, 'GAUGE')
            for interface in map(str, range(args.component_count)):
                for interface_dp in map(str, range(args.component_datapoints)):
                    dp_count += 1
                    path = os.path.join(perfPath, device, 'os', 'interfaces',
                                        interface, interface_dp)
                    rrd.save(path, 42, 'DERIVE')
                    # Read back the last two cycles to exercise fetch.
                    rrdtool.fetch(rrd.performancePath(path) + '.rrd',
                                  'AVERAGE',
                                  '-s', 'now-%d' % (args.cycle_time * 2),
                                  '-e', 'now')
        cycle_duration = time() - cycle_begin
        return (cycle_duration, dp_count)
    except KeyboardInterrupt:
        return (0, 0)
def afterSetUp(self):
    """Create a fake device and record the perf path for tearDown()."""
    super(TestRRDUtil, self).afterSetUp()
    # The class name doubles as the test device's name.
    devname = str(self.__class__.__name__)
    self.name = devname
    # name, path, dataStorageType, rrdCreateCommand, minmax
    self.path = os.path.join("tests", devname)
    self.dev = self.dmd.Devices.createInstance(devname)
    self.createcmd = 'RRA:AVERAGE:0.5:1:600\nRRA:AVERAGE:0.5:6:600\nRRA:AVERAGE:0.5:24:600\nRRA:AVERAGE:0.5:288:600\nRRA:MAX:0.5:6:600\nRRA:MAX:0.5:24:600\nRRA:MAX:0.5:288:600'
    # Remember the performance directory so tearDown() can clean it up.
    self.perfpath = RRDUtil('', 60).performancePath("tests")
def afterSetUp(self):
    """Set up a fake device plus the RRD create command used by the tests."""
    super(TestRRDUtil, self).afterSetUp()
    device_id = str(self.__class__.__name__)
    self.name = device_id
    # name, path, dataStorageType, rrdCreateCommand, minmax
    self.path = os.path.join("tests", device_id)
    self.dev = self.dmd.Devices.createInstance(device_id)
    self.createcmd = 'RRA:AVERAGE:0.5:1:600\nRRA:AVERAGE:0.5:6:600\nRRA:AVERAGE:0.5:24:600\nRRA:AVERAGE:0.5:288:600\nRRA:MAX:0.5:6:600\nRRA:MAX:0.5:24:600\nRRA:MAX:0.5:288:600'
    scratch = RRDUtil('', 60)
    # tearDown() removes everything under this path.
    self.perfpath = scratch.performancePath("tests")
def testBadType(self):
    """
    Bad data type which only gets used at creation time
    """
    rrd = RRDUtil(self.createcmd, 60)
    # A new file each time so the creation path (where the type is
    # actually consulted) is exercised.
    for bogus_type in ('BOGO', ':BOGO'):
        target = os.path.join(self.path, "%f" % random())
        self.assertRaises(Exception, rrd.save, target, 666.0, bogus_type)
def testBadDefaultCreateCmd(self):
    """
    Bad default command
    """
    def fresh_path():
        # Existing files skip creation, so use a unique path per check.
        return os.path.join(self.path, "%f" % random())

    rrd = RRDUtil('', 60)
    # Empty default create command -> creation must fail.
    self.assertRaises(Exception, rrd.save, fresh_path(), 666.0, 'COUNTER')
    # Explicit empty per-call command fails the same way.
    self.assertRaises(Exception, rrd.save, fresh_path(), 666.0, 'COUNTER',
                      rrdCommand='')
    # A good per-call command overrides the bad default and succeeds.
    self.assertEquals(
        rrd.save(fresh_path(), 666.0, 'COUNTER', rrdCommand=self.createcmd),
        None)
def testMinmaxReversed(self):
    """
    What happens when the min/max values of the data point are reversed?
    """
    util = RRDUtil(self.createcmd, 60)
    target = os.path.join(self.path, "%f" % random())
    # min > max is contradictory and must raise at creation time.
    self.assertRaises(Exception, util.save, target, 666.0, 'COUNTER',
                      min=100, max=1)
def testBadValues(self):
    """
    Bad data values
    """
    rrd = RRDUtil(self.createcmd, 60)
    # Unstorable values are dropped and save() returns None
    # (a little inconsistent, but that is the contract).
    for value in (None, [], "hello world"):
        self.assertEquals(rrd.save(self.path, value, 'COUNTER'), None)
        if value is not None:
            self.assertEqual(rrd.save(self.path, value, 'ABSOLUTE'), None)
def testBadValues(self):
    """
    Bad data values
    """
    rrd = RRDUtil(self.createcmd, 60)
    # Every (value, type) pair below should be silently dropped,
    # with save() returning None.
    checks = [
        (None, 'COUNTER'),
        ([], 'COUNTER'),
        ([], 'ABSOLUTE'),
        ("hello world", 'COUNTER'),
        ("hello world", 'ABSOLUTE'),
    ]
    for value, rrd_type in checks:
        self.assertEquals(rrd.save(self.path, value, rrd_type), None)
def testBadMinmax(self):
    """
    Illegal values for min, max
    """
    def unique_path():
        return os.path.join(self.path, "%f" % random())

    util = RRDUtil(self.createcmd, 60)
    # None of these bogus minimums should make save() fail.
    for wrong_min in [-100, None, 'U', []]:
        self.assertEquals(
            util.save(unique_path(), 666.0, 'COUNTER', min=wrong_min),
            None)
class AppEnginePerf(RRDDaemon):
    """
    RRD collector daemon for Google App Engine performance metrics.

    Fetches datasource configs from zenhub, polls each configured GAE
    instance, and stores the returned counters into RRD files.
    """

    initialServices = RRDDaemon.initialServices + [
        'ZenPacks.chudler.GoogleAppEngine.services.AppEnginePerfConfigService'
    ]

    # Hold on to datasources as a map of instances to applications to
    # datasources.
    # NOTE(review): class-level attribute, shared by all instances of
    # this daemon -- confirm a single daemon instance is ever created.
    datasourceMap = {}

    def __init__(self):
        RRDDaemon.__init__(self, 'appengineperf')
        # map of deviceId -> deviceConfig
        self.deviceConfigs = {}
        self.running = False

    def connected(self):
        """Start the periodic config fetch once connected to zenhub."""
        def configTask(driver):
            # Driver generator: fetch config, then reschedule itself.
            self.log.debug("configTask(): fetching config")
            yield self.fetchConfig()
            driver.next()
            driveLater(self.configCycleInterval * 60, configTask)
        d = drive(configTask)
        d.addCallbacks(self.runCollection, self.errorStop)

    def remote_updateDeviceConfig(self, configMap):
        """Zenhub push: refresh configs for each instance in configMap."""
        self.log.debug("ASYNC instance configuration update")
        for instance, configs in configMap.iteritems():
            self.updateConfig(instance, configs)

    def updateConfig(self, instance, datasourceConfigs):
        """
        Update device configurations to be collected.

        @param instance: GAE instance the configs belong to
        @param datasourceConfigs: list of datasource config objects
        """
        self.log.debug("updateConfig(): updating config for instance %s"
                       % instance)
        self.log.debug("updateConfig(): adding %i configs to monitor"
                       % len(datasourceConfigs))
        applicationsMap = {}
        for datasourceConfig in datasourceConfigs:
            # BUG FIX: the original 'if/else' created an empty list for
            # the first config of each application without appending it,
            # silently dropping one datasource per application.
            applicationsMap.setdefault(datasourceConfig.id,
                                       []).append(datasourceConfig)
            self.log.debug("updateConfig(): datasource %s on %s: %s"
                           % (datasourceConfig.datasourceId,
                              datasourceConfig.id,
                              datasourceConfig.rrdPath))
            self.thresholds.updateList(datasourceConfig.thresholds)
        self.datasourceMap.update({instance: applicationsMap})

    def fetchConfig(self):
        """
        Get configuration values from ZenHub.

        @return: Twisted deferred fired when the fetch completes
        """
        def inner(driver):
            # Driver generator: each yield issues a remote call and the
            # next driver.next() returns its result (order matters).
            self.log.debug("fetchConfig(): Fetching config from zenhub")
            yield self.model().callRemote('getDefaultRRDCreateCommand')
            createCommand = driver.next()
            yield self.model().callRemote('propertyItems')
            self.setPropertyItems(driver.next())
            self.rrd = RRDUtil(createCommand, DEFAULT_HEARTBEAT_TIME)
            yield self.model().callRemote('getThresholdClasses')
            self.remote_updateThresholdClasses(driver.next())
            yield self.model().callRemote('getCollectorThresholds')
            self.rrdStats.config(self.options.monitor, self.name,
                                 driver.next(), createCommand)
            devices = self.getDevices()
            instances = self.getInstances()
            yield self.model().callRemote('getDeviceConfigs',
                                          instances, devices)
            configs = driver.next()
            self.log.debug('Fetched %i configs' % len(configs))
            if len(configs) == 0:
                self.log.info(
                    "fetchConfig(): No configs returned from zenhub")
            else:
                for instance in configs.keys():
                    deviceConfigs = configs[instance]
                    self.updateConfig(instance, deviceConfigs)
            self.log.debug(
                "fetchConfig(): Done fetching config from zenhub")
        return drive(inner)

    def getDevices(self):
        """Return [--device] if given, else [] (meaning all devices)."""
        devices = []
        if self.options.device:
            devices = [self.options.device]
        return devices

    def getInstances(self):
        """Return [--instance] if given, else [] (meaning all instances)."""
        instances = []
        if self.options.instance:
            instances = [self.options.instance]
        return instances

    def storeRRD(self, dsConfig, dpValue):
        """
        Store a value into an RRD file and run threshold checks on it.

        @param dsConfig: datasource config providing rrdPath/rrdConfig
        @param dpValue: datapoint value to store
        """
        rrdConf = dsConfig.rrdConfig.values()[0]
        dpPath = dsConfig.rrdPath
        value = self.rrd.save(dpPath, dpValue, rrdConf.rrdType,
                              rrdConf.command)
        for ev in self.thresholds.check(dpPath, time.time(), value):
            eventKey = dsConfig.eventKey
            if 'eventKey' in ev:
                ev['eventKey'] = '%s|%s' % (eventKey, ev['eventKey'])
            else:
                ev['eventKey'] = eventKey
            self.sendThresholdEvent(**ev)

    def collectAppEngine(self, instance):
        """
        Collect and store all configured datapoints for one GAE instance.

        @return: Twisted deferred for the collection run
        """
        def remoteCall(driver):
            ref_map = {}
            # (counterId, instance, application) -> datasource config
            reverse_map = {}
            for application, dsConfigs in \
                    self.datasourceMap[instance].iteritems():
                for dsConfig in dsConfigs:
                    reverse_map[(dsConfig.metricId.counterId, instance,
                                 application)] = dsConfig

            def processMetrics(metrics, cfgmap):
                self.log.info('cfgmap:%s' % (cfgmap))
                for app_id, data in metrics.iteritems():
                    for counterId, datapoint in data.iteritems():
                        cfgKey = (counterId, instance, app_id)
                        # ignore metrics sent that we didn't ask for
                        if cfgKey not in cfgmap:
                            continue
                        self.log.info(
                            'Accepting counterid:%s datapoint:%s.'
                            ' Instance is:%s Application is:%s'
                            % (counterId, datapoint, instance, app_id))
                        dsConfig = cfgmap[cfgKey]
                        self.storeRRD(dsConfig, datapoint)

            # BUG FIX: gaeClient (and processMetrics) were originally
            # bound AFTER the try block that uses them, so every run
            # raised UnboundLocalError, was swallowed by the except, and
            # no metrics were ever stored.
            gaeClient = self.getGAEClient(instance)
            try:
                # TODO 2FEB10: Need way to retrieve only a certain
                # application.
                metrics = gaeClient.retrieveDataPoints()
                processMetrics(metrics, reverse_map)
            except Exception as ex:
                self.log.warning('Error trying to retrieve' +
                                 ' and save metrics: %s'
                                 % [str(value)
                                    for value in ref_map.values()])
                self.log.exception(ex)
            yield defer.succeed("Collected %s datapoints" % instance.id)
            driver.next()
        return drive(remoteCall)
class AppEnginePerf(RRDDaemon):
    """
    RRD collector daemon for Google App Engine performance metrics.

    Fetches datasource configs from zenhub, polls each configured GAE
    instance, and stores the returned counters into RRD files.
    """

    initialServices = RRDDaemon.initialServices + [
        'ZenPacks.chudler.GoogleAppEngine.services.AppEnginePerfConfigService'
    ]

    # Hold on to datasources as a map of instances to applications to
    # datasources.
    # NOTE(review): class-level attribute, shared by all instances of
    # this daemon -- confirm a single daemon instance is ever created.
    datasourceMap = {}

    def __init__(self):
        RRDDaemon.__init__(self, 'appengineperf')
        # map of deviceId -> deviceConfig
        self.deviceConfigs = {}
        self.running = False

    def connected(self):
        """Start the periodic config fetch once connected to zenhub."""
        def configTask(driver):
            # Driver generator: fetch config, then reschedule itself.
            self.log.debug("configTask(): fetching config")
            yield self.fetchConfig()
            driver.next()
            driveLater(self.configCycleInterval * 60, configTask)
        d = drive(configTask)
        d.addCallbacks(self.runCollection, self.errorStop)

    def remote_updateDeviceConfig(self, configMap):
        """Zenhub push: refresh configs for each instance in configMap."""
        self.log.debug("ASYNC instance configuration update")
        for instance, configs in configMap.iteritems():
            self.updateConfig(instance, configs)

    def updateConfig(self, instance, datasourceConfigs):
        """
        Update device configurations to be collected.

        @param instance: GAE instance the configs belong to
        @param datasourceConfigs: list of datasource config objects
        """
        self.log.debug("updateConfig(): updating config for instance %s"
                       % instance)
        self.log.debug("updateConfig(): adding %i configs to monitor"
                       % len(datasourceConfigs))
        applicationsMap = {}
        for datasourceConfig in datasourceConfigs:
            # BUG FIX: the original 'if/else' created an empty list for
            # the first config of each application without appending it,
            # silently dropping one datasource per application.
            applicationsMap.setdefault(datasourceConfig.id,
                                       []).append(datasourceConfig)
            self.log.debug("updateConfig(): datasource %s on %s: %s"
                           % (datasourceConfig.datasourceId,
                              datasourceConfig.id,
                              datasourceConfig.rrdPath))
            self.thresholds.updateList(datasourceConfig.thresholds)
        self.datasourceMap.update({instance: applicationsMap})

    def fetchConfig(self):
        """
        Get configuration values from ZenHub.

        @return: Twisted deferred fired when the fetch completes
        """
        def inner(driver):
            # Driver generator: each yield issues a remote call and the
            # next driver.next() returns its result (order matters).
            self.log.debug("fetchConfig(): Fetching config from zenhub")
            yield self.model().callRemote('getDefaultRRDCreateCommand')
            createCommand = driver.next()
            yield self.model().callRemote('propertyItems')
            self.setPropertyItems(driver.next())
            self.rrd = RRDUtil(createCommand, DEFAULT_HEARTBEAT_TIME)
            yield self.model().callRemote('getThresholdClasses')
            self.remote_updateThresholdClasses(driver.next())
            yield self.model().callRemote('getCollectorThresholds')
            self.rrdStats.config(self.options.monitor, self.name,
                                 driver.next(), createCommand)
            devices = self.getDevices()
            instances = self.getInstances()
            yield self.model().callRemote('getDeviceConfigs',
                                          instances, devices)
            configs = driver.next()
            self.log.debug('Fetched %i configs' % len(configs))
            if len(configs) == 0:
                self.log.info(
                    "fetchConfig(): No configs returned from zenhub")
            else:
                for instance in configs.keys():
                    deviceConfigs = configs[instance]
                    self.updateConfig(instance, deviceConfigs)
            self.log.debug(
                "fetchConfig(): Done fetching config from zenhub")
        return drive(inner)

    def getDevices(self):
        """Return [--device] if given, else [] (meaning all devices)."""
        devices = []
        if self.options.device:
            devices = [self.options.device]
        return devices

    def getInstances(self):
        """Return [--instance] if given, else [] (meaning all instances)."""
        instances = []
        if self.options.instance:
            instances = [self.options.instance]
        return instances

    def storeRRD(self, dsConfig, dpValue):
        """
        Store a value into an RRD file and run threshold checks on it.

        @param dsConfig: datasource config providing rrdPath/rrdConfig
        @param dpValue: datapoint value to store
        """
        rrdConf = dsConfig.rrdConfig.values()[0]
        dpPath = dsConfig.rrdPath
        value = self.rrd.save(dpPath, dpValue, rrdConf.rrdType,
                              rrdConf.command)
        for ev in self.thresholds.check(dpPath, time.time(), value):
            eventKey = dsConfig.eventKey
            if 'eventKey' in ev:
                ev['eventKey'] = '%s|%s' % (eventKey, ev['eventKey'])
            else:
                ev['eventKey'] = eventKey
            self.sendThresholdEvent(**ev)

    def collectAppEngine(self, instance):
        """
        Collect and store all configured datapoints for one GAE instance.

        @return: Twisted deferred for the collection run
        """
        def remoteCall(driver):
            ref_map = {}
            # (counterId, instance, application) -> datasource config
            reverse_map = {}
            for application, dsConfigs in \
                    self.datasourceMap[instance].iteritems():
                for dsConfig in dsConfigs:
                    reverse_map[(dsConfig.metricId.counterId, instance,
                                 application)] = dsConfig

            def processMetrics(metrics, cfgmap):
                self.log.info('cfgmap:%s' % (cfgmap))
                for app_id, data in metrics.iteritems():
                    for counterId, datapoint in data.iteritems():
                        cfgKey = (counterId, instance, app_id)
                        # ignore metrics sent that we didn't ask for
                        if cfgKey not in cfgmap:
                            continue
                        self.log.info(
                            'Accepting counterid:%s datapoint:%s.'
                            ' Instance is:%s Application is:%s'
                            % (counterId, datapoint, instance, app_id))
                        dsConfig = cfgmap[cfgKey]
                        self.storeRRD(dsConfig, datapoint)

            # BUG FIX: gaeClient (and processMetrics) were originally
            # bound AFTER the try block that uses them, so every run
            # raised UnboundLocalError, was swallowed by the except, and
            # no metrics were ever stored.
            gaeClient = self.getGAEClient(instance)
            try:
                # TODO 2FEB10: Need way to retrieve only a certain
                # application.
                metrics = gaeClient.retrieveDataPoints()
                processMetrics(metrics, reverse_map)
            except Exception as ex:
                self.log.warning('Error trying to retrieve' +
                                 ' and save metrics: %s'
                                 % [str(value)
                                    for value in ref_map.values()])
                self.log.exception(ex)
            yield defer.succeed("Collected %s datapoints" % instance.id)
            driver.next()
        return drive(remoteCall)
def testLowLevelFuncs(self):
    """
    Verify the low-level rrdtool functions (info, fetch, graph) work
    against a file produced by RRDUtil.save().
    """
    rrd = RRDUtil(self.createcmd, 60)
    path = os.path.join(self.path, "%f" % random())

    # Set up the RRD file and add ten samples, one per minute,
    # starting ten minutes ago.
    startTime = time.time() - 10 * 60
    for i in range(0, 10):
        rrd.save(path, i * 100, 'COUNTER', useRRDDaemon=False,
                 timestamp=int(startTime + i * 60), start=startTime)

    # check info function
    import rrdtool
    filename = rrd.performancePath(path) + '.rrd'
    info = rrdtool.info(filename)
    # Plain ints compare equal to Python 2 longs, and the 'L' suffix is
    # a syntax error under Python 3.
    self.assertEquals(info['ds[ds0].index'], 0)
    # self.assertEquals(info['ds[ds0].last_ds'], '90.0')
    self.assertEquals(info['ds[ds0].max'], None)
    self.assertEquals(info['ds[ds0].min'], None)
    self.assertEquals(info['ds[ds0].minimal_heartbeat'], 180)
    self.assertEquals(info['ds[ds0].type'], 'COUNTER')

    # test fetch
    data = rrdtool.fetch(filename, 'AVERAGE', '--start', "%d" % startTime)
    # check the middle of the fetch for the expected 1.7/s rate
    # (100 counts / 60 s ~= 1.67)
    for row in range(2, 8):
        self.failUnlessAlmostEqual(data[2][row][0], 1.7, places=1)

    # test fetch, with daemon pointing to bad socket file.
    # BUG FIX: the original passed '--daemon' '/tmp/blah' (adjacent
    # string literals), which Python concatenates into the single
    # argument '--daemon/tmp/blah'; they must be separate arguments.
    self.assertRaises(rrdtool.error, rrdtool.fetch, filename,
                      'AVERAGE', '--start', "%d" % startTime,
                      '--daemon', '/tmp/blah')

    # test graph
    imFile = rrd.performancePath(path) + ".png"
    rrdtool.graph(imFile,
                  "-w", "400",
                  "-h", "100",
                  "--full-size-mode",
                  "DEF:ds0a=%s:ds0:AVERAGE" % filename,
                  "LINE1:ds0a#0000FF:'default'",
                  )

    def readPNGsize(fname):
        """
        PNG spec defines 16-byte header, followed by width and
        height as unsigned 4-byte integers.
        """
        import struct
        with open(fname, "rb") as pngfile:
            first24 = pngfile.read(24)
        sizebytes = first24[-8:]
        width, height = struct.unpack_from(">II", sizebytes)
        return width, height

    self.assertEquals(readPNGsize(imFile), (400, 100))
class zensunprocess(SnmpDaemon): """ Daemon class to connect to an SNMP agent and determine the processes that are running on that server. """ statusEvent = { 'eventClass' : Status_OSProcess, 'eventGroup' : 'Process' } initialServices = SnmpDaemon.initialServices + ['ZenPacks.community.SunMibMonitor.services.SunMibProcessConfig'] processConfigInterval = 20*60 processCycleInterval = 5*60 properties = SnmpDaemon.properties + ('processCycleInterval',) missing = 0 restarted = 0 parallelJobs = DEFAULT_PARALLEL_JOBS def __init__(self, noopts=False): SnmpDaemon.__init__(self, 'zensunprocess', noopts) self._devices = {} self.scanning = None self.downDevices = Set() def devices(self): """ Return the list of devices that are available @return: device list @rtype: dictionary of device name, device object """ return dict([(k, v) for k, v in self._devices.items() if k not in self.downDevices]) def fetchConfig(self): """ Get configuration values from zenhub @return: Twisted deferred @rtype: Twisted deferred """ def doFetchConfig(driver): now = time.time() yield self.model().callRemote('getDefaultRRDCreateCommand') createCommand = driver.next() yield self.model().callRemote('getZenProcessParallelJobs') self.parallelJobs = int(driver.next()) yield self.model().callRemote('propertyItems') self.setPropertyItems(driver.next()) self.rrd = RRDUtil(createCommand, self.processCycleInterval) yield self.model().callRemote('getThresholdClasses') self.remote_updateThresholdClasses(driver.next()) yield self.model().callRemote('getCollectorThresholds') self.rrdStats.config(self.options.monitor, self.name, driver.next(), createCommand) devices = [] if self.options.device: devices = [self.options.device] yield self.model().callRemote('getSunMibProcessConf', devices) driver.next() self.sendEvents( self.rrdStats.gauge('configTime', self.processConfigInterval, time.time() - now) ) return drive(doFetchConfig) def remote_deleteDevice(self, doomed): """ Called from zenhub to remove a device from our 
configuration @parameter doomed: device to delete @type doomed: string """ self.log.debug("zenhub asks us to delete device %s" % doomed) if doomed in self._devices: del self._devices[doomed] self.clearSnmpError(doomed, "Device %s removed from SNMP collection") def remote_updateDeviceList(self, devices): """ Called from zenhub to update the devices to monitor @parameter devices: devices to monitor @type devices: list of (device, changetime) tuples """ self.log.debug("Received updated device list from zenhub %s" % devices) doomed = Set(self._devices.keys()) updated = [] for device, lastChange in devices: # Ignore updates for devices if we've only asked for one device if self.options.device and \ device != self.options.device: self.log.debug("Ignoring update for %s as we only want %s", device, self.options.device) continue cfg = self._devices.get(device, None) if not cfg or self._devices[device].lastChange < lastChange: updated.append(device) doomed.discard(device) if updated: log.info("Fetching the config for %s", updated) d = self.model().callRemote('getSunMibProcessConf', devices) d.addCallback(self.updateDevices, updated) d.addErrback(self.error) if doomed: log.info("Removing %s", doomed) for device in doomed: del self._devices[device] self.clearSnmpError(device, "device %s removed" % device) def clearSnmpError(self, name, message): """ Send an event to clear other events. @parameter name: device for which the event applies @type name: string @parameter message: clear text @type message: string """ if name in self._devices: if self._devices[name].snmpStatus > 0: self._devices[name].snmpStatus = 0 self.sendEvent(self.statusEvent, eventClass=Status_Snmp, component="process", device=name, summary=message, agent='zensunprocess', severity=Event.Clear) def remote_updateDevice(self, cfg): """ Twisted remote callback, to allow zenhub to remotely update this daemon. 
        @parameter cfg: configuration information returned from zenhub
        @type cfg: object
        """
        self.log.debug("Configuration update from zenhub for %s", cfg.name)
        # A single-config push is treated as a one-element batch with an
        # empty "fetched" list, so no existing device gets pruned.
        self.updateDevices([cfg], [])

    def updateDevices(self, cfgs, fetched):
        """
        Merge device configurations returned from zenhub into our device
        map, and forget any device we asked about that zenhub no longer
        returned.

        @parameter cfgs: configuration information returned from zenhub
        @type cfgs: list of objects
        @parameter fetched: names we want zenhub to return information about
        @type fetched: list of strings
        """
        received = Set()
        log.debug("Fetched configs from Zenhub using string %s"% fetched)
        log.debug("Configs: %s"% cfgs)
        for cfg in cfgs:
            received.add(cfg.name)
            # Keep the existing device object (with its runtime state) if
            # we already track this device; otherwise adopt the new config.
            d = self._devices.setdefault(cfg.name, cfg)
            d.updateConfig(cfg)
            self.thresholds.updateForDevice(cfg.name, cfg.thresholds)
        # Devices we asked about but did not get back are gone: drop them.
        for doomed in Set(fetched) - received:
            if doomed in self._devices:
                del self._devices[doomed]

    def start(self, driver):
        """
        Read the basic config needed to do anything, and to reread
        the configuration information on a periodic basis.

        Generator run under drive(); each yield hands a deferred to the
        driver and the result is retrieved with driver.next().

        @parameter driver: generator driver supplied by drive()
        """
        log.debug("Fetching configuration from zenhub")
        devices = self._devices.keys()
        yield self.fetchConfig()
        self.updateDevices(driver.next(), devices)

        yield self.model().callRemote('getSnmpStatus', self.options.device)
        self.updateSnmpStatus(driver.next())

        yield self.model().callRemote('getProcessStatus', self.options.device)
        self.updateProcessStatus(driver.next())

        # Re-run the whole configuration cycle later.
        driveLater(self.configCycleInterval * 60, self.start)

    def updateSnmpStatus(self, updates):
        """
        Called when the zenhub service getSnmpStatus completes.

        @parameter updates: List of names and error counts
        @type updates: list of (string, int)
        """
        for name, count in updates:
            d = self._devices.get(name)
            if d:
                d.snmpStatus = count

    def updateProcessStatus(self, status):
        """
        Called when the zenhub service getProcessStatus completes.

        @parameter status: List of names, component names and error counts
        @type status: list of (string, string, int)
        """
        down = {}
        for device, component, count in status:
            down[ (device, component) ] = count
        for name, device in self._devices.items():
            for p in device.processes.values():
                # Default to 0 (no failures) when zenhub reported nothing
                # for this (device, process) pair.
                p.status = down.get( (name, p.originalName), 0)

    def oneDevice(self, device):
        """
        Contact one device and return a deferred which gathers data
        from the device.

        @parameter device: proxy object to the remote computer
        @type device: Device object
        @return: job to scan a device
        @rtype: Twisted deferred object
        """
        def go(driver):
            """
            Generator object to gather information from a device.
            """
            try:
                device.open()
                yield self.scanDevice(device)
                driver.next()
                # Only fetch performance data if status data was found.
                if device.snmpStatus == 0:
                    yield self.fetchPerf(device)
                    driver.next()
                else:
                    log.warn("Failed to find performance data for %s",
                             device.name)
            except:
                # NOTE(review): bare except keeps one bad device from
                # killing the whole scan cycle -- presumably deliberate.
                log.debug('Failed to scan device %s' % device.name)

        def close(res):
            """
            Twisted closeBack and errBack function which closes any
            open connections.
            """
            try:
                device.close()
            except:
                log.debug("Failed to close device %s" % device.name)

        d = drive(go)
        # close() runs on both success and failure paths.
        d.addBoth(close)
        return d

    def scanDevice(self, device):
        """
        Fetch all the process info for a device using SNMP table gets

        @parameter device: proxy connection object
        @type device: Device object
        @return: Twisted deferred
        @rtype: Twisted deferred
        """
        device.lastScan = time.time()
        tables = [NAMETABLE]
        d = device.getTables(tables)
        d.addCallback(self.storeProcessNames, device)
        d.addErrback(self.deviceFailure, device)
        return d

    def deviceFailure(self, reason, device):
        """
        Twisted errBack to log the exception for a single device.

        @parameter reason: explanation of the failure
        @type reason: Twisted error instance
        @parameter device: proxy connection object
        @type device: Device object
        """
        self.sendEvent(self.statusEvent,
                       eventClass=Status_Snmp,
                       component="process",
                       device=device.name,
                       summary='Unable to read processes on device %s' % device.name,
                       severity=Event.Error)
        # A non-zero snmpStatus suppresses performance collection (see go()).
        device.snmpStatus += 1
        if isinstance(reason.value, error.TimeoutError):
            self.log.debug('Timeout on device %s' % device.name)
        else:
            self.logError('Error on device %s' % device.name, reason.value)

    def mapResultsToDicts(self, results):
        """
        Parse the process tables and reconstruct the list of processes
        that are on the device.

        @parameter results: results of SNMP table gets ie (OID + pid, value)
        @type results: dictionary of dictionaries
        @return: pid-to-name map and list of (pid, (name, args)) tuples
        @rtype: tuple of (dictionary, list of tuples)
        """
        def extract(dictionary, oid, value):
            """
            Helper function to extract SNMP table data.
            """
            # The pid is the last sub-identifier of the table row's OID.
            pid = int(oid.split('.')[-1])
            dictionary[pid] = value

        names = {}
        if self.options.showrawtables:
            log.info("NAMETABLE = %r", results[NAMETABLE])
        for row in results[NAMETABLE].items():
            extract(names, *row)

        procs = []
        for pid, name in names.items():
            # NOTE(review): path is always '' here, so the branch below can
            # never replace name -- looks like vestigial code; confirm.
            path = ''
            if path and path.find('\\') == -1:
                name = path
            procs.append( (pid, (name, '') ) )
        return names, procs

    def showProcessList(self, device_name, procs):
        """
        Display the processes in a sane manner.

        @parameter device_name: name of the device
        @type device_name: string
        @parameter procs: list of (pid, (name))
        @type procs: list of tuples
        """
        proc_list = [ '%s %s' % (pid, name) for pid, name \
                      in sorted(procs)]
        proc_list.append('')
        log.info("#===== Processes on %s:\n%s",
                 device_name, '\n'.join(proc_list))

    def storeProcessNames(self, results, device):
        """
        Parse the process tables and reconstruct the list of processes
        that are on the device.

        @parameter results: results of SNMP table gets
        @type results: dictionary of dictionaries
        @parameter device: proxy connection object
        @type device: Device object
        """
        # No table data at all: the device does not publish the MIB.
        if not results or not results[NAMETABLE]:
            summary = 'Device %s does not publish Sun MIB' % device.name
            resolution="Verify with snmpwalk -v1 -c community %s %s" % (
                device.name, NAMETABLE )
            self.sendEvent(self.statusEvent,
                           device=device.name,
                           summary=summary,
                           resolution=resolution,
                           severity=Event.Error)
            log.info(summary)
            return
        # Got data back: clear any prior SNMP-failure state on the device.
        if device.snmpStatus > 0:
            summary = 'Process table up for device %s' % device.name
            self.clearSnmpError(device.name, summary)

        names, procs = self.mapResultsToDicts(results)
        if self.options.showprocs:
            self.showProcessList(device.name, procs)

        # look for changes in processes: match every scanned process
        # against every monitored process config.
        before = Set(device.pids.keys())
        after = {}
        for p in device.processes.values():
            for pid, name in procs:
                #log.debug('Checking process %s with %s is %s'%(name,pid,p.match(name[0])))
                if p.match(name[0]):
                    log.debug("Found process %d on %s" % (pid, p.name))
                    after[pid] = p
        afterSet = Set(after.keys())
        afterByConfig = reverseDict(after)
        new = afterSet - before
        dead = before - afterSet

        # report pid restarts: a config whose old pid died but which still
        # has a (different) live pid was restarted.
        restarted = {}
        for p in dead:
            config = device.pids[p]
            config.discardPid(p)
            if config in afterByConfig:
                self.restarted += 1
                if config.restart:
                    restarted[config] = True
                    summary = 'Process restarted: %s' % config.originalName
                    self.sendEvent(self.statusEvent,
                                   device=device.name,
                                   summary=summary,
                                   component=config.originalName,
                                   severity=config.severity)
                    log.info(summary)

        # report alive processes (clear events), unless already reported
        # as restarted above.
        for config, pids in afterByConfig.items():
            if config in restarted: continue
            summary = "Process up: %s" % config.originalName
            self.sendEvent(self.statusEvent,
                           device=device.name,
                           summary=summary,
                           component=config.originalName,
                           severity=Event.Clear)
            config.status = 0
            log.debug(summary)

        for p in new:
            log.debug("Found new %s pid %d on %s" % (
                after[p].originalName, p, device.name))
        device.pids = after

        # Look for missing processes
        for config in device.processes.values():
            if config not in afterByConfig:
                self.missing += 1
                config.status += 1
                summary = 'Process not running: %s' % config.originalName
                self.sendEvent(self.statusEvent,
                               device=device.name,
                               summary=summary,
                               component=config.originalName,
                               severity=config.severity)
                log.warning(summary)

        # Store per-device, per-process statistics
        pidCounts = dict([(p, 0) for p in device.processes])
        for pids, pidConfig in device.pids.items():
            pidCounts[pidConfig.name] += 1
        for name, count in pidCounts.items():
            self.save(device.name, name, 'count_count', count, 'GAUGE')

    def periodic(self, unused=None):
        """
        Main loop that drives all other processing.
        """
        # Schedule the next cycle up front so a failure below cannot stop
        # future cycles.
        reactor.callLater(self.processCycleInterval, self.periodic)

        # If the previous scan is still in flight, log it and skip this
        # cycle rather than piling up concurrent scans.
        if self.scanning:
            running, unstarted, finished = self.scanning.status()
            runningDevices = [ d.name for d in self.devices().values() \
                               if d.proxy is not None]
            if runningDevices or unstarted > 0:
                log.warning("Process scan not finishing: "
                            "%d running, %d waiting, %d finished" % (
                            running, unstarted, finished))
                log.warning("Problem devices: %r", runningDevices)
                return

        start = time.time()

        def doPeriodic(driver):
            """
            Generator function to create deferred jobs.
            """
            # Skip devices that are already known to be down via ping.
            yield self.getDevicePingIssues()
            self.downDevices = Set([d[0] for d in driver.next()])

            self.scanning = NJobs(self.parallelJobs,
                                  self.oneDevice,
                                  self.devices().values())
            yield self.scanning.start()
            driver.next()

        def checkResults(results):
            """
            Process the results from all deferred objects.
            """
            for result in results:
                if isinstance(result , Exception):
                    log.error("Error scanning device: %s", result)
                    break
            self.cycleTime = time.time() - start
            self.heartbeat()

        drive(doPeriodic).addCallback(checkResults)

    def fetchPerf(self, device):
        """
        Get performance data for all the monitored processes on a device

        @parameter device: proxy object to the remote computer
        @type device: Device object
        @return: deferred firing with the collected SNMP results
        @rtype: Twisted deferred
        """
        oids = []
        for pid, pidConf in device.pids.items():
            oids.extend([CPU + str(pid), MEM + str(pid)])
        if not oids:
            # Nothing to collect; hand back an already-fired deferred.
            return defer.succeed(([], device))

        # Fetch the OIDs in device-sized chunks, chained sequentially.
        d = Chain(device.get, iter(chunk(oids, device.maxOidsPerRequest))).run()
        d.addCallback(self.storePerfStats, device)
        d.addErrback(self.deviceFailure, device)
        return d

    def storePerfStats(self, results, device):
        """
        Save the process performance data in RRD files

        @parameter results: results of SNMP table gets
        @type results: list of (success, result) tuples
        @parameter device: proxy object to the remote computer
        @type device: Device object
        """
        # Any failed chunk fails the whole device.
        for success, result in results:
            if not success:
                self.deviceFailure(result, device)
                return results
        self.clearSnmpError(device.name,
                            'Process table up for device %s' % device.name)
        # Flatten the per-chunk result dictionaries into one mapping.
        parts = {}
        for success, values in results:
            if success:
                parts.update(values)
        results = parts
        byConf = reverseDict(device.pids)
        for pidConf, pids in byConf.items():
            if len(pids) != 1:
                log.info("There are %d pids by the name %s",
                         len(pids), pidConf.name)
            pidName = pidConf.name
            # Accumulate per-pid samples on the config, then store the
            # aggregate for the process.
            for pid in pids:
                cpu = results.get(CPU + str(pid), None)
                mem = results.get(MEM + str(pid), None)
                pidConf.updateCpu(pid, cpu)
                pidConf.updateMemory(pid, mem)
            self.save(device.name, pidName, 'cpu_cpu',
                      pidConf.getCpu(), 'DERIVE', min=0)
            self.save(device.name, pidName, 'mem_mem',
                      pidConf.getMemory() * 1024, 'GAUGE')

    def save(self, deviceName, pidName, statName, value, rrdType,
             min='U', max='U'):
        """
        Save a value into an RRD file

        @param deviceName: name of the remote device (ie a hostname)
        @type deviceName: string
        @param pidName: name of the monitored process
        @type pidName: string
        @param statName: metric name
        @type statName: string
        @param value: data to be stored
        @type value: number
        @param rrdType: RRD data type (eg ABSOLUTE, DERIVE, COUNTER)
        @type rrdType: string
        @param min: minimum value acceptable for this metric
        @type min: number
        @param max: maximum value acceptable for this metric
        @type max: number
        """
        path = 'Devices/%s/os/processes/%s/%s' % (deviceName, pidName,
                                                  statName)
        try:
            # rrd.save returns the value actually stored (possibly coerced).
            value = self.rrd.save(path, value, rrdType, min=min, max=max)
        except Exception, ex:
            summary= "Unable to save data for process-monitor RRD %s" % \
                      path
            self.log.critical( summary )
            message= "Data was value= %s, type=%s, min=%s, max=%s" % \
                     ( value, rrdType, min, max, )
            self.log.critical( message )
            self.log.exception( ex )
            import traceback
            trace_info= traceback.format_exc()
            evid= self.sendEvent(dict(
                dedupid="%s|%s" % (self.options.monitor, 'RRD write failure'),
                severity=Critical,
                device=self.options.monitor,
                eventClass=Status_Perf,
                component="RRD",
                pidName=pidName,
                statName=statName,
                path=path,
                message=message,
                traceback=trace_info,
                summary=summary))
            # Skip thresholds
            return

        # Check thresholds against the stored value and emit any events.
        for ev in self.thresholds.check(path, time.time(), value):
            self.sendThresholdEvent(**ev)
def writeRRD(self, devId, compType, compId, dpName, value): """ Write the given data to its RRD file. Also check any thresholds and send events if value is out of bounds. Note that if the write does not succeed, a None value is returned. @param devId: device name (as known by DMD) @type devId: string @param compType: component type (found in objects meta_type field) @type compType: string @param compId: name of the component @type compId: string @param dpName: name of the data point @type dpName: string @param value: performance metric to store @type value: number @return: valid value (ie long or float) or None @rtype: number or None """ log.debug('Writing %s %s' % (dpName, value)) dev = self.getDeviceOrComponent(devId, compType, compId) dp = dev.getRRDDataPoint(dpName) if not dp: log.warn('Did not find datapoint %s on device %s', dpName, devId) return None rrdKey = (dev.getPrimaryPath(), dp.getPrimaryPath()) rrdCreateCmd = None if rrdKey in self.rrd: rrd = self.rrd[rrdKey] else: rrdCreateCmd = dp.createCmd or self.getDefaultRRDCreateCommand(dev) rrd = RRDUtil(rrdCreateCmd, dp.datasource.cycletime) self.rrd[rrdKey] = rrd # convert value to a long if our data point uses a long type if dp.rrdtype in RRDImpl.LONG_RRD_TYPES: try: value = long(value) except ValueError: log.warn("Value '%s' received for data point '%s' that " \ "could not be converted to a long" % \ (value, dp.rrdtype)) # see if there are any thresholds defined for this datapoint, so we can # choose a more optimal RRD storage method if there aren't any dp_has_threshold = self.hasThreshold(dp) if dp_has_threshold: rrd_write_fn = rrd.save else: rrd_write_fn = rrd.put path = os.path.join(dev.rrdPath(), dp.name()) try: value = rrd_write_fn(path, value, dp.rrdtype, rrdCreateCmd, dp.datasource.cycletime, dp.rrdmin, dp.rrdmax) except Exception, ex: summary= "Unable to save data in zenhub for RRD %s" % \ path log.critical(summary) message= "Data was value= %s, type=%s, min=%s, max=%s" % \ ( value, dp.rrdtype, 
dp.rrdmin, dp.rrdmax, ) log.critical(message) log.exception(ex) import traceback trace_info = traceback.format_exc() evid = self.zem.sendEvent( dict(dedupid="%s|%s" % (devId, 'RRD write failure'), severity=Critical, device=devId, eventClass=Status_Perf, component="RRD", compType=compType, compId=compId, datapoint=dpName, message=message, traceback=trace_info, summary=summary)) # Skip thresholds return
def testLowLevelFuncs(self): """ Verify info function succeeds. """ rrd = RRDUtil(self.createcmd, 60) path = os.path.join(self.path, "%f" % random()) # setup RRD file, add values to it startTime = time.time() - 10 * 60 for i in range(0, 10): rrd.save(path, i * 100, 'COUNTER', useRRDDaemon=False, timestamp=int(startTime + i * 60), start=startTime) # check info function import rrdtool filename = rrd.performancePath(path) + '.rrd' info = rrdtool.info(filename) self.assertEquals(info['ds[ds0].index'], 0L) # self.assertEquals(info['ds[ds0].last_ds'], '90.0') self.assertEquals(info['ds[ds0].max'], None) self.assertEquals(info['ds[ds0].min'], None) self.assertEquals(info['ds[ds0].minimal_heartbeat'], 180L) self.assertEquals(info['ds[ds0].type'], 'COUNTER') # test fetch data = rrdtool.fetch(filename, 'AVERAGE', '--start', "%d" % startTime) # check the middle of the fetch for 1.7/s rate self.failUnlessAlmostEqual(data[2][2][0], 1.7, places=1) self.failUnlessAlmostEqual(data[2][3][0], 1.7, places=1) self.failUnlessAlmostEqual(data[2][4][0], 1.7, places=1) self.failUnlessAlmostEqual(data[2][5][0], 1.7, places=1) self.failUnlessAlmostEqual(data[2][6][0], 1.7, places=1) self.failUnlessAlmostEqual(data[2][7][0], 1.7, places=1) # test fetch, with daemon pointing to bad socket file self.assertRaises(rrdtool.error, rrdtool.fetch, filename, 'AVERAGE', '--start', "%d" % startTime, '--daemon' '/tmp/blah') # test graph imFile = rrd.performancePath(path) + ".png" rrdtool.graph( imFile, "-w", "400", "-h", "100", "--full-size-mode", "DEF:ds0a=%s:ds0:AVERAGE" % filename, "LINE1:ds0a#0000FF:'default'", ) def readPNGsize(fname): """ PNG spec defines 16-byte header, followed by width and height as unsigned 4-byte integers. """ import struct with open(fname, "rb") as pngfile: first24 = pngfile.read(24) sizebytes = first24[-8:] width, height = struct.unpack_from(">II", sizebytes) return width, height self.assertEquals(readPNGsize(imFile), (400, 100))