def __init__(self, preferences, taskSplitter, configurationListener=DUMMY_LISTENER, initializationCallback=None, stoppingCallback=None): """ Constructs a new instance of the CollectorDaemon framework. Normally only a singleton instance of a CollectorDaemon should exist within a process, but this is not enforced. @param preferences: the collector configuration @type preferences: ICollectorPreferences @param taskSplitter: the task splitter to use for this collector @type taskSplitter: ITaskSplitter @param initializationCallback: a callable that will be executed after connection to the hub but before retrieving configuration information @type initializationCallback: any callable @param stoppingCallback: a callable that will be executed first during the stopping process. Exceptions will be logged but otherwise ignored. @type stoppingCallback: any callable """ # create the configuration first, so we have the collector name # available before activating the rest of the Daemon class hierarchy. if not ICollectorPreferences.providedBy(preferences): raise TypeError("configuration must provide ICollectorPreferences") else: self._prefs = ObservableProxy(preferences) self._prefs.attachAttributeObserver('configCycleInterval', self._rescheduleConfig) if not ITaskSplitter.providedBy(taskSplitter): raise TypeError("taskSplitter must provide ITaskSplitter") else: self._taskSplitter = taskSplitter if not IConfigurationListener.providedBy(configurationListener): raise TypeError( "configurationListener must provide IConfigurationListener") self._configListener = ConfigListenerNotifier() self._configListener.addListener(configurationListener) self._configListener.addListener(DeviceGuidListener(self)) self._initializationCallback = initializationCallback self._stoppingCallback = stoppingCallback # register the various interfaces we provide the rest of the system so # that collector implementors can easily retrieve a reference back here # if needed zope.component.provideUtility(self, ICollector) zope.component.provideUtility(self, IEventService) zope.component.provideUtility(self, IDataService) # setup daemon statistics self._statService = StatisticsService() self._statService.addStatistic("devices", "GAUGE") self._statService.addStatistic("dataPoints", "DERIVE") self._statService.addStatistic("runningTasks", "GAUGE") self._statService.addStatistic("taskCount", "GAUGE") self._statService.addStatistic("queuedTasks", "GAUGE") self._statService.addStatistic("missedRuns", "GAUGE") zope.component.provideUtility(self._statService, IStatisticsService) # register the collector's own preferences object so it may be easily # retrieved by factories, tasks, etc. zope.component.provideUtility(self.preferences, ICollectorPreferences, self.preferences.collectorName) super(CollectorDaemon, self).__init__(name=self.preferences.collectorName) self._deviceGuids = {} self._devices = set() self._unresponsiveDevices = set() self._rrd = None self._metric_writer = None self._derivative_tracker = None self.reconfigureTimeout = None # keep track of pending tasks if we're doing a single run, and not a # continuous cycle if not self.options.cycle: self._completedTasks = 0 self._pendingTasks = [] frameworkFactory = zope.component.queryUtility(IFrameworkFactory, self._frameworkFactoryName) self._configProxy = frameworkFactory.getConfigurationProxy() self._scheduler = frameworkFactory.getScheduler() self._scheduler.maxTasks = self.options.maxTasks self._ConfigurationLoaderTask = frameworkFactory.getConfigurationLoaderTask() # OLD - set the initialServices attribute so that the PBDaemon class # will load all of the remote services we need. self.initialServices = PBDaemon.initialServices +\ [self.preferences.configurationService] # trap SIGUSR2 so that we can display detailed statistics signal.signal(signal.SIGUSR2, self._signalHandler) # let the configuration do any additional startup it might need self.preferences.postStartup() self.addedPostStartupTasks = False
class CollectorDaemon(RRDDaemon): """ The daemon class for the entire ZenCollector framework. This class bridges the gap between the older daemon framework and ZenCollector. New collectors no longer should extend this class to implement a new collector. """ zope.interface.implements(ICollector, IDataService, IEventService) _frameworkFactoryName = "" @property def preferences(self): """ Preferences for this daemon """ return self._prefs def __init__(self, preferences, taskSplitter, configurationListener=DUMMY_LISTENER, initializationCallback=None, stoppingCallback=None): """ Constructs a new instance of the CollectorDaemon framework. Normally only a singleton instance of a CollectorDaemon should exist within a process, but this is not enforced. @param preferences: the collector configuration @type preferences: ICollectorPreferences @param taskSplitter: the task splitter to use for this collector @type taskSplitter: ITaskSplitter @param initializationCallback: a callable that will be executed after connection to the hub but before retrieving configuration information @type initializationCallback: any callable @param stoppingCallback: a callable that will be executed first during the stopping process. Exceptions will be logged but otherwise ignored. @type stoppingCallback: any callable """ # create the configuration first, so we have the collector name # available before activating the rest of the Daemon class hierarchy. if not ICollectorPreferences.providedBy(preferences): raise TypeError("configuration must provide ICollectorPreferences") else: self._prefs = ObservableProxy(preferences) self._prefs.attachAttributeObserver('configCycleInterval', self._rescheduleConfig) if not ITaskSplitter.providedBy(taskSplitter): raise TypeError("taskSplitter must provide ITaskSplitter") else: self._taskSplitter = taskSplitter if not IConfigurationListener.providedBy(configurationListener): raise TypeError( "configurationListener must provide IConfigurationListener") self._configListener = ConfigListenerNotifier() self._configListener.addListener(configurationListener) self._configListener.addListener(DeviceGuidListener(self)) self._initializationCallback = initializationCallback self._stoppingCallback = stoppingCallback # register the various interfaces we provide the rest of the system so # that collector implementors can easily retrieve a reference back here # if needed zope.component.provideUtility(self, ICollector) zope.component.provideUtility(self, IEventService) zope.component.provideUtility(self, IDataService) # setup daemon statistics self._statService = StatisticsService() self._statService.addStatistic("devices", "GAUGE") self._statService.addStatistic("dataPoints", "DERIVE") self._statService.addStatistic("runningTasks", "GAUGE") self._statService.addStatistic("taskCount", "GAUGE") self._statService.addStatistic("queuedTasks", "GAUGE") self._statService.addStatistic("missedRuns", "GAUGE") zope.component.provideUtility(self._statService, IStatisticsService) # register the collector's own preferences object so it may be easily # retrieved by factories, tasks, etc. zope.component.provideUtility(self.preferences, ICollectorPreferences, self.preferences.collectorName) super(CollectorDaemon, self).__init__(name=self.preferences.collectorName) self._deviceGuids = {} self._devices = set() self._unresponsiveDevices = set() self._rrd = None self._metric_writer = None self._derivative_tracker = None self.reconfigureTimeout = None # keep track of pending tasks if we're doing a single run, and not a # continuous cycle if not self.options.cycle: self._completedTasks = 0 self._pendingTasks = [] frameworkFactory = zope.component.queryUtility(IFrameworkFactory, self._frameworkFactoryName) self._configProxy = frameworkFactory.getConfigurationProxy() self._scheduler = frameworkFactory.getScheduler() self._scheduler.maxTasks = self.options.maxTasks self._ConfigurationLoaderTask = frameworkFactory.getConfigurationLoaderTask() # OLD - set the initialServices attribute so that the PBDaemon class # will load all of the remote services we need. self.initialServices = PBDaemon.initialServices +\ [self.preferences.configurationService] # trap SIGUSR2 so that we can display detailed statistics signal.signal(signal.SIGUSR2, self._signalHandler) # let the configuration do any additional startup it might need self.preferences.postStartup() self.addedPostStartupTasks = False def buildOptions(self): """ Method called by CmdBase.__init__ to build all of the possible command-line options for this collector daemon. """ super(CollectorDaemon, self).buildOptions() maxTasks = getattr(self.preferences, 'maxTasks', None) defaultMax = maxTasks if maxTasks else 500 self.parser.add_option('--maxparallel', dest='maxTasks', type='int', default=defaultMax, help='Max number of tasks to run at once, default %default') self.parser.add_option('--logTaskStats', dest='logTaskStats', type='int', default=0, help='How often to logs statistics of current tasks, value in seconds; very verbose') self.parser.add_option('--writeStatistics', dest='writeStatistics', type='int', default=60, help='How often to write internal statistics value in seconds') frameworkFactory = zope.component.queryUtility(IFrameworkFactory, self._frameworkFactoryName) if hasattr(frameworkFactory, 'getFrameworkBuildOptions'): # During upgrades we'll be missing this option self._frameworkBuildOptions = frameworkFactory.getFrameworkBuildOptions() if self._frameworkBuildOptions: self._frameworkBuildOptions(self.parser) # give the collector configuration a chance to add options, too self.preferences.buildOptions(self.parser) def parseOptions(self): super(CollectorDaemon, self).parseOptions() self.preferences.options = self.options def connected(self): """ Method called by PBDaemon after a connection to ZenHub is established. """ return self._startup() def _getInitializationCallback(self): def doNothing(): pass if self._initializationCallback is not None: return self._initializationCallback else: return doNothing def connectTimeout(self): super(CollectorDaemon, self).connectTimeout() return self._startup() def _startup(self): d = defer.maybeDeferred( self._getInitializationCallback() ) d.addCallback( self._startConfigCycle ) d.addCallback( self._startMaintenance ) d.addErrback( self._errorStop ) return d def watchdogCycleTime(self): """ Return our cycle time (in minutes) @return: cycle time @rtype: integer """ return self.preferences.cycleInterval * 2 def getRemoteConfigServiceProxy(self): """ Called to retrieve the remote configuration service proxy object. """ return self.services.get(self.preferences.configurationService, FakeRemote()) def generateEvent(self, event, **kw): eventCopy = super(CollectorDaemon, self).generateEvent(event, **kw) if eventCopy and eventCopy.get("device"): device_id = eventCopy.get("device") guid = self._deviceGuids.get(device_id) if guid: eventCopy['device_guid'] = guid return eventCopy def writeMetric(self, contextUUID, metric, value, metricType, contextId, timestamp='N', min='U', max='U', threshEventData={}, deviceuuid=None): """ Writes the metric to the metric publisher. @param contextUUID: This is who the metric applies to. This is usually a component or a device. @param metric: the name of the metric, we expect it to be of the form datasource_datapoint @param value: the value of the metric @param metricType: type of the metric (e.g. 'COUNTER', 'GUAGE', 'DERIVE' etc) @param contextId: used for the threshold events, the id of who this metric is for @param timestamp: defaults to time.time() if not specified, the time the metric occurred @param min: used in the derive the min value for the metric @param max: used in the derive the max value for the metric @param threshEventData: extra data put into threshold events @param deviceuuid: the unique identifier of the device for this metric, maybe the same as contextUUID if the context is a device @return: a deferred that fires when the metric gets published """ timestamp = int(time.time()) if timestamp == 'N' else timestamp data_source, data_point_name = metric.split("_") tags = { 'datasource': data_source, 'uuid': contextUUID } if deviceuuid: tags['device'] = deviceuuid # write the raw metric to Redis self._metric_writer.write_metric( data_point_name, value, timestamp, tags) # compute (and cache) a rate for COUNTER/DERIVE if metricType in {'COUNTER', 'DERIVE'}: value = self._derivative_tracker.derivative( contextUUID, (int(value), timestamp), min, max) # check for threshold breaches and send events when needed if value is not None: self._threshold_notifier.notify(contextUUID, contextId, metric, timestamp, value, threshEventData) @deprecated def writeRRD(self, path, value, rrdType, rrdCommand=None, cycleTime=None, min='U', max='U', threshEventData={}, timestamp='N', allowStaleDatapoint=True): """ Use writeMetric """ # we rely on the fact that rrdPath now returns the guid for an object uuidInfo, metric = path.rsplit('/', 1) if not 'METRIC_DATA' in str(uuidInfo): raise Exception("Unable to write Metric with given path { %s } please see the rrdpath method" % str(uuidInfo)) uuidInfo = json.loads(uuidInfo) # reroute to new writeMetric method self.writeMetric(uuidInfo['contextUUID'], metric, value, rrdType, uuidInfo['contextId'], timestamp, min, max, threshEventData, uuidInfo.get('deviceUUID', None) ) def stop(self, ignored=""): if self._stoppingCallback is not None: try: self._stoppingCallback() except Exception: self.log.exception('Exception while stopping daemon') super(CollectorDaemon, self).stop( ignored ) def remote_deleteDevice(self, devId): """ Called remotely by ZenHub when a device we're monitoring is deleted. """ # guard against parsing updates during a disconnect if devId is None: return self._deleteDevice(devId) def remote_deleteDevices(self, deviceIds): """ Called remotely by ZenHub when devices we're monitoring are deleted. """ # guard against parsing updates during a disconnect if deviceIds is None: return for devId in Zipper.load(deviceIds): self._deleteDevice(devId) def remote_updateDeviceConfig(self, config): """ Called remotely by ZenHub when asynchronous configuration updates occur. """ # guard against parsing updates during a disconnect if config is None: return self.log.debug("Device %s updated", config.configId) if not self.options.device or self.options.device in (config.id, config.configId): self._updateConfig(config) self._configProxy.updateConfigProxy(self.preferences, config) def remote_updateDeviceConfigs(self, configs): """ Called remotely by ZenHub when asynchronous configuration updates occur. """ if configs is None: return for config in Zipper.load(configs): self.remote_updateDeviceConfig(config) def remote_notifyConfigChanged(self): """ Called from zenhub to notify that the entire config should be updated """ if self.reconfigureTimeout and self.reconfigureTimeout.active(): # We will run along with the already scheduled task self.log.debug("notifyConfigChanged - using existing call") return self.log.debug("notifyConfigChanged - scheduling call in 30 seconds") self.reconfigureTimeout = reactor.callLater(30, self._rebuildConfig) def _rebuildConfig(self): """ Delete and re-add the configuration tasks to completely re-build the configuration. """ if self.reconfigureTimeout and not self.reconfigureTimeout.active(): self.reconfigureTimeout = None self._scheduler.removeTasksForConfig(CONFIG_LOADER_NAME) self._startConfigCycle() def _rescheduleConfig(self, observable, attrName, oldValue, newValue, **kwargs): """ Delete and re-add the configuration tasks to start on new interval. """ if oldValue != newValue: self.log.debug("Changing config task interval from %s to %s minutes" % (oldValue, newValue)) self._scheduler.removeTasksForConfig(CONFIG_LOADER_NAME) #values are in minutes, scheduler takes seconds self._startConfigCycle(startDelay=newValue * 60) def _taskCompleteCallback(self, taskName): # if we're not running a normal daemon cycle then we need to shutdown # once all of our pending tasks have completed if not self.options.cycle: try: self._pendingTasks.remove(taskName) except ValueError: pass self._completedTasks += 1 # if all pending tasks have been completed then shutdown the daemon if len(self._pendingTasks) == 0: self._displayStatistics() self.stop() def _updateConfig(self, cfg): configId = cfg.configId self.log.debug("Processing configuration for %s", configId) nextExpectedRuns = {} if configId in self._devices: tasksToRemove = self._scheduler.getTasksForConfig(configId) nextExpectedRuns = { taskToRemove.name: self._scheduler.getNextExpectedRun(taskToRemove.name) for taskToRemove in tasksToRemove } self._scheduler.removeTasks(task.name for task in tasksToRemove) self._configListener.updated(cfg) else: self._devices.add(configId) self._configListener.added(cfg) newTasks = self._taskSplitter.splitConfiguration([cfg]) self.log.debug("Tasks for config %s: %s", configId, newTasks) nowTime = time.time() for (taskName, task_) in newTasks.iteritems(): #if not cycling run the task immediately otherwise let the scheduler #decide when to run the task now = not self.options.cycle nextExpectedRun = nextExpectedRuns.get(taskName, None) if nextExpectedRun: startDelay = nextExpectedRun - nowTime if startDelay <= 0: # handle edge case where we are about to run # so run immediately now = True task_.startDelay = 0 else: task_.startDelay = startDelay self._scheduler.addTask(task_, self._taskCompleteCallback, now) # TODO: another hack? if hasattr(cfg, 'thresholds'): self.getThresholds().updateForDevice(configId, cfg.thresholds) # if we're not running a normal daemon cycle then keep track of the # tasks we just added for this device so that we can shutdown once # all pending tasks have completed if not self.options.cycle: self._pendingTasks.append(taskName) @defer.inlineCallbacks def _updateDeviceConfigs(self, updatedConfigs, purgeOmitted): """ Update the device configurations for the devices managed by this collector. @param deviceConfigs a list of device configurations @type deviceConfigs list of name,value tuples """ self.log.debug("updateDeviceConfigs: updatedConfigs=%s", (map(str, updatedConfigs))) for cfg in updatedConfigs: self._updateConfig(cfg) # yield time to reactor so other things can happen yield task.deferLater(reactor, 0, lambda: None) if purgeOmitted: self._purgeOmittedDevices(cfg.configId for cfg in updatedConfigs) def _purgeOmittedDevices(self, updatedDevices): """ Delete all current devices that are omitted from the list of devices being updated. @param updatedDevices a collection of device ids @type updatedDevices a sequence of strings """ # remove tasks for the deleted devices deletedDevices = set(self._devices) - set(updatedDevices) self.log.debug("purgeOmittedDevices: deletedConfigs=%s", ','.join(deletedDevices)) for configId in deletedDevices: self._deleteDevice(configId) def _deleteDevice(self, deviceId): self.log.debug("Device %s deleted" % deviceId) self._devices.discard(deviceId) self._configListener.deleted(deviceId) self._configProxy.deleteConfigProxy(self.preferences, deviceId) self._scheduler.removeTasksForConfig(deviceId) def _errorStop(self, result): """ Twisted callback to receive fatal messages. @param result: the Twisted failure @type result: failure object """ if isinstance(result, Failure): msg = result.getErrorMessage() else: msg = str(result) self.log.critical("Unrecoverable Error: %s", msg) self.stop() def _startConfigCycle(self, result=None, startDelay=0): configLoader = self._ConfigurationLoaderTask(CONFIG_LOADER_NAME, taskConfig=self.preferences) configLoader.startDelay = startDelay # Don't add the config loader task if the scheduler already has # an instance of it. if configLoader not in self._scheduler: # Run initial maintenance cycle as soon as possible # TODO: should we not run maintenance if running in non-cycle mode? self._scheduler.addTask(configLoader) else: self.log.info("%s already added to scheduler", configLoader.name) return defer.succeed("Configuration loader task started") def setPropertyItems(self, items): """ Override so that preferences are updated """ super(CollectorDaemon, self).setPropertyItems(items) self._setCollectorPreferences(dict(items)) def _setCollectorPreferences(self, preferenceItems): for name, value in preferenceItems.iteritems(): if not hasattr(self.preferences, name): # TODO: make a super-low level debug mode? The following message isn't helpful #self.log.debug("Preferences object does not have attribute %s", # name) setattr(self.preferences, name, value) elif getattr(self.preferences, name) != value: self.log.debug("Updated %s preference to %s", name, value) setattr(self.preferences, name, value) def _loadThresholdClasses(self, thresholdClasses): self.log.debug("Loading classes %s", thresholdClasses) for c in thresholdClasses: try: importClass(c) except ImportError: log.exception("Unable to import class %s", c) def _configureThresholds(self, thresholds): self.getThresholds().updateList(thresholds) def _startMaintenance(self, ignored=None): unused(ignored) if not self.options.cycle: self._maintenanceCycle() return if self.options.logTaskStats > 0: log.debug("Starting Task Stat logging") loop = task.LoopingCall(self._displayStatistics, verbose=True) loop.start(self.options.logTaskStats, now=False) log.debug("Starting Statistic posting") loop = task.LoopingCall(self._postStatistics) loop.start(self.options.writeStatistics, now=False) interval = self.preferences.cycleInterval self.log.debug("Initializing maintenance Cycle") maintenanceCycle = MaintenanceCycle(interval, self, self._maintenanceCycle) maintenanceCycle.start() def _maintenanceCycle(self, ignored=None): """ Perform daemon maintenance processing on a periodic schedule. Initially called after the daemon configuration loader task is added, but afterward will self-schedule each run. """ self.log.debug("Performing periodic maintenance") def _processDeviceIssues(result): self.log.debug("deviceIssues=%r", result) if result is None: return result # exception or some other problem # Device ping issues returns as a tuple of (deviceId, count, total) # and we just want the device id newUnresponsiveDevices = set(i[0] for i in result) clearedDevices = self._unresponsiveDevices.difference(newUnresponsiveDevices) for devId in clearedDevices: self.log.debug("Resuming tasks for device %s", devId) self._scheduler.resumeTasksForConfig(devId) self._unresponsiveDevices = newUnresponsiveDevices for devId in self._unresponsiveDevices: self.log.debug("Pausing tasks for device %s", devId) self._scheduler.pauseTasksForConfig(devId) return result def _getDeviceIssues(): # TODO: handle different types of device issues, such as WMI issues d = self.getDevicePingIssues() return d def _maintenance(): if self.options.cycle: if getattr(self.preferences, 'pauseUnreachableDevices', True): d = defer.maybeDeferred(_getDeviceIssues) d.addCallback(_processDeviceIssues) else: d = defer.succeed(None) else: d = defer.succeed("No maintenance required") return d d = _maintenance() return d def runPostConfigTasks(self, result=None): """ Add post-startup tasks from the preferences. This may be called with the failure code as well. """ if isinstance(result, Failure): pass elif not self.addedPostStartupTasks: postStartupTasks = getattr(self.preferences, 'postStartupTasks', lambda : []) for task in postStartupTasks(): self._scheduler.addTask(task, now=True) self.addedPostStartupTasks = True def _postStatistics(self): self._displayStatistics() # update and post statistics if we've been configured to do so if self.rrdStats: stat = self._statService.getStatistic("devices") stat.value = len(self._devices) # stat = self._statService.getStatistic("cyclePoints") # stat.value = self._rrd.endCycle() stat = self._statService.getStatistic("dataPoints") stat.value = self.metricWriter().dataPoints # Scheduler statistics stat = self._statService.getStatistic("runningTasks") stat.value = self._scheduler._executor.running stat = self._statService.getStatistic("taskCount") stat.value = self._scheduler.taskCount stat = self._statService.getStatistic("queuedTasks") stat.value = self._scheduler._executor.queued stat = self._statService.getStatistic("missedRuns") stat.value = self._scheduler.missedRuns self._statService.postStatistics(self.rrdStats) def _displayStatistics(self, verbose=False): if self.metricWriter(): self.log.info("%d devices processed (%d datapoints)", len(self._devices), self.metricWriter().dataPoints) else: self.log.info("%d devices processed (0 datapoints)", len(self._devices)) self._scheduler.displayStatistics(verbose) def _signalHandler(self, signum, frame): self._displayStatistics(True)
def __init__(self, preferences, taskSplitter, configurationListener=DUMMY_LISTENER, initializationCallback=None, stoppingCallback=None): """ Constructs a new instance of the CollectorDaemon framework. Normally only a singleton instance of a CollectorDaemon should exist within a process, but this is not enforced. @param preferences: the collector configuration @type preferences: ICollectorPreferences @param taskSplitter: the task splitter to use for this collector @type taskSplitter: ITaskSplitter @param initializationCallback: a callable that will be executed after connection to the hub but before retrieving configuration information @type initializationCallback: any callable @param stoppingCallback: a callable that will be executed first during the stopping process. Exceptions will be logged but otherwise ignored. @type stoppingCallback: any callable """ # create the configuration first, so we have the collector name # available before activating the rest of the Daemon class hierarchy. if not ICollectorPreferences.providedBy(preferences): raise TypeError("configuration must provide ICollectorPreferences") else: self._prefs = ObservableProxy(preferences) self._prefs.attachAttributeObserver('configCycleInterval', self._rescheduleConfig) if not ITaskSplitter.providedBy(taskSplitter): raise TypeError("taskSplitter must provide ITaskSplitter") else: self._taskSplitter = taskSplitter if not IConfigurationListener.providedBy(configurationListener): raise TypeError( "configurationListener must provide IConfigurationListener") self._configListener = ConfigListenerNotifier() self._configListener.addListener(configurationListener) self._configListener.addListener(DeviceGuidListener(self)) self._initializationCallback = initializationCallback self._stoppingCallback = stoppingCallback # register the various interfaces we provide the rest of the system so # that collector implementors can easily retrieve a reference back here # if needed zope.component.provideUtility(self, ICollector) zope.component.provideUtility(self, IEventService) zope.component.provideUtility(self, IDataService) # register the collector's own preferences object so it may be easily # retrieved by factories, tasks, etc. zope.component.provideUtility(self.preferences, ICollectorPreferences, self.preferences.collectorName) super(CollectorDaemon, self).__init__(name=self.preferences.collectorName) self._statService = StatisticsService() zope.component.provideUtility(self._statService, IStatisticsService) if self.options.cycle: # setup daemon statistics (deprecated names) self._statService.addStatistic("devices", "GAUGE") self._statService.addStatistic("dataPoints", "DERIVE") self._statService.addStatistic("runningTasks", "GAUGE") self._statService.addStatistic("taskCount", "GAUGE") self._statService.addStatistic("queuedTasks", "GAUGE") self._statService.addStatistic("missedRuns", "GAUGE") # namespace these a bit so they can be used in ZP monitoring. # prefer these stat names and metrology in future refs self._dataPointsMetric = Metrology.meter( "collectordaemon.dataPoints") daemon = self class DeviceGauge(Gauge): @property def value(self): return len(daemon._devices) Metrology.gauge('collectordaemon.devices', DeviceGauge()) # Scheduler statistics class RunningTasks(Gauge): @property def value(self): return daemon._scheduler._executor.running Metrology.gauge('collectordaemon.runningTasks', RunningTasks()) class TaskCount(Gauge): @property def value(self): return daemon._scheduler.taskCount Metrology.gauge('collectordaemon.taskCount', TaskCount()) class QueuedTasks(Gauge): @property def value(self): return daemon._scheduler._executor.queued Metrology.gauge('collectordaemon.queuedTasks', QueuedTasks()) class MissedRuns(Gauge): @property def value(self): return daemon._scheduler.missedRuns Metrology.gauge('collectordaemon.missedRuns', MissedRuns()) self._deviceGuids = {} self._devices = set() self._unresponsiveDevices = set() self._rrd = None self._metric_writer = None self._derivative_tracker = None self.reconfigureTimeout = None # keep track of pending tasks if we're doing a single run, and not a # continuous cycle if not self.options.cycle: self._completedTasks = 0 self._pendingTasks = [] frameworkFactory = zope.component.queryUtility( IFrameworkFactory, self._frameworkFactoryName) self._configProxy = frameworkFactory.getConfigurationProxy() self._scheduler = frameworkFactory.getScheduler() self._scheduler.maxTasks = self.options.maxTasks self._ConfigurationLoaderTask = frameworkFactory.getConfigurationLoaderTask( ) # OLD - set the initialServices attribute so that the PBDaemon class # will load all of the remote services we need. self.initialServices = PBDaemon.initialServices +\ [self.preferences.configurationService] # trap SIGUSR2 so that we can display detailed statistics signal.signal(signal.SIGUSR2, self._signalHandler) # let the configuration do any additional startup it might need self.preferences.postStartup() self.addedPostStartupTasks = False # Variables used by enterprise collector in resmgr # # flag that indicates we have finished loading the configs for the first time after a restart self.firstConfigLoadDone = False # flag that indicates the daemon has received the encryption key from zenhub self.encryptionKeyInitialized = False # flag that indicates the daemon is loading the cached configs self.loadingCachedConfigs = False
class CollectorDaemon(RRDDaemon): """ The daemon class for the entire ZenCollector framework. This class bridges the gap between the older daemon framework and ZenCollector. New collectors no longer should extend this class to implement a new collector. """ zope.interface.implements(ICollector, IDataService, IEventService) _frameworkFactoryName = "" # So users (subclasses) can check for metric tag support without inspect. metricExtraTags = True @property def preferences(self): """ Preferences for this daemon """ return self._prefs def __init__(self, preferences, taskSplitter, configurationListener=DUMMY_LISTENER, initializationCallback=None, stoppingCallback=None): """ Constructs a new instance of the CollectorDaemon framework. Normally only a singleton instance of a CollectorDaemon should exist within a process, but this is not enforced. @param preferences: the collector configuration @type preferences: ICollectorPreferences @param taskSplitter: the task splitter to use for this collector @type taskSplitter: ITaskSplitter @param initializationCallback: a callable that will be executed after connection to the hub but before retrieving configuration information @type initializationCallback: any callable @param stoppingCallback: a callable that will be executed first during the stopping process. Exceptions will be logged but otherwise ignored. @type stoppingCallback: any callable """ # create the configuration first, so we have the collector name # available before activating the rest of the Daemon class hierarchy. if not ICollectorPreferences.providedBy(preferences): raise TypeError("configuration must provide ICollectorPreferences") else: self._prefs = ObservableProxy(preferences) self._prefs.attachAttributeObserver('configCycleInterval', self._rescheduleConfig) if not ITaskSplitter.providedBy(taskSplitter): raise TypeError("taskSplitter must provide ITaskSplitter") else: self._taskSplitter = taskSplitter if not IConfigurationListener.providedBy(configurationListener): raise TypeError( "configurationListener must provide IConfigurationListener") self._configListener = ConfigListenerNotifier() self._configListener.addListener(configurationListener) self._configListener.addListener(DeviceGuidListener(self)) self._initializationCallback = initializationCallback self._stoppingCallback = stoppingCallback # register the various interfaces we provide the rest of the system so # that collector implementors can easily retrieve a reference back here # if needed zope.component.provideUtility(self, ICollector) zope.component.provideUtility(self, IEventService) zope.component.provideUtility(self, IDataService) # register the collector's own preferences object so it may be easily # retrieved by factories, tasks, etc. zope.component.provideUtility(self.preferences, ICollectorPreferences, self.preferences.collectorName) super(CollectorDaemon, self).__init__(name=self.preferences.collectorName) self._statService = StatisticsService() zope.component.provideUtility(self._statService, IStatisticsService) if self.options.cycle: # setup daemon statistics (deprecated names) self._statService.addStatistic("devices", "GAUGE") self._statService.addStatistic("dataPoints", "DERIVE") self._statService.addStatistic("runningTasks", "GAUGE") self._statService.addStatistic("taskCount", "GAUGE") self._statService.addStatistic("queuedTasks", "GAUGE") self._statService.addStatistic("missedRuns", "GAUGE") # namespace these a bit so they can be used in ZP monitoring. # prefer these stat names and metrology in future refs self._dataPointsMetric = Metrology.meter( "collectordaemon.dataPoints") daemon = self class DeviceGauge(Gauge): @property def value(self): return len(daemon._devices) Metrology.gauge('collectordaemon.devices', DeviceGauge()) # Scheduler statistics class RunningTasks(Gauge): @property def value(self): return daemon._scheduler._executor.running Metrology.gauge('collectordaemon.runningTasks', RunningTasks()) class TaskCount(Gauge): @property def value(self): return daemon._scheduler.taskCount Metrology.gauge('collectordaemon.taskCount', TaskCount()) class QueuedTasks(Gauge): @property def value(self): return daemon._scheduler._executor.queued Metrology.gauge('collectordaemon.queuedTasks', QueuedTasks()) class MissedRuns(Gauge): @property def value(self): return daemon._scheduler.missedRuns Metrology.gauge('collectordaemon.missedRuns', MissedRuns()) self._deviceGuids = {} self._devices = set() self._unresponsiveDevices = set() self._rrd = None self._metric_writer = None self._derivative_tracker = None self.reconfigureTimeout = None # keep track of pending tasks if we're doing a single run, and not a # continuous cycle if not self.options.cycle: self._completedTasks = 0 self._pendingTasks = [] frameworkFactory = zope.component.queryUtility( IFrameworkFactory, self._frameworkFactoryName) self._configProxy = frameworkFactory.getConfigurationProxy() self._scheduler = frameworkFactory.getScheduler() self._scheduler.maxTasks = self.options.maxTasks self._ConfigurationLoaderTask = frameworkFactory.getConfigurationLoaderTask( ) # OLD - set the initialServices attribute so that the PBDaemon class # will load all of the remote services we need. self.initialServices = PBDaemon.initialServices +\ [self.preferences.configurationService] # trap SIGUSR2 so that we can display detailed statistics signal.signal(signal.SIGUSR2, self._signalHandler) # let the configuration do any additional startup it might need self.preferences.postStartup() self.addedPostStartupTasks = False # Variables used by enterprise collector in resmgr # # flag that indicates we have finished loading the configs for the first time after a restart self.firstConfigLoadDone = False # flag that indicates the daemon has received the encryption key from zenhub self.encryptionKeyInitialized = False # flag that indicates the daemon is loading the cached configs self.loadingCachedConfigs = False def buildOptions(self): """ Method called by CmdBase.__init__ to build all of the possible command-line options for this collector daemon. """ super(CollectorDaemon, self).buildOptions() maxTasks = getattr(self.preferences, 'maxTasks', None) defaultMax = maxTasks if maxTasks else 500 self.parser.add_option( '--maxparallel', dest='maxTasks', type='int', default=defaultMax, help='Max number of tasks to run at once, default %default') self.parser.add_option( '--logTaskStats', dest='logTaskStats', type='int', default=0, help= 'How often to logs statistics of current tasks, value in seconds; very verbose' ) addWorkerOptions(self.parser) self.parser.add_option( '--traceMetricName', dest='traceMetricName', type='string', default=None, help='trace metrics whose name matches this regex') self.parser.add_option( '--traceMetricKey', dest='traceMetricKey', type='string', default=None, help='trace metrics whose key value matches this regex') frameworkFactory = zope.component.queryUtility( IFrameworkFactory, self._frameworkFactoryName) if hasattr(frameworkFactory, 'getFrameworkBuildOptions'): # During upgrades we'll be missing this option self._frameworkBuildOptions = frameworkFactory.getFrameworkBuildOptions( ) if self._frameworkBuildOptions: self._frameworkBuildOptions(self.parser) # give the collector configuration a chance to add options, too self.preferences.buildOptions(self.parser) def parseOptions(self): super(CollectorDaemon, self).parseOptions() self.preferences.options = self.options configFilter = parseWorkerOptions(self.options.__dict__) if configFilter: self.preferences.configFilter = configFilter def connected(self): """ Method called by PBDaemon after a connection to ZenHub is established. """ return self._startup() def _getInitializationCallback(self): def doNothing(): pass if self._initializationCallback is not None: return self._initializationCallback else: return doNothing def connectTimeout(self): super(CollectorDaemon, self).connectTimeout() return self._startup() def _startup(self): d = defer.maybeDeferred(self._getInitializationCallback()) d.addCallback(self._initEncryptionKey) d.addCallback(self._startConfigCycle) d.addCallback(self._startMaintenance) d.addErrback(self._errorStop) return d @defer.inlineCallbacks def _initEncryptionKey(self, prv_cb_result=None): # encrypt dummy msg in order to initialize the encryption key data = yield self._configProxy.encrypt( "Hello") # block until we get the key if data: # encrypt returns None if an exception is raised self.encryptionKeyInitialized = True self.log.info("Daemon's encryption key initialized") def watchdogCycleTime(self): """ Return our cycle time (in minutes) @return: cycle time @rtype: integer """ return self.preferences.cycleInterval * 2 def getRemoteConfigServiceProxy(self): """ Called to retrieve the remote configuration service proxy object. """ return self.services.get(self.preferences.configurationService, FakeRemote()) def generateEvent(self, event, **kw): eventCopy = super(CollectorDaemon, self).generateEvent(event, **kw) if eventCopy and eventCopy.get("device"): device_id = eventCopy.get("device") guid = self._deviceGuids.get(device_id) if guid: eventCopy['device_guid'] = guid return eventCopy def should_trace_metric(self, metric, contextkey): """ Tracer implementation - use this function to indicate whether a given metric/contextkey combination is to be traced. :param metric: name of the metric in question :param contextkey: context key of the metric in question :return: boolean indicating whether to trace this metric/key """ tests = [] if self.options.traceMetricName: tests.append((self.options.traceMetricName, metric)) if self.options.traceMetricKey: tests.append((self.options.traceMetricKey, contextkey)) result = [bool(re.search(exp, subj)) for exp, subj in tests] return len(result) > 0 and all(result) @defer.inlineCallbacks def writeMetric(self, contextKey, metric, value, metricType, contextId, timestamp='N', min='U', max='U', threshEventData=None, deviceId=None, contextUUID=None, deviceUUID=None, extraTags=None): """ Writes the metric to the metric publisher. @param contextKey: This is who the metric applies to. This is usually the return value of rrdPath() for a component or device. @param metric: the name of the metric, we expect it to be of the form datasource_datapoint @param value: the value of the metric @param metricType: type of the metric (e.g. 'COUNTER', 'GAUGE', 'DERIVE' etc) @param contextId: used for the threshold events, the id of who this metric is for @param timestamp: defaults to time.time() if not specified, the time the metric occurred @param min: used in the derive the min value for the metric @param max: used in the derive the max value for the metric @param threshEventData: extra data put into threshold events @param deviceId: the id of the device for this metric @return: a deferred that fires when the metric gets published """ timestamp = int(time.time()) if timestamp == 'N' else timestamp tags = {'contextUUID': contextUUID, 'key': contextKey} if self.should_trace_metric(metric, contextKey): tags['mtrace'] = "{}".format(int(time.time())) metric_name = metric if deviceId: tags['device'] = deviceId # compute (and cache) a rate for COUNTER/DERIVE if metricType in {'COUNTER', 'DERIVE'}: if metricType == 'COUNTER' and min == 'U': # COUNTER implies only positive derivatives are valid. min = 0 dkey = "%s:%s" % (contextUUID, metric) value = self._derivative_tracker.derivative( dkey, (float(value), timestamp), min, max) # check for threshold breaches and send events when needed if value is not None: if extraTags: tags.update(extraTags) # write the metric to Redis try: yield defer.maybeDeferred(self._metric_writer.write_metric, metric_name, value, timestamp, tags) except Exception as e: self.log.debug("Error sending metric %s", e) yield defer.maybeDeferred(self._threshold_notifier.notify, contextUUID, contextId, metric, timestamp, value, threshEventData) def writeMetricWithMetadata(self, metric, value, metricType, timestamp='N', min='U', max='U', threshEventData=None, metadata=None, extraTags=None): metadata = metadata or {} try: key = metadata['contextKey'] contextId = metadata['contextId'] deviceId = metadata['deviceId'] contextUUID = metadata['contextUUID'] if metadata: metric_name = metrics.ensure_prefix(metadata, metric) else: metric_name = metric except KeyError as e: raise Exception("Missing necessary metadata: %s" % e.message) return self.writeMetric(key, metric_name, value, metricType, contextId, timestamp=timestamp, min=min, max=max, threshEventData=threshEventData, deviceId=deviceId, contextUUID=contextUUID, deviceUUID=metadata.get('deviceUUID'), extraTags=extraTags) @deprecated def writeRRD(self, path, value, rrdType, rrdCommand=None, cycleTime=None, min='U', max='U', threshEventData={}, timestamp='N', allowStaleDatapoint=True): """ Use writeMetric """ # we rely on the fact that rrdPath now returns more information than just the path metricinfo, metric = path.rsplit('/', 1) if 'METRIC_DATA' not in str(metricinfo): raise Exception( "Unable to write Metric with given path { %s } please see the rrdpath method" % str(metricinfo)) metadata = json.loads(metricinfo) # reroute to new writeMetric method return self.writeMetricWithMetadata(metric, value, rrdType, timestamp, min, max, threshEventData, metadata) def stop(self, ignored=""): if self._stoppingCallback is not None: try: self._stoppingCallback() except Exception: self.log.exception('Exception while stopping daemon') super(CollectorDaemon, self).stop(ignored) def remote_deleteDevice(self, devId): """ Called remotely by ZenHub when a device we're monitoring is deleted. """ # guard against parsing updates during a disconnect if devId is None: return self._deleteDevice(devId) def remote_deleteDevices(self, deviceIds): """ Called remotely by ZenHub when devices we're monitoring are deleted. """ # guard against parsing updates during a disconnect if deviceIds is None: return for devId in Zipper.load(deviceIds): self._deleteDevice(devId) def remote_updateDeviceConfig(self, config): """ Called remotely by ZenHub when asynchronous configuration updates occur. """ # guard against parsing updates during a disconnect if config is None: return self.log.debug("Device %s updated", config.configId) if self._updateConfig(config): self._configProxy.updateConfigProxy(self.preferences, config) else: self.log.debug("Device %s config filtered", config.configId) def remote_updateDeviceConfigs(self, configs): """ Called remotely by ZenHub when asynchronous configuration updates occur. """ if configs is None: return configs = Zipper.load(configs) self.log.debug( "remote_updateDeviceConfigs: workerid %s processing %s device configs", self.options.workerid, len(configs)) for config in configs: self.remote_updateDeviceConfig(config) def remote_notifyConfigChanged(self): """ Called from zenhub to notify that the entire config should be updated """ if self.reconfigureTimeout and self.reconfigureTimeout.active(): # We will run along with the already scheduled task self.log.debug("notifyConfigChanged - using existing call") return self.log.debug("notifyConfigChanged - scheduling call in 30 seconds") self.reconfigureTimeout = reactor.callLater(30, self._rebuildConfig) def _rebuildConfig(self): """ Delete and re-add the configuration tasks to completely re-build the configuration. """ if self.reconfigureTimeout and not self.reconfigureTimeout.active(): self.reconfigureTimeout = None self._scheduler.removeTasksForConfig(CONFIG_LOADER_NAME) self._startConfigCycle() def _rescheduleConfig(self, observable, attrName, oldValue, newValue, **kwargs): """ Delete and re-add the configuration tasks to start on new interval. """ if oldValue != newValue: self.log.debug( "Changing config task interval from %s to %s minutes" % (oldValue, newValue)) self._scheduler.removeTasksForConfig(CONFIG_LOADER_NAME) #values are in minutes, scheduler takes seconds self._startConfigCycle(startDelay=newValue * 60) def _taskCompleteCallback(self, taskName): # if we're not running a normal daemon cycle then we need to shutdown # once all of our pending tasks have completed if not self.options.cycle: try: self._pendingTasks.remove(taskName) except ValueError: pass self._completedTasks += 1 # if all pending tasks have been completed then shutdown the daemon if len(self._pendingTasks) == 0: self._displayStatistics() self.stop() def _updateConfig(self, cfg): """ Update device configuration. Returns true if config is updated, false if config is skipped """ # guard against parsing updates during a disconnect if cfg is None: return False configFilter = getattr(self.preferences, "configFilter", None) or (lambda x: True) if not ((not self.options.device and configFilter(cfg)) or self.options.device in (cfg.id, cfg.configId)): self.log.info("Device %s config filtered", cfg.configId) return False configId = cfg.configId self.log.debug("Processing configuration for %s", configId) nextExpectedRuns = {} if configId in self._devices: tasksToRemove = self._scheduler.getTasksForConfig(configId) nextExpectedRuns = { taskToRemove.name: self._scheduler.getNextExpectedRun(taskToRemove.name) for taskToRemove in tasksToRemove } self._scheduler.removeTasks(task.name for task in tasksToRemove) self._configListener.updated(cfg) else: self._devices.add(configId) self._configListener.added(cfg) newTasks = self._taskSplitter.splitConfiguration([cfg]) self.log.debug("Tasks for config %s: %s", configId, newTasks) nowTime = time.time() for (taskName, task_) in newTasks.iteritems(): #if not cycling run the task immediately otherwise let the scheduler #decide when to run the task now = not self.options.cycle nextExpectedRun = nextExpectedRuns.get(taskName, None) if nextExpectedRun: startDelay = nextExpectedRun - nowTime if startDelay <= 0: # handle edge case where we are about to run # so run immediately now = True task_.startDelay = 0 else: task_.startDelay = startDelay try: self._scheduler.addTask(task_, self._taskCompleteCallback, now) except ValueError: self.log.exception("Error adding device config") continue # TODO: another hack? if hasattr(cfg, 'thresholds'): self.getThresholds().updateForDevice(configId, cfg.thresholds) # if we're not running a normal daemon cycle then keep track of the # tasks we just added for this device so that we can shutdown once # all pending tasks have completed if not self.options.cycle: self._pendingTasks.append(taskName) # put tasks on pause after configuration update to prevent unnecessary collections ZEN-25463 if configId in self._unresponsiveDevices: self.log.debug("Pausing tasks for device %s", configId) self._scheduler.pauseTasksForConfig(configId) return True @defer.inlineCallbacks def _updateDeviceConfigs(self, updatedConfigs, purgeOmitted): """ Update the device configurations for the devices managed by this collector. @param deviceConfigs a list of device configurations @type deviceConfigs list of name,value tuples """ self.log.debug("updateDeviceConfigs: updatedConfigs=%s", (map(str, updatedConfigs))) for cfg in updatedConfigs: self._updateConfig(cfg) # yield time to reactor so other things can happen yield task.deferLater(reactor, 0, lambda: None) if purgeOmitted: self._purgeOmittedDevices(cfg.configId for cfg in updatedConfigs) def _purgeOmittedDevices(self, updatedDevices): """ Delete all current devices that are omitted from the list of devices being updated. @param updatedDevices a collection of device ids @type updatedDevices a sequence of strings """ # remove tasks for the deleted devices deletedDevices = set(self._devices) - set(updatedDevices) self.log.debug("purgeOmittedDevices: deletedConfigs=%s", ','.join(deletedDevices)) for configId in deletedDevices: self._deleteDevice(configId) def _deleteDevice(self, deviceId): self.log.debug("Device %s deleted" % deviceId) self._devices.discard(deviceId) self._configListener.deleted(deviceId) self._configProxy.deleteConfigProxy(self.preferences, deviceId) self._scheduler.removeTasksForConfig(deviceId) def _errorStop(self, result): """ Twisted callback to receive fatal messages. @param result: the Twisted failure @type result: failure object """ if isinstance(result, Failure): msg = result.getErrorMessage() else: msg = str(result) self.log.critical("Unrecoverable Error: %s", msg) self.stop() def _startConfigCycle(self, result=None, startDelay=0): configLoader = self._ConfigurationLoaderTask( CONFIG_LOADER_NAME, taskConfig=self.preferences) configLoader.startDelay = startDelay # Don't add the config loader task if the scheduler already has # an instance of it. if configLoader not in self._scheduler: # Run initial maintenance cycle as soon as possible # TODO: should we not run maintenance if running in non-cycle mode? self._scheduler.addTask(configLoader) else: self.log.info("%s already added to scheduler", configLoader.name) return defer.succeed("Configuration loader task started") def setPropertyItems(self, items): """ Override so that preferences are updated """ super(CollectorDaemon, self).setPropertyItems(items) self._setCollectorPreferences(dict(items)) def _setCollectorPreferences(self, preferenceItems): for name, value in preferenceItems.iteritems(): if not hasattr(self.preferences, name): # TODO: make a super-low level debug mode? The following message isn't helpful #self.log.debug("Preferences object does not have attribute %s", # name) setattr(self.preferences, name, value) elif getattr(self.preferences, name) != value: self.log.debug("Updated %s preference to %s", name, value) setattr(self.preferences, name, value) def _loadThresholdClasses(self, thresholdClasses): self.log.debug("Loading classes %s", thresholdClasses) for c in thresholdClasses: try: importClass(c) except ImportError: log.exception("Unable to import class %s", c) def _configureThresholds(self, thresholds): self.getThresholds().updateList(thresholds) def _startMaintenance(self, ignored=None): unused(ignored) if not self.options.cycle: self._maintenanceCycle() return if self.options.logTaskStats > 0: log.debug("Starting Task Stat logging") loop = task.LoopingCall(self._displayStatistics, verbose=True) loop.start(self.options.logTaskStats, now=False) interval = self.preferences.cycleInterval self.log.debug("Initializing maintenance Cycle") heartbeatSender = self if self.worker_id == 0 else None maintenanceCycle = MaintenanceCycle(interval, heartbeatSender, self._maintenanceCycle) maintenanceCycle.start() @defer.inlineCallbacks def _maintenanceCycle(self, ignored=None): """ Perform daemon maintenance processing on a periodic schedule. Initially called after the daemon configuration loader task is added, but afterward will self-schedule each run. """ try: self.log.debug("Performing periodic maintenance") if not self.options.cycle: ret = "No maintenance required" elif getattr(self.preferences, 'pauseUnreachableDevices', True): # TODO: handle different types of device issues ret = yield self._pauseUnreachableDevices() else: ret = None defer.returnValue(ret) except Exception: self.log.exception('failure in _maintenanceCycle') raise @defer.inlineCallbacks def _pauseUnreachableDevices(self): issues = yield self.getDevicePingIssues() self.log.debug("deviceIssues=%r", issues) if issues is None: defer.returnValue(issues) # exception or some other problem # Device ping issues returns as a tuple of (deviceId, count, total) # and we just want the device id newUnresponsiveDevices = set(i[0] for i in issues) clearedDevices = self._unresponsiveDevices.difference( newUnresponsiveDevices) for devId in clearedDevices: self.log.debug("Resuming tasks for device %s", devId) self._scheduler.resumeTasksForConfig(devId) self._unresponsiveDevices = newUnresponsiveDevices for devId in self._unresponsiveDevices: self.log.debug("Pausing tasks for device %s", devId) self._scheduler.pauseTasksForConfig(devId) defer.returnValue(issues) def runPostConfigTasks(self, result=None): """ Add post-startup tasks from the preferences. This may be called with the failure code as well. """ if isinstance(result, Failure): pass elif not self.addedPostStartupTasks: postStartupTasks = getattr(self.preferences, 'postStartupTasks', lambda: []) for task in postStartupTasks(): self._scheduler.addTask(task, now=True) self.addedPostStartupTasks = True def postStatisticsImpl(self): self._displayStatistics() # update and post statistics if we've been configured to do so if self.rrdStats: stat = self._statService.getStatistic("devices") stat.value = len(self._devices) # stat = self._statService.getStatistic("cyclePoints") # stat.value = self._rrd.endCycle() stat = self._statService.getStatistic("dataPoints") stat.value = self.metricWriter().dataPoints # Scheduler statistics stat = self._statService.getStatistic("runningTasks") stat.value = self._scheduler._executor.running stat = self._statService.getStatistic("taskCount") stat.value = self._scheduler.taskCount stat = self._statService.getStatistic("queuedTasks") stat.value = self._scheduler._executor.queued stat = self._statService.getStatistic("missedRuns") stat.value = self._scheduler.missedRuns diff = self.metricWriter( ).dataPoints - self._dataPointsMetric.count self._dataPointsMetric.mark(diff) self._statService.postStatistics(self.rrdStats) def _displayStatistics(self, verbose=False): if self.metricWriter(): self.log.info("%d devices processed (%d datapoints)", len(self._devices), self.metricWriter().dataPoints) else: self.log.info("%d devices processed (0 datapoints)", len(self._devices)) self._scheduler.displayStatistics(verbose) def _signalHandler(self, signum, frame): self._displayStatistics(True) @property def worker_count(self): """ worker_count for this daemon """ return getattr(self.options, 'workers', 1) @property def worker_id(self): """ worker_id for this particular peer """ return getattr(self.options, 'workerid', 0)
def __init__(self, preferences, taskSplitter, configurationListener=DUMMY_LISTENER, initializationCallback=None, stoppingCallback=None): """ Constructs a new instance of the CollectorDaemon framework. Normally only a singleton instance of a CollectorDaemon should exist within a process, but this is not enforced. @param preferences: the collector configuration @type preferences: ICollectorPreferences @param taskSplitter: the task splitter to use for this collector @type taskSplitter: ITaskSplitter @param initializationCallback: a callable that will be executed after connection to the hub but before retrieving configuration information @type initializationCallback: any callable @param stoppingCallback: a callable that will be executed first during the stopping process. Exceptions will be logged but otherwise ignored. @type stoppingCallback: any callable """ # create the configuration first, so we have the collector name # available before activating the rest of the Daemon class hierarchy. if not ICollectorPreferences.providedBy(preferences): raise TypeError("configuration must provide ICollectorPreferences") else: self._prefs = ObservableProxy(preferences) self._prefs.attachAttributeObserver('configCycleInterval', self._rescheduleConfig) if not ITaskSplitter.providedBy(taskSplitter): raise TypeError("taskSplitter must provide ITaskSplitter") else: self._taskSplitter = taskSplitter if not IConfigurationListener.providedBy(configurationListener): raise TypeError( "configurationListener must provide IConfigurationListener") self._configListener = ConfigListenerNotifier() self._configListener.addListener(configurationListener) self._configListener.addListener(DeviceGuidListener(self)) self._initializationCallback = initializationCallback self._stoppingCallback = stoppingCallback # register the various interfaces we provide the rest of the system so # that collector implementors can easily retrieve a reference back here # if needed zope.component.provideUtility(self, ICollector) zope.component.provideUtility(self, IEventService) zope.component.provideUtility(self, IDataService) # setup daemon statistics self._statService = StatisticsService() self._statService.addStatistic("devices", "GAUGE") self._statService.addStatistic("cyclePoints", "GAUGE") self._statService.addStatistic("dataPoints", "DERIVE") self._statService.addStatistic("runningTasks", "GAUGE") self._statService.addStatistic("queuedTasks", "GAUGE") self._statService.addStatistic("missedRuns", "GAUGE") zope.component.provideUtility(self._statService, IStatisticsService) # register the collector's own preferences object so it may be easily # retrieved by factories, tasks, etc. zope.component.provideUtility(self.preferences, ICollectorPreferences, self.preferences.collectorName) super(CollectorDaemon, self).__init__(name=self.preferences.collectorName) self._deviceGuids = {} self._devices = set() self._thresholds = Thresholds() self._unresponsiveDevices = set() self._rrd = None self.reconfigureTimeout = None # keep track of pending tasks if we're doing a single run, and not a # continuous cycle if not self.options.cycle: self._completedTasks = 0 self._pendingTasks = [] frameworkFactory = zope.component.queryUtility( IFrameworkFactory, self._frameworkFactoryName) self._configProxy = frameworkFactory.getConfigurationProxy() self._scheduler = frameworkFactory.getScheduler() self._scheduler.maxTasks = self.options.maxTasks self._ConfigurationLoaderTask = frameworkFactory.getConfigurationLoaderTask( ) # OLD - set the initialServices attribute so that the PBDaemon class # will load all of the remote services we need. self.initialServices = PBDaemon.initialServices +\ [self.preferences.configurationService] # trap SIGUSR2 so that we can display detailed statistics signal.signal(signal.SIGUSR2, self._signalHandler) # let the configuration do any additional startup it might need self.preferences.postStartup() self.addedPostStartupTasks = False
class CollectorDaemon(RRDDaemon): """ The daemon class for the entire ZenCollector framework. This class bridges the gap between the older daemon framework and ZenCollector. New collectors no longer should extend this class to implement a new collector. """ zope.interface.implements(ICollector, IDataService, IEventService) _frameworkFactoryName = "" @property def preferences(self): """ Preferences for this daemon """ return self._prefs def __init__(self, preferences, taskSplitter, configurationListener=DUMMY_LISTENER, initializationCallback=None, stoppingCallback=None): """ Constructs a new instance of the CollectorDaemon framework. Normally only a singleton instance of a CollectorDaemon should exist within a process, but this is not enforced. @param preferences: the collector configuration @type preferences: ICollectorPreferences @param taskSplitter: the task splitter to use for this collector @type taskSplitter: ITaskSplitter @param initializationCallback: a callable that will be executed after connection to the hub but before retrieving configuration information @type initializationCallback: any callable @param stoppingCallback: a callable that will be executed first during the stopping process. Exceptions will be logged but otherwise ignored. @type stoppingCallback: any callable """ # create the configuration first, so we have the collector name # available before activating the rest of the Daemon class hierarchy. if not ICollectorPreferences.providedBy(preferences): raise TypeError("configuration must provide ICollectorPreferences") else: self._prefs = ObservableProxy(preferences) self._prefs.attachAttributeObserver('configCycleInterval', self._rescheduleConfig) if not ITaskSplitter.providedBy(taskSplitter): raise TypeError("taskSplitter must provide ITaskSplitter") else: self._taskSplitter = taskSplitter if not IConfigurationListener.providedBy(configurationListener): raise TypeError( "configurationListener must provide IConfigurationListener") self._configListener = ConfigListenerNotifier() self._configListener.addListener(configurationListener) self._configListener.addListener(DeviceGuidListener(self)) self._initializationCallback = initializationCallback self._stoppingCallback = stoppingCallback # register the various interfaces we provide the rest of the system so # that collector implementors can easily retrieve a reference back here # if needed zope.component.provideUtility(self, ICollector) zope.component.provideUtility(self, IEventService) zope.component.provideUtility(self, IDataService) # setup daemon statistics self._statService = StatisticsService() self._statService.addStatistic("devices", "GAUGE") self._statService.addStatistic("cyclePoints", "GAUGE") self._statService.addStatistic("dataPoints", "DERIVE") self._statService.addStatistic("runningTasks", "GAUGE") self._statService.addStatistic("queuedTasks", "GAUGE") self._statService.addStatistic("missedRuns", "GAUGE") zope.component.provideUtility(self._statService, IStatisticsService) # register the collector's own preferences object so it may be easily # retrieved by factories, tasks, etc. zope.component.provideUtility(self.preferences, ICollectorPreferences, self.preferences.collectorName) super(CollectorDaemon, self).__init__(name=self.preferences.collectorName) self._deviceGuids = {} self._devices = set() self._thresholds = Thresholds() self._unresponsiveDevices = set() self._rrd = None self.reconfigureTimeout = None # keep track of pending tasks if we're doing a single run, and not a # continuous cycle if not self.options.cycle: self._completedTasks = 0 self._pendingTasks = [] frameworkFactory = zope.component.queryUtility( IFrameworkFactory, self._frameworkFactoryName) self._configProxy = frameworkFactory.getConfigurationProxy() self._scheduler = frameworkFactory.getScheduler() self._scheduler.maxTasks = self.options.maxTasks self._ConfigurationLoaderTask = frameworkFactory.getConfigurationLoaderTask( ) # OLD - set the initialServices attribute so that the PBDaemon class # will load all of the remote services we need. self.initialServices = PBDaemon.initialServices +\ [self.preferences.configurationService] # trap SIGUSR2 so that we can display detailed statistics signal.signal(signal.SIGUSR2, self._signalHandler) # let the configuration do any additional startup it might need self.preferences.postStartup() self.addedPostStartupTasks = False def buildOptions(self): """ Method called by CmdBase.__init__ to build all of the possible command-line options for this collector daemon. """ super(CollectorDaemon, self).buildOptions() maxTasks = getattr(self.preferences, 'maxTasks', None) defaultMax = maxTasks if maxTasks else 500 self.parser.add_option( '--maxparallel', dest='maxTasks', type='int', default=defaultMax, help='Max number of tasks to run at once, default %default') self.parser.add_option( '--logTaskStats', dest='logTaskStats', type='int', default=0, help= 'How often to logs statistics of current tasks, value in seconds; very verbose' ) self.parser.add_option( '--redis-url', default='redis://localhost:16379/0', help= 'redis connection string: redis://[hostname]:[port]/[db], default: %default' ) frameworkFactory = zope.component.queryUtility( IFrameworkFactory, self._frameworkFactoryName) if hasattr(frameworkFactory, 'getFrameworkBuildOptions'): # During upgrades we'll be missing this option self._frameworkBuildOptions = frameworkFactory.getFrameworkBuildOptions( ) if self._frameworkBuildOptions: self._frameworkBuildOptions(self.parser) # give the collector configuration a chance to add options, too self.preferences.buildOptions(self.parser) def parseOptions(self): super(CollectorDaemon, self).parseOptions() self.preferences.options = self.options def connected(self): """ Method called by PBDaemon after a connection to ZenHub is established. """ return self._startup() def _getInitializationCallback(self): def doNothing(): pass if self._initializationCallback is not None: return self._initializationCallback else: return doNothing def connectTimeout(self): super(CollectorDaemon, self).connectTimeout() return self._startup() def _startup(self): d = defer.maybeDeferred(self._getInitializationCallback()) d.addCallback(self._startConfigCycle) d.addCallback(self._startMaintenance) d.addErrback(self._errorStop) return d def watchdogCycleTime(self): """ Return our cycle time (in minutes) @return: cycle time @rtype: integer """ return self.preferences.cycleInterval * 2 def getRemoteConfigServiceProxy(self): """ Called to retrieve the remote configuration service proxy object. """ return self.services.get(self.preferences.configurationService, FakeRemote()) def generateEvent(self, event, **kw): eventCopy = super(CollectorDaemon, self).generateEvent(event, **kw) if eventCopy.get("device"): device_id = eventCopy.get("device") guid = self._deviceGuids.get(device_id) if guid: eventCopy['device_guid'] = guid return eventCopy def writeRRD(self, path, value, rrdType, rrdCommand=None, cycleTime=None, min='U', max='U', threshEventData={}, timestamp='N', allowStaleDatapoint=True): now = time.time() hasThresholds = bool(self._thresholds.byFilename.get(path)) if hasThresholds: rrd_write_fn = self._rrd.save else: rrd_write_fn = self._rrd.put # save the raw data directly to the RRD files value = rrd_write_fn( path, value, rrdType, rrdCommand, cycleTime, min, max, timestamp=timestamp, allowStaleDatapoint=allowStaleDatapoint, ) # check for threshold breaches and send events when needed if hasThresholds: if 'eventKey' in threshEventData: eventKeyPrefix = [threshEventData['eventKey']] else: eventKeyPrefix = [path.rsplit('/')[-1]] for ev in self._thresholds.check(path, now, value): parts = eventKeyPrefix[:] if 'eventKey' in ev: parts.append(ev['eventKey']) ev['eventKey'] = '|'.join(parts) # add any additional values for this threshold # (only update if key is not in event, or if # the event's value is blank or None) for key, value in threshEventData.items(): if ev.get(key, None) in ('', None): ev[key] = value self.sendEvent(ev) def readRRD(self, path, consolidationFunction, start, end): return RRDUtil.read(path, consolidationFunction, start, end) def stop(self, ignored=""): if self._stoppingCallback is not None: try: self._stoppingCallback() except Exception: self.log.exception('Exception while stopping daemon') super(CollectorDaemon, self).stop(ignored) def remote_deleteDevice(self, devId): """ Called remotely by ZenHub when a device we're monitoring is deleted. """ # guard against parsing updates during a disconnect if devId is None: return self._deleteDevice(devId) def remote_deleteDevices(self, deviceIds): """ Called remotely by ZenHub when devices we're monitoring are deleted. """ # guard against parsing updates during a disconnect if deviceIds is None: return for devId in Zipper.load(deviceIds): self._deleteDevice(devId) def remote_updateDeviceConfig(self, config): """ Called remotely by ZenHub when asynchronous configuration updates occur. """ # guard against parsing updates during a disconnect if config is None: return self.log.debug("Device %s updated", config.configId) if not self.options.device or self.options.device in (config.id, config.configId): self._updateConfig(config) self._configProxy.updateConfigProxy(self.preferences, config) def remote_updateDeviceConfigs(self, configs): """ Called remotely by ZenHub when asynchronous configuration updates occur. """ if configs is None: return for config in Zipper.load(configs): self.remote_updateDeviceConfig(config) def remote_notifyConfigChanged(self): """ Called from zenhub to notify that the entire config should be updated """ if self.reconfigureTimeout and self.reconfigureTimeout.active(): # We will run along with the already scheduled task self.log.debug("notifyConfigChanged - using existing call") return self.log.debug("notifyConfigChanged - scheduling call in 30 seconds") self.reconfigureTimeout = reactor.callLater(30, self._rebuildConfig) def _rebuildConfig(self): """ Delete and re-add the configuration tasks to completely re-build the configuration. """ if self.reconfigureTimeout and not self.reconfigureTimeout.active(): self.reconfigureTimeout = None self._scheduler.removeTasksForConfig(CONFIG_LOADER_NAME) self._startConfigCycle() def _rescheduleConfig(self, observable, attrName, oldValue, newValue, **kwargs): """ Delete and re-add the configuration tasks to start on new interval. """ if oldValue != newValue: self.log.debug( "Changing config task interval from %s to %s minutes" % (oldValue, newValue)) self._scheduler.removeTasksForConfig(CONFIG_LOADER_NAME) #values are in minutes, scheduler takes seconds self._startConfigCycle(startDelay=newValue * 60) def _taskCompleteCallback(self, taskName): # if we're not running a normal daemon cycle then we need to shutdown # once all of our pending tasks have completed if not self.options.cycle: try: self._pendingTasks.remove(taskName) except ValueError: pass self._completedTasks += 1 # if all pending tasks have been completed then shutdown the daemon if len(self._pendingTasks) == 0: self._displayStatistics() self.stop() def _updateConfig(self, cfg): configId = cfg.configId self.log.debug("Processing configuration for %s", configId) nextExpectedRuns = {} if configId in self._devices: tasksToRemove = self._scheduler.getTasksForConfig(configId) nextExpectedRuns = { taskToRemove.name: self._scheduler.getNextExpectedRun(taskToRemove.name) for taskToRemove in tasksToRemove } self._scheduler.removeTasks(task.name for task in tasksToRemove) self._configListener.updated(cfg) else: self._devices.add(configId) self._configListener.added(cfg) newTasks = self._taskSplitter.splitConfiguration([cfg]) self.log.debug("Tasks for config %s: %s", configId, newTasks) nowTime = time.time() for (taskName, task_) in newTasks.iteritems(): #if not cycling run the task immediately otherwise let the scheduler #decide when to run the task now = not self.options.cycle nextExpectedRun = nextExpectedRuns.get(taskName, None) if nextExpectedRun: startDelay = nextExpectedRun - nowTime if startDelay <= 0: # handle edge case where we are about to run # so run immediately now = True task_.startDelay = 0 else: task_.startDelay = startDelay self._scheduler.addTask(task_, self._taskCompleteCallback, now) # TODO: another hack? if hasattr(cfg, 'thresholds'): self._thresholds.updateForDevice(configId, cfg.thresholds) # if we're not running a normal daemon cycle then keep track of the # tasks we just added for this device so that we can shutdown once # all pending tasks have completed if not self.options.cycle: self._pendingTasks.append(taskName) @defer.inlineCallbacks def _updateDeviceConfigs(self, updatedConfigs, purgeOmitted): """ Update the device configurations for the devices managed by this collector. @param deviceConfigs a list of device configurations @type deviceConfigs list of name,value tuples """ self.log.debug("updateDeviceConfigs: updatedConfigs=%s", (map(str, updatedConfigs))) for cfg in updatedConfigs: self._updateConfig(cfg) # yield time to reactor so other things can happen yield task.deferLater(reactor, 0, lambda: None) if purgeOmitted: self._purgeOmittedDevices(cfg.configId for cfg in updatedConfigs) def _purgeOmittedDevices(self, updatedDevices): """ Delete all current devices that are omitted from the list of devices being updated. @param updatedDevices a collection of device ids @type updatedDevices a sequence of strings """ # remove tasks for the deleted devices deletedDevices = set(self._devices) - set(updatedDevices) self.log.debug("purgeOmittedDevices: deletedConfigs=%s", ','.join(deletedDevices)) for configId in deletedDevices: self._deleteDevice(configId) def _deleteDevice(self, deviceId): self.log.debug("Device %s deleted" % deviceId) self._devices.discard(deviceId) self._configListener.deleted(deviceId) self._configProxy.deleteConfigProxy(self.preferences, deviceId) self._scheduler.removeTasksForConfig(deviceId) def _errorStop(self, result): """ Twisted callback to receive fatal messages. @param result: the Twisted failure @type result: failure object """ if isinstance(result, Failure): msg = result.getErrorMessage() else: msg = str(result) self.log.critical("Unrecoverable Error: %s", msg) self.stop() def _startConfigCycle(self, result=None, startDelay=0): configLoader = self._ConfigurationLoaderTask( CONFIG_LOADER_NAME, taskConfig=self.preferences) configLoader.startDelay = startDelay # Don't add the config loader task if the scheduler already has # an instance of it. if configLoader not in self._scheduler: # Run initial maintenance cycle as soon as possible # TODO: should we not run maintenance if running in non-cycle mode? self._scheduler.addTask(configLoader) else: self.log.info("%s already added to scheduler", configLoader.name) return defer.succeed("Configuration loader task started") def setPropertyItems(self, items): """ Override so that preferences are updated """ super(CollectorDaemon, self).setPropertyItems(items) self._setCollectorPreferences(dict(items)) def _setCollectorPreferences(self, preferenceItems): for name, value in preferenceItems.iteritems(): if not hasattr(self.preferences, name): # TODO: make a super-low level debug mode? The following message isn't helpful #self.log.debug("Preferences object does not have attribute %s", # name) setattr(self.preferences, name, value) elif getattr(self.preferences, name) != value: self.log.debug("Updated %s preference to %s", name, value) setattr(self.preferences, name, value) def _loadThresholdClasses(self, thresholdClasses): self.log.debug("Loading classes %s", thresholdClasses) for c in thresholdClasses: try: importClass(c) except ImportError: log.exception("Unable to import class %s", c) def _configureRRD(self, rrdCreateCommand, thresholds): self._rrd = RRDUtil.RRDUtil(rrdCreateCommand, self.preferences.cycleInterval) self.rrdStats.config(self.options.monitor, self.name, thresholds, rrdCreateCommand) def _isRRDConfigured(self): return (self.rrdStats and self._rrd) def _startMaintenance(self, ignored=None): unused(ignored) if not self.options.cycle: self._maintenanceCycle() return if self.options.logTaskStats > 0: log.debug("Starting Task Stat logging") loop = task.LoopingCall(self._displayStatistics, verbose=True) loop.start(self.options.logTaskStats, now=False) interval = self.preferences.cycleInterval self.log.debug("Initializing maintenance Cycle") maintenanceCycle = MaintenanceCycle(interval, self, self._maintenanceCycle) maintenanceCycle.start() def _maintenanceCycle(self, ignored=None): """ Perform daemon maintenance processing on a periodic schedule. Initially called after the daemon configuration loader task is added, but afterward will self-schedule each run. """ self.log.debug("Performing periodic maintenance") def _processDeviceIssues(result): self.log.debug("deviceIssues=%r", result) if result is None: return result # exception or some other problem # Device ping issues returns as a tuple of (deviceId, count, total) # and we just want the device id newUnresponsiveDevices = set(i[0] for i in result) clearedDevices = self._unresponsiveDevices.difference( newUnresponsiveDevices) for devId in clearedDevices: self.log.debug("Resuming tasks for device %s", devId) self._scheduler.resumeTasksForConfig(devId) self._unresponsiveDevices = newUnresponsiveDevices for devId in self._unresponsiveDevices: self.log.debug("Pausing tasks for device %s", devId) self._scheduler.pauseTasksForConfig(devId) return result def _getDeviceIssues(result): # TODO: handle different types of device issues, such as WMI issues d = self.getDevicePingIssues() return d def _postStatistics(): self._displayStatistics() # update and post statistics if we've been configured to do so if self._isRRDConfigured(): stat = self._statService.getStatistic("devices") stat.value = len(self._devices) stat = self._statService.getStatistic("cyclePoints") stat.value = self._rrd.endCycle() stat = self._statService.getStatistic("dataPoints") stat.value = self._rrd.dataPoints # Scheduler statistics stat = self._statService.getStatistic("runningTasks") stat.value = self._scheduler._executor.running stat = self._statService.getStatistic("queuedTasks") stat.value = self._scheduler._executor.queued stat = self._statService.getStatistic("missedRuns") stat.value = self._scheduler.missedRuns events = self._statService.postStatistics( self.rrdStats, self.preferences.cycleInterval) self.sendEvents(events) def _maintenance(): if self.options.cycle: d = defer.maybeDeferred(_postStatistics) if getattr(self.preferences, 'pauseUnreachableDevices', True): d.addCallback(_getDeviceIssues) d.addCallback(_processDeviceIssues) else: d = defer.succeed("No maintenance required") return d d = _maintenance() return d def runPostConfigTasks(self, result=None): """ Add post-startup tasks from the preferences. This may be called with the failure code as well. """ if isinstance(result, Failure): pass elif not self.addedPostStartupTasks: postStartupTasks = getattr(self.preferences, 'postStartupTasks', lambda: []) for task in postStartupTasks(): self._scheduler.addTask(task, now=True) self.addedPostStartupTasks = True def _displayStatistics(self, verbose=False): if self._rrd: self.log.info("%d devices processed (%d datapoints)", len(self._devices), self._rrd.dataPoints) else: self.log.info("%d devices processed (0 datapoints)", len(self._devices)) self._scheduler.displayStatistics(verbose) def _signalHandler(self, signum, frame): self._displayStatistics(True)