def runServer():
  # Get the current list of custom metrics
  appConfig = Config("application.conf", os.environ["APPLICATION_CONFIG_PATH"])

  engine = repository.engineFactory(appConfig)
  global gCustomMetrics
  now = datetime.datetime.utcnow()

  with engine.connect() as conn:
    gCustomMetrics = dict(
      (m.name, [m, now]) for m in repository.getCustomMetrics(conn))

  queueName = appConfig.get("metric_listener", "queue_name")

  global gProfiling
  gProfiling = (appConfig.getboolean("debugging", "profiling") or
                LOGGER.isEnabledFor(logging.DEBUG))
  del appConfig

  metricStreamer = MetricStreamer()
  modelSwapper = ModelSwapperInterface()

  with MessageBusConnector() as bus:
    if not bus.isMessageQueuePresent(queueName):
      bus.createMessageQueue(mqName=queueName, durable=True)
    LOGGER.info("Waiting for messages. To exit, press CTRL+C")

    with bus.consume(queueName) as consumer:
      messages = []
      messageRxTimes = []
      while True:
        message = consumer.pollOneMessage()
        if message is not None:
          messages.append(message)
          if gProfiling:
            messageRxTimes.append(time.time())

        if message is None or len(messages) >= MAX_MESSAGES_PER_BATCH:
          if messages:
            # Process the batch
            try:
              _handleBatch(engine,
                           messages,
                           messageRxTimes,
                           metricStreamer,
                           modelSwapper)
            except Exception:  # pylint: disable=W0703
              LOGGER.exception("Unknown failure in processing messages.")
              # Make sure that we ack messages when there is an unexpected
              # error to avoid getting hung forever on one bad record.

            # Ack all the messages
            messages[-1].ack(multiple=True)
            # Clear the message buffer
            messages = []
            messageRxTimes = []
          else:
            # Queue is empty, wait before retrying
            time.sleep(POLL_DELAY_SEC)
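# ---------------------------------------------------------------------------
# Illustrative sketch (not part of this module): the poll/batch/ack pattern
# used by runServer() above, shown against a plain in-memory queue so it can
# run standalone. FakeConsumer and the EXAMPLE_* constants are hypothetical
# stand-ins, not the MessageBusConnector API.
# ---------------------------------------------------------------------------
import collections
import time

EXAMPLE_BATCH_SIZE = 4          # stand-in for MAX_MESSAGES_PER_BATCH
EXAMPLE_POLL_DELAY_SEC = 0.05   # stand-in for POLL_DELAY_SEC


class FakeConsumer(object):
  """ In-memory stand-in for a message-bus consumer (illustrative only) """

  def __init__(self, payloads):
    self._pending = collections.deque(payloads)
    self.numAcked = 0

  def pollOneMessage(self):
    # Return the next message, or None if the queue is currently empty
    return self._pending.popleft() if self._pending else None

  def ackBatch(self, numMessages):
    # Stand-in for messages[-1].ack(multiple=True): ack the whole batch
    self.numAcked += numMessages


def drainQueueOnce(consumer, handleBatch):
  """ Accumulate up to EXAMPLE_BATCH_SIZE messages, process them as a batch,
  and ack the batch even if processing failed, so one bad record can't wedge
  the consumer. Returns once the queue is empty.
  """
  messages = []
  while True:
    message = consumer.pollOneMessage()
    if message is not None:
      messages.append(message)

    if message is None or len(messages) >= EXAMPLE_BATCH_SIZE:
      if messages:
        try:
          handleBatch(messages)
        except Exception:  # pylint: disable=W0703
          pass  # ack anyway; see the NOTE in runServer() about bad records
        consumer.ackBatch(len(messages))
        messages = []
      else:
        # Queue is empty; a real consumer would sleep and poll again
        time.sleep(EXAMPLE_POLL_DELAY_SEC)
        return


processedBatches = []
exampleConsumer = FakeConsumer(["metric-%d" % i for i in range(10)])
drainQueueOnce(exampleConsumer, handleBatch=processedBatches.append)
assert exampleConsumer.numAcked == 10
assert len(processedBatches) == 3  # batches of 4, 4, and 2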
class AggregatorService(object):

  _NOTHING_READY_SLEEP_TIME_SEC = 0.5

  def __init__(self):
    # NOTE: the EC2InstanceMetricGetter instance and its process pool must be
    # created BEFORE this main (parent) process creates any global or
    # class-level shared resources that are also used by the pool workers
    # (e.g., boto connections) that would have undesirable consequences when
    # replicated into and used by forked child processes (e.g., the same
    # connection socket file descriptor used by multiple processes).
    self._metricGetter = EC2InstanceMetricGetter()

    self._log = _getLogger()

    self.metricStreamer = MetricStreamer()


  def close(self):
    self._metricGetter.close()


  def run(self):
    with ModelSwapperInterface() as modelSwapper:
      engine = repository.engineFactory()
      while True:
        with engine.connect() as conn:
          pendingStacks = repository.retryOnTransientErrors(
            repository.getAutostackMetricsPendingDataCollection)(conn)

        if not pendingStacks:
          time.sleep(self._NOTHING_READY_SLEEP_TIME_SEC)
          continue

        # Build a sequence of autostack metric requests
        requests = []
        for autostack, metrics in pendingStacks:
          refBase = len(requests)
          requests.extend(
            AutostackMetricRequest(refID=refBase + i,
                                   autostack=autostack,
                                   metric=metric)
            for i, metric in enumerate(metrics))

        # Collect, aggregate, and stream metric data
        self._processAutostackMetricRequests(engine, requests, modelSwapper)


  def _processAutostackMetricRequests(self, engine, requests, modelSwapper):
    """ Execute autostack metric requests, aggregate and stream collected
    metric data

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine
    :param requests: sequence of AutostackMetricRequest objects
    :param modelSwapper: Model Swapper
    """
    # Start collecting requested metric data
    collectionIter = self._metricGetter.collectMetricData(requests)

    # Aggregate each collection and dispatch to app MetricStreamer
    for metricCollection in collectionIter:
      request = requests[metricCollection.refID]

      metricObj = request.metric

      data = None
      if metricCollection.slices:
        aggregationFn = getAggregationFn(metricObj)
        if aggregationFn:
          data = aggregate(metricCollection.slices,
                           aggregationFn=aggregationFn)
        else:
          data = aggregate(metricCollection.slices)

      try:
        with engine.connect() as conn:
          repository.retryOnTransientErrors(repository.setMetricLastTimestamp)(
            conn, metricObj.uid, metricCollection.nextMetricTime)
      except ObjectNotFoundError:
        self._log.warning("Processing autostack data collection results for "
                          "unknown model=%s (model deleted?)", metricObj.uid)
        continue

      if data:
        try:
          self.metricStreamer.streamMetricData(data,
                                               metricID=metricObj.uid,
                                               modelSwapper=modelSwapper)
        except ObjectNotFoundError:
          # We expect that the model exists but in the odd case that it has
          # already been deleted we don't want to crash the process.
          self._log.info("Metric not found when adding data. metric=%s"
                         % metricObj.uid)

        self._log.debug(
          "{TAG:APP.AGG.DATA.PUB} Published numItems=%d for metric=%s;"
          "timeRange=[%sZ-%sZ]; headTS=%sZ; tailTS=%sZ",
          len(data), getMetricLogPrefix(metricObj),
          metricCollection.timeRange.start.isoformat(),
          metricCollection.timeRange.end.isoformat(),
          data[0][0].isoformat(), data[-1][0].isoformat())
      else:
        self._log.info(
          "{TAG:APP.AGG.DATA.NONE} No data for metric=%s;"
          "timeRange=[%sZ-%sZ]", getMetricLogPrefix(metricObj),
          metricCollection.timeRange.start.isoformat(),
          metricCollection.timeRange.end.isoformat())
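# ---------------------------------------------------------------------------
# Illustrative sketch (not part of this module): how the refID bookkeeping in
# AggregatorService.run() lets a collection result be matched back to its
# originating request. ExampleRequest/ExampleCollection are hypothetical
# stand-ins for AutostackMetricRequest and the metric getter's collection
# objects; only the refID indexing is meant to mirror the service above.
# ---------------------------------------------------------------------------
from collections import namedtuple

ExampleRequest = namedtuple("ExampleRequest", "refID autostack metric")
ExampleCollection = namedtuple("ExampleCollection", "refID slices")


def buildExampleRequests(pendingStacks):
  """ Flatten (autostack, metrics) pairs into one list; each request's refID
  equals its index in the flattened list.
  """
  requests = []
  for autostack, metrics in pendingStacks:
    refBase = len(requests)
    requests.extend(
      ExampleRequest(refID=refBase + i, autostack=autostack, metric=metric)
      for i, metric in enumerate(metrics))
  return requests


examplePending = [("stack-a", ["cpu", "network"]), ("stack-b", ["disk"])]
exampleRequests = buildExampleRequests(examplePending)

# A collection result carries only refID, which indexes straight back into
# the flat request list: requests[collection.refID] is the matching request.
exampleCollection = ExampleCollection(refID=2, slices=[1.0, 2.0])
assert exampleRequests[exampleCollection.refID].autostack == "stack-b"
assert exampleRequests[exampleCollection.refID].metric == "disk"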
class MetricCollector(object):
  """
  This process is responsible for collecting data from all the metrics at a
  specified time interval
  """

  # Number of concurrent worker processes used for querying metrics
  _WORKER_PROCESS_POOL_SIZE = 10

  # Amount of time to sleep when there are no metrics pending data collection
  _NO_PENDING_METRICS_SLEEP_SEC = 10

  # Metric's period times this number is the duration of the metric's
  # quarantine time after a metric errors out or returns empty data
  _METRIC_QUARANTINE_DURATION_RATIO = 0.5

  # The approximate time between status queries of a metric's resource to
  # avoid too many of the costly calls (e.g., Cloudwatch), especially during a
  # metric's catch-up phase
  _RESOURCE_STATUS_UPDATE_INTERVAL_SEC = 2.5 * 60

  # A self._metricInfoCache item may be garbage-collected when this much time
  # passes since last access to the item
  _METRIC_INFO_CACHE_ITEM_EXPIRATION_SEC = (60 * 60)

  _SENTINEL = None

  def __init__(self):
    self._log = grok_logging.getExtendedLogger(self.__class__.__name__)

    self._profiling = (
      grok.app.config.getboolean("debugging", "profiling") or
      self._log.isEnabledFor(logging.DEBUG))

    self._pollInterval = grok.app.config.getfloat(
      "metric_collector", "poll_interval")

    self._metricErrorGracePeriod = grok.app.config.getfloat(
      "metric_collector", "metric_error_grace_period")

    # Interval for periodic garbage collection of our caches (e.g.,
    # self._metricInfoCache and self._resourceInfoCache)
    self._cacheGarbageCollectionIntervalSec = self._metricErrorGracePeriod * 2

    # Time (unix epoch) when to run the next garbage collection of
    # self._metricInfoCache and self._resourceInfoCache; 0 triggers it ASAP as
    # a quick test of the logic. See self._cacheGarbageCollectionIntervalSec.
    self._nextCacheGarbageCollectionTime = 0

    # We use this to cache info about metrics that helps us avoid unnecessary
    # queries to the datasource. The keys are metric uid's and corresponding
    # values are _MetricInfoCacheItem objects.
    self._metricInfoCache = defaultdict(_MetricInfoCacheItem)

    # We use this to cache info about resources referenced by metrics to help
    # us avoid unnecessary resource-status queries to the datasource. The keys
    # are resource canonical identifiers (from Metric.server) and
    # corresponding values are _ResourceInfoCacheItem objects.
    self._resourceInfoCache = defaultdict(_ResourceInfoCacheItem)

    self.metricStreamer = MetricStreamer()


  def _collectDataForMetrics(self, metricsToUpdate, processPool,
                             resultsQueue):
    """ Collect data for the given metrics

    :param metricsToUpdate: a dict of Metric instances which are due for an
      update
    :param processPool: Process pool in which collection tasks are mapped.
    :type processPool: multiprocessing.Pool
    :param resultsQueue: Results Queue onto which collection results are
      published
    :type resultsQueue: multiprocessing.JoinableQueue

    :returns: Result of calling `processPool.map_async()`. Call .wait() on the
      return value to block until all map tasks have been completed.
      Meanwhile, you may consume resultsQueue for the results as they become
      available.
    :rtype: multiprocessing.pool.AsyncResult
    """
    # Create task parameters for concurrent processing
    now = time.time()
    tasks = []
    for metricObj in metricsToUpdate.values():
      metricSpec = utils.jsonDecode(metricObj.parameters)["metricSpec"]

      resourceCacheItem = self._resourceInfoCache[metricObj.server]
      if now >= resourceCacheItem.nextResourceStatusUpdateTime:
        updateResourceStatus = True
        resourceCacheItem.nextResourceStatusUpdateTime = (
          now + self._RESOURCE_STATUS_UPDATE_INTERVAL_SEC)
      else:
        updateResourceStatus = False

      task = _DataCollectionTask(
        metricID=metricObj.uid,
        datasource=metricObj.datasource,
        metricSpec=metricSpec,
        rangeStart=metricObj.last_timestamp,
        metricPeriod=metricObj.poll_interval,
        updateResourceStatus=updateResourceStatus,
        resultsQueue=resultsQueue)

      tasks.append(task)

    # Process the tasks concurrently, return immediately
    return processPool.map_async(_collect, tasks)


  def _garbageCollectInfoCache(self):
    """ Remove stale items from self._metricInfoCache and clear
    self._resourceInfoCache

    TODO: unit-test
    """
    now = time.time()
    self._nextCacheGarbageCollectionTime = (
      now + self._cacheGarbageCollectionIntervalSec)

    # Clear resource info cache
    self._resourceInfoCache.clear()
    self._log.info("Garbage-collected resource info cache")

    # Garbage-collect stale metric info cache items
    staleProperties = tuple(
      k for k, v in self._metricInfoCache.iteritems()
      if now > (v.lastAccessTime +
                self._METRIC_INFO_CACHE_ITEM_EXPIRATION_SEC))

    for k in staleProperties:
      self._metricInfoCache.pop(k)

    self._log.info("Garbage-collected stale=%d metric cache info items",
                   len(staleProperties))


  def _getCandidateMetrics(self, engine):
    """ Return the metrics that are due for an update

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine

    :returns: a (possibly empty) sequence of Metric instances that are due for
      an update (after removing quarantined indexes)

    TODO: unit-test
    """
    with engine.connect() as conn:
      metricsToUpdate = repository.retryOnTransientErrors(
        repository.getCloudwatchMetricsPendingDataCollection)(conn)

    # Remove quarantined metrics
    quarantinedIndexes = None
    if metricsToUpdate:
      now = time.time()
      quarantinedIndexes = set(
        i for i, m in enumerate(metricsToUpdate)
        if now < self._metricInfoCache[m.uid].quarantineEndTime)

      metricsToUpdate = OrderedDict(
        (m.uid, m) for i, m in enumerate(metricsToUpdate)
        if i not in quarantinedIndexes)

      if not metricsToUpdate:
        # TODO: unit-test
        self._log.debug("All candidate numMetrics=%d are quarantined",
                        len(quarantinedIndexes))

    return metricsToUpdate


  def _handleMetricCollectionError(self, engine, metricObj, startTime, error):
    """ Update the metric's collector_error member and promote the metric to
    ERROR state if needed

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine
    :param metricObj: Metric instance
    :param startTime: metric collection start time; unix epoch timestamp
    :param error: exception object that is the result of metric data
      collection or (possibly pending) resource status check
    """
    # Quarantine the metric temporarily
    self._metricInfoCache[metricObj.uid].quarantineEndTime = (
      startTime +
      metricObj.poll_interval * self._METRIC_QUARANTINE_DURATION_RATIO)

    try:
      if metricObj.collector_error is None:
        # TODO: unit-test
        # Begin error grace period for this metric
        deadline = time.time() + self._metricErrorGracePeriod
        with engine.connect() as conn:
          repository.retryOnTransientErrors(
            repository.setMetricCollectorError)(
              conn,
              metricObj.uid,
              utils.jsonEncode(dict(deadline=deadline,
                                    message=repr(error))))
        self._log.error(
          "Started error grace period on metric=<%r> through %sZ due to "
          "error=%r",
          metricObj, datetime.utcfromtimestamp(deadline).isoformat(), error)
      elif (time.time() >
            utils.jsonDecode(metricObj.collector_error)["deadline"]):
        # TODO: unit-test
        # Error grace period expired: promote the metric to ERROR state
        with engine.connect() as conn:
          repository.retryOnTransientErrors(repository.setMetricStatus)(
            conn, metricObj.uid, MetricStatus.ERROR, repr(error))
        self._log.error(
          "Metric Collector: grace period expired; placed metric=<%r> in "
          "ERROR state due to error=%r", metricObj, error)
    except app_exceptions.ObjectNotFoundError:
      # TODO: unit-test
      self._log.warning("Metric deleted? metric=%r", metricObj, exc_info=True)


  def _processCollectedData(self, engine, metricsToUpdate, modelSwapper,
                            collectResult):
    """ Process collected metrics data; publish it on the destination message
    queue, and update metric instance status and metric collection state.

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine
    :param metricsToUpdate: a sequence of Metric instances which were due for
      an update
    :type metricsToUpdate: collections.OrderedDict
    :param modelSwapper: ModelSwapperInterface object for running models
    :param collectResult: _DataCollectionResult object

    :returns: 2-tuple (numEmpty, numErrors), where numEmpty is the count of
      metrics with empty data (not ready or data gap) and numErrors is the
      count of metrics with errors
    """
    numEmpty = numErrors = 0

    metricObj = metricsToUpdate[collectResult.metricID]

    # Update cached resource status
    resourceInfoCacheItem = self._resourceInfoCache[metricObj.server]
    newResourceStatus = None
    if collectResult.resourceStatus is not None:
      if isinstance(collectResult.resourceStatus, Exception):
        resourceInfoCacheItem.error = collectResult.resourceStatus
      else:
        newResourceStatus = collectResult.resourceStatus
        resourceInfoCacheItem.error = None

    if isinstance(collectResult.data, Exception):
      dataError = collectResult.data
      newData = None
      nextStartTime = None
    else:
      dataError = None
      newData = collectResult.data
      nextStartTime = collectResult.nextCallStart

    if newData is not None or newResourceStatus is not None:
      rangeStart = metricObj.last_timestamp

      if not newData:
        # Data is not ready yet or data gap
        numEmpty += 1
        self._metricInfoCache[metricObj.uid].quarantineEndTime = (
          collectResult.creationTime +
          metricObj.poll_interval * self._METRIC_QUARANTINE_DURATION_RATIO)
      else:
        headTS = newData[0][0]
        tailTS = newData[-1][0]

      startTime = time.time()

      # Update resource status and next metric start time
      @repository.retryOnTransientErrors
      def runSQL(engine):
        with engine.begin() as conn:
          try:
            # Save resource status
            if newResourceStatus is not None:
              repository.saveMetricInstanceStatus(conn,
                                                  metricObj.server,
                                                  newResourceStatus)

            # Save starting time for the next data query in metric row
            if nextStartTime is not None:
              repository.setMetricLastTimestamp(conn,
                                                metricObj.uid,
                                                nextStartTime)
          except app_exceptions.ObjectNotFoundError:
            self._log.warning("Metric deleted?", exc_info=True)

      if newResourceStatus is not None or nextStartTime is not None:
        runSQL(engine)

      if newData:
        # This is done outside the cursor context to minimize lock duration.
        #
        # NOTE: It is still possible for duplicate data to be sent to the
        # exchange so the recipient will need to de-dupe it. It is also
        # possible for the metric to be updated in the database but data not
        # sent to the exchange.
        try:
          self.metricStreamer.streamMetricData(newData,
                                               metricID=metricObj.uid,
                                               modelSwapper=modelSwapper)
        except ValueError:
          self._log.exception("Failed to stream metric data %r" % newData)
        except app_exceptions.ObjectNotFoundError:
          # We expect that the model exists but in the odd case that it has
          # already been deleted we don't want to crash the process.
          self._log.info("Metric not found when adding data.")

        if self._profiling:
          self._log.info(
            "{TAG:APP.MC.DATA.TO_STRM.DONE} numItems=%d for metric=%r; "
            "rangeStart=%s; ts=[%s]; duration=%.4fs; ",
            len(newData), metricObj,
            rangeStart.isoformat() + "Z" if rangeStart is not None else None,
            (("%sZ..%sZ" % (headTS.isoformat(), tailTS.isoformat()))
             if len(newData) > 1
             else (headTS.isoformat() + "Z")),
            time.time() - startTime + collectResult.duration)
      else:
        if self._log.isEnabledFor(logging.DEBUG):
          self._log.debug(
            "{TAG:APP.MC.DATA.NONE} No data for metric=%r; rangeStart=%s; "
            "duration=%.4fs",
            metricObj,
            rangeStart.isoformat() + "Z" if rangeStart is not None else None,
            time.time() - startTime + collectResult.duration)

    # Take care of errors
    error = dataError or resourceInfoCacheItem.error
    if error is not None:
      # Metric data or resource status collection error
      numErrors += 1
      # Update metric error grace period and quarantine info
      self._handleMetricCollectionError(engine,
                                        metricObj,
                                        startTime=collectResult.creationTime,
                                        error=error)
    else:
      if metricObj.collector_error is not None:
        oldErrorInfo = metricObj.collector_error

        # There was a pending collector error before, but no error this time,
        # so clear the metric's collector_error
        try:
          with engine.connect() as conn:
            repository.retryOnTransientErrors(
              repository.setMetricCollectorError)(conn, metricObj.uid, None)
        except app_exceptions.ObjectNotFoundError:
          self._log.warning("Metric deleted?", exc_info=True)

        self._log.info("metric=<%r> exited error grace state %s",
                       metricObj, oldErrorInfo)

    return (numEmpty, numErrors)


  @abortProgramOnAnyException(
    _EXIT_CODE_ON_UNHANDLED_EXCEPTION_IN_THREAD, logger=_getLogger())
  @logExceptions(_getLogger)
  def _processAndDispatchThreadTarget(self, engine, metricsToUpdate,
                                      resultsQueue, modelSwapper, statsPipe):
    """ Process collected metrics data; publish it on the destination message
    queue, and update metric instance status and metric collection state.

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine
    :param metricsToUpdate: a sequence of Metric instances which were due for
      an update
    :type metricsToUpdate: collections.OrderedDict
    :param resultsQueue: Queue from which a sequence of _DataCollectionResult
      instances are processed
    :type resultsQueue: multiprocessing.JoinableQueue
    :param modelSwapper: ModelSwapperInterface object for running models
    :param statsPipe: Pipe in which the final statistics are conveyed in the
      form of a 2-tuple (numEmpty, numErrors), where numEmpty is the count of
      metrics with empty data (not ready or data gap) and numErrors is the
      count of metrics with errors
    :type statsPipe: multiprocessing.Pipe (non-duplex)

    :returns: None
    """
    numEmpty = numErrors = 0

    while True:
      try:
        collectResult = resultsQueue.get(True)
        if collectResult is self._SENTINEL:
          # There are no more results to process
          break

        stats = self._processCollectedData(engine,
                                           metricsToUpdate,
                                           modelSwapper,
                                           collectResult)
        numEmpty += stats[0]
        numErrors += stats[1]
      finally:
        resultsQueue.task_done()

    statsPipe.send((numEmpty, numErrors))


  def run(self):
    """ Collect metric data and status for active metrics """
    # NOTE: the process pool must be created BEFORE this main (parent) process
    # creates any global or class-level shared resources (e.g., boto
    # connection) that would have undesirable consequences when replicated
    # into and used by forked child processes (e.g., the same MySQL connection
    # socket file descriptor used by multiple processes). And we can't take
    # advantage of the process Pool's maxtasksperchild feature either (for the
    # same reason)
    self._log.info("Starting grok Metric Collector")
    resultsQueue = multiprocessing.Manager().JoinableQueue()

    recvPipe, sendPipe = multiprocessing.Pipe(False)

    processPool = multiprocessing.Pool(
      processes=self._WORKER_PROCESS_POOL_SIZE,
      maxtasksperchild=None)

    try:
      with ModelSwapperInterface() as modelSwapper:
        engine = repository.engineFactory()
        while True:
          startTime = time.time()

          if startTime > self._nextCacheGarbageCollectionTime:
            # TODO: unit-test
            self._garbageCollectInfoCache()

          # Determine which metrics are due for an update
          metricsToUpdate = self._getCandidateMetrics(engine)

          filterDuration = time.time() - startTime

          if not metricsToUpdate:
            time.sleep(self._NO_PENDING_METRICS_SLEEP_SEC)
            continue

          # Collect metric data
          collectionStartTime = time.time()

          poolResults = self._collectDataForMetrics(metricsToUpdate,
                                                    processPool,
                                                    resultsQueue)

          # Process/dispatch results in parallel in another thread as results
          # become available in resultsQueue
          dispatchThread = (
            threading.Thread(target=self._processAndDispatchThreadTarget,
                             args=(engine,
                                   metricsToUpdate,
                                   resultsQueue,
                                   modelSwapper,
                                   sendPipe)))
          dispatchStartTime = time.time()
          dispatchThread.start()

          # Synchronize with processPool
          poolResults.wait()  # Wait for collection tasks to complete

          metricPollDuration = time.time() - collectionStartTime

          resultsQueue.join()  # Block until all tasks completed...

          # Synchronize with dispatchThread
          resultsQueue.put(self._SENTINEL)  # Signal to dispatchThread that
                                            # there are no more results to
                                            # process.
          resultsQueue.join()

          numEmpty, numErrors = recvPipe.recv()  # Get dispatchThread stats

          dispatchDuration = time.time() - dispatchStartTime

          self._log.info(
            "Processed numMetrics=%d; numEmpty=%d; numErrors=%d; "
            "duration=%.4fs (filter=%.4fs; query=%.4fs; dispatch=%.4fs)",
            len(metricsToUpdate), numEmpty, numErrors,
            time.time() - startTime, filterDuration, metricPollDuration,
            dispatchDuration)
    finally:
      self._log.info("Exiting Metric Collector run-loop")
      processPool.terminate()
      processPool.join()
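# ---------------------------------------------------------------------------
# Illustrative sketch (not part of this module): the coordination pattern in
# MetricCollector.run(), boiled down to its standard-library pieces. Pool
# workers publish results onto a managed JoinableQueue, a dispatcher thread
# drains it, a sentinel tells the thread to stop, and a one-way Pipe carries
# the tally back to the main thread. All names below are hypothetical.
# ---------------------------------------------------------------------------
import multiprocessing
import threading

EXAMPLE_SENTINEL = None


def exampleCollect(task):
  """ Worker: "collect" one value and publish the result on the shared queue
  """
  value, resultsQueue = task
  resultsQueue.put(value * value)


def exampleDispatch(resultsQueue, statsPipe):
  """ Dispatcher thread: drain results until the sentinel arrives, then send
  the final count back through the pipe
  """
  numResults = 0
  while True:
    try:
      result = resultsQueue.get(True)
      if result is EXAMPLE_SENTINEL:
        break
      numResults += 1
    finally:
      resultsQueue.task_done()

  statsPipe.send(numResults)


def exampleRun():
  resultsQueue = multiprocessing.Manager().JoinableQueue()
  recvPipe, sendPipe = multiprocessing.Pipe(False)
  processPool = multiprocessing.Pool(processes=2)
  try:
    dispatchThread = threading.Thread(target=exampleDispatch,
                                      args=(resultsQueue, sendPipe))
    dispatchThread.start()

    tasks = [(i, resultsQueue) for i in range(5)]
    poolResults = processPool.map_async(exampleCollect, tasks)

    poolResults.wait()      # all workers have published their results
    resultsQueue.join()     # dispatcher has processed everything so far

    resultsQueue.put(EXAMPLE_SENTINEL)  # tell the dispatcher to stop
    resultsQueue.join()

    numResults = recvPipe.recv()  # get the dispatcher's tally
    dispatchThread.join()
    assert numResults == 5
  finally:
    processPool.terminate()
    processPool.join()


if __name__ == "__main__":
  exampleRun()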