def run_single_check(self, check):
    """Run a single check and emit its metrics.

    :param check: an initialized check plugin (must provide run(),
        get_metrics() and a name attribute).
    :return: (count, duration_ms) — the number of measurements
        collected and the collection time in milliseconds.
    """
    sub_timer = util.Timer()
    count = 0
    log.debug("Running plugin %s" % check.name)
    try:
        # Run the check.
        check.run()
        current_check_metrics = check.get_metrics()
        # Emit the metrics after each check
        self._emit(current_check_metrics)
        # Save the status of the check.
        count += len(current_check_metrics)
    except Exception:
        # A failing plugin must not abort the whole collection run.
        log.exception("Error running plugin %s" % check.name)

    sub_collect_duration = sub_timer.step()
    sub_collect_duration_mills = sub_collect_duration * 1000
    # %.2f already rounds to two decimals; no explicit round() needed.
    log.debug("Finished plugin %s run. Collection time: %.2fms %d Metrics." %
              (check.name, sub_collect_duration_mills, count))
    if sub_collect_duration > util.get_sub_collection_warn():
        # logging's warn() is a deprecated alias of warning().
        log.warning("Collection time for check %s is high: %.2fs." %
                    (check.name, sub_collect_duration))

    return count, sub_collect_duration_mills
def run(self, check_frequency):
    """Collect data from each check and submit their data.

    Also submits metrics describing how long the checks_d run took.

    :param check_frequency: the configured collection period in seconds,
        used to warn when collection time approaches it.
    """
    timer = util.Timer()
    self.run_count += 1
    log.debug("Starting collection run #%s" % self.run_count)

    # checks_d checks
    num_metrics = self.run_checks_d(check_frequency)
    collect_duration = timer.step()

    # Warn if collection time is approaching the collection period
    # (more than 4/5 of the period). warning() replaces deprecated warn().
    if collect_duration > (4 * check_frequency / 5):
        log.warning("Collection time (s) is high: %.1f, metrics count: %d" %
                    (collect_duration, num_metrics))
    self.collector_stats(num_metrics, collect_duration)

    collect_stats = []
    dimensions = {'component': 'monasca-agent', 'service': 'monitoring'}
    # One timestamp for all collector-run metrics so they align in time.
    timestamp = time.time()
    # Add in metrics on the collector run
    for name, value in self.collection_metrics.items():
        metric = metrics.Metric(name,
                                self._set_dimensions(dimensions),
                                tenant=None)
        collect_stats.append(metric.measurement(value, timestamp))
    self.collection_metrics.clear()
    self._emit(collect_stats)

    # Persist the status of the collection run.
    self._set_status(collect_duration)
def run(self):
    """Collect data from each check and submit their data.

    There are currently two types of checks: the system checks and the
    configured ones from checks_d.
    """
    timer = util.Timer()
    self.run_count += 1
    log.debug("Starting collection run #%s" % self.run_count)

    # checks_d checks
    num_metrics, emitter_statuses, checks_statuses = self.run_checks_d()
    collect_duration = timer.step()

    collect_stats = []
    dimensions = {'component': 'monasca-agent', 'service': 'monitoring'}
    # Add in metrics on the collector run.
    # items() works on both Python 2 and 3; iteritems() is Python 2 only
    # and is inconsistent with the rest of this file.
    for name, value in self.collector_stats(num_metrics,
                                            collect_duration).items():
        collect_stats.append(metrics.Measurement(name,
                                                 time.time(),
                                                 value,
                                                 self._set_dimensions(dimensions),
                                                 None))
    emitter_statuses.append(self._emit(collect_stats))

    # Persist the status of the collection run.
    self._set_status(checks_statuses, emitter_statuses, collect_duration)
def flush(self):
    """Flush every transaction whose time has come and persist status.

    No-op when a flush is already in progress. Logs at INFO level for
    the first FLUSH_LOGGING_INITIAL flushes and periodically thereafter
    (every FLUSH_LOGGING_PERIOD), at DEBUG otherwise.
    """
    if self._trs_to_flush is not None:
        log.debug("A flush is already in progress, not doing anything")
        return

    # Collect the transactions whose flush time has arrived.
    to_flush = []
    now = datetime.now()
    for tr in self._transactions:
        if tr.time_to_flush(now):
            to_flush.append(tr)

    count = len(to_flush)
    flush_number = self._flush_count + 1
    should_log = flush_number <= FLUSH_LOGGING_INITIAL or \
        flush_number % FLUSH_LOGGING_PERIOD == 0
    # Pick the log level once instead of duplicating each message in an
    # if/else on should_log.
    log_func = log.info if should_log else log.debug

    if count > 0:
        log_func("Flushing %s transaction%s during flush #%s" %
                 (count, util.plural(count), str(flush_number)))
        timer = util.Timer()
        self._trs_to_flush = to_flush
        self.flush_next()
        # The emit time is reported on the next run.
        dimensions = self._set_dimensions({'component': 'monasca-agent',
                                           'service': 'monitoring'})
        emit_measurement = metrics.Measurement('monasca.emit_time_sec',
                                               time.time(),
                                               timer.step(),
                                               dimensions)
        # Creating the transaction queues it for the next flush cycle.
        MetricTransaction([emit_measurement],
                          headers={'Content-Type': 'application/json'})
    else:
        log_func("No transaction to flush during flush #%s" %
                 str(flush_number))

    if flush_number == FLUSH_LOGGING_INITIAL:
        log.info("First flushes done, next flushes will be logged every %s flushes." %
                 FLUSH_LOGGING_PERIOD)

    self._flush_count += 1

    check_status.ForwarderStatus(
        queue_length=self._total_count,
        queue_size=self._total_size,
        flush_count=self._flush_count,
        transactions_received=self._transactions_received,
        transactions_flushed=self._transactions_flushed).persist()
def run_checks_d(self):
    """Run defined checks_d checks.

    :return: a tuple of (list of Measurements, dict of events keyed by
        check name, list of CheckStatus objects). When a stop is
        requested mid-run, the partial results gathered so far are
        returned (previously None was returned, which crashed callers
        that unpack the 3-tuple).
    """
    sub_timer = util.Timer()
    measurements = []
    events = {}
    check_statuses = []
    for check in self.initialized_checks_d:
        if not self.continue_running:
            return measurements, events, check_statuses
        log.debug("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        try:
            # Run the check.
            instance_statuses = check.run()

            # Collect the metrics and events.
            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()

            # Save them for the payload.
            measurements.extend(current_check_metrics)
            if current_check_events:
                if check.name not in events:
                    events[check.name] = current_check_events
                else:
                    events[check.name] += current_check_events

            # Save the status of the check.
            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
        except Exception:
            # A failing check must not abort the whole run; its status
            # is still recorded below with zero counts.
            log.exception("Error running check %s" % check.name)

        status_check = check_status.CheckStatus(
            check.name, instance_statuses, metric_count, event_count,
            library_versions=check.get_library_info())
        check_statuses.append(status_check)

        sub_collect_duration = sub_timer.step()
        sub_collect_duration_mills = sub_collect_duration * 1000
        log.debug("Finished run check %s. Collection time: %.2fms." % (
            check.name, round(sub_collect_duration_mills, 2)))
        if sub_collect_duration > util.get_sub_collection_warn():
            # logging's warn() is a deprecated alias of warning().
            log.warning("Collection time for check %s is high: %.2fs." % (
                check.name, round(sub_collect_duration, 2)))

    # Record an error status for each check that failed to initialize.
    # items() works on both Python 2 and 3; iteritems() is py2-only.
    for check_name, info in self.init_failed_checks_d.items():
        if not self.continue_running:
            return measurements, events, check_statuses
        status_check = check_status.CheckStatus(
            check_name, None, None, None,
            init_failed_error=info['error'],
            init_failed_traceback=info['traceback'])
        check_statuses.append(status_check)

    return measurements, events, check_statuses
def run(self):
    """Collect data from each check and submit their data.

    There are currently two types of checks: the system checks and the
    configured ones from checks_d.
    """
    timer = util.Timer()
    self.run_count += 1
    log.debug("Starting collection run #%s" % self.run_count)

    metrics_list = []
    timestamp = time.time()
    events = {}
    if self.os == 'windows':
        # Windows uses old style checks.
        for check_type in self._checks:
            try:
                # items() works on both Python 2 and 3; iteritems() is
                # Python 2 only.
                for name, value in check_type.check().items():
                    metrics_list.append(metrics.Measurement(
                        name, timestamp, value,
                        self._set_dimensions(None), None))
            except Exception:
                # One broken system check must not stop the others.
                log.exception('Error running check.')
    else:
        for check_type in self._checks:
            metrics_list.extend(check_type.check())

    # checks_d checks
    checks_d_metrics, checks_d_events, checks_statuses = self.run_checks_d()
    metrics_list.extend(checks_d_metrics)
    events.update(checks_d_events)

    # Store the metrics and events in the payload.
    collect_duration = timer.step()

    dimensions = {'component': 'monasca-agent', 'service': 'monitoring'}
    # Add in metrics on the collector run
    for name, value in self.collector_stats(len(metrics_list),
                                            len(events),
                                            collect_duration).items():
        metrics_list.append(metrics.Measurement(
            name, timestamp, value,
            self._set_dimensions(dimensions), None))

    emitter_statuses = self._emit(metrics_list)

    # Persist the status of the collection run.
    self._set_status(checks_statuses, emitter_statuses, collect_duration)
def stop(self, timeout=0):
    """Tell the collector to stop at the next logical point.

    :param timeout: seconds to wait for pool workers to join; 0 means
        don't join at all.
    """
    # This is called when the process is being killed, so
    # try to stop the collector as soon as possible.
    # Most importantly, don't try to submit to the emitters
    # because the forwarder is quite possibly already killed
    # in which case we'll get a misleading error in the logs.
    # Best to not even try.
    log.info("stopping the collector with timeout %d seconds" % timeout)
    self.continue_running = False
    for check_name in self.collection_times:
        check = self.collection_times[check_name]['check']
        check.stop()
    for check_name in self.collection_results:
        run_time = time.time() - self.collection_results[check_name]['start_time']
        log.info('When exiting... Plugin %s still running after %d seconds' %
                 (check_name, run_time))
    self.pool.close()
    # Won't call join() if timeout is zero. If we are in an event thread
    # a BlockingSwitchOutError occurs if wait
    if timeout > 0:
        timer = util.Timer()
        for worker in self.pool._pool:
            t = timeout - timer.total()
            if t <= 0:
                break
            if worker.is_alive():
                try:
                    worker.join(t)
                except Exception:
                    # log.exception records the full traceback. The old
                    # log.error call passed sys.exc_info()[0] as a lazy
                    # format argument with no %s placeholder in the
                    # message, which made logging raise a formatting
                    # error instead of reporting the failure.
                    log.exception("Unexpected error joining worker")
    for worker in self.pool._pool:
        if worker.is_alive():
            # the worker didn't complete in the specified timeout.
            # collector must honor the stop request to avoid agent
            # stop/restart hang. os._exit() should be called after
            # collector stops.
            log.info('worker %s is still alive when collector stop times out.' %
                     worker.name)