def check_status_lines(cs):
    """Render the status of one check as a list of display lines.

    :param cs: check status object. The code reads ``name``,
        ``init_failed_error``, ``init_failed_traceback``,
        ``instance_statuses``, ``metric_count``, ``event_count``,
        ``service_check_count``, ``check_stats`` and ``library_versions``.
    :return: list of strings: a title, an underline, then either the
        initialization failure or one entry per instance followed by the
        collection summary.
    """
    check_lines = [" " + cs.name, " " + "-" * len(cs.name)]

    if cs.init_failed_error:
        # The check class could not be initialized: report the error and,
        # when available, its traceback; there are no instances to list.
        check_lines.append(
            " - initialize check class [%s]: %s"
            % (style(STATUS_ERROR, "red"), repr(cs.init_failed_error))
        )
        if cs.init_failed_traceback:
            check_lines.extend(
                " " + tb_line for tb_line in cs.init_failed_traceback.split("\n")
            )
    else:
        for status in cs.instance_statuses:
            # An error takes precedence over warnings for the color.
            color = "green"
            if status.has_warnings():
                color = "yellow"
            if status.has_error():
                color = "red"

            line = " - instance #%s [%s]" % (status.instance_id, style(status.status, color))
            if status.has_error():
                line += u": %s" % status.error
            if status.metric_count is not None:
                line += " collected %s metrics" % status.metric_count
            if status.instance_check_stats is not None:
                line += " Last run duration: %s" % status.instance_check_stats.get("run_time")
            check_lines.append(line)

            if status.has_warnings():
                for warning in status.warnings:
                    # str.split always yields at least one element, so the
                    # first piece can be used without an emptiness check.
                    pieces = warning.split("\n")
                    check_lines.append(
                        u" %s: %s" % (style("Warning", "yellow"), pieces[0])
                    )
                    check_lines.extend(u" %s" % extra for extra in pieces[1:])

            if status.traceback is not None:
                check_lines.extend(" " + tb_line for tb_line in status.traceback.split("\n"))

        check_lines.append(
            " - Collected %s metric%s, %s event%s & %s service check%s"
            % (
                cs.metric_count,
                plural(cs.metric_count),
                cs.event_count,
                plural(cs.event_count),
                cs.service_check_count,
                plural(cs.service_check_count),
            )
        )

        if cs.check_stats is not None:
            check_lines.append(" - Stats: %s" % pretty_statistics(cs.check_stats))

        if cs.library_versions is not None:
            check_lines.append(" - Dependencies:")
            # items() works on both Python 2 and Python 3 mappings, unlike
            # the Python-2-only iteritems().
            for library, version in cs.library_versions.items():
                check_lines.append(" - %s: %s" % (library, version))

        check_lines.append("")

    return check_lines
i, check_status.STATUS_ERROR, error=str(e), tb=traceback.format_exc()) finally: self._roll_up_instance_metadata() instance_statuses.append(instance_status) if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME: try: after = AgentCheck._collect_internal_stats() self._set_internal_profiling_stats(before, after) log.info("\n \t %s %s" % (self.name, pretty_statistics(self._internal_profiling_stats))) except Exception: # It's fine if we can't collect stats for the run, just log and proceed self.log.debug( "Failed to collect Agent Stats after check {0}".format( self.name)) return instance_statuses def check(self, instance): """ Overriden by the check class. This will be called to run the check. :param instance: A dict with the instance information. This will vary depending on your config structure. """ raise NotImplementedError()
except Exception, e: self.log.exception("Check '%s' instance #%s failed" % (self.name, i)) instance_status = check_status.InstanceStatus( i, check_status.STATUS_ERROR, error=str(e), tb=traceback.format_exc() ) finally: self._roll_up_instance_metadata() instance_statuses.append(instance_status) if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME: try: after = AgentCheck._collect_internal_stats() self._set_internal_profiling_stats(before, after) log.info("\n \t %s %s" % (self.name, pretty_statistics(self._internal_profiling_stats))) except Exception: # It's fine if we can't collect stats for the run, just log and proceed self.log.debug("Failed to collect Agent Stats after check {0}".format(self.name)) return instance_statuses def check(self, instance): """ Overriden by the check class. This will be called to run the check. :param instance: A dict with the instance information. This will vary depending on your config structure. """ raise NotImplementedError() def stop(self):
def run(self):
    """Execute the check once for every configured instance.

    Instances whose minimum collection interval has not elapsed yet are
    skipped and produce no status entry. Any exception raised while
    handling an instance is logged and turned into an error status.

    :return: list of ``check_status.InstanceStatus`` objects, one per
        instance that was actually run.
    """
    # Snapshot agent statistics before the run when profiling is enabled.
    stats_before, stats_after = None, None
    if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME:
        try:
            stats_before = AgentCheck._collect_internal_stats()
        except Exception:
            # Profiling is best effort: log and carry on with the run.
            self.log.debug("Failed to collect Agent Stats before check {0}".format(self.name))

    collected = []
    for idx, instance in enumerate(self.instances):
        try:
            interval = instance.get('min_collection_interval', self.min_collection_interval)
            now = time.time()
            if now - self.last_collection_time[idx] < interval:
                message = "Not running instance #{0} of check {1} as it ran less than {2}s ago"
                self.log.debug(message.format(idx, self.name, interval))
                # The finally clause below still runs before moving on.
                continue

            self.last_collection_time[idx] = now

            # Time the check only in developer mode.
            started_at = timeit.default_timer() if self.in_developer_mode else None

            self.check(copy.deepcopy(instance))

            if started_at is None:
                run_stats = None
            else:
                run_stats = {'run_time': timeit.default_timer() - started_at}

            if self.has_warnings():
                outcome = check_status.InstanceStatus(
                    idx,
                    check_status.STATUS_WARNING,
                    warnings=self.get_warnings(),
                    instance_check_stats=run_stats,
                )
            else:
                outcome = check_status.InstanceStatus(
                    idx,
                    check_status.STATUS_OK,
                    instance_check_stats=run_stats,
                )
        except Exception as e:
            self.log.exception("Check '%s' instance #%s failed" % (self.name, idx))
            outcome = check_status.InstanceStatus(
                idx,
                check_status.STATUS_ERROR,
                error=str(e),
                tb=traceback.format_exc(),
            )
        finally:
            self._roll_up_instance_metadata()

        collected.append(outcome)

    # Snapshot again after the run and record the difference.
    if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME:
        try:
            stats_after = AgentCheck._collect_internal_stats()
            self._set_internal_profiling_stats(stats_before, stats_after)
            log.info("\n \t %s %s" % (self.name, pretty_statistics(self._internal_profiling_stats)))
        except Exception:
            # Profiling is best effort: log and carry on.
            self.log.debug("Failed to collect Agent Stats after check {0}".format(self.name))

    return collected
def run(self):
    """Execute the check once for every configured instance.

    The minimum collection interval is looked up on the instance first,
    then in ``self.init_config``, and finally falls back to
    ``self.DEFAULT_MIN_COLLECTION_INTERVAL``. Instances that ran more
    recently than that interval are skipped and produce no status entry.
    Any exception raised while handling an instance is logged and turned
    into an error status.

    :return: list of ``check_status.InstanceStatus`` objects, one per
        instance that was actually run.
    """
    # Snapshot agent statistics before the run when profiling is enabled.
    stats_before, stats_after = None, None
    if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME:
        try:
            stats_before = AgentCheck._collect_internal_stats()
        except Exception:
            # Profiling is best effort: log and carry on with the run.
            self.log.debug("Failed to collect Agent Stats before check {0}".format(self.name))

    collected = []
    for idx, instance in enumerate(self.instances):
        try:
            interval = instance.get(
                "min_collection_interval",
                self.init_config.get("min_collection_interval", self.DEFAULT_MIN_COLLECTION_INTERVAL),
            )
            now = time.time()
            if now - self.last_collection_time[idx] < interval:
                message = "Not running instance #{0} of check {1} as it ran less than {2}s ago"
                self.log.debug(message.format(idx, self.name, interval))
                # The finally clause below still runs before moving on.
                continue

            self.last_collection_time[idx] = now

            # Time the check only in developer mode.
            started_at = timeit.default_timer() if self.in_developer_mode else None

            self.check(copy.deepcopy(instance))

            if started_at is None:
                run_stats = None
            else:
                run_stats = {"run_time": timeit.default_timer() - started_at}

            if self.has_warnings():
                outcome = check_status.InstanceStatus(
                    idx,
                    check_status.STATUS_WARNING,
                    warnings=self.get_warnings(),
                    instance_check_stats=run_stats,
                )
            else:
                outcome = check_status.InstanceStatus(
                    idx,
                    check_status.STATUS_OK,
                    instance_check_stats=run_stats,
                )
        except Exception as e:
            self.log.exception("Check '%s' instance #%s failed" % (self.name, idx))
            outcome = check_status.InstanceStatus(
                idx,
                check_status.STATUS_ERROR,
                error=str(e),
                tb=traceback.format_exc(),
            )
        finally:
            self._roll_up_instance_metadata()

        collected.append(outcome)

    # Snapshot again after the run and record the difference.
    if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME:
        try:
            stats_after = AgentCheck._collect_internal_stats()
            self._set_internal_profiling_stats(stats_before, stats_after)
            log.info("\n \t %s %s" % (self.name, pretty_statistics(self._internal_profiling_stats)))
        except Exception:
            # Profiling is best effort: log and carry on.
            self.log.debug("Failed to collect Agent Stats after check {0}".format(self.name))

    return collected
def body_lines(self):
    """Build the body of the status report as a list of text lines.

    Sections are emitted in this order: Clocks, Paths, Hostnames, Checks,
    Service metadata (only when ``display_service_metadata`` is enabled in
    the agent configuration) and Emitters.

    :return: list of strings, one per output line.
    """
    # Only host metadata keys containing one of these fragments are shown.
    metadata_whitelist = ['hostname', 'fqdn', 'ipv4', 'instance-id']

    # Clocks
    lines = ['Clocks', '======', '']
    try:
        ntp_offset, ntp_styles = get_ntp_info()
        lines.append(
            ' ' + style('NTP offset', *ntp_styles) + ': '
            + style('%s s' % round(ntp_offset, 4), *ntp_styles)
        )
    except Exception as e:
        # Failing to get the NTP offset must not break the whole report.
        lines.append(' NTP offset: Unknown (%s)' % str(e))
    lines.append(' System UTC time: ' + str(datetime.datetime.utcnow()))
    lines.append('')

    # Paths to checks.d/conf.d
    lines += ['Paths', '=====', '']
    osname = config.get_os()
    try:
        confd_path = config.get_confd_path(osname)
    except config.PathNotFound:
        confd_path = 'Not found'
    try:
        checksd_path = config.get_checksd_path(osname)
    except config.PathNotFound:
        checksd_path = 'Not found'
    lines.append(' conf.d: ' + confd_path)
    lines.append(' checks.d: ' + checksd_path)
    lines.append('')

    # Hostnames
    lines += ['Hostnames', '=========', '']
    if not self.host_metadata:
        lines.append(" No host information available yet.")
    else:
        # items() works on both Python 2 and Python 3 mappings.
        for key, host in self.host_metadata.items():
            if any(fragment in key for fragment in metadata_whitelist):
                lines.append(" " + key + ": " + host)
    lines.append('')

    # Checks.d Status
    lines += ['Checks', '======', '']
    check_statuses = self.check_statuses + get_jmx_status()
    if not check_statuses:
        lines.append(" No checks have run yet.")
    else:
        for cs in check_statuses:
            # Derive the underline from the rendered title so a missing
            # (None) or non-string check_version cannot raise in len().
            title = cs.name + ' ({})'.format(cs.check_version)
            check_lines = [' ' + title, ' ' + '-' * len(title)]

            if cs.init_failed_error:
                check_lines.append(
                    " - initialize check class [%s]: %s"
                    % (style(STATUS_ERROR, 'red'), repr(cs.init_failed_error))
                )
                # Tracebacks are only shown in verbose mode.
                if self.verbose and cs.init_failed_traceback:
                    check_lines.extend(
                        ' ' + tb_line for tb_line in cs.init_failed_traceback.split('\n')
                    )
            else:
                for s in cs.instance_statuses:
                    # An error takes precedence over warnings for the color.
                    c = 'green'
                    if s.has_warnings():
                        c = 'yellow'
                    if s.has_error():
                        c = 'red'

                    line = " - instance #%s [%s]" % (s.instance_id, style(s.status, c))
                    if s.has_error():
                        line += u": %s" % s.error
                    if s.metric_count is not None:
                        line += " collected %s metrics" % s.metric_count
                    if s.instance_check_stats is not None:
                        line += " Last run duration: %s" % s.instance_check_stats.get('run_time')
                    check_lines.append(line)

                    if s.has_warnings():
                        for warning in s.warnings:
                            # str.split always yields at least one element.
                            pieces = warning.split('\n')
                            check_lines.append(
                                u" %s: %s" % (style("Warning", 'yellow'), pieces[0])
                            )
                            check_lines.extend(u" %s" % extra for extra in pieces[1:])

                    if self.verbose and s.traceback is not None:
                        check_lines.extend(' ' + tb_line for tb_line in s.traceback.split('\n'))

                check_lines.append(
                    " - Collected %s metric%s, %s event%s & %s service check%s" % (
                        cs.metric_count, plural(cs.metric_count),
                        cs.event_count, plural(cs.event_count),
                        cs.service_check_count, plural(cs.service_check_count))
                )

                if cs.check_stats is not None:
                    check_lines.append(" - Stats: %s" % pretty_statistics(cs.check_stats))

                if cs.library_versions is not None:
                    check_lines.append(" - Dependencies:")
                    for library, version in cs.library_versions.items():
                        check_lines.append(" - %s: %s" % (library, version))

                check_lines.append("")

            lines += check_lines

    # Metadata status: driven by the agent configuration rather than
    # being unconditionally switched on.
    metadata_enabled = _is_affirmative(get_config().get('display_service_metadata', False))

    if metadata_enabled:
        lines += ["", "Service metadata", "================", ""]
        if not check_statuses:
            lines.append(" No checks have run yet.")
        else:
            meta_lines = []
            for cs in check_statuses:
                # Check title
                check_line = [' ' + cs.name, ' ' + '-' * len(cs.name)]
                instance_lines = []
                for i, meta in enumerate(cs.service_metadata):
                    if not meta:
                        continue
                    instance_lines.append(" - instance #%s:" % i)
                    for k, v in meta.items():
                        instance_lines.append(" - %s: %s" % (k, v))
                # Checks without any metadata are left out entirely.
                if instance_lines:
                    check_line += instance_lines
                    meta_lines += check_line
            if meta_lines:
                lines += meta_lines
            else:
                lines.append(" No metadata were collected.")

    # Emitter status
    lines += ["", "Emitters", "========", ""]
    if not self.emitter_statuses:
        lines.append(" No emitters have run yet.")
    else:
        for es in self.emitter_statuses:
            c = 'red' if es.has_error() else 'green'
            line = " - %s [%s]" % (es.name, style(es.status, c))
            if es.status != STATUS_OK:
                line += ": %s" % es.error
            lines.append(line)

    return lines
def check_status_lines(cs):
    """Render the status of one check as a list of display lines.

    :param cs: check status object. The code reads ``name``,
        ``check_version``, ``init_failed_error``,
        ``init_failed_traceback``, ``instance_statuses``, ``metric_count``,
        ``event_count``, ``service_check_count``, ``check_stats`` and
        ``library_versions``.
    :return: list of strings: a title with the check version, an
        underline, then either the initialization failure or one entry per
        instance followed by the collection summary.
    """
    # Derive the underline from the rendered title so a missing (None) or
    # non-string check_version cannot raise in len().
    title = cs.name + ' ({})'.format(cs.check_version)
    check_lines = [' ' + title, ' ' + '-' * len(title)]

    if cs.init_failed_error:
        # The check class could not be initialized: report the error and,
        # when available, its traceback; there are no instances to list.
        check_lines.append(
            " - initialize check class [%s]: %s"
            % (style(STATUS_ERROR, 'red'), repr(cs.init_failed_error))
        )
        if cs.init_failed_traceback:
            check_lines.extend(
                ' ' + tb_line for tb_line in cs.init_failed_traceback.split('\n')
            )
    else:
        for status in cs.instance_statuses:
            # An error takes precedence over warnings for the color.
            color = 'green'
            if status.has_warnings():
                color = 'yellow'
            if status.has_error():
                color = 'red'

            line = " - instance #%s [%s]" % (status.instance_id, style(status.status, color))
            if status.has_error():
                line += u": %s" % status.error
            if status.metric_count is not None:
                line += " collected %s metrics" % status.metric_count
            if status.instance_check_stats is not None:
                line += " Last run duration: %s" % status.instance_check_stats.get('run_time')
            check_lines.append(line)

            if status.has_warnings():
                for warning in status.warnings:
                    # str.split always yields at least one element, so the
                    # first piece can be used without an emptiness check.
                    pieces = warning.split('\n')
                    check_lines.append(
                        u" %s: %s" % (style("Warning", 'yellow'), pieces[0])
                    )
                    check_lines.extend(u" %s" % extra for extra in pieces[1:])

            if status.traceback is not None:
                check_lines.extend(' ' + tb_line for tb_line in status.traceback.split('\n'))

        check_lines.append(
            " - Collected %s metric%s, %s event%s & %s service check%s" % (
                cs.metric_count, plural(cs.metric_count),
                cs.event_count, plural(cs.event_count),
                cs.service_check_count, plural(cs.service_check_count))
        )

        if cs.check_stats is not None:
            check_lines.append(" - Stats: %s" % pretty_statistics(cs.check_stats))

        if cs.library_versions is not None:
            check_lines.append(" - Dependencies:")
            # items() works on both Python 2 and Python 3 mappings, unlike
            # the Python-2-only iteritems().
            for library, version in cs.library_versions.items():
                check_lines.append(" - %s: %s" % (library, version))

        check_lines.append("")

    return check_lines
def body_lines(self):
    """Build the body of the status report as a list of text lines.

    Sections are emitted in this order: Clocks, Paths, Hostnames, Checks,
    Service metadata (only when ``display_service_metadata`` is enabled in
    the agent configuration) and Emitters.

    :return: list of strings, one per output line.
    """
    # Only host metadata keys containing one of these fragments are shown.
    metadata_whitelist = ['hostname', 'fqdn', 'ipv4', 'instance-id']

    # Clocks
    lines = ['Clocks', '======', '']
    try:
        ntp_offset, ntp_styles = get_ntp_info()
        lines.append(
            ' ' + style('NTP offset', *ntp_styles) + ': '
            + style('%s s' % round(ntp_offset, 4), *ntp_styles)
        )
    except Exception as e:
        # Failing to get the NTP offset must not break the whole report.
        lines.append(' NTP offset: Unknown (%s)' % str(e))
    lines.append(' System UTC time: ' + str(datetime.datetime.utcnow()))
    lines.append('')

    # Paths to checks.d/conf.d
    lines += ['Paths', '=====', '']
    osname = config.get_os()
    try:
        confd_path = config.get_confd_path(osname)
    except config.PathNotFound:
        confd_path = 'Not found'
    try:
        checksd_path = config.get_checksd_path(osname)
    except config.PathNotFound:
        checksd_path = 'Not found'
    lines.append(' conf.d: ' + confd_path)
    lines.append(' checks.d: ' + checksd_path)
    lines.append('')

    # Hostnames
    lines += ['Hostnames', '=========', '']
    if not self.host_metadata:
        lines.append(" No host information available yet.")
    else:
        # items() works on both Python 2 and Python 3 mappings.
        for key, host in self.host_metadata.items():
            if any(fragment in key for fragment in metadata_whitelist):
                lines.append(" " + key + ": " + host)
    lines.append('')

    # Checks.d Status
    lines += ['Checks', '======', '']
    check_statuses = self.check_statuses + get_jmx_status()
    if not check_statuses:
        lines.append(" No checks have run yet.")
    else:
        for cs in check_statuses:
            # Derive the underline from the rendered title so a missing
            # (None) or non-string check_version cannot raise in len().
            title = cs.name + ' ({})'.format(cs.check_version)
            check_lines = [' ' + title, ' ' + '-' * len(title)]

            if cs.init_failed_error:
                check_lines.append(
                    " - initialize check class [%s]: %s"
                    % (style(STATUS_ERROR, 'red'), repr(cs.init_failed_error))
                )
                # Tracebacks are only shown in verbose mode.
                if self.verbose and cs.init_failed_traceback:
                    check_lines.extend(
                        ' ' + tb_line for tb_line in cs.init_failed_traceback.split('\n')
                    )
            else:
                for s in cs.instance_statuses:
                    # An error takes precedence over warnings for the color.
                    c = 'green'
                    if s.has_warnings():
                        c = 'yellow'
                    if s.has_error():
                        c = 'red'

                    line = " - instance #%s [%s]" % (s.instance_id, style(s.status, c))
                    if s.has_error():
                        line += u": %s" % s.error
                    if s.metric_count is not None:
                        line += " collected %s metrics" % s.metric_count
                    if s.instance_check_stats is not None:
                        line += " Last run duration: %s" % s.instance_check_stats.get('run_time')
                    check_lines.append(line)

                    if s.has_warnings():
                        for warning in s.warnings:
                            # str.split always yields at least one element.
                            pieces = warning.split('\n')
                            check_lines.append(
                                u" %s: %s" % (style("Warning", 'yellow'), pieces[0])
                            )
                            check_lines.extend(u" %s" % extra for extra in pieces[1:])

                    if self.verbose and s.traceback is not None:
                        check_lines.extend(' ' + tb_line for tb_line in s.traceback.split('\n'))

                check_lines.append(
                    " - Collected %s metric%s, %s event%s & %s service check%s" % (
                        cs.metric_count, plural(cs.metric_count),
                        cs.event_count, plural(cs.event_count),
                        cs.service_check_count, plural(cs.service_check_count))
                )

                if cs.check_stats is not None:
                    check_lines.append(" - Stats: %s" % pretty_statistics(cs.check_stats))

                if cs.library_versions is not None:
                    check_lines.append(" - Dependencies:")
                    for library, version in cs.library_versions.items():
                        check_lines.append(" - %s: %s" % (library, version))

                check_lines.append("")

            lines += check_lines

    # Metadata status: shown only when enabled in the agent configuration.
    metadata_enabled = _is_affirmative(get_config().get('display_service_metadata', False))

    if metadata_enabled:
        lines += ["", "Service metadata", "================", ""]
        if not check_statuses:
            lines.append(" No checks have run yet.")
        else:
            meta_lines = []
            for cs in check_statuses:
                # Check title
                check_line = [' ' + cs.name, ' ' + '-' * len(cs.name)]
                instance_lines = []
                for i, meta in enumerate(cs.service_metadata):
                    if not meta:
                        continue
                    instance_lines.append(" - instance #%s:" % i)
                    for k, v in meta.items():
                        instance_lines.append(" - %s: %s" % (k, v))
                # Checks without any metadata are left out entirely.
                if instance_lines:
                    check_line += instance_lines
                    meta_lines += check_line
            if meta_lines:
                lines += meta_lines
            else:
                lines.append(" No metadata were collected.")

    # Emitter status
    lines += ["", "Emitters", "========", ""]
    if not self.emitter_statuses:
        lines.append(" No emitters have run yet.")
    else:
        for es in self.emitter_statuses:
            c = 'red' if es.has_error() else 'green'
            line = " - %s [%s]" % (es.name, style(es.status, c))
            if es.status != STATUS_OK:
                line += ": %s" % es.error
            lines.append(line)

    return lines
check_lines.extend( ' ' + line for line in s.traceback.split('\n')) check_lines += [ " - Collected %s metric%s, %s event%s & %s service check%s" % (cs.metric_count, plural( cs.metric_count), cs.event_count, plural(cs.event_count), cs.service_check_count, plural(cs.service_check_count)), ] if cs.check_stats is not None: check_lines += [ " - Stats: %s" % pretty_statistics(cs.check_stats) ] if cs.library_versions is not None: check_lines += [" - Dependencies:"] for library, version in cs.library_versions.iteritems( ): check_lines += [ " - %s: %s" % (library, version) ] check_lines += [""] lines += check_lines # Metadata status