def collect(self):
    """Yield the in-flight gauge, then every family of the static counters.

    While building the gauge, ``update_metrics()`` is called on each
    process in the snapshot.
    """
    in_flight = GaugeMetricFamily(
        "synapse_background_process_in_flight_count",
        "Number of background processes in flight",
        labels=["name"],
    )

    # Snapshot the registry under the lock: both the dict and the
    # per-name process collections can change while we iterate.
    with _bg_metrics_lock:
        snapshot = dict(
            (name, list(procs))
            for name, procs in six.iteritems(_background_processes)
        )

    for name, procs in six.iteritems(snapshot):
        in_flight.add_metric((name,), len(procs))
        for proc in procs:
            proc.update_metrics()

    yield in_flight

    # Re-export whatever the static counters report.
    static_counters = (
        _background_process_ru_utime,
        _background_process_ru_stime,
        _background_process_db_txn_count,
        _background_process_db_txn_duration,
        _background_process_db_sched_duration,
    )
    for counter in static_counters:
        for family in counter.collect():
            yield family
def collect(self):
    """Yield a gauge with the seconds elapsed since ``last_ticked``."""
    family = GaugeMetricFamily(
        "python_twisted_reactor_last_seen",
        "Seconds since the Twisted reactor was last seen",
    )
    elapsed = time.time() - last_ticked
    family.add_metric([], elapsed)
    yield family
def collect(self):
    """Poll the ticket backend and yield ticket counts by department/status.

    A session id is fetched when ``self.session_id`` is falsy and then kept
    on the instance. Only rows whose ``department_id`` is listed in
    ``self.department_ids`` are exported.
    """
    logger.debug('Polling...')

    if not self.session_id:
        self.session_id = get_session_id(
            self.base_url, self.login, self.password)

    rows = get_tickes_count(self.base_url, self.session_id)

    family = GaugeMetricFamily(
        'support_tickets_total',
        'Number of tickets',
        labels=['project', 'status'])

    for row in rows:
        if row['department_id'] not in self.department_ids:
            continue
        family.add_metric([row['department'], row['name']], row['count'])

    yield family
def __init__(self):
    """Create the four empty metric families, all labelled ``machineType``."""

    def gauge(name, documentation):
        return GaugeMetricFamily(name, documentation, labels=["machineType"])

    def counter(name, documentation):
        return CounterMetricFamily(name, documentation, labels=["machineType"])

    self.runnable = gauge(
        "hydra_machine_type_runnable",
        "Number of currently runnable builds")
    self.running = gauge(
        "hydra_machine_type_running",
        "Number of currently running builds")
    self.wait_time = counter(
        "hydra_machine_type_wait_time_total",
        "Number of seconds spent waiting")
    self.last_active = counter(
        "hydra_machine_type_last_active_total",
        "Last time this machine type was active")
def collect(self):
    """Yield one gauge family per Azure category/service, one sample per region.

    The status payload is read from ``self._request_data()``; element ``1``
    is walked as nested mappings
    ``region section -> category -> service -> region -> status``, and each
    status is translated through ``STATUSES``.

    Samples that resolve to the same metric name are grouped into a single
    family, so each metric name is emitted once per scrape instead of once
    per region. Once the generator has been fully consumed, the elapsed time
    is recorded in ``COLLECTION_TIME``.

    Raises:
        KeyError: if a status value is missing from ``STATUSES``.
    """
    start = time.time()

    # Request data from Azure Status
    status = self._request_data()

    families = {}
    emit_order = []  # first-seen order of metric names
    for categories in status[1].values():
        for category, services in categories.items():
            for service, regions in services.items():
                metric_name = "azure_status_{}_{}_status".format(
                    category, service).replace(".", "_")
                family = families.get(metric_name)
                if family is None:
                    family = GaugeMetricFamily(
                        metric_name,
                        'Azure Status for {}'.format(metric_name),
                        labels=["region"])
                    families[metric_name] = family
                    emit_order.append(metric_name)
                for region, state in regions.items():
                    family.add_metric([region], STATUSES[state])

    for metric_name in emit_order:
        yield families[metric_name]

    duration = time.time() - start
    COLLECTION_TIME.observe(duration)
def collect(self) -> Iterator[GaugeMetricFamily]:
    """Yield a ``slaves`` gauge with ``active``, ``idle`` and ``dead`` counts.

    A slave is *active* when it is alive and has a current build, *idle*
    when it is alive without one, and *dead* when it is neither alive nor
    shut down. Slaves that are not alive but are shut down are not counted.

    Liveness is evaluated once per slave so all three categories are decided
    from the same answer.
    """
    active = idle = dead = 0

    for slave in self._get_slaves():
        alive = slave.is_alive(use_cached=True)
        if alive:
            if slave.current_build_id is not None:
                active += 1
            else:
                idle += 1
        elif not slave.is_shutdown():
            # Not alive and not deliberately put in shutdown mode.
            dead += 1
        # Otherwise the slave was gracefully shut down on purpose; it is
        # intentionally not reported as 'dead'.

    slaves_gauge = GaugeMetricFamily(
        'slaves', 'Total number of slaves', labels=['state'])
    slaves_gauge.add_metric(['active'], active)
    slaves_gauge.add_metric(['idle'], idle)
    slaves_gauge.add_metric(['dead'], dead)
    yield slaves_gauge
def collect(self):
    """Yield one gauge populated from the result of ``self.caller()``.

    A dict result contributes one sample per item (key as label values);
    any other result becomes a single unlabelled sample. If the callback
    raises, the exception is logged and the gauge is yielded empty.
    """
    family = GaugeMetricFamily(self.name, self.desc, labels=self.labels)

    try:
        result = self.caller()
    except Exception:
        logger.exception(
            "Exception running callback for LaterGauge(%s)",
            self.name,
        )
    else:
        if not isinstance(result, dict):
            family.add_metric([], result)
        else:
            for label_values, value in six.iteritems(result):
                family.add_metric(label_values, value)

    yield family
def collect(self):
    """Called by prometheus client when it reads metrics.

    Yields the ``<name>_total`` gauge (number of registered callbacks per
    key) followed by one gauge per entry in ``self.sub_metrics``.

    Note: may be called by a separate thread.
    """
    total = GaugeMetricFamily(
        self.name + "_total", self.desc, labels=self.labels)

    # Snapshot the keys under the lock so we never iterate a container
    # that is being modified.
    with self._lock:
        registered_keys = list(self._registrations)

    collected = {}
    for key in registered_keys:
        with self._lock:
            pending = set(self._registrations[key])

        total.add_metric(key, len(pending))

        bucket = self._metrics_class()
        collected[key] = bucket
        for update in pending:
            update(bucket)

    yield total

    for field in self.sub_metrics:
        family = GaugeMetricFamily(
            "_".join([self.name, field]), "", labels=self.labels)
        for key, bucket in six.iteritems(collected):
            family.add_metric(key, getattr(bucket, field))
        yield family
def collect(self):
    """Yield one gauge per app attribute, then one per queue attribute.

    Apps come from ``self.client.list_apps(embed_task_stats=True)`` and are
    labelled with ``app.id``; queue items come from
    ``self.client.list_queue()`` and are labelled with ``item.app.id``.
    Items for which ``self.get_metric_value`` returns ``None`` are skipped.
    """

    def build_family(attribute, scope, documentation, items, id_of):
        # One gauge for `attribute`, with a sample per item that has a value.
        family = GaugeMetricFamily(
            self.get_metric_key(attribute, scope),
            documentation=documentation,
            labels=["id"])
        for item in items:
            labels = [id_of(item)]
            value = self.get_metric_value(attribute, item)
            if value is None:
                continue
            family.add_metric(labels, value)
        return family

    apps = self.client.list_apps(embed_task_stats=True)
    for app_attribute in self.APP_ATTIBUTES:
        yield build_family(
            app_attribute,
            'apps',
            'from v2/apps?embed=apps.taskStats value of %s' % app_attribute,
            apps,
            lambda app: app.id)

    # The queue is only fetched once the app families have been consumed.
    queue = self.client.list_queue()
    for queue_attribute in self.QUEUE_ATTRIBUTES:
        yield build_family(
            queue_attribute,
            'queue',
            'from v2/queue value of %s' % queue_attribute,
            queue,
            lambda queue_item: queue_item.app.id)
class MachineTypeScrapeImporter:
    """Accumulates per-machine-type samples into Prometheus metric families."""

    def __init__(self):
        """Create the four empty families, all labelled ``machineType``."""
        label_names = ["machineType"]
        self.runnable = GaugeMetricFamily(
            "hydra_machine_type_runnable",
            "Number of currently runnable builds",
            labels=list(label_names))
        self.running = GaugeMetricFamily(
            "hydra_machine_type_running",
            "Number of currently running builds",
            labels=list(label_names))
        self.wait_time = CounterMetricFamily(
            "hydra_machine_type_wait_time_total",
            "Number of seconds spent waiting",
            labels=list(label_names))
        self.last_active = CounterMetricFamily(
            "hydra_machine_type_last_active_total",
            "Last time this machine type was active",
            labels=list(label_names))

    def load_machine_type(self, name, report):
        """Move the values of *report* into the families under label *name*.

        ``runnable`` and ``running`` are always read; ``waitTime`` and
        ``lastActive`` are skipped when reading them raises ``KeyError``.
        """
        self.runnable.add_metric([name], report.destructive_read("runnable"))
        self.running.add_metric([name], report.destructive_read("running"))

        optional = (
            (self.wait_time, "waitTime"),
            (self.last_active, "lastActive"),
        )
        for family, key in optional:
            try:
                family.add_metric([name], report.destructive_read(key))
            except KeyError:
                pass

        debug_remaining_state(report)

    def metrics(self):
        """Yield the four families in a fixed order."""
        for family in (self.runnable, self.running,
                       self.wait_time, self.last_active):
            yield family
def convert_meter_metric(cls, marathon_key, marathon_metric):
    """Return a ``<key>_rate`` gauge with one sample per rate window.

    The windows ``1m``, ``5m``, ``15m`` and ``mean`` are read from the
    ``m1_rate``, ``m5_rate``, ``m15_rate`` and ``mean_rate`` entries of
    *marathon_metric*.
    """
    rate_key = '%s_rate' % cls.convert_metric_key(marathon_key)
    family = GaugeMetricFamily(
        name=rate_key,
        documentation='from %s' % marathon_key,
        labels=('window',))

    windows = (
        ('1m', 'm1_rate'),
        ('5m', 'm5_rate'),
        ('15m', 'm15_rate'),
        ('mean', 'mean_rate'),
    )
    for window, source_key in windows:
        family.add_metric((window,), marathon_metric[source_key])

    return family
def __init__(self):
    """Create the empty per-machine metric families, all labelled ``host``."""
    label_names = ["host"]

    def gauge(name, documentation):
        return GaugeMetricFamily(name, documentation, labels=label_names)

    def counter(name, documentation):
        return CounterMetricFamily(name, documentation, labels=label_names)

    self.consective_failures = gauge(
        "hydra_machine_consecutive_failures",
        "Number of consecutive failed builds")
    self.current_jobs = gauge(
        "hydra_machine_current_jobs",
        "Number of current jobs")
    self.idle_since = gauge(
        "hydra_machine_idle_since",
        "When the current idle period started")
    self.disabled_until = gauge(
        "hydra_machine_disabled_until",
        "When the machine will be used again")
    self.enabled = gauge(
        "hydra_machine_enabled",
        "If the machine is enabled (1) or not (0)")
    self.last_failure = counter(
        "hydra_machine_last_failure",
        "timestamp of the last failure")
    self.number_steps_done = counter(
        "hydra_machine_steps_done_total",
        "Total count of the steps completed")
    self.total_step_build_time = counter(
        "hydra_machine_step_build_time_total",
        "Number of seconds spent building steps")
    self.total_step_time = counter(
        "hydra_machine_step_time_total",
        "Number of seconds spent on steps")
def collect(self):
    """Fetch ``ncloc`` and ``coverage`` from SonarQube and yield them.

    Yields a single ``sonar_metrics`` gauge with one sample per
    (resource name, measure key). When the HTTP response is falsy the error
    is printed and nothing is yielded.
    """
    req_string = self.rest_url + '/resources?metrics=ncloc,coverage'

    # The session is only needed for this one request; the context manager
    # closes it (and its pooled connections) as soon as the response is in.
    with requests.Session() as session:
        session.trust_env = False
        session.auth = (self.sonar_user, self.sonar_password)
        # NOTE(review): TLS certificate verification is disabled here.
        session.verify = False
        res = session.get(req_string)

    if not res:
        # Call-form print produces the same output on Python 2 and 3.
        print("Error fetching from " + req_string)
        print(res)
        return

    # METRIC: detailed test results
    c = GaugeMetricFamily(
        'sonar_metrics', 'SonarQube Metrics', labels=['name', 'key'])
    for result in res.json():
        for msr in result['msr']:
            c.add_metric([result['name'], msr['key']], msr['val'])
    yield c
def collect(self):
    """Fetch the measures named in ``self.metrics`` from SonarQube.

    Yields a single ``sonar_metrics`` gauge with one sample per
    (resource name, measure key). Nothing is yielded when the HTTP response
    is falsy, or when the payload lacks an expected key; both cases are
    logged as errors.
    """
    req_string = self.rest_url + '/resources?metrics=' + self.metrics

    # The session is only needed for this one request; the context manager
    # closes it (and its pooled connections) as soon as the response is in.
    with requests.Session() as session:
        session.trust_env = False
        session.auth = (self.sonar_user, self.sonar_password)
        # NOTE(review): TLS certificate verification is disabled here.
        session.verify = False
        res = session.get(req_string)

    if not res:
        logging.error("Error fetching from %s", req_string)
        logging.error(res)
        return

    # METRIC: detailed test results
    c = GaugeMetricFamily(
        'sonar_metrics', 'SonarQube Metrics', labels=['name', 'key'])
    try:
        for result in res.json():
            for msr in result['msr']:
                c.add_metric([result['name'], msr['key']], msr['val'])
    except KeyError:
        logging.error("Could not retrieve metrics from: %s", self.metrics)
        logging.error("Check argument sonar_metrics")
        return

    yield c
def collect(self):
    """Yield user and system CPU seconds read from ``/proc/self/stat``.

    Nothing is yielded when ``HAVE_PROC_SELF_STAT`` is false. Tick counts
    are converted to seconds with ``self.ticks_per_sec``.
    """
    if not HAVE_PROC_SELF_STAT:
        return

    with open("/proc/self/stat") as stat_file:
        contents = stat_file.read()
        # Everything after the first ") " is split into space-separated
        # fields.
        fields = contents.split(") ", 1)[1].split(" ")

        sources = (
            ("process_cpu_user_seconds_total", 11),
            ("process_cpu_system_seconds_total", 12),
        )
        for metric_name, index in sources:
            family = GaugeMetricFamily(metric_name, "")
            family.add_metric([], float(fields[index]) / self.ticks_per_sec)
            yield family
class MachineScrapeImporter:
    """Accumulates per-build-machine samples into Prometheus metric families."""

    def __init__(self):
        """Create the empty metric families, all labelled ``host``."""
        label_names = ["host"]

        def gauge(name, documentation):
            return GaugeMetricFamily(name, documentation, labels=label_names)

        def counter(name, documentation):
            return CounterMetricFamily(name, documentation, labels=label_names)

        self.consective_failures = gauge(
            "hydra_machine_consecutive_failures",
            "Number of consecutive failed builds")
        self.current_jobs = gauge(
            "hydra_machine_current_jobs",
            "Number of current jobs")
        self.idle_since = gauge(
            "hydra_machine_idle_since",
            "When the current idle period started")
        self.disabled_until = gauge(
            "hydra_machine_disabled_until",
            "When the machine will be used again")
        self.enabled = gauge(
            "hydra_machine_enabled",
            "If the machine is enabled (1) or not (0)")
        self.last_failure = counter(
            "hydra_machine_last_failure",
            "timestamp of the last failure")
        self.number_steps_done = counter(
            "hydra_machine_steps_done_total",
            "Total count of the steps completed")
        self.total_step_build_time = counter(
            "hydra_machine_step_build_time_total",
            "Number of seconds spent building steps")
        self.total_step_time = counter(
            "hydra_machine_step_time_total",
            "Number of seconds spent on steps")

    def load_machine(self, name, report):
        """Move the values of *report* into the families under label *name*.

        ``idleSince`` is skipped when reading it raises ``KeyError``;
        ``totalStepBuildTime`` and ``totalStepTime`` default to ``0``.
        """
        ignored_keys = (
            "mandatoryFeatures",
            "supportedFeatures",
            "systemTypes",
            "avgStepBuildTime",
            "avgStepTime",
        )
        for key in ignored_keys:
            report.unused_read(key)

        host = [name]

        self.consective_failures.add_metric(
            host, report.destructive_read("consecutiveFailures"))
        self.current_jobs.add_metric(
            host, report.destructive_read("currentJobs"))

        try:
            self.idle_since.add_metric(
                host, report.destructive_read("idleSince"))
        except KeyError:
            pass

        self.disabled_until.add_metric(
            host, report.destructive_read("disabledUntil"))

        if report.destructive_read("enabled"):
            enabled_value = 1
        else:
            enabled_value = 0
        self.enabled.add_metric(host, enabled_value)

        self.last_failure.add_metric(
            host, report.destructive_read("lastFailure"))
        self.number_steps_done.add_metric(
            host, report.destructive_read("nrStepsDone"))
        self.total_step_build_time.add_metric(
            host,
            report.destructive_read_default("totalStepBuildTime", default=0))
        self.total_step_time.add_metric(
            host,
            report.destructive_read_default("totalStepTime", default=0))

        debug_remaining_state(report)

    def metrics(self):
        """Yield every family in a fixed order."""
        families = (
            self.consective_failures,
            self.current_jobs,
            self.idle_since,
            self.disabled_until,
            self.enabled,
            self.last_failure,
            self.number_steps_done,
            self.total_step_build_time,
            self.total_step_time,
        )
        for family in families:
            yield family
def collect(self):
    """Yield gauges describing the Tor daemon reachable through ``self.tor``.

    The controller is reconnected first. Each value is then queried from
    the controller when its family is produced, so a partially consumed
    generator only performs the queries it has reached.
    """

    def info(key):
        return self.tor.get_info(key)

    def single(name, documentation, value):
        # Unlabelled gauge carrying exactly one value.
        return GaugeMetricFamily(name, documentation, value=value)

    def tagged(name, documentation, label, samples):
        # Gauge with one label and one sample per (label value, value) pair.
        family = GaugeMetricFamily(name, documentation, labels=[label])
        for label_value, value in samples:
            family.add_metric([label_value], value)
        return family

    self.tor.reconnect()

    yield single("tor_written_bytes", "Tor written data counter",
                 int(info("traffic/written")))
    yield single("tor_read_bytes", "Tor received data counter",
                 int(info("traffic/read")))

    # NOTE(review): `torctl` is not defined in this block while everything
    # else goes through `self.tor` - confirm it is a module-level name.
    yield tagged("tor_version", "Tor version as a label", "version",
                 [(str(torctl.get_version()), 1)])

    yield tagged(
        "tor_version_status",
        "Tor version status {new, old, unrecommended, recommended, new in series, obsolete, unknown} as a label",
        "version_status",
        [(info("status/version/current"), 1)])

    yield single(
        "tor_network_liveness",
        "Indicates whether tor believes that the network is currently reachable",
        int(info("network-liveness") == "up"))

    reachability = []
    for entry in info("status/reachability-succeeded").split():
        port, flag = entry.split("=")
        reachability.append((port, int(flag)))
    yield tagged("tor_reachable",
                 "Indicates whether our OR/Dir port is reachable",
                 "port", reachability)

    yield single(
        "tor_circuit_established",
        "Indicates whether Tor is capable of establishing circuits",
        int(info("status/circuit-established")))

    # For some reason, 0 actually means that Tor is active, keep it that way
    yield single(
        "tor_dormant",
        "Indicates whether Tor is currently active and building circuits (note that 0 corresponds to Tor being active)",
        int(info("dormant")))

    effective_rate = self.tor.get_effective_rate(None)
    effective_burst_rate = self.tor.get_effective_rate(None, burst=True)
    if not (effective_rate is None or effective_burst_rate is None):
        yield single("tor_effective_rate", "Shows Tor effective rate",
                     int(effective_rate))
        yield single("tor_effective_burst_rate",
                     "Shows Tor effective burst rate",
                     int(effective_burst_rate))

    yield tagged("tor_fingerprint", "Tor fingerprint as a label",
                 "fingerprint", [(info("fingerprint"), 1)])

    yield tagged("tor_nickname", "Tor nickname as a label", "nickname",
                 [(self.tor.get_conf("Nickname", "Unnamed"), 1)])

    # Connection counting
    # This won't work/will return wrong results if we are not running on
    # the same box as the Tor daemon is.
    # DisableDebuggerAttachment has to be set to 0
    # TODO: Count individual OUT/DIR/Control connections, see arm sources
    # for reference
    try:
        connections = stem.util.connection.get_connections(
            process_pid=self.tor.get_pid())
        yield single("tor_connection_count",
                     "Amount of connections the Tor daemon has open",
                     len(connections))
    except OSError:
        # This happens if the PID does not exists (on another machine).
        pass

    try:
        has_flags = self.tor.get_network_status().flags
    except stem.DescriptorUnavailable:
        # The tor daemon fails with this for a few minutes after startup
        # (before figuring out its own flags?)
        has_flags = []

    known_flags = [
        "Authority", "BadExit", "Exit", "Fast", "Guard", "HSDir",
        "NoEdConsensus", "Stable", "Running", "Valid", "V2Dir"
    ]
    yield tagged("tor_flags", "Has a Tor flag", "flag",
                 [(flag, int(flag in has_flags)) for flag in known_flags])

    accs = self.tor.get_accounting_stats()
    yield GaugeMetricFamily("tor_accounting_read_bytes",
                            "Tor accounting read bytes",
                            accs.read_bytes)
    yield GaugeMetricFamily("tor_accounting_left_read_bytes",
                            "Tor accounting read bytes left",
                            accs.read_bytes_left)
    yield GaugeMetricFamily("tor_accounting_read_limit_bytes",
                            "Tor accounting read bytes limit",
                            accs.read_limit)
    yield GaugeMetricFamily("tor_accounting_write_bytes",
                            "Tor accounting write bytes",
                            accs.written_bytes)
    yield GaugeMetricFamily("tor_accounting_left_write_bytes",
                            "Tor accounting write bytes left",
                            accs.write_bytes_left)
    yield GaugeMetricFamily("tor_accounting_write_limit_bytes",
                            "Tor accounting write bytes limit",
                            accs.write_limit)
def _create_metric_containers(self):
    """Return a ``{metric name: GaugeMetricFamily}`` dict for enabled sections.

    Families are defined for the sections ``vms``, ``vmguests``,
    ``snapshots``, ``datastores`` and ``hosts``. A section is included when
    its entry in ``self.collect_only`` is exactly ``True``.

    Raises:
        KeyError: if ``self.collect_only`` enables an unknown section.
    """
    vm_labels = ['vm_name', 'host_name', 'dc_name', 'cluster_name']
    ds_labels = ['ds_name', 'dc_name', 'ds_cluster']
    host_labels = ['host_name', 'dc_name', 'cluster_name']

    # (section, ((metric name, help text, label names), ...))
    catalogue = (
        ('vms', (
            ('vmware_vm_power_state',
             'VMWare VM Power state (On / Off)',
             vm_labels),
            ('vmware_vm_boot_timestamp_seconds',
             'VMWare VM boot time in seconds',
             vm_labels),
            ('vmware_vm_num_cpu',
             'VMWare Number of processors in the virtual machine',
             vm_labels),
        )),
        ('vmguests', (
            ('vmware_vm_guest_disk_free',
             'Disk metric per partition',
             vm_labels + ['partition']),
            ('vmware_vm_guest_disk_capacity',
             'Disk capacity metric per partition',
             vm_labels + ['partition']),
        )),
        ('snapshots', (
            ('vmware_vm_snapshots',
             'VMWare current number of existing snapshots',
             vm_labels),
            ('vmware_vm_snapshot_timestamp_seconds',
             'VMWare Snapshot creation time in seconds',
             vm_labels + ['vm_snapshot_name']),
        )),
        ('datastores', (
            ('vmware_datastore_capacity_size',
             'VMWare Datasore capacity in bytes',
             ds_labels),
            ('vmware_datastore_freespace_size',
             'VMWare Datastore freespace in bytes',
             ds_labels),
            ('vmware_datastore_uncommited_size',
             'VMWare Datastore uncommitted in bytes',
             ds_labels),
            ('vmware_datastore_provisoned_size',
             'VMWare Datastore provisoned in bytes',
             ds_labels),
            ('vmware_datastore_hosts',
             'VMWare Hosts number using this datastore',
             ds_labels),
            ('vmware_datastore_vms',
             'VMWare Virtual Machines count per datastore',
             ds_labels),
            ('vmware_datastore_maintenance_mode',
             'VMWare datastore maintenance mode (normal / inMaintenance / enteringMaintenance)',
             ds_labels + ['mode']),
            ('vmware_datastore_type',
             'VMWare datastore type (VMFS, NetworkFileSystem, NetworkFileSystem41, CIFS, VFAT, VSAN, VFFS)',
             ds_labels + ['ds_type']),
            ('vmware_datastore_accessible',
             'VMWare datastore accessible (true / false)',
             ds_labels),
        )),
        ('hosts', (
            ('vmware_host_power_state',
             'VMWare Host Power state (On / Off)',
             host_labels),
            ('vmware_host_connection_state',
             'VMWare Host connection state (connected / disconnected / notResponding)',
             host_labels + ['state']),
            ('vmware_host_maintenance_mode',
             'VMWare Host maintenance mode (true / false)',
             host_labels),
            ('vmware_host_boot_timestamp_seconds',
             'VMWare Host boot time in seconds',
             host_labels),
            ('vmware_host_cpu_usage',
             'VMWare Host CPU usage in Mhz',
             host_labels),
            ('vmware_host_cpu_max',
             'VMWare Host CPU max availability in Mhz',
             host_labels),
            ('vmware_host_memory_usage',
             'VMWare Host Memory usage in Mbytes',
             host_labels),
            ('vmware_host_memory_max',
             'VMWare Host Memory Max availability in Mbytes',
             host_labels),
        )),
    )

    metric_list = {}
    for section, definitions in catalogue:
        section_families = {}
        for name, documentation, label_names in definitions:
            # Each family gets its own copy of the label list.
            section_families[name] = GaugeMetricFamily(
                name, documentation, labels=list(label_names))
        metric_list[section] = section_families

    metrics = {}
    for key, value in self.collect_only.items():
        if value is True:
            metrics.update(metric_list[key])

    return metrics
def collect(self):
    """Collect stats for every configured entity and yield metric families.

    Nothing is yielded when a stats access or session operation is already
    pending, or when ``self.login()`` fails. Otherwise, for each entity in
    ``self.metrics``, one ``CounterMetricFamily`` is yielded per configured
    counter, followed by one ``GaugeMetricFamily`` per configured gauge.
    Every sample carries the entity's configured labels plus ``nsip``.

    If ``collect_data`` or ``collect_lbvs_config`` reports
    ``self.FAILURE``, the session is cleared and collection stops. If either
    raises, the error is logged and that entity (or the lbvs config) is
    skipped; it no longer reuses the status and data of the previous entity.

    ``self.stats_access_pending`` is reset on every exit path, including
    early returns and a generator that is closed before exhaustion.
    """
    if self.stats_access_pending or self.ns_session_pending:
        return

    if not self.login():
        return

    def in_kubernetes():
        return os.environ.get('KUBERNETES_SERVICE_HOST') is not None

    def fill(family, kind, missing_msg, ns_metric_name, entity_name, entity,
             entity_stats, lbvs_dict, log_prefix_match):
        # Add one sample per stats item to `family`; returns the (possibly
        # cleared) log_prefix_match flag so it persists across families.
        for data_item in entity_stats:
            if not data_item:
                continue

            if ns_metric_name not in data_item.keys():
                logger.info(missing_msg.format(ns_metric_name, entity_name))
                break

            if 'labels' in entity:
                label_values = [data_item[v[0]] for v in entity['labels']]

                # populate and update k8s_ingress_lbvs metrics if in
                # k8s-CIC environment
                if entity_name == "k8s_ingress_lbvs":
                    if not in_kubernetes():
                        continue
                    prefix_match = self.update_lbvs_label(
                        label_values, lbvs_dict, log_prefix_match)
                    if not prefix_match:
                        log_prefix_match = False
                        continue

                label_values.append(self.nsip)
            else:
                label_values = [self.nsip]

            try:
                family.add_metric(label_values,
                                  float(data_item[ns_metric_name]))
            except Exception as e:
                logger.error(
                    'Caught exception while adding {} {} to {}: {}'
                    .format(kind, ns_metric_name, entity_name, str(e)))
        return log_prefix_match

    self.stats_access_pending = True
    try:
        data = {}
        for entity in self.metrics.keys():
            logger.debug('Collecting metric {}'.format(entity))
            try:
                status, entity_data = self.collect_data(entity)
            except Exception as e:
                logger.error('Could not collect metric :{}'.format(e))
                # No result for this entity: do not fall through to checks
                # against values left over from a previous iteration.
                continue

            if status == self.FAILURE:
                self.ns_session_clear()
                return

            if entity_data:
                data[entity] = entity_data

        lbvs_dict = None
        if 'k8s_ingress_lbvs' in self.metrics and in_kubernetes():
            try:
                status, lbvs_dict = self.collect_lbvs_config()
            except Exception as e:
                logger.error(
                    'Could not collect config entries for lbvs: {}'.format(e))
            else:
                if status == self.FAILURE:
                    self.ns_session_clear()
                    return

        # Add labels to metrics and provide to Prometheus
        log_prefix_match = True
        for entity_name, entity in self.metrics.items():
            if 'labels' in entity:
                label_names = [v[1] for v in entity['labels']]
            else:
                label_names = []
            label_names.append('nsip')

            entity_stats = data.get(entity_name, [])
            if not isinstance(entity_stats, list):
                entity_stats = [entity_stats]

            # Provide collected metric to Prometheus as a counter
            for ns_metric_name, prom_metric_name in entity.get('counters', []):
                c = CounterMetricFamily(
                    prom_metric_name, ns_metric_name, labels=label_names)
                log_prefix_match = fill(
                    c, 'counter',
                    'Counter stats {} not enabled for entity: {}',
                    ns_metric_name, entity_name, entity, entity_stats,
                    lbvs_dict, log_prefix_match)
                yield c

            # Provide collected metric to Prometheus as a gauge
            for ns_metric_name, prom_metric_name in entity.get('gauges', []):
                g = GaugeMetricFamily(
                    prom_metric_name, ns_metric_name, labels=label_names)
                log_prefix_match = fill(
                    g, 'gauge',
                    'Gauge stat {} not enabled for entity: {}',
                    ns_metric_name, entity_name, entity, entity_stats,
                    lbvs_dict, log_prefix_match)
                yield g
    finally:
        self.stats_access_pending = False
def _vmware_get_vm_perf_manager_metrics(self, vm_metrics):
    """Add one gauge per performance counter to *vm_metrics* and fill it.

    A gauge named ``vmware_vm_<counter with dots as underscores>`` is
    registered for every counter in the fixed list. Stats are then queried
    for all VMs whose ``runtime.powerState`` is ``poweredOn``, and the sum
    of each returned metric's values is added under that VM's labels.

    NOTE(review): the bare ``yield`` expressions suggest this runs under a
    deferred-driving decorator that is not visible here - confirm.
    """
    log('START: _vmware_get_vm_perf_manager_metrics')

    virtual_machines, counter_info = yield parallelize(
        self.vm_inventory, self.counter_ids)

    # List of performance counter we want
    perf_list = [
        'cpu.ready.summation',
        'cpu.usage.average',
        'cpu.usagemhz.average',
        'disk.usage.average',
        'disk.read.average',
        'disk.write.average',
        'mem.usage.average',
        'net.received.average',
        'net.transmitted.average',
    ]

    def gauge_name(counter):
        return 'vmware_vm_' + counter.replace('.', '_')

    # Prepare gauges
    for counter in perf_list:
        name = gauge_name(counter)
        vm_metrics[name] = GaugeMetricFamily(
            name,
            name,
            labels=['vm_name', 'host_name', 'dc_name', 'cluster_name'])

    metrics = []
    metric_names = {}
    for counter in perf_list:
        counter_key = counter_info[counter]
        metrics.append(
            vim.PerformanceManager.MetricId(counterId=counter_key,
                                            instance=''))
        metric_names[counter_key] = gauge_name(counter)

    # Only powered-on VMs are queried.
    specs = [
        vim.PerformanceManager.QuerySpec(maxSample=1,
                                         entity=vm['obj'],
                                         metricId=metrics,
                                         intervalId=20)
        for vm in virtual_machines.values()
        if vm.get('runtime.powerState') == 'poweredOn'
    ]

    content = yield self.content

    results, labels = yield parallelize(
        threads.deferToThread(content.perfManager.QueryStats,
                              querySpec=specs),
        self.vm_labels,
    )

    for entity_stats in results:
        entity_labels_key = entity_stats.entity._moId
        for series in entity_stats.value:
            family = vm_metrics[metric_names[series.id.counterId]]
            family.add_metric(
                labels[entity_labels_key],
                float(sum(series.value)),
            )

    log('FIN: _vmware_get_vm_perf_manager_metrics')
def metric_workers(self):
    """Return a ``resque_workers`` gauge holding ``len(self.workers)``."""
    family = GaugeMetricFamily('resque_workers', "Number of workers")
    worker_count = len(self.workers)
    family.add_metric([], worker_count)
    return family
def collect(self):
    """Yield Bamboo test results, build dashboard state and favourite-plan stats.

    Families are produced in this order:

    * ``bamboo_test_results`` - one sample per test of every job in
      ``self.bamboo_test_jobs`` (value is whether the status is
      ``successful``).
    * ``bamboo_build_state`` - only when the dashboard request succeeds.
    * ``build_results``, ``test_counts``, ``buildNumber`` and
      ``buildDurationInSeconds`` for favourite plans. When a plan appears
      more than once, the result with the highest ``number`` is used.

    Failed HTTP requests are printed and otherwise skipped.
    """
    # The context manager closes the session when the generator finishes
    # or is closed early.
    with requests.Session() as session:
        session.trust_env = False
        session.auth = (self.bamboo_user, self.bamboo_password)
        # NOTE(review): TLS certificate verification is disabled here.
        session.verify = False

        # METRIC: detailed test results
        c = GaugeMetricFamily(
            'bamboo_test_results', 'Bamboo Test Results',
            labels=['name', 'job', 'className', 'methodName'])
        for job in self.bamboo_test_jobs:
            res = session.get(
                self.web_url + '/rest/api/latest/result/' + job
                + '/latest.json?expand=testResults.allTests')
            if res:
                results = res.json()  # parse the payload once
                plan_name = results['plan']['name']
                for testResult in results['testResults']['allTests']['testResult']:
                    c.add_metric(
                        [plan_name, job,
                         testResult['className'], testResult['methodName']],
                        testResult['status'] == 'successful')
            else:
                print("error fetching test results")
                print(res)
        yield c

        # METRIC: bamboo agent state
        c = GaugeMetricFamily(
            'bamboo_build_state', 'Bamboo Build Dashboard',
            labels=['state', 'host'])
        res = session.get(
            self.web_url + '/build/admin/ajax/getDashboardSummary.action')
        if res:
            dashboard_summary = res.json()
            agent_info = self.tally_agent_info(dashboard_summary)
            for host, values in agent_info.items():
                for state, state_count in values.items():
                    c.add_metric([state, host], state_count)
            yield c
        else:
            print(res)

        # Collect results tagged
        d = {}
        r = session.get(
            self.web_url
            + '/rest/api/latest/result.json?favourite&expand=results.result.buildDurationInSeconds')
        if r.ok:
            # NOTE: this may return multiple results for the same plan -
            # keep the one with the highest build number.
            results = r.json()
            for result in results['results']['result']:
                key = result['plan']['key']
                if key in d and d[key]['number'] > result['number']:
                    continue  # don't overwrite with older build
                d[key] = result
        else:
            print(r)

        # METRIC: build status (favourites)
        METRICS = ['buildNumber', 'buildDurationInSeconds']
        TEST_METRICS = ['failedTestCount', 'skippedTestCount',
                        'quarantinedTestCount', 'successfulTestCount']
        statusMetric = GaugeMetricFamily(
            'build_results', 'Status of flagged plans',
            labels=['name', 'state'])
        testMetric = GaugeMetricFamily(
            'test_counts', 'Test result counts',
            labels=['shortName', 'countType'])
        metrics = dict(
            (x, GaugeMetricFamily(x, x, labels=['shortName']))
            for x in METRICS)

        for result in d.values():
            short_name = result['plan']['shortName']
            statusMetric.add_metric(
                [short_name, result['state']], result['successful'])
            for name in TEST_METRICS:
                testMetric.add_metric([short_name, name], result[name])
            for name, metric in metrics.items():
                metric.add_metric([short_name], result[name])

        yield statusMetric
        yield testMetric
        for metric in metrics.values():
            yield metric
def gen_nvidia_gpu_mem_util_gauge():
    """Return a new, empty ``nvidiasmi_utilization_memory`` gauge.

    The family has a single label, ``minor_number``.
    """
    gauge = GaugeMetricFamily(
        "nvidiasmi_utilization_memory",
        "gpu memory utilization of card",
        labels=["minor_number"],
    )
    return gauge
def collect(self):
    '''Yield task status, DAG status and DAG run duration gauges.

    ``airflow_task_status`` and ``airflow_dag_status`` are filled from
    ``get_task_state_info()`` and ``get_dag_state_info()``.
    ``airflow_dag_run_duration`` is filled from ``get_dag_duration_info()``:
    for the ``mysqldb`` and ``pysqlite`` drivers the duration is exported as
    returned, otherwise it is treated as a timedelta and converted to whole
    seconds including its days component.
    '''
    # Task metrics
    task_info = get_task_state_info()
    t_state = GaugeMetricFamily(
        'airflow_task_status',
        'Shows the number of task starts with this status',
        labels=['dag_id', 'task_id', 'owner', 'status']
    )
    for task in task_info:
        t_state.add_metric(
            [task.dag_id, task.task_id, task.owners, task.state or 'none'],
            task.value)
    yield t_state

    # Dag Metrics
    dag_info = get_dag_state_info()
    d_state = GaugeMetricFamily(
        'airflow_dag_status',
        'Shows the number of dag starts with this status',
        labels=['dag_id', 'owner', 'status']
    )
    for dag in dag_info:
        d_state.add_metric([dag.dag_id, dag.owners, dag.state], dag.count)
    yield d_state

    # DagRun metrics
    dag_duration = GaugeMetricFamily(
        'airflow_dag_run_duration',
        'Duration of currently running dag_runs in seconds',
        labels=['dag_id', 'run_id']
    )
    driver = Session.bind.driver
    # The driver does not change during a scrape; decide once.
    duration_is_number = driver in ('mysqldb', 'pysqlite')
    for dag in get_dag_duration_info():
        if duration_is_number:
            seconds = dag.duration
        else:
            # `.seconds` alone wraps every 24h; include the days component
            # so long-running DAG runs are not under-reported.
            seconds = dag.duration.days * 86400 + dag.duration.seconds
        dag_duration.add_metric([dag.dag_id, dag.run_id], seconds)
    yield dag_duration
def collect(self):
    """Yield ``python_gc_counts`` with one sample per ``gc.get_count()`` entry.

    The ``gen`` label is the entry's position, as a string.
    """
    family = GaugeMetricFamily(
        "python_gc_counts",
        "GC object counts",
        labels=["gen"],
    )
    counts = gc.get_count()
    for generation, count in enumerate(counts):
        family.add_metric([str(generation)], count)
    yield family
def metric_queues(self):
    """Return a ``resque_queues`` gauge holding ``len(self.queues)``."""
    family = GaugeMetricFamily('resque_queues', "Number of queues")
    queue_count = len(self.queues)
    family.add_metric([], queue_count)
    return family
def _get_coordinator_counters(self):
    """Return fresh gauge families for the Druid coordinator metrics.

    The result maps the Druid metric name (e.g. ``'segment/count'``) to an
    empty ``GaugeMetricFamily`` carrying the Prometheus name, help text and
    label names for that metric. A new set of families is built on every call.
    """
    return {
        'segment/assigned/count': GaugeMetricFamily(
            'druid_coordinator_segment_assigned_count',
            'Number of segments assigned to be loaded in the cluster.',
            labels=['tier']),
        # Help text previously duplicated the "assigned" description.
        'segment/moved/count': GaugeMetricFamily(
            'druid_coordinator_segment_moved_count',
            'Number of segments moved in the cluster.',
            labels=['tier']),
        'segment/dropped/count': GaugeMetricFamily(
            'druid_coordinator_segment_dropped_count',
            'Number of segments dropped due to being overshadowed.',
            labels=['tier']),
        'segment/deleted/count': GaugeMetricFamily(
            'druid_coordinator_segment_deleted_count',
            'Number of segments dropped due to rules.',
            labels=['tier']),
        'segment/unneeded/count': GaugeMetricFamily(
            'druid_coordinator_segment_unneeded_count',
            'Number of segments dropped due to being marked as unused.',
            labels=['tier']),
        # The only family here that is declared without labels.
        'segment/overShadowed/count': GaugeMetricFamily(
            'druid_coordinator_segment_overshadowed_count',
            'Number of overShadowed segments.'),
        'segment/loadQueue/failed': GaugeMetricFamily(
            'druid_coordinator_segment_loadqueue_failed_count',
            'Number of segments that failed to load.',
            labels=['server']),
        'segment/loadQueue/count': GaugeMetricFamily(
            'druid_coordinator_segment_loadqueue_count',
            'Number of segments to load.',
            labels=['server']),
        'segment/dropQueue/count': GaugeMetricFamily(
            'druid_coordinator_segment_dropqueue_count',
            'Number of segments to drop.',
            labels=['server']),
        'segment/size': GaugeMetricFamily(
            'druid_coordinator_segment_size_bytes',
            'Size in bytes of available segments.',
            labels=['datasource']),
        'segment/count': GaugeMetricFamily(
            'druid_coordinator_segment_count',
            'Number of served segments.',
            labels=['datasource']),
        'segment/unavailable/count': GaugeMetricFamily(
            'druid_coordinator_segment_unavailable_count',
            'Number of segments (not including replicas) left to load '
            'until segments that should be loaded in the cluster '
            'are available for queries.',
            labels=['datasource']),
        'segment/underReplicated/count': GaugeMetricFamily(
            'druid_coordinator_segment_under_replicated_count',
            'Number of segments (including replicas) left to load until '
            'segments that should be loaded in the cluster are '
            'available for queries.',
            labels=['tier', 'datasource']),
        # NOTE(review): a 'datasource' label on a jetty connection gauge looks
        # unusual -- confirm against the code that fills this family.
        'jetty/numOpenConnections': GaugeMetricFamily(
            'druid_coordinator_jetty_numOpenConnections',
            'Number of open jetty connections.',
            labels=['datasource']),
    }
def gen_k8s_component_gauge():
    """Return an empty ``k8s_component_count`` gauge family.

    Samples are expected to carry ``service_name``, ``error`` and ``host_ip``
    labels, in that order.
    """
    family = GaugeMetricFamily(
        "k8s_component_count",
        "count of k8s component",
        labels=["service_name", "error", "host_ip"],
    )
    return family
def gen_amd_gpu_util_gauge():
    """Return an empty ``rocmsmi_utilization_gpu`` gauge family.

    Samples are expected to carry a single ``minor_number`` label.
    """
    family = GaugeMetricFamily(
        "rocmsmi_utilization_gpu",
        "gpu core utilization of card",
        labels=["minor_number"],
    )
    return family
def collect(self):
    """Yield Blazegraph metric families for one scrape.

    Three sources are combined:
      * a fixed table of Blazegraph counter paths, each read through
        ``self.get_counter`` and converted to float (NaN when the value
        cannot be converted);
      * a SPARQL query returning the triple count and the ``dateModified``
        value of ``<http://www.wikidata.org>``;
      * ``self.fetch_allocators()`` for the used/free allocator gauges.

    Yields ``blazegraph_triples``, ``blazegraph_lastupdated``,
    ``blazegraph_allocators``, ``blazegraph_free_allocators`` and then every
    family from the counter table.

    Raises:
        ValueError: if a SPARQL binding has neither a ``count`` nor a ``y``
            key. Only ``requests`` transport errors are handled here.
    """
    blazegraph_metrics = {
        '/Query Engine/queryStartCount': CounterMetricFamily(
            'blazegraph_queries_start',
            'Number of queries that have started since the start of the application.'
        ),
        '/Query Engine/queryDoneCount': CounterMetricFamily(
            'blazegraph_queries_done',
            'Number of queries completed since the start of the application.'
        ),
        '/Query Engine/queryErrorCount': CounterMetricFamily(
            'blazegraph_queries_error',
            'Number of queries in error since the start of the application.'
        ),
        '/Query Engine/queriesPerSecond': GaugeMetricFamily(
            'blazegraph_queries_per_second',
            'Number of queries per second (rolling average).'),
        '/Query Engine/operatorActiveCount': GaugeMetricFamily(
            'blazegraph_operator_active_count',
            'Number of active blazegraph operators'),
        '/Query Engine/runningQueriesCount': GaugeMetricFamily(
            'blazegraph_running_queries_count',
            'Number of running queries'),
        '/Query Engine/GeoSpatial/geoSpatialSearchRequests': GaugeMetricFamily(
            'blazegraph_geospatial_search_requets',
            'Number of geospatial search requests since the start of the application.'
        ),
        '/Journal/bytesReadPerSec': GaugeMetricFamily(
            'blazegraph_journal_bytes_read_per_second', ''),
        '/Journal/bytesWrittenPerSec': GaugeMetricFamily(
            'blazegraph_journal_bytes_written_per_second', ''),
        '/Journal/extent': GaugeMetricFamily('blazegraph_journal_extent', ''),
        '/Journal/commitCount': CounterMetricFamily(
            'blazegraph_journal_commit_count', ''),
        '/Journal/commit/totalCommitSecs': GaugeMetricFamily(
            'blazegraph_journal_total_commit_seconds',
            'Total time spent in commit.'),
        '/Journal/commit/flushWriteSetSecs': GaugeMetricFamily(
            'blazegraph_journal_flush_write_set_seconds', ''),
        '/Journal/Concurrency Manager/Read Service/Average Active Count': GaugeMetricFamily(
            'blazegraph_journal_concurrency_read_average_active_count',
            'Average Number of Read Active Threads'),
        '/JVM/Memory/DirectBufferPool/default/bytesUsed': GaugeMetricFamily(
            'blazegraph_jvm_memory_direct_buffer_pool_default_bytes_used', ''),
        '/JVM/Memory/Runtime Free Memory': GaugeMetricFamily(
            'blazegraph_jvm_memory_runtime_free_memory',
            'Current amount of free memory in the JVM.'),
        '/JVM/Memory/Runtime Max Memory': GaugeMetricFamily(
            'blazegraph_jvm_memory_runtime_max_memory',
            'Max amount of memory the JVM can allocate.'),
        '/JVM/Memory/Runtime Total Memory': GaugeMetricFamily(
            'blazegraph_jvm_memory_runtime_total_memory',
            'Total amount of memory allocated to the JVM.'),
        '/JVM/Memory/Garbage Collectors/G1 Old Generation/Collection Count': CounterMetricFamily(
            'blazegraph_jvm_memory_gc_g1_old_collecton_count',
            'Number of old GC since JVM start.'),
        '/JVM/Memory/Garbage Collectors/G1 Old Generation/Cumulative Collection Time': GaugeMetricFamily(
            'blazegraph_jvm_memory_gc_g1_old_cumulative_collection_time',
            'Total time spent in old GC (seconds).'),
        '/JVM/Memory/Garbage Collectors/G1 Young Generation/Collection Count': CounterMetricFamily(
            'blazegraph_jvm_memory_gc_g1_young_collection_count',
            'Number of young GC since JVM start.'),
        '/JVM/Memory/Garbage Collectors/G1 Young Generation/Cumulative Collection Time': GaugeMetricFamily(
            'blazegraph_jvm_memory_gc_g1_young_cumulative_collection_time',
            'Total time spent in young GC (seconds).'),
    }

    # The keys above are string literals, so no "unknown metric" guard is
    # needed; every entry is looked up unconditionally.
    for metric_name, metric_family in blazegraph_metrics.items():
        metric_value = self.get_counter(metric_name)
        try:
            value = float(metric_value)
        except (ValueError, TypeError):
            # Missing or non-numeric counter: still export a sample.
            value = float('nan')
        metric_family.add_metric([], value)

    triple_metric = GaugeMetricFamily('blazegraph_triples', 'Count of triples in Blazegraph')
    lag_metric = CounterMetricFamily('blazegraph_lastupdated', 'Last update timestamp')

    sparql_query = """ prefix schema: <http://schema.org/> SELECT * WHERE { { SELECT ( COUNT( * ) AS ?count ) { ?s ?p ?o } } UNION { SELECT * WHERE { <http://www.wikidata.org> schema:dateModified ?y } } }"""
    try:
        data = self.execute_sparql(sparql_query)
        for binding in data['results']['bindings']:
            if 'count' in binding:
                triple_count = binding['count']['value']
                triple_metric.add_metric([], float(triple_count))
            elif 'y' in binding:
                last_updated = parse(binding['y']['value'])
                # datetime.timestamp() honours the parsed UTC offset and is
                # portable; strftime('%s') is a platform extension that
                # ignores tzinfo and reads the fields as local time.
                lag_metric.add_metric([], last_updated.timestamp())
            else:
                raise ValueError(
                    'SPARQL binding returned with unexpected key')
    except requests.exceptions.RequestException:
        log.exception("Error querying endpoint")
        triple_metric.add_metric([], float('nan'))
        lag_metric.add_metric([], float('nan'))

    alloc_metric = GaugeMetricFamily(
        'blazegraph_allocators',
        'Number of used FixedAllocators in Blazegraph')
    alloc_free_metric = GaugeMetricFamily(
        'blazegraph_free_allocators',
        'Number of free FixedAllocators in Blazegraph')

    allocs = self.fetch_allocators()
    if allocs:
        alloc_metric.add_metric([], allocs)
        # NOTE(review): 256 * 1024 is presumably the maximum number of
        # allocators -- confirm. A falsy result (including 0) reports NaN.
        alloc_free_metric.add_metric([], 256 * 1024 - allocs)
    else:
        alloc_metric.add_metric([], float('nan'))
        alloc_free_metric.add_metric([], float('nan'))

    yield triple_metric
    yield lag_metric
    yield alloc_metric
    yield alloc_free_metric

    for metric in blazegraph_metrics.values():
        yield metric
def gen_gpu_used_by_external_process_counter():
    """Return an empty ``gpu_used_by_external_process_count`` gauge family.

    Despite the ``counter`` in the function name, the family is a gauge.
    Samples are expected to carry ``minor_number`` and ``pid`` labels.
    """
    family = GaugeMetricFamily(
        "gpu_used_by_external_process_count",
        "count of gpu used by external process",
        labels=["minor_number", "pid"],
    )
    return family
def metric_up_gauge(resource: str, succeeded=True):
    """Return a ``<resource>_up`` gauge whose value is ``int(succeeded)``.

    Args:
        resource: Name used as the metric prefix and inside the help text.
        succeeded: Truth value of the fetch; exported as 1 or 0.
    """
    help_text = f'Did the {resource} fetch succeed.'
    return GaugeMetricFamily(
        resource + '_up',
        help_text,
        value=int(succeeded),
    )
def gen_process_mem_usage_gauge():
    """Return an empty ``process_mem_usage_byte`` gauge family.

    Samples are expected to carry ``pid`` and ``cmd`` labels.
    """
    family = GaugeMetricFamily(
        "process_mem_usage_byte",
        "memory usage of process, to save space in prometheus, we only expose those who consume more than 500Mb of memory",
        labels=["pid", "cmd"],
    )
    return family
def collect(self):
    """Yield per-repository and per-tag gauge families.

    For every repository from ``self._find_repositories()`` the tag and
    revision counts are recorded; for every tag the layer count of its
    manifest and the summed ``size`` of those layers are recorded. Layers
    without a ``size`` key contribute nothing to the sum.

    Yields ``repository_tags_total``, ``repository_revisions_total``,
    ``repository_tag_layers_total`` and ``repository_tag_size_bytes``.
    """
    tags_family = GaugeMetricFamily(
        'repository_tags_total',
        'Number of tags for each repo',
        labels=['repository'],
    )
    revisions_family = GaugeMetricFamily(
        'repository_revisions_total',
        'Number of revisions for each repo',
        labels=['repository'],
    )
    layers_family = GaugeMetricFamily(
        'repository_tag_layers_total',
        'Number of layers in each tag',
        labels=['repository', 'tag'],
    )
    size_family = GaugeMetricFamily(
        'repository_tag_size_bytes',
        'Size of each tag',
        labels=['repository', 'tag'],
    )

    repositories = self._find_repositories()
    logger.debug('Found %s repositories: %s', len(repositories), repositories)

    for repo in repositories:
        logger.debug('Scanning %s for tags', repo)

        tags = self._scrape_tags(repo)
        tags_family.add_metric([repo], len(tags))

        revisions = self._scrape_revisions(repo)
        revisions_family.add_metric([repo], len(revisions))

        for tag in tags:
            layers = self._scrape_manifest(repo, tag)['layers']
            layers_family.add_metric([repo, tag], len(layers))

            total_bytes = sum(
                layer['size'] for layer in layers if 'size' in layer
            )
            size_family.add_metric([repo, tag], total_bytes)

    for family in (tags_family, revisions_family, layers_family, size_family):
        yield family
def gen_docker_daemon_counter():
    """Return an empty ``docker_daemon_count`` gauge family.

    Despite the ``counter`` in the function name, the family is a gauge.
    Samples are expected to carry a single ``error`` label.
    """
    family = GaugeMetricFamily(
        "docker_daemon_count",
        "count of docker daemon",
        labels=["error"],
    )
    return family
def collect(self):
    """Yield Deluge metrics gathered over one RPC session.

    Connects to the daemon at ``$DELUGE_HOST`` (default ``127.0.0.1``) and
    yields, in order:
      * one family per libtorrent session-status key that the daemon reports;
      * ``deluge_info`` with version labels;
      * one ``deluge_config_<key>`` gauge per numeric/boolean config value;
      * ``deluge_torrents_by_label`` (only when at least one torrent has a
        ``label`` field) and ``deluge_torrents`` by state;
      * per-torrent families, when ``self.per_torrent_metrics_enabled``.

    The RPC connection is closed in a ``finally`` clause, so it is released
    when an RPC call fails or the consumer stops iterating early.
    """
    deluge_host = os.environ.get('DELUGE_HOST', '127.0.0.1')
    client = DelugeRPCClient(deluge_host, self.rpc_port, self.rpc_user, self.rpc_password)
    client.connect()

    try:
        libtorrent_metrics = get_libtorrent_metrics_meta()
        libtorrent_metric_values = client.call('core.get_session_status', [])

        for metric, metric_type in libtorrent_metrics.items():
            # The RPC result is looked up with byte-string keys.
            encoded_name = metric.encode('ascii')
            if encoded_name in libtorrent_metric_values:
                yield metric_type(
                    'deluge_libtorrent_{}'.format(metric.replace('.', '_')),
                    'libtorrent metric {}'.format(metric),
                    value=libtorrent_metric_values[encoded_name]
                )

        yield new_metric_with_labels_and_value(
            GaugeMetricFamily,
            'deluge_info',
            'Deluge information',
            labels={
                'version': client.call('daemon.info').decode('utf-8'),
                'libtorrent_version': client.call('core.get_libtorrent_version').decode('utf-8'),
            },
            value=1
        )

        for key, value in client.call('core.get_config').items():
            if isinstance(value, (int, float, bool)):
                setting = key.decode('utf-8')
                yield GaugeMetricFamily(
                    'deluge_config_{}'.format(setting),
                    'Value of the deluge config setting {}'.format(setting),
                    value=value)

        torrents_by_state = {
            'downloading': 0,
            'seeding': 0,
            'paused': 0,
            'checking': 0,
            'queued': 0,
            'error': 0,
            'active': 0,  # not the prometheus way, but the states above (as defined by deluge) are already overlapping, so sum() over them is already meaningless
            'total': 0,
        }
        torrents_by_label = defaultdict(int)
        for torrent in client.core.get_torrents_status({}, [b'label', b'state', b'download_payload_rate', b'upload_payload_rate']).values():
            if b'label' in torrent:
                torrents_by_label[torrent[b'label'].decode('utf-8')] += 1
            # A state that is not pre-seeded above gets its own label instead
            # of raising KeyError and aborting the whole scrape.
            state = torrent[b'state'].decode('utf-8').lower()
            torrents_by_state[state] = torrents_by_state.get(state, 0) + 1
            torrents_by_state['total'] += 1
            if torrent[b'download_payload_rate'] > 0 or torrent[b'upload_payload_rate'] > 0:
                torrents_by_state['active'] += 1

        if len(torrents_by_label) > 0:
            torrents_by_label_metric = GaugeMetricFamily(
                'deluge_torrents_by_label',
                'The number of torrents for each label assigned to a torrent using the deluge label plugin',
                labels=['label'])
            for label, count in torrents_by_label.items():
                torrents_by_label_metric.add_metric([label], count)
            yield torrents_by_label_metric

        torrents_metric = GaugeMetricFamily(
            'deluge_torrents',
            'The number of torrents in a specific state (note: some states overlap)',
            labels=['state'])
        for state, torrent_count in torrents_by_state.items():
            torrents_metric.add_metric([state], torrent_count)
        yield torrents_metric

        if self.per_torrent_metrics_enabled:
            per_torrent_keys = [
                (CounterMetricFamily, b'total_done', 'The amount of data downloaded for this torrent'),
                (CounterMetricFamily, b'total_size', 'The size of this torrent'),
                (CounterMetricFamily, b'total_uploaded', 'The amount of data uploaded for this torrent'),
                (GaugeMetricFamily, b'num_peers', 'The number of peers currently connected to for this torrent'),
                (GaugeMetricFamily, b'num_seeds', 'The number of seeds currently connected to for this torrent'),
                (GaugeMetricFamily, b'total_peers', 'The number of peers in the swarm for this torrent'),
                (GaugeMetricFamily, b'total_seeds', 'The number of seeds in the swarm for this torrent'),
            ]
            # NOTE(review): the mapping is presumably keyed by the byte-string
            # status field of each entry, since the key is used to index the
            # torrent status below -- confirm in generate_per_torrent_metrics.
            per_torrent_metrics = dict(generate_per_torrent_metrics(per_torrent_keys))

            for torrent_hash, torrent in client.core.get_torrents_status({}, [key[1] for key in per_torrent_keys] + [b'name']).items():
                for metric_name, metric in per_torrent_metrics.items():
                    metric.add_metric(
                        [
                            torrent[b'name'].decode('utf-8'),
                            torrent_hash.decode('utf-8')
                        ],
                        torrent[metric_name]
                    )

            for metric in per_torrent_metrics.values():
                yield metric
    finally:
        client.disconnect()
def trivial_gauge(self, name, help, value):
    """Return an unlabelled ``hydra_<name>`` gauge holding one sample.

    Args:
        name: Suffix appended to the ``hydra_`` prefix.
        help: Help text for the family.
        value: The single sample value.
    """
    gauge = GaugeMetricFamily("hydra_{}".format(name), help)
    gauge.add_metric([], value)
    return gauge
def collect(self, vsphere_host, section='default'):
    """Collect vCenter metrics for ``vsphere_host`` and yield the families.

    ``section`` selects a block of ``self.config``; an unknown section is
    logged and replaced by ``'default'``. Only the groups (``vms``,
    ``datastores``, ``hosts``) whose ``collect_only`` flag is exactly ``True``
    are created and filled. Nothing is yielded when the connection cannot be
    established.
    """
    if section not in self.config:
        log("{} is not a valid section, using default".format(section))
        section = 'default'

    def build_gauges(specs):
        # Each spec is (metric name, help text, label names); the metric name
        # doubles as the dictionary key.
        return {
            name: GaugeMetricFamily(name, doc, labels=label_names)
            for name, doc, label_names in specs
        }

    metric_list = {
        'vms': build_gauges([
            ('vmware_vm_power_state',
             'VMWare VM Power state (On / Off)',
             ['vm_name', 'host_name']),
            ('vmware_vm_boot_timestamp_seconds',
             'VMWare VM boot time in seconds',
             ['vm_name', 'host_name']),
            ('vmware_vm_snapshots',
             'VMWare current number of existing snapshots',
             ['vm_name']),
            ('vmware_vm_snapshot_timestamp_seconds',
             'VMWare Snapshot creation time in seconds',
             ['vm_name', 'vm_snapshot_name']),
            ('vmware_vm_num_cpu',
             'VMWare Number of processors in the virtual machine',
             ['vm_name', 'host_name']),
        ]),
        'datastores': build_gauges([
            ('vmware_datastore_capacity_size',
             'VMWare Datasore capacity in bytes',
             ['ds_name']),
            ('vmware_datastore_freespace_size',
             'VMWare Datastore freespace in bytes',
             ['ds_name']),
            ('vmware_datastore_uncommited_size',
             'VMWare Datastore uncommitted in bytes',
             ['ds_name']),
            ('vmware_datastore_provisoned_size',
             'VMWare Datastore provisoned in bytes',
             ['ds_name']),
            ('vmware_datastore_hosts',
             'VMWare Hosts number using this datastore',
             ['ds_name']),
            ('vmware_datastore_vms',
             'VMWare Virtual Machines number using this datastore',
             ['ds_name']),
        ]),
        'hosts': build_gauges([
            ('vmware_host_power_state',
             'VMWare Host Power state (On / Off)',
             ['host_name']),
            ('vmware_host_boot_timestamp_seconds',
             'VMWare Host boot time in seconds',
             ['host_name']),
            ('vmware_host_cpu_usage',
             'VMWare Host CPU usage in Mhz',
             ['host_name']),
            ('vmware_host_cpu_max',
             'VMWare Host CPU max availability in Mhz',
             ['host_name']),
            ('vmware_host_memory_usage',
             'VMWare Host Memory usage in Mbytes',
             ['host_name']),
            ('vmware_host_memory_max',
             'VMWare Host Memory Max availability in Mbytes',
             ['host_name']),
        ]),
    }

    collect_only = self.config[section]['collect_only']

    # Keep only the families of the groups that are switched on.
    metrics = {}
    for group, enabled in collect_only.items():
        if enabled is True:
            metrics.update(metric_list[group])

    log("Start collecting vcenter metrics for {0}".format(vsphere_host))

    self.vmware_connection = self._vmware_connect(vsphere_host, section)
    if not self.vmware_connection:
        log("Cannot connect to vmware")
        return

    content = self.vmware_connection.RetrieveContent()

    if collect_only['vms'] is True:
        # Performance counter information is needed by the VM collection.
        counter_info = self._vmware_perf_metrics(content)

        # VM information
        log("Starting VM performance metric collection")
        self._vmware_get_vms(content, metrics, counter_info)
        log("Finish starting vm performance vm collection")

        # Snapshots (count and age)
        log("Starting VM snapshot metric collection")
        vm_counts, vm_ages = self._vmware_get_snapshots(content)
        for entry in vm_counts:
            metrics['vmware_vm_snapshots'].add_metric(
                [entry['vm_name']], entry['snapshot_count'])
        for vm_age in vm_ages:
            for snapshot in vm_age:
                metrics['vmware_vm_snapshot_timestamp_seconds'].add_metric(
                    [snapshot['vm_name'], snapshot['vm_snapshot_name']],
                    snapshot['vm_snapshot_timestamp_seconds'])
        log("Finished VM snapshot metric collection")

    # Datastores
    if collect_only['datastores'] is True:
        self._vmware_get_datastores(content, metrics)

    # Hosts
    if collect_only['hosts'] is True:
        self._vmware_get_hosts(content, metrics)

    log("Stop collecting vcenter metrics for {0}".format(vsphere_host))
    # NOTE(review): presumably waits for background work started by the
    # _vmware_get_* helpers before disconnecting -- confirm.
    self.threader.join()
    self._vmware_disconnect()

    for metric in metrics.values():
        yield metric
def gen_nv_peer_mem_gauge():
    """Return an empty, unlabelled ``nv_peer_mem_count`` gauge family."""
    family = GaugeMetricFamily(
        "nv_peer_mem_count",
        "count of active nv_peer_mem (GPUDirect) module. 0 or 1",
    )
    return family
def gen_zombie_process_counter():
    """Return an empty ``zombie_process_count`` gauge family.

    Despite the ``counter`` in the function name, the family is a gauge.
    Samples are expected to carry a single ``command`` label.
    """
    family = GaugeMetricFamily(
        "zombie_process_count",
        "count of zombie process",
        labels=["command"],
    )
    return family
def gen_gpu_util_gauge():
    """Return an empty ``nvidiasmi_utilization_gpu`` gauge family.

    Samples are expected to carry ``minor_number`` and ``uuid`` labels.
    """
    family = GaugeMetricFamily(
        "nvidiasmi_utilization_gpu",
        "gpu core utilization of card",
        labels=["minor_number", "uuid"],
    )
    return family
def gen_gpu_used_by_zombie_container_counter():
    """Return an empty ``gpu_used_by_zombie_container_count`` gauge family.

    Despite the ``counter`` in the function name, the family is a gauge.
    Samples are expected to carry ``minor_number`` and ``container_id`` labels.
    """
    family = GaugeMetricFamily(
        "gpu_used_by_zombie_container_count",
        "count of gpu used by zombie container",
        labels=["minor_number", "container_id"],
    )
    return family
def gen_gpu_retired_page_count():
    """Return an empty ``nvidiasmi_retired_page_count`` gauge family.

    Samples are expected to carry ``minor_number``, ``uuid`` and ``type``
    labels, in that order.
    """
    family = GaugeMetricFamily(
        "nvidiasmi_retired_page_count",
        "count of nvidia ecc retired page",
        labels=["minor_number", "uuid", "type"],
    )
    return family
def add_gauge(self, name, desc, labels):
    """Create a gauge family and store it in ``self.gauges`` under ``name``.

    An existing entry with the same name is replaced.

    Args:
        name: Metric name; also the key in ``self.gauges``.
        desc: Help text for the family.
        labels: Label names passed through to ``GaugeMetricFamily``.
    """
    family = GaugeMetricFamily(name, desc, labels=labels)
    self.gauges[name] = family
def gen_gpu_ecc_counter():
    """Return an empty ``nvidiasmi_ecc_error_count`` gauge family.

    Despite the ``counter`` in the function name, the family is a gauge.
    Samples are expected to carry ``minor_number`` and ``type`` labels.
    """
    family = GaugeMetricFamily(
        "nvidiasmi_ecc_error_count",
        "count of nvidia ecc error",
        labels=["minor_number", "type"],
    )
    return family
def gen_gpu_mem_util_gauge():
    """Return an empty ``gpu_mem_utilization`` gauge family.

    Samples are expected to carry ``minor_number`` and ``vender`` labels; the
    label spelling is part of the exported metric and is kept as is.
    """
    family = GaugeMetricFamily(
        "gpu_mem_utilization",
        "gpu memory utilization of card",
        labels=["minor_number", "vender"],
    )
    return family
def gen_gpu_memory_leak_counter():
    """Return an empty ``nvidiasmi_memory_leak_count`` gauge family.

    Despite the ``counter`` in the function name, the family is a gauge.
    Samples are expected to carry a single ``minor_number`` label.
    """
    family = GaugeMetricFamily(
        "nvidiasmi_memory_leak_count",
        "count of nvidia memory leak",
        labels=["minor_number"],
    )
    return family
def gen_nvidia_gpu_temperature_gauge():
    """Return an empty ``nvidiasmi_temperature`` gauge family.

    Samples are expected to carry a single ``minor_number`` label.
    """
    family = GaugeMetricFamily(
        "nvidiasmi_temperature",
        "gpu temperature of card",
        labels=["minor_number"],
    )
    return family
def collect(self) -> Iterable[Metric]:
    """Yield PyPy GC time and memory metric families.

    Yields ``pypy_gc_time_seconds_total`` (a counter) followed by
    ``pypy_memory_bytes`` (a gauge labelled by ``state``, ``class`` and
    ``kind``).
    """
    # gc.get_stats() returns a pretty-printer object: str() gives a table and
    # its public fields are already formatted strings (e.g. '4.5MB'). The
    # private ``_s`` attribute holds the same fields as plain integers.
    stats = gc.get_stats(memory_pressure=False)  # type: ignore
    raw = stats._s  # type: ignore

    # Field names only loosely follow the printed table. Mapping, taken from
    # the output of ``gc.get_stats(False)``:
    #
    #   Total memory consumed:
    #     GC used (peak)            -> total_gc_memory, peak_memory
    #       in arenas               -> total_arena_memory
    #       rawmalloced             -> total_rawmalloced_memory
    #       nursery                 -> nursery_size
    #     raw assembler used        -> jit_backend_used
    #     Total                     -> stats.memory_used_sum
    #
    #   Total memory allocated:
    #     GC allocated (peak)       -> total_allocated_memory, peak_allocated_memory
    #       in arenas               -> peak_arena_memory
    #       rawmalloced             -> peak_rawmalloced_memory
    #       nursery                 -> nursery_size
    #     raw assembler allocated   -> jit_backend_allocated
    #     Total                     -> stats.memory_allocated_sum
    #
    #   Total time spent in GC      -> total_gc_time

    gc_time_family = CounterMetricFamily(
        "pypy_gc_time_seconds_total",
        "Total time spent in PyPy GC",
        labels=[],
    )
    # NOTE(review): the division by 1000 suggests total_gc_time is in
    # milliseconds -- confirm against the PyPy documentation.
    gc_time_family.add_metric([], raw.total_gc_time / 1000)
    yield gc_time_family

    memory_family = GaugeMetricFamily(
        "pypy_memory_bytes",
        "Memory tracked by PyPy allocator",
        labels=["state", "class", "kind"],
    )

    # (state, class, kind) -> value, in export order.
    samples = (
        # memory used by JIT assembler
        (("used", "", "jit"), raw.jit_backend_used),
        (("allocated", "", "jit"), raw.jit_backend_allocated),
        # memory used by GCed objects
        (("used", "", "arenas"), raw.total_arena_memory),
        (("allocated", "", "arenas"), raw.peak_arena_memory),
        (("used", "", "rawmalloced"), raw.total_rawmalloced_memory),
        (("allocated", "", "rawmalloced"), raw.peak_rawmalloced_memory),
        (("used", "", "nursery"), raw.nursery_size),
        (("allocated", "", "nursery"), raw.nursery_size),
        # totals
        (("used", "totals", "gc"), raw.total_gc_memory),
        (("allocated", "totals", "gc"), raw.total_allocated_memory),
        (("used", "totals", "gc_peak"), raw.peak_memory),
        (("allocated", "totals", "gc_peak"), raw.peak_allocated_memory),
    )
    for label_values, amount in samples:
        memory_family.add_metric(list(label_values), amount)
    yield memory_family
def collect(self) -> Iterable[Metric]:
    """Yield a ``python_gc_counts`` gauge with one sample per GC generation.

    The ``gen`` label is the generation index, as a string, taken from the
    position of each value in ``gc.get_count()``.
    """
    family = GaugeMetricFamily(
        "python_gc_counts",
        "GC object counts",
        labels=["gen"],
    )
    for generation, object_count in enumerate(gc.get_count()):
        family.add_metric([str(generation)], object_count)
    yield family
def collect(self):
    """Fetch stats from every NetScaler and yield them as metric families.

    Phase one calls ``collect_data`` for each (nsip, entity) pair; a failure
    is printed and that pair is simply left out. Phase two walks
    ``self.metrics`` and, per entity, yields one counter family for each
    ``counters`` entry followed by one gauge family for each ``gauges`` entry.
    Every sample carries the entity's configured labels plus ``nsip``.
    """
    # Phase 1: collect raw stats, keyed by NetScaler IP and then entity name.
    data = {}
    for nsip in self.nsips:
        per_host = data[nsip] = {}
        for entity in self.metrics:
            print('>>> Collecting stats for: %s::%s' % (nsip, entity))
            try:
                per_host[entity] = collect_data(nsip, entity, self.username, self.password, self.secure)
            except Exception as e:
                print('>>> Caught exception while collecting data: ' + str(e))

    # Phase 2: expose the stats. Counters and gauges share the same logic and
    # differ only in the config key, the family class and the error message.
    family_kinds = (
        ('counters', CounterMetricFamily,
         '>>> Caught exception while adding counter %s to %s: %s'),
        ('gauges', GaugeMetricFamily,
         '>>> Caught exception while adding guage %s to %s: %s'),
    )

    for entity_name, entity in self.metrics.items():
        # Each 'labels' entry is indexed as (key in the stats item, label name).
        label_pairs = entity['labels'] if 'labels' in entity else []
        source_keys = [pair[0] for pair in label_pairs]
        label_names = [pair[1] for pair in label_pairs] + ['nsip']

        for config_key, family_cls, error_template in family_kinds:
            for ns_metric_name, prom_metric_name in entity.get(config_key, []):
                family = family_cls(prom_metric_name, ns_metric_name, labels=label_names)
                for nsip in self.nsips:
                    entity_stats = data[nsip].get(entity_name, [])
                    # A single stats object is wrapped so it can be iterated.
                    if type(entity_stats) is not list:
                        entity_stats = [entity_stats]

                    for data_item in entity_stats:
                        label_values = [data_item[key] for key in source_keys] + [nsip]
                        try:
                            family.add_metric(label_values, float(data_item[ns_metric_name]))
                        except Exception as e:
                            print(error_template % (ns_metric_name, entity_name, str(e)))
                yield family