def collect(self):
    """Scrape a Deluge daemon over RPC and yield Prometheus metric families.

    The daemon host is read from the ``DELUGE_HOST`` environment variable
    (default ``127.0.0.1``); port and credentials come from ``self.rpc_port``,
    ``self.rpc_user`` and ``self.rpc_password``.

    Yields, in order:
      * one family per libtorrent session-status metric that has a type,
      * upload/download byte counters split by traffic type,
      * ``deluge_info`` carrying daemon and libtorrent versions as labels,
      * one gauge per numeric/boolean daemon config setting,
      * ``deluge_torrents_by_label`` (only when at least one torrent has a label),
      * ``deluge_torrents`` broken down by state.

    The RPC connection is closed in a ``finally`` block, so it is released
    when the generator is exhausted, when a call fails part-way through, or
    when the consumer closes the generator early.
    """
    deluge_host = os.environ.get('DELUGE_HOST', '127.0.0.1')
    client = DelugeRPCClient(deluge_host, self.rpc_port, self.rpc_user,
                             self.rpc_password)
    client.connect()
    try:
        libtorrent_status_metrics = get_libtorrent_status_metrics_meta()
        libtorrent_status_metric_source_names = [
            x['source'] for x in libtorrent_status_metrics.values()
        ]
        libtorrent_status_metric_values = client.call(
            'core.get_session_status', libtorrent_status_metric_source_names)

        for metric, props in libtorrent_status_metrics.items():
            # Entries without a metric type are fetched but not exported
            # directly.
            if props['type'] is None:
                continue
            value = libtorrent_status_metric_values[props['source']]
            if 'conv' in props:
                value = props['conv'](value)
            yield props['type']('deluge_libtorrent_{}'.format(metric),
                                props['help'], value=value)

        for direction in ['upload', 'download']:
            transfer_metric = CounterMetricFamily(
                'deluge_libtorrent_{}_bytes_total'.format(direction),
                'Total bytes {}ed for all torrents.'.format(direction),
                labels=['type'])
            for traffic_type in ['payload', 'ip_overhead', 'dht', 'tracker']:
                # The session-status result is looked up with a bytes key.
                source_key = 'total_{}_{}'.format(
                    traffic_type, direction).encode('ascii')
                transfer_metric.add_metric(
                    [traffic_type],
                    libtorrent_status_metric_values[source_key])
            yield transfer_metric

        yield new_metric_with_labels_and_value(
            GaugeMetricFamily,
            'deluge_info',
            'Deluge information',
            labels={
                'version':
                client.call('daemon.info').decode('utf-8'),
                'libtorrent_version':
                client.call('core.get_libtorrent_version').decode('utf-8'),
            },
            value=1)

        for key, value in client.call('core.get_config').items():
            # Only settings that can be represented as a number are exported.
            if isinstance(value, (int, float, bool)):
                setting = key.decode('utf-8')
                yield GaugeMetricFamily(
                    'deluge_config_{}'.format(setting),
                    'Value of the deluge config setting {}'.format(setting),
                    value=value)

        torrents_by_state = {
            'downloading': 0,
            'seeding': 0,
            'paused': 0,
            'checking': 0,
            'queued': 0,
            'error': 0,
            # not the prometheus way, but the states above (as defined by
            # deluge) are already overlapping, so sum() over them is already
            # meaningless
            'active': 0,
            'total': 0,
        }
        torrents_by_label = defaultdict(int)
        for torrent in client.core.get_torrents_status({}, [
                b'label', b'state', b'download_payload_rate',
                b'upload_payload_rate'
        ]).values():
            if b'label' in torrent:
                torrents_by_label[torrent[b'label'].decode('utf-8')] += 1
            state = torrent[b'state'].decode('utf-8').lower()
            # A state that is not pre-seeded above is counted under its own
            # name instead of aborting the scrape with a KeyError.
            torrents_by_state[state] = torrents_by_state.get(state, 0) + 1
            torrents_by_state['total'] += 1
            if (torrent[b'download_payload_rate'] > 0
                    or torrent[b'upload_payload_rate'] > 0):
                torrents_by_state['active'] += 1

        if torrents_by_label:
            torrents_by_label_metric = GaugeMetricFamily(
                'deluge_torrents_by_label',
                'The number of torrents for each label assigned to a torrent using the deluge label plugin',
                labels=['label'])
            for label, count in torrents_by_label.items():
                torrents_by_label_metric.add_metric([label], count)
            yield torrents_by_label_metric

        torrents_metric = GaugeMetricFamily(
            'deluge_torrents',
            'The number of torrents in a specific state (note: some states overlap)',
            labels=['state'])
        for state, torrent_count in torrents_by_state.items():
            torrents_metric.add_metric([state], torrent_count)
        yield torrents_metric
    finally:
        # Always release the RPC connection, including on error or when the
        # generator is closed before it is exhausted.
        client.disconnect()
def test_gauge_labels(self):
    """A labelled gauge sample is retrievable by its label set."""
    family = GaugeMetricFamily('g', 'help', labels=['a'])
    family.add_metric(['b'], 2)
    self.custom_collector(family)

    observed = self.registry.get_sample_value('g', {'a': 'b'})
    self.assertEqual(2, observed)
def collect(self):
    """Yield the Airflow task, DAG and scheduler gauge families.

    Families are produced in this order: task status, task duration, task
    failure count, DAG status, DAG run duration, DAG scheduler delay, task
    scheduler delay and number of queued tasks. Durations and delays are the
    ``total_seconds()`` of the difference between two timestamps on each row.
    """
    # ---- Task metrics ----
    task_rows = get_task_state_info()
    status_by_task = GaugeMetricFamily(
        'airflow_task_status',
        'Shows the number of task instances with particular status',
        labels=['dag_id', 'task_id', 'owner', 'status'])
    for row in task_rows:
        # A missing state is exported under the literal label value 'none'.
        label_values = [row.dag_id, row.task_id, row.owners,
                        row.state or 'none']
        status_by_task.add_metric(label_values, row.value)
    yield status_by_task

    duration_by_task = GaugeMetricFamily(
        'airflow_task_duration',
        'Duration of successful tasks in seconds',
        labels=['task_id', 'dag_id', 'execution_date'])
    for row in get_task_duration_info():
        seconds = (row.end_date - row.start_date).total_seconds()
        run_day = str(row.execution_date.date())
        duration_by_task.add_metric([row.task_id, row.dag_id, run_day],
                                    seconds)
    yield duration_by_task

    failures_by_task = GaugeMetricFamily(
        'airflow_task_fail_count',
        'Count of failed tasks',
        labels=['dag_id', 'task_id'])
    for row in get_task_failure_counts():
        failures_by_task.add_metric([row.dag_id, row.task_id], row.count)
    yield failures_by_task

    # ---- Dag metrics ----
    dag_rows = get_dag_state_info()
    status_by_dag = GaugeMetricFamily(
        'airflow_dag_status',
        'Shows the number of dag starts with this status',
        labels=['dag_id', 'owner', 'status'])
    for row in dag_rows:
        status_by_dag.add_metric([row.dag_id, row.owners, row.state],
                                 row.count)
    yield status_by_dag

    duration_by_dag = GaugeMetricFamily(
        'airflow_dag_run_duration',
        'Duration of successful dag_runs in seconds',
        labels=['dag_id'])
    for row in get_dag_duration_info():
        seconds = (row.end_date - row.start_date).total_seconds()
        duration_by_dag.add_metric([row.dag_id], seconds)
    yield duration_by_dag

    # ---- Scheduler metrics ----
    delay_by_dag = GaugeMetricFamily(
        'airflow_dag_scheduler_delay',
        'Airflow DAG scheduling delay',
        labels=['dag_id'])
    for row in get_dag_scheduler_delay():
        seconds = (row.start_date - row.execution_date).total_seconds()
        delay_by_dag.add_metric([row.dag_id], seconds)
    yield delay_by_dag

    delay_by_queue = GaugeMetricFamily(
        'airflow_task_scheduler_delay',
        'Airflow Task scheduling delay',
        labels=['queue'])
    for row in get_task_scheduler_delay():
        seconds = (row.start_date - row.queued_dttm).total_seconds()
        delay_by_queue.add_metric([row.queue], seconds)
    yield delay_by_queue

    queued_gauge = GaugeMetricFamily(
        'airflow_num_queued_tasks',
        'Airflow Number of Queued Tasks',
    )
    queued_total = get_num_queued_tasks()
    # Unlabelled family: a single sample with an empty label list.
    queued_gauge.add_metric([], queued_total)
    yield queued_gauge
def gen_k8s_node_gpu_reserved():
    """Return an empty ``k8s_node_gpu_reserved`` gauge family labelled by ``host_ip``."""
    return GaugeMetricFamily(
        name="k8s_node_gpu_reserved",
        documentation="gpu reserved on k8s node",
        labels=["host_ip"],
    )
def gen_k8s_api_gauge():
    """Return an empty ``k8s_api_server_count`` gauge family labelled by ``error`` and ``host_ip``."""
    return GaugeMetricFamily(
        name="k8s_api_server_count",
        documentation="count of k8s api server",
        labels=["error", "host_ip"],
    )
def collect(self):
    """Yield current-sensor gauges for both power supplies of the appliance.

    Fetches ``/mgmt/status/default/CurrentSensors`` over REST. For every
    entry whose ``Name`` is one of the four known power-supply sensors, two
    gauge families are yielded: the upper critical threshold and the current
    reading, each divided by 1000 and labelled with the appliance name and
    the sensor's ``ReadingStatus``. A final gauge reports how long this
    collection took. Nothing is yielded when the REST helper returns ``''``.
    """
    started = time.time()

    # Perform REST API call to fetch data
    data = call_rest_api('/mgmt/status/default/CurrentSensors', self.ip,
                         self.port, self.session, self.timeout)
    if data == '':
        return

    # Sensor name -> (metric-name fragment, help-text fragment).
    known_sensors = {
        'Power Supply 1 In Current':
        ('power_supply_1_in', 'going into power supply 1'),
        'Power Supply 1 Out Current':
        ('power_supply_1_out', 'going out power supply 1'),
        'Power Supply 2 In Current':
        ('power_supply_2_in', 'going into power supply 2'),
        'Power Supply 2 Out Current':
        ('power_supply_2_out', 'going out power supply 2'),
    }
    sensor_labels = ['appliance', 'readingStatus']

    # Update Prometheus metrics
    for sensor in data['CurrentSensors']:
        match = known_sensors.get(sensor['Name'])
        if match is None:
            continue
        stem, phrase = match

        # Raw values are scaled by 1/1000 to the unit named in the metric
        # (presumably the API reports milliamperes -- TODO confirm).
        threshold = GaugeMetricFamily(
            'mqa_current_sensors_{}_current_upper_critical_threshold_amperes'
            .format(stem),
            'Upper critical threshold for current {}'.format(phrase),
            labels=list(sensor_labels))
        threshold.add_metric([self.appliance, sensor['ReadingStatus']],
                             sensor['UpperCriticalThreshold'] / 1000)
        yield threshold

        reading = GaugeMetricFamily(
            'mqa_current_sensors_{}_current_amperes'.format(stem),
            'Current {}'.format(phrase),
            labels=list(sensor_labels))
        reading.add_metric([self.appliance, sensor['ReadingStatus']],
                           sensor['Value'] / 1000)
        yield reading

    elapsed = GaugeMetricFamily(
        'mqa_exporter_current_sensors_elapsed_time_seconds',
        'Exporter eleapsed time to collect current sensors metrics',
        labels=['appliance'])
    elapsed.add_metric([self.appliance], time.time() - started)
    yield elapsed
def gen_gpu_mem_util_gauge():
    """Return an empty ``nvidiasmi_utilization_memory`` gauge family labelled by ``minor_number``."""
    return GaugeMetricFamily(
        name="nvidiasmi_utilization_memory",
        documentation="gpu memory utilization of card",
        labels=["minor_number"],
    )
def collect_host_info(self) -> dict:
    """Build the z/VM host gauges from a ``host_get_info`` request.

    Sends ``host_get_info`` and reads the ``output`` mapping of the reply.
    Sample reply (``GET /host``)::

        "output": {
            "disk_available": 3057,
            "ipl_time": "IPL at 06/02/17 11:07:10 EDT",
            "vcpus_used": 6,
            "hypervisor_type": "zvm",
            "vcpus": 6,
            "zvm_host": "OPNSTK2",
            "memory_mb": 51200.0,
            "cpu_info": {"cec_model": "2817", "architecture": "s390x"},
            "disk_total": 3623,
            "zcc_userid": "ZCCUID",
            "hypervisor_hostname": "OPNSTK2",
            "hypervisor_version": 640,
            "disk_used": 566,
            "memory_mb_used": 0.0
        }

    Side effect: ``self.host`` is set to the reply's ``hypervisor_hostname``.

    Returns:
        dict mapping each exported reply field name to a gauge family that
        holds one sample labelled with the host name.
    """
    response = self.send_request('host_get_info')

    # (reply field, metric name, help text) -- the reply field doubles as the
    # key of the returned dict.
    gauge_specs = (
        ('vcpus', 'zvm_host_vcpus', 'The virtual CPUs'),
        ('vcpus_used', 'zvm_host_vcpus_used', 'The used vcpus'),
        ('memory_mb', 'zvm_host_memory_mb',
         'The total available size of the memory in MB.'),
        ('memory_mb_used', 'zvm_host_memory_mb_used',
         'The size of used memory in MB.'),
        ('disk_available', 'zvm_host_disk_available',
         'The total available size of the disks in the pool in Gigabytes(G).'),
        ('disk_total', 'zvm_host_disk_total',
         'The total size of the pool in Gigabytes (G).'),
        ('disk_used', 'zvm_host_disk_used',
         'The size of used disks in the pool in Gigabytes(G).'),
    )
    families = {}
    for field, metric_name, help_text in gauge_specs:
        families[field] = GaugeMetricFamily(metric_name, help_text,
                                            labels=['host'])

    host_info = response['output']
    self.host = host_info['hypervisor_hostname']
    for field, family in families.items():
        family.add_metric([self.host], host_info[field])

    # TODO: zvm_host, hypervisor_hostname, hypervisor_version,
    # hypervisor_type, zcc_userid, ipl_time and cpu_info are not exported yet.
    return families
def collect(self):
    """Yield ``python_gc_counts`` with one sample per value of ``gc.get_count()``.

    Each sample is labelled ``gen`` with the value's position (as a string)
    in the tuple returned by ``gc.get_count()``.
    """
    family = GaugeMetricFamily(
        "python_gc_counts",
        "GC object counts",
        labels=["gen"],
    )
    counts = gc.get_count()
    for generation, count in enumerate(counts):
        label = str(generation)
        family.add_metric([label], count)
    yield family
def collect(self):
    """Yield ``polardb_*`` gauges computed from the realtime report tables.

    Runs four aggregate queries through ``mysql_get``: today's totals per
    account table and per creative table, and the totals of one
    ``pday``/``phour`` pair (as returned by ``time_create()``) for the same
    two tables. Each result is paired with its list of key names via
    ``dictpro`` and every resulting key becomes one unlabelled gauge named
    ``polardb_<key>``.
    """
    account_keys = [
        "td_account_cost", "td_account_imp", "td_account_click",
        "td_account_activation", "td_account_register",
        "td_account_conversion", "td_account_retention",
        "td_account_download_completed", "td_account_awaken",
        "td_account_media_kuaishou_aclick",
        "td_account_media_kuaishou_bclick", "td_account_form",
        "td_account_adv_form", "td_account_adv_valid_clue"
    ]
    account_sql = (
        " select SUM(cost) as td_account_cost , SUM(imp) as td_account_imp, "
        "SUM(click) as td_account_click, "
        "SUM(activation) as td_account_activation, "
        "SUM(register) as td_account_register , "
        "SUM(conversion) as td_account_conversion , "
        "SUM(retention) as td_account_retention, "
        "SUM(download_completed) as td_account_download_completed, "
        "SUM(awaken) as td_account_awaken, "
        "SUM(media_kuaishou_aclick) as td_account_media_kuaishou_aclick, "
        "SUM(media_kuaishou_bclick) as td_account_media_kuaishou_bclick, "
        "sum(form) as td_account_form, "
        "sum(adv_form) as td_account_adv_form, "
        "SUM(adv_valid_clue) as td_account_adv_valid_clue "
        "from alphadesk.report_realtime_account "
        "where pday=date_format(now(),'%Y%m%d');")
    account_rows = list(mysql_get(account_sql))

    creative_keys = [
        "td_creative_cost", "td_creative_imp", "td_creative_click",
        "td_creative_activation", "td_creative_register",
        "td_creative_conversion", "td_creative_retention",
        "td_creative_download_completed", "td_creative_awaken",
        "td_creative_media_kuaishou_aclick",
        "td_creative_media_kuaishou_bclick", "td_creative_form",
        "td_creative_adv_form", "td_creative_adv_valid_clue",
        "td_creative_drs_click"
    ]
    creative_sql = (
        " select SUM(cost) as td_creative_cost , "
        "SUM(imp) as td_creative_imp, "
        "SUM(click) as td_creative_click, "
        "SUM(activation) as td_creative_activation, "
        "SUM(register) as td_creative_register , "
        "SUM(conversion) as td_creative_conversion , "
        "SUM(retention) as td_creative_retention, "
        "SUM(download_completed) as td_creative_download_completed, "
        "SUM(awaken) as td_creative_awaken, "
        "SUM(media_kuaishou_aclick) as td_creative_media_kuaishou_aclick, "
        "SUM(media_kuaishou_bclick) as td_creative_media_kuaishou_bclick, "
        "sum(form) as td_creative_form, "
        "sum(adv_form) as td_creative_adv_form, "
        "SUM(adv_valid_clue) as td_creative_adv_valid_clue, "
        "SUM(drs_click) as td_creative_drs_click "
        "from alphadesk.report_realtime_creative "
        "where pday=date_format(now(),'%Y%m%d');")
    creative_rows = list(mysql_get(creative_sql))

    daily_account = dictpro(account_keys, account_rows)
    daily_creative = dictpro(creative_keys, creative_rows)

    # Hourly figures are restricted to the day/hour chosen by time_create().
    pday, phour = time_create()

    hour_account_keys = [
        'hour_account_imp', 'hour_account_cost', 'hour_account_clk'
    ]
    hour_account_sql = (
        " select SUM(imp) as hour_account_imp,"
        "sum(cost) as hour_account_cost,"
        "SUM(click) as hour_account_clk "
        "from alphadesk.report_realtime_account "
        "where pday={} and phour={}; ").format(pday, phour)
    hourly_account = dictpro(hour_account_keys,
                             list(mysql_get(hour_account_sql)))

    hour_creative_keys = [
        'hour_creative_imp', 'hour_creative_cost', 'hour_creative_clk',
        'hour_creative_drs_click'
    ]
    hour_creative_sql = (
        " select SUM(imp) as hour_creative_imp,"
        "sum(cost) as hour_creative_cost,"
        "SUM(click) as hour_creative_clk, "
        "SUM(drs_click) as hour_creative_drs_click "
        "from alphadesk.report_realtime_creative "
        "where pday={} and phour={}; ").format(pday, phour)
    hourly_creative = dictpro(hour_creative_keys,
                              list(mysql_get(hour_creative_sql)))

    # One gauge per key; a key seen again in a later group replaces the
    # earlier gauge while keeping its original position.
    gauges = {}
    for totals in (daily_account, daily_creative, hourly_account,
                   hourly_creative):
        for key in totals:
            gauges[key] = GaugeMetricFamily('polardb_{}'.format(key),
                                            'td_polardb',
                                            value=totals[key])

    for gauge in gauges.values():
        yield gauge
def __init__(self, fa):
    """Keep *fa* on the instance and create the host-to-volume gauge family.

    *fa* is stored unchanged as ``self.fa`` (presumably a FlashArray client
    handle -- confirm against the caller). ``self.map_host_vol`` starts as an
    empty ``purefa_host_volumes_info`` family labelled by ``host`` and
    ``naaid``.
    """
    host_volume_labels = ['host', 'naaid']
    self.fa = fa
    self.map_host_vol = GaugeMetricFamily(
        'purefa_host_volumes_info',
        'FlashArray host volumes connections',
        labels=host_volume_labels,
    )
def metrics_setup_sta(self, metrics):
    """Add the three per-client (station) metric families to *metrics*.

    *metrics* is mutated in place: ``c_sta_rx_bytes`` and ``c_sta_tx_bytes``
    become counter families, ``g_sta_rssi`` a gauge family. All three share
    the labels ``mac``, ``hostname``, ``radio`` and ``essid``.
    """
    station_labels = ('mac', 'hostname', 'radio', 'essid')
    # (dict key, family class, metric name, help text)
    station_families = (
        ('c_sta_rx_bytes', CounterMetricFamily, 'unifi_sta_rx_bytes',
         'Client RX bytes'),
        ('c_sta_tx_bytes', CounterMetricFamily, 'unifi_sta_tx_bytes',
         'Client TX bytes'),
        ('g_sta_rssi', GaugeMetricFamily, 'unifi_sta_rssi',
         'Client signal RSSI'),
    )
    for key, family_cls, metric_name, help_text in station_families:
        metrics[key] = family_cls(metric_name, help_text,
                                  labels=list(station_labels))
def metric_up_gauge(resource: str, succeeded=True):
    """Return the unlabelled ``<resource>_up`` gauge for a fetch outcome.

    Args:
        resource: name used as the metric-name prefix and in the help text.
        succeeded: fetch outcome; exported as ``int(succeeded)``.
    """
    return GaugeMetricFamily(
        name=resource + '_up',
        documentation='Did the {} fetch succeed.'.format(resource),
        value=int(succeeded),
    )
def collect(self):
    """Yield free/total storage metrics for the appliance's three file systems.

    Fetches ``/mgmt/status/default/FilesystemStatus`` over REST and exports,
    for the encrypted, temporary and internal areas, a "free" gauge followed
    by a "total" counter. Each raw value is multiplied by 1000000. A final
    gauge reports how long this collection took. Nothing is yielded when the
    REST helper returns ``''``.
    """
    started = time.time()

    # Perform REST API call to fetch data
    data = call_rest_api('/mgmt/status/default/FilesystemStatus', self.ip,
                         self.port, self.session, self.timeout)
    if data == '':
        return

    # (family class, metric name, help text, FilesystemStatus field)
    # NOTE(review): the *_bytes_total capacities are exported as counters
    # although they are not event counts -- confirm this is intended.
    storage_specs = (
        (GaugeMetricFamily, 'mqa_file_system_encrypted_bytes_free',
         'Free, or unused and available, encrypted storage space on the appliance',
         'FreeEncrypted'),
        (CounterMetricFamily, 'mqa_file_system_encrypted_bytes_total',
         'Total encrypted storage space on the appliance (the maximum capacity)',
         'TotalEncrypted'),
        (GaugeMetricFamily, 'mqa_file_system_temporary_bytes_free',
         'Free, or unused and available, temporary storage space on the appliance',
         'FreeTemporary'),
        (CounterMetricFamily, 'mqa_file_system_temporary_bytes_total',
         'Total temporary storage space on the appliance',
         'TotalTemporary'),
        (GaugeMetricFamily, 'mqa_file_system_internal_bytes_free',
         'Free, or unused and available, internal storage space on the appliance',
         'FreeInternal'),
        (CounterMetricFamily, 'mqa_file_system_internal_bytes_total',
         'Total internal storage space on the appliance',
         'TotalInternal'),
    )

    # Update Prometheus metrics
    for family_cls, metric_name, help_text, field in storage_specs:
        family = family_cls(metric_name, help_text, labels=['appliance'])
        # Scale to the unit in the metric name (presumably the API reports
        # megabytes -- TODO confirm).
        family.add_metric([self.appliance],
                          data['FilesystemStatus'][field] * 1000000)
        yield family

    elapsed = GaugeMetricFamily(
        'mqa_exporter_file_system_elapsed_time_seconds',
        'Exporter eleapsed time to collect file system metrics',
        labels=['appliance'])
    elapsed.add_metric([self.appliance], time.time() - started)
    yield elapsed
def collect(self):
    """Yield DSL line gauges scraped from the router's engineer-mode page.

    The router host is taken from ``sys.argv[1]``. The page
    ``/html/engineer/ro_dsl.htm`` is fetched with the module's cookie jar and
    parsed by ``parse_metric_info``. First ``speedport_state`` is yielded
    (1 when the state is ``"online"``, otherwise 0, with the raw state as the
    ``report`` label). Then, for each two-element field, one family with an
    ``upload`` sample (element 0) and a ``download`` sample (element 1).
    """
    host = sys.argv[1]

    # Engineer-mode pages can be read without a csrf_token; other requests
    # need one, and a valid token can be extracted from index.html.
    page = requests.get(
        "http://{}/html/engineer/ro_dsl.htm".format(host),
        cookies=cookie_jar)
    ds = parse_metric_info(page.text)

    # Only gauges are used: the page exposes absolute error counts that can
    # be scraped but not incremented.
    state_family = GaugeMetricFamily('speedport_state',
                                     'DSL Sync state',
                                     labels=["host", "report"])
    # State; 1=online, 0=anything else
    online = 1 if ds["State"] == "online" else 0
    state_family.add_metric([host, ds["State"]], online)
    yield state_family

    # (metric name, help text, key in the parsed page)
    directional_specs = (
        ('speedport_actual_data_rate_kpbs', 'Actual DSL Sync data rate',
         "ActualDataRate"),
        ('speedport_attainable_data_rate_kpbs',
         'Attainable DSL Sync data rate', "AttainableDataRate"),
        ('speedport_crc_error_count', 'Amount of CRC Errors',
         "CRCerrorcount"),
        ('speedport_fec_error_count', 'Amount of FEC Errors',
         "FECerrorcount"),
        ('speedport_hec_error_count', 'Amount of HEC Errors',
         "HECerrorcount"),
        ('speedport_line_attenuation', 'Line Attenuation',
         "LineAttenuation"),
        ('speedport_snr_margin', 'SNR Margin', "SNRMargin"),
        ('speedport_signal_level', 'Signal Level', "Signal-level"),
    )
    for metric_name, help_text, field in directional_specs:
        family = GaugeMetricFamily(metric_name,
                                   help_text,
                                   labels=["host", "method"])
        for position, method in enumerate(("upload", "download")):
            family.add_metric([host, method],
                              to_float(ds[field][position]))
        yield family
def get_hypervisor_metrics(self):
    """Yield one gauge family per Nova hypervisor statistic.

    Every hypervisor returned by ``self.cloud.list_hypervisors()`` adds one
    sample to each family. ``nova_hypervisor_up`` is a constant-1 metadata
    series carrying id, name, state, status, type, version and CPU details
    as labels; all other families are labelled by ``nova_hypervisor_id``
    only and carry the value of the matching hypervisor attribute.

    All families are fully populated before the first one is yielded.

    Fix: the "vcpus in use" family used to be created under the name
    ``nova_hypervisor_vcpus``, colliding with the "vcpus available" family.
    It is now exported as ``nova_hypervisor_vcpus_used``.
    """
    nova_hypervisor_up = GaugeMetricFamily(
        'nova_hypervisor_up',
        'Metadata about a Nova hypervisor',
        labels=[
            'nova_hypervisor_id',
            'nova_hypervisor_name',
            'nova_hypervisor_up',
            'nova_hypervisor_enabled',
            'nova_hypervisor_type',
            'nova_hypervisor_version',
            'nova_hypervisor_cpu_vendor',
            'nova_hypervisor_cpu_model',
            'nova_hypervisor_cpu_arch',
        ])

    # (metric name, help text, hypervisor attribute holding the value)
    numeric_specs = (
        ('nova_hypervisor_vcpus', 'Number of vcpus available', 'vcpus'),
        ('nova_hypervisor_vcpus_used', 'Number of vcpus in use',
         'vcpus_used'),
        ('nova_hypervisor_running_vms',
         'Number of vms running on this hypervisor', 'running_vms'),
        ('nova_hypervisor_local_disk_size',
         'Amount of local disk available on this hypervisor',
         'local_disk_size'),
        ('nova_hypervisor_local_disk_used',
         'Amount of local disk used on this hypervisor', 'local_disk_used'),
        ('nova_hypervisor_local_disk_free',
         'Amount of local disk free on this hypervisor', 'local_disk_free'),
        ('nova_hypervisor_memory_size',
         'Amount of memory available on this hypervisor', 'memory_size'),
        ('nova_hypervisor_memory_used',
         'Amount of memory used on this hypervisor', 'memory_used'),
        ('nova_hypervisor_memory_free',
         'Amount of memory free on this hypervisor', 'memory_free'),
        ('nova_hypervisor_current_workload', 'Number of hypervisor tasks',
         'current_workload'),
    )
    numeric_families = [
        (GaugeMetricFamily(metric_name, help_text,
                           labels=['nova_hypervisor_id']), attribute)
        for metric_name, help_text, attribute in numeric_specs
    ]

    for hv in self.cloud.list_hypervisors():
        LOG.debug('gathering metrics for hypervisor %s (%s)', hv.name,
                  hv.id)

        # cpu_info is used directly when it is a dict, otherwise it is
        # decoded as JSON.
        if isinstance(hv.cpu_info, dict):
            cpu = hv.cpu_info
        else:
            cpu = json.loads(hv.cpu_info)

        hvid = str(hv.id)
        nova_hypervisor_up.add_metric([
            hvid,
            hv.name,
            'true' if hv.state == 'up' else 'false',
            'true' if hv.status == 'enabled' else 'false',
            hv.hypervisor_type,
            str(hv.hypervisor_version),
            cpu['vendor'],
            cpu['model'],
            cpu['arch'],
        ], 1.0)

        for family, attribute in numeric_families:
            family.add_metric([hvid], getattr(hv, attribute))

    yield nova_hypervisor_up
    for family, _attribute in numeric_families:
        yield family
def scrape():
    """Rebuild all pub push metric families and publish them in one update.

    Sets the module-level ``START`` to the current UTC date rendered with
    ``'%Y-%m-%d %H:%M:%S'`` (only the date part of "now" is kept before
    formatting). Then builds the total, error, in-progress, waiting and
    duration-histogram families and stores them in the module-level
    ``metrics`` mapping with a single ``update`` call.
    """
    global START
    START = datetime.datetime.utcnow().date().strftime('%Y-%m-%d %H:%M:%S')

    def _count_into(family, push_set):
        # Add one sample per (value, labels) pair reported for push_set.
        for value, labels in pub_pushes_total(push_set):
            family.add_metric(labels, value)
        return family

    recent = retrieve_recent_pub_pushes()

    totals = _count_into(
        CounterMetricFamily('pub_pushes_total',
                            'Count of all pub pushes',
                            labels=PUSH_LABELS),
        recent)

    errors = CounterMetricFamily('pub_push_errors_total',
                                 'Count of all pub push errors',
                                 labels=PUSH_LABELS)
    _count_into(errors, only(recent, states=error_states))

    in_progress = GaugeMetricFamily(
        'pub_in_progress_pushes',
        'Count of all in-progress pub pushes',
        labels=PUSH_LABELS,
    )
    _count_into(in_progress, retrieve_open_pub_pushes())

    waiting = GaugeMetricFamily(
        'pub_waiting_pushes',
        'Count of all waiting, unscheduled pub pushes',
        labels=PUSH_LABELS,
    )
    _count_into(waiting, retrieve_waiting_pub_pushes())

    durations = HistogramMetricFamily(
        'pub_push_duration_seconds',
        'Histogram of pub push durations',
        labels=PUSH_LABELS,
    )
    for buckets, duration_sum, labels in pub_push_duration_seconds(recent):
        durations.add_metric(labels, buckets, sum_value=duration_sum)

    # Swap everything in with one update call so the Expositor does not
    # observe a partially rebuilt set of families.
    fresh = {
        'pub_pushes_total': totals,
        'pub_push_errors_total': errors,
        'pub_in_progress_pushes': in_progress,
        'pub_waiting_pushes': waiting,
        'pub_push_duration_seconds': durations,
    }
    metrics.update(fresh)
def gen_pai_pod_gauge():
    """Return an empty ``pai_pod_count`` gauge family with the pod identity and condition labels."""
    pod_labels = [
        "service_name",
        "name",
        "namespace",
        "phase",
        "host_ip",
        "initialized",
        "pod_scheduled",
        "ready",
    ]
    return GaugeMetricFamily(
        name="pai_pod_count",
        documentation="count of pai pod",
        labels=pod_labels,
    )
def gen_docker_daemon_counter():
    """Return an empty ``docker_daemon_count`` family labelled by ``error``.

    Despite the function name, the family is a gauge.
    """
    return GaugeMetricFamily(
        name="docker_daemon_count",
        documentation="count of docker daemon",
        labels=["error"],
    )
def gen_pai_job_pod_gauge():
    """Return an empty ``pai_job_pod_count`` gauge family with the job-pod identity and condition labels."""
    job_pod_labels = [
        "job_name",
        "name",
        "phase",
        "host_ip",
        "initialized",
        "pod_bound",
        "pod_scheduled",
        "ready",
    ]
    return GaugeMetricFamily(
        name="pai_job_pod_count",
        documentation="count of pai job pod",
        labels=job_pod_labels,
    )
def gen_k8s_node_gpu_available():
    """Return an empty ``k8s_node_gpu_available`` gauge family labelled by ``host_ip``."""
    return GaugeMetricFamily(
        name="k8s_node_gpu_available",
        documentation="gpu available on k8s node",
        labels=["host_ip"],
    )
def gen_pai_container_gauge():
    """Return an empty ``pai_container_count`` gauge family with the container identity and state labels."""
    container_labels = [
        "service_name",
        "pod_name",
        "name",
        "namespace",
        "state",
        "host_ip",
        "ready",
    ]
    return GaugeMetricFamily(
        name="pai_container_count",
        documentation="count of container pod",
        labels=container_labels,
    )
def gen_k8s_node_gpu_total():
    """Return an empty ``k8s_node_gpu_total`` gauge family labelled by ``host_ip``."""
    return GaugeMetricFamily(
        name="k8s_node_gpu_total",
        documentation="gpu total on k8s node",
        labels=["host_ip"],
    )
def gen_pai_node_gauge():
    """Return an empty ``pai_node_count`` gauge family with the node name and condition labels."""
    node_labels = [
        "name",
        "disk_pressure",
        "memory_pressure",
        "out_of_disk",
        "ready",
        "unschedulable",
    ]
    return GaugeMetricFamily(
        name="pai_node_count",
        documentation="count of pai node",
        labels=node_labels,
    )
def test_gauge(self):
    """An unlabelled gauge built with ``value=`` is retrievable with an empty label set."""
    family = GaugeMetricFamily('g', 'help', value=1)
    self.custom_collector(family)

    observed = self.registry.get_sample_value('g', {})
    self.assertEqual(1, observed)
def collect(self):
    """Yield machine-wide and per-process resource gauges.

    Machine-wide: CPU, memory and root-disk usage percentages from psutil,
    labelled with the local host name.

    Per process: for every name in ``self.config['check_processes']`` the
    matching processes are looked up with ``get_pid``, one ``CollectThread``
    per process fills a shared ``metrics`` dict, and after all threads have
    been joined each entry yields a count gauge followed (when the entry is
    non-empty) by running-time, CPU, memory and thread-count gauges.

    Any exception is caught and printed as ``1 <error>``; collection stops at
    that point without re-raising (best-effort behaviour kept from the
    original).

    The body uses only syntax that is valid on both Python 2 and Python 3;
    the previous ``print`` statements, ``dict.iteritems()`` and
    ``except Exception, err`` could not be compiled by Python 3.
    """
    try:
        hostname = socket.gethostname()

        # allCpu
        allCpu = GaugeMetricFamily('offline_machine_cpu_percentage',
                                   'machine cpu percentage',
                                   labels=['host'])
        allCpu.add_metric([hostname], value=psutil.cpu_percent())
        yield allCpu

        # all mem
        allMem = GaugeMetricFamily('offline_machine_mem_percentage',
                                   'machine mem percentage',
                                   labels=['host'])
        allMem.add_metric([hostname], value=psutil.virtual_memory().percent)
        yield allMem

        # all disk
        allDisk = GaugeMetricFamily('offline_machine_disk_percentage',
                                    'machine disk percentage',
                                    labels=['host'])
        allDisk.add_metric([hostname], value=psutil.disk_usage('/').percent)
        yield allDisk

        process_names = self.config['check_processes']
        for process_name in process_names:
            print('process_name = %s ' % process_name)
            allProcess = get_pid(process_name)

            # Each worker writes its result into this shared dict; results
            # are only read after every worker has been joined.
            metrics = {}
            ThreadList = [
                CollectThread(str(i), value['pid'], metrics)
                for i, value in enumerate(allProcess)
            ]
            for t in ThreadList:
                t.start()
            for t in ThreadList:
                t.join()

            snake_case = process_name.lower()
            process_labels = ['pid', 'exe', 'cmd', 'host']
            for process_metrics in metrics.values():
                # NOTE(review): this help text repeats the running-time
                # wording although the sample is a constant 1 per process.
                process_count = GaugeMetricFamily(
                    'offline_process_count',
                    snake_case + ' Total Running time in seconds.',
                    labels=list(process_labels))
                process_count.add_metric([
                    process_metrics['pid'], process_name,
                    process_metrics['cmdline'], hostname
                ], value=1)
                yield process_count

                if not process_metrics:
                    continue

                label_values = [
                    process_metrics['pid'], process_name,
                    process_metrics['cmdline'], hostname
                ]

                # The exported value is the entry's 'create_time' field.
                runningTime = GaugeMetricFamily(
                    'offline_process_running_time_seconds_total',
                    snake_case + ' Total Running time in seconds.',
                    labels=list(process_labels))
                runningTime.add_metric(list(label_values),
                                       value=process_metrics['create_time'])
                yield runningTime

                # cpu
                cpu = GaugeMetricFamily('offline_process_cpu_percentage',
                                        snake_case + ' CPU Percentage.',
                                        labels=list(process_labels))
                cpu.add_metric(list(label_values),
                               value=process_metrics['cpu_percent'])
                yield cpu

                # memory percentage
                mempersent = GaugeMetricFamily(
                    'offline_process_mem_percentage',
                    snake_case + ' mem Percentage.',
                    labels=list(process_labels))
                mempersent.add_metric(
                    list(label_values),
                    value=process_metrics['memory_percent'])
                yield mempersent

                threadCount = GaugeMetricFamily(
                    'offline_process_threads_number',
                    snake_case + ' Total Number of Threads.',
                    labels=list(process_labels))
                threadCount.add_metric(list(label_values),
                                       value=process_metrics['num_threads'])
                yield threadCount
    except Exception as err:
        # Best-effort collector: report the failure and stop yielding.
        # Formatted so the output matches the former `print 1, err`.
        print('1 %s' % (err,))
def to_metric(self, desc, tag_values, agg_data):
    """Build the Prometheus metric family for one OpenCensus aggregation.

    :type desc: dict
    :param desc: View description; the ``'name'``, ``'documentation'`` and
        ``'labels'`` entries are read.

    :type tag_values: tuple of
        :class:`~opencensus.tags.tag_value.TagValue`
    :param tag_values: Label values, one per entry of ``desc['labels']``.
        Falsy values are exported as the empty string.

    :type agg_data: object of
        :class:`~opencensus.stats.aggregation_data.AggregationData`
    :param agg_data: Aggregated data to convert into a Prometheus sample.

    :rtype: :class:`~prometheus_client.core.CounterMetricFamily` or
            :class:`~prometheus_client.core.HistogramMetricFamily` or
            :class:`~prometheus_client.core.UnknownMetricFamily` or
            :class:`~prometheus_client.core.GaugeMetricFamily`
    :returns: Count -> counter, distribution -> histogram, sum -> unknown,
        last value -> gauge.

    :raises ValueError: if ``agg_data`` is none of the supported types.
    """
    name = desc['name']
    documentation = desc['documentation']
    label_names = desc['labels']

    assert (len(tag_values) == len(label_names))

    # Prometheus only accepts string label values, so None (or any other
    # falsy tag value) is mapped to "". See
    # https://github.com/census-instrumentation/opencensus-python/issues/480
    label_values = [tv or "" for tv in tag_values]

    if isinstance(agg_data, aggregation_data_module.CountAggregationData):
        counter = CounterMetricFamily(name=name,
                                      documentation=documentation,
                                      labels=label_names)
        counter.add_metric(labels=label_values, value=agg_data.count_data)
        return counter

    if isinstance(agg_data,
                  aggregation_data_module.DistributionAggregationData):
        bounds = agg_data.bounds
        assert (bounds == sorted(bounds))

        # Each bucket is a [upper-bound-as-string, count] pair, listed in
        # ascending bound order, and the counts must be cumulative.
        running_total = 0
        histogram_buckets = []
        for index, upper_bound in enumerate(bounds):
            running_total += agg_data.counts_per_bucket[index]
            histogram_buckets.append([str(upper_bound), running_total])

        # OpenCensus bounds carry no +Inf entry, but Prometheus needs one;
        # it holds the overall count.
        histogram_buckets.append(["+Inf", agg_data.count_data])

        histogram = HistogramMetricFamily(name=name,
                                          documentation=documentation,
                                          labels=label_names)
        histogram.add_metric(
            labels=label_values,
            buckets=histogram_buckets,
            sum_value=agg_data.sum,
        )
        return histogram

    if isinstance(agg_data, aggregation_data_module.SumAggregationData):
        untyped = UnknownMetricFamily(name=name,
                                      documentation=documentation,
                                      labels=label_names)
        untyped.add_metric(labels=label_values, value=agg_data.sum_data)
        return untyped

    if isinstance(agg_data,
                  aggregation_data_module.LastValueAggregationData):
        gauge = GaugeMetricFamily(name=name,
                                  documentation=documentation,
                                  labels=label_names)
        gauge.add_metric(labels=label_values, value=agg_data.value)
        return gauge

    raise ValueError("unsupported aggregation type %s" % type(agg_data))
def collect(self):
    """Collect metrics.

    Yields, in order, gauge families for: task status counts, task
    durations, task failure counts, DAG status counts, DAG run durations,
    DAG scheduling delay, configured XCom parameters, task scheduling delay
    and the number of queued tasks.

    For the DAG-level and XCom families the label set is extended per DAG
    with whatever ``get_dag_labels`` returns for that DAG.
    """
    # Task metrics
    task_info = get_task_state_info()
    t_state = GaugeMetricFamily(
        "airflow_task_status",
        "Shows the number of task instances with particular status",
        labels=["dag_id", "task_id", "owner", "status"],
    )
    for task in task_info:
        t_state.add_metric(
            [task.dag_id, task.task_id, task.owners, task.state or "none"],
            task.value,
        )
    yield t_state

    task_duration = GaugeMetricFamily(
        "airflow_task_duration",
        "Duration of successful tasks in seconds",
        labels=["task_id", "dag_id", "execution_date"],
    )
    for task in get_task_duration_info():
        task_duration_value = (task.end_date - task.start_date).total_seconds()
        task_duration.add_metric(
            [task.task_id, task.dag_id, str(task.execution_date.date())],
            task_duration_value,
        )
    yield task_duration

    task_failure_count = GaugeMetricFamily(
        "airflow_task_fail_count",
        "Count of failed tasks",
        labels=["dag_id", "task_id"],
    )
    for task in get_task_failure_counts():
        task_failure_count.add_metric([task.dag_id, task.task_id], task.count)
    yield task_failure_count

    # Dag Metrics
    dag_info = get_dag_state_info()
    labels = ["dag_id", "owner", "status"]
    d_state = GaugeMetricFamily(
        "airflow_dag_status",
        "Shows the number of dag starts with this status",
        labels=labels,
    )
    for dag in dag_info:
        k, v = get_dag_labels(dag.dag_id)
        # The label names are swapped right before each add_metric call,
        # presumably so the sample is keyed by this DAG's own extra labels.
        # NOTE(review): relies on a private prometheus_client attribute.
        d_state._labelnames = labels + k
        d_state.add_metric([dag.dag_id, dag.owners, dag.state] + v, dag.count)
    yield d_state

    labels = ["dag_id"]
    dag_duration = GaugeMetricFamily(
        "airflow_dag_run_duration",
        "Duration of successful dag_runs in seconds",
        labels=labels,
    )
    for dag in get_dag_duration_info():
        k, v = get_dag_labels(dag.dag_id)
        dag_duration._labelnames = labels + k
        dag_duration_value = (dag.end_date - dag.start_date).total_seconds()
        dag_duration.add_metric([dag.dag_id] + v, dag_duration_value)
    yield dag_duration

    # Scheduler Metrics
    labels = ["dag_id"]
    dag_scheduler_delay = GaugeMetricFamily(
        "airflow_dag_scheduler_delay",
        "Airflow DAG scheduling delay",
        labels=labels,
    )
    for dag in get_dag_scheduler_delay():
        k, v = get_dag_labels(dag.dag_id)
        dag_scheduler_delay._labelnames = labels + k
        dag_scheduling_delay_value = (dag.start_date - dag.execution_date).total_seconds()
        dag_scheduler_delay.add_metric([dag.dag_id] + v, dag_scheduling_delay_value)
    yield dag_scheduler_delay

    # XCOM parameters
    labels = ["dag_id", "task_id"]
    xcom_params = GaugeMetricFamily(
        "airflow_xcom_parameter",
        "Airflow Xcom Parameter",
        labels=labels,
    )
    xcom_config = load_xcom_config()
    for tasks in xcom_config.get("xcom_params", []):
        for param in get_xcom_params(tasks["task_id"]):
            xcom_value = extract_xcom_parameter(param.value)
            if tasks["key"] not in xcom_value:
                continue
            # Look up the extra labels for the DAG this XCom row belongs to.
            # The previous code used `task.dag_id`, i.e. whatever row an
            # earlier, unrelated loop happened to finish on (or an unbound
            # name when those loops were empty).
            k, v = get_dag_labels(param.dag_id)
            xcom_params._labelnames = labels + k
            xcom_params.add_metric(
                [param.dag_id, param.task_id] + v, xcom_value[tasks["key"]]
            )
    yield xcom_params

    task_scheduler_delay = GaugeMetricFamily(
        "airflow_task_scheduler_delay",
        "Airflow Task scheduling delay",
        labels=["queue"],
    )
    for task in get_task_scheduler_delay():
        task_scheduling_delay_value = (task.start_date - task.queued_dttm).total_seconds()
        task_scheduler_delay.add_metric([task.queue], task_scheduling_delay_value)
    yield task_scheduler_delay

    num_queued_tasks_metric = GaugeMetricFamily(
        "airflow_num_queued_tasks",
        "Airflow Number of Queued Tasks",
    )
    num_queued_tasks = get_num_queued_tasks()
    num_queued_tasks_metric.add_metric([], num_queued_tasks)
    yield num_queued_tasks_metric
def add_gauge(self, name, desc, labels):
    """Create a gauge family and store it in ``self.gauges`` under ``name``.

    ``desc`` is passed as the family's documentation string and ``labels``
    as its label names.  An existing entry with the same name is replaced.
    """
    gauge = GaugeMetricFamily(name, desc, labels=labels)
    self.gauges[name] = gauge
def collect(self):
    """Fetch stats from every NetScaler and yield them as Prometheus families.

    Phase 1 calls ``collect_data`` once per (nsip, entity) pair; failures are
    logged and leave that pair without data.  Phase 2 walks ``self.metrics``
    and, per entity, yields one ``CounterMetricFamily`` for each entry under
    ``'counters'`` followed by one ``GaugeMetricFamily`` for each entry under
    ``'gauges'``.  Every sample carries the entity's configured labels (if
    any) plus an ``nsip`` label.
    """
    data = {}
    for nsip in self.nsips:
        data[nsip] = {}
        for entity in self.metrics.keys():
            logger.info('Collecting metric %s for %s' % (entity, nsip))
            try:
                data[nsip][entity] = collect_data(nsip, entity, self.username, self.password, self.protocol, self.nitro_timeout)
            except Exception as e:
                logger.warning('Could not collect metric: ' + str(e))

    # (config section, family class, capitalised kind, lower-case kind).
    # Counters and gauges are processed identically apart from the family
    # class and the wording of the log messages.
    family_kinds = (
        ('counters', CounterMetricFamily, 'Counter', 'counter'),
        ('gauges', GaugeMetricFamily, 'Gauge', 'gauge'),
    )
    in_kubernetes = os.environ.get('KUBERNETES_SERVICE_HOST') is not None

    # Add labels to metrics and provide to Prometheus
    log_prefix_match = True
    for entity_name, entity in self.metrics.items():
        has_labels = 'labels' in entity
        label_pairs = entity['labels'] if has_labels else []
        # Each pair is (key in the NetScaler stat item, Prometheus label name).
        label_keys = [pair[0] for pair in label_pairs]
        label_names = [pair[1] for pair in label_pairs]
        label_names.append('nsip')

        for section, family_cls, kind_title, kind in family_kinds:
            for ns_metric_name, prom_metric_name in entity.get(section, []):
                family = family_cls(prom_metric_name, ns_metric_name, labels=label_names)
                for nsip in self.nsips:
                    entity_stats = data[nsip].get(entity_name, [])
                    # A single stats object is treated like a one-item list.
                    if not isinstance(entity_stats, list):
                        entity_stats = [entity_stats]

                    for data_item in entity_stats:
                        if not data_item:
                            continue

                        if ns_metric_name not in data_item:
                            logger.warning('%s stats for %s not enabled in netscalar %s, so could not add to %s'
                                           % (kind_title, ns_metric_name, nsip, entity_name))
                            # Stop scanning this NetScaler's items for this metric.
                            break

                        if has_labels:
                            label_values = [data_item[key] for key in label_keys]
                            if in_kubernetes and entity_name == "lbvserver":
                                prefix_match = update_lbvs_label(self.k8s_cic_prefix, label_values, ns_metric_name, log_prefix_match)
                                # Once a mismatch has been seen, later calls
                                # receive False for the rest of the scrape.
                                if not prefix_match:
                                    log_prefix_match = False
                            label_values.append(nsip)
                        else:
                            label_values = [nsip]

                        try:
                            family.add_metric(label_values, float(data_item[ns_metric_name]))
                        except Exception as e:
                            # Name the real metric kind; the gauge branch
                            # used to report itself as "counter".
                            logger.error('Caught exception while adding %s %s to %s: %s'
                                         % (kind, ns_metric_name, entity_name, str(e)))
                yield family