def collect_tcam(self):
    """Yield TCAM usage and capacity gauges from 'show hardware capacity'.

    Entries missing any expected key are logged and skipped; if the
    switch command returns nothing, no metrics are produced.
    """
    tcam_data = self.switch_command(command='show hardware capacity')
    if not tcam_data:
        return
    usage = GaugeMetricFamily('arista_tcam_used', 'TCAM Usage Data')
    capacity = GaugeMetricFamily('arista_tcam_total', 'TCAM Capacity')
    for table in tcam_data['result'][0]['tables']:
        try:
            sample_labels = {
                'table': table['table'],
                'chip': table['chip'],
                'feature': table['feature'],
            }
            logging.debug((f'Adding: table={table["table"]} '
                           f'value={table["used"]} '
                           f'labels={sample_labels}'))
            usage.add_sample('arista_tcam_used',
                             value=table['used'],
                             labels=sample_labels)
            capacity.add_sample('arista_tcam_total',
                                value=table['maxLimit'],
                                labels=sample_labels)
        except KeyError:
            # Tolerate partially-populated entries rather than abort the scrape.
            logging.error('KeyError in switch_tcam entries')
            continue
    yield capacity
    yield usage
def collect(self):
    """Expose daily AWS Reserved Instance figures as one gauge family.

    Each key of getAWSRIMetrics() becomes a sample labelled by
    'ri_metric'.
    """
    family = GaugeMetricFamily('reservation_utilization',
                               'Daily Reserved Instance Data',
                               labels=["ri_metric"])
    ri_data = getAWSRIMetrics()
    for name in ri_data:
        family.add_sample('reservation_utilization',
                          value=ri_data[name],
                          labels={'ri_metric': name})
    yield family
def collect(self):
    """Collect all bearerbox metrics, timing the whole scrape.

    Yields an availability gauge first; if the bearerbox status page
    cannot be fetched/parsed, the generator stops after that gauge.
    """
    # bearerbox server status
    metric = GaugeMetricFamily('bearerbox_up',
                               'Could the bearerbox server be reached')
    start = time()
    response = self.parse_kannel_status()
    if response is None:
        metric.add_sample('bearerbox_up', value=0, labels={})
        yield metric
        # Fix: a bare return ends the generator cleanly. The previous
        # 'return []' only set an unused StopIteration value and forced
        # the pylint inconsistent-return-statements disable.
        return
    metric.add_sample('bearerbox_up', value=1, labels={})
    yield metric

    # Version info
    version = bearerbox_version(response['gateway']['version'])
    metric = GaugeMetricFamily('bearerbox_build_info',
                               'Kannel bearerbox version info')
    metric.add_sample('bearerbox_build_info',
                      value=1,
                      labels={'version': version})
    yield metric

    # Gauge for the bearerbox uptime, in seconds
    uptime = uptime_to_secs(response['gateway']['status'])
    metric = GaugeMetricFamily('bearerbox_uptime_seconds',
                               'Current uptime in seconds (*)')
    metric.add_sample('bearerbox_uptime_seconds', value=uptime, labels={})
    yield metric

    # WDP, SMS & DLR metrics
    yield from self.collect_msg_stats(response['gateway']).values()

    # Box metrics
    yield from self.collect_box_stats(response['gateway']['boxes']).values()

    # SMSC metrics
    yield from self.collect_smsc_stats(
        response['gateway']['smscs']['count'],
        response['gateway']['smscs']['smsc']).values()

    duration = time() - start
    metric = GaugeMetricFamily(
        'bearerbox_scrape_duration_seconds',
        'Bearerbox metrics scrape duration in seconds (*)')
    metric.add_sample('bearerbox_scrape_duration_seconds',
                      value=duration,
                      labels={})
    yield metric
def test_duplicate_timestamps(self):
    """Only the first sample for a duplicate (labels, timestamp) pair survives."""
    exposition = """# TYPE a gauge
# HELP a help
a{a="1",foo="bar"} 1 0.0000000000
a{a="1",foo="bar"} 2 0.0000000001
a{a="1",foo="bar"} 3 0.0000000010
a{a="2",foo="bar"} 4 0.0000000000
a{a="2",foo="bar"} 5 0.0000000001
# EOF
"""
    families = text_string_to_metric_families(exposition)
    expected = GaugeMetricFamily("a", "help")
    for labels, value, ts in (
            ({"a": "1", "foo": "bar"}, 1, Timestamp(0, 0)),
            ({"a": "1", "foo": "bar"}, 3, Timestamp(0, 1)),
            ({"a": "2", "foo": "bar"}, 4, Timestamp(0, 0))):
        expected.add_sample("a", labels, value, ts)
    self.assertEqual([expected], list(families))
def collect(self):
    """Yield Stellar horizon metrics built from module-level snapshots:
    base counters, per-asset payment stats and large-native-payment data.
    """
    # NOTE(review): passing a dict as `labels` registers only the dict's
    # keys as label names and attaches no sample here -- confirm intent.
    yield SummaryMetricFamily('summary',
                              'This is simple summary',
                              labels={'name': 'horizon.stellar.org'})
    log.info('current_data.items(): %s' % current_data.items())
    for k, v in current_data.items():
        yield CounterMetricFamily(k,
                                  'stellar base metric values',
                                  value=float(v))
    log.info('current_payment_detail.items(): %s' %
             current_payment_detail.items())
    for asset, asset_data in current_payment_detail.items():
        summ = CounterMetricFamily('sum_payment',
                                   'stellar payment metric values',
                                   labels=['sum_payment'])
        # Bug fix: add_metric() expects a *list* of label values; a bare
        # string would be iterated character by character.
        summ.add_metric([asset], asset_data['sum'])
        yield summ
        # NOTE(review): one 'nb_payment' family is yielded per asset, which
        # duplicates the metric name when several assets exist -- TODO confirm.
        yield CounterMetricFamily('nb_payment',
                                  'stellar payment metric values',
                                  value=float(asset_data['nm']))
    metric = GaugeMetricFamily(
        'large_native_payment_detail',
        'large native stellar payment metric values',
        value=7)
    for from_addr, amount_by_dest in \
            current_large_native_payment_detail.items():
        for to_addr, amount in amount_by_dest.items():
            metric.add_sample('sum_large_native_payment',
                              value=amount,
                              labels={
                                  'from_addr': from_addr,
                                  'to_addr': to_addr
                              })
    yield metric
def collect(self):
    """Scrape the Kannel bearerbox status page and yield all metrics.

    Yields, in order: availability, build info, uptime, WDP/SMS/DLR
    counters and queues, box connection/queue/uptime gauges, and SMSC
    statistics (aggregated per smsc-id unless filtering is enabled).
    """
    # bearerbox server status
    metric = GaugeMetricFamily('bearerbox_up',
                               'Could the bearerbox server be reached')
    response = self.parse_kannel_status()
    if response is None:
        metric.add_sample('bearerbox_up', value=0, labels={})
        yield metric
        # NOTE(review): 'return []' in a generator only sets an unused
        # StopIteration value; a bare 'return' would be equivalent.
        return []
    metric.add_sample('bearerbox_up', value=1, labels={})
    yield metric
    # Version info
    version = bearerbox_version(response['gateway']['version'])
    metric = GaugeMetricFamily('bearerbox_build_info',
                               'Kannel bearerbox version info')
    metric.add_sample('bearerbox_build_info',
                      value=1,
                      labels={'version': version})
    yield metric
    # Gauge for the bearerbox uptime, in seconds
    uptime = uptime_to_secs(response['gateway']['status'])
    metric = GaugeMetricFamily('bearerbox_uptime_seconds',
                               'Current uptime in seconds (*)')
    metric.add_sample('bearerbox_uptime_seconds', value=uptime, labels={})
    yield metric
    # WDP, SMS & DLR metrics
    message_type = ['sms', 'dlr']
    if self._collect_wdp is True:
        # WDP stats are optional and prepended so ordering stays stable.
        message_type = ['wdp'] + message_type
    # NOTE: 'type' shadows the builtin; kept as-is (comments-only edit).
    for type in message_type:
        for k, v in response['gateway'][type].items():
            if isinstance(v, dict):
                # Nested dicts hold per-state counts; 'total' becomes a
                # counter, everything else a queue gauge.
                for k2, v2 in v.items():
                    metric_name = 'bearerbox_{0}_{1}_{2}'.format(
                        type, k, k2)
                    if k2 == 'total':
                        metric_help = 'Total number of {0} {1}'.format(
                            type.upper(), k)
                        metric = CounterMetricFamily(
                            metric_name, metric_help)
                    else:
                        metric_help = 'Number of {0} {1} in queue'.format(
                            k, type.upper())
                        metric = GaugeMetricFamily(metric_name, metric_help)
                    metric.add_sample(metric_name, value=int(v2), labels={})
                    yield metric
            elif k not in ['inbound', 'outbound']:
                metric_name = 'bearerbox_{0}_{1}'.format(type, k)
                metric_value = v
                metric_labels = {}
                # NOTE(review): if k matches none of the branches below,
                # 'metric_help' is unbound or stale from a previous
                # iteration -- TODO confirm the status page's key set.
                if type == 'sms' and k == 'storesize':
                    metric_help = 'Number of SMS in storesize'
                elif type == 'dlr':
                    if k == 'queued':
                        metric_help = 'Number of DLRs in queue'
                    elif k == 'storage':
                        # Storage backend is exposed as a label on a
                        # constant-1 gauge rather than as a value.
                        metric_help = 'DLR storage type info'
                        metric_value = 1
                        metric_labels = {'storage': v}
                metric = GaugeMetricFamily(metric_name, metric_help)
                metric.add_sample(metric_name,
                                  value=int(metric_value),
                                  labels=metric_labels)
                yield metric
    # Box metrics
    box_connections = {b: 0 for b in self._box_connections}
    box_details = {}
    metric_box_connections = GaugeMetricFamily(
        'bearerbox_box_connections', 'Number of box connections')
    metric_box_queue = GaugeMetricFamily(
        'bearerbox_box_queue', 'Number of messages in box queue')
    if self._collect_box_uptime is True:
        metric_box_uptime = GaugeMetricFamily(
            'bearerbox_box_uptime_seconds', 'Box uptime in seconds (*)')
    if response['gateway']['boxes'] != '':
        # when there's only one box connected on the gateway
        # xmltodict returns an OrderedDict instead of a list of OrderedDicts
        if not isinstance(response['gateway']['boxes']['box'], list):
            response['gateway']['boxes']['box'] = [
                response['gateway']['boxes']['box']
            ]
        for box in response['gateway']['boxes']['box']:
            if box['type'] in box_connections.keys():
                box_connections[box['type']] += 1
            else:
                box_connections[box['type']] = 1
            # some type of boxes (e.g wapbox) don't have IDs.
            if 'id' not in box.keys():
                box['id'] = ""
            # (type, id, IP) identifies a box for aggregation below.
            tuplkey = (box['type'], box['id'], box['IP'])
            # some type of boxs (e.g wapbox) don't have queues.
            if 'queue' in box.keys():
                if tuplkey in box_details.keys():
                    box_details[tuplkey]['queue'] += int(box['queue'])
                else:
                    box_details[tuplkey] = {}
                    box_details[tuplkey]['queue'] = int(box['queue'])
            # collect box uptime metrics
            # In case of multiple boxes with same type, id and host.
            # Only the uptime of the first occurence will be exposed
            # in order to avoid duplicates.
            if self._collect_box_uptime is True:
                if tuplkey in box_details.keys():
                    if 'uptime' not in box_details[tuplkey].keys():
                        box_details[tuplkey]['uptime'] = uptime_to_secs(
                            box['status'])
                else:
                    box_details[tuplkey] = {}
                    box_details[tuplkey]['uptime'] = uptime_to_secs(
                        box['status'])
    for key, value in box_connections.items():
        metric_box_connections.add_sample('bearerbox_box_connections',
                                          value=value,
                                          labels={'type': key})
    yield metric_box_connections
    for key, value in box_details.items():
        box_labels = {'type': key[0], 'id': key[1], 'ipaddr': key[2]}
        if 'queue' in value.keys():
            metric_box_queue.add_sample('bearerbox_box_queue',
                                        value=value['queue'],
                                        labels=box_labels)
        if self._collect_box_uptime is True:
            metric_box_uptime.add_sample('bearerbox_box_uptime_seconds',
                                         value=value['uptime'],
                                         labels=box_labels)
    yield metric_box_queue
    if self._collect_box_uptime is True:
        yield metric_box_uptime
    # SMSC metrics
    metric = GaugeMetricFamily('bearerbox_smsc_connections',
                               'Number of SMSC connections')
    metric.add_sample('bearerbox_smsc_connections',
                      value=int(response['gateway']['smscs']['count']),
                      labels={})
    yield metric
    if self._filter_smsc is False:
        metric_failed = CounterMetricFamily(
            'bearerbox_smsc_failed_messages_total',
            'Total number of SMSC failed messages',
            labels=["smsc_id"])
        metric_queued = GaugeMetricFamily('bearerbox_smsc_queued_messages',
                                          'Number of SMSC queued messages',
                                          labels=["smsc_id"])
        metric_sms_received = CounterMetricFamily(
            'bearerbox_smsc_received_sms_total',
            'Total number of received SMS by SMSC',
            labels=["smsc_id"])
        metric_sms_sent = CounterMetricFamily(
            'bearerbox_smsc_sent_sms_total',
            'Total number of SMS sent to SMSC',
            labels=["smsc_id"])
        metric_dlr_received = CounterMetricFamily(
            'bearerbox_smsc_received_dlr_total',
            'Total number of DLRs received by SMSC',
            labels=["smsc_id"])
        metric_dlr_sent = CounterMetricFamily(
            'bearerbox_smsc_sent_dlr_total',
            'Total number of DLRs sent to SMSC',
            labels=["smsc_id"])
        # Group SMSCs by smsc-id
        smsc_stats_by_id = OrderedDict()
        # when there's only one smsc connection on the gateway
        # xmltodict returns an OrderedDict instead of a list of OrderedDicts
        if not isinstance(response['gateway']['smscs']['smsc'], list):
            response['gateway']['smscs']['smsc'] = [
                response['gateway']['smscs']['smsc']
            ]
        for smsc in response['gateway']['smscs']['smsc']:
            smscid = smsc['id']
            if smscid in smsc_stats_by_id:
                # Same smsc-id seen before: accumulate all counters.
                smsc_stats_by_id[smscid]['failed'] += int(smsc['failed'])
                smsc_stats_by_id[smscid]['queued'] += int(smsc['queued'])
                smsc_stats_by_id[smscid]['sms']['received'] += int(
                    smsc['sms']['received'])
                smsc_stats_by_id[smscid]['sms']['sent'] += int(
                    smsc['sms']['sent'])
                smsc_stats_by_id[smscid]['dlr']['received'] += int(
                    smsc['dlr']['received'])
                smsc_stats_by_id[smscid]['dlr']['sent'] += int(
                    smsc['dlr']['sent'])
            else:
                smsc_stats_by_id[smscid] = OrderedDict()
                smsc_stats_by_id[smscid]['failed'] = int(smsc['failed'])
                smsc_stats_by_id[smscid]['queued'] = int(smsc['queued'])
                smsc_stats_by_id[smscid]['sms'] = OrderedDict()
                smsc_stats_by_id[smscid]['sms']['received'] = int(
                    smsc['sms']['received'])
                smsc_stats_by_id[smscid]['sms']['sent'] = int(
                    smsc['sms']['sent'])
                smsc_stats_by_id[smscid]['dlr'] = OrderedDict()
                smsc_stats_by_id[smscid]['dlr']['received'] = int(
                    smsc['dlr']['received'])
                smsc_stats_by_id[smscid]['dlr']['sent'] = int(
                    smsc['dlr']['sent'])
        for smsc in smsc_stats_by_id:
            metric_failed.add_metric([smsc],
                                     smsc_stats_by_id[smsc]['failed'])
            metric_queued.add_metric([smsc],
                                     smsc_stats_by_id[smsc]['queued'])
            metric_sms_received.add_metric(
                [smsc], smsc_stats_by_id[smsc]['sms']['received'])
            metric_sms_sent.add_metric(
                [smsc], smsc_stats_by_id[smsc]['sms']['sent'])
            metric_dlr_received.add_metric(
                [smsc], smsc_stats_by_id[smsc]['dlr']['received'])
            metric_dlr_sent.add_metric(
                [smsc], smsc_stats_by_id[smsc]['dlr']['sent'])
        yield metric_failed
        yield metric_queued
        yield metric_sms_received
        yield metric_sms_sent
        yield metric_dlr_received
        yield metric_dlr_sent
class AristaMetricsCollector(object):
    """Prometheus collector for Arista switches via the eAPI (pyeapi).

    One instance targets one switch; collect() runs the configured
    collector modules (memory, tcam, port, sfp, bgp) and reports a
    per-module scrape duration.
    """

    def __init__(self, config, target):
        """Store credentials/connection settings; no network I/O here.

        Environment variables ARISTA_USERNAME/ARISTA_PASSWORD override
        the config file values.
        """
        self._username = os.getenv('ARISTA_USERNAME', config['username'])
        self._password = os.getenv('ARISTA_PASSWORD', config['password'])
        self._protocol = config['protocol'] or 'https'
        self._timeout = config['timeout']
        self._target = target
        self._labels = {}
        self._switch_up = 0
        self._responsetime = 0
        self._memtotal = 0
        self._memfree = 0
        # False means "no cached connection / interface data yet".
        self._connection = False
        self._interfaces = False
        self._module_names = False
        if 'module_names' in config:
            self._module_names = config['module_names']
        self._scrape_durations = GaugeMetricFamily(
            'arista_scrape_duration_seconds',
            'Duration of a collector scrape.',
        )

    def add_scrape_duration(self, module_name, duration):
        """Record one module's scrape duration as a labelled sample."""
        self._scrape_durations.add_sample(
            'arista_scrape_duration_seconds',
            value=duration,
            labels=({
                'collector': module_name
            }),
        )

    def get_connection(self):
        """Return the cached pyeapi connection, creating it on first use."""
        # set the default timeout
        logging.debug(f'Setting timeout to {self._timeout}')
        if not self._connection:
            logging.info(f'Connecting to switch {self._target}')
            self._connection = pyeapi.connect(transport=self._protocol,
                                              host=self._target,
                                              username=self._username,
                                              password=self._password,
                                              timeout=self._timeout)
        return self._connection

    def switch_command(self, command):
        """Run one eAPI command; return its result dict, or '' on error.

        On connection/command errors the cached connection is dropped so
        the next call reconnects.
        """
        switch_result = ''
        connection = self.get_connection()
        try:
            logging.debug(f'Running command {command}')
            switch_result = connection.execute([command])
        except pyeapi.eapilib.ConnectionError as pyeapi_connect_except:
            self._connection = False
            logging.error(('PYEAPI Client Connection Exception: '
                           f'{pyeapi_connect_except}'))
        except pyeapi.eapilib.CommandError as pyeapi_command_except:
            self._connection = False
            logging.error(('PYEAPI Client Command Exception: '
                           f'{pyeapi_command_except}'))
        finally:
            # NOTE(review): return-in-finally swallows any other
            # exception raised in the try body -- confirm this is wanted.
            return switch_result

    def _get_labels(self):
        """Populate model/serial/version labels and up/memory state
        from 'show version', timing the call as the 'base' scrape."""
        start = time.time()
        # Get the switch info for the labels
        switch_info = self.switch_command(command='show version')
        try:
            si_res = switch_info['result'][0]
        except Exception as e:
            logging.debug(f'No result from switch {self._target}: {e}')
            labels_switch = {'model': 'unknown', 'serial': 'unknown'}
            self._switch_up = 0
        else:
            logging.debug(f'Received a result from switch {self._target}')
            labels_switch = {
                'model': si_res['modelName'],
                'serial': si_res['serialNumber'],
                'version': si_res['version']
            }
            self._memtotal = si_res['memTotal']
            self._memfree = si_res['memFree']
            self._switch_up = 1
        end = time.time()
        self._responsetime = end - start
        self.add_scrape_duration('base', self._responsetime)
        self._labels.update(labels_switch)

    def collect_memory(self):
        """Yield total/free memory gauges cached by _get_labels()."""
        # Export the memory usage data
        yield GaugeMetricFamily('arista_mem_total',
                                'Total memory available',
                                value=self._memtotal)
        yield GaugeMetricFamily('arista_mem_free',
                                'Total memory free',
                                value=self._memfree)

    def collect_tcam(self):
        """Yield TCAM used/capacity gauges from 'show hardware capacity'."""
        # Get the tcam usage data
        switch_tcam = self.switch_command(command='show hardware capacity')
        if switch_tcam:
            used_metrics = GaugeMetricFamily('arista_tcam_used',
                                             'TCAM Usage Data')
            total_metrics = GaugeMetricFamily('arista_tcam_total',
                                              'TCAM Capacity')
            for entry in switch_tcam['result'][0]['tables']:
                try:
                    labels = ({
                        'table': entry['table'],
                        'chip': entry['chip'],
                        'feature': entry['feature']
                    })
                    logging.debug((f'Adding: table={entry["table"]} '
                                   f'value={entry["used"]} '
                                   f'labels={labels}'))
                    used_metrics.add_sample('arista_tcam_used',
                                            value=entry['used'],
                                            labels=labels)
                    total_metrics.add_sample('arista_tcam_total',
                                             value=entry['maxLimit'],
                                             labels=labels)
                except KeyError:
                    # Skip incomplete entries instead of failing the scrape.
                    logging.error('KeyError in switch_tcam entries')
                    continue
            yield total_metrics
            yield used_metrics

    def collect_port(self):
        """Yield per-port counter gauges plus admin-up and L2-up flags.

        Caches the interface dict on self._interfaces for collect_sfp().
        """
        command = 'show interfaces'
        port_interfaces = self.switch_command(command)
        port_stats = {
            k: GaugeMetricFamily(f'arista_port_{k}',
                                 f'Port stats {k}',
                                 labels=['device', 'description', 'mac',
                                         'mtu'])
            for k in PORT_STATS_NAMES
        }
        port_admin_up = GaugeMetricFamily('arista_admin_up',
                                          'Value 1 if port is not shutdown',
                                          labels=['device', 'description'])
        port_l2_up = GaugeMetricFamily('arista_l2_up',
                                       'Value 1 if port is connected',
                                       labels=['device', 'description'])
        if port_interfaces:
            self._interfaces = port_interfaces['result'][0]['interfaces']
            for interface in self._interfaces:
                try:
                    iface = self._interfaces[interface]
                    data = iface['interfaceCounters']
                except KeyError:
                    # e.g. management/virtual interfaces without counters.
                    logging.debug((f'Interface {interface} on {self._target}'
                                   ' does not have interfaceCounters,'
                                   ' skipping'))
                    continue
                if iface['interfaceStatus'] == 'disabled':
                    port_admin_up.add_metric(
                        labels=[iface['name'], iface['description']],
                        value=0)
                else:
                    port_admin_up.add_metric(
                        labels=[iface['name'], iface['description']],
                        value=1)
                if iface['lineProtocolStatus'] == 'up':
                    port_l2_up.add_metric(
                        labels=[iface['name'], iface['description']],
                        value=1)
                else:
                    port_l2_up.add_metric(
                        labels=[iface['name'], iface['description']],
                        value=0)
                for port_stat in PORT_STATS_NAMES:
                    metric = [
                        interface,
                        iface['description'],
                        iface['physicalAddress'],
                        str(iface['mtu']),
                    ]
                    port_stats[port_stat].add_metric(metric,
                                                     float(data[port_stat]))
        yield from port_stats.values()
        yield port_admin_up
        yield port_l2_up

    def collect_sfp(self):
        """Yield SFP sensor gauges and threshold alarms per transceiver.

        Relies on self._interfaces being filled by collect_port() for
        lane-to-parent-interface resolution and descriptions.
        """
        command = 'show interfaces transceiver detail'
        sfp = self.switch_command(command)
        sensor_entries = ['rxPower', 'txBias', 'txPower', 'voltage']
        if sfp:
            sfp_labels = [
                'device', 'sensor', 'mediaType', 'serial', 'description',
                'lane'
            ]
            sfp_stats_metrics = GaugeMetricFamily('arista_sfp_stats',
                                                  'SFP Statistics',
                                                  labels=sfp_labels)
            alarm_labels = ['device', 'lane', 'sensor', 'alarmType']
            sfp_alarms = GaugeMetricFamily('arista_sfp_alarms',
                                           'SFP Alarms',
                                           labels=alarm_labels)
            for iface, data in sfp['result'][0]['interfaces'].items():
                interface = iface
                lane = iface
                if not data:
                    logging.debug(f'Port does not have SFP: {interface}')
                    continue
                description = ''
                # Lane detection. Lane is an optical transmitter that is
                # a part of an interface. For example, 100G interface
                # is usually comprised of four 25G lanes or ten 10G lanes.
                if iface not in self._interfaces:
                    logging.debug((f'Port {interface} not found in interfaces'
                                   '. Looking for a lane'))
                    try_iface = '/'.join(interface.split('/')[0:-1]) + '/1'
                    sfps = sfp['result'][0]['interfaces']
                    if sfps[iface]['vendorSn'] == sfps[try_iface]['vendorSn']:
                        lane = iface
                        interface = try_iface
                        # NOTE(review): second fragment lacks the f-prefix,
                        # so '{interface}' is logged literally -- confirm.
                        logging.debug((f'Setting lane {lane} as '
                                       'part of {interface}'))
                try:
                    description = self._interfaces[interface]['description']
                except KeyError:
                    pass
                for sensor in sensor_entries:
                    labels = [
                        interface, sensor, data['mediaType'],
                        data['vendorSn'], description, lane
                    ]
                    logging.debug((f'Adding: interface={interface} '
                                   f'sensor={sensor} value={data[sensor]} '
                                   f'labels={labels}'))
                    sfp_stats_metrics.add_metric(value=float(data[sensor]),
                                                 labels=labels)
                    # check thresholds and generate alerts
                    thresholds = data['details'][sensor]
                    labels = [interface, lane, sensor]
                    if data[sensor] > thresholds['highAlarm']:
                        labels.append('highAlarm')
                        sfp_alarms.add_metric(labels=labels,
                                              value=data[sensor])
                    elif data[sensor] > thresholds['highWarn']:
                        labels.append('highWarn')
                        sfp_alarms.add_metric(labels=labels,
                                              value=data[sensor])
                    elif data[sensor] < thresholds['lowAlarm']:
                        labels.append('lowAlarm')
                        sfp_alarms.add_metric(labels=labels,
                                              value=data[sensor])
                    elif data[sensor] < thresholds['lowWarn']:
                        labels.append('lowWarn')
                        sfp_alarms.add_metric(labels=labels,
                                              value=data[sensor])
            yield sfp_stats_metrics
            yield sfp_alarms

    def collect_bgp(self):
        """Yield BGP peer state info and accepted-prefix gauges for the
        IPv4 and IPv6 address families, per VRF and peer."""
        command = 'show ip bgp summary'
        data = self.switch_command(command)
        ipv4 = data['result'][0]['vrfs']
        command = 'show ipv6 bgp summary'
        data = self.switch_command(command)
        ipv6 = data['result'][0]['vrfs']
        labels = ['vrf', 'peer', 'asn']
        prefixes = GaugeMetricFamily('arista_bgp_accepted_prefixes',
                                     'Number of prefixes accepted',
                                     labels=labels)
        peer_state = InfoMetricFamily('arista_bgp_peer_state',
                                      'State of the BGP peer',
                                      labels=labels + ['state', 'router_id'])
        for vrf, vrf_data in ipv4.items():
            if 'peers' not in vrf_data:
                continue
            router_id = vrf_data['routerId']
            for peer, peer_data in vrf_data['peers'].items():
                labels = {
                    'vrf': vrf,
                    'router_id': router_id,
                    'peer': peer,
                    'asn': str(peer_data['asn']),
                    'state': peer_data['peerState']
                }
                peer_state.add_metric(value=labels, labels=labels)
                labels = [vrf, peer, str(peer_data['asn'])]
                prefixes.add_metric(value=peer_data['prefixReceived'],
                                    labels=labels)
        for vrf, vrf_data in ipv6.items():
            if 'peers' not in vrf_data:
                continue
            router_id = vrf_data['routerId']
            for peer, peer_data in vrf_data['peers'].items():
                labels = {
                    'vrf': vrf,
                    'router_id': router_id,
                    'peer': peer,
                    'asn': str(peer_data['asn']),
                    'state': peer_data['peerState']
                }
                peer_state.add_metric(value=labels, labels=labels)
                labels = [vrf, peer, str(peer_data['asn'])]
                prefixes.add_metric(value=peer_data['prefixReceived'],
                                    labels=labels)
        yield peer_state
        yield prefixes

    def get_all_modules(self):
        """Map every module name to its collector generator."""
        return {
            'memory': self.collect_memory,
            'tcam': self.collect_tcam,
            'port': self.collect_port,
            'sfp': self.collect_sfp,
            'bgp': self.collect_bgp,
        }

    def get_modules(self):
        """Resolve the configured comma-separated module list to collector
        callables; 'all' (or no configuration) selects every module."""
        if not self._module_names:
            return self.get_all_modules()
        module_functions = {}
        modules = self._module_names.split(',')
        for module in modules:
            if module == 'all':
                return self.get_all_modules()
            elif module == 'memory':
                module_functions['memory'] = self.collect_memory
            elif module == 'tcam':
                module_functions['tcam'] = self.collect_tcam
            elif module == 'port':
                module_functions['port'] = self.collect_port
            elif module == 'sfp':
                module_functions['sfp'] = self.collect_sfp
            elif module == 'bgp':
                module_functions['bgp'] = self.collect_bgp
            else:
                logging.warning(f'Unknown module requested:{module}. '
                                'Ignoring')
        return module_functions

    def collect(self):
        """Prometheus entry point: refresh labels, run the selected
        modules (only when the switch answered) and yield all metrics
        plus the per-module scrape durations."""
        self._get_labels()
        # Drop cached interfaces so collect_sfp never uses stale data.
        self._interfaces = False
        # Export the up and response metrics
        yield GaugeMetricFamily('arista_up',
                                ('Information whether the switch is reachable '
                                 'and responds to API calls'),
                                value=self._switch_up)
        if self._switch_up == 1:
            yield InfoMetricFamily('arista_hw',
                                   ('Information about this arista device, '
                                    'such as serial number and model'),
                                   value=self._labels)
            for name, generator in self.get_modules().items():
                start = time.time()
                for metric in generator():
                    yield metric
                end = time.time()
                self.add_scrape_duration(name, end - start)
        yield self._scrape_durations
def collect(self):
    """Prometheus entry point for one Redfish target.

    Yields availability/response-time first; when the target is up, adds
    power state, optional health metrics, scrape duration and optional
    firmware inventory. The finally-block always tears down the Redfish
    session and the requests session.
    """
    try:
        # Export the up and response metrics
        up_metrics = GaugeMetricFamily(
            'redfish_up',
            'Server Monitoring for redfish availability',
            labels=self._labels)
        response_metrics = GaugeMetricFamily(
            'redfish_response_duration_seconds',
            'Server Monitoring for redfish response time',
            labels=self._labels)
        up_metrics.add_sample('redfish_up',
                              value=self._redfish_up,
                              labels=self._labels)
        response_metrics.add_sample('redfish_response_duration_seconds',
                                    value=self._response_time,
                                    labels=self._labels)
        yield up_metrics
        yield response_metrics
        # Target unreachable: nothing further to collect.
        if self._redfish_up == 0:
            return
        self._get_labels()
        powerstate_metrics = GaugeMetricFamily(
            'redfish_powerstate',
            'Server Monitoring Power State Data',
            labels=self._labels)
        powerstate_metrics.add_sample('redfish_powerstate',
                                      value=self._powerstate,
                                      labels=self._labels)
        yield powerstate_metrics
        logging.info("Target {0}: Collecting data ...".format(self._target))
        if self._health:
            self._health_metrics = GaugeMetricFamily(
                'redfish_health',
                'Server Monitoring Health Data',
                labels=self._labels)
            current_labels = {'type': 'system', 'name': 'summary'}
            current_labels.update(self._labels)
            self._health_metrics.add_sample('redfish_health',
                                            value=self._server_health,
                                            labels=current_labels)
            # Get the processor health data
            if self._urls['Processors']:
                self.get_proc_health()
            else:
                logging.warning("Target {0}: No Processors URL provided! Cannot get Processors data!".format(self._target))
            # Get the storage health data
            if self._urls['Storage']:
                self.get_storage_health()
            elif self._urls['SimpleStorage']:
                # Fallback for servers exposing only the legacy schema.
                self.get_simple_storage_health()
            else:
                logging.warning("Target {0}: No Storage URL provided! Cannot get Storage data!".format(self._target))
            # Get the chassis health data
            if self._urls['Chassis']:
                self.get_chassis_health()
            else:
                logging.warning("Target {0}: No Chassis URL provided! Cannot get Chassis data!".format(self._target))
            # Get the powersupply health data
            if self._urls['Power']:
                self.get_power_health()
            else:
                logging.warning("Target {0}: No Power URL provided! Cannot get PSU data!".format(self._target))
            # Get the thermal health data
            if self._urls['Thermal']:
                self.get_thermal_health()
            else:
                logging.warning("Target {0}: No Thermal URL provided! Cannot get thermal data!".format(self._target))
            # Export the memory data
            if self._urls['Memory']:
                self._mem_metrics_correctable = GaugeMetricFamily(
                    'redfish_memory_correctable',
                    'Server Monitoring Memory Data for correctable errors',
                    labels=self._labels)
                self._mem_metrics_unorrectable = GaugeMetricFamily(
                    'redfish_memory_uncorrectable',
                    'Server Monitoring Memory Data for uncorrectable errors',
                    labels=self._labels)
                self.get_memory_health()
                yield self._mem_metrics_correctable
                yield self._mem_metrics_unorrectable
            else:
                logging.warning("Target {0}: No Memory URL provided! Cannot get memory data!".format(self._target))
            yield self._health_metrics
        duration = round(time.time() - self._start_time,2)
        logging.info("Target {0}: Scrape duration: {1} seconds".format(self._target, duration))
        scrape_metrics = GaugeMetricFamily(
            'redfish_scrape_duration_seconds',
            'Server Monitoring redfish scrabe duration in seconds',
            labels=self._labels)
        scrape_metrics.add_sample('redfish_scrape_duration_seconds',
                                  value=duration,
                                  labels=self._labels)
        yield scrape_metrics
        # Get the firmware information
        if self._firmware:
            logging.debug("Target {0}: Get the firmware information.".format(self._target))
            fw_collection = self.connect_server(
                "/redfish/v1/UpdateService/FirmwareInventory")
            if not fw_collection:
                logging.warning("Target {0}: Cannot get Firmware data!".format(self._target))
                return
            fw_metrics = GaugeMetricFamily(
                'server_monitoring_fwdata',
                'Server Monitoring Firmware Data',
                labels=self._labels)
            for fw_member in fw_collection['Members']:
                fw_member_url = fw_member['@odata.id']
                # Dell inventories list both Installed and Previous
                # entries; only Installed ones are walked for Dell.
                if (search(".*Dell.*", self._manufacturer) and
                        ("Installed" in fw_member_url)) or not search(
                            ".*Dell.*", self._manufacturer):
                    server_response = self.connect_server(fw_member_url)
                    if not server_response:
                        continue
                    name = server_response['Name'].split(",",1)[0]
                    if 'Version' in server_response:
                        version = server_response['Version']
                        if version != "N/A":
                            current_labels = {'name': name,
                                              'version': version}
                            current_labels.update(self._labels)
                            fw_metrics.add_sample('redfish_version',
                                                  value=1,
                                                  labels=current_labels)
            yield fw_metrics
    except Exception as err:
        # Broad catch keeps one bad target from killing the exporter.
        logging.error("Target {0}: An exception occured: {1}".format(self._target, err))
    finally:
        # Always tear down the Redfish session, even after errors.
        logging.debug("Target {0}: Deleting Redfish session with server {1}".format(self._target, self._host))
        if self._auth_token:
            session_url = "https://{0}{1}".format(self._target,
                                                  self._session_url)
            headers = {'x-auth-token': self._auth_token}
            logging.debug("Target {0}: Using URL {1}".format(self._target, session_url))
            response = requests.delete(session_url,
                                       verify=False,
                                       timeout=self._timeout,
                                       headers=headers)
            response.close()
            if response:
                logging.info("Target {0}: Redfish Session deleted successfully.".format(self._target))
            else:
                logging.warning("Target {0}: Failed to delete session with server {1}".format(self._target, self._host))
                logging.warning("Target {0}: Token: {1}".format(self._target, self._auth_token))
        else:
            logging.debug("Target {0}: No Redfish session existing with server {1}".format(self._target, self._host))
        if self._session:
            logging.info("Target {0}: Closing requests session.".format(self._target))
            self._session.close()
class RedfishMetricsCollector(object):
    """Prometheus collector that scrapes health/firmware metrics from a
    Redfish-capable BMC.

    Lifecycle: construct, call get_session() to authenticate, then iterate
    collect() to yield GaugeMetricFamily objects.  collect()'s finally-clause
    tears the Redfish session down again, so one instance serves one scrape.
    """

    def __init__(self, config, target, host, usr, pwd, firmware=False, health=False):
        # target: address used in URLs; host: label value reported to Prometheus.
        self._target = target
        self._host = host
        self._username = usr
        self._password = pwd
        # TIMEOUT env var overrides the config value; both must be int-parsable.
        self._timeout = int(os.getenv('TIMEOUT', config['timeout']))
        self._labels = {'host': self._host}
        self._redfish_up = 0           # 1 once a session/auth token was obtained
        self._response_time = 0        # seconds for the initial /redfish/v1 probe
        self._last_http_code = 0       # status of the most recent connect_server() call
        self._powerstate = 0           # 0 = off, 1 = on (set in _get_labels)
        self._firmware = firmware      # also scrape firmware inventory?
        self._health = health          # also scrape component health?
        self._systems_url = ""
        # Redfish resource URLs discovered by _get_labels()/get_chassis_health();
        # empty string means "not available on this server".
        self._urls = {
            'Memory': "",
            'ManagedBy': "",
            'Processors': "",
            'Storage': "",
            'SimpleStorage': "",
            'Chassis': "",
            'Power': "",
            'Thermal': "",
            'NetworkInterfaces': ""
        }
        self._server_health = 0
        self._health_metrics = None
        # NOTE(review): attribute name has a typo ("unorrectable"); it is used
        # consistently within this class, so renaming would be a wider refactor.
        self._mem_metrics_correctable = None
        self._mem_metrics_unorrectable = None
        self._manufacturer = ""
        self._model = ""
        # Map vendor health strings (lower-cased) to metric values:
        # 0 = healthy, 1 = critical/error, 2 = warning.
        self._status = {"ok": 0, "operable": 0, "enabled": 0, "good": 0, "critical": 1, "error": 1, "warning": 2}
        self._start_time = time.time()
        self._session_url = ""
        self._auth_token = ""
        self._basic_auth = False       # fall back to basic auth if sessions fail
        self._session = ""             # becomes a requests.Session in connect_server()

    def get_session(self):
        """Probe the service root, then try to establish a Redfish session.

        Sets self._redfish_up to 1 on success; on session failure falls back
        to basic authentication for subsequent requests.
        """
        # Get the url for the server info and measure the response time
        logging.info("Target {0}: Connecting to server {1}".format(self._target, self._host))
        start_time = time.time()
        # The service root is readable without authentication.
        server_response = self.connect_server("/redfish/v1", noauth=True)
        self._response_time = round(time.time() - start_time, 2)
        logging.info("Target {0}: Response time: {1} seconds.".format(self._target, self._response_time))
        if server_response:
            logging.debug("Target {0}: data received from server {1}.".format(self._target, self._host))
            session_service = self.connect_server(server_response['SessionService']['@odata.id'], basic_auth=True)
            if self._last_http_code == 200:
                sessions_url = "https://{0}{1}".format(self._target, session_service['Sessions']['@odata.id'])
                session_data = {"UserName": self._username, "Password": self._password}
                # Clear the basic-auth credentials set by the previous call so
                # the session POST is judged on its own.
                self._session.auth = None
                result = ""  # sentinel: stays "" when the POST itself failed
                # Try to get a session
                try:
                    result = self._session.post(sessions_url, json=session_data, verify=False, timeout=self._timeout)
                    result.raise_for_status()
                except requests.exceptions.ConnectionError as err:
                    logging.error("Target {0}: Error getting an auth token from server {1}: {2}".format(self._target, self._host, err))
                    self._basic_auth = True
                except requests.exceptions.HTTPError as err:
                    logging.warning("Target {0}: No session received from server {1}: {2}".format(self._target, self._host, err))
                    logging.warning("Target {0}: Switching to basic authentication.".format(self._target))
                    self._basic_auth = True
                if result:
                    if result.status_code in [200, 201]:
                        self._auth_token = result.headers['X-Auth-Token']
                        self._session_url = result.json()['@odata.id']
                        logging.info("Target {0}: Got an auth token from server {1}!".format(self._target, self._host))
                        self._redfish_up = 1
                    else:
                        logging.warning("Target {0}: Failed to get a session from server {1}!".format(self._target, self._host))
        else:
            logging.warning("Target {0}: No data received from server {1}!".format(self._target, self._host))

    def connect_server(self, command, noauth=False, basic_auth=False):
        """GET a Redfish resource and return its parsed JSON body.

        command: URL path on the target.
        noauth: send no credentials (service root only).
        basic_auth: force HTTP basic auth for this call.
        Returns the decoded JSON dict on success, "" on any failure; the HTTP
        status (or a synthetic code for transport errors) is stored in
        self._last_http_code.
        """
        logging.captureWarnings(True)
        req = ""
        req_text = ""
        server_response = ""
        self._last_http_code = 200
        request_duration = 0
        request_start = time.time()
        url = "https://{0}{1}".format(self._target, command)
        # check if we already established a session with the server
        if not self._session:
            self._session = requests.Session()
        else:
            logging.debug("Target {0}: Using existing session.".format(self._target))
        # Self-signed BMC certificates are the norm, hence verify=False.
        self._session.verify = False
        self._session.headers.update({'charset': 'utf-8'})
        self._session.headers.update({'content-type': 'application/json'})
        if noauth:
            logging.debug("Target {0}: Using no auth".format(self._target))
        elif basic_auth or self._basic_auth:
            self._session.auth = (self._username, self._password)
            logging.debug("Target {0}: Using basic auth with user {1}".format(self._target, self._username))
        else:
            logging.debug("Target {0}: Using auth token".format(self._target))
            self._session.auth = None
            self._session.headers.update({'X-Auth-Token': self._auth_token})
        logging.debug("Target {0}: Using URL {1}".format(self._target, url))
        try:
            req = self._session.get(url, timeout=self._timeout)
            req.raise_for_status()
        except requests.exceptions.HTTPError as err:
            self._last_http_code = err.response.status_code
            if err.response.status_code == 401:
                logging.error("Target {0}: Authorization Error: Wrong job provided or user/password set wrong on server {1}: {2}".format(self._target, self._host, err))
            else:
                logging.error("Target {0}: HTTP Error on server {1}: {2}".format(self._target, self._host, err))
        except requests.exceptions.ConnectTimeout:
            logging.error("Target {0}: Timeout while connecting to {1}".format(self._target, self._host))
            self._last_http_code = 408
        except requests.exceptions.ReadTimeout:
            logging.error("Target {0}: Timeout while reading data from {1}".format(self._target, self._host))
            self._last_http_code = 408
        except requests.exceptions.ConnectionError as excptn:
            logging.error("Target {0}: Unable to connect to {1}: {2}".format(self._target, self._host, excptn))
            self._last_http_code = 444
        except:
            # NOTE(review): bare except — deliberately catches everything so a
            # single bad target cannot kill the exporter, but it also hides bugs.
            logging.error("Target {0}: Unexpected error: {1}".format(self._target, sys.exc_info()[0]))
            self._last_http_code = 500
        else:
            self._last_http_code = req.status_code
        if req != "":
            try:
                req_text = req.json()
            except:
                logging.debug("Target {0}: No json data received.".format(self._target))
        # req will evaluate to True if the status code was between 200 and 400 and False otherwise.
        if req:
            server_response = req_text
        # if the request fails the server might give a hint in the ExtendedInfo field
        else:
            if req_text:
                logging.debug("Target {0}: {1}: {2}".format(self._target, req_text['error']['code'], req_text['error']['message']))
                if '@Message.ExtendedInfo' in req_text['error']:
                    # ExtendedInfo may be a list or a dict depending on vendor.
                    if type(req_text['error']['@Message.ExtendedInfo']) == list:
                        if 'Message' in req_text['error']['@Message.ExtendedInfo'][0]:
                            logging.debug("Target {0}: {1}".format(self._target, req_text['error']['@Message.ExtendedInfo'][0]['Message']))
                    elif type(req_text['error']['@Message.ExtendedInfo']) == dict:
                        if 'Message' in req_text['error']['@Message.ExtendedInfo']:
                            logging.debug("Target {0}: {1}".format(self._target, req_text['error']['@Message.ExtendedInfo']['Message']))
                    else:
                        pass
        request_duration = round(time.time() - request_start, 2)
        logging.debug("Target {0}: Request duration: {1}".format(self._target, request_duration))
        return server_response

    def _get_labels(self):
        """Read the first system resource and fill in metric labels
        (manufacturer/model/serial), power state, overall health, and the
        component URLs used by the get_*_health() methods."""
        systems = self.connect_server("/redfish/v1/Systems")
        if not systems:
            return
        powerstates = {'off': 0, 'on': 1}
        # Get the server info for the labels
        self._systems_url = systems['Members'][0]['@odata.id']
        server_info = self.connect_server(self._systems_url)
        if not server_info:
            return
        self._manufacturer = server_info['Manufacturer']
        self._model = server_info['Model']
        self._powerstate = powerstates[server_info['PowerState'].lower()]
        # Prefer SKU as the serial; fall back to SerialNumber.
        if 'SKU' in server_info:
            serial = server_info['SKU']
        else:
            serial = server_info['SerialNumber']
        self._labels.update({'host': self._host, 'server_manufacturer': self._manufacturer, 'server_model': self._model, 'server_serial': serial})
        self._server_health = self._status[server_info['Status']['Health'].lower()]
        # get the links of the parts for later
        # Links entries are either plain URL strings or {'@odata.id': ...} dicts.
        if type(server_info['Links']['Chassis'][0]) == str:
            self._urls['Chassis'] = server_info['Links']['Chassis'][0]
            self._urls['ManagedBy'] = server_info['Links']['ManagedBy'][0]
        else:
            self._urls['Chassis'] = server_info['Links']['Chassis'][0]['@odata.id']
            self._urls['ManagedBy'] = server_info['Links']['ManagedBy'][0]['@odata.id']
        if 'Memory' in server_info:
            self._urls['Memory'] = server_info['Memory']['@odata.id']
        if 'NetworkInterfaces' in server_info:
            self._urls['NetworkInterfaces'] = server_info['NetworkInterfaces']['@odata.id']
        if 'Processors' in server_info:
            self._urls['Processors'] = server_info['Processors']['@odata.id']
        if 'Storage' in server_info:
            self._urls['Storage'] = server_info['Storage']['@odata.id']
        if 'SimpleStorage' in server_info:
            self._urls['SimpleStorage'] = server_info['SimpleStorage']['@odata.id']

    def get_proc_health(self):
        """Add one redfish_health sample per processor; NaN when the
        processor reports no health value."""
        logging.debug("Target {0}: Get the CPU health data.".format(self._target))
        processor_collection = self.connect_server(self._urls['Processors'])
        if not processor_collection:
            return
        for processor in processor_collection['Members']:
            processor_data = self.connect_server(processor['@odata.id'])
            if not processor_data:
                continue
            current_labels = {'type': 'processor', 'name': processor_data.get('Socket', "unknown"), 'cpu_type': processor_data.get('ProcessorType', "unknown"), 'cpu_model': processor_data.get('Model', "unknown"), 'cpu_cores': str(processor_data.get('TotalCores', "unknown")), 'cpu_threads': str(processor_data.get('TotalThreads', "unknown"))}
            current_labels.update(self._labels)
            if processor_data['Status']['Health']:
                self._health_metrics.add_sample('redfish_health', value=self._status[processor_data['Status']['Health'].lower()], labels=current_labels)
            else:
                logging.warning("Target {0}: No Processor health data provided ({1})!".format(self._target, processor['@odata.id']))
                self._health_metrics.add_sample('redfish_health', value=math.nan, labels=current_labels)

    def get_storage_health(self):
        """Add redfish_health samples for each storage controller and each of
        its drives (full Storage schema)."""
        logging.debug("Target {0}: Get the storage health data.".format(self._target))
        storage_collection = self.connect_server(self._urls['Storage'])
        if not storage_collection:
            return
        for controller in storage_collection['Members']:
            controller_data = self.connect_server(controller['@odata.id'])
            if not controller_data:
                continue
            if controller_data.get('StorageControllers'):
                # Cisco sometimes uses a list or a dict
                if type(controller_data['StorageControllers']) == list:
                    controller_details = controller_data['StorageControllers'][0]
                else:
                    controller_details = controller_data['StorageControllers']
            else:
                controller_details = controller_data
            # HPE ILO5 is missing the Name in the details of the controllers
            if 'Name' in controller_details:
                controller_name = controller_details['Name']
            elif 'Name' in controller_data:
                controller_name = controller_data['Name']
            else:
                controller_name = 'unknown'
            if 'Health' in controller_details['Status']:
                # Cisco sometimes uses None as status for onboard controllers
                controller_status = math.nan if controller_details['Status']['Health'] is None else self._status[controller_details['Status']['Health'].lower()]
            else:
                # NOTE(review): when 'Health' is missing, controller_status is
                # never assigned, so the add_sample below raises NameError
                # (silently swallowed by collect()'s broad except). Should
                # default controller_status to math.nan here.
                logging.warning("Target {0}, Host {1}, Model {2}, Controller {3}: No health data found.".format(self._target, self._host, self._model, controller_name))
            current_labels = {'type': 'storage', 'name': controller_name, 'controller_model': controller_details.get('Model', 'unknown'), 'controller_manufacturer': controller_details.get('Manufacturer', 'unknown')}
            current_labels.update(self._labels)
            self._health_metrics.add_sample('redfish_health', value=controller_status, labels=current_labels)
            # Sometimes not all attributes are implemented. Checking if existing one by one.
            disk_attributes = {'Name': 'name', 'MediaType': 'disk_type', 'Model': 'disk_model', 'Manufacturer': 'disk_manufacturer', 'CapacityBytes': 'disk_capacity', 'Protocol': 'disk_protocol'}
            for disk in controller_data['Drives']:
                current_labels = {'type': 'disk'}
                disk_data = self.connect_server(disk['@odata.id'])
                if disk_data == '':
                    continue
                for disk_attribute in disk_attributes:
                    if disk_attribute in disk_data:
                        current_labels.update({disk_attributes[disk_attribute]: str(disk_data[disk_attribute])})
                current_labels.update(self._labels)
                if 'Health' in disk_data['Status']:
                    disk_status = math.nan if disk_data['Status']['Health'] is None else self._status[disk_data['Status']['Health'].lower()]
                    self._health_metrics.add_sample('redfish_health', value=disk_status, labels=current_labels)
                else:
                    # NOTE(review): disk_data['name'] (lower-case) is likely a
                    # KeyError — the Redfish attribute is 'Name'.
                    logging.warning("Target {0}, Host {1}, Model {2}, Disk {3}: No health data found.".format(self._target, self._host, self._model, disk_data['name']))

    def get_simple_storage_health(self):
        """Add redfish_health samples via the SimpleStorage schema (used when
        the full Storage schema is not available)."""
        storage_collection = self.connect_server(self._urls['SimpleStorage'])
        if not storage_collection:
            return
        for controller in storage_collection['Members']:
            controller_data = self.connect_server(controller['@odata.id'])
            if not controller_data:
                continue
            controller_name = controller_data['Name']
            controller_status = math.nan if controller_data['Status']['Health'] is None else self._status[controller_data['Status']['Health'].lower()]
            current_labels = {'type': 'storage', 'name': controller_name}
            current_labels.update(self._labels)
            self._health_metrics.add_sample('redfish_health', value=controller_status, labels=current_labels)
            # Sometimes not all attributes are implemented. Checking if existing one by one.
            disk_attributes = {'Name': 'name', 'Model': 'disk_model', 'Manufacturer': 'disk_manufacturer'}
            for disk in controller_data['Devices']:
                current_labels = {'type': 'disk'}
                if disk['Status']['State'] != 'Absent':
                    for disk_attribute in disk_attributes:
                        if disk_attribute in disk:
                            current_labels.update({disk_attributes[disk_attribute]: disk[disk_attribute]})
                    current_labels.update(self._labels)
                    self._health_metrics.add_sample('redfish_health', value=self._status[disk['Status']['Health'].lower()], labels=current_labels)

    def get_chassis_health(self):
        """Add a redfish_health sample for the chassis and record the Power
        and Thermal resource URLs for the later PSU/fan scrapes."""
        logging.debug("Target {0}: Get the Chassis health data.".format(self._target))
        chassis_data = self.connect_server(self._urls['Chassis'])
        if not chassis_data:
            return
        current_labels = {'type': 'chassis', 'name': chassis_data['Name']}
        current_labels.update(self._labels)
        self._health_metrics.add_sample('redfish_health', value=self._status[chassis_data['Status']['Health'].lower()], labels=current_labels)
        if 'Power' in chassis_data:
            self._urls['Power'] = chassis_data['Power']['@odata.id']
        if 'Thermal' in chassis_data:
            self._urls['Thermal'] = chassis_data['Thermal']['@odata.id']

    def get_power_health(self):
        """Add one redfish_health sample per power supply; NaN when no usable
        health/state is reported."""
        logging.debug("Target {0}: Get the PDU health data.".format(self._target))
        power_data = self.connect_server(self._urls['Power'])
        if not power_data:
            return
        for psu in power_data['PowerSupplies']:
            psu_name = psu.get('Name', 'unknown')
            current_labels = {'type': 'powersupply', 'name': psu_name}
            current_labels.update(self._labels)
            psu_health = math.nan
            psu_status = dict((k.lower(), v) for k, v in psu['Status'].items())  # convert to lower case because there are differences per vendor
            if 'state' in psu_status:
                if psu_status['state'] != 'absent':
                    if 'health' in psu_status:
                        psu_health = math.nan if psu_status['health'] is None else self._status[psu_status['health'].lower()]
                    elif 'state' in psu_status:
                        # NOTE(review): this elif condition is always true here
                        # (guarded by the outer 'state' check) — effectively a
                        # plain else falling back to the State field.
                        psu_health = math.nan if psu_status['state'] is None else self._status[psu_status['state'].lower()]
            if psu_health is math.nan:
                # NOTE(review): identity comparison works only because the same
                # math.nan object is assigned above; math.isnan() would be robust.
                logging.warning("Target {0}, Host {1}, Model {2}, PSU {3}: No health data found.".format(self._target, self._host, self._model, psu_name))
            self._health_metrics.add_sample('redfish_health', value=psu_health, labels=current_labels)

    def get_thermal_health(self):
        """Add one redfish_health sample per fan; NaN when no usable
        health/state is reported."""
        logging.debug("Target {0}: Get the thermal health data.".format(self._target))
        thermal_data = self.connect_server(self._urls['Thermal'])
        if not thermal_data:
            return
        for fan in thermal_data['Fans']:
            fan_name = fan.get('Name', 'unknown')
            current_labels = {'type': 'fan', 'name': fan_name}
            current_labels.update(self._labels)
            fan_health = math.nan
            fan_status = dict((k.lower(), v) for k, v in fan['Status'].items())  # convert to lower case because there are differences per vendor
            if 'state' in fan_status:
                if fan_status['state'] != 'absent':
                    if 'health' in fan_status:
                        fan_health = math.nan if fan_status['health'] is None or fan_status['health'] == '' else self._status[fan_status['health'].lower()]
                    elif 'state' in fan_status:
                        # NOTE(review): always-true elif (see get_power_health).
                        fan_health = math.nan if fan_status['state'] is None else self._status[fan_status['state'].lower()]
            if fan_health is math.nan:
                logging.warning("Target {0}, Host {1}, Model {2}, Fan {3}: No health data found.".format(self._target, self._host, self._model, fan['Name']))
            self._health_metrics.add_sample('redfish_health', value=fan_health, labels=current_labels)

    def get_memory_health(self):
        """Add redfish_health samples per DIMM plus correctable/uncorrectable
        ECC alarm-trip samples when the DIMM exposes a Metrics resource."""
        logging.debug("Target {0}: Get the Memory data.".format(self._target))
        memory_collection = self.connect_server(self._urls['Memory'])
        if not memory_collection:
            return
        for dimm_url in memory_collection['Members']:
            dimm_info = self.connect_server(dimm_url['@odata.id'])
            if not dimm_info:
                continue
            current_labels = {'type': 'memory', 'name': dimm_info['Name']}
            current_labels.update(self._labels)
            # Some vendors report Status as a bare string instead of a dict.
            if type(dimm_info['Status']) == str:
                dimm_health = self._status[dimm_info['Status'].lower()]
            else:
                dimm_health = math.nan
                dimm_status = dict((k.lower(), v) for k, v in dimm_info['Status'].items())  # convert to lower case because there are differences per vendor
                if 'state' in dimm_status:
                    if dimm_status['state'] is not None:
                        if dimm_status['state'].lower() == 'absent':
                            logging.warning("Target {0}, Host {1}, Model {2}, Dimm {3}: absent.".format(self._target, self._host, self._model, dimm_info['Name']))
                            continue
                if 'Manufacturer' in dimm_info:
                    manufacturer = dimm_info['Manufacturer']
                # HPE hides the vendor name in an OEM extension.
                if 'Oem' in dimm_info:
                    if 'Hpe' in dimm_info['Oem']:
                        manufacturer = dimm_info['Oem']['Hpe']['VendorName']
                # NOTE(review): if neither 'Manufacturer' nor the HPE OEM field
                # exists, 'manufacturer' is unbound and the next line raises
                # NameError (swallowed by collect()'s broad except).
                current_labels.update({'dimm_capacity': str(dimm_info['CapacityMiB']), 'dimm_speed': str(dimm_info['OperatingSpeedMhz']), 'dimm_type': dimm_info['MemoryDeviceType'], 'dimm_manufacturer': manufacturer})
                if 'health' in dimm_status:
                    dimm_health = math.nan if dimm_info['Status']['Health'] is None else self._status[dimm_info['Status']['Health'].lower()]
                elif 'state' in dimm_status:
                    dimm_health = math.nan if dimm_info['Status']['State'] is None else self._status[dimm_info['Status']['State'].lower()]
            if dimm_health is math.nan:
                logging.warning("Target {0}, Host {1}, Model {2}, Dimm {3}: No health data found.".format(self._target, self._host, self._model, dimm_info['Name']))
            self._health_metrics.add_sample('redfish_health', value=dimm_health, labels=current_labels)
            if 'Metrics' in dimm_info:
                dimm_metrics = self.connect_server(dimm_info['Metrics']['@odata.id'])
                if not dimm_metrics:
                    continue
                correctable_ecc_error = math.nan if dimm_metrics['HealthData']['AlarmTrips']['CorrectableECCError'] is None else int(dimm_metrics['HealthData']['AlarmTrips']['CorrectableECCError'])
                uncorrectable_ecc_error = math.nan if dimm_metrics['HealthData']['AlarmTrips']['UncorrectableECCError'] is None else int(dimm_metrics['HealthData']['AlarmTrips']['UncorrectableECCError'])
                self._mem_metrics_correctable.add_sample('redfish_memory_correctable', value=correctable_ecc_error, labels=current_labels)
                self._mem_metrics_unorrectable.add_sample('redfish_memory_uncorrectable', value=uncorrectable_ecc_error, labels=current_labels)
            else:
                logging.warning("Target {0}, Host {1}, Model {2}: Dimm {3}: No Dimm Metrics found.".format(self._target, self._host, self._model, dimm_info['Name']))

    def collect(self):
        """Generator yielding all GaugeMetricFamily objects for one scrape.

        Always yields redfish_up / response-time metrics; the rest only when
        the session was established.  The finally-clause deletes the Redfish
        session and closes the HTTP session, so this must run to completion.
        """
        try:
            # Export the up and response metrics
            up_metrics = GaugeMetricFamily('redfish_up', 'Server Monitoring for redfish availability', labels=self._labels)
            response_metrics = GaugeMetricFamily('redfish_response_duration_seconds', 'Server Monitoring for redfish response time', labels=self._labels)
            up_metrics.add_sample('redfish_up', value=self._redfish_up, labels=self._labels)
            response_metrics.add_sample('redfish_response_duration_seconds', value=self._response_time, labels=self._labels)
            yield up_metrics
            yield response_metrics
            if self._redfish_up == 0:
                # No session — nothing else can be scraped.
                return
            self._get_labels()
            powerstate_metrics = GaugeMetricFamily('redfish_powerstate', 'Server Monitoring Power State Data', labels=self._labels)
            powerstate_metrics.add_sample('redfish_powerstate', value=self._powerstate, labels=self._labels)
            yield powerstate_metrics
            logging.info("Target {0}: Collecting data ...".format(self._target))
            if self._health:
                self._health_metrics = GaugeMetricFamily('redfish_health', 'Server Monitoring Health Data', labels=self._labels)
                current_labels = {'type': 'system', 'name': 'summary'}
                current_labels.update(self._labels)
                self._health_metrics.add_sample('redfish_health', value=self._server_health, labels=current_labels)
                # Get the processor health data
                if self._urls['Processors']:
                    self.get_proc_health()
                else:
                    logging.warning("Target {0}: No Processors URL provided! Cannot get Processors data!".format(self._target))
                # Get the storage health data
                if self._urls['Storage']:
                    self.get_storage_health()
                elif self._urls['SimpleStorage']:
                    self.get_simple_storage_health()
                else:
                    logging.warning("Target {0}: No Storage URL provided! Cannot get Storage data!".format(self._target))
                # Get the chassis health data (also discovers Power/Thermal URLs)
                if self._urls['Chassis']:
                    self.get_chassis_health()
                else:
                    logging.warning("Target {0}: No Chassis URL provided! Cannot get Chassis data!".format(self._target))
                # Get the powersupply health data
                if self._urls['Power']:
                    self.get_power_health()
                else:
                    logging.warning("Target {0}: No Power URL provided! Cannot get PSU data!".format(self._target))
                # Get the thermal health data
                if self._urls['Thermal']:
                    self.get_thermal_health()
                else:
                    logging.warning("Target {0}: No Thermal URL provided! Cannot get thermal data!".format(self._target))
                # Export the memory data
                if self._urls['Memory']:
                    self._mem_metrics_correctable = GaugeMetricFamily('redfish_memory_correctable', 'Server Monitoring Memory Data for correctable errors', labels=self._labels)
                    self._mem_metrics_unorrectable = GaugeMetricFamily('redfish_memory_uncorrectable', 'Server Monitoring Memory Data for uncorrectable errors', labels=self._labels)
                    self.get_memory_health()
                    yield self._mem_metrics_correctable
                    yield self._mem_metrics_unorrectable
                else:
                    logging.warning("Target {0}: No Memory URL provided! Cannot get memory data!".format(self._target))
                yield self._health_metrics
            duration = round(time.time() - self._start_time, 2)
            logging.info("Target {0}: Scrape duration: {1} seconds".format(self._target, duration))
            # NOTE(review): "scrabe" typo in the help string below (runtime
            # text — left unchanged here).
            scrape_metrics = GaugeMetricFamily('redfish_scrape_duration_seconds', 'Server Monitoring redfish scrabe duration in seconds', labels=self._labels)
            scrape_metrics.add_sample('redfish_scrape_duration_seconds', value=duration, labels=self._labels)
            yield scrape_metrics
            # Get the firmware information
            if self._firmware:
                logging.debug("Target {0}: Get the firmware information.".format(self._target))
                fw_collection = self.connect_server("/redfish/v1/UpdateService/FirmwareInventory")
                if not fw_collection:
                    logging.warning("Target {0}: Cannot get Firmware data!".format(self._target))
                    return
                fw_metrics = GaugeMetricFamily('server_monitoring_fwdata', 'Server Monitoring Firmware Data', labels=self._labels)
                for fw_member in fw_collection['Members']:
                    fw_member_url = fw_member['@odata.id']
                    # Dell exposes both "Installed" and "Previous" entries;
                    # only the installed ones are wanted.
                    if (search(".*Dell.*", self._manufacturer) and ("Installed" in fw_member_url)) or not search(".*Dell.*", self._manufacturer):
                        server_response = self.connect_server(fw_member_url)
                        if not server_response:
                            continue
                        name = server_response['Name'].split(",", 1)[0]
                        if 'Version' in server_response:
                            version = server_response['Version']
                            if version != "N/A":
                                current_labels = {'name': name, 'version': version}
                                current_labels.update(self._labels)
                                fw_metrics.add_sample('redfish_version', value=1, labels=current_labels)
                yield fw_metrics
        except Exception as err:
            # Broad catch: a scrape failure must not crash the exporter.
            logging.error("Target {0}: An exception occured: {1}".format(self._target, err))
        finally:
            # Always tear down the Redfish session created by get_session().
            logging.debug("Target {0}: Deleting Redfish session with server {1}".format(self._target, self._host))
            if self._auth_token:
                session_url = "https://{0}{1}".format(self._target, self._session_url)
                headers = {'x-auth-token': self._auth_token}
                logging.debug("Target {0}: Using URL {1}".format(self._target, session_url))
                response = requests.delete(session_url, verify=False, timeout=self._timeout, headers=headers)
                response.close()
                if response:
                    logging.info("Target {0}: Redfish Session deleted successfully.".format(self._target))
                else:
                    logging.warning("Target {0}: Failed to delete session with server {1}".format(self._target, self._host))
                    logging.warning("Target {0}: Token: {1}".format(self._target, self._auth_token))
            else:
                logging.debug("Target {0}: No Redfish session existing with server {1}".format(self._target, self._host))
            if self._session:
                logging.info("Target {0}: Closing requests session.".format(self._target))
                self._session.close()
def collect(self):
    """Generator yielding Arista switch metrics for one Prometheus scrape.

    Always yields the up/response info metric; when the switch was reachable
    (self._switch_up == 1) it additionally yields memory usage, TCAM usage
    (tables listed in self._exclude are skipped) and per-port counter rates
    (ports whose description matches 'reserved' are skipped).
    """
    # Export the up and response metrics
    info_metrics = GaugeMetricFamily('arista_monitoring_info', 'Arista Switch Monitoring', labels=self._labels)
    info_metrics.add_sample('arista_up', value=self._switch_up, labels=self._labels)
    info_metrics.add_sample('arista_response', value=self._responstime, labels=self._labels)
    yield info_metrics
    if self._switch_up == 1:
        logging.debug("Switch is reachable.")  # fixed typo ("rechable")

        # Export the memory usage data
        mem_metrics = GaugeMetricFamily('switch_monitoring_memdata', 'Arista Switch Monitoring Memory Usage Data', labels=self._labels)
        mem_metrics.add_sample('arista_mem_total', value=self._memtotal, labels=self._labels)
        mem_metrics.add_sample('arista_mem_free', value=self._memfree, labels=self._labels)
        logging.debug("Exporting metrics arista_mem_total=%s", self._memtotal)
        logging.debug("Exporting metrics arista_mem_free=%s", self._memfree)
        yield mem_metrics

        # Get the tcam usage data
        switch_tcam = self.connect_switch(command="show hardware capacity")
        if switch_tcam:
            tcam_metrics = GaugeMetricFamily('switch_monitoring_data', 'Arista Switch Monitoring TCAM Usage Data', labels=self._labels)
            for entry in switch_tcam['result'][0]['tables']:
                # add the chip and feature names as labels to the switch info labels
                labels = {'table': entry['table'], 'chip': entry["chip"], 'feature': entry["feature"]}
                if entry['table'] not in self._exclude:
                    labels.update(self._labels)
                    tcam_metrics.add_sample('arista_tcam', value=entry["usedPercent"], labels=labels)
                else:
                    logging.debug("Excluding: table=%s value=%s labels=%s", entry['table'], entry["usedPercent"], labels)
            yield tcam_metrics

        # Get the per-port counter rates
        switch_port_stats = self.connect_switch(command="show interfaces counters rates")
        regex_pattern = re.compile('.*reserved.*', re.IGNORECASE)
        if switch_port_stats:
            port_stats_metrics = GaugeMetricFamily('switch_monitoring_ports', 'Arista Switch Monitoring Port Statistics', labels=self._labels)
            for port_entry, port_values in switch_port_stats['result'][0]['interfaces'].items():
                port_description = port_values['description'].replace("-> ", "")
                # The description check is invariant per port — evaluate it
                # once instead of once per stat.
                if regex_pattern.match(port_description):
                    continue
                for port_value, stat_value in port_values.items():
                    # 'description' and 'interval' are metadata, not counters.
                    if port_value != "description" and port_value != 'interval':
                        labels = {'port': port_entry, 'stat': port_value, 'description': port_description}
                        labels.update(self._labels)
                        port_stats_metrics.add_sample('arista_port_stats', value=float(stat_value), labels=labels)
            yield port_stats_metrics