Example #1
    def collect_tcam(self):
        # Get the tcam usage data
        switch_tcam = self.switch_command(command='show hardware capacity')

        if switch_tcam:
            used_metrics = GaugeMetricFamily('arista_tcam_used',
                                             'TCAM Usage Data')
            total_metrics = GaugeMetricFamily('arista_tcam_total',
                                              'TCAM Capacity')
            for entry in switch_tcam['result'][0]['tables']:
                try:
                    labels = ({
                        'table': entry['table'],
                        'chip': entry['chip'],
                        'feature': entry['feature']
                    })
                    logging.debug((f'Adding: table={entry["table"]} '
                                   f'value={entry["used"]} '
                                   f'labels={labels}'))
                    used_metrics.add_sample('arista_tcam_used',
                                            value=entry['used'],
                                            labels=labels)
                    total_metrics.add_sample('arista_tcam_total',
                                             value=entry['maxLimit'],
                                             labels=labels)
                except KeyError:
                    logging.error('KeyError in switch_tcam entries')
                    continue

            yield total_metrics
            yield used_metrics
Example #2
    def collect(self):
        metric = GaugeMetricFamily(
            'reservation_utilization', 'Daily Reserved Instance Data', labels=["ri_metric"])
        for key, value in getAWSRIMetrics().items():
            metric.add_sample('reservation_utilization',
                              value=value, labels={'ri_metric': key})
        yield metric
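
A minimal sketch of how a collector like the one above might be registered and served, assuming it lives on a hypothetical wrapper class named ReservationCollector and that getAWSRIMetrics() returns a dict of metric names to values; the port and the wiring are arbitrary choices and not part of the original example:

import time

from prometheus_client import start_http_server
from prometheus_client.core import REGISTRY


if __name__ == '__main__':
    # ReservationCollector is a hypothetical class exposing the collect() generator above.
    REGISTRY.register(ReservationCollector())
    start_http_server(8000)  # arbitrary port for the /metrics endpoint
    while True:
        time.sleep(60)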
Example #3
    def collect(self):  # pylint: disable=inconsistent-return-statements
        # bearerbox server status
        metric = GaugeMetricFamily('bearerbox_up',
                                   'Could the bearerbox server be reached')

        start = time()

        response = self.parse_kannel_status()

        if response is None:
            metric.add_sample('bearerbox_up', value=0, labels={})
            yield metric
            return []

        metric.add_sample('bearerbox_up', value=1, labels={})
        yield metric

        # Version info
        version = bearerbox_version(response['gateway']['version'])
        metric = GaugeMetricFamily('bearerbox_build_info',
                                   'Kannel bearerbox version info')
        metric.add_sample('bearerbox_build_info',
                          value=1,
                          labels={'version': version})
        yield metric

        # Gauge for the bearerbox uptime, in seconds
        uptime = uptime_to_secs(response['gateway']['status'])
        metric = GaugeMetricFamily('bearerbox_uptime_seconds',
                                   'Current uptime in seconds (*)')
        metric.add_sample('bearerbox_uptime_seconds', value=uptime, labels={})
        yield metric

        # WDP, SMS & DLR metrics
        metrics = self.collect_msg_stats(response['gateway'])
        for metric in metrics.values():
            yield metric

        # Box metrics
        metrics = self.collect_box_stats(response['gateway']['boxes'])
        for metric in metrics.values():
            yield metric

        # SMSC metrics
        metrics = self.collect_smsc_stats(
            response['gateway']['smscs']['count'],
            response['gateway']['smscs']['smsc'])
        for metric in metrics.values():
            yield metric

        duration = time() - start
        metric = GaugeMetricFamily(
            'bearerbox_scrape_duration_seconds',
            'Bearerbox metrics scrape duration in seconds (*)')
        metric.add_sample('bearerbox_scrape_duration_seconds',
                          value=duration,
                          labels={})
        yield metric
Example #4
    def test_duplicate_timestamps(self):
        families = text_string_to_metric_families("""# TYPE a gauge
# HELP a help
a{a="1",foo="bar"} 1 0.0000000000
a{a="1",foo="bar"} 2 0.0000000001
a{a="1",foo="bar"} 3 0.0000000010
a{a="2",foo="bar"} 4 0.0000000000
a{a="2",foo="bar"} 5 0.0000000001
# EOF
""")
        imf = GaugeMetricFamily("a", "help")
        imf.add_sample("a", {"a": "1", "foo": "bar"}, 1, Timestamp(0, 0))
        imf.add_sample("a", {"a": "1", "foo": "bar"}, 3, Timestamp(0, 1))
        imf.add_sample("a", {"a": "2", "foo": "bar"}, 4, Timestamp(0, 0))
        self.assertEqual([imf], list(families))
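
For context, a rough sketch of the imports this test presumably relies on; the OpenMetrics parser is assumed here because the scraped text ends with "# EOF":

from prometheus_client.core import GaugeMetricFamily, Timestamp
from prometheus_client.openmetrics.parser import text_string_to_metric_families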
Example #5
    def collect(self):

        yield SummaryMetricFamily('summary',
                                  'This is a simple summary',
                                  labels={'name': 'horizon.stellar.org'})

        log.info('current_data.items(): %s' % current_data.items())

        for k, v in current_data.items():
            yield CounterMetricFamily(k,
                                      'stellar base metric values',
                                      value=float(v))

        log.info('current_payment_detail.items(): %s' %
                 current_payment_detail.items())

        for asset, asset_data in current_payment_detail.items():
            summ = CounterMetricFamily('sum_payment',
                                       'stellar payment metric values',
                                       labels=['sum_payment'])
            summ.add_metric([asset], asset_data['sum'])
            yield summ
            yield CounterMetricFamily('nb_payment',
                                      'stellar payment metric values',
                                      value=float(asset_data['nm']))

        metric = GaugeMetricFamily(
            'large_native_payment_detail',
            'large native stellar payment metric values',
            value=7)
        for from_addr, amount_by_dest in current_large_native_payment_detail.items(
        ):
            for to_addr, amount in amount_by_dest.items():
                metric.add_sample('sum_large_native_payment',
                                  value=amount,
                                  labels={
                                      'from_addr': from_addr,
                                      'to_addr': to_addr
                                  })
        yield metric
Example #6
    def collect(self):
        # bearerbox server status
        metric = GaugeMetricFamily('bearerbox_up',
                                   'Could the bearerbox server be reached')

        response = self.parse_kannel_status()

        if response is None:
            metric.add_sample('bearerbox_up', value=0, labels={})
            yield metric
            return []

        metric.add_sample('bearerbox_up', value=1, labels={})
        yield metric

        # Version info
        version = bearerbox_version(response['gateway']['version'])
        metric = GaugeMetricFamily('bearerbox_build_info',
                                   'Kannel bearerbox version info')
        metric.add_sample('bearerbox_build_info',
                          value=1,
                          labels={'version': version})
        yield metric

        # Gauge for the bearerbox uptime, in seconds
        uptime = uptime_to_secs(response['gateway']['status'])
        metric = GaugeMetricFamily('bearerbox_uptime_seconds',
                                   'Current uptime in seconds (*)')
        metric.add_sample('bearerbox_uptime_seconds', value=uptime, labels={})
        yield metric

        # WDP, SMS & DLR metrics
        message_type = ['sms', 'dlr']
        if self._collect_wdp is True:
            message_type = ['wdp'] + message_type

        for type in message_type:
            for k, v in response['gateway'][type].items():
                if isinstance(v, dict):
                    for k2, v2 in v.items():
                        metric_name = 'bearerbox_{0}_{1}_{2}'.format(
                            type, k, k2)
                        if k2 == 'total':
                            metric_help = 'Total number of {0} {1}'.format(
                                type.upper(), k)
                            metric = CounterMetricFamily(
                                metric_name, metric_help)
                        else:
                            metric_help = 'Number of {0} {1} in queue'.format(
                                k, type.upper())
                            metric = GaugeMetricFamily(metric_name,
                                                       metric_help)

                        metric.add_sample(metric_name,
                                          value=int(v2),
                                          labels={})
                        yield metric

                elif k not in ['inbound', 'outbound']:
                    metric_name = 'bearerbox_{0}_{1}'.format(type, k)
                    metric_value = v
                    metric_labels = {}

                    if type == 'sms' and k == 'storesize':
                        metric_help = 'Number of SMS in storesize'
                    elif type == 'dlr':
                        if k == 'queued':
                            metric_help = 'Number of DLRs in queue'
                        elif k == 'storage':
                            metric_help = 'DLR storage type info'
                            metric_value = 1
                            metric_labels = {'storage': v}

                    metric = GaugeMetricFamily(metric_name, metric_help)
                    metric.add_sample(metric_name,
                                      value=int(metric_value),
                                      labels=metric_labels)
                    yield metric

        # Box metrics
        box_connections = {b: 0 for b in self._box_connections}
        box_details = {}
        metric_box_connections = GaugeMetricFamily(
            'bearerbox_box_connections', 'Number of box connections')
        metric_box_queue = GaugeMetricFamily(
            'bearerbox_box_queue', 'Number of messages in box queue')

        if self._collect_box_uptime is True:
            metric_box_uptime = GaugeMetricFamily(
                'bearerbox_box_uptime_seconds', 'Box uptime in seconds (*)')
        if response['gateway']['boxes'] != '':
            # when there's only one box connected on the gateway
            # xmltodict returns an OrderedDict instead of a list of OrderedDicts
            if not isinstance(response['gateway']['boxes']['box'], list):
                response['gateway']['boxes']['box'] = [
                    response['gateway']['boxes']['box']
                ]

            for box in response['gateway']['boxes']['box']:
                if box['type'] in box_connections.keys():
                    box_connections[box['type']] += 1
                else:
                    box_connections[box['type']] = 1

                # some types of boxes (e.g. wapbox) don't have IDs.
                if 'id' not in box.keys():
                    box['id'] = ""

                tuplkey = (box['type'], box['id'], box['IP'])

                # some types of boxes (e.g. wapbox) don't have queues.
                if 'queue' in box.keys():
                    if tuplkey in box_details.keys():
                        box_details[tuplkey]['queue'] += int(box['queue'])
                    else:
                        box_details[tuplkey] = {}
                        box_details[tuplkey]['queue'] = int(box['queue'])

                # Collect box uptime metrics.
                # In case of multiple boxes with the same type, id and host,
                # only the uptime of the first occurrence is exposed
                # in order to avoid duplicates.
                if self._collect_box_uptime is True:
                    if tuplkey in box_details.keys():
                        if 'uptime' not in box_details[tuplkey].keys():
                            box_details[tuplkey]['uptime'] = uptime_to_secs(
                                box['status'])
                    else:
                        box_details[tuplkey] = {}
                        box_details[tuplkey]['uptime'] = uptime_to_secs(
                            box['status'])

        for key, value in box_connections.items():
            metric_box_connections.add_sample('bearerbox_box_connections',
                                              value=value,
                                              labels={'type': key})
        yield metric_box_connections

        for key, value in box_details.items():
            box_labels = {'type': key[0], 'id': key[1], 'ipaddr': key[2]}
            if 'queue' in value.keys():
                metric_box_queue.add_sample('bearerbox_box_queue',
                                            value=value['queue'],
                                            labels=box_labels)
            if self._collect_box_uptime is True:
                metric_box_uptime.add_sample('bearerbox_box_uptime_seconds',
                                             value=value['uptime'],
                                             labels=box_labels)

        yield metric_box_queue
        if self._collect_box_uptime is True:
            yield metric_box_uptime

        # SMSC metrics
        metric = GaugeMetricFamily('bearerbox_smsc_connections',
                                   'Number of SMSC connections')
        metric.add_sample('bearerbox_smsc_connections',
                          value=int(response['gateway']['smscs']['count']),
                          labels={})
        yield metric

        if self._filter_smsc is False:
            metric_failed = CounterMetricFamily(
                'bearerbox_smsc_failed_messages_total',
                'Total number of SMSC failed messages',
                labels=["smsc_id"])
            metric_queued = GaugeMetricFamily('bearerbox_smsc_queued_messages',
                                              'Number of SMSC queued messages',
                                              labels=["smsc_id"])
            metric_sms_received = CounterMetricFamily(
                'bearerbox_smsc_received_sms_total',
                'Total number of received SMS by SMSC',
                labels=["smsc_id"])
            metric_sms_sent = CounterMetricFamily(
                'bearerbox_smsc_sent_sms_total',
                'Total number of SMS sent to SMSC',
                labels=["smsc_id"])
            metric_dlr_received = CounterMetricFamily(
                'bearerbox_smsc_received_dlr_total',
                'Total number of DLRs received by SMSC',
                labels=["smsc_id"])
            metric_dlr_sent = CounterMetricFamily(
                'bearerbox_smsc_sent_dlr_total',
                'Total number of DLRs sent to SMSC',
                labels=["smsc_id"])

            # Group SMSCs by smsc-id
            smsc_stats_by_id = OrderedDict()

            # when there's only one smsc connection on the gateway
            # xmltodict returns an OrderedDict instead of a list of OrderedDicts
            if not isinstance(response['gateway']['smscs']['smsc'], list):
                response['gateway']['smscs']['smsc'] = [
                    response['gateway']['smscs']['smsc']
                ]

            for smsc in response['gateway']['smscs']['smsc']:
                smscid = smsc['id']
                if smscid in smsc_stats_by_id:
                    smsc_stats_by_id[smscid]['failed'] += int(smsc['failed'])
                    smsc_stats_by_id[smscid]['queued'] += int(smsc['queued'])
                    smsc_stats_by_id[smscid]['sms']['received'] += int(
                        smsc['sms']['received'])
                    smsc_stats_by_id[smscid]['sms']['sent'] += int(
                        smsc['sms']['sent'])
                    smsc_stats_by_id[smscid]['dlr']['received'] += int(
                        smsc['dlr']['received'])
                    smsc_stats_by_id[smscid]['dlr']['sent'] += int(
                        smsc['dlr']['sent'])
                else:
                    smsc_stats_by_id[smscid] = OrderedDict()
                    smsc_stats_by_id[smscid]['failed'] = int(smsc['failed'])
                    smsc_stats_by_id[smscid]['queued'] = int(smsc['queued'])
                    smsc_stats_by_id[smscid]['sms'] = OrderedDict()
                    smsc_stats_by_id[smscid]['sms']['received'] = int(
                        smsc['sms']['received'])
                    smsc_stats_by_id[smscid]['sms']['sent'] = int(
                        smsc['sms']['sent'])
                    smsc_stats_by_id[smscid]['dlr'] = OrderedDict()
                    smsc_stats_by_id[smscid]['dlr']['received'] = int(
                        smsc['dlr']['received'])
                    smsc_stats_by_id[smscid]['dlr']['sent'] = int(
                        smsc['dlr']['sent'])

            for smsc in smsc_stats_by_id:
                metric_failed.add_metric([smsc],
                                         smsc_stats_by_id[smsc]['failed'])
                metric_queued.add_metric([smsc],
                                         smsc_stats_by_id[smsc]['queued'])
                metric_sms_received.add_metric(
                    [smsc], smsc_stats_by_id[smsc]['sms']['received'])
                metric_sms_sent.add_metric(
                    [smsc], smsc_stats_by_id[smsc]['sms']['sent'])
                metric_dlr_received.add_metric(
                    [smsc], smsc_stats_by_id[smsc]['dlr']['received'])
                metric_dlr_sent.add_metric(
                    [smsc], smsc_stats_by_id[smsc]['dlr']['sent'])

            yield metric_failed
            yield metric_queued
            yield metric_sms_received
            yield metric_sms_sent
            yield metric_dlr_received
            yield metric_dlr_sent
Example #7
class AristaMetricsCollector(object):
    def __init__(self, config, target):
        self._username = os.getenv('ARISTA_USERNAME', config['username'])
        self._password = os.getenv('ARISTA_PASSWORD', config['password'])
        self._protocol = config['protocol'] or 'https'
        self._timeout = config['timeout']
        self._target = target
        self._labels = {}
        self._switch_up = 0
        self._responsetime = 0
        self._memtotal = 0
        self._memfree = 0
        self._connection = False
        self._interfaces = False
        self._module_names = False
        if 'module_names' in config:
            self._module_names = config['module_names']
        self._scrape_durations = GaugeMetricFamily(
            'arista_scrape_duration_seconds',
            'Duration of a collector scrape.',
        )

    def add_scrape_duration(self, module_name, duration):
        self._scrape_durations.add_sample(
            'arista_scrape_duration_seconds',
            value=duration,
            labels=({
                'collector': module_name
            }),
        )

    def get_connection(self):
        # set the default timeout
        logging.debug(f'Setting timeout to {self._timeout}')
        if not self._connection:
            logging.info(f'Connecting to switch {self._target}')
            self._connection = pyeapi.connect(transport=self._protocol,
                                              host=self._target,
                                              username=self._username,
                                              password=self._password,
                                              timeout=self._timeout)
        return self._connection

    def switch_command(self, command):
        switch_result = ''

        connection = self.get_connection()

        try:
            logging.debug(f'Running command {command}')
            switch_result = connection.execute([command])
        except pyeapi.eapilib.ConnectionError as pyeapi_connect_except:
            self._connection = False
            logging.error(('PYEAPI Client Connection Exception: '
                           f'{pyeapi_connect_except}'))
        except pyeapi.eapilib.CommandError as pyeapi_command_except:
            self._connection = False
            logging.error(('PYEAPI Client Command Exception: '
                           f'{pyeapi_command_except}'))
        finally:
            return switch_result

    def _get_labels(self):
        start = time.time()
        # Get the switch info for the labels
        switch_info = self.switch_command(command='show version')
        try:
            si_res = switch_info['result'][0]
        except Exception as e:
            logging.debug(f'No result from switch {self._target}: {e}')
            labels_switch = {'model': 'unknown', 'serial': 'unknown'}
            self._switch_up = 0
        else:
            logging.debug(f'Received a result from switch {self._target}')
            labels_switch = {
                'model': si_res['modelName'],
                'serial': si_res['serialNumber'],
                'version': si_res['version']
            }
            self._memtotal = si_res['memTotal']
            self._memfree = si_res['memFree']
            self._switch_up = 1

        end = time.time()
        self._responsetime = end - start
        self.add_scrape_duration('base', self._responsetime)
        self._labels.update(labels_switch)

    def collect_memory(self):
        # Export the memory usage data
        yield GaugeMetricFamily('arista_mem_total',
                                'Total memory available',
                                value=self._memtotal)
        yield GaugeMetricFamily('arista_mem_free',
                                'Total memory free',
                                value=self._memfree)

    def collect_tcam(self):
        # Get the tcam usage data
        switch_tcam = self.switch_command(command='show hardware capacity')

        if switch_tcam:
            used_metrics = GaugeMetricFamily('arista_tcam_used',
                                             'TCAM Usage Data')
            total_metrics = GaugeMetricFamily('arista_tcam_total',
                                              'TCAM Capacity')
            for entry in switch_tcam['result'][0]['tables']:
                try:
                    labels = ({
                        'table': entry['table'],
                        'chip': entry['chip'],
                        'feature': entry['feature']
                    })
                    logging.debug((f'Adding: table={entry["table"]} '
                                   f'value={entry["used"]} '
                                   f'labels={labels}'))
                    used_metrics.add_sample('arista_tcam_used',
                                            value=entry['used'],
                                            labels=labels)
                    total_metrics.add_sample('arista_tcam_total',
                                             value=entry['maxLimit'],
                                             labels=labels)
                except KeyError:
                    logging.error('KeyError in switch_tcam entries')
                    continue

            yield total_metrics
            yield used_metrics

    def collect_port(self):
        command = 'show interfaces'
        port_interfaces = self.switch_command(command)
        port_stats = {
            k:
            GaugeMetricFamily(f'arista_port_{k}',
                              f'Port stats {k}',
                              labels=['device', 'description', 'mac', 'mtu'])
            for k in PORT_STATS_NAMES
        }
        port_admin_up = GaugeMetricFamily('arista_admin_up',
                                          'Value 1 if port is not shutdown',
                                          labels=['device', 'description'])
        port_l2_up = GaugeMetricFamily('arista_l2_up',
                                       'Value 1 if port is connected',
                                       labels=['device', 'description'])

        if port_interfaces:
            self._interfaces = port_interfaces['result'][0]['interfaces']
            for interface in self._interfaces:
                try:
                    iface = self._interfaces[interface]
                    data = iface['interfaceCounters']
                except KeyError:
                    logging.debug((f'Interface {interface} on {self._target}'
                                   ' does not have interfaceCounters,'
                                   ' skipping'))
                    continue
                if iface['interfaceStatus'] == 'disabled':
                    port_admin_up.add_metric(
                        labels=[iface['name'], iface['description']], value=0)
                else:
                    port_admin_up.add_metric(
                        labels=[iface['name'], iface['description']], value=1)
                if iface['lineProtocolStatus'] == 'up':
                    port_l2_up.add_metric(
                        labels=[iface['name'], iface['description']], value=1)
                else:
                    port_l2_up.add_metric(
                        labels=[iface['name'], iface['description']], value=0)

                for port_stat in PORT_STATS_NAMES:
                    metric = [
                        interface,
                        iface['description'],
                        iface['physicalAddress'],
                        str(iface['mtu']),
                    ]
                    port_stats[port_stat].add_metric(metric,
                                                     float(data[port_stat]))
            yield from port_stats.values()
            yield port_admin_up
            yield port_l2_up

    def collect_sfp(self):
        command = 'show interfaces transceiver detail'
        sfp = self.switch_command(command)
        sensor_entries = ['rxPower', 'txBias', 'txPower', 'voltage']

        if sfp:
            sfp_labels = [
                'device', 'sensor', 'mediaType', 'serial', 'description',
                'lane'
            ]
            sfp_stats_metrics = GaugeMetricFamily('arista_sfp_stats',
                                                  'SFP Statistics',
                                                  labels=sfp_labels)
            alarm_labels = ['device', 'lane', 'sensor', 'alarmType']
            sfp_alarms = GaugeMetricFamily('arista_sfp_alarms',
                                           'SFP Alarms',
                                           labels=alarm_labels)
            for iface, data in sfp['result'][0]['interfaces'].items():
                interface = iface
                lane = iface
                if not data:
                    logging.debug(f'Port does not have SFP: {interface}')
                    continue
                description = ''
                # Lane detection. A lane is an optical transmitter that is
                # part of an interface. For example, a 100G interface
                # usually consists of four 25G lanes or ten 10G lanes.
                if iface not in self._interfaces:
                    logging.debug((f'Port {interface} not found in interfaces'
                                   '. Looking for a lane'))
                    try_iface = '/'.join(interface.split('/')[0:-1]) + '/1'
                    sfps = sfp['result'][0]['interfaces']
                    if sfps[iface]['vendorSn'] == sfps[try_iface]['vendorSn']:
                        lane = iface
                        interface = try_iface
                        logging.debug((f'Setting lane {lane} as '
                                       f'part of {interface}'))
                try:
                    description = self._interfaces[interface]['description']
                except KeyError:
                    pass
                for sensor in sensor_entries:
                    labels = [
                        interface, sensor, data['mediaType'], data['vendorSn'],
                        description, lane
                    ]
                    logging.debug((f'Adding: interface={interface} '
                                   f'sensor={sensor} value={data[sensor]} '
                                   f'labels={labels}'))
                    sfp_stats_metrics.add_metric(value=float(data[sensor]),
                                                 labels=labels)
                    # check thresholds and generate alerts
                    thresholds = data['details'][sensor]
                    labels = [interface, lane, sensor]
                    if data[sensor] > thresholds['highAlarm']:
                        labels.append('highAlarm')
                        sfp_alarms.add_metric(labels=labels,
                                              value=data[sensor])
                    elif data[sensor] > thresholds['highWarn']:
                        labels.append('highWarn')
                        sfp_alarms.add_metric(labels=labels,
                                              value=data[sensor])
                    elif data[sensor] < thresholds['lowAlarm']:
                        labels.append('lowAlarm')
                        sfp_alarms.add_metric(labels=labels,
                                              value=data[sensor])
                    elif data[sensor] < thresholds['lowWarn']:
                        labels.append('lowWarn')
                        sfp_alarms.add_metric(labels=labels,
                                              value=data[sensor])

            yield sfp_stats_metrics
            yield sfp_alarms

    def collect_bgp(self):
        command = 'show ip bgp summary'
        data = self.switch_command(command)
        ipv4 = data['result'][0]['vrfs']
        command = 'show ipv6 bgp summary'
        data = self.switch_command(command)
        ipv6 = data['result'][0]['vrfs']

        labels = ['vrf', 'peer', 'asn']
        prefixes = GaugeMetricFamily('arista_bgp_accepted_prefixes',
                                     'Number of prefixes accepted',
                                     labels=labels)
        peer_state = InfoMetricFamily('arista_bgp_peer_state',
                                      'State of the BGP peer',
                                      labels=labels + ['state', 'router_id'])

        for vrf, vrf_data in ipv4.items():
            if 'peers' not in vrf_data:
                continue
            router_id = vrf_data['routerId']
            for peer, peer_data in vrf_data['peers'].items():
                labels = {
                    'vrf': vrf,
                    'router_id': router_id,
                    'peer': peer,
                    'asn': str(peer_data['asn']),
                    'state': peer_data['peerState']
                }
                peer_state.add_metric(value=labels, labels=labels)
                labels = [vrf, peer, str(peer_data['asn'])]
                prefixes.add_metric(value=peer_data['prefixReceived'],
                                    labels=labels)
        for vrf, vrf_data in ipv6.items():
            if 'peers' not in vrf_data:
                continue
            router_id = vrf_data['routerId']
            for peer, peer_data in vrf_data['peers'].items():
                labels = {
                    'vrf': vrf,
                    'router_id': router_id,
                    'peer': peer,
                    'asn': str(peer_data['asn']),
                    'state': peer_data['peerState']
                }
                peer_state.add_metric(value=labels, labels=labels)
                labels = [vrf, peer, str(peer_data['asn'])]
                prefixes.add_metric(value=peer_data['prefixReceived'],
                                    labels=labels)
        yield peer_state
        yield prefixes

    def get_all_modules(self):
        return {
            'memory': self.collect_memory,
            'tcam': self.collect_tcam,
            'port': self.collect_port,
            'sfp': self.collect_sfp,
            'bgp': self.collect_bgp,
        }

    def get_modules(self):
        if not self._module_names:
            return self.get_all_modules()
        module_functions = {}
        modules = self._module_names.split(',')
        for module in modules:
            if module == 'all':
                return self.get_all_modules()
            elif module == 'memory':
                module_functions['memory'] = self.collect_memory
            elif module == 'tcam':
                module_functions['tcam'] = self.collect_tcam
            elif module == 'port':
                module_functions['port'] = self.collect_port
            elif module == 'sfp':
                module_functions['sfp'] = self.collect_sfp
            elif module == 'bgp':
                module_functions['bgp'] = self.collect_bgp
            else:
                logging.warning(f'Unknown module requested: {module}. Ignoring')
        return module_functions

    def collect(self):
        self._get_labels()
        self._interfaces = False
        # Export the up and response metrics
        yield GaugeMetricFamily('arista_up',
                                ('Information whether the switch is reachable '
                                 'and responds to API calls'),
                                value=self._switch_up)

        if self._switch_up == 1:

            yield InfoMetricFamily('arista_hw',
                                   ('Information about this arista device, '
                                    'such as serial number and model'),
                                   value=self._labels)

            for name, generator in self.get_modules().items():
                start = time.time()
                for metric in generator():
                    yield metric
                end = time.time()
                self.add_scrape_duration(name, end - start)
        yield self._scrape_durations
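
Since AristaMetricsCollector is constructed per target, one hedged way to expose it is with a throwaway registry per scrape; the scrape_target() helper below is an assumption for illustration and not part of the original exporter:

from prometheus_client import CollectorRegistry, generate_latest


def scrape_target(config, target):
    # Fresh registry so the response only carries this target's metrics.
    registry = CollectorRegistry()
    registry.register(AristaMetricsCollector(config, target))
    return generate_latest(registry)  # Prometheus text exposition format, as bytes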
Example #8
    def collect(self):

        try:
            # Export the up and response metrics
            up_metrics = GaugeMetricFamily('redfish_up','Server Monitoring for redfish availability',labels=self._labels)
            response_metrics = GaugeMetricFamily('redfish_response_duration_seconds','Server Monitoring for redfish response time',labels=self._labels)

            up_metrics.add_sample('redfish_up', value=self._redfish_up, labels=self._labels)
            response_metrics.add_sample('redfish_response_duration_seconds', value=self._response_time , labels=self._labels)
            yield up_metrics
            yield response_metrics

            if self._redfish_up == 0:
                return

            self._get_labels()
            powerstate_metrics = GaugeMetricFamily('redfish_powerstate','Server Monitoring Power State Data',labels=self._labels)
            powerstate_metrics.add_sample('redfish_powerstate', value=self._powerstate , labels=self._labels)
            yield powerstate_metrics
            
            logging.info("Target {0}: Collecting data ...".format(self._target))
            if self._health:
                self._health_metrics = GaugeMetricFamily('redfish_health','Server Monitoring Health Data',labels=self._labels)

                current_labels = {'type': 'system', 'name': 'summary'}
                current_labels.update(self._labels)
                self._health_metrics.add_sample('redfish_health', value=self._server_health, labels=current_labels)

                # Get the processor health data
                if self._urls['Processors']:
                    self.get_proc_health()
                else:
                    logging.warning("Target {0}: No Processors URL provided! Cannot get Processors data!".format(self._target))

                # Get the storage health data
                if self._urls['Storage']:
                    self.get_storage_health()
                elif self._urls['SimpleStorage']:
                    self.get_simple_storage_health()
                else:
                    logging.warning("Target {0}: No Storage URL provided! Cannot get Storage data!".format(self._target))


                # Get the chassis health data
                if self._urls['Chassis']:
                    self.get_chassis_health()
                else:
                    logging.warning("Target {0}: No Chassis URL provided! Cannot get Chassis data!".format(self._target))

                # Get the powersupply health data
                if self._urls['Power']:
                    self.get_power_health()
                else:
                    logging.warning("Target {0}: No Power URL provided! Cannot get PSU data!".format(self._target))

                # Get the thermal health data
                if self._urls['Thermal']:
                    self.get_thermal_health()
                else:
                    logging.warning("Target {0}: No Thermal URL provided! Cannot get thermal data!".format(self._target))

                # Export the memory data
                if self._urls['Memory']:
                    self._mem_metrics_correctable = GaugeMetricFamily('redfish_memory_correctable','Server Monitoring Memory Data for correctable errors',labels=self._labels)
                    self._mem_metrics_unorrectable = GaugeMetricFamily('redfish_memory_uncorrectable','Server Monitoring Memory Data for uncorrectable errors',labels=self._labels)
                    self.get_memory_health()
                    yield self._mem_metrics_correctable
                    yield self._mem_metrics_unorrectable
                else:
                    logging.warning("Target {0}: No Memory URL provided! Cannot get memory data!".format(self._target))

                yield self._health_metrics

                duration = round(time.time() - self._start_time,2)
                logging.info("Target {0}: Scrape duration: {1} seconds".format(self._target, duration))
                scrape_metrics = GaugeMetricFamily('redfish_scrape_duration_seconds','Server Monitoring redfish scrape duration in seconds',labels=self._labels)
                scrape_metrics.add_sample('redfish_scrape_duration_seconds', value=duration, labels=self._labels)
                yield scrape_metrics

            # Get the firmware information
            if self._firmware:
                logging.debug("Target {0}: Get the firmware information.".format(self._target))

                fw_collection = self.connect_server("/redfish/v1/UpdateService/FirmwareInventory")
                if not fw_collection:
                    logging.warning("Target {0}: Cannot get Firmware data!".format(self._target))
                    return
                fw_metrics = GaugeMetricFamily('server_monitoring_fwdata','Server Monitoring Firmware Data',labels=self._labels)
                for fw_member in fw_collection['Members']:
                    fw_member_url = fw_member['@odata.id']
                    if (search(".*Dell.*", self._manufacturer) and ("Installed" in fw_member_url)) or not search(".*Dell.*", self._manufacturer):
                        server_response = self.connect_server(fw_member_url)
                        if not server_response:
                            continue
                        name = server_response['Name'].split(",",1)[0]
                        if 'Version' in server_response:
                            version = server_response['Version']
                            if version != "N/A":
                                current_labels = {'name': name, 'version': version}
                                current_labels.update(self._labels)
                                fw_metrics.add_sample('redfish_version', value=1, labels=current_labels)

                yield fw_metrics

        except Exception as err:
            logging.error("Target {0}: An exception occured: {1}".format(self._target, err))
        

        finally:
            logging.debug("Target {0}: Deleting Redfish session with server {1}".format(self._target, self._host))

            if self._auth_token:
                session_url = "https://{0}{1}".format(self._target, self._session_url)
                headers = {'x-auth-token': self._auth_token}

                logging.debug("Target {0}: Using URL {1}".format(self._target, session_url))

                response = requests.delete(session_url, verify=False, timeout=self._timeout, headers=headers)
                response.close()

                if response:
                    logging.info("Target {0}: Redfish Session deleted successfully.".format(self._target))
                else:
                    logging.warning("Target {0}: Failed to delete session with server {1}".format(self._target, self._host))
                    logging.warning("Target {0}: Token: {1}".format(self._target, self._auth_token))

            else:
                logging.debug("Target {0}: No Redfish session existing with server {1}".format(self._target, self._host))

            if self._session:
                logging.info("Target {0}: Closing requests session.".format(self._target))
                self._session.close()
Example #9
class RedfishMetricsCollector(object):
    def __init__(self, config, target, host, usr, pwd, firmware=False, health=False):

        self._target = target
        self._host = host

        self._username = usr
        self._password = pwd

        self._timeout = int(os.getenv('TIMEOUT', config['timeout']))
        self._labels = {'host': self._host}
        self._redfish_up = 0
        self._response_time = 0
        self._last_http_code = 0
        self._powerstate = 0

        self._firmware = firmware
        self._health = health

        self._systems_url = ""
        self._urls = {
            'Memory': "",
            'ManagedBy': "",
            'Processors': "",
            'Storage': "",
            'SimpleStorage': "",
            'Chassis': "",
            'Power': "",
            'Thermal': "",
            'NetworkInterfaces': ""
        }

        self._server_health = 0
        self._health_metrics = None
        self._mem_metrics_correctable = None
        self._mem_metrics_unorrectable = None
        self._manufacturer = ""
        self._model = ""
        self._status = {"ok": 0, "operable": 0, "enabled": 0, "good": 0, "critical": 1, "error": 1, "warning": 2}
        self._start_time = time.time()
        
        self._session_url = ""
        self._auth_token = ""
        self._basic_auth = False
        self._session = ""
       
    def get_session(self):
        # Get the URL for the server info and measure the response time
        logging.info("Target {0}: Connecting to server {1}".format(self._target, self._host))
        start_time = time.time()
        server_response = self.connect_server("/redfish/v1", noauth=True)
        self._response_time = round(time.time() - start_time,2)
        logging.info("Target {0}: Response time: {1} seconds.".format(self._target, self._response_time))

        if server_response:
            logging.debug("Target {0}: data received from server {1}.".format(self._target, self._host))
            session_service = self.connect_server(server_response['SessionService']['@odata.id'], basic_auth=True)
            if self._last_http_code == 200:
                sessions_url = "https://{0}{1}".format(self._target, session_service['Sessions']['@odata.id'])
                session_data = {"UserName": self._username, "Password": self._password}
                self._session.auth = None
                result = ""

                # Try to get a session
                try:
                    result = self._session.post(sessions_url, json=session_data, verify=False, timeout=self._timeout)
                    result.raise_for_status()

                except requests.exceptions.ConnectionError as err:
                    logging.error("Target {0}: Error getting an auth token from server {1}: {2}".format(self._target, self._host, err))
                    self._basic_auth = True

                except requests.exceptions.HTTPError as err:
                    logging.warning("Target {0}: No session received from server {1}: {2}".format(self._target, self._host, err))
                    logging.warning("Target {0}: Switching to basic authentication.".format(self._target))
                    self._basic_auth = True

                if result:
                    if result.status_code in [200,201]:
                        self._auth_token = result.headers['X-Auth-Token']
                        self._session_url = result.json()['@odata.id']
                        logging.info("Target {0}: Got an auth token from server {1}!".format(self._target, self._host))
                        self._redfish_up = 1

            else:
                logging.warning("Target {0}: Failed to get a session from server {1}!".format(self._target, self._host))

        else:
            logging.warning("Target {0}: No data received from server {1}!".format(self._target, self._host))
    
    def connect_server(self, command, noauth = False, basic_auth = False):
        logging.captureWarnings(True)
        
        req = ""
        req_text = ""
        server_response = ""
        self._last_http_code = 200
        request_duration = 0
        request_start = time.time()

        url = "https://{0}{1}".format(self._target, command)

        # check if we already established a session with the server
        if not self._session:
            self._session = requests.Session()
        else:
            logging.debug("Target {0}: Using existing session.".format(self._target))
        self._session.verify = False
        self._session.headers.update({'charset': 'utf-8'})
        self._session.headers.update({'content-type': 'application/json'})

        if noauth:
            logging.debug("Target {0}: Using no auth".format(self._target))
        elif basic_auth or self._basic_auth:
            self._session.auth = (self._username, self._password)
            logging.debug("Target {0}: Using basic auth with user {1}".format(self._target, self._username))
        else:
            logging.debug("Target {0}: Using auth token".format(self._target))
            self._session.auth = None
            self._session.headers.update({'X-Auth-Token': self._auth_token})

        logging.debug("Target {0}: Using URL {1}".format(self._target, url))
        try:
            req = self._session.get(url, timeout = self._timeout)
            req.raise_for_status()

        except requests.exceptions.HTTPError as err:
            self._last_http_code = err.response.status_code
            if err.response.status_code == 401:
                logging.error("Target {0}: Authorization Error: Wrong job provided or user/password set wrong on server {1}: {2}".format(self._target, self._host, err))
            else:
                logging.error("Target {0}: HTTP Error on server {1}: {2}".format(self._target, self._host, err))

        except requests.exceptions.ConnectTimeout:
            logging.error("Target {0}: Timeout while connecting to {1}".format(self._target, self._host))
            self._last_http_code = 408

        except requests.exceptions.ReadTimeout:
            logging.error("Target {0}: Timeout while reading data from {1}".format(self._target, self._host))
            self._last_http_code = 408

        except requests.exceptions.ConnectionError as excptn:
            logging.error("Target {0}: Unable to connect to {1}: {2}".format(self._target, self._host, excptn))
            self._last_http_code = 444

        except:
            logging.error("Target {0}: Unexpected error: {1}".format(self._target, sys.exc_info()[0]))
            self._last_http_code = 500

        else:
            self._last_http_code = req.status_code

        if req != "":
            try: 
                req_text = req.json()

            except:
                logging.debug("Target {0}: No json data received.".format(self._target))

            # req will evaluate to True if the status code was between 200 and 400 and False otherwise.
            if req:
                server_response = req_text

            # if the request fails the server might give a hint in the ExtendedInfo field
            else:
                if req_text:
                    logging.debug("Target {0}: {1}: {2}".format(self._target, req_text['error']['code'], req_text['error']['message']))
                    if '@Message.ExtendedInfo' in req_text['error']:
                        if type(req_text['error']['@Message.ExtendedInfo']) == list:
                            if 'Message' in req_text['error']['@Message.ExtendedInfo'][0]:
                                logging.debug("Target {0}: {1}".format(self._target, req_text['error']['@Message.ExtendedInfo'][0]['Message']))
                        elif type(req_text['error']['@Message.ExtendedInfo']) == dict:
                            if 'Message' in req_text['error']['@Message.ExtendedInfo']:
                                logging.debug("Target {0}: {1}".format(self._target, req_text['error']['@Message.ExtendedInfo']['Message']))
                        else:
                            pass

        request_duration = round(time.time() - request_start,2)
        logging.debug("Target {0}: Request duration: {1}".format(self._target, request_duration))
        return server_response


    def _get_labels(self):

        systems = self.connect_server("/redfish/v1/Systems")

        if not systems:
            return

        powerstates = {'off': 0, 'on': 1}
        # Get the server info for the labels
        self._systems_url = systems['Members'][0]['@odata.id']
        server_info = self.connect_server(self._systems_url)
        if not server_info:
            return
        self._manufacturer = server_info['Manufacturer']
        self._model = server_info['Model']
        self._powerstate = powerstates[server_info['PowerState'].lower()]
        if 'SKU' in server_info:
            serial = server_info['SKU']
        else:
            serial = server_info['SerialNumber']
        self._labels.update({'host': self._host, 'server_manufacturer': self._manufacturer, 'server_model': self._model, 'server_serial': serial})

        self._server_health = self._status[server_info['Status']['Health'].lower()]

        # get the links of the parts for later
        if type(server_info['Links']['Chassis'][0]) == str:
            self._urls['Chassis'] = server_info['Links']['Chassis'][0]
            self._urls['ManagedBy'] = server_info['Links']['ManagedBy'][0]
        else:
            self._urls['Chassis'] = server_info['Links']['Chassis'][0]['@odata.id']
            self._urls['ManagedBy'] = server_info['Links']['ManagedBy'][0]['@odata.id']
        if 'Memory' in server_info:
            self._urls['Memory'] = server_info['Memory']['@odata.id']
        if 'NetworkInterfaces' in server_info:
            self._urls['NetworkInterfaces'] = server_info['NetworkInterfaces']['@odata.id']
        if 'Processors' in server_info:
            self._urls['Processors'] = server_info['Processors']['@odata.id']
        if 'Storage' in server_info:
            self._urls['Storage'] = server_info['Storage']['@odata.id']
        if 'SimpleStorage' in server_info:
            self._urls['SimpleStorage'] = server_info['SimpleStorage']['@odata.id']

    def get_proc_health(self):
        logging.debug("Target {0}: Get the CPU health data.".format(self._target))
        processor_collection = self.connect_server(self._urls['Processors'])

        if not processor_collection:
            return
        for processor in processor_collection['Members']:
            processor_data = self.connect_server(processor['@odata.id'])
            if not processor_data:
                continue
            current_labels = {'type': 'processor', 'name': processor_data.get('Socket', "unknown"), 'cpu_type': processor_data.get('ProcessorType', "unknown"), 'cpu_model': processor_data.get('Model', "unknown"), 'cpu_cores': str(processor_data.get('TotalCores', "unknown")), 'cpu_threads': str(processor_data.get('TotalThreads', "unknown"))}
            current_labels.update(self._labels)
            if processor_data['Status']['Health']:
                self._health_metrics.add_sample('redfish_health', value=self._status[processor_data['Status']['Health'].lower()], labels=current_labels)
            else:
                logging.warning("Target {0}: No Processor health data provided ({1})!".format(self._target, processor['@odata.id']))
                self._health_metrics.add_sample('redfish_health', value=math.nan, labels=current_labels)

    def get_storage_health(self):
        logging.debug("Target {0}: Get the storage health data.".format(self._target))
        storage_collection = self.connect_server(self._urls['Storage'])

        if not storage_collection:
            return
        for controller in storage_collection['Members']:
            controller_data = self.connect_server(controller['@odata.id'])
            if not controller_data:
                continue
            if controller_data.get('StorageControllers'):
                # Cisco sometimes uses a list or a dict
                if type(controller_data['StorageControllers']) == list:
                    controller_details = controller_data['StorageControllers'][0]
                else:
                    controller_details = controller_data['StorageControllers']
            else:
                controller_details = controller_data

            # HPE ILO5 is missing the Name in the details of the controllers
            if 'Name' in controller_details:
                controller_name = controller_details['Name']
            elif 'Name' in controller_data:
                controller_name = controller_data['Name']
            else:
                controller_name = 'unknown'

            if 'Health' in controller_details['Status']:
                # Cisco sometimes uses None as status for onboard controllers
                controller_status = math.nan if controller_details['Status']['Health'] is None else self._status[controller_details['Status']['Health'].lower()]
            else:
                # Avoid an unbound controller_status below when no health data is present
                controller_status = math.nan
                logging.warning("Target {0}, Host {1}, Model {2}, Controller {3}: No health data found.".format(self._target, self._host, self._model, controller_name))

            current_labels = {'type': 'storage', 'name': controller_name, 'controller_model': controller_details.get('Model', 'unknown'), 'controller_manufacturer': controller_details.get('Manufacturer', 'unknown')}
            current_labels.update(self._labels)
            self._health_metrics.add_sample('redfish_health', value=controller_status, labels=current_labels)
            
            # Sometimes not all attributes are implemented. Checking if existing one by one.
            disk_attributes = {'Name': 'name', 'MediaType': 'disk_type', 'Model': 'disk_model', 'Manufacturer': 'disk_manufacturer', 'CapacityBytes': 'disk_capacity', 'Protocol': 'disk_protocol'}
            for disk in controller_data['Drives']:
                current_labels = {'type': 'disk'}
                disk_data = self.connect_server(disk['@odata.id'])
                if disk_data == '':
                    continue

                for disk_attribute in disk_attributes:
                    if disk_attribute in disk_data:
                        current_labels.update({disk_attributes[disk_attribute]: str(disk_data[disk_attribute])})

                current_labels.update(self._labels)
                if 'Health' in disk_data['Status']:
                    disk_status = math.nan if disk_data['Status']['Health'] is None else self._status[disk_data['Status']['Health'].lower()]
                    self._health_metrics.add_sample('redfish_health', value=disk_status, labels=current_labels)
                else:
                    logging.warning("Target {0}, Host {1}, Model {2}, Disk {3}: No health data found.".format(self._target, self._host,self._model, disk_data['name']))

    def get_simple_storage_health(self):
        storage_collection = self.connect_server(self._urls['SimpleStorage'])
        if not storage_collection:
            return
        for controller in storage_collection['Members']:
            controller_data = self.connect_server(controller['@odata.id'])
            if not controller_data:
                continue
            controller_name = controller_data['Name']
            controller_status = math.nan if controller_data['Status']['Health'] is None else self._status[controller_data['Status']['Health'].lower()]

            current_labels = {'type': 'storage', 'name': controller_name}
            current_labels.update(self._labels)
            self._health_metrics.add_sample('redfish_health', value=controller_status, labels=current_labels)
            # Sometimes not all attributes are implemented. Checking if existing one by one.
            disk_attributes = {'Name': 'name', 'Model': 'disk_model', 'Manufacturer': 'disk_manufacturer'}
            for disk in controller_data['Devices']:
                current_labels = {'type': 'disk'}
                if disk['Status']['State'] != 'Absent':
                    for disk_attribute in disk_attributes:
                        if disk_attribute in disk:
                            current_labels.update({disk_attributes[disk_attribute]: disk[disk_attribute]})

                    current_labels.update(self._labels)
                    self._health_metrics.add_sample('redfish_health', value=self._status[disk['Status']['Health'].lower()], labels=current_labels)
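
    # Hedged sketch, not part of the original code: the "check each attribute
    # one by one" label building used for disks above can also be written as a
    # single dict comprehension. The helper name is made up.
    @staticmethod
    def _optional_labels(data, attribute_map):
        # attribute_map maps Redfish attribute names to label names,
        # e.g. {'Name': 'name', 'Model': 'disk_model'}
        return {label: str(data[attr]) for attr, label in attribute_map.items() if attr in data}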

    def get_chassis_health(self):
        logging.debug("Target {0}: Get the Chassis health data.".format(self._target))
        chassis_data = self.connect_server(self._urls['Chassis'])
        if not chassis_data:
            return

        current_labels = {'type': 'chassis', 'name': chassis_data['Name']}
        current_labels.update(self._labels)
        chassis_health = math.nan if chassis_data['Status']['Health'] is None else self._status[chassis_data['Status']['Health'].lower()]
        self._health_metrics.add_sample('redfish_health', value=chassis_health, labels=current_labels)
        if 'Power' in chassis_data:
            self._urls['Power'] = chassis_data['Power']['@odata.id']
        if 'Thermal' in chassis_data:
            self._urls['Thermal'] = chassis_data['Thermal']['@odata.id']

    def get_power_health(self):
        logging.debug("Target {0}: Get the PDU health data.".format(self._target))
        power_data = self.connect_server(self._urls['Power'])
        if not power_data:
            return

        for psu in power_data['PowerSupplies']:
            psu_name = psu.get('Name', 'unknown')
            current_labels = {'type': 'powersupply', 'name': psu_name}
            current_labels.update(self._labels)
            psu_health = math.nan
            psu_status = {k.lower(): v for k, v in psu['Status'].items()}  # lower-case the keys because capitalisation differs per vendor
            psu_state = psu_status.get('state')
            if 'state' in psu_status and (psu_state is None or psu_state.lower() != 'absent'):
                if 'health' in psu_status:
                    psu_health = math.nan if psu_status['health'] is None else self._status[psu_status['health'].lower()]
                elif psu_state is not None:
                    psu_health = self._status[psu_state.lower()]

            if math.isnan(psu_health):
                logging.warning("Target {0}, Host {1}, Model {2}, PSU {3}: No health data found.".format(self._target, self._host, self._model, psu_name))

            self._health_metrics.add_sample('redfish_health', value=psu_health, labels=current_labels)

    def get_thermal_health(self):
        logging.debug("Target {0}: Get the thermal health data.".format(self._target))
        thermal_data = self.connect_server(self._urls['Thermal'])
        if not thermal_data:
            return

        for fan in thermal_data['Fans']:
            fan_name = fan.get('Name', 'unknown')
            current_labels = {'type': 'fan', 'name': fan_name}
            current_labels.update(self._labels)
            fan_health = math.nan
            fan_status = {k.lower(): v for k, v in fan['Status'].items()}  # lower-case the keys because capitalisation differs per vendor
            fan_state = fan_status.get('state')
            if 'state' in fan_status and (fan_state is None or fan_state.lower() != 'absent'):
                if 'health' in fan_status:
                    fan_health = math.nan if fan_status['health'] in (None, '') else self._status[fan_status['health'].lower()]
                elif fan_state is not None:
                    fan_health = self._status[fan_state.lower()]

            if math.isnan(fan_health):
                logging.warning("Target {0}, Host {1}, Model {2}, Fan {3}: No health data found.".format(self._target, self._host, self._model, fan_name))

            self._health_metrics.add_sample('redfish_health', value=fan_health, labels=current_labels)
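
    # Hedged sketch, not part of the original exporter: PSUs and fans above use
    # the same vendor-tolerant Status handling (lower-case the keys, skip absent
    # devices, prefer Health over State). A hypothetical helper for it could be:
    def _component_health(self, status):
        # status: the Redfish 'Status' dict of a power supply or fan
        status = {k.lower(): v for k, v in status.items()}
        state = status.get('state')
        if state is not None and state.lower() == 'absent':
            return math.nan
        health = status.get('health') or state
        if not health:
            return math.nan
        return self._status.get(str(health).lower(), math.nan)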

    def get_memory_health(self):
        logging.debug("Target {0}: Get the Memory data.".format(self._target))

        memory_collection = self.connect_server(self._urls['Memory'])
        if not memory_collection:
            return

        for dimm_url in memory_collection['Members']:
            dimm_info = self.connect_server(dimm_url['@odata.id'])
            if not dimm_info:
                continue
            current_labels = {'type': 'memory', 'name': dimm_info['Name']}
            current_labels.update(self._labels)
            if isinstance(dimm_info['Status'], str):
                dimm_health = self._status[dimm_info['Status'].lower()]
            else:
                dimm_health = math.nan
                dimm_status = {k.lower(): v for k, v in dimm_info['Status'].items()}  # lower-case the keys because capitalisation differs per vendor
                if 'state' in dimm_status:
                    if dimm_status['state'] is not None:
                        if dimm_status['state'].lower() == 'absent':
                            logging.warning("Target {0}, Host {1}, Model {2}, Dimm {3}: absent.".format(self._target, self._host, self._model, dimm_info['Name']))
                            continue
                    manufacturer = 'unknown'
                    if 'Manufacturer' in dimm_info:
                        manufacturer = dimm_info['Manufacturer']
                    if 'Oem' in dimm_info:
                        if 'Hpe' in dimm_info['Oem']:
                            manufacturer = dimm_info['Oem']['Hpe']['VendorName']
                    current_labels.update({'dimm_capacity': str(dimm_info['CapacityMiB']), 'dimm_speed': str(dimm_info['OperatingSpeedMhz']), 'dimm_type': dimm_info['MemoryDeviceType'], 'dimm_manufacturer': manufacturer})
                    if 'health' in dimm_status:
                        dimm_health = math.nan if dimm_status['health'] is None else self._status[dimm_status['health'].lower()]
                    elif dimm_status['state'] is not None:
                        dimm_health = self._status[dimm_status['state'].lower()]

            if math.isnan(dimm_health):
                logging.warning("Target {0}, Host {1}, Model {2}, Dimm {3}: No health data found.".format(self._target, self._host, self._model, dimm_info['Name']))
            
            self._health_metrics.add_sample('redfish_health', value=dimm_health, labels=current_labels)

            if 'Metrics' in dimm_info:
                dimm_metrics = self.connect_server(dimm_info['Metrics']['@odata.id'])
                if not dimm_metrics:
                    continue
                correctable_ecc_error = math.nan if dimm_metrics['HealthData']['AlarmTrips']['CorrectableECCError'] is None else int(dimm_metrics['HealthData']['AlarmTrips']['CorrectableECCError'])
                uncorrectable_ecc_error = math.nan if dimm_metrics['HealthData']['AlarmTrips']['UncorrectableECCError'] is None else int(dimm_metrics['HealthData']['AlarmTrips']['UncorrectableECCError'])
                self._mem_metrics_correctable.add_sample('redfish_memory_correctable', value=correctable_ecc_error, labels=current_labels)
                self._mem_metrics_uncorrectable.add_sample('redfish_memory_uncorrectable', value=uncorrectable_ecc_error, labels=current_labels)
            else:
                logging.warning("Target {0}, Host {1}, Model {2}: Dimm {3}: No Dimm Metrics found.".format(self._target, self._host,self._model, dimm_info['Name']))
            
    def collect(self):

        try:
            # Export the up and response metrics
            up_metrics = GaugeMetricFamily('redfish_up','Server Monitoring for redfish availability',labels=self._labels)
            response_metrics = GaugeMetricFamily('redfish_response_duration_seconds','Server Monitoring for redfish response time',labels=self._labels)

            up_metrics.add_sample('redfish_up', value=self._redfish_up, labels=self._labels)
            response_metrics.add_sample('redfish_response_duration_seconds', value=self._response_time , labels=self._labels)
            yield up_metrics
            yield response_metrics

            if self._redfish_up == 0:
                return

            self._get_labels()
            powerstate_metrics = GaugeMetricFamily('redfish_powerstate','Server Monitoring Power State Data',labels=self._labels)
            powerstate_metrics.add_sample('redfish_powerstate', value=self._powerstate , labels=self._labels)
            yield powerstate_metrics
            
            logging.info("Target {0}: Collecting data ...".format(self._target))
            if self._health:
                self._health_metrics = GaugeMetricFamily('redfish_health','Server Monitoring Health Data',labels=self._labels)

                current_labels = {'type': 'system', 'name': 'summary'}
                current_labels.update(self._labels)
                self._health_metrics.add_sample('redfish_health', value=self._server_health, labels=current_labels)

                # Get the processor health data
                if self._urls['Processors']:
                    self.get_proc_health()
                else:
                    logging.warning("Target {0}: No Processors URL provided! Cannot get Processors data!".format(self._target))

                # Get the storage health data
                if self._urls['Storage']:
                    self.get_storage_health()
                elif self._urls['SimpleStorage']:
                    self.get_simple_storage_health()
                else:
                    logging.warning("Target {0}: No Storage URL provided! Cannot get Storage data!".format(self._target))


                # Get the chassis health data
                if self._urls['Chassis']:
                    self.get_chassis_health()
                else:
                    logging.warning("Target {0}: No Chassis URL provided! Cannot get Chassis data!".format(self._target))

                # Get the powersupply health data
                if self._urls['Power']:
                    self.get_power_health()
                else:
                    logging.warning("Target {0}: No Power URL provided! Cannot get PSU data!".format(self._target))

                # Get the thermal health data
                if self._urls['Thermal']:
                    self.get_thermal_health()
                else:
                    logging.warning("Target {0}: No Thermal URL provided! Cannot get thermal data!".format(self._target))

                # Export the memory data
                if self._urls['Memory']:
                    self._mem_metrics_correctable = GaugeMetricFamily('redfish_memory_correctable','Server Monitoring Memory Data for correctable errors',labels=self._labels)
                    self._mem_metrics_uncorrectable = GaugeMetricFamily('redfish_memory_uncorrectable','Server Monitoring Memory Data for uncorrectable errors',labels=self._labels)
                    self.get_memory_health()
                    yield self._mem_metrics_correctable
                    yield self._mem_metrics_uncorrectable
                else:
                    logging.warning("Target {0}: No Memory URL provided! Cannot get memory data!".format(self._target))

                yield self._health_metrics

                duration = round(time.time() - self._start_time,2)
                logging.info("Target {0}: Scrape duration: {1} seconds".format(self._target, duration))
                scrape_metrics = GaugeMetricFamily('redfish_scrape_duration_seconds','Server Monitoring redfish scrape duration in seconds',labels=self._labels)
                scrape_metrics.add_sample('redfish_scrape_duration_seconds', value=duration, labels=self._labels)
                yield scrape_metrics

            # Get the firmware information
            if self._firmware:
                logging.debug("Target {0}: Get the firmware information.".format(self._target))

                fw_collection = self.connect_server("/redfish/v1/UpdateService/FirmwareInventory")
                if not fw_collection:
                    logging.warning("Target {0}: Cannot get Firmware data!".format(self._target))
                    return
                fw_metrics = GaugeMetricFamily('server_monitoring_fwdata','Server Monitoring Firmware Data',labels=self._labels)
                for fw_member in fw_collection['Members']:
                    fw_member_url = fw_member['@odata.id']
                    if (search(".*Dell.*", self._manufacturer) and ("Installed" in fw_member_url)) or not search(".*Dell.*", self._manufacturer):
                        server_response = self.connect_server(fw_member_url)
                        if not server_response:
                            continue
                        name = server_response['Name'].split(",",1)[0]
                        if 'Version' in server_response:
                            version = server_response['Version']
                            if version != "N/A":
                                current_labels = {'name': name, 'version': version}
                                current_labels.update(self._labels)
                                fw_metrics.add_sample('redfish_version', value=1, labels=current_labels)

                yield fw_metrics

        except Exception as err:
            logging.error("Target {0}: An exception occured: {1}".format(self._target, err))
        

        finally:
            logging.debug("Target {0}: Deleting Redfish session with server {1}".format(self._target, self._host))

            if self._auth_token:
                session_url = "https://{0}{1}".format(self._target, self._session_url)
                headers = {'x-auth-token': self._auth_token}

                logging.debug("Target {0}: Using URL {1}".format(self._target, session_url))

                response = requests.delete(session_url, verify=False, timeout=self._timeout, headers=headers)
                response.close()

                if response:
                    logging.info("Target {0}: Redfish Session deleted successfully.".format(self._target))
                else:
                    logging.warning("Target {0}: Failed to delete session with server {1}".format(self._target, self._host))
                    logging.warning("Target {0}: Token: {1}".format(self._target, self._auth_token))

            else:
                logging.debug("Target {0}: No Redfish session existing with server {1}".format(self._target, self._host))

            if self._session:
                logging.info("Target {0}: Closing requests session.".format(self._target))
                self._session.close()
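
# Hedged usage sketch, not part of the original snippet: a per-scrape collector
# like the one above is usually registered on its own CollectorRegistry and
# rendered with generate_latest(); only those prometheus_client calls are real
# API, while the collector instance itself has to be constructed elsewhere,
# since its class name is not shown here.
from prometheus_client.core import CollectorRegistry
from prometheus_client.exposition import generate_latest

def render_scrape(collector):
    # One throw-away registry per scrape; returns the Prometheus text format as bytes.
    registry = CollectorRegistry()
    registry.register(collector)
    return generate_latest(registry)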
Beispiel #10
0
    def collect(self):

        # Export the up and response metrics
        info_metrics = GaugeMetricFamily('arista_monitoring_info','Arista Switch Monitoring',labels=self._labels)
        info_metrics.add_sample('arista_up', value=self._switch_up, labels=self._labels)
        info_metrics.add_sample('arista_response', value=self._responstime, labels=self._labels)
        
        yield info_metrics

        if self._switch_up == 1:

            logging.debug("Switch is rechable.")
            # Export the memory usage data
            mem_metrics = GaugeMetricFamily('switch_monitoring_memdata','Arista Switch Monitoring Memory Usage Data',labels=self._labels)
            mem_metrics.add_sample('arista_mem_total', value=self._memtotal, labels=self._labels)
            mem_metrics.add_sample('arista_mem_free', value=self._memfree, labels=self._labels)
            logging.debug("Exporting metrics arista_mem_total=%s", self._memtotal)
            logging.debug("Exporting metrics arista_mem_free=%s", self._memfree)
            yield mem_metrics

            # Get the tcam usage data
            switch_tcam = self.connect_switch(command="show hardware capacity")

            if switch_tcam:
                tcam_metrics = GaugeMetricFamily('switch_monitoring_data','Arista Switch Monitoring TCAM Usage Data',labels=self._labels)
                for entry in switch_tcam['result'][0]['tables']:
                    # add the chip and feature names as labels to the switch info labels
                    labels = {'table': entry['table'], 'chip': entry['chip'], 'feature': entry['feature']}
                    if entry['table'] not in self._exclude:
                        #logging.debug("Adding: table=%s value=%s labels=%s", entry['table'], entry["usedPercent"], labels)
                        labels.update(self._labels)
                        tcam_metrics.add_sample('arista_tcam', value=entry["usedPercent"], labels=labels)
                    else:
                        logging.debug("Excluding: table=%s value=%s labels=%s", entry['table'], entry["usedPercent"], labels)

                yield tcam_metrics
            

            switch_port_stats = self.connect_switch(command="show interfaces counters rates")
            regex_pattern = re.compile('.*reserved.*', re.IGNORECASE)

            if switch_port_stats:
                port_stats_metrics = GaugeMetricFamily('switch_monitoring_ports','Arista Switch Monitoring Port Statistics',labels=self._labels)
                for port_entry in switch_port_stats['result'][0]['interfaces']:
                    port_values = switch_port_stats['result'][0]['interfaces'][port_entry]
                    port_description = port_values['description'].replace("-> ","")
                    for port_value in port_values:
                        if port_value != "description" and port_value != 'interval' and not regex_pattern.match(port_description):
                            labels = {}
                            labels = ({'port': port_entry, 'stat': port_value, 'description': port_description})
                            labels.update(self._labels)
                            #logging.debug("Adding: port=%s stat=%s value=%s labels=%s", port_entry, port_value, port_values[port_value], labels)
                            port_stats_metrics.add_sample('arista_port_stats', value=float(port_values[port_value]), labels=labels)

                yield port_stats_metrics
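
# Hedged sketch, not part of the original snippet: the port loop above skips the
# 'description' and 'interval' keys and any port whose description matches
# ".*reserved.*". The same filtering as a standalone helper could look like this;
# the payload layout is taken from the traversal above, the function name is made up.
import re

RESERVED_PORTS = re.compile('.*reserved.*', re.IGNORECASE)

def port_rate_samples(eapi_response):
    # eapi_response: JSON answer of "show interfaces counters rates"
    samples = []
    for port, values in eapi_response['result'][0]['interfaces'].items():
        description = values['description'].replace('-> ', '')
        if RESERVED_PORTS.match(description):
            continue
        for stat, value in values.items():
            if stat in ('description', 'interval'):
                continue
            samples.append((port, stat, description, float(value)))
    return samples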