Example #1
    def itermetrics(self):

        payload = {'q': 'show stats'}
        url = "http://{}:{}/query".format(self.address, self.port)

        try:
            r = self.session.get(url, params=payload)
        except Exception as e:
            msg = "Got {0} when getting stats from {1}".format(e, url)
            raise base.CheckException(msg)

        if r.status_code != 200:
            msg = "Got response {0} from {0}".format(r.status_code, url)
            raise base.CheckException(msg)

        data = r.json()
        try:
            series_list = data['results'][0]['series']
        except (KeyError, IndexError):
            self.logger.error("Failed to retrieve series for InfluxDB cluster")
            return

        for serie in series_list:
            metrics_list = METRICS_BY_NAME.get(serie['name'], None)
            if not metrics_list:
                continue
            for i, metric_name in enumerate(serie['columns']):
                if metric_name in metrics_list:
                    yield {
                        'type_instance': metrics_list[metric_name][0],
                        'type': metrics_list[metric_name][1],
                        'values': [serie['values'][0][i]],
                    }
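A minimal sketch of the METRICS_BY_NAME table the loop above assumes (series and column names are illustrative, not the plugin's actual mapping): each series name maps a column name to a (type_instance, type) pair.

# Hypothetical shape of METRICS_BY_NAME:
# series name -> {column name: (type_instance, collectd type)}
METRICS_BY_NAME = {
    'httpd': {
        'pointsWrittenOK': ('points_written_ok', 'gauge'),
        'queryReq': ('query_requests', 'gauge'),
    },
    'write': {
        'writeError': ('write_errors', 'gauge'),
    },
}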
Example #2
    def itermetrics(self):
        url = "{}://{}:{}/Snh_SandeshTraceRequest?x=IFMapTraceBuf".format(
            self.protocol, self.host, self.port)
        try:
            with contextlib.closing(urllib2.urlopen(url, None, 5)) as response:
                rcode = response.getcode()
                if rcode == 200:
                    tree = ElementTree.fromstring(response.read())
                    items = [(int(exc.text.split()[0]), exc.text)
                             for exc in tree.iter('element')]
                    if len(items) > 0:
                        last_entry = sorted(items, reverse=True)[0][0]
                        now = time.time()
                        age = now - (last_entry / 1000000.0)
                        msg = "The last entry is {} seconds old."
                        self.logger.info(msg.format(age))
                        yield {'values': age, 'type': 'gauge'}
                    else:
                        msg = "No entry in IF-MAP trace buffer!"
                        raise base.CheckException(msg)
                else:
                    msg = "Unexpected code {} while contacting {}"
                    raise base.CheckException(msg.format(rcode, url))
        except urllib2.URLError as exc:
            msg = "Cannot retrieve last entry from IF-MAP trace buffer: {}!"
            raise base.CheckException(msg.format(str(exc)))
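For reference, a minimal sketch of the kind of Sandesh trace XML this check parses, where each element's text starts with a microsecond timestamp (the exact schema is an assumption, trimmed for illustration):

from xml.etree import ElementTree

sample = (
    '<SandeshTraceTextResponse><traces>'
    '<element>1458918769400705 IFMapTraceBuf: message one</element>'
    '<element>1458918770400705 IFMapTraceBuf: message two</element>'
    '</traces></SandeshTraceTextResponse>'
)
tree = ElementTree.fromstring(sample)
items = [(int(e.text.split()[0]), e.text) for e in tree.iter('element')]
assert sorted(items, reverse=True)[0][0] == 1458918770400705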
Example #3
    def itermetrics(self):
        try:
            r = self.session.get(self.url)
        except Exception as e:
            msg = "Got exception for '{}': {}".format(self.url, e)
            raise base.CheckException(msg)

        if r.status_code != 200:
            msg = "{} responded with code {}".format(
                self.url, r.status_code)
            raise base.CheckException(msg)

        data = r.json()
        self.logger.debug("Got response from Elasticsearch: '%s'" % data)

        yield {
            'type_instance': 'health',
            'values': HEALTH_MAP[data['status']]
        }

        for metric in METRICS:
            value = data.get(metric)
            if value is None:
                # Depending on the Elasticsearch version, not all metrics are
                # available
                self.logger.info("Couldn't find {} metric".format(metric))
                continue
            yield {
                'type_instance': metric,
                'values': value
            }
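A plausible sketch of the HEALTH_MAP and METRICS constants referenced above (the numeric encoding and the metric list are assumptions; the real plugin defines its own):

# Hypothetical constants: cluster color -> gauge value, plus the
# _cluster/health fields forwarded as-is when present.
HEALTH_MAP = {'green': 1, 'yellow': 2, 'red': 3}
METRICS = ['number_of_nodes', 'active_primary_shards', 'active_shards',
           'relocating_shards', 'initializing_shards', 'unassigned_shards']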
Example #4
    def itermetrics(self):
        check_errors = []
        checked = 0
        for socket_name in glob.glob(self.socket_glob):
            m = RE_OSD_ID.match(socket_name)
            if not m:
                continue

            osd_id = m.group(1)
            perf_dump = self.execute_to_json(
                'ceph --admin-daemon %s perf dump' % socket_name)
            if not perf_dump:
                check_errors.append(osd_id)
                continue

            checked += 1
            for prefix, stats in perf_dump.iteritems():
                if prefix not in self.PREFIXES or not stats:
                    continue

                for k in sorted(stats.iterkeys()):
                    yield {
                        'type': self.convert_to_collectd_type(prefix, k),
                        'type_instance': osd_id,
                        'values': self.convert_to_collectd_value(stats[k])
                    }

        if check_errors:
            raise base.CheckException(
                "Fail to run 'ceph perf dump' for OSD(s): {}".format(
                    ', '.join(check_errors)))

        if checked == 0:
            raise base.CheckException(
                'Could not find any OSD socket in {}'.format(self.socket_glob))
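A sketch of the two helpers this check leans on, written here as assumptions about their shape (the plugin defines the real RE_OSD_ID and execute_to_json):

import json
import re

# Assumed pattern: pull the OSD id out of an admin-socket path such as
# /var/run/ceph/ceph-osd.3.asok
RE_OSD_ID = re.compile(r'.*ceph-osd\.(\d+)\.asok$')

def execute_to_json(execute, cmd):
    # Assumed helper shape: run cmd, decode stdout as JSON, None on failure.
    retcode, out, err = execute(cmd)
    if retcode == 0 and out:
        return json.loads(out)
    return None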
Example #5
    def itermetrics(self):
        cmd = [DOCKER_BINARY, 'info', '-f', "{{ json .}}"]
        (retcode, out, err) = self.execute(cmd, shell=False, log_error=True)
        if retcode != 0:
            raise base.CheckException("{} : {}".format(DOCKER_BINARY, err))
        try:
            infos = json.loads(out)
        except ValueError as e:
            raise base.CheckException("{}: document: '{}'".format(e, out))
        else:
            yield {'values': infos.get('Containers', 0),
                   'plugin_instance': 'containers_total',
                   }
            yield {'values': infos.get('ContainersPaused', 0),
                   'plugin_instance': 'containers',
                   'meta': {'status': 'paused'},
                   }
            yield {'values': infos.get('ContainersRunning', 0),
                   'plugin_instance': 'containers',
                   'meta': {'status': 'running'},
                   }
            yield {'values': infos.get('ContainersStopped', 0),
                   'plugin_instance': 'containers',
                   'meta': {'status': 'stopped'},
                   }
            yield {'values': infos.get('Images', 0),
                   'plugin_instance': 'images',
                   }
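As a usage sketch, the base class assumed by all of these examples would drain the generator and dispatch each point to collectd; standalone, the points can be inspected like this (`check` stands for any plugin instance above):

# Hypothetical standalone consumption of a check's generator.
for point in check.itermetrics():
    print(point.get('plugin_instance') or point.get('type_instance', ''),
          point['values'])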
Example #6
    def itermetrics(self):
        stats = {}
        try:
            r = self.session.get(self.api_overview_url, timeout=self.timeout)
            overview = r.json()
        except Exception as e:
            msg = "Got exception for '{}': {}".format(self.api_overview_url, e)
            raise base.CheckException(msg)

        if r.status_code != 200:
            msg = "{} responded with code {}".format(self.api_overview_url,
                                                     r.status_code)
            raise base.CheckException(msg)

        objects = overview.get('object_totals', {})
        stats['queues'] = objects.get('queues', 0)
        stats['consumers'] = objects.get('consumers', 0)
        stats['connections'] = objects.get('connections', 0)
        stats['exchanges'] = objects.get('exchanges', 0)
        stats['channels'] = objects.get('channels', 0)
        stats['messages'] = overview.get('queue_totals', {}).get('messages', 0)
        stats['running_nodes'] = len(overview.get('contexts', []))

        for k, v in stats.iteritems():
            yield {'type_instance': k, 'values': v}

        stats = {}
        nodename = overview['node']
        try:
            r = self.session.get("{}/{}".format(self.api_nodes_url, nodename),
                                 timeout=self.timeout)
            node = r.json()
        except Exception as e:
            msg = "Got exception for '{}': {}".format(self.api_nodes_url, e)
            raise base.CheckException(msg)

        if r.status_code != 200:
            msg = "{} responded with code {}".format(self.api_nodes_url,
                                                     r.status_code)
            self.logger.error(msg)
            raise base.CheckException(msg)

        stats['disk_free_limit'] = node['disk_free_limit']
        stats['disk_free'] = node['disk_free']
        stats['remaining_disk'] = node['disk_free'] - node['disk_free_limit']

        stats['used_memory'] = node['mem_used']
        stats['vm_memory_limit'] = node['mem_limit']
        stats['remaining_memory'] = node['mem_limit'] - node['mem_used']

        for k, v in stats.iteritems():
            yield {'type_instance': k, 'values': v}
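The method above assumes a requests session and two RabbitMQ management-API endpoints; a sketch of that wiring (host, port, and credentials are illustrative):

import requests

class RabbitMQCheck(object):
    # Hypothetical constructor for the itermetrics() shown above.
    def __init__(self, host='127.0.0.1', port=15672, timeout=5):
        self.session = requests.Session()
        self.session.auth = ('guest', 'guest')  # assumed credentials
        self.timeout = timeout
        base_url = 'http://{}:{}/api'.format(host, port)
        self.api_overview_url = '{}/overview'.format(base_url)
        self.api_nodes_url = '{}/nodes'.format(base_url)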
Example #7
    def query_api(self, resource):
        url = "{}{}".format(self.url, resource)
        try:
            r = self.session.get(url, timeout=self.timeout)
        except Exception as e:
            msg = "Got exception for '{}': {}".format(url, e)
            raise base.CheckException(msg)

        if r.status_code != 200:
            msg = "{} responded with code {}".format(url, r.status_code)
            raise base.CheckException(msg)

        return r.json()
Example #8
    def itermetrics(self):
        for name, url in self.urls.items():
            self.logger.debug("Requesting {} URL {}".format(name, url))
            try:
                r = self.session.get(url, timeout=self.timeout)
            except Exception as e:
                msg = "Got exception for '{}': {}".format(name, e)
                raise base.CheckException(msg)
            else:
                if r.status_code != 200:
                    self.logger.error(
                        "{} ({}) responded with code {}".format(
                            name, url, r.status_code))
                    yield {'type_instance': name, 'values': self.FAIL}
                else:
                    try:
                        self.logger.debug(
                            "Got response from {}: '{}'".format(url, r.text))
                        px = xml.dom.minidom.parseString(r.text)
                        itemlist = px.getElementsByTagName(
                            self.xml_element[name])
                        if name not in self.result_type:
                            count = 0
                            # Expected state is looked up per check name
                            state = self.state.get(name)
                            for i in itemlist:
                                if state is None or check_state(i, state):
                                    count += 1
                            self.logger.debug(
                                "Got count for {}: '{}'".format(name, count))
                            yield {'type_instance': name, 'values': count}
                        else:
                            rval = itemlist[0].getElementsByTagName(
                                self.result_type[name]
                            )[0].childNodes[0].toxml()
                            self.logger.debug(
                                "Got val for {}: '{}'".format(name, rval))
                            yield {'type_instance': name, 'values': rval}
                    except Exception as e:
                        msg = ("Got exception while parsing "
                               "response for '{}': {}").format(name, e)
                        raise base.CheckException(msg)
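A guess at the check_state helper used by the counting branch above (the signature and the <state> element name are assumptions):

def check_state(node, state):
    # Hypothetical helper: true when the DOM node's <state> text matches.
    elems = node.getElementsByTagName('state')
    return bool(elems) and elems[0].childNodes[0].toxml() == state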
Example #9
        def get():
            try:
                r = self.session.get(url, timeout=self.timeout)
                data = r.json()
            except Exception as e:
                self.logger.warning("Got exception for '{}': {}".format(
                    url, e)
                )
                raise base.CheckException('Failed to get {}'.format(url))

            else:
                if r.status_code != 200:
                    msg = ("{} responded with code {} "
                           "while 200 is expected").format(url, r.status_code)
                    self.logger.warning(msg)
                    raise base.CheckException(msg)
            return data.get('items', [])
Example #10
    def _run_birdcl_command(self, sockf, args):
        cmd = [
            BIRDCL_BINARY,
            '-s',
            sockf
        ] + args
        retcode, out, err = self.execute(cmd, shell=False)
        if retcode == 0:
            return out
        msg = "Failed to execute {} '{}'".format(cmd, err)
        raise base.CheckException(msg)
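A usage sketch for the helper above (socket path and arguments are illustrative):

# Hypothetical usage; `check` is an instance of the plugin class above.
out = check._run_birdcl_command('/var/run/bird.ctl', ['show', 'protocols'])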
Example #11
    def itermetrics(self):
        if "all" in self.bonds or len(self.bonds) == 0:
            try:
                self.bonds = os.listdir(self.bond_dir)
            except OSError as e:
                msg = "Error listing all bonds in {}".format(self.bond_dir)
                raise base.CheckException(msg)

        for bond in self.bonds:
            try:
                with open(self.bond_dir + bond, 'r') as fp:
                    bond_info = fp.readlines()
            except IOError as e:
                msg = "Error reading bond info for {}".format(bond)
                self.logger.error(msg)
                continue

            links_total = 0
            links_down = 0
            skip_first_mii_status = True
            for line in bond_info:
                if line.startswith("MII Status:"):
                    # First occurrence of "MII Status" is for the bond as a
                    # whole, but we only want individual links.
                    if skip_first_mii_status:
                        skip_first_mii_status = False
                        continue

                    status = line[12:].strip()  # readlines() keeps '\n'
                    if status == "down":
                        links_down += 1
                    links_total += 1

            yield {
                'type': 'links',
                'type_instance': 'total',
                'values': [links_total],
                'meta': {
                    'interface': bond
                }
            }
            yield {
                'type': 'links',
                'type_instance': 'down',
                'values': [links_down],
                'meta': {
                    'interface': bond
                }
            }
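For context, an abridged illustration of the /proc/net/bonding/<bond> content being parsed; the first "MII Status" line describes the bond as a whole, the following ones the individual slave links:

MII Status: up
Slave Interface: eth0
MII Status: up
Slave Interface: eth1
MII Status: down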
Example #12
    def itermetrics(self):
        status = self.execute_to_json('ceph -s --format json')
        if not status:
            raise base.CheckException("Fail to execute 'ceph -s'")

        yield {
            'type': 'health',
            'values': HEALTH_MAP[status['health']['overall_status']],
        }

        if 'mons' in status['monmap']:
            monitor_nb = len(status['monmap']['mons'])
        else:
            monitor_nb = 0
        yield {
            'type': 'monitor_count',
            'values': monitor_nb
        }

        yield {
            'type': 'quorum_count',
            'values': len(status.get('quorum', []))
        }

        pgmap = status['pgmap']
        yield {
            'type': 'pg_bytes',
            'values': [pgmap['bytes_used'], pgmap['bytes_avail'],
                       pgmap['bytes_total']],
        }
        yield {
            'type': 'pg_data_bytes',
            'values': pgmap['data_bytes']
        }
        yield {
            'type': 'pg_count',
            'values': pgmap['num_pgs']
        }

        for state in pgmap['pgs_by_state']:
            yield {
                'type': 'pg_state_count',
                'type_instance': state['state_name'],
                'values': state['count']
            }
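A plausible HEALTH_MAP for the health metric above (the numeric encoding is an assumption):

# Hypothetical mapping from ceph's overall_status to a gauge value.
HEALTH_MAP = {'HEALTH_OK': 1, 'HEALTH_WARN': 2, 'HEALTH_ERR': 3}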
Example #13
    def itermetrics(self):
        osd_stats = self.execute_to_json('ceph pg dump osds --format json')
        if not osd_stats:
            raise base.CheckException("Fail to execute 'pg dump osds'")

        for osd in osd_stats:
            osd_id = osd['osd']

            yield {
                'type_instance': osd_id,
                'type': 'osd_space',
                'values': [osd['kb_used'] * 1000, osd['kb'] * 1000],
            }

            yield {
                'type_instance': osd_id,
                'type': 'osd_latency',
                'values': [osd['fs_perf_stat']['apply_latency_ms'],
                           osd['fs_perf_stat']['commit_latency_ms']],
            }
Example #14
    def itermetrics(self):
        # Collect peers' metrics
        retcode, out, err = self.execute([GLUSTER_BINARY, 'peer', 'status'],
                                         shell=False)
        if retcode != 0:
            raise base.CheckException("Failed to execute 'gluster peer'")

        total = 0
        total_by_state = {
            'up': 0,
            'down': 0
        }

        for line in out.split('\n\n'):
            peer_m = peer_re.search(line)
            state_m = state_re.search(line)
            if peer_m and state_m:
                total += 1
                if state_m.group('state') == 'Peer in Cluster (Connected)':
                    v = 1
                    total_by_state['up'] += 1
                else:
                    v = 0
                    total_by_state['down'] += 1
                yield {
                    'type_instance': 'peer_state',
                    'values': v,
                    'meta': {
                        'peer': peer_m.group('peer')
                    }
                }

        for state, count in total_by_state.items():
            yield {
                'type_instance': 'peers_count',
                'values': count,
                'meta': {
                    'state': state
                }
            }
            yield {
                'type_instance': 'peers_percent',
                'values': (100.0 * count / total) if total else 0.0,
                'meta': {
                    'state': state
                }
            }

        # Collect volumes' metrics
        cmd = [GLUSTER_BINARY, 'volume', 'status', 'all', 'detail']
        retcode, out, err = self.execute(cmd, shell=False, log_error=False)
        if retcode != 0:
            if err and vol_status_transaction_in_progress_re.match(err):
                # "transaction already in progress" error, we assume volumes
                # metrics are being collected on another glusterfs node, and
                # just silently skip the collecting of the volume metrics
                # this time
                self.logger.info("Command '%s' failed because of a "
                                 "transaction is already in progress, "
                                 "ignore the error" % cmd)
            else:
                self.logger.error("Command '%s' failed: %s" % (cmd, err))
                raise base.CheckException("Failed to execute 'gluster volume'")
        else:
            for vol_block in vol_status_re.split(out):
                volume_m = volume_re.search(vol_block)
                if not volume_m:
                    continue
                volume = volume_m.group('volume')
                for line in vol_block_re.split(vol_block):
                    peer_m = brick_server_re.search(line)
                    if not peer_m:
                        continue
                    peer = peer_m.group('peer')
                    disk_free_m = disk_free_re.search(line)
                    disk_total_m = disk_total_re.search(line)
                    inode_free_m = inode_free_re.search(line)
                    inode_count_m = inode_count_re.search(line)
                    if disk_free_m and disk_total_m:
                        free = convert_to_bytes(
                            disk_free_m.group('disk_free'),
                            disk_free_m.group('unit'))
                        total = convert_to_bytes(
                            disk_total_m.group('disk_total'),
                            disk_total_m.group('unit'))
                        used = total - free
                        yield {
                            'type_instance': 'space_free',
                            'values': free,
                            'meta': {
                                'volume': volume,
                                'peer': peer,
                            }
                        }
                        yield {
                            'type_instance': 'space_percent_free',
                            'values': free * 100.0 / total,
                            'meta': {
                                'volume': volume,
                                'peer': peer,
                            }
                        }
                        yield {
                            'type_instance': 'space_used',
                            'values': used,
                            'meta': {
                                'volume': volume,
                                'peer': peer,
                            }
                        }
                        yield {
                            'type_instance': 'space_percent_used',
                            'values': used * 100.0 / total,
                            'meta': {
                                'volume': volume,
                                'peer': peer,
                            }
                        }
                    if inode_free_m and inode_count_m:
                        free = int(inode_free_m.group('inode_free'))
                        total = int(inode_count_m.group('inode_count'))
                        used = total - free
                        yield {
                            'type_instance': 'inodes_free',
                            'values': free,
                            'meta': {
                                'volume': volume,
                                'peer': peer,
                            }
                        }
                        yield {
                            'type_instance': 'inodes_percent_free',
                            'values': free * 100.0 / total,
                            'meta': {
                                'volume': volume,
                                'peer': peer,
                            }
                        }
                        yield {
                            'type_instance': 'inodes_used',
                            'values': used,
                            'meta': {
                                'volume': volume,
                                'peer': peer,
                            }
                        }
                        yield {
                            'type_instance': 'inodes_percent_used',
                            'values': used * 100.0 / total,
                            'meta': {
                                'volume': volume,
                                'peer': peer,
                            }
                        }
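A sketch of convert_to_bytes and one of the regexes used above, as assumptions about their shape (the plugin defines the real ones):

import re

# Assumed shape of one pattern matched against 'gluster volume status' output.
disk_free_re = re.compile(
    r'^Disk Space Free\s*:\s*(?P<disk_free>[\d.]+)(?P<unit>\w+)', re.M)

def convert_to_bytes(value, unit):
    # Assumed helper: scale a '12.5' + 'GB' reading to bytes.
    factors = {'KB': 1024, 'MB': 1024 ** 2, 'GB': 1024 ** 3, 'TB': 1024 ** 4}
    return float(value) * factors.get(unit.upper(), 1)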
Example #15
    def itermetrics(self):
        haproxy = HAProxySocket(self.socket)

        # Collect server statistics
        if 'server' in self.proxy_monitors:
            try:
                stats = haproxy.get_server_info()
            except socket.error:
                msg = "Unable to connect to HAProxy socket at {}".format(
                    self.socket)
                raise base.CheckException(msg)
            else:
                for k, v in stats.iteritems():
                    if k not in SERVER_METRICS:
                        continue
                    type_instance = SERVER_METRICS[k][0]
                    type_ = SERVER_METRICS[k][1]
                    yield {
                        'type_instance': type_instance,
                        'type': type_,
                        'values': int(v),
                    }

        try:
            stats = haproxy.get_server_stats()
        except socket.error:
            msg = "Unable to connect to HAProxy socket at {}".format(
                self.socket)
            raise base.CheckException(msg)

        def match(x):
            if x['pxname'] in self.proxy_ignore:
                return False
            return (x['svname'].lower() in self.proxy_monitors
                    or x['pxname'].lower() in self.proxy_monitors
                    or ('backend_server' in self.proxy_monitors
                        and x['type'] == BACKEND_SERVER_TYPE))

        stats = filter(match, stats)
        for stat in stats:
            stat['pxname'] = self.get_proxy_name(stat['pxname'])

        # Collect statistics for the frontends and the backends
        for stat in itertools.ifilter(
                lambda x: x['type'] == FRONTEND_TYPE or x['type'] ==
                BACKEND_TYPE, stats):
            if stat['type'] == FRONTEND_TYPE:
                metrics = FRONTEND_METRIC_TYPES
                side = 'frontend'
            else:
                metrics = BACKEND_METRIC_TYPES
                side = 'backend'
            for k, metric in metrics.iteritems():
                if k not in stat:
                    self.logger.warning("Can't find {} metric".format(k))
                    continue
                value = stat[k]

                metric_name = '{}_{}'.format(side, metric[0])
                meta = {side: stat['pxname']}

                if metric[0] == 'status':
                    value = STATUS_MAP[value]
                else:
                    value = int(value) if value else 0

                yield {
                    'type_instance': metric_name,
                    'type': metric[1],
                    'values': value,
                    'meta': meta
                }

        # Count the number of servers per backend and state
        backend_server_states = {}
        for stat in itertools.ifilter(
                lambda x: x['type'] == BACKEND_SERVER_TYPE, stats):
            pxname = stat['pxname']
            if pxname not in backend_server_states:
                backend_server_states[pxname] = defaultdict(int)

            # The status field for a server has the following syntax when a
            # transition occurs with HAproxy >=1.6: "DOWN 17/30" or "UP 1/3".
            status = stat['status'].split(' ')[0]

            # We only pick up the UP and DOWN status while it can be one of
            # NOLB/MAINT/MAINT(via)...
            if status in STATUS_MAP:
                backend_server_states[pxname][status] += 1
                backend_server_states[pxname]['_count'] += 1
                # Emit metric for the backend server
                yield {
                    'type_instance': 'backend_server',
                    'values': STATUS_MAP[status],
                    'meta': {
                        'backend': pxname,
                        'state': status.lower(),
                        'server': stat['svname'],
                    }
                }

        for pxname, states in backend_server_states.iteritems():
            for s in STATUS_MAP.keys():
                val = states.get(s, 0)
                yield {
                    'type_instance': 'backend_servers',
                    'values': val,
                    'meta': {
                        'backend': pxname,
                        'state': s.lower()
                    }
                }

                if backend_server_states[pxname]['_count'] == 0:
                    prct = 0
                else:
                    prct = (100.0 * val) / \
                        backend_server_states[pxname]['_count']
                yield {
                    'type_instance': 'backend_servers_percent',
                    'values': prct,
                    'meta': {
                        'backend': pxname,
                        'state': s.lower()
                    }
                }
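A plausible sketch of the lookup tables both HAProxy examples assume (the type codes follow HAProxy's CSV stats convention; the metric tables are trimmed illustrations):

# Hypothetical constants for the HAProxy checks.
FRONTEND_TYPE, BACKEND_TYPE, BACKEND_SERVER_TYPE = 0, 1, 2
STATUS_MAP = {'DOWN': 0, 'UP': 1}
SERVER_METRICS = {
    'CurrConns': ('connections', 'gauge'),
    'ConnRate': ('connection_rate', 'gauge'),
}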
Example #16
    def itermetrics(self):
        def str_to_bool(v):
            return str(v).lower() == 'true'

        def str_to_boolint(v):
            if str_to_bool(v):
                return 1
            else:
                return 0

        def shorten_hostname(v):
            return v.split('.')[0]

        def same_hostname(v):
            if v is not None and v.get('name') == self.hostname:
                return 1
            return 0

        retcode, out, err = self.execute(
            [self.crm_mon_binary, '--as-xml', '-r', '-f'], shell=False)
        if not out:
            raise base.CheckException(
                "Failed to execute crm_mon '{}'".format(err))

        try:
            root = ET.fromstring(out)
        except ET.ParseError:
            raise base.CheckException("Failed to parse XML '{}'".format(
                out[:64]))

        if self.notify_resource:
            # Notify the other collectd plugins whether the resource runs
            # locally or not
            node = root.find('resources/resource[@id="{}"]/node'.format(
                self.notify_resource))
            self.collectd.Notification(
                type='gauge',
                message='{{"resource":"{}","value":{}}}'.format(
                    self.notify_resource, same_hostname(node)),
                severity=self.collectd.NOTIF_OKAY).dispatch()
            # The metric needs to be emitted too for the Lua plugins executed
            # by the metric_collector service
            yield {
                'type_instance': 'local_resource_active',
                'values': same_hostname(node),
                'meta': {
                    'resource': self.notify_resource
                }
            }

        summary = root.find('summary')
        current_dc = summary.find('current_dc')
        # The metric needs to be emitted for the alarms that leverage the other
        # metrics emitted by the plugin
        yield {
            'type_instance': 'local_dc_active',
            'values': same_hostname(current_dc),
        }

        if current_dc.get('name') != self.hostname:
            # The other metrics are only collected from the cluster's DC
            return

        # Report global cluster metrics
        yield {
            'type_instance': 'dc',
            'values': str_to_boolint(current_dc.get('present', 'false'))
        }

        yield {
            'type_instance': 'quorum_status',
            'values': str_to_boolint(current_dc.get('with_quorum', 'false'))
        }
        yield {
            'type_instance': 'configured_nodes',
            'values': int(summary.find('nodes_configured').get('number'))
        }
        yield {
            'type_instance': 'configured_resources',
            'values': int(summary.find('resources_configured').get('number'))
        }

        # Report node status metrics
        cluster_nodes = []
        aggregated_nodes_status = {'online': 0, 'offline': 0, 'maintenance': 0}
        nodes_total = 0
        for node in root.find('nodes').iter('node'):
            nodes_total += 1
            hostname = shorten_hostname(node.get('name'))
            cluster_nodes.append(node.get('name'))
            if str_to_bool(node.get('online')):
                if str_to_bool(node.get('maintenance')):
                    aggregated_nodes_status['maintenance'] += 1
                    yield {
                        'type_instance': 'node_status',
                        'values': MAINTENANCE_STATUS,
                        'meta': {
                            'status': 'maintenance',
                            'host': hostname
                        }
                    }
                else:
                    aggregated_nodes_status['online'] += 1
                    yield {
                        'type_instance': 'node_status',
                        'values': ONLINE_STATUS,
                        'meta': {
                            'status': 'online',
                            'host': hostname
                        }
                    }
            else:
                aggregated_nodes_status['offline'] += 1
                yield {
                    'type_instance': 'node_status',
                    'values': OFFLINE_STATUS,
                    'meta': {
                        'status': 'offline',
                        'host': hostname
                    }
                }

        for status, cnt in aggregated_nodes_status.items():
            yield {
                'type_instance': 'nodes_count',
                'values': cnt,
                'meta': {
                    'status': status
                }
            }
            yield {
                'type_instance': 'nodes_percent',
                'values': 100.0 * cnt / nodes_total,
                'meta': {
                    'status': status
                }
            }

        # Report the number of resources per status
        # Clone resources can run on multiple nodes while "simple" resources
        # run on only one node at a time
        aggregated_resources = defaultdict(Counter)
        resources = root.find('resources')
        for resource_id, resource_name in self.resources.iteritems():
            resource_elts = []
            simple_resource = None
            clone_resource = resources.find(
                'clone/resource[@id="{}"]/..'.format(resource_id))
            # Compare against None: ElementTree elements without children
            # evaluate as False
            if clone_resource is None:
                simple_resource = resources.find(
                    'resource[@id="{}"]'.format(resource_id))
                if simple_resource is not None:
                    resource_elts = [simple_resource]
            else:
                resource_elts = clone_resource.findall('resource')

            if not resource_elts:
                self.logger.error("{}: Couldn't find resource '{}'".format(
                    self.plugin, resource_id))
                continue

            total = 0
            for item in resource_elts:
                total += 1
                if (item.get('role') in ('Slave', 'Master')
                        and not str_to_bool(item.get('failed'))):
                    # Multi-master resource
                    aggregated_resources[resource_name]['up'] += 1
                elif item.get('role') == 'Started':
                    aggregated_resources[resource_name]['up'] += 1
                else:
                    aggregated_resources[resource_name]['down'] += 1

            if simple_resource is not None:
                # Report on which node the "simple" resource is running
                for node in cluster_nodes:
                    yield {
                        'type_instance':
                        'local_resource_active',
                        'values':
                        str_to_boolint(
                            node == simple_resource.find('node').get('name')),
                        'meta': {
                            'resource': resource_name,
                            'host': shorten_hostname(node)
                        }
                    }

            for status in ('up', 'down'):
                cnt = aggregated_resources[resource_name][status]
                yield {
                    'type_instance': 'resource_count',
                    'values': cnt,
                    'meta': {
                        'status': status,
                        'resource': resource_name
                    }
                }
                yield {
                    'type_instance': 'resource_percent',
                    'values': 100.0 * cnt / total,
                    'meta': {
                        'status': status,
                        'resource': resource_name
                    }
                }

        # Collect operations' history metrics for the monitored resources
        #
        # The reported count for the resource's operations is an approximate
        # value because crm_mon doesn't provide the exact number. To estimate
        # the number of operations applied to a resource, the plugin keeps a
        # copy of call_ids and compares it with the current value.
        for node in root.find('node_history').iter('node'):
            hostname = shorten_hostname(node.get('name'))
            if hostname not in self.history:
                self.history[hostname] = {}

            for resource_id, resource_name in self.resources.iteritems():
                if resource_id not in self.history[hostname]:
                    self.history[hostname][resource_id] = {
                        'fail_count': 0,
                        'ops_count': 0,
                        'call_ids': Set([])
                    }
                v = self.history[hostname][resource_id]

                res_history = node.find(
                    'resource_history[@id="{}"]'.format(resource_id))
                if res_history is not None:
                    # For simple resources, the resource_history element only
                    # exists for the node that runs the resource
                    v['fail_count'] += int(res_history.get('fail-count', 0))
                    call_ids = Set([
                        i.get('call')
                        for i in res_history.findall('operation_history')
                    ])
                    if call_ids:
                        v['ops_count'] += len(call_ids - v['call_ids'])
                        v['call_ids'] = call_ids

                yield {
                    'type_instance': 'resource_failures',
                    'values': v['fail_count'],
                    'meta': {
                        'resource': resource_name,
                        'host': hostname
                    }
                }
                yield {
                    'type_instance': 'resource_operations',
                    'values': v['ops_count'],
                    'meta': {
                        'resource': resource_name,
                        'host': hostname
                    }
                }
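A plausible definition of the node-status constants referenced above (numeric values are assumed):

# Hypothetical encodings for the node_status gauge.
OFFLINE_STATUS = 0
ONLINE_STATUS = 1
MAINTENANCE_STATUS = 2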
Example #17
    def itermetrics(self):
        df = self.execute_to_json('ceph df --format json')
        if not df:
            raise base.CheckException("Fail to run 'ceph df'")

        objects_count = 0
        for pool in df['pools']:
            objects_count += pool['stats'].get('objects', 0)
            for m in ('bytes_used', 'max_avail', 'objects'):
                yield {
                    'type': 'pool_%s' % m,
                    'type_instance': pool['name'],
                    'values': pool['stats'].get(m, 0),
                }

        yield {'type': 'objects_count', 'values': objects_count}
        yield {'type': 'pool_count', 'values': len(df['pools'])}

        if 'total_bytes' in df['stats']:
            # compatibility with 0.84+
            total = df['stats']['total_bytes']
            used = df['stats']['total_used_bytes']
            avail = df['stats']['total_avail_bytes']
        else:
            # compatibility with <0.84
            total = df['stats']['total_space'] * 1024
            used = df['stats']['total_used'] * 1024
            avail = df['stats']['total_avail'] * 1024

        yield {'type': 'pool_total_bytes', 'values': [used, avail, total]}
        yield {
            'type': 'pool_total_percent',
            'values': [100.0 * used / total, 100.0 * avail / total]
        }

        stats = self.execute_to_json('ceph osd pool stats --format json')
        if not stats:
            raise base.CheckException("Fail to run 'ceph osd pool stats'")

        for pool in stats:
            client_io_rate = pool.get('client_io_rate', {})
            yield {
                'type':
                'pool_bytes_rate',
                'type_instance':
                pool['pool_name'],
                'values': [
                    client_io_rate.get('read_bytes_sec', 0),
                    client_io_rate.get('write_bytes_sec', 0)
                ]
            }
            yield {
                'type': 'pool_ops_rate',
                'type_instance': pool['pool_name'],
                'values': client_io_rate.get('op_per_sec', 0)
            }

        osd_dump = self.execute_to_json('ceph osd dump --format json')
        if not osd_dump:
            raise base.CheckException("Fail to run 'ceph osd dump'")

        for pool in osd_dump['pools']:
            for name in ('size', 'pg_num', 'pg_placement_num'):
                yield {
                    'type': 'pool_%s' % name,
                    'type_instance': pool['pool_name'],
                    'values': pool[name]
                }

        _up, _down, _in, _out = (0, 0, 0, 0)
        for osd in osd_dump['osds']:
            if osd['up'] == 1:
                _up += 1
            else:
                _down += 1
            if osd['in'] == 1:
                _in += 1
            else:
                _out += 1

        yield {'type': 'osd_count', 'values': [_up, _down, _in, _out]}
Example #18
    def itermetrics(self):
        haproxy = HAProxySocket(self.socket)

        # Collect server statistics
        if 'server' in self.proxy_monitors:
            try:
                stats = haproxy.get_server_info()
            except socket.error:
                msg = "Unable to connect to HAProxy socket at {}".format(
                    self.socket)
                raise base.CheckException(msg)
            else:
                for k, v in stats.iteritems():
                    if k not in SERVER_METRICS:
                        continue
                    type_instance = SERVER_METRICS[k][0]
                    type_ = SERVER_METRICS[k][1]
                    yield {
                        'type_instance': type_instance,
                        'type': type_,
                        'values': int(v),
                    }

        try:
            stats = haproxy.get_server_stats()
        except socket.error:
            msg = "Unable to connect to HAProxy socket at {}".format(
                self.socket)
            raise base.CheckException(msg)

        def match(x):
            if x['pxname'] in self.proxy_ignore:
                return False
            return (x['svname'].lower() in self.proxy_monitors
                    or x['pxname'].lower() in self.proxy_monitors
                    or ('backend_server' in self.proxy_monitors
                        and x['type'] == BACKEND_SERVER_TYPE))

        stats = filter(match, stats)
        for stat in stats:
            stat['pxname'] = self.get_proxy_name(stat['pxname'])

        # Collect statistics for the frontends and the backends
        for stat in itertools.ifilter(
                lambda x: x['type'] == FRONTEND_TYPE or x['type'] ==
                BACKEND_TYPE, stats):
            if stat['type'] == FRONTEND_TYPE:
                metrics = FRONTEND_METRIC_TYPES
                side = 'frontend'
            else:
                metrics = BACKEND_METRIC_TYPES
                side = 'backend'
            for k, metric in metrics.iteritems():
                if k not in stat:
                    self.logger.warning("Can't find {} metric".format(k))
                    continue
                value = stat[k]

                metric_name = '{}_{}'.format(side, metric[0])
                meta = {side: stat['pxname']}

                if metric[0] == 'status':
                    value = STATUS_MAP[value]
                else:
                    value = int(value) if value else 0

                yield {
                    'type_instance': metric_name,
                    'type': metric[1],
                    'values': value,
                    'meta': meta
                }

        # Count the number of servers per backend and state
        backend_server_states = {}
        for stat in itertools.ifilter(
                lambda x: x['type'] == BACKEND_SERVER_TYPE, stats):
            pxname = stat['pxname']
            if pxname not in backend_server_states:
                backend_server_states[pxname] = defaultdict(int)
            backend_server_states[pxname][stat['status']] += 1
            # Emit metric for the backend server
            yield {
                'type_instance': 'backend_server',
                'values': STATUS_MAP[stat['status']],
                'meta': {
                    'backend': pxname,
                    'state': stat['status'].lower(),
                    'server': stat['svname'],
                }
            }

        for pxname, states in backend_server_states.iteritems():
            for s in STATUS_MAP.keys():
                yield {
                    'type_instance': 'backend_servers',
                    'values': states.get(s, 0),
                    'meta': {
                        'backend': pxname,
                        'state': s.lower()
                    }
                }
Example #19
    def itermetrics(self):
        if self.url:
            self.logger.debug("Requesting URL {}".format(
                self.url)
            )
            try:
                r = self.session.get(self.url, timeout=self.timeout)
            except Exception as e:
                msg = "Got exception for '{}': {}".format(self.url, e)
                raise base.CheckException(msg)

            if r.status_code != 200:
                self.logger.error(
                    ("{} responded with code {} "
                     "").format(self.url,
                                r.status_code))
                raise base.CheckException(
                    "Failed to gather Calico Felix metrics ({})".format(
                        r.status_code
                    )
                )
            self.logger.debug(
                "Got response from {}: '{}'"
                "".format(self.url, r.text))
            # Example payload:
            # # HELP felix_active_local_endpoints Number
            # # of active endpoints on this host.
            # # TYPE felix_active_local_endpoints gauge
            # felix_active_local_endpoints 1
            # # HELP felix_iptables_chains Number of active iptables chains.
            # # TYPE felix_iptables_chains gauge
            # felix_iptables_chains{ip_version="4",table="filter"} 14
            # felix_iptables_chains{ip_version="4",table="nat"} 6
            # felix_iptables_chains{ip_version="4",table="raw"} 6
            # felix_iptables_chains{ip_version="6",table="filter"} 14
            # felix_iptables_chains{ip_version="6",table="nat"} 6
            # felix_iptables_chains{ip_version="6",table="raw"} 6
            # # HELP go_goroutines Number of goroutines that currently exist.
            # # TYPE go_goroutines gauge
            # go_goroutines 39
            for l in r.text.split('\n'):
                # Line is empty or is a comment
                if not l or l.startswith('#'):
                    continue

                (name, rval) = l.split()
                self.logger.debug(
                    "Got val for {}: '{}'".format(name, rval))
                # For some metrics, remove the existing felix prefix
                # to ensure homogeneity
                if name.startswith('felix_'):
                    name = name.replace('felix_', '', 1)
                # Initialization of returned metric
                ret_metric = {
                    'values': rval
                }
                # Metric can have implicit dimensions. For example:
                # felix_iptables_rules{ip_version="4",table="filter"}
                m = re.search(
                    '^(?P<name>[^{]+)(?:{(?P<dimensions>.[^}]+)})?$',
                    name)
                if not m:
                    self.logger.error(
                        "Error parsing metric name {}".format(
                            name))
                    continue

                if m.group('dimensions'):
                    name = m.group('name')
                    meta = {}
                    for d in m.group('dimensions').split(','):
                        (k, v) = d.split('=')
                        meta[k] = v.strip('"')
                    if len(meta) > 0:
                        ret_metric['meta'] = meta
                ret_metric['type_instance'] = m.group('name')
                yield ret_metric
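A quick, self-contained demonstration of the dimension-splitting regex above on a sample exposition line (the input is illustrative):

import re

line = 'iptables_chains{ip_version="4",table="filter"}'
m = re.search(r'^(?P<name>[^{]+)(?:{(?P<dimensions>.[^}]+)})?$', line)
meta = dict((k, v.strip('"'))
            for k, v in (d.split('=')
                         for d in m.group('dimensions').split(',')))
assert m.group('name') == 'iptables_chains'
assert meta == {'ip_version': '4', 'table': 'filter'}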