def itermetrics(self):
    """Yield the InfluxDB statistics returned by the 'show stats' query.

    The query is sent to ``http://<address>:<port>/query`` through
    ``self.session``. Only the series whose name has an entry in
    METRICS_BY_NAME are reported and, for each of them, only the columns
    listed in that entry.

    Yields:
        dict: 'type_instance' and 'type' come from the METRICS_BY_NAME
        entry, 'values' is a one-element list holding the column's value
        taken from the first row of the series.

    Raises:
        base.CheckException: if the request fails or if the server doesn't
            answer with a 200 status code.
    """
    payload = {'q': 'show stats'}
    url = "http://{}:{}/query".format(self.address, self.port)

    try:
        r = self.session.get(url, params=payload)
    except Exception as e:
        msg = "Got {0} when getting stats from {1}".format(e, url)
        raise base.CheckException(msg)

    if r.status_code != 200:
        # {1} (not {0} twice) so that the URL shows up in the message
        msg = "Got response {0} from {1}".format(r.status_code, url)
        raise base.CheckException(msg)

    data = r.json()
    try:
        series_list = data['results'][0]['series']
    except (KeyError, IndexError, TypeError):
        # The document doesn't have the expected layout: nothing to report
        self.logger.error("Failed to retrieve series for InfluxDB cluster")
        return

    for serie in series_list:
        metrics_list = METRICS_BY_NAME.get(serie['name'])
        if not metrics_list:
            continue

        for i, metric_name in enumerate(serie['columns']):
            if metric_name in metrics_list:
                yield {
                    'type_instance': metrics_list[metric_name][0],
                    'type': metrics_list[metric_name][1],
                    'values': [serie['values'][0][i]],
                }
def itermetrics(self):
    """Yield the age of the most recent entry of the IF-MAP trace buffer.

    The trace buffer is fetched from the Snh_SandeshTraceRequest URL built
    from ``self.protocol``, ``self.host`` and ``self.port`` (5 seconds
    timeout). The first word of every 'element' node is read as an integer
    timestamp; it is divided by 1e6 before being subtracted from
    ``time.time()``, so it is presumably expressed in microseconds.

    Yields:
        dict: a single 'gauge' metric whose value is the age in seconds.

    Raises:
        base.CheckException: if the URL can't be opened, if the response
            code isn't 200 or if the trace buffer has no entry.
    """
    url = "{}://{}:{}/Snh_SandeshTraceRequest?x=IFMapTraceBuf".format(
        self.protocol, self.host, self.port)
    try:
        with contextlib.closing(urllib2.urlopen(url, None, 5)) as response:
            rcode = response.getcode()
            if rcode != 200:
                msg = "Unexpected code {} while contacting {}"
                raise base.CheckException(msg.format(rcode, url))

            tree = ElementTree.fromstring(response.read())
            items = []
            for element in tree.iter('element'):
                text = element.text
                items.append((int(text.split()[0]), text))

            if not items:
                msg = "No entry in IF-MAP trace buffer!"
                raise base.CheckException(msg)

            # The greatest timestamp comes first once sorted in reverse
            items.sort(reverse=True)
            last_entry = items[0][0]
            now = time.time()
            age = now - last_entry / 1000000.0
            msg = "The last entry is {} seconds old."
            self.logger.info(msg.format(age))
            yield {'values': age, 'type': 'gauge'}
    except urllib2.URLError as exc:
        msg = "Cannot retrieve last entry from IF-MAP trace buffer: {}!"
        raise base.CheckException(msg.format(str(exc)))
def itermetrics(self):
    """Yield the Elasticsearch health status and the metrics in METRICS.

    The JSON document is fetched from ``self.url``. The first metric is the
    'health' one, whose value is the HEALTH_MAP translation of the
    document's 'status' field. Then every name of METRICS found in the
    document is reported as is.

    Raises:
        base.CheckException: if the request fails or if the server doesn't
            answer with a 200 status code.
    """
    try:
        response = self.session.get(self.url)
    except Exception as e:
        raise base.CheckException(
            "Got exception for '{}': {}".format(self.url, e))

    if response.status_code != 200:
        raise base.CheckException("{} responded with code {}".format(
            self.url, response.status_code))

    data = response.json()
    self.logger.debug("Got response from Elasticsearch: '%s'" % data)

    yield {
        'type_instance': 'health',
        'values': HEALTH_MAP[data['status']]
    }

    for metric in METRICS:
        value = data.get(metric)
        if value is not None:
            yield {
                'type_instance': metric,
                'values': value
            }
        else:
            # Depending on the Elasticsearch version, not all metrics are
            # available
            self.logger.info("Couldn't find {} metric".format(metric))
def itermetrics(self):
    """Yield the perf counters of the OSDs found through their admin socket.

    Every path matching ``self.socket_glob`` and RE_OSD_ID is queried with
    ``ceph --admin-daemon <socket> perf dump``. Only the non-empty sections
    listed in ``self.PREFIXES`` are reported, one metric per counter (in
    sorted key order), the first group captured by RE_OSD_ID being used as
    'type_instance'.

    Raises:
        base.CheckException: once all sockets have been processed, if the
            perf dump failed for at least one OSD or if no OSD could be
            checked at all.
    """
    failed_osds = []
    dumped = 0

    for socket_path in glob.glob(self.socket_glob):
        matched = RE_OSD_ID.match(socket_path)
        if matched is None:
            continue

        osd_id = matched.group(1)
        perf_dump = self.execute_to_json(
            'ceph --admin-daemon %s perf dump' % socket_path)
        if not perf_dump:
            failed_osds.append(osd_id)
            continue

        dumped += 1
        for prefix, stats in perf_dump.items():
            if prefix in self.PREFIXES and stats:
                for key in sorted(stats):
                    yield {
                        'type': self.convert_to_collectd_type(prefix, key),
                        'type_instance': osd_id,
                        'values': self.convert_to_collectd_value(stats[key])
                    }

    if failed_osds:
        raise base.CheckException(
            "Fail to run 'ceph perf dump' for OSD(s): {}".format(
                ', '.join(failed_osds)))

    if dumped == 0:
        raise base.CheckException(
            'Could not find any OSD socket in {}'.format(self.socket_glob))
def itermetrics(self):
    """Yield container and image counts reported by ``docker info``.

    The command output is requested as JSON. Missing fields are reported
    as 0. The metrics are, in order: the total number of containers, the
    number of paused, running and stopped containers (distinguished by the
    'status' meta field) and the number of images.

    Raises:
        base.CheckException: if the command exits with a non-zero code or
            if its output isn't a valid JSON document.
    """
    cmd = [DOCKER_BINARY, 'info', '-f', "{{ json .}}"]
    retcode, out, err = self.execute(cmd, shell=False, log_error=True)
    if retcode != 0:
        raise base.CheckException("{} : {}".format(DOCKER_BINARY, err))

    try:
        infos = json.loads(out)
    except ValueError as e:
        raise base.CheckException("{}: document: '{}'".format(e, out))

    yield {
        'values': infos.get('Containers', 0),
        'plugin_instance': 'containers_total',
    }

    per_status = (
        ('ContainersPaused', 'paused'),
        ('ContainersRunning', 'running'),
        ('ContainersStopped', 'stopped'),
    )
    for field, status in per_status:
        yield {
            'values': infos.get(field, 0),
            'plugin_instance': 'containers',
            'meta': {'status': status},
        }

    yield {
        'values': infos.get('Images', 0),
        'plugin_instance': 'images',
    }
def itermetrics(self):
    """Yield RabbitMQ cluster-wide and node-local metrics.

    Two documents are fetched through ``self.session``:

    * ``self.api_overview_url``: object totals (queues, consumers,
      connections, exchanges, channels), the number of messages and
      'running_nodes' (the length of the 'contexts' list);
    * ``<self.api_nodes_url>/<node>`` where <node> is the 'node' field of
      the overview: disk and memory limits, usage and remaining amounts.

    The HTTP status is checked before the body is decoded so that a failing
    endpoint is reported with its status code even when the body isn't
    JSON, and error messages mention the URL that was actually requested.

    Yields:
        dict: 'type_instance' is the metric name, 'values' its value.

    Raises:
        base.CheckException: if a request fails, if the server doesn't
            answer with a 200 status code or if the body can't be decoded.
    """
    def fetch(url):
        # Return the decoded JSON document served at url
        try:
            r = self.session.get(url, timeout=self.timeout)
        except Exception as e:
            msg = "Got exception for '{}': {}".format(url, e)
            raise base.CheckException(msg)

        if r.status_code != 200:
            msg = "{} responded with code {}".format(url, r.status_code)
            self.logger.error(msg)
            raise base.CheckException(msg)

        try:
            return r.json()
        except Exception as e:
            msg = "Got exception for '{}': {}".format(url, e)
            raise base.CheckException(msg)

    overview = fetch(self.api_overview_url)
    objects = overview.get('object_totals', {})
    stats = {
        'queues': objects.get('queues', 0),
        'consumers': objects.get('consumers', 0),
        'connections': objects.get('connections', 0),
        'exchanges': objects.get('exchanges', 0),
        'channels': objects.get('channels', 0),
        'messages': overview.get('queue_totals', {}).get('messages', 0),
        'running_nodes': len(overview.get('contexts', [])),
    }
    for k, v in stats.items():
        yield {'type_instance': k, 'values': v}

    nodename = overview['node']
    node = fetch("{}/{}".format(self.api_nodes_url, nodename))
    stats = {
        'disk_free_limit': node['disk_free_limit'],
        'disk_free': node['disk_free'],
        'remaining_disk': node['disk_free'] - node['disk_free_limit'],
        'used_memory': node['mem_used'],
        'vm_memory_limit': node['mem_limit'],
        'remaining_memory': node['mem_limit'] - node['mem_used'],
    }
    for k, v in stats.items():
        yield {'type_instance': k, 'values': v}
def query_api(self, resource):
    """Return the decoded JSON document of an API resource.

    Args:
        resource: string appended as is to ``self.url`` to build the
            requested URL.

    Returns:
        The value returned by the response's ``json()`` method.

    Raises:
        base.CheckException: if the request fails or if the server doesn't
            answer with a 200 status code.
    """
    url = "{}{}".format(self.url, resource)

    try:
        response = self.session.get(url, timeout=self.timeout)
    except Exception as e:
        raise base.CheckException(
            "Got exception for '{}': {}".format(url, e))

    if response.status_code == 200:
        return response.json()

    raise base.CheckException(
        "{} responded with code {}".format(url, response.status_code))
def itermetrics(self):
    """Yield one metric per URL of ``self.urls``, computed from its XML body.

    For every (name, url) pair:

    * if the server doesn't answer with a 200 status code, ``self.FAIL`` is
      reported;
    * if ``name`` is a key of ``self.result_type``, the value is the XML
      text of the first child of the first ``self.result_type[name]``
      element found under the first ``self.xml_element[name]`` element;
    * otherwise the value is the number of ``self.xml_element[name]``
      elements, restricted to those accepted by ``check_state`` when a
      state is configured for that name in ``self.state``.

    The state is looked up with the metric's name. The previous code used
    the literal key 'name', which never matched the per-URL entries.
    NOTE(review): confirm that ``self.state`` is indeed keyed like
    ``self.urls``.

    Raises:
        base.CheckException: if the request fails or if the response can't
            be parsed.
    """
    for name, url in self.urls.items():
        self.logger.debug("Requesting {} URL {}".format(name, url))
        try:
            r = self.session.get(url, timeout=self.timeout)
        except Exception as e:
            msg = "Got exception for '{}': {}".format(name, e)
            raise base.CheckException(msg)

        if r.status_code != 200:
            self.logger.error(
                ("{} ({}) responded with code {} "
                 "").format(name, url, r.status_code))
            yield {'type_instance': name, 'values': self.FAIL}
            continue

        try:
            self.logger.debug(
                "Got response from {}: '{}'"
                "".format(url, r.text))
            px = xml.dom.minidom.parseString(r.text)
            itemlist = px.getElementsByTagName(self.xml_element[name])
            if name not in self.result_type:
                state = self.state.get(name)
                value = 0
                for item in itemlist:
                    if state is None or check_state(item, state):
                        value += 1
                self.logger.debug(
                    "Got count for {}: '{}'".format(name, value))
            else:
                value = itemlist[0].getElementsByTagName(
                    self.result_type[name])[0].childNodes[0].toxml()
                self.logger.debug(
                    "Got val for {}: '{}'".format(name, value))
        except Exception as e:
            msg = ("Got exception while parsing "
                   "response for '{}': {}").format(name, e)
            raise base.CheckException(msg)

        yield {'type_instance': name, 'values': value}
def get():
    """Return the 'items' list of the JSON document served at ``url``.

    ``self`` and ``url`` aren't parameters: they are resolved from the
    enclosing scope.

    Returns:
        The 'items' entry of the decoded document, or an empty list when
        the document has no such entry.

    Raises:
        base.CheckException: if the request or the JSON decoding fails, or
            if the server doesn't answer with a 200 status code.
    """
    try:
        r = self.session.get(url, timeout=self.timeout)
        data = r.json()
    except Exception as e:
        self.logger.warning("Got exception for '{}': {}".format(url, e))
        # str has no 'self' method: the message has to be built with format()
        raise base.CheckException('Fail to get {}'.format(url))

    if r.status_code != 200:
        msg = ("{} responded with code {} "
               "while 200 is expected").format(url, r.status_code)
        self.logger.warning(msg)
        raise base.CheckException(msg)

    return data.get('items', [])
def _run_birdcl_command(self, sockf, args):
    """Run the birdcl binary against a control socket and return its output.

    Args:
        sockf: value passed to the '-s' option of the binary.
        args: list of extra arguments appended to the command line.

    Returns:
        The output of the command as returned by ``self.execute``.

    Raises:
        base.CheckException: if the command exits with a non-zero code.
    """
    cmd = [BIRDCL_BINARY, '-s', sockf] + args
    retcode, out, err = self.execute(cmd, shell=False)
    if retcode != 0:
        raise base.CheckException(
            "Failed to execute {} '{}'".format(cmd, err))
    return out
def itermetrics(self):
    """Yield the total and down link counts of every monitored bond.

    When ``self.bonds`` is empty or contains "all", it is replaced by the
    listing of ``self.bond_dir``. Each bond is described by the file of the
    same name in that directory, in which every "MII Status:" line but the
    first one stands for an individual link.

    A bond whose file can't be read is logged and skipped.

    Yields:
        dict: for each bond, a 'links'/'total' metric then a 'links'/'down'
        metric, both tagged with the bond name in the 'interface' meta
        field.

    Raises:
        base.CheckException: if ``self.bond_dir`` can't be listed.
    """
    if "all" in self.bonds or not self.bonds:
        try:
            self.bonds = os.listdir(self.bond_dir)
        except OSError:
            msg = "Error listing all bonds in {}".format(self.bond_dir)
            raise base.CheckException(msg)

    for bond in self.bonds:
        try:
            # join() works whether or not bond_dir ends with a separator
            with open(os.path.join(self.bond_dir, bond), 'r') as fp:
                bond_info = fp.readlines()
        except IOError:
            msg = "Error reading bond info for {}".format(bond)
            self.logger.error(msg)
            continue

        # The value is stripped because readlines() keeps the trailing
        # newline, which would otherwise never compare equal to "down".
        statuses = [line.split(':', 1)[1].strip()
                    for line in bond_info
                    if line.startswith("MII Status:")]
        # First occurrence of "MII Status" is for the bond as a whole, but
        # we only want individual links.
        link_statuses = statuses[1:]
        links_total = len(link_statuses)
        links_down = link_statuses.count("down")

        yield {
            'type': 'links',
            'type_instance': 'total',
            'values': [links_total],
            'meta': {'interface': bond}
        }
        yield {
            'type': 'links',
            'type_instance': 'down',
            'values': [links_down],
            'meta': {'interface': bond}
        }
def itermetrics(self):
    """Yield cluster-wide metrics extracted from ``ceph -s``.

    The metrics are, in order: the health (HEALTH_MAP translation of the
    overall status), the number of monitors, the size of the quorum, the
    used/available/total bytes, the data bytes, the number of placement
    groups and the number of placement groups per state.

    Raises:
        base.CheckException: if the command returns nothing.
    """
    status = self.execute_to_json('ceph -s --format json')
    if not status:
        raise base.CheckException("Fail to execute 'ceph -s'")

    overall = status['health']['overall_status']
    yield {'type': 'health', 'values': HEALTH_MAP[overall]}

    monmap = status['monmap']
    monitor_nb = len(monmap['mons']) if 'mons' in monmap else 0
    yield {'type': 'monitor_count', 'values': monitor_nb}

    yield {'type': 'quorum_count', 'values': len(status.get('quorum', []))}

    pgmap = status['pgmap']
    yield {
        'type': 'pg_bytes',
        'values': [pgmap[field] for field in
                   ('bytes_used', 'bytes_avail', 'bytes_total')],
    }

    for kind, field in (('pg_data_bytes', 'data_bytes'),
                        ('pg_count', 'num_pgs')):
        yield {'type': kind, 'values': pgmap[field]}

    for state in pgmap['pgs_by_state']:
        yield {
            'type': 'pg_state_count',
            'type_instance': state['state_name'],
            'values': state['count']
        }
def itermetrics(self):
    """Yield space and latency metrics for every OSD.

    The statistics come from ``ceph pg dump osds``. For each entry, two
    metrics tagged with the 'osd' field are reported:

    * 'osd_space': the 'kb_used' and 'kb' fields, both multiplied by 1000;
    * 'osd_latency': the apply and commit latencies of 'fs_perf_stat'.

    Raises:
        base.CheckException: if the command returns nothing.
    """
    osd_stats = self.execute_to_json('ceph pg dump osds --format json')
    if not osd_stats:
        raise base.CheckException("Fail to execute 'pg dump osds'")

    for entry in osd_stats:
        ident = entry['osd']

        space = [entry['kb_used'] * 1000, entry['kb'] * 1000]
        yield {
            'type_instance': ident,
            'type': 'osd_space',
            'values': space,
        }

        perf = entry['fs_perf_stat']
        yield {
            'type_instance': ident,
            'type': 'osd_latency',
            'values': [perf['apply_latency_ms'], perf['commit_latency_ms']],
        }
def itermetrics(self):
    """Yield GlusterFS peer and brick metrics.

    Peers are read from ``gluster peer status``: one 'peer_state' metric
    per peer (1 when its state is 'Peer in Cluster (Connected)', 0
    otherwise), followed by 'peers_count' and 'peers_percent' for the 'up'
    and 'down' states.

    Bricks are read from ``gluster volume status all detail``: free and
    used space and inodes, both as absolute values and as percentages,
    tagged with the volume and the peer.

    A percentage is reported as 0 when its total is 0 (for instance when
    no peer block is found in the output) instead of failing with
    ZeroDivisionError.

    Raises:
        base.CheckException: if one of the gluster commands fails, except
            when the volume command fails because a transaction is already
            in progress, in which case the volume metrics are skipped.
    """
    def percent(part, whole):
        # Percentage of part in whole, 0 when whole is 0
        if not whole:
            return 0
        return 100.0 * part / whole

    def usage(prefix, free, capacity, volume, peer):
        # Absolute and relative free/used metrics of a brick resource
        used = capacity - free
        values = (
            ('{}_free'.format(prefix), free),
            ('{}_percent_free'.format(prefix), percent(free, capacity)),
            ('{}_used'.format(prefix), used),
            ('{}_percent_used'.format(prefix), percent(used, capacity)),
        )
        for type_instance, value in values:
            yield {
                'type_instance': type_instance,
                'values': value,
                'meta': {
                    'volume': volume,
                    'peer': peer,
                }
            }

    # Collect peers' metrics
    retcode, out, err = self.execute([GLUSTER_BINARY, 'peer', 'status'],
                                     shell=False)
    if retcode != 0:
        raise base.CheckException("Failed to execute 'gluster peer'")

    peers_total = 0
    total_by_state = {
        'up': 0,
        'down': 0
    }

    for block in out.split('\n\n'):
        peer_m = peer_re.search(block)
        state_m = state_re.search(block)
        if not (peer_m and state_m):
            continue

        peers_total += 1
        if state_m.group('state') == 'Peer in Cluster (Connected)':
            v = 1
            total_by_state['up'] += 1
        else:
            v = 0
            total_by_state['down'] += 1
        yield {
            'type_instance': 'peer_state',
            'values': v,
            'meta': {
                'peer': peer_m.group('peer')
            }
        }

    for state, count in total_by_state.items():
        yield {
            'type_instance': 'peers_count',
            'values': count,
            'meta': {
                'state': state
            }
        }
        yield {
            'type_instance': 'peers_percent',
            'values': percent(count, peers_total),
            'meta': {
                'state': state
            }
        }

    # Collect volumes' metrics
    cmd = [GLUSTER_BINARY, 'volume', 'status', 'all', 'detail']
    retcode, out, err = self.execute(cmd, shell=False, log_error=False)
    if retcode != 0:
        if err and vol_status_transaction_in_progress_re.match(err):
            # "transaction already in progress" error, we assume volumes
            # metrics are being collected on another glusterfs node, and
            # just silently skip the collecting of the volume metrics
            # this time
            self.logger.info("Command '%s' failed because of a "
                             "transaction is already in progress, "
                             "ignore the error" % cmd)
            return
        self.logger.error("Command '%s' failed: %s" % (cmd, err))
        raise base.CheckException("Failed to execute 'gluster volume'")

    for vol_block in vol_status_re.split(out):
        volume_m = volume_re.search(vol_block)
        if not volume_m:
            continue
        volume = volume_m.group('volume')

        for brick in vol_block_re.split(vol_block):
            peer_m = brick_server_re.search(brick)
            if not peer_m:
                continue
            peer = peer_m.group('peer')

            disk_free_m = disk_free_re.search(brick)
            disk_total_m = disk_total_re.search(brick)
            inode_free_m = inode_free_re.search(brick)
            inode_count_m = inode_count_re.search(brick)

            if disk_free_m and disk_total_m:
                free = convert_to_bytes(
                    disk_free_m.group('disk_free'),
                    disk_free_m.group('unit'))
                capacity = convert_to_bytes(
                    disk_total_m.group('disk_total'),
                    disk_total_m.group('unit'))
                for metric in usage('space', free, capacity, volume, peer):
                    yield metric

            if inode_free_m and inode_count_m:
                free = int(inode_free_m.group('inode_free'))
                capacity = int(inode_count_m.group('inode_count'))
                for metric in usage('inodes', free, capacity, volume, peer):
                    yield metric
def itermetrics(self):
    """Yield HAProxy server, frontend, backend and backend server metrics.

    The statistics are read from the HAProxy socket ``self.socket``:

    * when 'server' is monitored, the server information entries listed in
      SERVER_METRICS, converted to integers;
    * for every monitored frontend and backend, the metrics listed in
      FRONTEND_METRIC_TYPES or BACKEND_METRIC_TYPES ('status' is
      translated through STATUS_MAP, the others are converted to integers,
      0 when empty);
    * for every backend server whose status is a key of STATUS_MAP, its
      state, then per backend and per state the number and the percentage
      of servers.

    Rows whose proxy name is in ``self.proxy_ignore`` are dropped, and
    proxy names are rewritten with ``self.get_proxy_name``.

    Raises:
        base.CheckException: if the HAProxy socket can't be reached.
    """
    haproxy = HAProxySocket(self.socket)

    # Collect server statistics
    if 'server' in self.proxy_monitors:
        try:
            info = haproxy.get_server_info()
        except socket.error:
            msg = "Unable to connect to HAProxy socket at {}".format(
                self.socket)
            raise base.CheckException(msg)

        for key, raw in info.items():
            if key in SERVER_METRICS:
                entry = SERVER_METRICS[key]
                yield {
                    'type_instance': entry[0],
                    'type': entry[1],
                    'values': int(raw),
                }

    try:
        stats = haproxy.get_server_stats()
    except socket.error:
        msg = "Unable to connect to HAProxy socket at {}".format(
            self.socket)
        raise base.CheckException(msg)

    def wanted(row):
        if row['pxname'] in self.proxy_ignore:
            return False
        if row['svname'].lower() in self.proxy_monitors:
            return True
        if row['pxname'].lower() in self.proxy_monitors:
            return True
        return ('backend_server' in self.proxy_monitors and
                row['type'] == BACKEND_SERVER_TYPE)

    stats = [row for row in stats if wanted(row)]
    for row in stats:
        row['pxname'] = self.get_proxy_name(row['pxname'])

    # Collect statistics for the frontends and the backends
    for row in stats:
        if row['type'] == FRONTEND_TYPE:
            metrics = FRONTEND_METRIC_TYPES
            side = 'frontend'
        elif row['type'] == BACKEND_TYPE:
            metrics = BACKEND_METRIC_TYPES
            side = 'backend'
        else:
            continue

        for key, metric in metrics.items():
            if key not in row:
                self.logger.warning("Can't find {} metric".format(key))
                continue

            value = row[key]
            if metric[0] == 'status':
                value = STATUS_MAP[value]
            elif value:
                value = int(value)
            else:
                value = 0

            yield {
                'type_instance': '{}_{}'.format(side, metric[0]),
                'type': metric[1],
                'values': value,
                'meta': {side: row['pxname']}
            }

    # Count the number of servers per backend and state
    backend_server_states = {}
    for row in stats:
        if row['type'] != BACKEND_SERVER_TYPE:
            continue

        pxname = row['pxname']
        states = backend_server_states.setdefault(pxname, defaultdict(int))

        # The status field for a server has the following syntax when a
        # transition occurs with HAproxy >=1.6: "DOWN 17/30" or "UP 1/3".
        status = row['status'].split(' ')[0]

        # We only pick up the UP and DOWN status while it can be one of
        # NOLB/MAINT/MAINT(via)...
        if status not in STATUS_MAP:
            continue

        states[status] += 1
        states['_count'] += 1
        # Emit metric for the backend server
        yield {
            'type_instance': 'backend_server',
            'values': STATUS_MAP[status],
            'meta': {
                'backend': pxname,
                'state': status.lower(),
                'server': row['svname'],
            }
        }

    for pxname, states in backend_server_states.items():
        for s in STATUS_MAP:
            val = states.get(s, 0)
            yield {
                'type_instance': 'backend_servers',
                'values': val,
                'meta': {
                    'backend': pxname,
                    'state': s.lower()
                }
            }

            count = states['_count']
            prct = (100.0 * val) / count if count != 0 else 0
            yield {
                'type_instance': 'backend_servers_percent',
                'values': prct,
                'meta': {
                    'backend': pxname,
                    'state': s.lower()
                }
            }
def itermetrics(self):
    """Yield Pacemaker cluster metrics extracted from ``crm_mon --as-xml``.

    The metrics are emitted in this order:

    * 'local_resource_active' for ``self.notify_resource`` (if set), along
      with a collectd notification carrying the same value;
    * 'local_dc_active', 1 when the current DC is ``self.hostname``;
    * only when this node is the DC: global cluster metrics, per-node
      status and counts, per-resource counts for the resources listed in
      ``self.resources`` and the failure/operation counters kept in
      ``self.history``.

    XML lookups are compared with None rather than tested for truth
    because an Element without children is falsy: a stopped "simple"
    resource (no 'node' child) is therefore counted as down instead of
    being reported as missing, and a document without 'current_dc' stops
    the collection after 'local_dc_active' instead of failing.

    Raises:
        base.CheckException: if crm_mon returns nothing or if its output
            isn't valid XML.
    """
    def str_to_bool(v):
        return str(v).lower() == 'true'

    def str_to_boolint(v):
        return 1 if str_to_bool(v) else 0

    def shorten_hostname(v):
        return v.split('.')[0]

    def same_hostname(v):
        if v is not None and v.get('name') == self.hostname:
            return 1
        return 0

    out, err = self.execute([self.crm_mon_binary, '--as-xml', '-r', '-f'],
                            shell=False)
    if not out:
        raise base.CheckException(
            "Failed to execute crm_mon '{}'".format(err))

    try:
        root = ET.fromstring(out)
    except ET.ParseError:
        raise base.CheckException("Failed to parse XML '{}'".format(
            out[:64]))

    if self.notify_resource:
        # Notify the other collectd plugins whether the resource runs
        # locally or not
        node = root.find('resources/resource[@id="{}"]/node'.format(
            self.notify_resource))
        self.collectd.Notification(
            type='gauge',
            message='{{"resource":"{}","value":{}}}'.format(
                self.notify_resource, same_hostname(node)),
            severity=self.collectd.NOTIF_OKAY).dispatch()

        # The metric needs to be emitted too for the Lua plugins executed
        # by the metric_collector service
        yield {
            'type_instance': 'local_resource_active',
            'values': same_hostname(node),
            'meta': {
                'resource': self.notify_resource
            }
        }

    summary = root.find('summary')
    current_dc = summary.find('current_dc')
    # The metric needs to be emitted for the alarms that leverage the other
    # metrics emitted by the plugin
    yield {
        'type_instance': 'local_dc_active',
        'values': same_hostname(current_dc),
    }

    if current_dc is None or current_dc.get('name') != self.hostname:
        # The other metrics are only collected from the cluster's DC
        return

    # Report global cluster metrics
    yield {
        'type_instance': 'dc',
        'values': str_to_boolint(current_dc.get('present', 'false'))
    }
    yield {
        'type_instance': 'quorum_status',
        'values': str_to_boolint(current_dc.get('with_quorum', 'false'))
    }
    yield {
        'type_instance': 'configured_nodes',
        'values': int(summary.find('nodes_configured').get('number'))
    }
    yield {
        'type_instance': 'configured_resources',
        'values': int(summary.find('resources_configured').get('number'))
    }

    # Report node status metrics
    cluster_nodes = []
    aggregated_nodes_status = {'online': 0, 'offline': 0, 'maintenance': 0}
    nodes_total = 0
    for node in root.find('nodes').iter('node'):
        nodes_total += 1
        hostname = shorten_hostname(node.get('name'))
        cluster_nodes.append(node.get('name'))
        if str_to_bool(node.get('online')):
            if str_to_bool(node.get('maintenance')):
                aggregated_nodes_status['maintenance'] += 1
                yield {
                    'type_instance': 'node_status',
                    'values': MAINTENANCE_STATUS,
                    'meta': {
                        'status': 'maintenance',
                        'host': hostname
                    }
                }
            else:
                aggregated_nodes_status['online'] += 1
                yield {
                    'type_instance': 'node_status',
                    'values': ONLINE_STATUS,
                    'meta': {
                        'status': 'online',
                        'host': hostname
                    }
                }
        else:
            aggregated_nodes_status['offline'] += 1
            yield {
                'type_instance': 'node_status',
                'values': OFFLINE_STATUS,
                'meta': {
                    'status': 'offline',
                    'host': hostname
                }
            }

    for status, cnt in aggregated_nodes_status.items():
        yield {
            'type_instance': 'nodes_count',
            'values': cnt,
            'meta': {
                'status': status
            }
        }
        yield {
            'type_instance': 'nodes_percent',
            'values': 100.0 * cnt / nodes_total,
            'meta': {
                'status': status
            }
        }

    # Report the number of resources per status
    # Clone resources can run on multiple nodes while "simple" resources run
    # on only one node at the same time
    aggregated_resources = defaultdict(Counter)
    resources = root.find('resources')
    for resource_id, resource_name in self.resources.items():
        resource_elts = []
        simple_resource = None
        clone_resource = resources.find(
            'clone/resource[@id="{}"]/..'.format(resource_id))
        if clone_resource is None:
            simple_resource = resources.find(
                'resource[@id="{}"]'.format(resource_id))
            if simple_resource is not None:
                resource_elts = [simple_resource]
        else:
            resource_elts = clone_resource.findall('resource')

        if not resource_elts:
            self.logger.error("{}: Couldn't find resource '{}'".format(
                self.plugin, resource_id))
            continue

        total = 0
        for item in resource_elts:
            total += 1
            if (item.get('role') in ('Slave', 'Master') and
                    not str_to_bool(item.get('failed'))):
                # Multi-master resource
                aggregated_resources[resource_name]['up'] += 1
            elif item.get('role') == 'Started':
                aggregated_resources[resource_name]['up'] += 1
            else:
                aggregated_resources[resource_name]['down'] += 1

        if simple_resource is not None:
            # Report on which node the "simple" resource is running. There
            # is no 'node' child when the resource runs nowhere.
            running_on = simple_resource.find('node')
            running_name = None
            if running_on is not None:
                running_name = running_on.get('name')
            for node in cluster_nodes:
                yield {
                    'type_instance': 'local_resource_active',
                    'values': str_to_boolint(node == running_name),
                    'meta': {
                        'resource': resource_name,
                        'host': shorten_hostname(node)
                    }
                }

        for status in ('up', 'down'):
            cnt = aggregated_resources[resource_name][status]
            yield {
                'type_instance': 'resource_count',
                'values': cnt,
                'meta': {
                    'status': status,
                    'resource': resource_name
                }
            }
            yield {
                'type_instance': 'resource_percent',
                'values': 100.0 * cnt / total,
                'meta': {
                    'status': status,
                    'resource': resource_name
                }
            }

    # Collect operations' history metrics for the monitored resources
    #
    # The reported count for the resource's operations is an approximate
    # value because crm_mon doesn't provide the exact number. To estimate
    # the number of operations applied to a resource, the plugin keeps a
    # copy of call_ids and compares it with the current value.
    for node in root.find('node_history').iter('node'):
        hostname = shorten_hostname(node.get('name'))
        if hostname not in self.history:
            self.history[hostname] = {}

        for resource_id, resource_name in self.resources.items():
            if resource_id not in self.history[hostname]:
                self.history[hostname][resource_id] = {
                    'fail_count': 0,
                    'ops_count': 0,
                    'call_ids': set()
                }
            v = self.history[hostname][resource_id]

            res_history = node.find(
                'resource_history[@id="{}"]'.format(resource_id))
            if res_history is not None:
                # For simple resources, the resource_history element only
                # exists for the node that runs the resource
                v['fail_count'] += int(res_history.get('fail-count', 0))
                call_ids = set([
                    i.get('call') for i in res_history.findall(
                        'operation_history')
                ])
                if call_ids:
                    v['ops_count'] += len(call_ids - v['call_ids'])
                    v['call_ids'] = call_ids

            yield {
                'type_instance': 'resource_failures',
                'values': v['fail_count'],
                'meta': {
                    'resource': resource_name,
                    'host': hostname
                }
            }
            yield {
                'type_instance': 'resource_operations',
                'values': v['ops_count'],
                'meta': {
                    'resource': resource_name,
                    'host': hostname
                }
            }
def itermetrics(self):
    """Yield Ceph pool and OSD metrics.

    Three commands are run through ``self.execute_to_json``:

    * ``ceph df``: per-pool used bytes, available bytes and objects, then
      the total number of objects and pools, the cluster's
      used/available/total bytes and the used/available percentages;
    * ``ceph osd pool stats``: per-pool client I/O rates;
    * ``ceph osd dump``: per-pool size and placement group numbers, then
      the number of OSDs that are up, down, in and out.

    The percentages are reported as 0 when the cluster's total is 0
    instead of failing with ZeroDivisionError.

    Raises:
        base.CheckException: if one of the commands returns nothing.
    """
    df = self.execute_to_json('ceph df --format json')
    if not df:
        raise base.CheckException("Fail to run 'ceph df'")

    objects_count = 0
    for pool in df['pools']:
        objects_count += pool['stats'].get('objects', 0)
        for m in ('bytes_used', 'max_avail', 'objects'):
            yield {
                'type': 'pool_%s' % m,
                'type_instance': pool['name'],
                'values': pool['stats'].get(m, 0),
            }

    yield {'type': 'objects_count', 'values': objects_count}
    yield {'type': 'pool_count', 'values': len(df['pools'])}

    if 'total_bytes' in df['stats']:
        # compatibility with 0.84+
        total = df['stats']['total_bytes']
        used = df['stats']['total_used_bytes']
        avail = df['stats']['total_avail_bytes']
    else:
        # compatibility with <0.84
        total = df['stats']['total_space'] * 1024
        used = df['stats']['total_used'] * 1024
        avail = df['stats']['total_avail'] * 1024

    yield {'type': 'pool_total_bytes', 'values': [used, avail, total]}
    if total:
        percents = [100.0 * used / total, 100.0 * avail / total]
    else:
        percents = [0, 0]
    yield {'type': 'pool_total_percent', 'values': percents}

    stats = self.execute_to_json('ceph osd pool stats --format json')
    if not stats:
        raise base.CheckException("Fail to run 'ceph osd pool stats'")

    for pool in stats:
        client_io_rate = pool.get('client_io_rate', {})
        yield {
            'type': 'pool_bytes_rate',
            'type_instance': pool['pool_name'],
            'values': [
                client_io_rate.get('read_bytes_sec', 0),
                client_io_rate.get('write_bytes_sec', 0)
            ]
        }
        yield {
            'type': 'pool_ops_rate',
            'type_instance': pool['pool_name'],
            'values': client_io_rate.get('op_per_sec', 0)
        }

    osd_dump = self.execute_to_json('ceph osd dump --format json')
    if not osd_dump:
        raise base.CheckException("Fail to run 'ceph osd dump'")

    for pool in osd_dump['pools']:
        for name in ('size', 'pg_num', 'pg_placement_num'):
            yield {
                'type': 'pool_%s' % name,
                'type_instance': pool['pool_name'],
                'values': pool[name]
            }

    _up, _down, _in, _out = (0, 0, 0, 0)
    for osd in osd_dump['osds']:
        if osd['up'] == 1:
            _up += 1
        else:
            _down += 1
        if osd['in'] == 1:
            _in += 1
        else:
            _out += 1

    yield {'type': 'osd_count', 'values': [_up, _down, _in, _out]}
def itermetrics(self):
    """Yield HAProxy server, frontend, backend and backend server metrics.

    The statistics are read from the HAProxy socket ``self.socket``:

    * when 'server' is monitored, the server information entries listed in
      SERVER_METRICS, converted to integers;
    * for every monitored frontend and backend, the metrics listed in
      FRONTEND_METRIC_TYPES or BACKEND_METRIC_TYPES ('status' is
      translated through STATUS_MAP, the others are converted to integers,
      0 when empty);
    * for every backend server, its state, then per backend and per state
      the number of servers.

    Only the first word of a backend server's status is considered, and
    servers whose status isn't a key of STATUS_MAP are skipped. Looking up
    the raw status used to raise KeyError for such values.

    Raises:
        base.CheckException: if the HAProxy socket can't be reached.
    """
    haproxy = HAProxySocket(self.socket)

    # Collect server statistics
    if 'server' in self.proxy_monitors:
        try:
            stats = haproxy.get_server_info()
        except socket.error:
            msg = "Unable to connect to HAProxy socket at {}".format(
                self.socket)
            raise base.CheckException(msg)

        for k, v in stats.items():
            if k not in SERVER_METRICS:
                continue
            yield {
                'type_instance': SERVER_METRICS[k][0],
                'type': SERVER_METRICS[k][1],
                'values': int(v),
            }

    try:
        stats = haproxy.get_server_stats()
    except socket.error:
        msg = "Unable to connect to HAProxy socket at {}".format(
            self.socket)
        raise base.CheckException(msg)

    def match(x):
        if x['pxname'] in self.proxy_ignore:
            return False
        return (x['svname'].lower() in self.proxy_monitors or
                x['pxname'].lower() in self.proxy_monitors or
                ('backend_server' in self.proxy_monitors and
                 x['type'] == BACKEND_SERVER_TYPE))

    # A list is needed because the rows are walked several times
    stats = [x for x in stats if match(x)]
    for stat in stats:
        stat['pxname'] = self.get_proxy_name(stat['pxname'])

    # Collect statistics for the frontends and the backends
    for stat in stats:
        if stat['type'] == FRONTEND_TYPE:
            metrics = FRONTEND_METRIC_TYPES
            side = 'frontend'
        elif stat['type'] == BACKEND_TYPE:
            metrics = BACKEND_METRIC_TYPES
            side = 'backend'
        else:
            continue

        for k, metric in metrics.items():
            if k not in stat:
                self.logger.warning("Can't find {} metric".format(k))
                continue
            value = stat[k]

            metric_name = '{}_{}'.format(side, metric[0])
            meta = {side: stat['pxname']}

            if metric[0] == 'status':
                value = STATUS_MAP[value]
            else:
                value = int(value) if value else 0

            yield {
                'type_instance': metric_name,
                'type': metric[1],
                'values': value,
                'meta': meta
            }

    # Count the number of servers per backend and state
    backend_server_states = {}
    for stat in stats:
        if stat['type'] != BACKEND_SERVER_TYPE:
            continue

        pxname = stat['pxname']
        if pxname not in backend_server_states:
            backend_server_states[pxname] = defaultdict(int)

        # The status may carry a suffix (the first word is what matters)
        # or be a value that STATUS_MAP doesn't know about
        status = stat['status'].split(' ')[0]
        if status not in STATUS_MAP:
            continue

        backend_server_states[pxname][status] += 1
        # Emit metric for the backend server
        yield {
            'type_instance': 'backend_server',
            'values': STATUS_MAP[status],
            'meta': {
                'backend': pxname,
                'state': status.lower(),
                'server': stat['svname'],
            }
        }

    for pxname, states in backend_server_states.items():
        for s in STATUS_MAP:
            yield {
                'type_instance': 'backend_servers',
                'values': states.get(s, 0),
                'meta': {
                    'backend': pxname,
                    'state': s.lower()
                }
            }
def itermetrics(self):
    """Yield the metrics exposed by Calico Felix at ``self.url``.

    Nothing is collected when ``self.url`` isn't set. The response body is
    read line by line; empty lines and comments are skipped. The first
    word of a line is the metric (name and optional dimensions between
    braces), the second one is its value. Extra words are ignored and
    lines that can't be parsed are logged and skipped, so that one
    unexpected line doesn't abort the whole collection.

    Yields:
        dict: 'type_instance' is the metric name without its 'felix_'
        prefix, 'values' is the value as found in the response (a string)
        and 'meta' holds the dimensions, if any.

    Raises:
        base.CheckException: if the request fails or if the server doesn't
            answer with a 200 status code.
    """
    if not self.url:
        return

    self.logger.debug("Requesting URL {}".format(self.url))
    try:
        r = self.session.get(self.url, timeout=self.timeout)
    except Exception as e:
        msg = "Got exception for '{}': {}".format(self.url, e)
        raise base.CheckException(msg)

    if r.status_code != 200:
        self.logger.error(
            ("{} responded with code {} "
             "").format(self.url, r.status_code))
        raise base.CheckException(
            "Failed to gather Calico Felix metrics ({})".format(
                r.status_code))

    self.logger.debug(
        "Got response from {}: '{}'"
        "".format(self.url, r.text))

    # Example payload:
    #
    # # HELP felix_active_local_endpoints Number
    # # of active endpoints on this host.
    # # TYPE felix_active_local_endpoints gauge
    # felix_active_local_endpoints 1
    # # HELP felix_iptables_chains Number of active iptables chains.
    # # TYPE felix_iptables_chains gauge
    # felix_iptables_chains{ip_version="4",table="filter"} 14
    # felix_iptables_chains{ip_version="4",table="nat"} 6
    # felix_iptables_chains{ip_version="4",table="raw"} 6
    # felix_iptables_chains{ip_version="6",table="filter"} 14
    # felix_iptables_chains{ip_version="6",table="nat"} 6
    # felix_iptables_chains{ip_version="6",table="raw"} 6
    # # HELP go_goroutines Number of goroutines that currently exist.
    # # TYPE go_goroutines gauge
    # go_goroutines 39
    for l in r.text.split('\n'):
        l = l.strip()
        # Line is empty or is a comment
        if not l or l.startswith('#'):
            continue

        fields = l.split()
        if len(fields) < 2:
            self.logger.error("Error parsing metric line {}".format(l))
            continue
        # Anything after the value is ignored
        name, rval = fields[0], fields[1]
        self.logger.debug("Got val for {}: '{}'".format(name, rval))

        # For some metrics, remove the existing felix prefix
        # to ensure homogeneity
        if name.startswith('felix_'):
            name = name.replace('felix_', '', 1)

        # Initialization of returned metric
        ret_metric = {'values': rval}

        # Metric can have implicit dimensions. For example:
        # felix_iptables_rules{ip_version="4",table="filter"}
        m = re.search(
            '^(?P<name>[^{]+)(?:{(?P<dimensions>.[^}]+)})?$', name)
        if not m:
            self.logger.error(
                "Error parsing metric name {}".format(name))
            continue

        if m.group('dimensions'):
            meta = {}
            malformed = False
            for d in m.group('dimensions').split(','):
                # Split on the first '=' only: a value may contain some
                k, sep, v = d.partition('=')
                if not sep:
                    malformed = True
                    break
                meta[k] = v.strip('"')
            if malformed:
                self.logger.error(
                    "Error parsing metric name {}".format(name))
                continue
            if meta:
                ret_metric['meta'] = meta

        ret_metric['type_instance'] = m.group('name')
        yield ret_metric