def get_stats(self): """Retrieves stats from ceph pools""" ceph_cluster = "%s-%s" % (self.prefix, self.cluster) data = { ceph_cluster: {} } stats_output = None try: stats_output = subprocess.check_output('ceph osd pool stats -f json', shell=True) except Exception as exc: collectd.error("ceph-io: failed to ceph pool stats :: %s :: %s" % (exc, traceback.format_exc())) return if stats_output is None: collectd.error('ceph-io: failed to ceph osd pool stats :: output was None') json_stats_data = json.loads(stats_output) # push osd pool stats results for pool in json_stats_data: pool_key = "pool-%s" % pool['pool_name'] data[ceph_cluster][pool_key] = {} pool_data = data[ceph_cluster][pool_key] for stat in ('read_bytes_sec', 'write_bytes_sec', 'op_per_sec'): pool_data[stat] = pool['client_io_rate'][stat] if pool['client_io_rate'].has_key(stat) else 0 return data
def logger(t, msg):
    if t == 'err':
        collectd.error('%s: %s' % (NAME, msg))
    elif t == 'warn':
        collectd.warning('%s: %s' % (NAME, msg))
    elif t == 'verb' and VERBOSE_LOGGING:
        collectd.info('%s: %s' % (NAME, msg))
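# The snippets on this page all assume the `collectd` module that the daemon injects
# into embedded Python plugins. A minimal stand-in like the sketch below can be useful
# for exercising these callbacks outside the daemon; the helper name
# `_make_fake_collectd` is invented here, and the stub only mimics the calls used on
# this page (error/warning/info/debug, Values.dispatch, register_*).
import sys
import types

def _make_fake_collectd():
    mod = types.ModuleType('collectd')
    for level in ('error', 'warning', 'info', 'debug'):
        setattr(mod, level,
                lambda msg, _lvl=level: sys.stderr.write('[%s] %s\n' % (_lvl, msg)))

    class Values(object):
        def __init__(self, **kwargs):
            self.__dict__.update(kwargs)

        def dispatch(self, **kwargs):
            self.__dict__.update(kwargs)
            sys.stdout.write('dispatch: %r\n' % self.__dict__)

    mod.Values = Values
    mod.register_config = mod.register_init = mod.register_read = lambda *a, **kw: None
    return mod

# Example (hypothetical): install the stub before importing a plugin for a dry run.
# sys.modules['collectd'] = _make_fake_collectd()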
def fetch_json(url):
    """Fetch json from url"""
    try:
        return json.load(urllib2.urlopen(url, timeout=5))
    except urllib2.URLError, e:
        collectd.error("mesos-tasks plugin: Error connecting to %s - %r" % (url, e))
        return None
def plugin_write(vl, config): try: session = boto3.session.Session(region_name=config.aws_region) client_config = botocore.client.Config(connect_timeout=5, read_timeout=5) client = session.client('cloudwatch', config=client_config) metrics_list = list(metrics(vl, config)) ts = datetime.fromtimestamp(vl.time) data = [] for i, v in enumerate(vl.values): fullname, unit, dims = metrics_list[i] name = fullname[:255] if len(name) < len(fullname): collectd.warning('Metric name was truncated for CloudWatch: {}'.format(fullname)) data.append(dict( MetricName=name, Timestamp=ts, Value=v, Unit=unit, Dimensions=dims )) client.put_metric_data(Namespace=vl.plugin, MetricData=data) except Exception, e: collectd.error(str(e))
def get_stats(self): """Retrieves stats regarding latency to write to a test pool""" ceph_cluster = "%s-%s" % (self.prefix, self.cluster) data = { ceph_cluster: {} } output = None try: output = subprocess.check_output( "timeout 30s rados -p data bench 10 write -t 1 -b 65536 2>/dev/null | grep -i latency | awk '{print 1000*$3}'", shell=True) except Exception as exc: collectd.error("ceph-latency: failed to run rados bench :: %s :: %s" % (exc, traceback.format_exc())) return if output is None: collectd.error('ceph-latency: failed to run rados bench :: output was None') results = output.split('\n') # push values data[ceph_cluster]['cluster'] = {} data[ceph_cluster]['cluster']['avg_latency'] = results[0] data[ceph_cluster]['cluster']['stddev_latency'] = results[1] data[ceph_cluster]['cluster']['max_latency'] = results[2] data[ceph_cluster]['cluster']['min_latency'] = results[3] return data
def kairosdb_send_http_data(data, json): collectd.debug('Json=%s' % json) data['lock'].acquire() if not kairosdb_connect(data): data['lock'].release() collectd.warning('kairosdb_writer: no connection to kairosdb server') return response = '' try: headers = {'Content-type': 'application/json', 'Connection': 'keep-alive'} data['conn'].request('POST', '/api/v1/datapoints', json, headers) res = data['conn'].getresponse() response = res.read() collectd.debug('Response code: %d' % res.status) if res.status == 204: exit_code = True else: collectd.error(response) exit_code = False except httplib.ImproperConnectionState, e: collectd.error('Lost connection to kairosdb server: %s' % e.message) data['conn'] = None exit_code = False
def kairosdb_config(c):
    global host, port, host_separator, \
        metric_separator, lowercase_metric_names, protocol, \
        tags_map, metric_name, add_host_tag, formatter, uri

    for child in c.children:
        if child.key == 'AddHostTag':
            add_host_tag = child.values[0]
        elif child.key == 'KairosDBURI':
            uri = child.values[0]
        elif child.key == 'TypesDB':
            for v in child.values:
                kairosdb_parse_types_file(v)
        elif child.key == 'LowercaseMetricNames':
            lowercase_metric_names = child.values[0]
        elif child.key == 'MetricName':
            metric_name = str(child.values[0])
        elif child.key == 'HostSeparator':
            host_separator = child.values[0]
        elif child.key == 'MetricSeparator':
            metric_separator = child.values[0]
        elif child.key == 'Formatter':
            formatter_path = child.values[0]
            try:
                formatter = imp.load_source('formatter', formatter_path)
                # formatter = source.Formatter()
            except:
                raise Exception('Could not load formatter %s %s'
                                % (formatter_path, format_exc()))
        elif child.key == 'Tags':
            for v in child.values:
                tag_parts = v.split("=")
                if len(tag_parts) == 2:
                    tags_map[tag_parts[0]] = tag_parts[1]
                else:
                    collectd.error("Invalid tag: %s" % v)
def collect_buddyinfo(): if os.path.exists(buddy_fname): with open(buddy_fname) as f: for line in f: match = re_buddyinfo.search(line) if not match: continue; node = match.group('node') zone = match.group('zone') free_pages = match.group('pages').strip().split() stats_current[(node, zone, 'val')] = free_pages stats_current[(node, zone, 'ts')] = time.time() key_val = dict(zip(white_list, free_pages)) metric = collectd.Values() metric.host = host_name metric.plugin = 'buddyinfo' metric.plugin_instance = node metric.type = 'gauge' for k in range(0, len(white_list)): metric.type_instance = 'zone_' + zone + '.' metric.type_instance += white_list[k] metric.values = [free_pages[k]] metric.dispatch() f.close() else: collectd.error('buddyinfo: procfs path: %s does not exist' % (buddy_fname))
def kairosdb_connect(data): #collectd.info(repr(data)) if not data['conn'] and protocol == 'http': data['conn'] = httplib.HTTPConnection(data['host'], data['port']) return True elif not data['conn'] and protocol == 'https': data['conn'] = httplib.HTTPSConnection(data['host'], data['port']) return True elif not data['conn'] and protocol == 'telnet': # only attempt reconnect every 10 seconds if protocol of type Telnet now = time() if now - data['last_connect_time'] < 10: return False data['last_connect_time'] = now collectd.info('connecting to %s:%s' % (data['host'], data['port'])) try: data['conn'] = socket.socket(socket.AF_INET, socket.SOCK_STREAM) data['conn'].connect((data['host'], data['port'])) return True except: collectd.error('error connecting socket: %s' % format_exc()) return False else: return True
def read_callback(): log_verbose('Read callback called') for port_str in REDIS_PORTS.split(','): port=int(port_str) info = fetch_info(port) if not info: collectd.error('redis plugin: No info received') continue # send high-level values dispatch_value(info, 'uptime_in_seconds','gauge',port) dispatch_value(info, 'connected_clients', 'gauge',port) dispatch_value(info, 'connected_slaves', 'gauge',port) dispatch_value(info, 'blocked_clients', 'gauge',port) dispatch_value(info, 'evicted_keys', 'gauge',port) dispatch_value(info, 'used_memory', 'bytes',port) dispatch_value(info, 'changes_since_last_save', 'gauge',port) dispatch_value(info, 'total_connections_received', 'counter',port, 'connections_recieved') dispatch_value(info, 'total_commands_processed', 'counter',port, 'commands_processed') # database and vm stats for key in info: if key.startswith('vm_stats_'): dispatch_value(info, key, 'gauge',port) if key.startswith('db'): dispatch_value(info[key], 'keys', 'gauge',port, '%s-keys' % key)
def plugin_read(unused_input_data=None): """Handles collectd's 'read' interface for mlab plugin.""" vs_prefix = _PROC_VIRTUAL vs_dlimits = read_vsys_data('vs_xid_dlimits', _VSYS_FRONTEND_VERSION) report_cpuavg_for_system(_PROC_STAT) report_meta_metrics(_PROC_PID_STAT) uptime = read_system_uptime() for entry in os.listdir(vs_prefix): entry_path = os.path.join(vs_prefix, entry) if not os.path.isdir(entry_path): continue if entry not in _vs_xid_names: init_vserver_xid_names() # Try reloading names to get new vserver names. # Skip, if still not present. if entry not in _vs_xid_names: collectd.error(('mlab: no vserver name found for xid %s after ' 'reloading names.') % entry) continue vs_name = _vs_xid_names[entry] if vs_name in _config_exclude_slices: # Do not collect any stats for this slice. continue vs_host = slicename_to_hostname(vs_name) report_cpu_for_vserver(vs_host, entry_path) report_network_for_vserver(vs_host, entry_path) report_limits_for_vserver(vs_host, entry_path) report_threads_for_vserver(vs_host, entry_path, uptime) if entry in vs_dlimits: report_quota_for_vserver(vs_host, vs_dlimits[entry])
def read_vsys_data(command, version): """Runs vsys 'command' and returns results as dict. See command notes for description of returned data format. Args: command: str, name of script or command to execute in vsys backend. version: int, expected version of backend response. Returns: dict, results of 'command'. """ # Send request through vsys (for slice context). data = read_vsys_data_direct(command) if 'data' not in data: collectd.error('%s: returned value has no "data" field.' % command) return {} if 'version' not in data: collectd.error('%s: returned value has no "version" field.' % command) return {} if 'message_type' in data and data['message_type'] != command: collectd.error('Returned message_type does not match request.') collectd.error('Requested: %s' % command) collectd.error('Received : %s' % data['message_type']) return {} if data['version'] != version: msg = '%s: version mismatch: found (%d), expected (%d)' % ( command, data['version'], version) collectd.warning(msg) return data['data']
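# Illustrative only: the envelope shape that read_vsys_data() above accepts without
# logging errors. The payload under 'data' is invented for this example; only the
# 'data', 'version' and 'message_type' checks come from the function itself.
_example_vsys_response = {
    'message_type': 'vs_xid_dlimits',      # must echo the requested command name
    'version': 1,                          # must equal the expected backend version
    'data': {'512': {'block_usage': 10}},  # returned to the caller as-is
}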
def init_stats_cache(): global white_list if os.path.exists(buddy_fname): num_buckets = 0 with open(buddy_fname) as f: for line in f: match = re_buddyinfo.search(line) if not match: collectd.error('buddyinfo: unknown line pattern: %s' % (line)) continue; node = match.group('node') zone = match.group('zone') free_pages = match.group('pages').strip().split() num_buckets = len(free_pages) if node not in node_list: node_list.append(node) if zone not in zone_list: zone_list.append(zone) stats_cache[(node, zone, 'val')] = free_pages stats_cache[(node, zone, 'ts')] = time.time() f.close() for i in range(0, num_buckets): white_list.append('free_pages_' + str(4*2**i) + 'K') collectd.info('buddyinfo: node_list : %s' % (node_list)) collectd.info('buddyinfo: zone_list : %s' % (zone_list)) collectd.info('buddyinfo: white_list: %s' % (white_list)) else: collectd.info('buddyinfo: init_stats_cache: path: %s does not exist' % (buddy_fname))
def get_stats(socket_file):
    """
    Makes two calls to haproxy to fetch server info and server stats.
    Returns a list of (metric_name, value, dimensions) tuples, where dimensions is
    either None or a dict of dimension names to values.
    """
    if socket_file is None:
        collectd.error("Socket configuration parameter is undefined. Couldn't get the stats")
        return
    stats = []
    haproxy = HAProxySocket(socket_file)

    try:
        server_info = haproxy.get_server_info()
        server_stats = haproxy.get_server_stats()
    except socket.error:
        collectd.warning(
            'status err Unable to connect to HAProxy socket at %s' % socket_file)
        return stats

    for key, val in server_info.iteritems():
        try:
            stats.append((key, int(val), None))
        except (TypeError, ValueError):
            pass

    for statdict in server_stats:
        if not (statdict['svname'].lower() in PROXY_MONITORS or
                statdict['pxname'].lower() in PROXY_MONITORS):
            continue
        for metricname, val in statdict.items():
            try:
                stats.append((metricname, int(val),
                              {'proxy_name': statdict['pxname'],
                               'service_name': statdict['svname']}))
            except (TypeError, ValueError):
                pass

    return stats
def config(self, cfg): if "Module.interval" in cfg: try: self.interval = int(cfg["Module.interval"][0]) collectd.info("MetricWriteTracker.interval == {}".format(self.interval)) except ValueError: collectd.error("module {0}, interval parameter must be an integer".format(self.__module__))
def read_properties(*cmd): properties = AutoVivification() out = None real_cmd = (CONF['scli_wrap'],CONF['scli_user'],CONF['scli_password']) + cmd my_verbose('Executing command: %s %s ******* %s' % (CONF['scli_wrap'], CONF['scli_user'], " ".join(str(v) for v in cmd))) try: out = subprocess.check_output(real_cmd, stderr=subprocess.STDOUT) my_debug('scli output: ' + out) except Exception as e: collectd.error('ScaleIO: error on executing scli command %s --- %s' % (e, traceback.format_exc())) return if 'Failed to connect to MDM 127.0.0.1:6611' in out: my_verbose('plugin is running on non-primary/active MDM, skipping data collection') group_name = None group_regex = re.compile("^([^\s]+)\s([^:]+)") kv_regex = re.compile("^\s+([^\s]+)\s+(.*)$") for line in out.split('\n'): new_group_match = group_regex.match(line) if new_group_match: group_name = new_group_match.group(2) else: kv_match = kv_regex.match(line) if kv_match: properties[group_name][kv_match.group(1)] = kv_match.group(2) my_verbose('Read properties: %s' % (json.dumps(properties))) rectify_dict(properties) my_debug('Properties after rectify: %s' % (json.dumps(properties))) return properties
def dispatch(self, stats): """ Dispatches the given stats. stats should be something like: {'plugin': {'plugin_instance': {'type': {'type_instance': <value>, ...}}}} """ if not stats: collectd.error("%s: failed to retrieve stats" % self.prefix) return self.logdebug("dispatching %d new stats :: %s" % (len(stats), stats)) try: for plugin in stats.keys(): for plugin_instance in stats[plugin].keys(): for type in stats[plugin][plugin_instance].keys(): type_value = stats[plugin][plugin_instance][type] if not isinstance(type_value, dict): self.dispatch_value(plugin, plugin_instance, type, None, type_value) else: for type_instance in stats[plugin][plugin_instance][type].keys(): self.dispatch_value(plugin, plugin_instance, type, type_instance, stats[plugin][plugin_instance][type][type_instance]) except Exception as exc: collectd.error("%s: failed to dispatch values :: %s :: %s" % (self.prefix, exc, traceback.format_exc()))
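# Sketch of the nested mapping that dispatch() above walks; the key names are
# placeholders. A plain number at the 'type' level is also accepted and is
# dispatched with type_instance=None, per the isinstance(type_value, dict) check.
_example_stats = {
    'ceph': {                             # plugin
        'pool-rbd': {                     # plugin_instance
            'gauge': {                    # type
                'read_bytes_sec': 1024,   # type_instance: value
                'write_bytes_sec': 2048,
            },
        },
        'cluster': {
            'gauge': 42,                  # no type_instance level
        },
    },
}
# self.dispatch(_example_stats) would emit one dispatch_value() call per leaf value.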
def read(): #collectd.info("read") for name in CONFIG: while True: #collectd.info("querying: " + name) try: switch = ENV.get_switch(name) v1 = collectd.Values(plugin='wemo') v1.type = 'power' v1.type_instance = 'power' v1.plugin_instance = name power = switch.current_power/1000.0 collectd.info("Got power from %s = %fW" % (name, power)) v1.values = [power] v1.dispatch() except UnknownDevice: collectd.error("Unknown device: " + name) except ConnectionError: ENV.start() ENV.discover() continue break env = None
def dispatch_data(data, composite_key, type, plugin_instance=None, type_instance=None):
    """Dispatch the data"""
    log_verbose('In Dispatch')
    log_verbose('===========')
    if plugin_instance is None:
        plugin_instance = 'unknown_splunkData'
        collectd.error('splunkData plugin: Data key not found: %s' % composite_key)
    if type_instance is None:
        type_instance = composite_key

    # split the composite key into its components
    keys = composite_key.split('/')
    # assigning value of first data item to value
    # this should speed up recursive lookup done below in for loop
    key = keys.pop(0)
    value = data[key]
    # recurse until final value is found!
    for key in keys:
        value = value[key]

    log_verbose('Sending Value: %s=%s' % (type_instance, value))
    val = collectd.Values(plugin='splunkData')
    val.type = type
    val.type_instance = type_instance
    val.plugin_instance = plugin_instance
    val.values = [value]
    val.dispatch()
def logger(t, msg): if t == "err": collectd.error("%s: %s" % (NAME, msg)) if t == "warn": collectd.warning("%s: %s" % (NAME, msg)) elif t == "verb" and VERBOSE_LOGGING == True: collectd.info("%s: %s" % (NAME, msg))
def read_callback(): """Get stats for all the servers in the cluster.""" for conf in CONFIGS: for host in conf['hosts']: try: zk = ZooKeeperServer(host, conf['port']) stats = zk.get_stats() for k, v in stats.items(): try: val = collectd.Values(plugin='zookeeper', meta={'0': True}) val.type = 'counter' if k in COUNTERS else 'gauge' val.type_instance = k val.values = [v] val.plugin_instance = conf['instance'] val.dispatch() except (TypeError, ValueError): collectd.error(('error dispatching stat; host=%s, ' 'key=%s, val=%s') % (host, k, v)) pass except socket.error: # Ignore because the cluster can still work even # if some servers fail completely. # This error should be also visible in a variable # exposed by the server in the statistics. log('unable to connect to server "%s"' % (host)) return stats
def read(data=None): starttime = time.time() gnocchi = client.Client(session=keystone_session) try: status = gnocchi.status.get() metric = collectd.Values() metric.plugin = 'gnocchi_status' metric.interval = INTERVAL metric.type = 'gauge' metric.type_instance = 'measures' metric.values = [status['storage']['summary']['measures']] metric.dispatch() metric = collectd.Values() metric.plugin = 'gnocchi_status' metric.interval = INTERVAL metric.type = 'gauge' metric.type_instance = 'metrics' metric.values = [status['storage']['summary']['metrics']] metric.dispatch() except Exception as err: collectd.error( 'gnocchi_status: Exception getting status: {}' .format(err)) timediff = time.time() - starttime if timediff > INTERVAL: collectd.warning( 'gnocchi_status: Took: {} > {}' .format(round(timediff, 2), INTERVAL))
def zope_read(self, data=None):
    self.logger('verb', 'read_callback')
    for metric in self._metrics:
        try:
            s = self.connect()
        except:
            collectd.error('Failed to connect to %s:%s'
                           % (self._zmonitor_hostname, self._zmonitor_port))
            return
        self.logger('verb', 'fetch %s' % metric)
        metricid = self.metric_configs[metric].get('alias', metric)
        s.sendall("%s\n" % metricid)
        output = None
        while 1:
            data = s.recv(1024)
            if data == "":
                break
            else:
                output = data
        s.close()
        if output is not None:
            data = self.strip_data(output, metric)
            self.logger('verb', 'got %s' % data)
            if data == '' or data is None:
                collectd.error('Received no data for %s' % metric)
                return
            values = collectd.Values(type=self.metric_configs[metric]['type'],
                                     plugin='zope')
            values.dispatch(plugin_instance='%s' % self._cluster_name,
                            type_instance=self.metric_configs[metric]['type_instance'],
                            values=(data, ))
def read_callback(): log_verbose('Read callback called') info = fetch_info() if not info: collectd.error('redis plugin: No info received') return # send high-level values dispatch_value(info, 'uptime_in_seconds','gauge') dispatch_value(info, 'connected_clients', 'gauge') dispatch_value(info, 'connected_slaves', 'gauge') dispatch_value(info, 'blocked_clients', 'gauge') dispatch_value(info, 'used_memory', 'bytes') dispatch_value(info, 'mem_fragmentation_ratio', 'gauge') dispatch_value(info, 'changes_since_last_save', 'gauge') dispatch_value(info, 'total_connections_received', 'counter', 'connections_received') dispatch_value(info, 'total_commands_processed', 'counter', 'commands_processed') dispatch_value(info, 'used_cpu_sys', 'counter', 'cpu_sys') dispatch_value(info, 'used_cpu_user', 'counter', 'cpu_user') dispatch_value(info, 'used_cpu_sys_children', 'counter', 'cpu_sys_children') dispatch_value(info, 'used_cpu_user_children', 'counter', 'cpu_user_children') # database and vm stats for key in info: if key.startswith('vm_stats_'): dispatch_value(info, key, 'gauge') if key.startswith('db'): dispatch_value(info[key], 'keys', 'gauge', '%s-keys' % key)
def logger(self, t, msg):
    if t == 'err':
        collectd.error('%s: %s' % (self.name, msg))
    elif t == 'warn':
        collectd.warning('%s: %s' % (self.name, msg))
    elif t == 'verb' and self.verbose:
        collectd.info('%s: %s' % (self.name, msg))
def get_stats(self): """Retrieves stats from ceph pgs""" ceph_cluster = "%s" % self.cluster data = { ceph_cluster: { 'pg': { } } } output = None try: output = subprocess.check_output('ceph pg dump --format json',shell=True) except Exception as exc: collectd.error("ceph-pg: failed to ceph pg dump :: %s :: %s" % (exc, traceback.format_exc())) return if output is None: collectd.error('ceph-pg: failed to ceph osd dump :: output was None') json_data = json.loads(output) pg_data = data[ceph_cluster]['pg'] # number of pgs in each possible state for pg in json_data['pg_stats']: for state in pg['state'].split('+'): if not pg_data.has_key(state): pg_data[state] = 0 pg_data[state] += 1 return data
def err(self, message): """ Log an error message """ fmsg = '%s:ERR %s' % (self.plugin_name, message) if not self.debug_mode: collectd.error(fmsg) else: print(fmsg)
def fetch_stats(conf): try: result = json.load(urllib2.urlopen(conf['mesos_url'], timeout=10)) except urllib2.URLError, e: collectd.error('%s plugin: Error connecting to %s - %r' % (PREFIX, conf['mesos_url'], e)) return None
def get_stats(self): """Retrieves stats from ceph buckets""" ceph_cluster = "%s-%s" % (self.prefix, self.cluster) data = { ceph_cluster: {} } stats_output = None try: stats_output = subprocess.check_output('radosgw-admin bucket stats', shell=True) except Exception as exc: collectd.error("ceph-rgw-bucket: failed to ceph pool stats :: %s :: %s" % (exc, traceback.format_exc())) return if stats_output is None: collectd.error('ceph-rgw-bucket: failed to ceph osd pool stats :: output was None') json_stats_data = json.loads(stats_output) # rgw bucket stats results for idx, bucket in enumerate(json_stats_data): bucket_key = "bucket-%s" % bucket['bucket'] data[ceph_cluster][bucket_key] = {} bucket_data = data[ceph_cluster][bucket_key] for stat in ('size_kb', 'size_kb_actual', 'num_objects'): if bucket['usage'].has_key('rgw.main'): bucket_data[stat] = bucket['usage']['rgw.main'][stat] if bucket['usage']['rgw.main'].has_key(stat) else 0 return data
def dispatch_value(info, key, type, plugin_instance=None, type_instance=None): """Read a key from info response data and dispatch a value""" if key not in info: collectd.warning('redis_info plugin: Info key not found: %s' % key) return if plugin_instance is None: plugin_instance = 'unknown redis' collectd.error('redis_info plugin: plugin_instance is not set, Info key: %s' % key) if not type_instance: type_instance = key try: value = int(info[key]) except ValueError: value = float(info[key]) log_verbose('Sending value: %s=%s' % (type_instance, value)) val = collectd.Values(plugin='redis_info') val.type = type val.type_instance = type_instance val.plugin_instance = plugin_instance val.values = [value] val.meta={'0': True} val.dispatch()
def error(self, msg): collectd.error('{name}: {msg}'.format(name=PLUGIN_NAME, msg=msg))
def config_callback(conf): """Receive configuration block""" project_name = "demo" project_domainid = "default" user_domainid = "default" region_name = None interval = 10 testing = False ssl_verify = True OPENSTACK_CLIENT = {} plugin_conf = {} custom_dimensions = {} http_timeout = None request_batch_size = 5 nova_list_servers_search_opts = {} query_server_metrics = True query_hypervisor_metrics = True required_keys = frozenset(("authurl", "username", "password")) for node in conf.children: try: if node.key.lower() in required_keys: plugin_conf[node.key.lower()] = node.values[0] elif node.key.lower() == "projectname": project_name = node.values[0] elif node.key.lower() == "projectdomainid": project_domainid = node.values[0] elif node.key.lower() == "userdomainid": user_domainid = node.values[0] elif node.key.lower() == "regionname": if node.values[0]: region_name = node.values[0] elif node.key.lower() == "dimension": if len(node.values) == 2: custom_dimensions.update({node.values[0]: node.values[1]}) else: collectd.warning( "WARNING: Check configuration setting for %s" % node.key) elif node.key.lower() == "interval": interval = node.values[0] elif node.key.lower() == "sslverify": ssl_verify = node.values[0] elif node.key.lower() == "httptimeout": http_timeout = node.values[0] elif node.key.lower() == "requestbatchsize": request_batch_size = int(node.values[0]) elif node.key.lower() == "queryservermetrics": query_server_metrics = node.values[0] elif node.key.lower() == "queryhypervisormetrics": query_hypervisor_metrics = node.values[0] elif node.key.lower() == "novalistserverssearchopts": nova_list_servers_search_opts = yaml.load( node.values[0], Loader=yaml.FullLoader) if not isinstance(nova_list_servers_search_opts, dict): raise TypeError( "NovaListSeverSearchOpts must be a string representation of yaml mapping. Received {0}." 
.format(node.values[0])) elif node.key.lower() == "testing": testing = node.values[0] except Exception as e: collectd.error( "Failed to load the configuration {0} due to {1}".format( node.key, e)) raise e OPENSTACK_CLIENT["query_server_metrics"] = query_server_metrics OPENSTACK_CLIENT["query_hypervisor_metrics"] = query_hypervisor_metrics for key in required_keys: try: plugin_conf[key] except KeyError: raise KeyError("Missing required config setting: %s" % key) try: novametrics = NovaMetrics( auth_url=plugin_conf["authurl"], username=plugin_conf["username"], password=plugin_conf["password"], project_name=project_name, project_domain_id=project_domainid, user_domain_id=user_domainid, region_name=region_name, ssl_verify=ssl_verify, http_timeout=http_timeout, request_batch_size=request_batch_size, list_servers_search_opts=nova_list_servers_search_opts, ) OPENSTACK_CLIENT["nova"] = novametrics cindermetrics = CinderMetrics( auth_url=plugin_conf["authurl"], username=plugin_conf["username"], password=plugin_conf["password"], project_name=project_name, project_domain_id=project_domainid, user_domain_id=user_domainid, region_name=region_name, ssl_verify=ssl_verify, http_timeout=http_timeout, ) OPENSTACK_CLIENT["cinder"] = cindermetrics neutronmetrics = NeutronMetrics(plugin_conf["authurl"], plugin_conf["username"], plugin_conf["password"], project_name, project_domainid, user_domainid, region_name, ssl_verify, http_timeout) OPENSTACK_CLIENT["neutron"] = neutronmetrics OPENSTACK_CLIENT["custdims"] = custom_dimensions except Exception as e: collectd.error( "Failed to authenticate Openstack client due to {0}".format(e)) if testing: return plugin_conf, OPENSTACK_CLIENT collectd.register_read(read_callback, interval, data=OPENSTACK_CLIENT, name=project_name)
    processed_stats = docker_dependency_resolver.get_DockerFormatter().process_stats(raw_stats)
    for container_name, container_stats in processed_stats.iteritems():
        timestamp = None
        if 'read' in container_stats:
            timestamp = container_stats['read']
            del (container_stats['read'])
        for metric_name, metric_value in container_stats.iteritems():
            collectd_dependency_resolver.get_Exporter().export(container_name, metric_name, metric_value, timestamp)
    docker_dependency_resolver.get_ContainerStatsStreamPool().keep_streams_running(running_container_names)


def init():
    collectd.register_read(read)


try:
    collectd_dependency_resolver = CollectdDependencyResolver.get_Resolver(collectd)
    docker_dependency_resolver = DockerDependencyResolver.get_Resolver(collectd_dependency_resolver.get_Logger(), socket_url, timeout)
    collectd.register_config(configure)
    collectd.register_init(init)
except Exception as exception:
    collectd.error('collectd-docker-stats-plugin: plugin stopped because of exception: {}'.format(exception.message))
def kairosdb_write(self, values, data=None): # noinspection PyBroadException try: # collectd.info(repr(v)) if values.type not in self.types: collectd.warning( 'kairosdb_writer: do not know how to handle type %s. do you have all your types.db files configured?' % values.type) return v_type = self.types[values.type] if len(v_type) != len(values.values): collectd.warning( 'kairosdb_writer: differing number of values for type %s' % values.type) return hostname = values.host.replace('.', self.host_separator) tags = self.tags_map.copy() if self.add_host_tag: tags['host'] = hostname plugin = values.plugin plugin_instance = '' if values.plugin_instance: plugin_instance = self.sanitize_field(values.plugin_instance) type_name = values.type type_instance = '' if values.type_instance: type_instance = self.sanitize_field(values.type_instance) # collectd.info('plugin: %s plugin_instance: %s type: %s type_instance: %s' % (plugin, plugin_instance, type_name, type_instance)) default_name = self.metric_name % { 'host': hostname, 'plugin': plugin, 'plugin_instance': plugin_instance, 'type': type_name, 'type_instance': type_instance } if self.pluginsToFormatter and plugin in self.pluginsToFormatter: name, tags = self.pluginsToFormatter[plugin].format_metric( self.metric_name, tags, hostname, plugin, plugin_instance, type_name, type_instance) elif self.formatter: name, tags = self.formatter.format_metric( self.metric_name, tags, hostname, plugin, plugin_instance, type_name, type_instance) else: name = default_name # Remove dots for missing pieces name = name.replace('..', '.') name = name.rstrip('.') # collectd.info('Metric: %s' % name) type_list = list(v_type) values_list = list(values.values) if plugin in self.convert_rates: i = 0 type_list = [] values_list = [] for value in values.values: if self.is_counter(v_type[i]): counter = "%s.%s" % (default_name, v_type[i][0]) with data['lock']: if value is not None: if counter in self.counters_map: old_value = self.counters_map[counter] try: rate = (value - old_value['value']) / ( values.time - old_value['timestamp']) values_list.append(rate) type_list.append([ v_type[i][0] + '_rate', 'GAUGE', '0', 'U' ]) except ZeroDivisionError: collectd.error( "Timestamp values are identical (caused divide by error) for %s" + default_name) self.counters_map[counter] = { 'value': value, 'timestamp': values.time } else: values_list.append(value) type_list.append(v_type[i]) i += 1 if self.protocol == 'http' or self.protocol == 'https': self.kairosdb_write_http_metrics(data, type_list, values.time, values_list, name, tags) else: self.kairosdb_write_telnet_metrics(data, type_list, values.time, values_list, name, tags) except Exception: collectd.error(traceback.format_exc())
exit_code = False except httplib.ImproperConnectionState, e: collectd.error('Lost connection to kairosdb server: %s' % e.message) data['conn'] = None exit_code = False except httplib.HTTPException, e: collectd.error('Error sending http data: %s' % e.message) if response: collectd.error(response) exit_code = False except Exception, e: collectd.error('Error sending http data: %s' % str(e)) exit_code = False return exit_code def kairosdb_write(self, values, data=None): # noinspection PyBroadException try: # collectd.info(repr(v)) if values.type not in self.types: collectd.warning( 'kairosdb_writer: do not know how to handle type %s. do you have all your types.db files configured?' % values.type) return v_type = self.types[values.type]
def init_func(): """Init the plugin.""" # do nothing till config is complete. if obj.config_complete() is False: return pc.PLUGIN_PASS if obj._node_ready is False: obj.node_ready() return pc.PLUGIN_PASS obj.hostname = socket.gethostname() # Determine the full list of logical cpus for this host obj.logical_cpus = get_logical_cpus() # Determine the subset of logical platform cpus that we want to monitor obj.cpu_list = get_platform_cpulist() if obj.debug: collectd.info('%s configured platform cpu list: %r' % (PLUGIN_DEBUG, obj.cpu_list)) # Ensure that the platform cpus are a subset of actual logical cpus if not (all(x in obj.logical_cpus for x in obj.cpu_list)): collectd.error('%s cpulist %r is not a subset of host logical cpus %r' % (PLUGIN, obj.cpu_list, obj.logical_cpus)) return pc.PLUGIN_FAIL # Monitor all logical cpus if no platform cpus have been specified if not obj.cpu_list: obj.cpu_list = obj.logical_cpus obj.number_platform_cpus = len(obj.cpu_list) collectd.info('%s found %d cpus total; monitoring %d cpus, cpu list: %s' % (PLUGIN, len(obj.logical_cpus), obj.number_platform_cpus, pc.format_range_set(obj.cpu_list))) # Check schedstat version version = 0 try: with open(SCHEDSTAT, 'r') as f: line = f.readline() match = re_schedstat_version.search(line) if match: version = int(match.group(1)) except Exception as err: collectd.error('%s Cannot read schedstat, error=%s' % (PLUGIN, err)) return pc.PLUGIN_FAIL if version != SCHEDSTAT_SUPPORTED_VERSION: obj.schedstat_supported = False collectd.error('%s unsupported schedstat version [%d]' % (PLUGIN, version)) return pc.PLUGIN_FAIL # Gather initial cputime state information. update_cpu_data(init=True) obj.init_completed() return pc.PLUGIN_PASS
def update_cpu_data(init=False): """Gather cputime info and Update platform cpu occupancy metrics. This gathers current per-cpu cputime information from schedstats and per-cgroup cputime information from cgroup cpuacct. This calculates the average cpu occupancy of the platform cores since this routine was last run. """ # Get epoch time in floating seconds now = time.time() # Calculate elapsed time delta since last run obj.elapsed_ms = float(pc.ONE_THOUSAND) * (now - obj._t0[TIMESTAMP]) # Prevent calling this routine too frequently (<= 1 sec) if not init and obj.elapsed_ms <= 1000.0: return t1 = {} t1[TIMESTAMP] = now if obj.schedstat_supported: # Get current per-cpu cumulative cputime usage from /proc/schedstat. cputimes = read_schedstat() for cpu in obj.cpu_list: t1[cpu] = cputimes[cpu] else: return # Get current cpuacct usages based on cgroup hierarchy t1_cpuacct = get_cpuacct() # Refresh the k8s pod information if we have discovered new cgroups cg_pods = set(t1_cpuacct[pc.GROUP_PODS].keys()) if not cg_pods.issubset(obj.k8s_pods): if obj.debug: collectd.info('%s Refresh k8s pod information.' % (PLUGIN_DEBUG)) obj.k8s_pods = set() pods = obj._k8s_client.kube_get_local_pods() for i in pods: # NOTE: parent pod cgroup name contains annotation config.hash as # part of its name, otherwise it contains the pod uid. uid = i.metadata.uid if ((i.metadata.annotations) and (pc.POD_ANNOTATION_KEY in i.metadata.annotations)): hash_uid = i.metadata.annotations.get(pc.POD_ANNOTATION_KEY, None) if hash_uid: if obj.debug: collectd.info('%s POD_ANNOTATION_KEY: ' 'hash=%s, uid=%s, ' 'name=%s, namespace=%s, qos_class=%s' % (PLUGIN_DEBUG, hash_uid, i.metadata.uid, i.metadata.name, i.metadata.namespace, i.status.qos_class)) uid = hash_uid obj.k8s_pods.add(uid) if uid not in obj._cache: obj._cache[uid] = pc.POD_object(i.metadata.uid, i.metadata.name, i.metadata.namespace, i.status.qos_class) # Remove stale _cache entries remove_uids = set(obj._cache.keys()) - obj.k8s_pods for uid in remove_uids: del obj._cache[uid] # Save initial state information if init: obj._t0 = copy.deepcopy(t1) obj._t0_cpuacct = copy.deepcopy(t1_cpuacct) return # Aggregate cputime delta for platform logical cpus using integer math cputime_ms = 0.0 for cpu in obj.cpu_list: # Paranoia check, we should never hit this. 
if cpu not in obj._t0: collectd.error('%s cputime initialization error' % (PLUGIN)) break cputime_ms += float(t1[cpu] - obj._t0[cpu]) cputime_ms /= float(pc.ONE_MILLION) # Calculate average occupancy of platform logical cpus occupancy = 0.0 if obj.number_platform_cpus > 0 and obj.elapsed_ms > 0: occupancy = float(pc.ONE_HUNDRED) * float(cputime_ms) \ / float(obj.elapsed_ms) / obj.number_platform_cpus else: occupancy = 0.0 obj._data[PLATFORM_CPU_PERCENT] = occupancy if obj.debug: collectd.info('%s %s elapsed = %.1f ms, cputime = %.1f ms, ' 'n_cpus = %d, occupancy = %.2f %%' % (PLUGIN_DEBUG, PLATFORM_CPU_PERCENT, obj.elapsed_ms, cputime_ms, obj.number_platform_cpus, occupancy)) # Calculate cpuacct delta for cgroup hierarchy, dropping transient cgroups cpuacct = {} for i in t1_cpuacct.keys(): cpuacct[i] = {} for k, v in t1_cpuacct[i].items(): if i in obj._t0_cpuacct and k in obj._t0_cpuacct[i]: cpuacct[i][k] = v - obj._t0_cpuacct[i][k] else: cpuacct[i][k] = v # Summarize cpuacct usage for various groupings for g in pc.OVERALL_GROUPS: cpuacct[pc.GROUP_OVERALL][g] = 0.0 # Aggregate cpuacct usage by K8S pod for uid in cpuacct[pc.GROUP_PODS]: acct = cpuacct[pc.GROUP_PODS][uid] if uid in obj._cache: pod = obj._cache[uid] else: collectd.warning('%s uid %s not found' % (PLUGIN, uid)) continue # K8S platform system usage, i.e., essential: kube-system if pod.namespace in pc.K8S_NAMESPACE_SYSTEM: cpuacct[pc.GROUP_OVERALL][pc.GROUP_K8S_SYSTEM] += acct # K8S platform addons usage, i.e., non-essential: monitor, openstack if pod.namespace in pc.K8S_NAMESPACE_ADDON: cpuacct[pc.GROUP_OVERALL][pc.GROUP_K8S_ADDON] += acct # Calculate base cpuacct usage (i.e., base tasks, exclude K8S and VMs) # e.g., docker, system.slice, user.slice for name in cpuacct[pc.GROUP_FIRST]: if name in pc.BASE_GROUPS: cpuacct[pc.GROUP_OVERALL][pc.GROUP_BASE] += \ cpuacct[pc.GROUP_FIRST][name] elif name not in pc.BASE_GROUPS_EXCLUDE: collectd.warning('%s could not find cgroup: %s' % (PLUGIN, name)) # Calculate platform cpuacct usage (this excludes apps) for g in pc.PLATFORM_GROUPS: cpuacct[pc.GROUP_OVERALL][pc.GROUP_PLATFORM] += \ cpuacct[pc.GROUP_OVERALL][g] # Calculate cgroup based occupancy for overall groupings for g in pc.OVERALL_GROUPS: cputime_ms = \ float(cpuacct[pc.GROUP_OVERALL][g]) / float(pc.ONE_MILLION) occupancy = float(pc.ONE_HUNDRED) * float(cputime_ms) \ / float(obj.elapsed_ms) / obj.number_platform_cpus obj._data[g] = occupancy if obj.debug: collectd.info('%s %s elapsed = %.1f ms, cputime = %.1f ms, ' 'n_cpus = %d, occupancy = %.2f %%' % (PLUGIN_DEBUG, g, obj.elapsed_ms, cputime_ms, obj.number_platform_cpus, occupancy)) # Update t0 state for the next sample collection obj._t0 = copy.deepcopy(t1) obj._t0_cpuacct = copy.deepcopy(t1_cpuacct)
def load_json(resp, url): try: return json.loads(resp) except ValueError, e: collectd.error("Error parsing JSON for API call (%s) %s" % (e, url)) return None
def configure_callback(conf): """ Configuration callback. These are accepted configs: Server: hostname or ip adresss Port: HEC port Token: HEC token QueueSize: Number, maximum metrics buffer SSL: true to use HTTPS VerifySSL: True to enable SSL verification CertFile: Public key of the signing authority Dimension: specify dimension for metrics i.e. Dimension "location:phoenix" Dimension "type:dev" SplunkMetricTransform: true to use Splunk metric format :param conf: configration tree """ dimension_list = [] for node in conf.children: config_key = node.key.lower() if config_key == 'server': CONFIG['server'] = node.values[0] elif config_key == 'port': try: CONFIG['port'] = int(node.values[0]) except Exception: collectd.error('Invalid type of Port, number is required.') elif config_key == 'token': CONFIG['token'] = node.values[0] elif config_key == 'ssl': ssl_val = node.values[0] if ssl_val in ['1', 'True']: CONFIG['ssl'] = True elif ssl_val in ['0', 'False']: CONFIG['ssl'] = False else: collectd.error('Invalid type of ssl, boolean is required.') elif config_key == 'verifyssl': ssl_val = node.values[0] if ssl_val in ['1', 'True']: CONFIG['verify_ssl'] = True elif ssl_val in ['0', 'False']: CONFIG['verify_ssl'] = False else: collectd.error('Invalid type of ssl, boolean is required.') elif config_key == 'queuesize': try: queue_size = int(node.values[0]) CONFIG['queue_size'] = queue_size except Exception: collectd.error( 'Invalid type of queue size, number is required.') elif config_key == 'batchsize': try: batch_size = int(node.values[0]) CONFIG['batch_size'] = batch_size except Exception: collectd.error( 'Invalid type of batch size, number is required.') elif config_key == 'certfile': CONFIG['cert_file'] = node.values[0] elif config_key == 'dimension': # if dimension value is empty, we continue if (len(node.values) == 0): collectd.error("Dimension value is empty") continue try: (key, value) = node.values[0].split(':') except ValueError: collectd.error("Invalid dimension values: %s" % (node.values)) continue dimension_list.append('%s=%s' % ((key, value))) elif config_key == 'splunkmetrictransform': should_transform = node.values[0] if should_transform in ['1', 'True']: CONFIG['splunk_metric_transform'] = True elif should_transform in ['0', 'False']: CONFIG['splunk_metric_transform'] = False else: collectd.error( 'Invalid type of splunk metric transform, boolean is required' ) else: collectd.error('Not supported config key: %s' % (config_key)) CONFIG[DIMENSION_LIST_KEY] = dimension_list collectd.debug("Setting configuration completed.") collectd.debug("configuration: {}".format(CONFIG))
def get_stats(self): collectd.error('Not implemented, should be subclassed')
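# Hedged sketch of what a concrete get_stats() implementation looks like from the
# point of view of the dispatch() helper shown earlier. ExampleCephPlugin is an
# invented name and inherits from object only to keep the sketch self-contained;
# a real plugin would subclass the shared base whose stub appears above, and the
# {cluster: {instance: {stat: value}}} shape follows the surrounding ceph examples.
class ExampleCephPlugin(object):
    prefix = 'ceph'
    cluster = 'ceph'

    def get_stats(self):
        """Builds the nested dict consumed by dispatch()."""
        ceph_cluster = "%s-%s" % (self.prefix, self.cluster)
        return {ceph_cluster: {'cluster': {'example_stat': 0}}}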
data[ceph_cluster][osd_id]['snap_trim_queue_len'] = osd[ 'snap_trim_queue_len'] data[ceph_cluster][osd_id]['num_snap_trimming'] = osd[ 'num_snap_trimming'] data[ceph_cluster][osd_id]['apply_latency_ms'] = osd[ 'fs_perf_stat']['apply_latency_ms'] data[ceph_cluster][osd_id]['commit_latency_ms'] = osd[ 'fs_perf_stat']['commit_latency_ms'] return data try: plugin = CephPGPlugin() except Exception as exc: collectd.error("ceph-pg: failed to initialize ceph pg plugin :: %s :: %s" % (exc, traceback.format_exc())) def configure_callback(conf): """Received configuration information""" plugin.config_callback(conf) collectd.register_read(read_callback, plugin.interval) def read_callback(): """Callback triggerred by collectd on read""" plugin.read_callback() collectd.register_init(CephPGPlugin.reset_sigchld) collectd.register_config(configure_callback)
def _send_data(config): protocol = 'http' if config['ssl'] is True: protocol = 'https' server_uri = '%s://%s:%s/services/collector' % (protocol, config['server'], config['port']) headers = ('Authorization: Splunk ' + config['token'] if CURRENT_OS == 'darwin' else { 'Authorization': 'Splunk ' + config['token'] }) metrics = [] while True: if not metrics: count = 0 start = time.time() timeout = config['timeout'] / 2 while (time.time() - start <= timeout) and count < config['batch_size']: # ITOA-8109: Queue api comes with condition/lock menchanism that handles the # case when queue is empty. If queue is empty then it puts this resource to waiting. # this way we are not in a infinite loop. # source: https://hg.python.org/cpython/file/3a1db0d2747e/Lib/Queue.py#l150 try: value = config['metric_queue'].get(timeout=timeout) new_metrics = _build_splunk_metrics(value) for m in new_metrics: metrics.append(json.dumps(m)) count += 1 except Queue.Empty: pass # If there is message in queue try: payload = ''.join(metrics) collectd.info( "payload data to be sent to Splunk: {}".format(payload)) if config['verify_ssl'] is False: if CURRENT_OS == 'darwin': args = [ 'curl', '-k', server_uri, '-H', headers, '-d', payload ] process = subprocess.Popen(args, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = process.communicate() else: response = requests.post(server_uri, data=payload, headers=headers, verify=False) if response.status_code != requests.codes.ok: collectd.error( 'Failed sending metrics to Splunk. Response code:{}, response content:{}' .format(response.status_code, response.content)) else: if CURRENT_OS == 'darwin': args = [ 'curl', '-k', '--cert', config['cert_file'], server_uri, '-H', headers, '-d', payload ] process = subprocess.Popen(args, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = process.communicate() else: response = requests.post(server_uri, data=payload, headers=headers, verify=config['cert_file']) if response.status_code != requests.codes.ok: collectd.error( 'Failed sending metric to Splunk. Response code:{}, response content:{}' .format(response.status_code, response.content)) if config['disable_ssl_warning'] and CURRENT_OS != 'darwin': requests.packages.urllib3.disable_warnings( InsecureRequestWarning) config['disable_ssl_warning'] = False except Exception, e: collectd.error('Failed sending metric to Splunk HEC: {}'.format( str(e))) # Try again in 3 seconds time.sleep(3) metrics = []
def read(self): """ read() method collects the data from libvirt and dispatch it """ self.establish_connection() if self.error == FAILURE: collectd.error("Empty stats dispatch, failed to open connection.") return for domain in self.conn.listAllDomains(0): if not domain.isActive(): collectd.warning("Failed to collectd interface " "stats for VM %s, VM is not running!" % domain.name()) continue tree = ElementTree.fromstring(domain.XMLDesc()) interfaces = [ i.get("dev") for i in tree.findall('devices/interface/target') ] total_rx_pkts = 0 total_tx_pkts = 0 total_rx_drops = 0 total_tx_drops = 0 total_rx_bytes = 0 total_tx_bytes = 0 for iface in interfaces: if iface: collectd.info( "Collecting stats for '%s' interface of VM: %s" % (iface, domain.name())) nic_data = self.collectd_nic_stats(domain, iface) self.prev_iface_data[nic_data[IFACE_NAME]] = deepcopy( nic_data) dispatch(nic_data) collectd.info("Data for interface: %s of VM: %s is " "dispatched" % (iface, domain.name())) total_rx_pkts = total_rx_pkts + nic_data[RX_PKTS] total_tx_pkts = total_tx_pkts + nic_data[TX_PKTS] total_rx_drops = total_rx_drops + nic_data[RX_DROPS] total_tx_drops = total_tx_drops + nic_data[TX_DROPS] total_rx_bytes = total_rx_bytes + nic_data[RX_BYTES] total_tx_bytes = total_tx_bytes + nic_data[TX_BYTES] interface = {} interface[TYPE] = AGGREGATE interface[RX_PKTS] = total_rx_pkts interface[TX_PKTS] = total_tx_pkts interface[RX_DROPS] = total_rx_drops interface[TX_DROPS] = total_tx_drops interface[RX_BYTES] = total_rx_bytes interface[TX_BYTES] = total_tx_bytes self.add_aggregate(domain, interface) self.prev_iface_agg[domain.name()] = interface dispatch(interface) if not self.error: self.conn.close()
def log_err(msg): collectd.error('COMPAL: ' + msg)
def get_stats(self): """Retrieves stats from ceph pools""" ceph_cluster = "%s-%s" % (self.prefix, self.cluster) data = {ceph_cluster: {}} stats_output = None try: osd_pool_cmdline = 'ceph osd pool stats -f json --cluster ' + self.cluster stats_output = subprocess.check_output(osd_pool_cmdline, shell=True) cephdf_cmdline = 'ceph df -f json --cluster ' + self.cluster df_output = subprocess.check_output(ceph_dfcmdline, shell=True) except Exception as exc: collectd.error("ceph-pool: failed to ceph pool stats :: %s :: %s" % (exc, traceback.format_exc())) return if stats_output is None: collectd.error( 'ceph-pool: failed to ceph osd pool stats :: output was None') if df_output is None: collectd.error('ceph-pool: failed to ceph df :: output was None') json_stats_data = json.loads(stats_output) json_df_data = json.loads(df_output) # push osd pool stats results for pool in json_stats_data: pool_key = "pool-%s" % pool['pool_name'] data[ceph_cluster][pool_key] = {} pool_data = data[ceph_cluster][pool_key] for stat in ('read_bytes_sec', 'write_bytes_sec', 'op_per_sec'): pool_data[stat] = pool['client_io_rate'][stat] if pool[ 'client_io_rate'].has_key(stat) else 0 # push df results for pool in json_df_data['pools']: pool_data = data[ceph_cluster]["pool-%s" % pool['name']] for stat in ('bytes_used', 'kb_used', 'objects'): pool_data[stat] = pool['stats'][stat] if pool['stats'].has_key( stat) else 0 # push totals from df data[ceph_cluster]['cluster'] = {} if json_df_data['stats'].has_key('total_bytes'): # ceph 0.84+ data[ceph_cluster]['cluster']['total_space'] = int( json_df_data['stats']['total_bytes']) data[ceph_cluster]['cluster']['total_used'] = int( json_df_data['stats']['total_used_bytes']) data[ceph_cluster]['cluster']['total_avail'] = int( json_df_data['stats']['total_avail_bytes']) else: # ceph < 0.84 data[ceph_cluster]['cluster']['total_space'] = int( json_df_data['stats']['total_space']) * 1024.0 data[ceph_cluster]['cluster']['total_used'] = int( json_df_data['stats']['total_used']) * 1024.0 data[ceph_cluster]['cluster']['total_avail'] = int( json_df_data['stats']['total_avail']) * 1024.0 return data
def get_metrics(conf): info = fetch_info(conf) if not info: collectd.error('redis plugin: No info received') return plugin_instance = conf['instance'] if plugin_instance is None: plugin_instance = '{host}:{port}'.format(host=conf['host'], port=conf['port']) # send high-level values dispatch_value(info, 'uptime_in_seconds', 'counter', plugin_instance) dispatch_value(info, 'connected_clients', 'counter', plugin_instance) dispatch_value(info, 'connected_slaves', 'counter', plugin_instance) dispatch_value(info, 'blocked_clients', 'counter', plugin_instance) dispatch_value(info, 'evicted_keys', 'counter', plugin_instance) dispatch_value(info, 'expired_keys', 'counter', plugin_instance) dispatch_value(info, 'used_memory', 'bytes', plugin_instance) dispatch_value(info, 'used_memory_rss', 'bytes', plugin_instance) dispatch_value(info, 'used_memory_peak', 'bytes', plugin_instance) dispatch_value(info, 'mem_fragmentation_ratio', 'gauge', plugin_instance) dispatch_value(info, 'changes_since_last_save', 'counter', plugin_instance) dispatch_value(info, 'total_connections_received', 'counter', plugin_instance, 'connections_received') dispatch_value(info, 'total_commands_processed', 'counter', plugin_instance, 'commands_processed') dispatch_value(info, 'instantaneous_ops_per_sec', 'counter', plugin_instance, 'instantaneous_ops') dispatch_value(info, 'rejected_connections', 'counter', plugin_instance) dispatch_value(info, 'pubsub_channels', 'counter', plugin_instance) dispatch_value(info, 'pubsub_patterns', 'counter', plugin_instance) dispatch_value(info, 'latest_fork_usec', 'counter', plugin_instance) # send keyspace hits and misses, if they exist if 'keyspace_hits' in info: dispatch_value(info, 'keyspace_hits', 'derive', plugin_instance) if 'keyspace_misses' in info: dispatch_value(info, 'keyspace_misses', 'derive', plugin_instance) # send replication stats, but only if they exist (some belong to master only, some to slaves only) if 'master_repl_offset' in info: dispatch_value(info, 'master_repl_offset', 'gauge', plugin_instance) if 'master_last_io_seconds_ago' in info: dispatch_value(info, 'master_last_io_seconds_ago', 'gauge', plugin_instance) if 'slave_repl_offset' in info: dispatch_value(info, 'slave_repl_offset', 'gauge', plugin_instance) # database and vm stats for key in info: if key.startswith('repl_'): dispatch_value(info, key, 'gauge', plugin_instance) if key.startswith('vm_stats_'): dispatch_value(info, key, 'gauge', plugin_instance) if key.startswith('db'): dispatch_value(info[key], 'keys', 'counter', plugin_instance, '%s-keys' % key) if key.startswith('slave'): dispatch_value(info[key], 'delay', 'gauge', plugin_instance, '%s-delay' % key)
def read_func(): # ntp query is for controllers only if tsc.nodetype != 'controller': return 0 if obj.init_complete is False: init_func() return 0 # get a list if provisioned ntp servers _get_ntp_servers() # nothing to do while there are no provisioned NTP servers if len(obj.server_list_conf) == 0: return 0 # Do NTP Query data = subprocess.check_output([PLUGIN_EXEC, PLUGIN_EXEC_OPTIONS]) # Keep this FIT test code but make it commented out for security # # if os.path.exists('/var/run/fit/ntpq_data'): # data = '' # collectd.info("%s using ntpq FIT data" % PLUGIN) # with open('/var/run/fit/ntpq_data', 'r') as infile: # for line in infile: # data += line if not data: collectd.error("%s no data from query" % PLUGIN) return 0 # Get the ntp query output into a list of lines obj.ntpq = data.split('\n') # keep track of changes ; only log on changes reachable_list_changed = False unreachable_list_changed = False # Manage the selected server name # # save the old value so we can print a log if the selected server changes if obj.selected_server: obj.selected_server_save = obj.selected_server # always assume no selected server ; till its learned obj.selected_server = '' # start with a fresh empty list for this new run to populate obj.server_list_ntpq = [] # Loop through the ntpq output. # Ignore the first 2 lines ; just header data. for i in range(2, len(obj.ntpq)): # Ignore empty or lines that are not long enough. IPV4 IP Address is at # least 7 characters long, IPV6 2. Adding 1 character for the # availability flag. if len(obj.ntpq[i]) < 3: continue # log the ntpq output ; minus the 2 lines of header collectd.info("NTPQ: %s" % obj.ntpq[i]) # Unreachable servers are ones whose line start with a space ip = '' if obj.ntpq[i][0] == ' ': # get the ip address # example format of line:['', '132.163.4.102', '', '', '.INIT.', # get ip from index [1] of the list unreachable = obj.ntpq[i].split(' ')[1] if unreachable: # check to see if its a controller ip # we skip over controller ips if _is_controller(unreachable) is False: _add_ip_to_ntpq_server_list(unreachable) if unreachable not in obj.unreachable_servers: if _raise_alarm(unreachable) is False: unreachable_list_changed = True # if the FM call to raise the alarm worked then # add this ip to the unreachable list if its not # already in it _add_unreachable_server(unreachable) # Reachable servers are ones whose line start with a '+' elif obj.ntpq[i][0] == '+': # remove the '+' and get the ip ip = obj.ntpq[i].split(' ')[0][1:] elif obj.ntpq[i][0] == '*': # remove the '*' and get the ip cols = obj.ntpq[i].split(' ') ip = cols[0][1:] if ip: ip_family = _is_ip_address(ip) obj.peer_selected = _is_controller(ip) if ip != obj.selected_server and obj.alarm_raised is True: # a new ntp server is selected, old alarm may not be # valid _clear_base_alarm() if obj.peer_selected is False: if obj.selected_server: # done update the selected server if more selections # are found. go with the first one found. 
collectd.info("%s additional selected server found" " '%s'; current selection is '%s'" % (PLUGIN, ip, obj.selected_server)) else: # update the selected server list obj.selected_server = ip collectd.debug("%s selected server is '%s'" % (PLUGIN, obj.selected_server)) else: # refer to peer refid = '' for i in range(1, len(cols)): if cols[i] != '': refid = cols[i] break if refid not in ('', '127.0.0.1') and \ not _is_controller(refid) and \ socket.AF_INET == ip_family: # ipv4, peer controller refer to a time source is not # itself or a controller (this node) obj.selected_server = ip collectd.debug("peer controller has a reliable " "source") # anything else is unreachable else: unreachable = obj.ntpq[i][1:].split(' ')[0] if _is_controller(unreachable) is False: _add_ip_to_ntpq_server_list(unreachable) if unreachable not in obj.unreachable_servers: if _raise_alarm(unreachable) is False: unreachable_list_changed = True # if the FM call to raise the alarm worked then # add this ip to the unreachable list if its not # already in it _add_unreachable_server(unreachable) if ip: # if the ip is valid then manage it if _is_controller(ip) is False: _add_ip_to_ntpq_server_list(ip) # add the ip to the reachable servers list # if its not already there if ip not in obj.reachable_servers: obj.reachable_servers.append(ip) reachable_list_changed = True # make sure this IP is no longer in the unreachable # list and that alarms for it are cleared _remove_ip_from_unreachable_list(ip) _cleanup_stale_servers() if obj.selected_server: if obj.selected_server != obj.selected_server_save: collectd.info( "%s selected server changed from '%s' to '%s'" % (PLUGIN, obj.selected_server_save, obj.selected_server)) obj.selected_server_save = obj.selected_server if obj.alarm_raised is True: _clear_base_alarm() elif obj.alarm_raised is False: if obj.peer_selected: collectd.info("%s peer is selected" % PLUGIN) else: collectd.error("%s no selected server" % PLUGIN) if _raise_alarm() is False: obj.selected_server_save = 'None' # only log and act on changes if reachable_list_changed is True: if obj.reachable_servers: collectd.info("%s reachable servers: %s" % (PLUGIN, obj.reachable_servers)) if obj.alarm_raised is True: if obj.selected_server and obj.reachable_servers: _clear_base_alarm() else: collectd.error("%s no reachable servers" % PLUGIN) _raise_alarm() # only log changes if unreachable_list_changed is True: if obj.unreachable_servers: collectd.info("%s unreachable servers: %s" % (PLUGIN, obj.unreachable_servers)) else: collectd.info("%s all servers are reachable" % PLUGIN) # The sample published to the database is simply the number # of reachable servers if one is selected if not obj.selected_server: sample = 0 else: sample = len(obj.reachable_servers) # Dispatch usage value to collectd val = collectd.Values(host=obj.hostname) val.plugin = 'ntpq' val.type = 'absolute' val.type_instance = 'reachable' val.dispatch(values=[sample]) return 0
def read_config(self, cfg): """Initializes variables from conf files.""" for children in cfg.children: if children.key == INTERVAL: self.interval = children.values[0] elif children.key == USE_REST_API: use_rest_api = int(children.values[0]) elif children.key == USER: self.username = children.values[0] elif children.key == PASSWORD: self.password = children.values[0] host, port, index = self.get_elastic_search_details() elastic["host"] = host elastic["port"] = port indices["workflow"] = index appname = self.get_app_name() tag_app_name['oozie'] = appname self.cluster_name = self.get_cluster() job_history_server["port"] = "19888" timeline_server["port"] = "8188" oozie["port"] = "11000" self.hdfs_port = "50070" if not os.path.isdir(jobhistory_copy_dir): try: os.mkdir(jobhistory_copy_dir) except: collectd.error("Unable to create job history directory %s" % jobhistory_copy_dir) timezone = self.get_time_zone() if not timezone: collectd.error("Unable to get timezone") if self.cluster_name and timezone and self.is_service_running( ["OOZIE", "MAPREDUCE2", "SPARK2", "HDFS"]): job_history_host = self.get_hadoop_service_details( self.url_knox + "/" + self.cluster_name + "/services/MAPREDUCE2/components/HISTORYSERVER") if job_history_host: job_history_server["host"] = job_history_host[0] else: collectd.error("Unable to get Job_history ip") timeline_host = self.get_hadoop_service_details( self.url_knox + "/" + self.cluster_name + "/services/YARN/components/APP_TIMELINE_SERVER") if timeline_host: timeline_server["host"] = timeline_host[0] else: collectd.error("Unable to get timeline_server ip") oozie_host = self.get_hadoop_service_details( self.url_knox + "/" + self.cluster_name + "/services/OOZIE/components/OOZIE_SERVER") if oozie_host: oozie["host"] = oozie_host[0] else: collectd.error("Unable to get oozie ip") self.hdfs_hosts = self.get_hadoop_service_details( self.url_knox + "/" + self.cluster_name + "/services/HDFS/components/NAMENODE") if self.hdfs_hosts: if len(self.hdfs_hosts) == 2: hdfs["url"] = "http://{0}:{1};http://{2}:{3}".format( self.hdfs_hosts[0], self.hdfs_port, self.hdfs_hosts[1], self.hdfs_port) else: hdfs["url"] = "http://{0}:{1}".format( self.hdfs_hosts[0], self.hdfs_port) hdfs['timezone'] = timezone hdfs["user"] = "******" else: collectd.error("Unable to get hdfs ips") if job_history_host and timeline_host and oozie_host and self.hdfs_hosts: self.update_config_file(use_rest_api, jobhistory_copy_dir) self.is_config_updated = 1 initialize_app() initialize_app_elastic() else: collectd.error("Unable to get cluster name")
def get_redis_details(self): details_dict={} stats_dict={} persistence_dict = {} cpu_dict = {} final_redis_dict={} try: server_details = self.redis_client.info(section="server") if server_details: details_dict["version"] = server_details.get("redis_version",None) details_dict["buildId"] = server_details.get("redis_build_id",None) details_dict["mode"] = server_details.get("redis_mode",None) details_dict["os"] = server_details.get("os") details_dict["tcpPort"] = server_details.get("tcp_port") details_dict["runId"] = server_details.get("run_id") details_dict["upTime"] = server_details.get("uptime_in_seconds",None) server_conn_details = self.redis_client.info("clients") if server_conn_details: stats_dict["clientLongestOutputList"] = server_conn_details.get("client_longest_output_list") stats_dict["clientBiggestInputBuf"] = server_conn_details.get("client_biggest_input_buf") stats_dict["blockedClients"] = server_conn_details.get("blocked_clients",0) stats_dict["connectedClients"] = server_conn_details.get("connected_clients",0) server_stats = self.redis_client.info(section="stats") if server_stats: input_bytes = None try: input_bytes = round(server_stats.get("total_net_input_bytes",0) / (1024.0 * 1024.0), 2) except Exception as e: collectd.error("Error in getting total input bytes due to %s" % str(e)) output_bytes = None try: output_bytes = round(server_stats.get("total_net_output_bytes",0) / (1024.0 * 1024.0), 2) except Exception as e: collectd.error("Error in getting total input bytes due to %s" % str(e)) stats_dict["clusterEnabled"] = True if self.redis_client.info().get("cluster_enabled")== 1 else False stats_dict["instantaneousInputKbps"] = server_stats.get("instantaneous_input_kbps",0.0) stats_dict["instantaneousOutputKbps"] = server_stats.get("instantaneous_output_kbps",0.0) if self.pollCounter <= 1: self.previousData["syncFull"] = server_stats.get("sync_full",0) self.previousData["syncPartialOk"] = server_stats.get("sync_partial_ok",0) self.previousData["syncPartialErr"] = server_stats.get("sync_partial_err",0) self.previousData["totalConnReceived"] = server_stats.get("total_connections_received",0) self.previousData["totalCommandsProcessed"] = server_stats.get("total_commands_processed",0) self.previousData["totalNetInputBytes"] = input_bytes self.previousData["totalNetOuputBytes"] = output_bytes self.previousData["keyspaceHits"] = server_stats.get("keyspace_hits",0) self.previousData["keyspaceMisses"] = server_stats.get("keyspace_misses",0) self.previousData["expiredKeys"] = server_stats.get("expired_keys",0) self.previousData["evictedKeys"] = server_stats.get("evicted_keys",0) self.previousData["rejectedConn"] = server_stats.get("rejected_connections",0) stats_dict["syncFull"]=0 stats_dict["syncPartialOk"]=0 stats_dict["syncPartialErr"]=0 stats_dict["totalConnReceived"] = 0 stats_dict["totalCommandsProcessed"] = 0 stats_dict["totalNetInputBytes"] = 0 stats_dict["totalNetOutputBytes"] = 0 stats_dict["keyspaceHits"] = 0 stats_dict["keyspaceMisses"] = 0 # stats_dict["keyspaceHitRate"] = 0.0 stats_dict["keyspaceMissRate"] = 0.0 stats_dict["writeThroughput"] = 0.0 stats_dict["readThroughput"] = 0.0 else: stats_dict["syncFull"]=server_stats.get("sync_full",0)-self.previousData["syncFull"] stats_dict["syncPartialOk"]= server_stats.get("sync_partial_ok",0)-self.previousData["syncPartialOk"] stats_dict["syncPartialErr"]=server_stats.get("sync_partial_err",0)-self.previousData["syncPartialErr"] stats_dict["rejectedConn"] = server_stats.get("rejected_connections",0) - 
self.previousData["rejectedConn"] stats_dict["expiredKeys"] = server_stats.get("expired_keys",0) - self.previousData["expiredKeys"] stats_dict["evictedKeys"] = server_stats.get("evicted_keys",0) - self.previousData["evictedKeys"] stats_dict["totalConnReceived"] = server_stats.get("total_connections_received",0) - self.previousData["totalConnReceived"] stats_dict["totalCommandsProcessed"] = server_stats.get("total_commands_processed",0) - self.previousData["totalCommandsProcessed"] stats_dict["totalNetInputBytes"] = input_bytes - self.previousData["totalNetInputBytes"] stats_dict["totalNetOutputBytes"] = output_bytes - self.previousData["totalNetOuputBytes"] stats_dict["keyspaceHits"] = server_stats.get("keyspace_hits",0) - self.previousData["keyspaceHits"] stats_dict["keyspaceMisses"] = server_stats.get("keyspace_misses",0) - self.previousData["keyspaceMisses"] # if ((stats_dict["keyspaceHits"] > 0) or (stats_dict["keyspaceMisses"] > 0)): # keyspace_dict["keyspaceHitRate"] = round(float(stats_dict["keyspaceHits"] / (keyspace_dict["keyspaceHits"] + stats_dict["keyspaceMisses"])), 2) # stats_dict["keyspaceMissRate"] = round(float(stats_dict["keyspaceMisses"] / (stats_dict["keyspaceHits"] + stats_dict["keyspaceMisses"])), 2) # else: # stats_dict["keyspaceHitRate"] = 0 # stats_dict["keyspaceMissRate"] = 0 stats_dict["readThroughput"] = round(float(float(stats_dict["totalNetInputBytes"]) / int(self.interval)), 2) stats_dict["writeThroughput"] = round(float(float(stats_dict["totalNetOutputBytes"]) / int(self.interval)), 2) self.previousData["syncFull"] = server_stats.get("sync_full",0) self.previousData["syncPartialOk"] = server_stats.get("sync_partial_ok",0) self.previousData["syncPartialErr"] = server_stats.get("sync_partial_err",0) self.previousData["totalConnReceived"] = server_stats.get("total_connections_received",0) self.previousData["totalCommandsProcessed"] = server_stats.get("total_commands_processed",0) self.previousData["totalNetInputBytes"] = input_bytes self.previousData["totalNetOutputBytes"] = output_bytes self.previousData["keyspaceHits"] = server_stats.get("keyspace_hits",0) self.previousData["keyspaceMisses"] = server_stats.get("keyspace_misses",0) self.previousData["expiredKeys"] = server_stats.get("expired_keys",0) self.previousData["evictedKeys"] = server_stats.get("evicted_keys",0) self.previousData["rejectedConn"] = server_stats.get("rejected_connections",0) keyspace_details = self.redis_client.info("keyspace") if keyspace_details: totalk = 0 dbcount = 0 for k, v in keyspace_details.items(): totalk += int(v["keys"]) dbcount += 1 stats_dict["totKeys"] = totalk else: collectd.error("No Key details found") stats_dict["totKeys"] = 0 outlis = subprocess.check_output(["redis-cli", "--intrinsic-latency", "1"]).split() if len(outlis) > 0: try: stats_dict["latency"] = float(outlis[-16]) except ValueError: collectd.error("No latency details found") stats_dict["latency"] = 0 memory_stats = self.redis_client.info(section="memory") if memory_stats: stats_dict["usedMemoryPeak"] = round(memory_stats.get("used_memory_peak",0) / (1024.0 * 1024.0), 2) stats_dict["totalSystemMemory"] = round(memory_stats.get("total_system_memory",0) / (1024.0 * 1024.0), 2) stats_dict["memFragmentationRatio"] = memory_stats.get("mem_fragmentation_ratio",0) stats_dict["memoryAllocator"] = memory_stats.get("mem_allocator",None) stats_dict["maxmemoryPolicy"] = memory_stats.get("maxmemory_policy") stats_dict["memoryUsed"] = round(memory_stats.get("used_memory",0) / (1024.0 * 1024.0), 2) else: collectd.error("No 
memory stats found") pass cpu_stats = self.redis_client.info(section="cpu") if cpu_stats: if self.pollCounter <= 1: self.previousData["usedCpuSys"] = cpu_stats.get("used_cpu_sys",0.0) self.previousData["usedCpuUser"] = cpu_stats.get("used_cpu_user",0.0) self.previousData["usedCpuUserChildren"] = cpu_stats.get("used_cpu_user_children",0.0) self.previousData["usedCpuSysChildren"] = cpu_stats.get("used_cpu_sys_children",0.0) details_dict["usedCpuSys"] = 0.0 details_dict["usedCpuUser"]= 0.0 details_dict["usedCpuUserChildren"]= 0.0 details_dict["usedCpuSysChildren"]= 0.0 else: details_dict["usedCpuSys"] = cpu_stats.get("used_cpu_sys",0.0)-self.previousData["usedCpuSys"] details_dict["usedCpuUser"] = cpu_stats.get("used_cpu_user",0.0)- self.previousData["usedCpuUser"] details_dict["usedCpuUserChildren"] = cpu_stats.get("used_cpu_user_children",0.0) - self.previousData["usedCpuUserChildren"] details_dict["usedCpuSysChildren"] = cpu_stats.get("used_cpu_sys_children",0.0) - self.previousData["usedCpuSysChildren"] self.previousData["usedCpuSys"] = cpu_stats.get("used_cpu_sys",0.0) self.previousData["usedCpuUser"] = cpu_stats.get("used_cpu_user",0.0) self.previousData["usedCpuUserChildren"] = cpu_stats.get("used_cpu_user_children",0.0) self.previousData["usedCpuSysChildren"] = cpu_stats.get("used_cpu_sys_children",0.0) persistence_stats = self.redis_client.info(section="persistence") if persistence_stats: persistence_dict["aofEnabled"] = True if persistence_stats.get("aof_enabled") == 1 else False persistence_dict["aofRewriteInProgress"] = True if persistence_stats.get("aof_rewrite_in_progress") == 1 else False persistence_dict["aofLastWriteStatus"] = persistence_stats.get("aof_last_write_status","Failed") persistence_dict["aofRewriteScheduled"] = True if persistence_stats.get("aof_rewrite_scheduled") == 1 else False persistence_dict["aofCurrentSize"] = round(persistence_stats.get("aof_current_size",0)/1024, 2) persistence_dict["aofBufferLength"] = round(persistence_stats.get("aof_buffer_length",0)/ 1024, 2) persistence_dict["rdbBgsaveInProgress"] = True if persistence_stats.get("rdb_bgsave_in_progress") == 1 else False persistence_dict["rdbLastSaveTime"] = persistence_stats.get("rdb_last_save_time",0) persistence_dict["rdbLastBgsaveStatus"] = persistence_stats.get("rdb_last_bgsave_status","Failed") persistence_dict["loadingStartTime"] = persistence_stats.get("loading_start_time",0) persistence_dict["loadingTotalKBytes"] = round(persistence_stats.get("loading_total_bytes",0.0)/(1024.0), 2) if self.pollCounter <= 1: self.previousData["loadingLoadedKBytes"] = persistence_stats.get("loading_loaded_bytes",0.0) persistence_dict["loadingLoadedKBytes"] = 0.0 else: persistence_dict["loadingLoadedKBytes"] = round((persistence_stats.get("loading_loaded_bytes",0.0)-self.previousData["loadingLoadedKBytes"])/(1024.0), 2) self.previousData["loadingLoadedKBytes"] = persistence_stats.get("loading_loaded_bytes",0.0) persistence_dict["loadingLoadedPerc"] = int(persistence_stats.get("loading_loaded_perc",0)) persistence_dict["loadingEtaSeconds"] = int(persistence_stats.get("loading_eta_seconds",0)) persistence_dict[PLUGINTYPE] = "redisPersistence" rep_stats = self.redis_client.info(section="replication") if rep_stats: details_dict["role"] = rep_stats.get("role",None) stats_dict["connectedSlaves"] = rep_stats.get("connected_slaves",0) details_dict["replBacklogActive"] = True if rep_stats.get("repl_backlog_active") == 1 else False if self.pollCounter <= 1: self.previousData["replBacklogHistlen"] = 
int(rep_stats.get("repl_backlog_histlen",0)) details_dict["replBacklogHistlen"] = 0 else: details_dict["replBacklogHistlen"] = int((rep_stats.get("repl_backlog_histlen",0)-self.previousData["replBacklogHistlen"])/1024) self.previousData["replBacklogHistlen"] = rep_stats.get("repl_backlog_histlen",0) details_dict["masterLinkStatus"] = rep_stats.get("master_link_status", None) stats_dict["masterLastIOSecsAgo"] = rep_stats.get("master_last_io_seconds_ago", None) details_dict["masterLinkDownSinceSecs"] = rep_stats.get("master_link_down_since_seconds", None) details_dict[PLUGINTYPE] = "redisDetails" stats_dict[PLUGINTYPE] = "redisStat" final_redis_dict["redisPersistence"] = persistence_dict final_redis_dict["redisDetails"] = details_dict final_redis_dict["redisStat"] = stats_dict self.add_common_params(final_redis_dict) return final_redis_dict except Exception as err: collectd.error("Unable to fetch the details due to %s" % str(err)) return final_redis_dict
except socket.error, e: collectd.error('redis_info plugin: Error connecting to %s:%d - %r' % (REDIS_HOST, REDIS_PORT, e)) return None fp = s.makefile('r') if REDIS_AUTH is not None: log_verbose('Sending auth command') s.sendall('auth %s\r\n' % (REDIS_AUTH)) status_line = fp.readline() if not status_line.startswith('+OK'): # -ERR invalid password # -ERR Client sent AUTH, but no password is set collectd.error('redis_info plugin: Error sending auth to %s:%d - %r' % (REDIS_HOST, REDIS_PORT, status_line)) return None log_verbose('Sending info command') s.sendall('info\r\n') status_line = fp.readline() content_length = int(status_line[1:-1]) # status_line looks like: $<content_length> data = fp.read(content_length) log_verbose('Received data: %s' % data) s.close() linesep = '\r\n' if '\r\n' in data else '\n' return parse_info(data.split(linesep))
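# Hedged sketch of what a parse_info()-style helper typically does with the
# INFO payload read above: the reply is 'key:value' lines interleaved with
# '# Section' headers and blank lines, which fold naturally into a flat dict.
# The exact helper used by this plugin may differ.
def parse_info_lines(lines):
    info = {}
    for line in lines:
        line = line.strip()
        if not line or line.startswith('#') or ':' not in line:
            continue                       # skip blanks and section headers
        key, _, value = line.partition(':')
        info[key] = value
    return info

# parse_info_lines(['# Server', 'redis_version:5.0.7', 'uptime_in_seconds:42'])
# -> {'redis_version': '5.0.7', 'uptime_in_seconds': '42'}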
data[self.prefix][name]['memory_mb'] * data[self.prefix]['cluster']['config']['AllocationRatioRam'] data[self.prefix][name]['memory_mb_overcommit_withreserve'] = \ data[self.prefix][name]['memory_mb_overcommit'] - data[self.prefix]['cluster']['config']['ReservedNodeRamMB'] data[self.prefix][name]['vcpus_overcommit'] = \ data[self.prefix][name]['vcpus'] * data[self.prefix]['cluster']['config']['AllocationRatioCores'] data[self.prefix][name]['vcpus_overcommit_withreserve'] = \ data[self.prefix][name]['vcpus_overcommit'] - data[self.prefix]['cluster']['config']['ReservedNodeCores'] return data try: plugin = NovaPlugin() except Exception as exc: collectd.error( "openstack-nova: failed to initialize nova plugin :: %s :: %s" % (exc, traceback.format_exc())) def configure_callback(conf): """Received configuration information""" plugin.config_callback(conf) def read_callback(): """Callback triggered by collectd on read""" plugin.read_callback() collectd.register_config(configure_callback) collectd.register_read(read_callback, plugin.interval)
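# Worked example of the overcommit arithmetic above, with made-up numbers:
# schedulable capacity is physical capacity times the allocation ratio minus
# the per-node reservation, for both RAM and vCPUs.  The helper name is
# illustrative.
def overcommit_capacity(physical, allocation_ratio, reserved):
    return physical * allocation_ratio - reserved

# 64 GiB of RAM (65536 MB), ratio 1.5, 4096 MB reserved:
# overcommit_capacity(65536, 1.5, 4096) == 94208.0   # MB visible to the scheduler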
results = output.split('\n') # push values data[ceph_cluster]['cluster'] = {} data[ceph_cluster]['cluster']['avg_latency'] = results[0] data[ceph_cluster]['cluster']['stddev_latency'] = results[1] data[ceph_cluster]['cluster']['max_latency'] = results[2] data[ceph_cluster]['cluster']['min_latency'] = results[3] return data try: plugin = CephLatencyPlugin() except Exception as exc: collectd.error( "ceph-latency: failed to initialize ceph latency plugin :: %s :: %s" % (exc, traceback.format_exc())) def configure_callback(conf): """Received configuration information""" plugin.config_callback(conf) def read_callback(): """Callback triggered by collectd on read""" plugin.read_callback() collectd.register_config(configure_callback) collectd.register_read(read_callback, plugin.interval)
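# Hedged sketch of the output the rados bench pipeline above produces: the
# awk filter emits four latency figures (average, stddev, max, min) in
# milliseconds, one per line, which the plugin maps positionally.  The helper
# below is illustrative and adds float conversion plus a short-output guard.
def parse_bench_latency(output):
    labels = ('avg_latency', 'stddev_latency', 'max_latency', 'min_latency')
    lines = [l for l in output.splitlines() if l.strip()]
    return dict(zip(labels, (float(l) for l in lines)))

# parse_bench_latency("1.9\n0.4\n3.2\n1.1\n")
# -> {'avg_latency': 1.9, 'stddev_latency': 0.4, 'max_latency': 3.2, 'min_latency': 1.1}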
def poll(self): node_stats = dict() try: endTime = int( time.time() * 1000) #swagger-stats time is represented in nano seconds startTime = endTime - (int(self.interval) * 1000) error_response = False log_stats = True url_api_request = "http://localhost:{}/swagger-stats/stats?fields=apistats".format( self.port) response = requests.get(url_api_request) final_json_to_be_dispatched = list() if response.status_code == 200: collectd.info( 'Plugin nodejsapi: Response code 200 received for apistats' ) content = response.content #response_json = json.loads(content) response_json = ast.literal_eval(content) swagger_api_stats = response_json.get("apistats") api_req_stats = dict() for path_key in swagger_api_stats.keys(): path = path_key method_info = swagger_api_stats[path] method_level_info = dict() for method in method_info.keys(): req_method = method req_method_details = method_info[method] api_req = {} api_req["requests"] = req_method_details.get( "requests") api_req["responses"] = req_method_details.get( "responses") api_req["redirect"] = req_method_details.get( "redirect") api_req["total_time"] = req_method_details.get( "total_time") api_req["success"] = req_method_details.get("success") api_req["errors"] = req_method_details.get("errors") api_req["total_req_clength"] = req_method_details.get( "total_req_clength") api_req["total_res_clength"] = req_method_details.get( "total_res_clength") method_level_info[req_method] = api_req api_req_stats[path] = method_level_info if self.previous_data: for key_path in api_req_stats.keys(): api_stats = dict() if key_path in self.previous_data.keys(): method_info = api_req_stats[key_path] for method, method_details in method_info.items(): if method in self.previous_data[key_path].keys( ): if (method_details.get("requests") - self.previous_data[key_path] [method]["requests"]) < 0: log_stats = False #dont log long request and errors when app server is restarted final_json_to_be_dispatched = list() break api_stats["requests"] = method_details.get( "requests") - self.previous_data[ key_path][method]["requests"] api_stats[ "responses"] = method_details.get( "responses") - self.previous_data[ key_path][method]["responses"] api_stats["redirect"] = method_details.get( "redirect") - self.previous_data[ key_path][method]["redirect"] api_stats[ "total_time"] = method_details.get( "total_time") - self.previous_data[ key_path][method]["total_time"] api_stats["success"] = method_details.get( "success") - self.previous_data[ key_path][method]["success"] api_stats["errors"] = method_details.get( "errors") - self.previous_data[ key_path][method]["errors"] api_stats[ "total_req_clength"] = method_details.get( "total_req_clength" ) - self.previous_data[key_path][ method]["total_req_clength"] api_stats["method"] = method api_stats["path"] = key_path else: api_stats = copy.deepcopy(method_details) api_stats["method"] = method api_stats["path"] = key_path if int(api_stats["requests"]) != 0: final_json_to_be_dispatched.append( api_stats) else: method_info = api_req_stats[key_path] for method, method_details in method_info.items(): api_stats = copy.deepcopy(method_details) api_stats["method"] = method api_stats["path"] = key_path final_json_to_be_dispatched.append(api_stats) else: log_stats = False #dont log long request and errors in first poll self.previous_data = copy.deepcopy(api_req_stats) else: error_response = True node_stats[API_STATS] = final_json_to_be_dispatched url_last_errors = "http://localhost:{}/swagger-stats/stats?fields=lasterrors".format( self.port) response = 
requests.get(url_last_errors) error_stats = list() if response.status_code == 200 and log_stats: # don't log data for first interval collectd.info( 'Plugin nodejsapi: Response code 200 received for lasterrors' ) content = response.content # response_json = json.loads(content) response_json = ast.literal_eval(content) swagger_error_stats = response_json.get("lasterrors") for error in swagger_error_stats: if (error.get("startts") <= endTime and error.get("startts") >= startTime): error_stat = {} error_stat["path"] = error.get("api").get("path") error_stat["method"] = error.get("method") error_stat["error_code"] = (( error.get("http")).get("response")).get("code") error_stats.append(error_stat) else: error_response = True node_stats[ERROR_STATS] = error_stats url_long_request = "http://localhost:{}/swagger-stats/stats?fields=longestreq".format( self.port) response = requests.get(url_long_request) long_req_stats = list() if response.status_code == 200 and log_stats: # don't log data for first interval collectd.info( 'Plugin nodejsapi: Response code 200 received for longestreq' ) content = response.content # response_json = json.loads(content) response_json = ast.literal_eval(content) swagger_long_req_stats = response_json.get("longestreq") for long_req in swagger_long_req_stats: if (long_req.get("startts") <= endTime and long_req.get("startts") >= startTime): long_req_stat = {} long_req_stat["path"] = long_req.get("api").get("path") long_req_stat["method"] = long_req.get("method") long_req_stat["responsetime"] = long_req.get( "responsetime") long_req_stats.append(long_req_stat) else: error_response = True node_stats[LONG_REQUEST_STATS] = long_req_stats #send empty value if no data present for given time interval if not error_response and len(error_stats) == 0 and len( long_req_stats) == 0 and len( final_json_to_be_dispatched) == 0: long_req_stat = {} long_req_stat["path"] = None long_req_stat["method"] = None long_req_stat["responsetime"] = None long_req_stats.append(long_req_stat) node_stats[LONG_REQUEST_STATS] = long_req_stats except Exception as ex: collectd.error( 'Error collecting nodejsapi application stats : %s ' % ex.message) return node_stats
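# Minimal sketch of the delta logic poll() above applies to swagger-stats
# apistats: the counters returned by /swagger-stats/stats are cumulative per
# path and method, so each interval reports current minus previous.  Field
# names mirror the plugin; the helper itself is illustrative.
def api_deltas(current, previous, fields=('requests', 'errors', 'total_time')):
    deltas = {}
    for path, methods in current.items():
        for method, stats in methods.items():
            prev = previous.get(path, {}).get(method, {})
            deltas[(path, method)] = dict(
                (f, stats.get(f, 0) - prev.get(f, 0)) for f in fields)
    return deltas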
v.dispatch(values=[value if new_data else 0]) def read(): global gw_stat_file, last_time # Read the stats file try: with open(gw_stat_file, 'r') as f: gw_stats = load(f) except IOError, e: collectd.error('ttn_gw plugin: Cannot read gateway stats file %s' % e) collectd.error('ttn_gw plugin: (gateway not running?)') return except ValueError: collectd.error('ttn_gw plugin: Cannot parse gateway stats file') return new_data = False if last_time != gw_stats['time']: new_data = True last_time = gw_stats['time'] current = gw_stats['current'] keys = ( 'up_radio_packets_received', 'up_radio_packets_crc_good', 'up_radio_packets_crc_bad', 'up_radio_packets_crc_absent', 'up_radio_packets_dropped', 'up_radio_packets_forwarded',
def populate_disk_details(self, vol_name, brick_host, brick_path): try: device_to_partitions = {} brick_devices, brick_device_partitions, mount_point = \ self.get_brick_devices(brick_path) if not (brick_devices or brick_device_partitions): collectd.error( 'Failed to fetch device details for brick %s:%s' ' of volume %s' % ( brick_host, brick_path, vol_name ) ) return for device in brick_devices: partition_name_re = re.compile('%s[0-9]+' % device) device_partitions = [] for partition in brick_device_partitions: if partition_name_re.match(partition): device_partitions = device_to_partitions.get( device, [] ) device_partitions.append(partition) device_to_partitions[device] = device_partitions for brick_device, partitions in device_to_partitions.iteritems(): # Collect disk read and write octets # Push to cluster->volume->node->brick tree self.brick_details[ 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.' 'disk_octets.read' % ( self.CONFIG['integration_id'], vol_name, brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = self.get_interval_disk_io_stat( brick_device, partitions, 'read_bytes' ) self.brick_details[ 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.' 'disk_octets.write' % ( self.CONFIG['integration_id'], vol_name, brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = self.get_interval_disk_io_stat( brick_device, partitions, 'write_bytes' ) # Push to cluster->node->brick tree self.brick_details[ 'clusters.%s.nodes.%s.bricks.%s.device.%s.' 'disk_octets.read' % ( self.CONFIG['integration_id'], brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = self.get_interval_disk_io_stat( brick_device, partitions, 'read_bytes' ) self.brick_details[ 'clusters.%s.nodes.%s.bricks.%s.device.%s.' 'disk_octets.write' % ( self.CONFIG['integration_id'], brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = self.get_interval_disk_io_stat( brick_device, partitions, 'write_bytes' ) # Collect disk read and write io # Push cluster->volume->host->brick tree self.brick_details[ 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.' 'disk_ops.read' % ( self.CONFIG['integration_id'], vol_name, brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = self.get_interval_disk_io_stat( brick_device, partitions, 'read_count' ) self.brick_details[ 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.' 'disk_ops.write' % ( self.CONFIG['integration_id'], vol_name, brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = self.get_interval_disk_io_stat( brick_device, partitions, 'write_count' ) # Push to cluster->node->brick tree self.brick_details[ 'clusters.%s.nodes.%s.bricks.%s.device.%s.' 'disk_ops.read' % ( self.CONFIG['integration_id'], brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = self.get_interval_disk_io_stat( brick_device, partitions, 'read_count' ) self.brick_details[ 'clusters.%s.nodes.%s.bricks.%s.device.%s.' 
'disk_ops.write' % ( self.CONFIG['integration_id'], brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = self.get_interval_disk_io_stat( brick_device, partitions, 'write_count' ) # Collect disk read and write latency # Push to cluster->volume->node->brick tree self.brick_details[ 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.' 'disk_time.read' % ( self.CONFIG['integration_id'], vol_name, brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = self.get_interval_disk_io_stat( brick_device, partitions, 'read_time' ) self.brick_details[ 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.' 'disk_time.write' % ( self.CONFIG['integration_id'], vol_name, brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = self.get_interval_disk_io_stat( brick_device, partitions, 'write_time' ) # Push to cluster->node->brick tree self.brick_details[ 'clusters.%s.nodes.%s.bricks.%s.device.%s.' 'disk_time.read' % ( self.CONFIG['integration_id'], brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = self.get_interval_disk_io_stat( brick_device, partitions, 'read_time' ) self.brick_details[ 'clusters.%s.nodes.%s.bricks.%s.device.%s.' 'disk_time.write' % ( self.CONFIG['integration_id'], brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = self.get_interval_disk_io_stat( brick_device, partitions, 'write_time' ) # Collect disk utilization # Push to cluster->volume->node->brick tree disk_usage = self.get_disk_usage(brick_device) self.brick_details[ 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.' 'utilization.used' % ( self.CONFIG['integration_id'], vol_name, brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = disk_usage.used self.brick_details[ 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.' 'utilization.total' % ( self.CONFIG['integration_id'], vol_name, brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = disk_usage.total self.brick_details[ 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.' 'utilization.percent_used' % ( self.CONFIG['integration_id'], vol_name, brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = disk_usage.percent # Push to cluster->node->brick tree self.brick_details[ 'clusters.%s.nodes.%s.bricks.%s.device.%s.' 'utilization.used' % ( self.CONFIG['integration_id'], brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = disk_usage.used self.brick_details[ 'clusters.%s.nodes.%s.bricks.%s.device.%s.' 'utilization.total' % ( self.CONFIG['integration_id'], brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = disk_usage.total self.brick_details[ 'clusters.%s.nodes.%s.bricks.%s.device.%s.' 'utilization.percent_used' % ( self.CONFIG['integration_id'], brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = disk_usage.percent # Collect disk mount-point utilization if not mount_point: return disk_usage = self.get_disk_usage(mount_point) self.brick_details[ 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.' 
'mount_utilization.used' % ( self.CONFIG['integration_id'], vol_name, brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = disk_usage.used self.brick_details[ 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.' 'mount_utilization.total' % ( self.CONFIG['integration_id'], vol_name, brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = disk_usage.total self.brick_details[ 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.' 'mount_utilization.percent_used' % ( self.CONFIG['integration_id'], vol_name, brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = disk_usage.percent # Push to cluster->node->brick tree self.brick_details[ 'clusters.%s.nodes.%s.bricks.%s.device.%s.' 'mount_utilization.used' % ( self.CONFIG['integration_id'], brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = disk_usage.used self.brick_details[ 'clusters.%s.nodes.%s.bricks.%s.device.%s.' 'mount_utilization.total' % ( self.CONFIG['integration_id'], brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = disk_usage.total self.brick_details[ 'clusters.%s.nodes.%s.bricks.%s.device.%s.' 'mount_utilization.percent_used' % ( self.CONFIG['integration_id'], brick_host.replace('.', '_'), brick_path.replace('/', '|'), brick_device.replace('/dev/', '') ) ] = disk_usage.percent except (AttributeError, KeyError): collectd.error( 'Failed to populate_disk_details for volume %s, brick %s.' 'Error %s' % ( vol_name, brick_path, traceback.format_exc() ) )
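# Hedged sketch of the shape get_disk_usage() above is expected to return: an
# object exposing .used, .total and .percent for a mount point.
# psutil.disk_usage provides exactly that tuple; the real plugin may compute
# it differently (e.g. via os.statvfs).
import psutil

def disk_usage_sketch(mount_point):
    # Works on mount points; a raw brick device would first be resolved to
    # the mount point it is mounted on.
    return psutil.disk_usage(mount_point)

# usage = disk_usage_sketch('/')
# usage.used, usage.total, usage.percent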
def dns_request_conf(config=None): """Collectd Plugin Configuration Parsing""" global Queries, Nameserver_Cache collectd.debug("config: {}".format(str(config.key))) Queries = {} for request in config.children: collectd.debug("C: {} = {}".format(request.key, request.values[0])) request_name = request.values[0] Queries[request_name] = {'recordtype': 'A', 'timeout': DEFAULT_TIMEOUT} for c in [ x for x in request.children if x.key.lower() in ('query', 'server', 'timeout', 'sourceip', 'sourceport') ]: collectd.debug("Queries[{}][{}] values: {}".format( request_name, c.key.lower(), c.values[0])) Queries[request_name][c.key.lower()] = c.values[0] collectd.debug("QUERIES: {}".format(Queries.keys())) required_args = set(['query', 'server', 'timeout']) for q, query in Queries.items(): query = Queries[q] actual_args = set(sorted(query.keys())) if (not required_args.issubset(actual_args)): # if (('query', 'server', 'timeout') not in query.keys()): collectd.warning("Request '{}' is missing either a Query, " "Server or Timeout value ({}). Skipping.".format( q, query)) query['skip'] = True if (('server' in query.keys()) and query['server'] not in Nameserver_Cache.keys()): try: resolver = dns.resolver.Resolver() results = resolver.query(query['server'], 'A') collectd.debug("RESULTS {}: {}".format(query['server'], results)) if results: Nameserver_Cache[query['server']] = str(results[0]) except dns.resolver.NXDOMAIN as e: collectd.warning("Unable to determine the IP of the server " "'{}', supplied in request '{}'".format( query['server'], q)) query['skip'] = True # Validate the SourceIP to see that it makes sense # (it's an IP and we can bind to it) if ('sourceip' in query.keys()): try: ip_addr = ipaddress.ip_address(unicode(query['sourceip'])) if (type(ip_addr) is ipaddress.IPv6Address): test_sock = socket.socket(socket.AF_INET6) test_sock.bind(('', 0)) elif (type(ip_addr) is ipaddress.IPv4Address): test_sock = socket.socket(socket.AF_INET) test_sock.bind(('', 0)) else: raise ValueError("'{}' isn't an IPv4 or IPv6 address" "!?".format(query['sourceip'])) query['skip'] = True except ValueError as v: collectd.error("Source IP in '{}' ({}) doesn't look valid!" " {}".format(q, query['sourceip'], v)) query['skip'] = True # Validate the SourcePort to see that it makes sense: # It's in a good range and we can bind to it. # Binding's a little redundant, but it'll complain if we don't # have permission, or if the port's already in use, whereas checking the # range will only tell us if the port is a reasonable number if ('sourceport' in query.keys()): source_port = int(query['sourceport']) if (source_port < 0 or source_port > 65535): query['skip'] = True collectd.warning( "Invalid source port '{:d}' provided. Skipping " "the DNS query for '{}' [{}]".format( source_port, query['query'], query['recordtype'])) else: test_sock = socket.socket(socket.AF_INET) test_sock.bind(('', source_port)) test_sock.close() collectd.debug("QUERIES: {}".format(Queries))
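# Sketch of how a validated request from the configuration above could be
# executed at read time with dnspython: pin the resolver to the configured
# server, honour the configured timeout, and time the lookup.  The helper
# name is illustrative and error handling is left to the caller.
import time
import dns.resolver

def timed_query(server_ip, name, rdtype='A', timeout=5.0):
    resolver = dns.resolver.Resolver(configure=False)
    resolver.nameservers = [server_ip]
    resolver.timeout = timeout      # per-try timeout
    resolver.lifetime = timeout     # overall deadline for the query
    start = time.time()
    resolver.query(name, rdtype)    # raises on NXDOMAIN/timeout
    return time.time() - start      # latency in seconds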
try: sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) sock.connect(admin_socket) except socket.error, e: collectd.error('ERROR: ceph plugin: Connecting to %s: - %r' % (admin_socket, e)) return None sock.sendall(cmd) try: length = struct.unpack('>i', sock.recv(4))[0] json_data = json.loads(sock.recv(length)) except Exception as err: collectd.error('ERROR: ceph plugin: Unable to parse json: %r' % (err, )) json_data = {} finally: sock.close() return json_data def configure_callback(conf): """ Collectd configuration callback """ global CEPH_ADMIN_SOCKET for node in conf.children: if node.key == 'AdminSocket': CEPH_ADMIN_SOCKET = node.values[0] else: collectd.warning('WARNING: ceph plugin: Unknown config key: %s.' %
collectd.error('redis_info plugin: Error connecting to %s:%d - %r' % (conf['host'], conf['port'], e)) return None fp = s.makefile('r') if conf['auth'] is not None: log_verbose('Sending auth command') s.sendall('auth %s\r\n' % (conf['auth'])) status_line = fp.readline() if not status_line.startswith('+OK'): # -ERR invalid password # -ERR Client sent AUTH, but no password is set collectd.error( 'redis_info plugin: Error sending auth to %s:%d - %r' % (conf['host'], conf['port'], status_line)) return None log_verbose('Sending info command') s.sendall('info\r\n') status_line = fp.readline() if status_line.startswith('-'): collectd.error('redis_info plugin: Error response from %s:%d - %r' % (conf['host'], conf['port'], status_line)) s.close() return None content_length = int(
def get_metrics(self): try: ret_val = {} volumes = self.CLUSTER_TOPOLOGY.get('volumes', []) # Push brick level connections count volumes_list = [] for volume in volumes: brick_found_for_curr_node = False for sub_volume_index, sub_volume_bricks in volume.get( 'bricks', {} ).iteritems(): for brick in sub_volume_bricks: brick_hostname = \ tendrl_glusterfs_utils.find_brick_host( self.etcd_client, self.CONFIG['integration_id'], brick.get('hostname') ) if brick_hostname: brick_ip = socket.gethostbyname(brick_hostname) if ( brick_ip == socket.gethostbyname( self.CONFIG['peer_name'] ) or brick_hostname == self.CONFIG['peer_name'] ): brick_found_for_curr_node = True # Push brick client connections ret_val[ 'clusters.%s.volumes.%s.nodes.%s.' 'bricks.%s.' 'connections_count' % ( self.CONFIG['integration_id'], volume.get('name', ''), self.CONFIG['peer_name'].replace( '.', '_'), brick['path'].replace( '/', self.brick_path_separator ) ) ] = brick['connections_count'] if brick_found_for_curr_node: # Update rebalance info only for this volumes volumes_list.append(volume.get('name', '')) # push rebalance info rebalance_info = self._get_rebalance_info() for vol_name in rebalance_info: if vol_name in volumes_list: # Push volume wise snap counts ret_val[ 'clusters.%s.volumes.%s.snap_count' % ( self.CONFIG['integration_id'], vol_name ) ] = rebalance_info[vol_name]['snap_count'] # Push rebalance bytes progress ret_val[ 'clusters.%s.volumes.%s.nodes.%s.rebalance_bytes' % ( self.CONFIG['integration_id'], vol_name, self.CONFIG['peer_name'].replace('.', '_') ) ] = rebalance_info[vol_name]['rebalance_data'] # Push rebalance files progress ret_val[ 'clusters.%s.volumes.%s.nodes.%s.rebalance_files' % ( self.CONFIG['integration_id'], vol_name, self.CONFIG['peer_name'].replace('.', '_') ) ] = rebalance_info[vol_name]['rebalance_files'] # Push rebalance failures ret_val[ 'clusters.%s.volumes.%s.nodes.%s.' 'rebalance_failures' % ( self.CONFIG['integration_id'], vol_name, self.CONFIG['peer_name'].replace('.', '_') ) ] = rebalance_info[vol_name]['rebalance_failures'] # Push rebalance skipped ret_val[ 'clusters.%s.volumes.%s.nodes.%s.rebalance_skipped' % ( self.CONFIG['integration_id'], vol_name, self.CONFIG['peer_name'].replace('.', '_') ) ] = rebalance_info[vol_name]['rebalance_skipped'] return ret_val except (AttributeError, KeyError, ValueError): collectd.error( 'Failed to fetch counters. Error %s\n\n' % ( traceback.format_exc() ) ) return {}
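# Sketch of the graphite-style key convention used in get_metrics() above:
# hostnames have their dots replaced so they occupy a single path component,
# and brick paths have '/' replaced with the configured separator.  Helper
# name and the default separator are illustrative.
def brick_metric_key(integration_id, volume, peer, brick_path, metric,
                     separator='|'):
    return 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.%s' % (
        integration_id,
        volume,
        peer.replace('.', '_'),
        brick_path.replace('/', separator),
        metric,
    )

# brick_metric_key('c1', 'vol0', '10.0.0.1', '/bricks/b1', 'connections_count')
# -> 'clusters.c1.volumes.vol0.nodes.10_0_0_1.bricks.|bricks|b1.connections_count'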
def init_func(): # ntp query is for controllers only if tsc.nodetype != 'controller': return 0 # do nothing till config is complete. if obj.config_complete() is False: return 0 # get current hostname obj.hostname = obj.gethostname() if not obj.hostname: collectd.error("%s failed to get hostname" % PLUGIN) return 1 obj.base_eid = 'host=' + obj.hostname + '.ntp' collectd.debug("%s on %s with entity id '%s'" % (PLUGIN, obj.hostname, obj.base_eid)) # get a list of provisioned ntp servers _get_ntp_servers() # manage existing alarms. try: alarms = api.get_faults_by_id(PLUGIN_ALARMID) except Exception as ex: collectd.error("%s 'get_faults_by_id' exception ; %s ; %s" % (PLUGIN, PLUGIN_ALARMID, ex)) return 0 if alarms: for alarm in alarms: eid = alarm.entity_instance_id # ignore alarms not for this host if obj.hostname not in eid: continue # maintain only the base alarm. if alarm.entity_instance_id != obj.base_eid: # clear any ntp server specific alarms over process restart # this is done to avoid the potential for stuck ntp ip alarms collectd.info("%s clearing found startup alarm '%s'" % (PLUGIN, alarm.entity_instance_id)) try: api.clear_fault(PLUGIN_ALARMID, alarm.entity_instance_id) except Exception as ex: collectd.error( "%s 'clear_fault' exception ; %s:%s ; %s" % (PLUGIN, PLUGIN_ALARMID, alarm.entity_instance_id, ex)) return 0 else: obj.alarm_raised = True collectd.info( "%s found alarm %s:%s" % (PLUGIN, PLUGIN_ALARMID, alarm.entity_instance_id)) # ensure the base alarm is cleared if there are no # provisioned servers. if not obj.server_list_conf: _clear_base_alarm() else: collectd.info("%s no major startup alarms found" % PLUGIN) obj.init_completed() return 0
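# Condensed sketch of the startup alarm-cleanup pattern in init_func() above:
# any alarm this plugin raised for this host that is not the base alarm is
# treated as stale after a process restart and cleared.  The api method names
# come from the code above; exception handling is omitted here for brevity.
def clear_stale_alarms(api, alarm_id, hostname, base_eid):
    found_base = False
    for alarm in api.get_faults_by_id(alarm_id) or []:
        eid = alarm.entity_instance_id
        if hostname not in eid:
            continue                        # alarm belongs to another host
        if eid == base_eid:
            found_base = True               # keep the base alarm as-is
        else:
            api.clear_fault(alarm_id, eid)  # clear stale per-server alarms
    return found_base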