def _add_conntrack_stats_metrics(self, conntrack_path, use_sudo_conntrack, tags):
    """
    Parse the output of conntrack -S
    Add the parsed metrics
    """
    try:
        cmd = [conntrack_path, "-S"]
        if use_sudo_conntrack:
            cmd.insert(0, "sudo")
        output, _, _ = get_subprocess_output(cmd, self.log)
        # conntrack -S sample:
        # cpu=0 found=27644 invalid=19060 ignore=485633411 insert=0 insert_failed=1 \
        #   drop=1 early_drop=0 error=0 search_restart=39936711
        # cpu=1 found=21960 invalid=17288 ignore=475938848 insert=0 insert_failed=1 \
        #   drop=1 early_drop=0 error=0 search_restart=36983181
        lines = output.splitlines()
        for line in lines:
            cols = line.split()
            cpu_num = cols[0].split('=')[-1]
            cpu_tag = ['cpu:{}'.format(cpu_num)]
            cols = cols[1:]
            for cell in cols:
                metric, value = cell.split('=')
                self.monotonic_count(
                    'system.net.conntrack.{}'.format(metric), int(value), tags=tags + cpu_tag
                )
    except SubprocessOutputEmptyError:
        self.log.debug("Couldn't use %s to get conntrack stats", conntrack_path)

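# A minimal, standalone sketch of the parsing step above, useful for testing the
# `cpu=N key=value ...` format without an agent. The function name and sample
# input are illustrative, not part of the check.
def parse_conntrack_stats(output):
    """Map each CPU number to a dict of counter name -> int value."""
    stats = {}
    for line in output.splitlines():
        cols = line.split()
        cpu_num = cols[0].split('=')[-1]
        stats[cpu_num] = {k: int(v) for k, v in (cell.split('=') for cell in cols[1:])}
    return stats

# Example:
# parse_conntrack_stats("cpu=0 found=27644 invalid=19060")
# -> {'0': {'found': 27644, 'invalid': 19060}}
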
def check(self, instance):
    stat_out, err, _ = get_subprocess_output(self.nfs_cmd, self.log)
    all_devices = []
    this_device = []
    custom_tags = instance.get("tags", [])
    stats = stat_out.splitlines()
    if 'No NFS mount point' in stats[0]:
        if not self.autofs_enabled:
            self.warning("No NFS mount points were found.")
        else:
            self.log.debug("AutoFS enabled: no mount points currently.")
        return
    for l in stats:
        if not l:
            continue
        elif l.find('mounted on') >= 0 and len(this_device) > 0:
            # if it's a new device, create the device and add it to the array
            device = Device(this_device, self.log)
            all_devices.append(device)
            this_device = []
        this_device.append(l.strip().split())

    # Add the last device into the array
    device = Device(this_device, self.log)
    all_devices.append(device)

    # Disregard the first half of device stats (report 1 of 2)
    # as that is the moving average
    all_devices = all_devices[len(all_devices) // 2:]

    for device in all_devices:
        device.send_metrics(self.gauge, custom_tags)

def _collect_raw(self, ceph_cmd, ceph_cluster, instance):
    use_sudo = _is_affirmative(instance.get('use_sudo', False))
    if use_sudo:
        test_sudo = os.system('setsid sudo -l < /dev/null')
        if test_sudo != 0:
            raise Exception('The dd-agent user does not have sudo access')
        ceph_args = 'sudo {}'.format(ceph_cmd)
    else:
        ceph_args = ceph_cmd

    ceph_args = '{} --cluster {}'.format(ceph_args, ceph_cluster)

    raw = {}
    for cmd in ('mon_status', 'status', 'df detail', 'osd pool stats', 'osd perf', 'health detail'):
        try:
            args = '{} {} -fjson'.format(ceph_args, cmd)
            output, _, _ = get_subprocess_output(args.split(), self.log)
            res = json.loads(output)
        except Exception as e:
            self.log.warning('Unable to parse data from cmd=%s: %s', cmd, e)
            continue

        name = cmd.replace(' ', '_')
        raw[name] = res

    return raw

def _get_varnish_adm(self, version):
    cmd = []
    if geteuid() != 0:
        cmd.append('sudo')

    if version < LooseVersion('4.1.0'):
        cmd.extend(self.varnishadm_path + ['-S', self.secretfile_path, 'debug.health'])
    else:
        cmd.extend(
            self.varnishadm_path
            + [
                '-T',
                '{}:{}'.format(self.daemon_host, self.daemon_port),
                '-S',
                self.secretfile_path,
                'backend.list',
                '-p',
            ]
        )

    err, output = None, None
    try:
        output, err, _ = get_subprocess_output(cmd, self.log, raise_on_empty_output=False)
    except OSError as e:
        self.log.error("There was an error running varnishadm. Make sure 'sudo' is available. %s", e)
        output = None

    if err or not output:
        self.log.error('Error getting service check from varnishadm: %s', err)

    return output

def check(self, instance):
    if instance is None:
        instance = {}

    cmd = "ss --numeric --listening --tcp"
    output, _, _ = get_subprocess_output(["sh", "-c", cmd], self.log, raise_on_empty_output=True)

    # Run "ss --numeric --listening --tcp" on the host.
    # Expected output:
    # State   Recv-Q   Send-Q   Local Address:Port   Peer Address:Port
    # LISTEN  0        128      127.0.0.1:6062       0.0.0.0:*
    # LISTEN  0        128      0.0.0.0:111          0.0.0.0:*
    # LISTEN  0        128      0.0.0.0:22           0.0.0.0:*
    # LISTEN  0        100      127.0.0.1:25         0.0.0.0:*
    # LISTEN  0        128      127.0.0.1:8126       0.0.0.0:*
    # LISTEN  0        128      127.0.0.1:5000       0.0.0.0:*
    # LISTEN  0        128      127.0.0.1:5001       0.0.0.0:*
    # LISTEN  0        80       *:3306               *:*
    # LISTEN  0        128      [::]:111             [::]:*
    # LISTEN  0        128      [::]:22              [::]:*
    lines = output.splitlines()

    # Parse the output into Datadog metrics, skipping the header row
    for l in lines[1:]:
        cols = l.split()
        ip, port = cols[3].rsplit(':', 1)
        self.gauge("ss.listening.recvq", int(cols[1]), tags=["port:" + str(port), "type:tcp"])
        self.gauge("ss.listening.sendq", int(cols[2]), tags=["port:" + str(port), "type:tcp"])

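# A standalone sketch of the parsing above, handy for unit-testing against canned
# `ss` output. The helper name is illustrative; it extracts (port, recv_q, send_q)
# tuples from the listening-socket table.
def parse_ss_listening(output):
    results = []
    for line in output.splitlines()[1:]:  # skip the header row
        cols = line.split()
        _, port = cols[3].rsplit(':', 1)  # rsplit handles IPv6 addresses like [::]:22
        results.append((port, int(cols[1]), int(cols[2])))
    return results

# Example:
# parse_ss_listening("State Recv-Q Send-Q Local Address:Port Peer Address:Port\n"
#                    "LISTEN 0 128 127.0.0.1:6062 0.0.0.0:*")
# -> [('6062', 0, 128)]
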
def call_unbound_control(self, command, tags):
    try:
        # Pass raise_on_empty_output as False so we get a chance to log stderr
        ub_out, ub_err, returncode = get_subprocess_output(command, self.log, raise_on_empty_output=False)
    except Exception as e:
        self.service_check(
            self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, message="exception collecting stats", tags=tags
        )
        raise Exception("Unable to get unbound stats: {}".format(str(e)))

    for line in ub_err.splitlines():
        self.log.debug('stderr from %s: %s', command, line)

    # Check the return value
    if returncode != 0:
        self.service_check(
            self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, message="non-zero return code collecting stats", tags=tags
        )
        raise Exception('"{}" failed, return code: {}'.format(command, returncode))

    # And because we pass raise_on_empty_output as False, check that too
    if not ub_out:
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, message="no stats", tags=tags)
        raise Exception('no output from "{}"'.format(command))

    return ub_out

def which(program, use_sudo, log):
    def is_exe(fpath):
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    if use_sudo:
        # Pass raise_on_empty_output as False to leave the not-found case to the caller.
        stdout, stderr, returncode = get_subprocess_output(['sudo', 'which', program], log, raise_on_empty_output=False)
        if returncode == 0:
            return stdout
        for line in stderr.splitlines():
            log.debug('stderr from sudo which %s: %s', program, line)
        return None

    fpath, fname = os.path.split(program)
    if fpath:
        if is_exe(program):
            return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file

    return None

def _exec_ping(self, timeout, target_host):
    if platform.system() == "Windows":  # pragma: nocover
        countOption = "-n"
        timeoutOption = "-w"
        # The timeout option is in ms on Windows
        # https://docs.microsoft.com/en-us/windows-server/administration/windows-commands/ping
        timeout = timeout * 1000
    elif platform.system() == "Darwin":
        countOption = "-c"
        timeoutOption = "-W"  # Also in ms on Mac
        timeout = timeout * 1000
    else:
        # The timeout option is in seconds on Linux, leaving timeout as is
        # https://linux.die.net/man/8/ping
        countOption = "-c"
        timeoutOption = "-W"

    self.log.debug("Running: ping %s %s %s %s %s", countOption, "1", timeoutOption, timeout, target_host)

    lines, err, retcode = get_subprocess_output(
        ["ping", countOption, "1", timeoutOption, str(timeout), target_host],
        self.log,
        raise_on_empty_output=True,
    )
    self.log.debug("ping returned %s - %s - %s", retcode, lines, err)
    if retcode != 0:
        raise CheckException("ping returned {}: {}".format(retcode, err))

    return lines

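# The platform branching above can be factored into a small pure helper that is
# easier to unit-test than the full check. A sketch (the helper name and return
# shape are illustrative):
import platform

def ping_args(timeout, target_host, system=None):
    """Build a one-packet ping command line for the current (or given) OS."""
    system = system or platform.system()
    if system == "Windows":
        count_opt, timeout_opt, timeout = "-n", "-w", timeout * 1000  # ms on Windows
    elif system == "Darwin":
        count_opt, timeout_opt, timeout = "-c", "-W", timeout * 1000  # ms on macOS
    else:
        count_opt, timeout_opt = "-c", "-W"  # seconds on Linux
    return ["ping", count_opt, "1", timeout_opt, str(timeout), target_host]

# ping_args(1, "example.com", system="Linux") -> ['ping', '-c', '1', '-W', '1', 'example.com']
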
def check(self, instance):
    files, err, retcode = get_subprocess_output(["exim", "-bpc"], self.log, raise_on_empty_output=True)
    queue_count = int(files.strip())
    self.gauge('exim.queued.messages.count', queue_count, tags=['TAG_KEY:TAG_VALUE'])

def _get_varnish_stats(self, varnishstat_format):
    cmd = self.varnishstat_path + [self.VARNISHSTAT_FORMAT_OPTION[varnishstat_format]]
    for metric in self.metrics_filter:
        cmd.extend(["-f", metric])
    if self.name is not None:
        cmd.extend(['-n', self.name])
    output, _, _ = get_subprocess_output(cmd, self.log)
    return output

def pingable(self, host: str):
    """
    Returns True if host (str) responds to a ping request.
    """
    param = '-n' if platform.system().lower() == 'windows' else '-c'
    command = ['ping', param, '1', host]
    out, err, retcode = get_subprocess_output(command, self.log, raise_on_empty_output=False)
    return retcode == 0

def check(self, instance):
    file_info, err, retcode = get_subprocess_output(
        ["ls", "-al", "/var/log/nginx/error.log"], self.log, raise_on_empty_output=True
    )
    # Split on any whitespace: `ls -al` pads columns with multiple spaces,
    # so split(" ") would yield empty fields and pick the wrong column.
    file_size = int(file_info.split()[4])
    self.gauge("kurian.nginx.error_log.size", file_size, tags=['component:nginx'])

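# Parsing `ls` output is fragile (column layout varies by platform and locale).
# If the agent can read the file directly, os.stat gives the size without a
# subprocess. A sketch of the same metric collected that way (the helper name
# is illustrative):
import os

def error_log_size(path="/var/log/nginx/error.log"):
    return os.stat(path).st_size  # size in bytes

# self.gauge("kurian.nginx.error_log.size", error_log_size(), tags=['component:nginx'])
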
def _get_queue_count(self, directory, queues, tags):
    for queue in queues:
        queue_path = os.path.join(directory, queue)
        if not os.path.exists(queue_path):
            raise Exception('{} does not exist'.format(queue_path))

        count = 0
        if os.geteuid() == 0:
            # dd-agent is running as root (not recommended)
            count = sum(len(files) for root, dirs, files in os.walk(queue_path))
        else:
            # can the dd-agent user run sudo?
            test_sudo = ['sudo', '-l']
            _, _, exit_code = get_subprocess_output(test_sudo, self.log, False)
            if exit_code == 0:
                # default to `root` for backward compatibility
                postfix_user = self.init_config.get('postfix_user', 'root')
                cmd = ['sudo', '-u', postfix_user, 'find', queue_path, '-type', 'f']
                output, _, _ = get_subprocess_output(cmd, self.log, False)
                count = len(output.splitlines())
            else:
                raise Exception('The dd-agent user does not have sudo access')

        # emit an individually tagged metric
        self.gauge(
            'postfix.queue.size',
            count,
            tags=tags + ['queue:{}'.format(queue), 'instance:{}'.format(os.path.basename(directory))],
        )

def _collect_metadata(self):
    try:
        pc_output, _, _ = get_subprocess_output(['postconf', 'mail_version'], self.log, False)
    except Exception as e:
        self.log.warning('unable to call `postconf mail_version`: %s', e)
        return

    self.log.debug('postconf mail_version output: %s', pc_output)
    if pc_output:
        # Output looks like `mail_version = 3.4.13`; keep the value after `=`
        postfix_version = pc_output.strip('\n').split('=')[1].strip()
        self.log.debug('Postfix Version: %s', postfix_version)

        if postfix_version:
            self.set_metadata('version', postfix_version)

def _get_version_info(self):
    # Get the varnish version from varnishstat
    output, error, _ = get_subprocess_output(self.varnishstat_path + ["-V"], self.log, raise_on_empty_output=False)

    # Assumptions regarding varnish's version
    varnishstat_format = "json"
    raw_version = None

    m1 = self.version_pattern.search(output, re.MULTILINE)
    # v2 prints the version on stderr, v3 on stdout
    m2 = self.version_pattern.search(error, re.MULTILINE)

    if m1 is None and m2 is None:
        self.log.warning("Cannot determine the version of varnishstat, assuming 3 or greater")
        self.warning("Cannot determine the version of varnishstat, assuming 3 or greater")
    else:
        if m1 is not None:
            raw_version = m1.group()
        elif m2 is not None:
            raw_version = m2.group()

    self.log.debug("Varnish version: %s", raw_version)

    if raw_version:
        self.set_metadata('version', raw_version)

    if raw_version is None:
        raw_version = '3.0.0'

    version = LooseVersion(raw_version)

    # The varnishstat output format depends on the version
    if version < LooseVersion('3.0.0'):
        varnishstat_format = "text"
    elif version < LooseVersion('5.0.0'):  # we default to json starting version 5.0.0
        varnishstat_format = "xml"

    return version, varnishstat_format

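# The version-to-format mapping above, isolated as a small pure helper for
# clarity. A sketch; the thresholds simply mirror the branches in
# _get_version_info:
from distutils.version import LooseVersion

def varnishstat_format_for(version):
    if version < LooseVersion('3.0.0'):
        return "text"
    if version < LooseVersion('5.0.0'):
        return "xml"
    return "json"

# varnishstat_format_for(LooseVersion('4.1.0')) -> 'xml'
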
def _get_devices_label_from_blkid(self):
    devices_label = {}
    try:
        blkid_out, _, _ = get_subprocess_output(['blkid'], self.log)
        all_devices = [l.split(':', 1) for l in blkid_out.splitlines()]

        for d in all_devices:
            # Line sample
            # /dev/sda1: LABEL="MYLABEL" UUID="5eea373d-db36-4ce2-8c71-12ce544e8559" TYPE="ext4"
            labels = self._blkid_label_re.findall(d[1])
            if labels:
                devices_label[d[0]] = 'label:{}'.format(labels[0])
    except SubprocessOutputEmptyError:
        self.log.debug("Couldn't use blkid to get device labels")

    return devices_label

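# `self._blkid_label_re` is defined elsewhere in the check; a pattern like the one
# below (an assumption, shown for illustration only) reproduces the extraction on
# the sample line in the comment above:
import re

_blkid_label_re = re.compile(r'LABEL="(.*?)"')

# _blkid_label_re.findall(' LABEL="MYLABEL" UUID="..." TYPE="ext4"') -> ['MYLABEL']
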
def _get_version(self):
    """Get version from `gunicorn --version`"""
    cmd = '{} --version'.format(self.gunicorn_cmd)
    try:
        pc_out, pc_err, _ = get_subprocess_output(cmd, self.log, False)
    except OSError:
        self.log.warning("Error collecting gunicorn version.")
        return None

    match = re.match(self.VERSION_PATTERN, pc_out)
    if not match:
        match = re.match(self.VERSION_PATTERN, pc_err)

    if match:
        return match.groups()[0]
    else:
        self.log.warning("Version not found in stdout `%s` and stderr `%s`", pc_out, pc_err)

    return None

def _get_version_from_command_line(self):
    version_command = '{} --version'.format(self._fluentd_command)

    try:
        out, _, _ = get_subprocess_output(version_command, self.log, raise_on_empty_output=False)
    except OSError as exc:
        self.log.debug("Error collecting fluentd version: %s", exc)
        return None

    match = re.match(self.VERSION_PATTERN, out)
    if match is None:
        self.log.debug("fluentd version not found in stdout: `%s`", out)
        return None

    return match.group('version')

def _get_sendmail_stats(self, sendmail_command, use_sudo):
    if not os.path.exists(sendmail_command):
        raise Exception('{} does not exist'.format(sendmail_command))

    self.log.debug(sendmail_command)

    # mailq sample output. sendmail output is similar.
    #
    # MSP Queue status...
    # /var/spool/mqueue-client is empty
    #         Total requests: 0
    # MTA Queue status...
    # /var/spool/mqueue is empty
    #         Total requests: 0

    # if we want to use sendmail, we need to append -bp to it
    # https://www.electrictoolbox.com/show-sendmail-mail-queue/
    if "sendmail" in sendmail_command:
        command = [sendmail_command, '-bp']
    else:
        command = [sendmail_command]

    # Listing the directory might require sudo privileges
    if use_sudo:
        try:
            os.system('setsid sudo -l < /dev/null')
            command.insert(0, 'sudo')
        except OSError as e:
            self.log.exception("trying to retrieve %s with sudo failed with return code %s", command, e)

    self.log.debug(command)

    mail_queue, err, retcode = get_subprocess_output(command, self.log, False)
    self.log.debug("Error: %s", err)

    count = mail_queue.splitlines()
    # The last line looks like "Total requests: N"; take the last whitespace-separated
    # field so multi-digit counts parse correctly (count[-1][-1] would only grab the
    # final character).
    queue_count = int(count[-1].split()[-1])
    self.log.info("Number of mails in the queue: %s", queue_count)

    return queue_count

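# A standalone sketch of the queue-count extraction, matching the sample output in
# the comment above (the function name is illustrative):
def parse_mailq_total(output):
    """Return the trailing `Total requests: N` count from mailq output."""
    last_line = output.splitlines()[-1]
    return int(last_line.split()[-1])

# parse_mailq_total("MTA Queue status...\n/var/spool/mqueue is empty\nTotal requests: 12") -> 12
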
def collect_metrics_manually(self):
    df_out, _, _ = get_subprocess_output(self.DF_COMMAND + ['-k'], self.log)
    self.log.debug(df_out)

    for device in self._list_devices(df_out):
        self.log.debug("Passed: {}".format(device))
        device_name = device[-1] if self._use_mount else device[0]

        tags = [device[1], 'filesystem:{}'.format(device[1])] if self._tag_by_filesystem else []
        tags.extend(self._custom_tags)

        # apply device/mountpoint specific tags
        for regex, device_tags in self._device_tag_re:
            if regex.match(device_name):
                tags += device_tags

        tags.append('device:{}'.format(device_name))
        for metric_name, value in iteritems(self._collect_metrics_manually(device)):
            self.gauge(metric_name, value, tags=tags)

def get_process_states(self):
    state_counts = defaultdict(int)
    prio_counts = defaultdict(int)
    ps = get_subprocess_output(['ps', '--no-header', '-eo', 'stat'], self.log)
    for line in ps[0].splitlines():
        # Each process state is a flag in a list of characters. See ps(1) for details.
        for state in line.strip():
            if state in PROCESS_STATES:
                state_counts[PROCESS_STATES[state]] += 1
            elif state in PROCESS_PRIOS:
                prio_counts[PROCESS_PRIOS[state]] += 1

    for state in state_counts:
        state_tags = list(self.tags)
        state_tags.append("state:" + state)
        self.gauge('system.processes.states', float(state_counts[state]), state_tags)

    for prio in prio_counts:
        prio_tags = list(self.tags)
        prio_tags.append("priority:" + prio)
        self.gauge('system.processes.priorities', float(prio_counts[prio]), prio_tags)

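# A sketch of the flag-counting logic with a tiny subset of ps(1) state codes.
# The real PROCESS_STATES/PROCESS_PRIOS maps are defined elsewhere in the check;
# the sample map here is illustrative:
from collections import defaultdict

PROCESS_STATES_SAMPLE = {'R': 'runnable', 'S': 'sleeping', 'Z': 'zombie'}

def count_states(ps_output, states=PROCESS_STATES_SAMPLE):
    counts = defaultdict(int)
    for line in ps_output.splitlines():
        for flag in line.strip():
            if flag in states:
                counts[states[flag]] += 1
    return dict(counts)

# count_states("Ss\nR+\nZ") -> {'sleeping': 1, 'runnable': 1, 'zombie': 1}
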
def _get_devices_label_from_lsblk(self):
    """
    Get device labels using the `lsblk` command.
    Returns a map of device name to label:value
    """
    devices_labels = dict()
    try:
        # Use raw output mode (space-separated fields encoded in UTF-8).
        # We want to be compatible with lsblk version 2.19 since
        # it is the last version supported by CentOS 6 and SUSE 11.
        lsblk_out, _, _ = get_subprocess_output(["lsblk", "--noheadings", "--raw", "--output=NAME,LABEL"], self.log)

        for line in lsblk_out.splitlines():
            device, _, label = line.partition(' ')
            if label:
                # Line sample (device "/dev/sda1" with label " MY LABEL")
                # sda1 MY LABEL
                devices_labels["/dev/" + device] = ['label:{}'.format(label), 'device_label:{}'.format(label)]

    except SubprocessOutputEmptyError:
        self.log.debug("Couldn't use lsblk to get device labels")

    return devices_labels

def _collect_raw(self, ceph_cmd, ceph_cluster, instance):
    use_sudo = _is_affirmative(instance.get('use_sudo', False))
    if use_sudo:
        test_sudo = os.system('setsid sudo -l < /dev/null')
        if test_sudo != 0:
            raise CheckException('The dd-agent user does not have sudo access')
        ceph_args = 'sudo {}'.format(ceph_cmd)
    else:
        ceph_args = ceph_cmd

    ceph_args = '{} --cluster {}'.format(ceph_args, ceph_cluster)

    raw = {}
    for cmd in ('mon_status', 'status', 'df detail', 'osd pool stats', 'osd perf', 'health detail'):
        try:
            args = '{} {} -fjson'.format(ceph_args, cmd)
            output, _, _ = get_subprocess_output(args.split(), self.log)
            res = json.loads(output)
        except Exception as e:
            self.log.warning('Unable to parse data from cmd=%s: %s', cmd, e)
            continue

        name = cmd.replace(' ', '_')
        raw[name] = res

    mon_map = raw.get('status', {}).get('monmap')
    if mon_map is None:
        raise RuntimeError("Could not detect Ceph release series")

    if 'min_mon_release_name' in mon_map and mon_map['min_mon_release_name'] == 'octopus':
        self.log.debug("Detected octopus version of ceph...")
        self._octopus = True
    else:
        self._octopus = False

    return raw

def _check_linux(self, instance):
    """
    _check_linux can be run inside a container and still collect the network metrics from the host.
    For that, procfs_path can be set to something like "/host/proc".
    When a custom procfs_path is set, the collect_connection_state option is ignored.
    """
    proc_location = self.agentConfig.get('procfs_path', '/proc').rstrip('/')
    custom_tags = instance.get('tags', [])

    net_proc_base_location = self._get_net_proc_base_location(proc_location)

    if self._is_collect_cx_state_runnable(net_proc_base_location):
        try:
            self.log.debug("Using `ss` to collect connection state")
            # Try using `ss` for increased performance over `netstat`
            metrics = self._get_metrics()
            for ip_version in ['4', '6']:
                # Call `ss` for each IP version because there's no built-in way of distinguishing
                # between the IP versions in the output.
                # Also call `ss` for each protocol, because on some systems (e.g. Ubuntu 14.04),
                # there is a bug that prints `tcp` even if it's `udp`.
                # The `-H` flag isn't available on old versions of `ss`.
                cmd = "ss --numeric --tcp --all --ipv{} | cut -d ' ' -f 1 | sort | uniq -c".format(ip_version)
                output, _, _ = get_subprocess_output(["sh", "-c", cmd], self.log)

                # 7624 CLOSE-WAIT
                #   72 ESTAB
                #    9 LISTEN
                #    1 State
                #   37 TIME-WAIT
                lines = output.splitlines()

                self._parse_short_state_lines(lines, metrics, self.tcp_states['ss'], ip_version=ip_version)

                cmd = "ss --numeric --udp --all --ipv{} | wc -l".format(ip_version)
                output, _, _ = get_subprocess_output(["sh", "-c", cmd], self.log)
                metric = self.cx_state_gauge[('udp{}'.format(ip_version), 'connections')]
                metrics[metric] = int(output) - 1  # Remove header

            for metric, value in iteritems(metrics):
                self.gauge(metric, value, tags=custom_tags)
        except OSError:
            self.log.info("`ss` not found: using `netstat` as a fallback")
            output, _, _ = get_subprocess_output(["netstat", "-n", "-u", "-t", "-a"], self.log)
            lines = output.splitlines()
            # Active Internet connections (w/o servers)
            # Proto Recv-Q Send-Q Local Address           Foreign Address         State
            # tcp        0      0 46.105.75.4:80          79.220.227.193:2032     SYN_RECV
            # tcp        0      0 46.105.75.4:143         90.56.111.177:56867     ESTABLISHED
            # tcp        0      0 46.105.75.4:50468       107.20.207.175:443      TIME_WAIT
            # tcp6       0      0 46.105.75.4:80          93.15.237.188:58038     FIN_WAIT2
            # tcp6       0      0 46.105.75.4:80          79.220.227.193:2029     ESTABLISHED
            # udp        0      0 0.0.0.0:123             0.0.0.0:*
            # udp6       0      0 :::41458                :::*
            metrics = self._parse_linux_cx_state(lines[2:], self.tcp_states['netstat'], 5)
            for metric, value in iteritems(metrics):
                self.gauge(metric, value, tags=custom_tags)
        except SubprocessOutputEmptyError:
            self.log.exception("Error collecting connection stats.")

    proc_dev_path = "{}/net/dev".format(net_proc_base_location)
    with open(proc_dev_path, 'r') as proc:
        lines = proc.readlines()
    # Inter-|   Receive                                                 |  Transmit
    #  face |bytes     packets errs drop fifo frame compressed multicast|bytes       packets errs drop fifo colls carrier compressed  # noqa: E501
    #     lo:45890956   112797   0    0    0     0          0         0    45890956   112797    0    0    0     0       0          0  # noqa: E501
    #   eth0:631947052 1042233   0   19    0   184          0      1206  1208625538  1320529    0    0    0     0       0          0  # noqa: E501
    #   eth1:         0       0   0    0    0     0          0         0           0        0    0    0    0     0       0          0  # noqa: E501
    for l in lines[2:]:
        cols = l.split(':', 1)
        x = cols[1].split()
        # Filter inactive interfaces
        if self._parse_value(x[0]) or self._parse_value(x[8]):
            iface = cols[0].strip()
            metrics = {
                'bytes_rcvd': self._parse_value(x[0]),
                'bytes_sent': self._parse_value(x[8]),
                'packets_in.count': self._parse_value(x[1]),
                'packets_in.error': self._parse_value(x[2]) + self._parse_value(x[3]),
                'packets_out.count': self._parse_value(x[9]),
                'packets_out.error': self._parse_value(x[10]) + self._parse_value(x[11]),
            }
            self._submit_devicemetrics(iface, metrics, custom_tags)

    netstat_data = {}
    for f in ['netstat', 'snmp']:
        proc_data_path = "{}/net/{}".format(net_proc_base_location, f)
        try:
            with open(proc_data_path, 'r') as netstat:
                while True:
                    n_header = netstat.readline()
                    if not n_header:
                        break  # No more? Abort!
                    n_data = netstat.readline()

                    h_parts = n_header.strip().split(' ')
                    h_values = n_data.strip().split(' ')
                    ns_category = h_parts[0][:-1]
                    netstat_data[ns_category] = {}
                    # Turn the data into a dictionary
                    for idx, hpart in enumerate(h_parts[1:]):
                        netstat_data[ns_category][hpart] = h_values[idx + 1]
        except IOError:
            # On Openshift, /proc/net/snmp is only readable by root
            self.log.debug("Unable to read %s.", proc_data_path)

    nstat_metrics_names = {
        'Tcp': {
            'RetransSegs': 'system.net.tcp.retrans_segs',
            'InSegs': 'system.net.tcp.in_segs',
            'OutSegs': 'system.net.tcp.out_segs',
        },
        'TcpExt': {
            'ListenOverflows': 'system.net.tcp.listen_overflows',
            'ListenDrops': 'system.net.tcp.listen_drops',
            'TCPBacklogDrop': 'system.net.tcp.backlog_drops',
            'TCPRetransFail': 'system.net.tcp.failed_retransmits',
        },
        'Udp': {
            'InDatagrams': 'system.net.udp.in_datagrams',
            'NoPorts': 'system.net.udp.no_ports',
            'InErrors': 'system.net.udp.in_errors',
            'OutDatagrams': 'system.net.udp.out_datagrams',
            'RcvbufErrors': 'system.net.udp.rcv_buf_errors',
            'SndbufErrors': 'system.net.udp.snd_buf_errors',
            'InCsumErrors': 'system.net.udp.in_csum_errors',
        },
    }

    for k in nstat_metrics_names:
        for met in nstat_metrics_names[k]:
            if met in netstat_data.get(k, {}):
                self._submit_netmetric(
                    nstat_metrics_names[k][met], self._parse_value(netstat_data[k][met]), tags=custom_tags
                )

    # Get the conntrack -S information
    conntrack_path = instance.get('conntrack_path')
    if conntrack_path is not None:
        self._add_conntrack_stats_metrics(conntrack_path, custom_tags)

    # Get the rest of the metrics by reading the files. Metrics available since kernel 3.6
    conntrack_files_location = os.path.join(proc_location, 'sys', 'net', 'netfilter')
    # By default, only max and count are reported. However, if the blacklist is set,
    # the whitelist loses its default value.
    blacklisted_files = instance.get('blacklist_conntrack_metrics')
    whitelisted_files = instance.get('whitelist_conntrack_metrics')
    if blacklisted_files is None and whitelisted_files is None:
        whitelisted_files = ['max', 'count']

    available_files = []

    # Get the metrics to read
    try:
        for metric_file in os.listdir(conntrack_files_location):
            if (
                os.path.isfile(os.path.join(conntrack_files_location, metric_file))
                and 'nf_conntrack_' in metric_file
            ):
                available_files.append(metric_file[len('nf_conntrack_'):])
    except Exception as e:
        self.log.debug("Unable to list the files in %s. %s", conntrack_files_location, e)

    filtered_available_files = pattern_filter(
        available_files, whitelist=whitelisted_files, blacklist=blacklisted_files
    )

    for metric_name in filtered_available_files:
        metric_file_location = os.path.join(conntrack_files_location, 'nf_conntrack_{}'.format(metric_name))
        try:
            with open(metric_file_location, 'r') as conntrack_file:
                # Checking it's an integer
                try:
                    value = int(conntrack_file.read().rstrip())
                    self.gauge('system.net.conntrack.{}'.format(metric_name), value, tags=custom_tags)
                except ValueError:
                    self.log.debug("%s is not an integer", metric_name)
        except IOError as e:
            self.log.debug("Unable to read %s, skipping %s.", metric_file_location, e)

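# `_parse_linux_cx_state` is defined elsewhere in the check. A simplified sketch
# of the idea (the helper name and default column index are illustrative) counts
# TCP connection states from the netstat-style lines shown in the comments above:
from collections import defaultdict

def count_tcp_states(lines, state_col=5):
    """Count TCP state occurrences from `netstat -n -a` output lines."""
    counts = defaultdict(int)
    for line in lines:
        cols = line.split()
        if len(cols) > state_col and cols[0].startswith('tcp'):
            counts[cols[state_col]] += 1
    return dict(counts)

# count_tcp_states(["tcp 0 0 46.105.75.4:80 79.220.227.193:2032 SYN_RECV"]) -> {'SYN_RECV': 1}
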
def check(self, instance):
    # varnishstat is required; bail out early if it isn't configured
    if instance.get("varnishstat", None) is None:
        raise Exception("varnishstat is not configured")
    custom_tags = instance.get('tags', [])
    if custom_tags is None:
        custom_tags = []
    else:
        custom_tags = list(set(custom_tags))
    # Split the varnishstat command so that additional arguments can be passed in.
    # In order to support monitoring a Varnish instance which is running as a Docker
    # container we need to wrap commands (varnishstat, varnishadm) with scripts which
    # perform a docker exec on the running container. This works fine when running a
    # single container on the host but breaks down when attempting to use the auto
    # discovery feature. This change allows for passing in additional parameters to
    # the script (i.e. %%host%%) so that the command is properly formatted and the
    # desired container is queried.
    varnishstat_path = instance.get('varnishstat', '').split()
    name = instance.get('name')
    metrics_filter = instance.get("metrics_filter", [])
    if not isinstance(metrics_filter, list):
        raise Exception("The parameter 'metrics_filter' must be a list")

    # Get version and version-specific args from varnishstat -V.
    version, varnishstat_format = self._get_version_info(varnishstat_path)

    cmd = varnishstat_path + [self.VARNISHSTAT_FORMAT_OPTION[varnishstat_format]]
    for metric in metrics_filter:
        cmd.extend(["-f", metric])

    if name is not None:
        cmd.extend(['-n', name])
        tags = custom_tags + [u'varnish_name:%s' % name]
    else:
        tags = custom_tags + [u'varnish_name:default']

    output, _, _ = get_subprocess_output(cmd, self.log)

    self._parse_varnishstat(output, varnishstat_format, tags)

    # Parse service checks from varnishadm.
    if instance.get("varnishadm", None):
        # Split the varnishadm command so that additional arguments can be passed in
        # (same Docker-wrapper rationale as for varnishstat above).
        varnishadm_path = instance.get('varnishadm', '').split()
        secretfile_path = instance.get('secretfile', '/etc/varnish/secret')

        daemon_host = instance.get('daemon_host', 'localhost')
        daemon_port = instance.get('daemon_port', '6082')

        cmd = []
        if geteuid() != 0:
            cmd.append('sudo')

        if version < LooseVersion('4.1.0'):
            cmd.extend(varnishadm_path + ['-S', secretfile_path, 'debug.health'])
        else:
            cmd.extend(
                varnishadm_path
                + ['-T', '{}:{}'.format(daemon_host, daemon_port), '-S', secretfile_path, 'backend.list', '-p']
            )

        try:
            output, err, _ = get_subprocess_output(cmd, self.log)
        except OSError as e:
            self.log.error("There was an error running varnishadm. Make sure 'sudo' is available. %s", e)
            output = None
        if err:
            self.log.error('Error getting service check from varnishadm: %s', err)

        if output:
            self._parse_varnishadm(output, custom_tags)

def _check(self, instance):
    if not self.binary:
        raise BinaryUnavailable("Cannot find executable: {}".format(self.expected_bin))

    ip_address = self._get_instance_addr(instance)
    metrics = instance.get('metrics', [])
    community_string = instance.get('community_string', 'public')
    timeout = int(instance.get('timeout', self.DEFAULT_TIMEOUT))
    retries = int(instance.get('retries', self.DEFAULT_RETRIES))
    hostname = instance.get('metric_host', None)

    # Build up our dataset
    data = defaultdict(dict)
    types = {}
    for metric in metrics:
        mib = metric['MIB']
        table = metric['table']

        cmd = [self.binary, '-c{}'.format(community_string), '-v2c', '-t', str(timeout), '-r', str(retries)]
        if self.mib_dirs:
            cmd.extend(['-M', self.mib_dirs])
        cmd.extend([ip_address, '{}:{}'.format(mib, table)])

        try:
            output = get_subprocess_output(cmd, self.log)[0]
        except Exception as e:
            error = "Failed to collect metrics for {0} - {1}".format(instance['name'], e)
            self.log.warning(error)
            return [(self.SC_NAME, Status.CRITICAL, error)]

        for line in output.splitlines():
            if not line:
                continue

            match = self.output_re.match(line)
            if match is not None:
                symbol = match.group('symbol')
                index = int(match.group('index'))
                value = match.group('value')
                typ = match.group('type')

                types[symbol] = typ
                if typ == 'INTEGER':
                    try:
                        value = int(value)
                    except ValueError:
                        pass
                elif value == '':
                    value = None
                data[symbol][index] = value
            else:
                # TODO: remove this
                self.log.warning('Problem parsing output of snmp walk: %s', line)

    # Get any base configured tags and add our primary tag
    tags = instance.get('tags', []) + ['snmp_device:{}'.format(ip_address)]

    # It seems kind of weird, but from what I can tell the snmp check allows
    # you to add symbols to a metric that were retrieved by another metric,
    # both for values and tags. So you can add a symbol in the 1st metric
    # that pulls data from the 2nd. Same applies to tag lookups. Seems like
    # symbols should have been at the instance level rather than
    # per-metric... That way the behavior would match up with the schema, but oh well.

    # Time to emit metrics
    for metric in metrics:
        # Build a list of dynamic tags per-index
        dynamic_tags = defaultdict(list)
        for metric_tag in metric.get('metric_tags', []):
            if 'column' in metric_tag:
                tag = metric_tag['tag']
                column = metric_tag['column']
                regex = metric_tag.get('regex', None)
                if regex is not None:
                    # pre-compile our regex
                    regex = re.compile(regex)

                for i, v in data[column].items():
                    if v is None:
                        # No value for the column, ignore
                        continue
                    elif types[column] == 'INTEGER':
                        # enum/bool etc, use the human readable name
                        v = v.split('(')[0]

                    if regex is not None:
                        # There's a regex for this tag
                        match = regex.match(v)
                        if match is not None:
                            # It matches, so we'll apply it; group(1) becomes the value
                            v = match.group(1)
                            dynamic_tags[i].append('{}:{}'.format(tag, v))
                            additional_tags = metric_tag.get('additional_tags', [])
                            # and we add any additional tags
                            dynamic_tags[i].extend(additional_tags)
                    else:
                        # This is a standard tag, just use the value
                        dynamic_tags[i].append('{}:{}'.format(tag, v))
            else:
                self.log.debug('unsupported metric_tag: %s', metric_tag)
                continue

        symbols = metric.get('symbols', [])
        # For each of the symbols we'll be recording as a metric
        for symbol in symbols:
            # For each value for that symbol
            for i, value in data[symbol].items():
                if value is None:
                    # skip empty
                    continue

                # metric key
                key = '{}.{}'.format(SOURCE_TYPE_NAME, symbol)
                value = int(value)

                typ = types[symbol]
                if typ in self.COUNTER_TYPES:
                    self.rate(key, value, tags + dynamic_tags[i], hostname=hostname)
                elif typ in self.GAUGE_TYPES:
                    self.gauge(key, value, tags + dynamic_tags[i], hostname=hostname)
                else:
                    raise Exception('unsupported metric symbol type: {}'.format(typ))

    return [(self.SC_NAME, Status.UP, None)]

def _get_lighthouse_report(command, logger, raise_on_empty=False):
    # `json_output` avoids shadowing the stdlib `json` module name
    json_output, err_msg, exit_code = get_subprocess_output(command, logger, raise_on_empty_output=raise_on_empty)
    return json_output, err_msg, exit_code

def _check_bsd(self, instance):
    netstat_flags = ['-i', '-b']

    custom_tags = instance.get('tags', [])

    # FreeBSD's netstat truncates device names unless you pass '-W'
    if Platform.is_freebsd():
        netstat_flags.append('-W')

    try:
        output, _, _ = get_subprocess_output(["netstat"] + netstat_flags, self.log)
        lines = output.splitlines()
        # Name  Mtu   Network       Address            Ipkts Ierrs     Ibytes    Opkts Oerrs     Obytes  Coll
        # lo0   16384 <Link#1>                        318258     0  428252203   318258     0  428252203     0
        # lo0   16384 localhost   fe80:1::1           318258     -  428252203   318258     -  428252203     -
        # lo0   16384 127         localhost           318258     -  428252203   318258     -  428252203     -
        # lo0   16384 localhost   ::1                 318258     -  428252203   318258     -  428252203     -
        # gif0* 1280  <Link#2>                             0     0          0        0     0          0     0
        # stf0* 1280  <Link#3>                             0     0          0        0     0          0     0
        # en0   1500  <Link#4>    04:0c:ce:db:4e:fa 20801309     0 13835457425 15149389     0 11508790198     0
        # en0   1500  seneca.loca fe80:4::60c:ceff: 20801309     - 13835457425 15149389     - 11508790198     -
        # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
        # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
        # en0   1500  192.168.1   192.168.1.63      20801309     - 13835457425 15149389     - 11508790198     -
        # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
        # p2p0  2304  <Link#5>    06:0c:ce:db:4e:fa        0     0          0        0     0          0     0
        # ham0  1404  <Link#6>    7a:79:05:4d:bf:f5    30100     0    6815204    18742     0    8494811     0
        # ham0  1404  5           5.77.191.245         30100     -    6815204    18742     -    8494811     -
        # ham0  1404  seneca.loca fe80:6::7879:5ff:    30100     -    6815204    18742     -    8494811     -
        # ham0  1404  2620:9b::54 2620:9b::54d:bff5    30100     -    6815204    18742     -    8494811     -
        headers = lines[0].split()

        # Given the irregular structure of the table above, better to parse from the end of each line
        # Verify headers first
        #          -7       -6       -5        -4       -3       -2        -1
        for h in ("Ipkts", "Ierrs", "Ibytes", "Opkts", "Oerrs", "Obytes", "Coll"):
            if h not in headers:
                self.log.error("%s not found in %s; cannot parse", h, headers)
                return False

        current = None
        for l in lines[1:]:
            # Another header row, abort now, this is IPv6 land
            if "Name" in l:
                break

            x = l.split()
            if len(x) == 0:
                break

            iface = x[0]
            if iface.endswith("*"):
                iface = iface[:-1]
            if iface == current:
                # skip multiple lines of same interface
                continue
            else:
                current = iface

            # Filter inactive interfaces
            if self._parse_value(x[-5]) or self._parse_value(x[-2]):
                iface = current
                metrics = {
                    'bytes_rcvd': self._parse_value(x[-5]),
                    'bytes_sent': self._parse_value(x[-2]),
                    'packets_in.count': self._parse_value(x[-7]),
                    'packets_in.error': self._parse_value(x[-6]),
                    'packets_out.count': self._parse_value(x[-4]),
                    'packets_out.error': self._parse_value(x[-3]),
                }
                self._submit_devicemetrics(iface, metrics, custom_tags)
    except SubprocessOutputEmptyError:
        self.log.exception("Error collecting connection stats.")

    try:
        # Note: the original argument list had `"-p" "tcp"` (adjacent strings concatenated
        # to "-ptcp"); the explicit comma below is the intended form.
        netstat, _, _ = get_subprocess_output(["netstat", "-s", "-p", "tcp"], self.log)
        # 3651535 packets sent
        #         972097 data packets (615753248 bytes)
        #         5009 data packets (2832232 bytes) retransmitted
        #         0 resends initiated by MTU discovery
        #         2086952 ack-only packets (471 delayed)
        #         0 URG only packets
        #         0 window probe packets
        #         310851 window update packets
        #         336829 control packets
        #         0 data packets sent after flow control
        #         3058232 checksummed in software
        #         3058232 segments (571218834 bytes) over IPv4
        #         0 segments (0 bytes) over IPv6
        # 4807551 packets received
        #         1143534 acks (for 616095538 bytes)
        #         165400 duplicate acks
        #         ...
        self._submit_regexed_values(netstat, BSD_TCP_METRICS, custom_tags)
    except SubprocessOutputEmptyError:
        self.log.exception("Error collecting TCP stats.")

    proc_location = self.agentConfig.get('procfs_path', '/proc').rstrip('/')

    net_proc_base_location = self._get_net_proc_base_location(proc_location)

    if self._is_collect_cx_state_runnable(net_proc_base_location):
        try:
            self.log.debug("Using `netstat` to collect connection state")
            output_TCP, _, _ = get_subprocess_output(["netstat", "-n", "-a", "-p", "tcp"], self.log)
            output_UDP, _, _ = get_subprocess_output(["netstat", "-n", "-a", "-p", "udp"], self.log)
            lines = output_TCP.splitlines() + output_UDP.splitlines()
            # Active Internet connections (w/o servers)
            # Proto Recv-Q Send-Q Local Address           Foreign Address         State
            # tcp        0      0 46.105.75.4:80          79.220.227.193:2032     SYN_RECV
            # tcp        0      0 46.105.75.4:143         90.56.111.177:56867     ESTABLISHED
            # tcp        0      0 46.105.75.4:50468       107.20.207.175:443      TIME_WAIT
            # tcp6       0      0 46.105.75.4:80          93.15.237.188:58038     FIN_WAIT2
            # tcp6       0      0 46.105.75.4:80          79.220.227.193:2029     ESTABLISHED
            # udp        0      0 0.0.0.0:123             0.0.0.0:*
            # udp6       0      0 :::41458                :::*
            metrics = self._parse_linux_cx_state(lines[2:], self.tcp_states['netstat'], 5)
            for metric, value in iteritems(metrics):
                self.gauge(metric, value, tags=custom_tags)
        except SubprocessOutputEmptyError:
            self.log.exception("Error collecting connection states.")

def check(self, instance):
    # Allow to specify a complete command for nodetool such as `docker exec container nodetool`
    nodetool_cmd = instance.get("nodetool", self.nodetool_cmd).split()
    host = instance.get("host", DEFAULT_HOST)
    port = instance.get("port", DEFAULT_PORT)
    keyspaces = instance.get("keyspaces", [])
    username = instance.get("username", "")
    password = instance.get("password", "")
    ssl = instance.get("ssl", False)
    tags = instance.get("tags", [])

    # Flag to send service checks only once and not for every keyspace
    send_service_checks = True

    if not keyspaces:
        self.log.info("No keyspaces set in the configuration: no metrics will be sent")

    for keyspace in keyspaces:
        # Build the nodetool command
        cmd = nodetool_cmd + ['-h', host, '-p', str(port)]
        if username and password:
            cmd += ['-u', username, '-pw', password]
        # add ssl if requested
        if ssl:
            cmd += ['--ssl']
        cmd += ['status', '--', keyspace]

        # Execute the command
        out, err, code = get_subprocess_output(cmd, self.log, False, log_debug=False)
        if err or 'Error:' in out or code != 0:
            self.log.error('Error executing nodetool status: %s', err or out)
            continue
        nodes = self._process_nodetool_output(out)

        percent_up_by_dc = defaultdict(float)
        percent_total_by_dc = defaultdict(float)
        # Send the stats per node and compute the stats per datacenter
        for node in nodes:
            node_tags = [
                'node_address:%s' % node['address'],
                'node_id:%s' % node['id'],
                'datacenter:%s' % node['datacenter'],
                'rack:%s' % node['rack'],
            ]

            # nodetool prints `?` when it can't compute the value of `owns` for certain keyspaces (e.g. system)
            # don't send the metric in this case
            if node['owns'] != '?':
                owns = float(node['owns'])
                if node['status'] == 'U':
                    percent_up_by_dc[node['datacenter']] += owns
                percent_total_by_dc[node['datacenter']] += owns
                self.gauge('cassandra.nodetool.status.owns', owns, tags=tags + node_tags + ['keyspace:%s' % keyspace])

            # Send service check only once for each node
            if send_service_checks:
                status = AgentCheck.OK if node['status'] == 'U' else AgentCheck.CRITICAL
                self.service_check('cassandra.nodetool.node_up', status, tags + node_tags)

            self.gauge('cassandra.nodetool.status.status', 1 if node['status'] == 'U' else 0, tags=tags + node_tags)
            self.gauge(
                'cassandra.nodetool.status.load',
                float(node['load']) * TO_BYTES[node['load_unit']],
                tags=tags + node_tags,
            )

        # All service checks have been sent, don't resend
        send_service_checks = False

        # Send the stats per datacenter
        for datacenter, percent_up in percent_up_by_dc.items():
            self.gauge(
                'cassandra.nodetool.status.replication_availability',
                percent_up,
                tags=tags + ['keyspace:%s' % keyspace, 'datacenter:%s' % datacenter],
            )
        for datacenter, percent_total in percent_total_by_dc.items():
            self.gauge(
                'cassandra.nodetool.status.replication_factor',
                int(round(percent_total / 100)),
                tags=tags + ['keyspace:%s' % keyspace, 'datacenter:%s' % datacenter],
            )

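# `_process_nodetool_output` is defined elsewhere in the check. A hedged sketch of
# what such a parser might look like, run against typical `nodetool status` rows;
# the regex, field names, and sample row here are assumptions for illustration only:
import re

NODE_RE = re.compile(
    r'^(?P<status>[UD][NLJM])\s+(?P<address>\S+)\s+(?P<load>[\d.]+)\s+(?P<load_unit>\S+)\s+'
    r'\S+\s+(?P<owns>\S+?)%?\s+(?P<id>\S+)\s+(?P<rack>\S+)$'
)

def parse_status_line(line, datacenter):
    m = NODE_RE.match(line.strip())
    if m is None:
        return None
    node = m.groupdict()
    node['status'] = node['status'][0]  # keep U/D, drop the N/L/J/M state letter
    node['datacenter'] = datacenter
    return node

# parse_status_line("UN  127.0.0.1  47.66 KiB  1  33.3%  aaa-bbb  rack1", "dc1")
# -> {'status': 'U', 'address': '127.0.0.1', 'load': '47.66', 'load_unit': 'KiB',
#     'owns': '33.3', 'id': 'aaa-bbb', 'rack': 'rack1', 'datacenter': 'dc1'}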