def _check_solaris(self, instance):
    """Collect Solaris per-interface and TCP metrics.

    Interface counters are read from ``kstat -p link:0:`` because netstat
    does not expose bytes sent and received. TCP counters are read from
    ``netstat -s -Ptcp`` and matched against ``SOLARIS_TCP_METRICS``.

    The two collections are independent: a ``SubprocessOutputEmptyError``
    raised during one is logged and does not prevent the other from running.

    :param instance: check instance configuration; the optional ``tags``
        entry is attached to every submitted metric.
    """
    tags = instance.get('tags', [])

    # Interface metrics via kstat.
    try:
        kstat_out, _, _ = get_subprocess_output(
            ["kstat", "-p", "link:0:"], self.log)
        per_interface = self._parse_solaris_netstat(kstat_out)
        for name, values in per_interface.iteritems():
            self._submit_devicemetrics(name, values, tags)
    except SubprocessOutputEmptyError:
        self.log.exception("Error collecting kstat stats.")

    # TCP metrics via netstat.
    try:
        tcp_out, _, _ = get_subprocess_output(
            ["netstat", "-s", "-Ptcp"], self.log)
        # Sample output:
        # TCP: tcpRtoAlgorithm=     4 tcpRtoMin           =   200
        # tcpRtoMax           = 60000 tcpMaxConn          =    -1
        # tcpActiveOpens      =    57 tcpPassiveOpens     =    50
        # tcpAttemptFails     =     1 tcpEstabResets      =     0
        # tcpCurrEstab        =     0 tcpOutSegs          =   254
        # tcpOutDataSegs      =   995 tcpOutDataBytes     =1216733
        # tcpRetransSegs      =     0 tcpRetransBytes     =     0
        # tcpOutAck           =   185 tcpOutAckDelayed    =     4
        # ...
        self._submit_regexed_values(tcp_out, SOLARIS_TCP_METRICS, tags)
    except SubprocessOutputEmptyError:
        self.log.exception("Error collecting TCP stats.")
def check(self, instance):
    """Run the ``qdb_datadog`` module and submit every metric it prints.

    The module is expected to print one ``name,TYPE,value`` record per line.
    ``GAUGE`` records are submitted as gauges and ``COUNTER`` records as
    monotonic counts, each tagged with ``node_id:<node_id>``.

    :param instance: must contain ``cluster_uri``, ``cluster_public_key``,
        ``user_security_file`` and ``node_id``. The security options are only
        passed to the module when both values are truthy.
    :raises RuntimeError: when a record carries an unknown type.
    """
    cluster_uri = instance['cluster_uri']
    public_key = instance['cluster_public_key']
    security_file = instance['user_security_file']
    node_id = instance['node_id']

    command = [
        "python3", "-m", "qdb_datadog",
        "--cluster", cluster_uri,
        "--node-id", node_id,
        "--prefix", "qdb",
    ]
    if public_key and security_file:
        command += [
            "--cluster-public-key", public_key,
            "--user-security-file", security_file,
        ]

    metrics, _, _ = get_subprocess_output(
        command, self.log, raise_on_empty_output=True)

    for record in metrics.splitlines():
        name, kind, raw_value = record.split(',')
        if kind == 'GAUGE':
            submit = self.gauge
        elif kind == 'COUNTER':
            submit = self.monotonic_count
        else:
            raise RuntimeError("Unrecognized counter type: '" + kind + "'")
        submit(name, int(raw_value), tags=['node_id:' + node_id])
def _get_postqueue_stats(self, postfix_config_dir, tags):
    """Count queued mails by parsing ``postqueue -p`` and submit the sizes.

    Three ``postfix.queue.size`` gauges are submitted (``queue:active``,
    ``queue:hold``, ``queue:deferred``), each also tagged with
    ``instance:<postfix_config_dir>``.

    :param postfix_config_dir: configuration directory passed to
        ``postqueue -c``.
    :param tags: list of tags the queue and instance tags are appended to.
    """

    def postconf_value(parameter):
        # Keep the field that follows the first "=" in the postconf output.
        raw, _, _ = get_subprocess_output(
            ['postconf', parameter], self.log, False)
        return raw.strip('\n').split('=')[1].strip()

    # Get some interesting configuration values from postconf.
    postfix_version = postconf_value('mail_version')
    mailq_users = postconf_value('authorized_mailq_users')
    self.log.debug('authorized_mailq_users : {}'.format(mailq_users))

    listing, _, _ = get_subprocess_output(
        ['postqueue', '-c', postfix_config_dir, '-p'], self.log, False)

    # postqueue -p sample output:
    #
    # root@postfix:/opt/datadog-agent/agent/checks.d# postqueue -p
    # ----Queue ID----- --Size-- ---Arrival Time---- --Sender/Recipient------
    # 3xWyLP6Nmfz23fk        367 Tue Aug 15 16:17:33 [email protected]
    #                                   (deferred transport)
    #                                   [email protected]
    #
    # 3xWyD86NwZz23ff!       358 Tue Aug 15 16:12:08 [email protected]
    #                                   (deferred transport)
    #                                   [email protected]
    #
    # -- 1 Kbytes in 2 Requests.
    counts = {'active': 0, 'hold': 0, 'deferred': 0}
    for entry in listing.splitlines():
        if '*' in entry:
            # "*" marks a message in the active queue.
            counts['active'] += 1
        elif '!' in entry:
            # "!" marks a message on hold.
            counts['hold'] += 1
        elif entry[0:1].isdigit():
            # Any other line starting with a digit is counted as deferred.
            counts['deferred'] += 1

    self.log.debug('Postfix Version: %s' % postfix_version)

    instance_tag = 'instance:{}'.format(postfix_config_dir)
    for queue in ('active', 'hold', 'deferred'):
        self.gauge('postfix.queue.size', counts[queue],
                   tags=tags + ['queue:' + queue, instance_tag])
def call_unbound_control(self, command, tags):
    """Run ``command`` and return its stdout, failing loudly otherwise.

    Every failure mode (exception while running, non-zero return code,
    empty output) submits a CRITICAL ``SERVICE_CHECK_NAME`` service check
    before raising.

    :param command: command passed to ``get_subprocess_output``.
    :param tags: tags attached to the service check.
    :returns: the command's stdout.
    :raises Exception: on any of the failure modes above.
    """

    def report_critical(message):
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                           message=message, tags=tags)

    try:
        # raise_on_empty_output=False so stderr can still be logged below.
        stdout, stderr, exit_status = get_subprocess_output(
            command, self.log, raise_on_empty_output=False)
    except Exception as e:
        report_critical("exception collecting stats")
        raise Exception("Unable to get unbound stats: {}".format(str(e)))

    for stderr_line in stderr.splitlines():
        self.log.debug('stderr from %s: %s', command, stderr_line)

    if exit_status != 0:
        report_critical("non-zero return code collecting stats")
        raise Exception('"{}" failed, return code: {}'.format(
            command, exit_status))

    # Empty output did not raise above, so it has to be checked here.
    if not stdout:
        report_critical("no stats")
        raise Exception('no output from "{}"'.format(command))

    return stdout
def which(program, use_sudo, log):
    """Locate ``program`` and return its path, or ``None`` when not found.

    With ``use_sudo`` the lookup is delegated to ``sudo which``; otherwise
    ``program`` is checked directly when it contains a directory part, or
    searched for in every ``PATH`` entry.

    :param program: executable name or path.
    :param use_sudo: when truthy, resolve through ``sudo which``.
    :param log: logger used for the subprocess call and stderr lines.
    """

    def executable(candidate):
        return os.path.isfile(candidate) and os.access(candidate, os.X_OK)

    if use_sudo:
        # raise_on_empty_output=False leaves the "not found" case to the caller.
        # NOTE(review): stdout is returned unmodified, so it presumably still
        # ends with a newline - confirm callers cope with that.
        stdout, stderr, returncode = get_subprocess_output(
            ['sudo', 'which', program], log, raise_on_empty_output=False)
        if returncode == 0:
            return stdout
        for stderr_line in stderr.splitlines():
            log.debug('stderr from sudo which %s: %s', program, stderr_line)
        return None

    directory, _ = os.path.split(program)
    if directory:
        # An explicit path is only checked, never searched for.
        return program if executable(program) else None

    candidates = (os.path.join(entry, program)
                  for entry in os.environ["PATH"].split(os.pathsep))
    return next((path for path in candidates if executable(path)), None)
def _get_queue_count(self, directory, queues, tags): for queue in queues: queue_path = os.path.join(directory, queue) if not os.path.exists(queue_path): raise Exception('{} does not exist'.format(queue_path)) count = 0 if os.geteuid() == 0: # dd-agent is running as root (not recommended) count = sum(len(files) for root, dirs, files in os.walk(queue_path)) else: # can dd-agent user run sudo? test_sudo = os.system('setsid sudo -l < /dev/null') if test_sudo == 0: # default to `root` for backward compatibility postfix_user = self.init_config.get('postfix_user', 'root') cmd = ['sudo', '-u', postfix_user, 'find', queue_path, '-type', 'f'] output, _, _ = get_subprocess_output(cmd, self.log, False) count = len(output.splitlines()) else: raise Exception('The dd-agent user does not have sudo access') # emit an individually tagged metric self.gauge('postfix.queue.size', count, tags=tags + ['queue:{}'.format(queue), 'instance:{}'.format(os.path.basename(directory))])
def _add_conntrack_stats_metrics(self, conntrack_path, tags):
    """Parse ``sudo <conntrack_path> -S`` and submit one count per counter.

    Every ``name=value`` cell after the first column is submitted as the
    monotonic count ``system.net.conntrack.<name>``, tagged with the CPU
    taken from the first column (``cpu:<n>``).

    :param conntrack_path: path of the conntrack binary.
    :param tags: list of tags the cpu tag is appended to.
    """
    try:
        stats, _, _ = get_subprocess_output(
            ["sudo", conntrack_path, "-S"], self.log)
        # conntrack -S sample:
        # cpu=0 found=27644 invalid=19060 ignore=485633411 insert=0 insert_failed=1 \
        #       drop=1 early_drop=0 error=0 search_restart=39936711
        # cpu=1 found=21960 invalid=17288 ignore=475938848 insert=0 insert_failed=1 \
        #       drop=1 early_drop=0 error=0 search_restart=36983181
        for row in stats.splitlines():
            fields = row.split()
            cpu_field, counters = fields[0], fields[1:]
            cpu_tag = ['cpu:{}'.format(cpu_field.split('=')[-1])]
            for counter in counters:
                name, raw_value = counter.split('=')
                self.monotonic_count(
                    'system.net.conntrack.{}'.format(name),
                    int(raw_value),
                    tags=tags + cpu_tag)
    except SubprocessOutputEmptyError:
        self.log.debug("Couldn't use {} to get conntrack stats".format(
            conntrack_path))
def _exec_ping(self, timeout, target_host):
    """Send a single ping to ``target_host`` and return the command output.

    :param timeout: timeout in seconds; converted to milliseconds on
        Windows and macOS, where the ping timeout option expects ms.
    :param target_host: host to ping.
    :returns: stdout of the ping command.
    :raises CheckException: when ping exits with a non-zero return code.
    """
    system = platform.system()

    # Windows uses -n/-w; Darwin and everything else use -c/-W.
    # https://docs.microsoft.com/en-us/windows-server/administration/windows-commands/ping
    if system == "Windows":
        count_flag, timeout_flag = "-n", "-w"
    else:
        count_flag, timeout_flag = "-c", "-W"

    # The timeout option is in ms on Windows and Mac, and in seconds on
    # Linux (https://linux.die.net/man/8/ping), where it is left as is.
    if system in ("Windows", "Darwin"):
        timeout = timeout * 1000

    ping_args = [count_flag, "1", timeout_flag, str(timeout), target_host]
    self.log.debug("Running: ping {} {} {} {} {}".format(*ping_args))

    lines, err, retcode = get_subprocess_output(
        ["ping"] + ping_args, self.log, raise_on_empty_output=True)
    self.log.debug("ping returned {} - {} - {}".format(retcode, lines, err))

    if retcode != 0:
        raise CheckException("ping returned {}: {}".format(retcode, err))
    return lines
def _collect_raw(self, ceph_cmd, ceph_cluster, instance):
    """Run a fixed set of ceph sub-commands and return their parsed JSON.

    :param ceph_cmd: base ceph command line, as a string.
    :param ceph_cluster: value passed to ``--cluster``.
    :param instance: instance configuration; ``use_sudo`` prefixes the
        command with ``sudo``.
    :returns: dict mapping each sub-command (spaces replaced by ``_``) to
        its decoded JSON output. Sub-commands that fail to run or parse are
        logged and left out.
    :raises Exception: when ``use_sudo`` is set but sudo is not usable.
    """
    if _is_affirmative(instance.get('use_sudo', False)):
        if os.system('setsid sudo -l < /dev/null') != 0:
            raise Exception('The dd-agent user does not have sudo access')
        base_cmd = 'sudo {}'.format(ceph_cmd)
    else:
        base_cmd = ceph_cmd
    base_cmd = '{} --cluster {}'.format(base_cmd, ceph_cluster)

    sub_commands = ('mon_status', 'status', 'df detail', 'osd pool stats',
                    'osd perf', 'health detail')
    raw = {}
    for cmd in sub_commands:
        try:
            full_cmd = '{} {} -fjson'.format(base_cmd, cmd)
            output, _, _ = get_subprocess_output(full_cmd.split(), self.log)
            parsed = json.loads(output)
        except Exception as e:
            self.log.warning('Unable to parse data from cmd=%s: %s'
                             % (cmd, str(e)))
        else:
            raw[cmd.replace(' ', '_')] = parsed
    return raw
def check(self, instance):
    """Parse the output of ``self.nfs_cmd`` and submit per-device metrics.

    Non-empty output lines are grouped into devices: a line containing
    ``mounted on`` starts a new device once rows have been collected for the
    previous one. Only the second half of the devices is reported.

    :param instance: instance configuration; the optional ``tags`` entry is
        passed to every device's ``send_metrics``.
    """
    stat_out, _, _ = get_subprocess_output(self.nfs_cmd, self.log)
    custom_tags = instance.get("tags", [])

    devices = []
    pending_rows = []
    for line in stat_out.splitlines():
        if not line:
            continue
        # NOTE(review): the needle is a bytes literal, which only works if
        # the output lines are bytes (or Python 2 str) - confirm.
        if line.find(b'mounted on') >= 0 and len(pending_rows) > 0:
            # A new device begins: flush the rows gathered so far.
            devices.append(Device(pending_rows, self.log))
            pending_rows = []
        pending_rows.append(line.strip().split())

    # Add the last device into the array.
    devices.append(Device(pending_rows, self.log))

    # Disregard the first half of device stats (report 1 of 2),
    # as that is the moving average.
    for device in devices[len(devices) // 2:]:
        device.send_metrics(self.gauge, custom_tags)
def check(self, instance):
    """Run a Nagios plugin command and forward its perfdata and status.

    Each perfdata entry is submitted as the gauge
    ``<metric_namespace>.<sanitized label>``. When ``create_service_check``
    is set and the plugin printed output, a service check named after
    ``metric_namespace`` is submitted, with the return code mapped as
    0 -> OK, 1 -> WARNING, 2 -> CRITICAL, anything else -> UNKNOWN.

    :param instance: requires ``check_command`` and ``metric_namespace``;
        ``tags`` and ``create_service_check`` are optional.
    :raises CheckException: on missing configuration or when the command
        cannot be executed.
    """
    check_command = instance.get('check_command')
    metric_namespace = instance.get('metric_namespace')
    tags = instance.get('tags', [])
    create_service_check = instance.get('create_service_check', False)

    if not check_command:
        raise CheckException(
            "Configuration error. Missing check_command definition, please fix nagios_plugin_wrapper.yaml"
        )
    if not metric_namespace:
        raise CheckException(
            "Configuration error. Missing metric_namespace definition, please fix nagios_plugin_wrapper.yaml"
        )

    try:
        raw_output, _, ret = get_subprocess_output(check_command, self.log)
    except Exception as e:
        self.log.warning(
            "Failed to execute check_command {check_command} - {error}".format(
                check_command=check_command, error=e))
        raise CheckException(
            "check_command '{check_command}' failed to execute, see agent.log for more information."
            .format(check_command=check_command))

    output, metrics = self._parse_output(raw_output)

    if metrics:
        for label, value in self._parse_perfdata(metrics):
            label = self._sanitize(label)
            self.log.debug(
                "metric_namespace: {namespace} | tags: {tags} | value: {value} | ret_code: {ret}"
                .format(namespace=metric_namespace, tags=tags,
                        value=value, ret=ret))
            metric_name = '{metric_namespace}.{label}'.format(
                metric_namespace=metric_namespace, label=label)
            self.gauge(metric_name, value, tags=tags)

    if output and create_service_check:
        status_by_code = {
            0: AgentCheck.OK,
            1: AgentCheck.WARNING,
            2: AgentCheck.CRITICAL,
        }
        status = status_by_code.get(ret, AgentCheck.UNKNOWN)
        self.service_check(metric_namespace, status, tags=tags,
                           message=output.rstrip())
def _collect_metadata(self):
    """Read the postfix version from ``postconf mail_version``.

    The field following the first ``=`` of the output is stored as the
    ``version`` metadata.
    """
    raw, _, _ = get_subprocess_output(
        ['postconf', 'mail_version'], self.log, False)
    self.log.debug('postconf mail_version output: %s', raw)

    fields = raw.strip('\n').split('=')
    version = fields[1].strip()
    self.log.debug('Postfix Version: %s', version)
    self.set_metadata('version', version)
def _collect_metadata(self):
    """Read the postfix version from ``postconf mail_version``.

    The field following the first ``=`` of the output is stored as the
    ``version`` metadata. A failure to run ``postconf`` is logged as a
    warning and no metadata is set.
    """
    try:
        raw, _, _ = get_subprocess_output(
            ['postconf', 'mail_version'], self.log, False)
    except Exception as e:
        self.log.warning('unable to call `postconf mail_version`: %s', e)
        return

    self.log.debug('postconf mail_version output: %s', raw)

    fields = raw.strip('\n').split('=')
    version = fields[1].strip()
    self.log.debug('Postfix Version: %s', version)
    self.set_metadata('version', version)
def check(self, instance):
    """Run the configured command and report replication health as 1 or 0.

    The gauge ``metric_name`` is set to 1 when the command output contains
    ``MySQL Replication Health is OK.`` and to 0 otherwise.

    :param instance: instance passed to ``get_instance_config``; the
        resulting config provides ``command``, ``metric_name`` and ``tags``.
    """
    config = self.get_instance_config(instance)
    command = config.get("command")
    metric_name = config.get("metric_name")
    metric_type = config.get("metric_type")  # read but not used here
    tags = config.get("tags")

    output, _, _ = get_subprocess_output(
        command, self.log, raise_on_empty_output=True)

    healthy = 1 if "MySQL Replication Health is OK." in output else 0
    self.gauge(metric_name, healthy, tags=tags)
def check(self, instance):
    """Report automatic Windows services that are not started.

    Runs the ``custom_WinServMonitor.ps1`` PowerShell script. When its
    output starts with ``OK`` a single 0-valued gauge is submitted;
    otherwise the output is split on ``;`` and one 1-valued gauge is
    submitted per entry, tagged ``service:<entry>``.

    :param instance: instance configuration (not read by this check).
    """
    # Raw string: the Windows path contains backslashes that are not valid
    # Python escape sequences (\P, \D, \c).
    script_path = r"C:\ProgramData\Datadog\checks.d\custom_WinServMonitor.ps1"

    output, err, retcode = get_subprocess_output(
        ["powershell.exe", script_path],
        self.log,
        raise_on_empty_output=True)

    if output.startswith('OK'):
        self.gauge('custom_WinServMonitor.autoNotStarted', 0)
    else:
        for service in output.split(';'):
            self.gauge('custom_WinServMonitor.autoNotStarted', 1,
                       ['service:' + service.strip()])
def collect_metrics_manually(self):
    """Collect disk metrics by parsing the output of ``DF_COMMAND -k``.

    For every device returned by ``_list_devices`` the metrics produced by
    ``_collect_metrics_manually`` are submitted as gauges. Tags are built
    from the filesystem column (when ``_tag_by_filesystem`` is set), the
    custom tags, and any tags whose regex in ``_device_tag_re`` matches the
    device name.
    """
    df_out, _, _ = get_subprocess_output(self.DF_COMMAND + ['-k'], self.log)
    self.log.debug(df_out)

    for device in self._list_devices(df_out):
        self.log.debug("Passed: {0}".format(device))

        if self._tag_by_filesystem:
            tags = [device[1], 'filesystem:{}'.format(device[1])]
        else:
            tags = []
        tags.extend(self._custom_tags)

        # The mount point is the last column, the device the first.
        if self._use_mount:
            device_name = device[-1]
        else:
            device_name = device[0]

        # Apply device/mountpoint specific tags.
        for pattern, extra_tags in self._device_tag_re:
            if pattern.match(device_name):
                tags.extend(extra_tags)

        device_metrics = self._collect_metrics_manually(device)
        for metric_name, value in device_metrics.iteritems():
            self.gauge(metric_name, value, tags=tags, device_name=device_name)
def get_license_usage(self):
    """Return OpenVPN AS license usage as reported by ``sacli LicUsage``.

    The first three and last two characters of the output are dropped and
    every remaining line is parsed as an integer after stripping commas and
    spaces. The first value is taken as the used count, the second as the
    total.

    :returns: dict with ``used``, ``total`` and ``available`` (total - used).
    """
    sacli_path = self.init_config.get(
        "sacli_path", "/usr/local/openvpn_as/scripts/sacli")

    out, _, _ = get_subprocess_output(
        ["sudo", sacli_path, "LicUsage"], self.log,
        raise_on_empty_output=True)

    # presumably the output is a two-element JSON-style list printed one
    # value per line - TODO confirm against real sacli output
    values = []
    for line in out[3:-2].splitlines():
        values.append(int(line.strip(", ")))

    used, total = values[0], values[1]
    return {
        "used": used,
        "total": total,
        "available": total - used,
    }
def _get_version_info(self, varnishstat_path):
    """Determine the varnish version and the matching varnishstat format.

    Runs ``<varnishstat_path> -V`` and searches both stdout and stderr for
    ``self.version_pattern`` (v2 prints the version on stderr, v3 on
    stdout); a stdout match takes precedence. When no version is found a
    warning is emitted and ``3.0.0`` is assumed.

    :param varnishstat_path: varnishstat command as a list of arguments.
    :returns: ``(version, varnishstat_format)`` where ``version`` is a
        ``LooseVersion`` and the format is ``"text"`` below 3.0.0,
        ``"xml"`` below 5.0.0 and ``"json"`` otherwise.
    """
    # Get the varnish version from varnishstat.
    output, error, _ = get_subprocess_output(
        varnishstat_path + ["-V"], self.log, raise_on_empty_output=False)

    # Search from the start of each stream. The second positional argument
    # of a compiled pattern's search() is a start offset, not a flags value,
    # so passing re.MULTILINE there would skip the first 8 characters.
    stdout_match = self.version_pattern.search(output)
    stderr_match = self.version_pattern.search(error)

    raw_version = None
    if stdout_match is None and stderr_match is None:
        self.log.warning(
            "Cannot determine the version of varnishstat, assuming 3 or greater"
        )
        self.warning(
            "Cannot determine the version of varnishstat, assuming 3 or greater"
        )
    elif stdout_match is not None:
        raw_version = stdout_match.group()
    else:
        raw_version = stderr_match.group()

    self.log.debug("Varnish version: %s", raw_version)

    if raw_version:
        self.set_metadata('version', raw_version)

    if raw_version is None:
        raw_version = '3.0.0'

    version = LooseVersion(raw_version)

    # json is the default starting with version 5.0.0.
    if version < LooseVersion('3.0.0'):
        varnishstat_format = "text"
    elif version < LooseVersion('5.0.0'):
        varnishstat_format = "xml"
    else:
        varnishstat_format = "json"

    return version, varnishstat_format
def _get_sendmail_stats(self, sendmail_command, use_sudo): if not os.path.exists(sendmail_command): raise Exception('{} does not exist'.format(sendmail_command)) self.log.debug(sendmail_command) # mailq sample output. sendmail output is similar. ## # MSP Queue status... # /var/spool/mqueue-client is empty # Total requests: 0 # MTA Queue status... # /var/spool/mqueue is empty # Total requests: 0 # if we want to use sendmail, we need to append -bp to it # https://www.electrictoolbox.com/show-sendmail-mail-queue/ if "sendmail" in sendmail_command: command = [sendmail_command, '-bp'] else: command = [sendmail_command] # Listing the directory might require sudo privileges if use_sudo: try: os.system('setsid sudo -l < /dev/null') command.insert(0, 'sudo') except OSError as e: self.log.exception( "trying to retrieve %s with sudo failed with return code %s", command, e) self.log.debug(command) mail_queue, err, retcode = get_subprocess_output( command, self.log, False) self.log.debug("Error: %s", err) count = mail_queue.splitlines() # Retrieve the last total number of requests queue_count = int(count[-1][-1]) self.log.info("Number of mails in the queue: %s", queue_count) return queue_count
def get_process_states(self):
    """Count processes per state and per priority flag and submit gauges.

    Runs ``ps --no-header -eo stat`` and inspects every character of every
    output line. Characters found in ``PROCESS_STATES`` increment the
    matching ``system.processes.states`` counter; characters found in
    ``PROCESS_PRIOS`` increment the matching ``system.processes.priorities``
    counter. Gauges are tagged with ``self.tags`` plus ``state:<name>`` or
    ``priority:<name>``.
    """
    state_counts = defaultdict(int)
    prio_counts = defaultdict(int)

    ps_out = get_subprocess_output(
        ['ps', '--no-header', '-eo', 'stat'], self.log)[0]

    for stat in ps_out.splitlines():
        # Each process state is a flag in a list of characters.
        # See ps(1) for details.
        for flag in stat:
            if flag in PROCESS_STATES:
                state_counts[PROCESS_STATES[flag]] += 1
            elif flag in PROCESS_PRIOS:
                prio_counts[PROCESS_PRIOS[flag]] += 1

    for state in state_counts:
        state_tags = list(self.tags)
        state_tags.append("state:" + state)
        self.gauge('system.processes.states',
                   float(state_counts[state]), state_tags)

    for prio in prio_counts:
        prio_tags = list(self.tags)
        prio_tags.append("priority:" + prio)
        self.gauge('system.processes.priorities',
                   float(prio_counts[prio]), prio_tags)
def check(self, instance):
    """Submit the number of days until a site's TLS certificate expires.

    A shell pipeline (bounded by ``timeout 10``) fetches the certificate
    with ``openssl s_client``, extracts the ``notAfter`` date and converts
    it to a Unix timestamp with ``date``. The gauge ``ssl.expire_in_days``
    is set to the whole number of days between now and that timestamp, or
    to -1 when the pipeline produced no output.

    :param instance: must contain ``site``; the gauge is tagged
        ``site:<site>``.
    """
    metric = "ssl.expire_in_days"
    site = instance['site']
    tag = "site:" + site  # generate the tags

    # NOTE(review): `site` is interpolated into a shell command line; it is
    # assumed to come from trusted check configuration - confirm.
    # The backslash before "=" is written as "\\" so the literal no longer
    # relies on an invalid escape sequence; the resulting string is unchanged.
    pipeline = (
        "openssl s_client -showcerts -servername " + site +
        " -connect " + site +
        ":443 2>/dev/null | openssl x509 -noout -dates | grep notAfter | cut -f 2 -d\\= | xargs -0 -I arg date -d arg '+%s'"
    )
    command = ["timeout", "10", "bash", "-c", pipeline]

    (output, err, returncode) = get_subprocess_output(
        command, self.log, False)

    if output:
        output = output.rstrip("\n")
        d0 = int(time.time())
        d1 = int(output)
        delta = d1 - d0
        days = delta / 24 / 60 / 60  # convert the timestamp to days
        self.gauge(metric, int(days), tags=[tag])
    else:
        self.gauge(metric, -1, tags=[tag])
def check(self, instance):
    """Run lighthouse against ``instance['url']`` and submit category scores.

    The accessibility, best-practices, performance, pwa and seo scores of
    the JSON report are multiplied by 100 and submitted as
    ``custom_lighthouse.*`` gauges, tagged with the configured tags plus
    ``lighthouse_url`` and ``lighthouse_name``.

    :param instance: must contain ``url`` and ``name``; ``tags`` is
        optional and ignored (with a warning) when it is not a list.
    :raises Exception: when lighthouse exits with a positive return code or
        its output cannot be parsed into the expected structure.
    """
    cmd = ["/usr/local/bin/lighthouse", instance["url"], "--output", "json",
           "--quiet", "--chrome-flags='--headless'"]

    json_string, error_message, exit_code = get_subprocess_output(
        cmd, self.log, raise_on_empty_output=False)

    # Check for error since we have raise_on_empty_output set to False.
    if exit_code > 0:
        raise Exception(json_string, error_message, exit_code)

    try:
        data = json.loads(json_string)
        score_accessibility = data["categories"]["accessibility"]["score"] * 100
        score_best_practices = data["categories"]["best-practices"]["score"] * 100
        score_performance = data["categories"]["performance"]["score"] * 100
        score_pwa = data["categories"]["pwa"]["score"] * 100
        score_seo = data["categories"]["seo"]["score"] * 100
    except Exception:
        self.log.warning(
            "lighthouse response JSON structure different than expected")
        raise Exception(json_string, error_message, exit_code)

    # Add tags. Work on a copy: appending to the configured list itself
    # would make the instance's tags grow on every run of the check.
    try:
        configured_tags = instance['tags']
    except KeyError:
        tags = []
    else:
        if isinstance(configured_tags, list):
            tags = list(configured_tags)
        else:
            self.log.warning(
                'The tags list in the lighthouse check is not configured properly')
            tags = []

    tags.append("lighthouse_url:%s" % instance['url'])
    tags.append("lighthouse_name:%s" % instance['name'])

    self.gauge("custom_lighthouse.accessibility", score_accessibility, tags=tags)
    self.gauge("custom_lighthouse.best_practices", score_best_practices, tags=tags)
    self.gauge("custom_lighthouse.performance", score_performance, tags=tags)
    self.gauge("custom_lighthouse.pwa", score_pwa, tags=tags)
    self.gauge("custom_lighthouse.seo", score_seo, tags=tags)
def _check_bsd(self, instance):
    """Collect BSD/macOS interface and TCP metrics from netstat.

    Interface counters come from ``netstat -i -b`` (plus ``-W`` on FreeBSD);
    only the first row of each interface is used and interfaces with no
    bytes received or sent are skipped. TCP counters come from
    ``netstat -s -ptcp`` and are matched against ``BSD_TCP_METRICS``.

    :param instance: check instance configuration; the optional ``tags``
        entry is attached to every submitted metric.
    :returns: ``False`` when an expected column header is missing from the
        interface table; otherwise nothing is returned explicitly.
    """
    netstat_flags = ['-i', '-b']
    custom_tags = instance.get('tags', [])

    # FreeBSD's netstat truncates device names unless you pass '-W'
    if Platform.is_freebsd():
        netstat_flags.append('-W')

    try:
        output, _, _ = get_subprocess_output(["netstat"] + netstat_flags, self.log)
        lines = output.splitlines()
        # Name  Mtu   Network       Address            Ipkts Ierrs     Ibytes    Opkts Oerrs     Obytes  Coll
        # lo0   16384 <Link#1>                        318258     0  428252203   318258     0  428252203     0
        # lo0   16384 localhost   fe80:1::1           318258     -  428252203   318258     -  428252203     -
        # lo0   16384 127           localhost         318258     -  428252203   318258     -  428252203     -
        # lo0   16384 localhost   ::1                 318258     -  428252203   318258     -  428252203     -
        # gif0* 1280  <Link#2>                             0     0          0        0     0          0     0
        # stf0* 1280  <Link#3>                             0     0          0        0     0          0     0
        # en0   1500  <Link#4>    04:0c:ce:db:4e:fa 20801309     0 13835457425 15149389     0 11508790198     0
        # en0   1500  seneca.loca fe80:4::60c:ceff: 20801309     - 13835457425 15149389     - 11508790198     -
        # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
        # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
        # en0   1500  192.168.1     192.168.1.63    20801309     - 13835457425 15149389     - 11508790198     -
        # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
        # p2p0  2304  <Link#5>    06:0c:ce:db:4e:fa        0     0          0        0     0          0     0
        # ham0  1404  <Link#6>    7a:79:05:4d:bf:f5    30100     0    6815204    18742     0    8494811     0
        # ham0  1404  5             5.77.191.245       30100     -    6815204    18742     -    8494811     -
        # ham0  1404  seneca.loca fe80:6::7879:5ff:    30100     -    6815204    18742     -    8494811     -
        # ham0  1404  2620:9b::54 2620:9b::54d:bff5    30100     -    6815204    18742     -    8494811     -

        headers = lines[0].split()

        # Given the irregular structure of the table above, better to parse from the end of each line
        # Verify headers first
        # The negative index used for each column below:
        #          -7       -6       -5        -4       -3       -2        -1
        for h in ("Ipkts", "Ierrs", "Ibytes", "Opkts", "Oerrs", "Obytes", "Coll"):
            if h not in headers:
                self.log.error("%s not found in %s; cannot parse" % (h, headers))
                return False

        # Name of the interface handled last, used to skip its repeated rows.
        current = None
        for l in lines[1:]:
            # Another header row, abort now, this is IPv6 land
            if "Name" in l:
                break

            x = l.split()
            # An empty line also ends the table.
            if len(x) == 0:
                break

            iface = x[0]
            # Drop the trailing "*" some interface names carry (see gif0* above).
            if iface.endswith("*"):
                iface = iface[:-1]
            if iface == current:
                # skip multiple lines of same interface
                continue
            else:
                current = iface

            # Filter inactive interfaces
            if self._parse_value(x[-5]) or self._parse_value(x[-2]):
                iface = current
                metrics = {
                    'bytes_rcvd': self._parse_value(x[-5]),
                    'bytes_sent': self._parse_value(x[-2]),
                    'packets_in.count': self._parse_value(x[-7]),
                    'packets_in.error': self._parse_value(x[-6]),
                    'packets_out.count': self._parse_value(x[-4]),
                    'packets_out.error': self._parse_value(x[-3]),
                }
                self._submit_devicemetrics(iface, metrics, custom_tags)
    except SubprocessOutputEmptyError:
        self.log.exception("Error collecting connection stats.")

    try:
        # The adjacent literals "-p" "tcp" are concatenated into the single
        # argument "-ptcp".
        netstat, _, _ = get_subprocess_output(["netstat", "-s", "-p" "tcp"], self.log)
        # 3651535 packets sent
        #         972097 data packets (615753248 bytes)
        #         5009 data packets (2832232 bytes) retransmitted
        #         0 resends initiated by MTU discovery
        #         2086952 ack-only packets (471 delayed)
        #         0 URG only packets
        #         0 window probe packets
        #         310851 window update packets
        #         336829 control packets
        #         0 data packets sent after flow control
        #         3058232 checksummed in software
        #         3058232 segments (571218834 bytes) over IPv4
        #         0 segments (0 bytes) over IPv6
        # 4807551 packets received
        #         1143534 acks (for 616095538 bytes)
        #         165400 duplicate acks
        #         ...

        self._submit_regexed_values(netstat, BSD_TCP_METRICS, custom_tags)
    except SubprocessOutputEmptyError:
        self.log.exception("Error collecting TCP stats.")
def _check_linux(self, instance):
    """
    _check_linux can be run inside a container and still collects the network metrics from the host
    For that procfs_path can be set to something like "/host/proc"
    When a custom procfs_path is set, the collect_connection_state option is ignored

    Three groups of metrics are collected:

    * connection state gauges from ``ss`` (falling back to ``netstat`` when
      running ``ss`` raises ``OSError``), only when
      ``_is_collect_cx_state_runnable`` allows it;
    * per-interface counters from ``<procfs>/net/dev``;
    * selected Tcp, TcpExt and Udp counters from ``<procfs>/net/netstat``
      and ``<procfs>/net/snmp``.

    :param instance: check instance configuration; the optional ``tags``
        entry is attached to every submitted metric.
    """
    proc_location = self.agentConfig.get('procfs_path', '/proc').rstrip('/')
    custom_tags = instance.get('tags', [])

    # With a custom procfs inside a container, read the net files of PID 1.
    if Platform.is_containerized() and proc_location != "/proc":
        proc_location = "%s/1" % proc_location

    if self._is_collect_cx_state_runnable(proc_location):
        try:
            self.log.debug("Using `ss` to collect connection state")
            # Try using `ss` for increased performance over `netstat`
            for ip_version in ['4', '6']:
                for protocol in ['tcp', 'udp']:
                    # Call `ss` for each IP version because there's no built-in way of distinguishing
                    # between the IP versions in the output
                    # Also calls `ss` for each protocol, because on some systems (e.g. Ubuntu 14.04), there is a
                    # bug that print `tcp` even if it's `udp`
                    output, _, _ = get_subprocess_output(["ss", "-n", "-{0}".format(protocol[0]),
                                                          "-a", "-{0}".format(ip_version)], self.log)
                    lines = output.splitlines()

                    # State      Recv-Q Send-Q     Local Address:Port       Peer Address:Port
                    # UNCONN     0      0              127.0.0.1:8125                  *:*
                    # ESTAB      0      0              127.0.0.1:37036         127.0.0.1:8125
                    # UNCONN     0      0        fe80::a00:27ff:fe1c:3c4:123          :::*
                    # TIME-WAIT  0      0          90.56.111.177:56867        46.105.75.4:143
                    # LISTEN     0      0       ::ffff:127.0.0.1:33217  ::ffff:127.0.0.1:7199
                    # ESTAB      0      0       ::ffff:127.0.0.1:58975  ::ffff:127.0.0.1:2181

                    # lines[1:] skips the header row shown above.
                    metrics = self._parse_linux_cx_state(lines[1:], self.tcp_states['ss'], 0, protocol=protocol,
                                                         ip_version=ip_version)
                    # Only send the metrics which match the loop iteration's ip version
                    for stat, metric in self.cx_state_gauge.iteritems():
                        if stat[0].endswith(ip_version) and stat[0].startswith(protocol):
                            self.gauge(metric, metrics.get(metric), tags=custom_tags)

        except OSError:
            self.log.info("`ss` not found: using `netstat` as a fallback")
            output, _, _ = get_subprocess_output(["netstat", "-n", "-u", "-t", "-a"], self.log)
            lines = output.splitlines()
            # Active Internet connections (w/o servers)
            # Proto Recv-Q Send-Q Local Address           Foreign Address         State
            # tcp        0      0 46.105.75.4:80          79.220.227.193:2032     SYN_RECV
            # tcp        0      0 46.105.75.4:143         90.56.111.177:56867     ESTABLISHED
            # tcp        0      0 46.105.75.4:50468       107.20.207.175:443      TIME_WAIT
            # tcp6       0      0 46.105.75.4:80          93.15.237.188:58038     FIN_WAIT2
            # tcp6       0      0 46.105.75.4:80          79.220.227.193:2029     ESTABLISHED
            # udp        0      0 0.0.0.0:123             0.0.0.0:*
            # udp6       0      0 :::41458                :::*

            # lines[2:] skips the two header rows shown above.
            metrics = self._parse_linux_cx_state(lines[2:], self.tcp_states['netstat'], 5)
            for metric, value in metrics.iteritems():
                self.gauge(metric, value, tags=custom_tags)
        except SubprocessOutputEmptyError:
            self.log.exception("Error collecting connection stats.")

    proc_dev_path = "{}/net/dev".format(proc_location)
    with open(proc_dev_path, 'r') as proc:
        lines = proc.readlines()
    # Inter-|   Receive                                                 |  Transmit
    #  face |bytes     packets errs drop fifo frame compressed multicast|bytes       packets errs drop fifo colls carrier compressed # noqa: E501
    #     lo:45890956   112797   0    0    0     0          0         0    45890956   112797    0    0    0     0       0          0 # noqa: E501
    #   eth0:631947052 1042233   0   19    0   184          0      1206  1208625538  1320529    0    0    0     0       0          0 # noqa: E501
    #   eth1:       0        0   0    0    0     0          0         0           0        0    0    0    0     0       0          0 # noqa: E501
    # lines[2:] skips the two header rows shown above.
    for l in lines[2:]:
        cols = l.split(':', 1)
        x = cols[1].split()
        # Filter inactive interfaces
        if self._parse_value(x[0]) or self._parse_value(x[8]):
            iface = cols[0].strip()
            # Error counts add the "errs" and "drop" columns of each direction.
            metrics = {
                'bytes_rcvd': self._parse_value(x[0]),
                'bytes_sent': self._parse_value(x[8]),
                'packets_in.count': self._parse_value(x[1]),
                'packets_in.error': self._parse_value(x[2]) + self._parse_value(x[3]),
                'packets_out.count': self._parse_value(x[9]),
                'packets_out.error': self._parse_value(x[10]) + self._parse_value(x[11]),
            }
            self._submit_devicemetrics(iface, metrics, custom_tags)

    # Maps category (e.g. "Tcp") -> {counter name: raw string value}.
    netstat_data = {}
    for f in ['netstat', 'snmp']:
        proc_data_path = "{}/net/{}".format(proc_location, f)
        try:
            with open(proc_data_path, 'r') as netstat:
                # The file is read as pairs of lines: a header line with the
                # counter names followed by a line with their values.
                while True:
                    n_header = netstat.readline()
                    if not n_header:
                        break  # No more? Abort!
                    n_data = netstat.readline()

                    h_parts = n_header.strip().split(' ')
                    h_values = n_data.strip().split(' ')
                    # The first field is the category followed by one
                    # character that is dropped here.
                    ns_category = h_parts[0][:-1]
                    netstat_data[ns_category] = {}
                    # Turn the data into a dictionary
                    for idx, hpart in enumerate(h_parts[1:]):
                        netstat_data[ns_category][hpart] = h_values[idx + 1]
        except IOError:
            # On Openshift, /proc/net/snmp is only readable by root
            self.log.debug("Unable to read %s.", proc_data_path)

    # Counters to submit, keyed by category then counter name.
    nstat_metrics_names = {
        'Tcp': {
            'RetransSegs': 'system.net.tcp.retrans_segs',
            'InSegs': 'system.net.tcp.in_segs',
            'OutSegs': 'system.net.tcp.out_segs',
        },
        'TcpExt': {
            'ListenOverflows': 'system.net.tcp.listen_overflows',
            'ListenDrops': 'system.net.tcp.listen_drops',
            'TCPBacklogDrop': 'system.net.tcp.backlog_drops',
            'TCPRetransFail': 'system.net.tcp.failed_retransmits',
        },
        'Udp': {
            'InDatagrams': 'system.net.udp.in_datagrams',
            'NoPorts': 'system.net.udp.no_ports',
            'InErrors': 'system.net.udp.in_errors',
            'OutDatagrams': 'system.net.udp.out_datagrams',
            'RcvbufErrors': 'system.net.udp.rcv_buf_errors',
            'SndbufErrors': 'system.net.udp.snd_buf_errors',
            'InCsumErrors': 'system.net.udp.in_csum_errors'
        }
    }

    # Submit every listed counter that was actually found in the proc files.
    for k in nstat_metrics_names:
        for met in nstat_metrics_names[k]:
            if met in netstat_data.get(k, {}):
                self._submit_netmetric(nstat_metrics_names[k][met], self._parse_value(netstat_data[k][met]),
                                       tags=custom_tags)
def check(self, instance):
    """Run ``nodetool status`` for each configured keyspace and submit metrics.

    For every keyspace the ownership of each node is submitted, along with
    per-datacenter replication availability and replication factor. The
    node-level service check, status and load are submitted only for the
    first keyspace whose nodetool call succeeds.

    :param instance: instance configuration. Read keys: ``nodetool``,
        ``host``, ``port``, ``keyspaces``, ``username``, ``password``,
        ``ssl`` and ``tags``. Nothing is submitted when ``keyspaces`` is
        empty.
    """
    # Allow to specify a complete command for nodetool such as `docker exec container nodetool`
    nodetool_cmd = shlex.split(instance.get("nodetool", self.nodetool_cmd))
    host = instance.get("host", DEFAULT_HOST)
    port = instance.get("port", DEFAULT_PORT)
    keyspaces = instance.get("keyspaces", [])
    username = instance.get("username", "")
    password = instance.get("password", "")
    ssl = instance.get("ssl", False)
    tags = instance.get("tags", [])

    # Flag to send service checks only once and not for every keyspace
    send_service_checks = True

    if not keyspaces:
        self.log.info(
            "No keyspaces set in the configuration: no metrics will be sent"
        )

    for keyspace in keyspaces:
        # Build the nodetool command
        cmd = nodetool_cmd + ['-h', host, '-p', str(port)]
        # Credentials are only passed when both are set.
        if username and password:
            cmd += ['-u', username, '-pw', password]
        # add ssl if requested
        if ssl:
            cmd += ['--ssl']
        cmd += ['status', '--', keyspace]

        # Execute the command
        out, err, _ = get_subprocess_output(cmd, self.log, False, log_debug=False)
        # Any stderr content or an "Error:" marker skips this keyspace.
        if err or 'Error:' in out:
            self.log.error('Error executing nodetool status: %s', err or out)
            continue
        nodes = self._process_nodetool_output(out)

        # Sum of "owns" per datacenter: for nodes that are up, and for all nodes.
        percent_up_by_dc = defaultdict(float)
        percent_total_by_dc = defaultdict(float)
        # Send the stats per node and compute the stats per datacenter
        for node in nodes:

            node_tags = [
                'node_address:%s' % node['address'],
                'node_id:%s' % node['id'],
                'datacenter:%s' % node['datacenter'],
                'rack:%s' % node['rack']
            ]

            # nodetool prints `?` when it can't compute the value of `owns` for certain keyspaces (e.g. system)
            # don't send metric in this case
            if node['owns'] != '?':
                owns = float(node['owns'])
                if node['status'] == 'U':
                    percent_up_by_dc[node['datacenter']] += owns
                percent_total_by_dc[node['datacenter']] += owns
                self.gauge('cassandra.nodetool.status.owns',
                           owns,
                           tags=tags + node_tags +
                           ['keyspace:%s' % keyspace])

            # Send service check only once for each node
            # NOTE(review): the status and load gauges are assumed to belong
            # to this guarded group as well - confirm against the original
            # layout.
            if send_service_checks:
                status = AgentCheck.OK if node[
                    'status'] == 'U' else AgentCheck.CRITICAL
                self.service_check('cassandra.nodetool.node_up', status,
                                   tags + node_tags)

                self.gauge('cassandra.nodetool.status.status',
                           1 if node['status'] == 'U' else 0,
                           tags=tags + node_tags)
                self.gauge('cassandra.nodetool.status.load',
                           float(node['load']) * TO_BYTES[node['load_unit']],
                           tags=tags + node_tags)

        # All service checks have been sent, don't resend
        send_service_checks = False

        # Send the stats per datacenter
        for datacenter, percent_up in percent_up_by_dc.items():
            self.gauge(
                'cassandra.nodetool.status.replication_availability',
                percent_up,
                tags=tags +
                ['keyspace:%s' % keyspace,
                 'datacenter:%s' % datacenter])
        # The replication factor is the total ownership divided by 100, rounded.
        for datacenter, percent_total in percent_total_by_dc.items():
            self.gauge(
                'cassandra.nodetool.status.replication_factor',
                int(round(percent_total / 100)),
                tags=tags +
                ['keyspace:%s' % keyspace,
                 'datacenter:%s' % datacenter])
def _get_lighthouse_report(command, logger, raise_on_empty=False):
    """Run ``command`` and return its ``(stdout, stderr, return code)``.

    Thin wrapper around ``get_subprocess_output``.

    :param command: command to execute.
    :param logger: logger handed to ``get_subprocess_output``.
    :param raise_on_empty: forwarded as ``raise_on_empty_output``.
    """
    report, stderr_text, status = get_subprocess_output(
        command, logger, raise_on_empty_output=raise_on_empty)
    return report, stderr_text, status
def check(self, instance):
    """Collect varnishstat metrics and, optionally, varnishadm service checks.

    :param instance: instance configuration. ``varnishstat`` is required;
        ``tags``, ``name``, ``metrics_filter``, ``varnishadm``,
        ``secretfile``, ``daemon_host`` and ``daemon_port`` are optional.
    :raises Exception: when ``varnishstat`` is not configured or when
        ``metrics_filter`` is not a list.
    """
    # varnishstat is mandatory: without it there is nothing to collect.
    if instance.get("varnishstat", None) is None:
        raise Exception("varnishstat is not configured")

    # De-duplicate the configured tags.
    custom_tags = instance.get('tags', [])
    if custom_tags is None:
        custom_tags = []
    else:
        custom_tags = list(set(custom_tags))

    # Split the varnishstat command so that additional arguments can be passed in
    # In order to support monitoring a Varnish instance which is running as a Docker
    # container we need to wrap commands (varnishstat, varnishadm) with scripts which
    # perform a docker exec on the running container. This works fine when running a
    # single container on the host but breaks down when attempting to use the auto
    # discovery feature. This change allows for passing in additional parameters to
    # the script (i.e. %%host%%) so that the command is properly formatted and the
    # desired container is queried.
    varnishstat_path = shlex.split(instance.get("varnishstat"))
    name = instance.get('name')
    metrics_filter = instance.get("metrics_filter", [])
    if not isinstance(metrics_filter, list):
        raise Exception("The parameter 'metrics_filter' must be a list")

    # Get version and version-specific args from varnishstat -V.
    version, varnishstat_format = self._get_version_info(varnishstat_path)

    cmd = varnishstat_path + [
        self.VARNISHSTAT_FORMAT_OPTION[varnishstat_format]
    ]
    for metric in metrics_filter:
        cmd.extend(["-f", metric])

    if name is not None:
        cmd.extend(['-n', name])
        tags = custom_tags + [u'varnish_name:%s' % name]
    else:
        tags = custom_tags + [u'varnish_name:default']

    output, _, _ = get_subprocess_output(cmd, self.log)

    self._parse_varnishstat(output, varnishstat_format, tags)

    # Parse service checks from varnishadm.
    if instance.get("varnishadm", None):
        # Split the varnishadm command so that additional arguments can be
        # passed in, for the same reason as the varnishstat command above.
        varnishadm_path = shlex.split(instance.get('varnishadm'))
        secretfile_path = instance.get('secretfile', '/etc/varnish/secret')

        daemon_host = instance.get('daemon_host', 'localhost')
        daemon_port = instance.get('daemon_port', '6082')

        cmd = []
        # varnishadm is run through sudo unless already running as root.
        if geteuid() != 0:
            cmd.append('sudo')

        if version < LooseVersion('4.1.0'):
            cmd.extend(varnishadm_path +
                       ['-S', secretfile_path, 'debug.health'])
        else:
            cmd.extend(varnishadm_path + [
                '-T', '{}:{}'.format(daemon_host, daemon_port),
                '-S', secretfile_path,
                'backend.list', '-p'
            ])

        # Start from a clean state so both names are bound even when the
        # command cannot be started at all.
        output, err = None, None
        try:
            output, err, _ = get_subprocess_output(cmd, self.log)
        except OSError as e:
            self.log.error(
                "There was an error running varnishadm. Make sure 'sudo' is available. %s",
                e)
            output = None

        if err:
            self.log.error(
                'Error getting service check from varnishadm: %s', err)

        if output:
            self._parse_varnishadm(output, custom_tags)