def testNetwork(self):
    # FIXME: cx_state to true, but needs sysstat installed
    config = """
init_config:

instances:
    - collect_connection_state: false
      excluded_interfaces:
        - lo
        - lo0
"""
    check, instances = get_check("network", config)
    check.check(instances[0])
    check.get_metrics()

    metric_names = [m[0] for m in check.aggregator.metrics]

    assert "system.net.bytes_rcvd" in metric_names
    assert "system.net.bytes_sent" in metric_names
    if Platform.is_linux():
        assert "system.net.tcp.retrans_segs" in metric_names
        assert "system.net.tcp.in_segs" in metric_names
        assert "system.net.tcp.out_segs" in metric_names
    elif Platform.is_bsd():
        assert "system.net.tcp.retrans_packs" in metric_names
        assert "system.net.tcp.sent_packs" in metric_names
        assert "system.net.tcp.rcv_packs" in metric_names
def collect_metrics_psutil(self):
    self._valid_disks = {}
    for part in psutil.disk_partitions(all=True):
        # we check all exclude conditions
        if self._exclude_disk_psutil(part):
            continue

        # Get disk metrics here to be able to exclude on total usage
        try:
            disk_usage = timeout(5)(psutil.disk_usage)(part.mountpoint)
        except TimeoutException:
            self.log.warn(
                u"Timeout while retrieving the disk usage of `%s` mountpoint. Skipping...",
                part.mountpoint)
            continue
        except Exception as e:
            self.log.warn("Unable to get disk metrics for %s: %s", part.mountpoint, e)
            continue

        # Exclude disks with total disk size 0
        if disk_usage.total == 0:
            continue

        # For later, latency metrics
        self._valid_disks[part.device] = (part.fstype, part.mountpoint)
        self.log.debug('Passed: {0}'.format(part.device))

        tags = [part.fstype] if self._tag_by_filesystem else []
        device_name = part.mountpoint if self._use_mount else part.device

        # Note: psutil (0.3.0 to at least 3.1.1) calculates in_use as (used / total)
        # The problem here is that total includes reserved space the user
        # doesn't have access to. This causes psutil to calculate a misleading
        # percentage for in_use; a lower percentage than df shows.
        # Calculate in_use w/o reserved space; consistent w/ df's Use% metric.
        pmets = self._collect_part_metrics(part, disk_usage)
        used = 'system.disk.used'
        free = 'system.disk.free'
        pmets['system.disk.in_use'] = pmets[used] / (pmets[used] + pmets[free])

        # legacy check names c: vs psutil name C:\\
        if Platform.is_win32():
            device_name = device_name.strip('\\').lower()

        for metric_name, metric_value in pmets.iteritems():
            self.gauge(metric_name, metric_value, tags=tags, device_name=device_name)

    # And finally, latency metrics, a legacy gift from the old Windows Check
    if Platform.is_win32():
        self.collect_latency_metrics()
def parse_df_output(self, df_output, platform_name, inodes=False, use_mount=False,
                    blacklist_re=None):
    """
    Parse the output of the df command.

    If use_mount is true, the mount point is used to anchor the metric;
    otherwise the volume name is used. Returns the parsed usage rows.
    """
    usage_data = []

    # Transform the raw output into tuples of the df data.
    devices = self._transform_df_output(df_output, blacklist_re)

    # If we want to use the mount point, replace the volume name on each
    # line.
    for parts in devices:
        try:
            if use_mount:
                parts[0] = parts[-1]
            if inodes:
                if Platform.is_darwin(platform_name):
                    # Filesystem 512-blocks Used Available Capacity iused ifree %iused Mounted
                    # Inodes are in position 5, 6 and we need to compute the total
                    parts[1] = int(parts[5]) + int(parts[6])  # Total
                    parts[2] = int(parts[5])  # Used
                    parts[3] = int(parts[6])  # Available
                elif Platform.is_freebsd(platform_name):
                    # Filesystem 1K-blocks Used Avail Capacity iused ifree %iused Mounted
                    # Inodes are in position 5, 6 and we need to compute the total
                    parts[1] = int(parts[5]) + int(parts[6])  # Total
                    parts[2] = int(parts[5])  # Used
                    parts[3] = int(parts[6])  # Available
                else:
                    parts[1] = int(parts[1])  # Total
                    parts[2] = int(parts[2])  # Used
                    parts[3] = int(parts[3])  # Available
            else:
                parts[1] = int(parts[1])  # Total
                parts[2] = int(parts[2])  # Used
                parts[3] = int(parts[3])  # Available
        except IndexError:
            self.logger.exception("Cannot parse %s" % (parts,))
        usage_data.append(parts)

    return usage_data
def collect_metrics_psutil(self):
    self._valid_disks = {}
    for part in psutil.disk_partitions(all=True):
        if self._exclude_disk_psutil(part):
            continue
        try:
            disk_usage = psutil.disk_usage(part.mountpoint)
        except Exception as e:
            self.log.debug("Unable to get disk metrics for %s: %s", part.mountpoint, e)
            continue
        if disk_usage.total == 0:
            continue

        self._valid_disks[part.device] = (part.fstype, part.mountpoint)
        self.log.debug('Passed: {0}'.format(part.device))

        tags = [part.fstype] if self._tag_by_filesystem else []
        device_name = part.mountpoint if self._use_mount else part.device

        pmets = self._collect_part_metrics(part, disk_usage)
        used = 'system.disk.used'
        free = 'system.disk.free'
        pmets['system.disk.pct_usage'] = (pmets[used] / (pmets[used] + pmets[free])) * 100

        if Platform.is_win32():
            device_name = device_name.strip('\\').lower()

        for metric_name, metric_value in pmets.iteritems():
            self.gauge(metric_name, metric_value, tags=tags, device_name=device_name)
def check(self, instance):
    """ Collect metrics for the given gunicorn instance. """
    self.log.debug("Running instance: %s", instance)

    if Platform.is_linux():
        procfs_path = self.agentConfig.get('procfs_path', '/proc').rstrip('/')
        psutil.PROCFS_PATH = procfs_path

    # Validate the config.
    if not instance or self.PROC_NAME not in instance:
        raise GUnicornCheckError("instance must specify: %s" % self.PROC_NAME)

    # Load the gunicorn master procedure.
    proc_name = instance.get(self.PROC_NAME)
    master_proc = self._get_master_proc_by_name(proc_name)

    # Fetch the worker procs and count their states.
    worker_procs = master_proc.children()
    working, idle = self._count_workers(worker_procs)

    # if no workers are running, alert CRITICAL, otherwise OK
    msg = "%s working and %s idle workers for %s" % (working, idle, proc_name)
    status = AgentCheck.CRITICAL if working == 0 and idle == 0 else AgentCheck.OK
    self.service_check(self.SVC_NAME, status, tags=['app:' + proc_name], message=msg)

    # Submit the data.
    self.log.debug("instance %s procs - working:%s idle:%s" % (proc_name, working, idle))
    self.gauge("gunicorn.workers", working, self.WORKING_TAGS)
    self.gauge("gunicorn.workers", idle, self.IDLE_TAGS)
def _host_matches_node(self, primary_addrs):
    """ For < 0.19, check if the current host matches the IP given in the
    cluster nodes check `/_cluster/nodes`. Uses `ip addr` on Linux and
    `ifconfig` on Mac
    """
    if Platform.is_darwin():
        ifaces = subprocess.Popen(["ifconfig"], stdout=subprocess.PIPE)
    else:
        ifaces = subprocess.Popen(["ip", "addr"], stdout=subprocess.PIPE)
    grepper = subprocess.Popen(
        ["grep", "inet"], stdin=ifaces.stdout,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )

    ifaces.stdout.close()
    out, err = grepper.communicate()

    # Capture the list of interface IPs
    ips = []
    for iface in out.split("\n"):
        iface = iface.strip()
        if iface:
            ips.append(iface.split(" ")[1].split("/")[0])

    # Check the interface addresses against the primary address
    return primary_addrs in ips
def collect_metrics_psutil(self):
    self._valid_disks = {}
    for part in psutil.disk_partitions(all=True):
        # we check all exclude conditions
        if self._exclude_disk_psutil(part):
            continue

        # Get disk metrics here to be able to exclude on total usage
        try:
            disk_usage = psutil.disk_usage(part.mountpoint)
        except Exception as e:
            self.log.debug("Unable to get disk metrics for %s: %s", part.mountpoint, e)
            continue

        # Exclude disks with total disk size 0
        if disk_usage.total == 0:
            continue

        # For later, latency metrics
        self._valid_disks[part.device] = (part.fstype, part.mountpoint)
        self.log.debug('Passed: {0}'.format(part.device))

        tags = [part.fstype] if self._tag_by_filesystem else []
        device_name = part.mountpoint if self._use_mount else part.device

        # legacy check names c: vs psutil name C:\\
        if Platform.is_win32():
            device_name = device_name.strip('\\').lower()

        for metric_name, metric_value in self._collect_part_metrics(part, disk_usage).iteritems():
            self.gauge(metric_name, metric_value, tags=tags, device_name=device_name)
def _exclude_disk_psutil(self, part):
    # skip cd-rom drives with no disk in them; they may raise
    # ENOENT, pop up a Windows GUI error for a non-ready
    # partition or just hang;
    # and all the other excluded disks
    return ((Platform.is_win32() and ('cdrom' in part.opts or part.fstype == '')) or
            self._exclude_disk(part.device, part.fstype))
def _exclude_disk_psutil(self, part):
    # skip cd-rom drives with no disk in them; they may raise
    # ENOENT, pop up a Windows GUI error for a non-ready
    # partition or just hang;
    # and all the other excluded disks
    return ((Platform.is_win32() and ('cdrom' in part.opts or part.fstype == '')) or
            self._exclude_disk(part.device, part.fstype, part.mountpoint))
def _get_pickle_path(cls):
    if Platform.is_win32():
        path = os.path.join(_windows_commondata_path(), 'Datadog',
                            cls.__name__ + '.pickle')
    else:
        path = os.path.join(tempfile.gettempdir(), cls.__name__ + '.pickle')
    return path
def get_system_stats():
    systemStats = {
        'machine': platform.machine(),
        'platform': sys.platform,
        'processor': platform.processor(),
        'pythonV': platform.python_version(),
    }

    platf = sys.platform

    if Platform.is_linux(platf):
        grep = subprocess.Popen(['grep', 'model name', '/proc/cpuinfo'],
                                stdout=subprocess.PIPE, close_fds=True)
        wc = subprocess.Popen(['wc', '-l'], stdin=grep.stdout,
                              stdout=subprocess.PIPE, close_fds=True)
        systemStats['cpuCores'] = int(wc.communicate()[0])

    if Platform.is_darwin(platf):
        systemStats['cpuCores'] = int(
            subprocess.Popen(['sysctl', 'hw.ncpu'], stdout=subprocess.PIPE,
                             close_fds=True).communicate()[0].split(': ')[1])

    if Platform.is_freebsd(platf):
        systemStats['cpuCores'] = int(
            subprocess.Popen(['sysctl', 'hw.ncpu'], stdout=subprocess.PIPE,
                             close_fds=True).communicate()[0].split(': ')[1])

    if Platform.is_linux(platf):
        systemStats['nixV'] = platform.dist()
    elif Platform.is_darwin(platf):
        systemStats['macV'] = platform.mac_ver()
    elif Platform.is_freebsd(platf):
        version = platform.uname()[2]
        systemStats['fbsdV'] = ('freebsd', version, '')  # no codename for FreeBSD
    elif Platform.is_win32(platf):
        systemStats['winV'] = platform.win32_ver()

    return systemStats
def _save_logs_path(self):
    prefix = ''
    if Platform.is_windows():
        prefix = 'windows_'
    config = get_logging_config()
    self._collector_log = config.get('{0}collector_log_file'.format(prefix))
    self._forwarder_log = config.get('{0}forwarder_log_file'.format(prefix))
    self._dogstatsd_log = config.get('{0}dogstatsd_log_file'.format(prefix))
    self._jmxfetch_log = config.get('jmxfetch_log_file')
def test_collecting_disk_metrics(self):
    """Testing disk stats gathering"""
    if Platform.is_unix():
        disk = Disk(logger)
        res = disk.check({})
        # Assert we have disk & inode stats
        assert len(res) == 2
        assert res[0]
        assert res[1]
def check_user_rights():
    if Platform.is_unix() and not os.geteuid() == 0:
        log.warning("You are not root, some information won't be collected")
        choice = raw_input("Are you sure you want to continue [y/N]? ").lower()
        if choice not in ["yes", "y"]:
            print "Aborting"
            sys.exit(1)
        else:
            log.warn("Your user has to have at least read access"
                     " to the logs and conf files of the agent")
def _collect_part_metrics(self, part, usage):
    metrics = {}
    for name in ['total', 'used', 'free']:
        metrics[self.METRIC_DISK.format(name)] = getattr(usage, name) / 1024.0
    metrics[self.METRIC_DISK.format('pct_usage')] = usage.percent
    if Platform.is_unix():
        metrics.update(self._collect_inodes_metrics(part.mountpoint))

    return metrics
def _supervisor_status(self):
    if Platform.is_windows():
        print "Windows - status not implemented"
    else:
        agent_exec = self._get_path_agent_exec()
        print "{0} status".format(agent_exec)
        self._print_output_command([agent_exec, "status"])
        supervisor_exec = self._get_path_supervisor_exec()
        print "{0} status".format(supervisor_exec)
        self._print_output_command([supervisor_exec, "-c",
                                    self._get_path_supervisor_conf(), "status"])
def check(self, instance):
    if instance is None:
        instance = {}

    self._excluded_ifaces = instance.get('excluded_interfaces', [])
    self._collect_cx_state = instance.get('collect_connection_state', False)

    self._exclude_iface_re = None
    exclude_re = instance.get('excluded_interface_re', None)
    if exclude_re:
        self.log.debug("Excluding network devices matching: %s" % exclude_re)
        self._exclude_iface_re = re.compile(exclude_re)

    if Platform.is_linux():
        self._check_linux(instance)
    elif Platform.is_bsd():
        self._check_bsd(instance)
    elif Platform.is_solaris():
        self._check_solaris(instance)
def collect_metrics_psutil(self):
    self._valid_disks = {}
    for part in psutil.disk_partitions(all=True):
        # we check all exclude conditions
        if self._exclude_disk_psutil(part):
            continue

        # Get disk metrics here to be able to exclude on total usage
        try:
            disk_usage = timeout(5)(psutil.disk_usage)(part.mountpoint)
        except TimeoutException:
            self.log.warn(
                u"Timeout while retrieving the disk usage of `%s` mountpoint. Skipping...",
                part.mountpoint)
            continue
        except Exception as e:
            self.log.warn("Unable to get disk metrics for %s: %s", part.mountpoint, e)
            continue

        # Exclude disks with total disk size 0
        if disk_usage.total == 0:
            continue

        # For later, latency metrics
        self._valid_disks[part.device] = (part.fstype, part.mountpoint)
        self.log.debug('Passed: {0}'.format(part.device))

        tags = [part.fstype, 'filesystem:{}'.format(part.fstype)] if self._tag_by_filesystem else []
        device_name = part.mountpoint if self._use_mount else part.device

        # legacy check names c: vs psutil name C:\\
        if Platform.is_win32():
            device_name = device_name.strip('\\').lower()

        for metric_name, metric_value in self._collect_part_metrics(part, disk_usage).iteritems():
            self.gauge(metric_name, metric_value, tags=tags, device_name=device_name)

    # And finally, latency metrics, a legacy gift from the old Windows Check
    if Platform.is_win32():
        self.collect_latency_metrics()
def testMemory(self):
    global logger
    res = Memory(logger).check({})
    if Platform.is_linux():
        for k in ("swapTotal", "swapFree", "swapPctFree", "swapUsed", "physTotal",
                  "physFree", "physUsed", "physBuffers", "physCached", "physUsable",
                  "physPctUsable", "physShared"):
            assert k in res, res

        assert res["swapTotal"] == res["swapFree"] + res["swapUsed"]
        assert res["physTotal"] == res["physFree"] + res["physUsed"]
    elif sys.platform == 'darwin':
        for k in ("swapFree", "swapUsed", "physFree", "physUsed"):
            assert k in res, res
def _collect_part_metrics(self, part, usage):
    metrics = {}
    for name in ['total', 'used', 'free']:
        # For legacy reasons, the standard unit is kB
        metrics[self.METRIC_DISK.format(name)] = getattr(usage, name) / 1024.0
    # FIXME: 6.x, use percent, a lot more logical than in_use
    metrics[self.METRIC_DISK.format('in_use')] = usage.percent / 100.0
    if Platform.is_unix():
        metrics.update(self._collect_inodes_metrics(part.mountpoint))

    return metrics
def collect_metrics_psutil(self):
    self._valid_disks = {}
    for part in psutil.disk_partitions(all=self._all_partitions):
        # we check all exclude conditions
        if self._exclude_disk_psutil(part):
            continue

        # For later, latency metrics
        self._valid_disks[part.device] = (part.fstype, part.mountpoint)
        self.log.debug('Passed: {0}'.format(part.device))

        tags = [part.fstype] if self._tag_by_filesystem else []
        device_name = part.mountpoint if self._use_mount else part.device

        # legacy check names c: vs psutil name C:\\
        if Platform.is_win32():
            device_name = device_name.strip('\\').lower()

        for metric_name, metric_value in self._collect_part_metrics(part).iteritems():
            self.gauge(metric_name, metric_value, tags=tags, device_name=device_name)

    # And finally, latency metrics, a legacy gift from the old Windows Check
    if Platform.is_win32():
        self.collect_latency_metrics()
def check(self, instance):
    host, port, user, password, mysql_sock, defaults_file, tags, options = self._get_config(instance)

    if (not host or not user) and not defaults_file:
        raise Exception("Mysql host and user are needed.")

    db = self._connect(host, port, mysql_sock, user, password, defaults_file)

    # Metric collection
    self._collect_metrics(host, db, tags, options)
    if Platform.is_linux():
        self._collect_system_metrics(host, db, tags)
def check(self, instance):
    host, port, user, password, mysql_sock, defaults_file, tags, options = self._get_config(instance)

    if (not host or not user) and not defaults_file:
        raise Exception("Mysql host and user are needed.")

    db = self._connect(host, port, mysql_sock, user, password, defaults_file)

    # Metric collection
    self._collect_metrics(host, db, tags, options)
    if Platform.is_unix():
        self._collect_system_metrics(host, db, tags)
def check(self, instance):
    """Get disk space/inode stats"""
    # Windows and Mac will always have psutil
    # (we have packaged for both of them)
    if self._psutil():
        if Platform.is_linux():
            procfs_path = self.agentConfig.get('procfs_path', '/proc').rstrip('/')
            psutil.PROCFS_PATH = procfs_path
        self.collect_metrics_psutil()
    else:
        # FIXME: implement all_partitions (df -a)
        self.collect_metrics_manually()
def collect_metrics_psutil(self):
    self._valid_disks = {}
    for part in psutil.disk_partitions(all=True):
        # we check all exclude conditions
        if self._exclude_disk_psutil(part):
            continue

        # Get disk metrics here to be able to exclude on total usage
        try:
            disk_usage = timeout(5)(psutil.disk_usage)(part.mountpoint)
        except TimeoutException:
            self.log.warn(
                u"Timeout while retrieving the disk usage of `%s` mountpoint. Skipping...",
                part.mountpoint
            )
            continue
        except Exception as e:
            self.log.warn("Unable to get disk metrics for %s: %s", part.mountpoint, e)
            continue

        # Exclude disks with total disk size 0
        if disk_usage.total == 0:
            continue

        # For later, latency metrics
        self._valid_disks[part.device] = (part.fstype, part.mountpoint)
        self.log.debug('Passed: {0}'.format(part.device))

        tags = [part.fstype] if self._tag_by_filesystem else []
        device_name = part.mountpoint if self._use_mount else part.device

        # legacy check names c: vs psutil name C:\\
        if Platform.is_win32():
            device_name = device_name.strip('\\').lower()

        for metric_name, metric_value in self._collect_part_metrics(part, disk_usage).iteritems():
            self.gauge(metric_name, metric_value, tags=tags, device_name=device_name)

    # And finally, latency metrics, a legacy gift from the old Windows Check
    if Platform.is_win32():
        self.collect_latency_metrics()
def _add_conf_tar(self):
    conf_path = get_config_path()
    log.info(" * {0}".format(conf_path))
    self._tar.add(self._strip_comment(conf_path),
                  os.path.join(self._prefix, "etc", "datadog.conf"))

    if not Platform.is_windows():
        supervisor_path = os.path.join(os.path.dirname(get_config_path()),
                                       "supervisor.conf")
        log.info(" * {0}".format(supervisor_path))
        self._tar.add(self._strip_comment(supervisor_path),
                      os.path.join(self._prefix, "etc", "supervisor.conf"))

    for file_path in glob.glob(os.path.join(get_confd_path(), "*.yaml")) + glob.glob(
            os.path.join(get_confd_path(), "*.yaml.default")):
        self._add_clean_confd(file_path)
def testLoad(self):
    global logger
    load = Load(logger)
    res = load.check({'system_stats': get_system_stats()})
    assert 'system.load.1' in res
    if Platform.is_linux():
        cores = int(get_system_stats().get('cpuCores'))
        assert 'system.load.norm.1' in res
        assert abs(res['system.load.1'] - cores * res['system.load.norm.1']) <= 0.1, \
            (res['system.load.1'], cores * res['system.load.norm.1'])

    # same test but without cpu count, no normalized load sent.
    res = load.check({})
    assert 'system.load.1' in res
    assert 'system.load.norm.1' not in res
def _add_conf_tar(self):
    conf_path = get_config_path()
    log.info(" * {0}".format(conf_path))
    self._tar.add(self._strip_comment(conf_path),
                  os.path.join(self._prefix, 'etc', 'datadog.conf'))

    if not Platform.is_windows():
        supervisor_path = os.path.join(os.path.dirname(get_config_path()),
                                       'supervisor.conf')
        log.info(" * {0}".format(supervisor_path))
        self._tar.add(self._strip_comment(supervisor_path),
                      os.path.join(self._prefix, 'etc', 'supervisor.conf'))

    for file_path in glob.glob(os.path.join(get_confd_path(), '*.yaml')):
        self._add_clean_confd(file_path)
def testMemory(self):
    global logger
    res = Memory(logger).check({})
    if Platform.is_linux():
        MEM_METRICS = ["swapTotal", "swapFree", "swapPctFree", "swapUsed", "physTotal",
                       "physFree", "physUsed", "physBuffers", "physCached", "physUsable",
                       "physPctUsable", "physShared"]
        for k in MEM_METRICS:
            # % metric is only here if total > 0
            if k == 'swapPctFree' and res['swapTotal'] == 0:
                continue
            assert k in res, res

        assert res["swapTotal"] == res["swapFree"] + res["swapUsed"]
        assert res["physTotal"] == res["physFree"] + res["physUsed"]
    elif sys.platform == 'darwin':
        for k in ("swapFree", "swapUsed", "physFree", "physUsed"):
            assert k in res, res
def pid_exists(pid):
    """
    Check if a pid exists.
    Lighter than psutil.pid_exists
    """
    if psutil:
        return psutil.pid_exists(pid)

    if Platform.is_windows():
        import ctypes  # Available from python2.5
        kernel32 = ctypes.windll.kernel32
        synchronize = 0x100000

        process = kernel32.OpenProcess(synchronize, 0, pid)
        if process != 0:
            kernel32.CloseHandle(process)
            return True
        else:
            return False

    # Code from psutil._psposix.pid_exists
    # See https://github.com/giampaolo/psutil/blob/master/psutil/_psposix.py
    if pid == 0:
        # According to "man 2 kill" PID 0 has a special meaning:
        # it refers to <<every process in the process group of the
        # calling process>> so we don't want to go any further.
        # If we get here it means this UNIX platform *does* have
        # a process with id 0.
        return True
    try:
        os.kill(pid, 0)
    except OSError as err:
        if err.errno == errno.ESRCH:
            # ESRCH == No such process
            return False
        elif err.errno == errno.EPERM:
            # EPERM clearly means there's a process to deny access to
            return True
        else:
            # According to "man 2 kill" possible error values are
            # (EINVAL, EPERM, ESRCH) therefore we should never get
            # here. If we do let's be explicit in considering this
            # an error.
            raise err
    else:
        return True
def _add_conf_tar(self):
    conf_path = get_config_path()
    if self._can_read(conf_path):
        self._tar.add(self._strip_comment(conf_path),
                      os.path.join(self._prefix, 'etc', 'datadog.conf'))

    if not Platform.is_windows():
        supervisor_path = os.path.join(os.path.dirname(get_config_path()),
                                       'supervisor.conf')
        if self._can_read(supervisor_path):
            self._tar.add(self._strip_comment(supervisor_path),
                          os.path.join(self._prefix, 'etc', 'supervisor.conf'))

    for file_path in glob.glob(os.path.join(get_confd_path(), '*.yaml')) +\
            glob.glob(os.path.join(get_confd_path(), '*.yaml.default')):
        if self._can_read(file_path, output=False):
            self._add_clean_confd(file_path)
def _load_conf(self, instance):
    self._excluded_filesystems = instance.get('excluded_filesystems', [])
    self._excluded_disks = instance.get('excluded_disks', [])
    self._tag_by_filesystem = _is_affirmative(
        instance.get('tag_by_filesystem', False))

    # On Windows, we need all_partitions to True by default to collect
    # metrics about remote disks
    # On Linux, we need all_partitions to False to avoid collecting metrics
    # about nodev filesystems
    self._all_partitions = _is_affirmative(
        instance.get('all_partitions', Platform.is_win32()))

    # FIXME: 6.x, drop use_mount option in datadog.conf
    self._load_legacy_option(instance, 'use_mount', False,
                             operation=_is_affirmative)
    # FIXME: 6.x, drop device_blacklist_re option in datadog.conf
    self._load_legacy_option(instance, 'excluded_disk_re', '^$',
                             legacy_name='device_blacklist_re',
                             operation=re.compile)
def check(self, agentConfig):
    if Platform.is_linux():
        try:
            loadAvrgProc = open('/proc/loadavg', 'r')
            uptime = loadAvrgProc.readlines()
            loadAvrgProc.close()
        except Exception:
            self.logger.exception('Cannot extract load')
            return False

        uptime = uptime[0]  # readlines() provides a list but we want a string

    elif sys.platform in ('darwin', 'sunos5') or sys.platform.startswith("freebsd"):
        # Get output from uptime
        try:
            uptime = sp.Popen(['uptime'], stdout=sp.PIPE,
                              close_fds=True).communicate()[0]
        except Exception:
            self.logger.exception('Cannot extract load')
            return False

    # Split out the 3 load average values
    load = [res.replace(',', '.') for res in re.findall(r'([0-9]+[\.,]\d+)', uptime)]

    # Normalize load by number of cores
    try:
        cores = int(agentConfig.get('system_stats').get('cpuCores'))
        assert cores >= 1, "Cannot determine number of cores"
        # Compute a normalized load, named .load.norm to make it easy to find next to .load
        return {'system.load.1': float(load[0]),
                'system.load.5': float(load[1]),
                'system.load.15': float(load[2]),
                'system.load.norm.1': float(load[0]) / cores,
                'system.load.norm.5': float(load[1]) / cores,
                'system.load.norm.15': float(load[2]) / cores,
                }
    except Exception:
        # No normalized load available
        return {'system.load.1': float(load[0]),
                'system.load.5': float(load[1]),
                'system.load.15': float(load[2])}
def collect_metrics_psutil(self):
    self._valid_disks = {}
    for part in psutil.disk_partitions(all=True):
        # we check all exclude conditions
        if self._exclude_disk_psutil(part):
            continue

        # Get disk metrics here to be able to exclude on total usage
        try:
            disk_usage = psutil.disk_usage(part.mountpoint)
        except Exception as e:
            self.log.debug("Unable to get disk metrics for %s: %s", part.mountpoint, e)
            continue

        # Exclude disks with total disk size 0
        if disk_usage.total == 0:
            continue

        # For later, latency metrics
        self._valid_disks[part.device] = (part.fstype, part.mountpoint)
        self.log.debug("Passed: {0}".format(part.device))

        tags = [part.fstype] if self._tag_by_filesystem else []
        device_name = part.mountpoint if self._use_mount else part.device

        # Note: psutil (0.3.0 to at least 3.1.1) calculates in_use as (used / total)
        # The problem here is that total includes reserved space the user
        # doesn't have access to. This causes psutil to calculate a misleading
        # percentage for in_use; a lower percentage than df shows.
        # Calculate in_use w/o reserved space; consistent w/ df's Use% metric.
        pmets = self._collect_part_metrics(part, disk_usage)
        used = "system.disk.used"
        free = "system.disk.free"
        pmets["system.disk.in_use"] = pmets[used] / (pmets[used] + pmets[free])

        # legacy check names c: vs psutil name C:\\
        if Platform.is_win32():
            device_name = device_name.strip("\\").lower()

        for metric_name, metric_value in pmets.iteritems():
            self.gauge(metric_name, metric_value, tags=tags, device_name=device_name)
class Disk(AgentCheck):
    """ Collects metrics about the machine's disks. """
    # -T for filesystem info
    DF_COMMAND = ['df', '-T']
    METRIC_DISK = 'system.disk.{0}'
    METRIC_INODE = 'system.fs.inodes.{0}'

    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception("Disk check only supports one configured instance.")
        AgentCheck.__init__(self, name, init_config,
                            agentConfig, instances=instances)
        # Get the configuration once for all
        self._load_conf(instances[0])

    def check(self, instance):
        """Get disk space/inode stats"""
        # Windows and Mac will always have psutil
        # (we have packaged for both of them)
        if self._psutil():
            self.collect_metrics_psutil()
        else:
            # FIXME: implement all_partitions (df -a)
            self.collect_metrics_manually()

    @classmethod
    def _psutil(cls):
        return psutil is not None

    def _load_conf(self, instance):
        self._excluded_filesystems = instance.get('excluded_filesystems', [])
        self._excluded_disks = instance.get('excluded_disks', [])
        self._tag_by_filesystem = _is_affirmative(
            instance.get('tag_by_filesystem', False))
        self._all_partitions = _is_affirmative(
            instance.get('all_partitions', False))

        # Force exclusion of CDROM (iso9660) from disk check
        self._excluded_filesystems.append('iso9660')

        # FIXME: 6.x, drop use_mount option in datadog.conf
        self._load_legacy_option(instance, 'use_mount', False,
                                 operation=_is_affirmative)
        # FIXME: 6.x, drop device_blacklist_re option in datadog.conf
        self._load_legacy_option(instance, 'excluded_disk_re', '^$',
                                 legacy_name='device_blacklist_re',
                                 operation=re.compile)

    def _load_legacy_option(self, instance, option, default,
                            legacy_name=None, operation=lambda l: l):
        value = instance.get(option, default)
        legacy_name = legacy_name or option

        if value == default and legacy_name in self.agentConfig:
            self.log.warn("Using `{0}` in datadog.conf has been deprecated"
                          " in favor of `{1}` in disk.yaml".format(legacy_name, option))
            value = self.agentConfig.get(legacy_name) or default
        setattr(self, '_{0}'.format(option), operation(value))

    def collect_metrics_psutil(self):
        self._valid_disks = {}
        for part in psutil.disk_partitions(all=True):
            # we check all exclude conditions
            if self._exclude_disk_psutil(part):
                continue

            # Get disk metrics here to be able to exclude on total usage
            try:
                disk_usage = psutil.disk_usage(part.mountpoint)
            except Exception as e:
                self.log.debug("Unable to get disk metrics for %s: %s",
                               part.mountpoint, e)
                continue

            # Exclude disks with total disk size 0
            if disk_usage.total == 0:
                continue

            # For later, latency metrics
            self._valid_disks[part.device] = (part.fstype, part.mountpoint)
            self.log.debug('Passed: {0}'.format(part.device))

            tags = [part.fstype] if self._tag_by_filesystem else []
            device_name = part.mountpoint if self._use_mount else part.device

            # Note: psutil (0.3.0 to at least 3.1.1) calculates in_use as (used / total)
            # The problem here is that total includes reserved space the user
            # doesn't have access to. This causes psutil to calculate a misleading
            # percentage for in_use; a lower percentage than df shows.
            # Calculate in_use w/o reserved space; consistent w/ df's Use% metric.
            pmets = self._collect_part_metrics(part, disk_usage)
            used = 'system.disk.used'
            free = 'system.disk.free'
            pmets['system.disk.in_use'] = pmets[used] / (pmets[used] + pmets[free])

            # legacy check names c: vs psutil name C:\\
            if Platform.is_win32():
                device_name = device_name.strip('\\').lower()

            for metric_name, metric_value in pmets.iteritems():
                self.gauge(metric_name, metric_value,
                           tags=tags, device_name=device_name)

        # And finally, latency metrics, a legacy gift from the old Windows Check
        if Platform.is_win32():
            self.collect_latency_metrics()
def check(self, agentConfig):
    """Return an aggregate of CPU stats across all CPUs
    When figures are not available, False is sent back.
    """
    def format_results(us, sy, wa, idle, st):
        data = {'cpuUser': us, 'cpuSystem': sy, 'cpuWait': wa,
                'cpuIdle': idle, 'cpuStolen': st}
        for key in data.keys():
            if data[key] is None:
                del data[key]
        return data

    def get_value(legend, data, name, filter_value=None):
        "Using the legend and a metric name, get the value or None from the data line"
        if name in legend:
            value = to_float(data[legend.index(name)])
            if filter_value is not None:
                if value > filter_value:
                    return None
            return value
        else:
            # FIXME return a float or False, would trigger type error if not python
            self.logger.debug("Cannot extract cpu value %s from %s (%s)" % (name, data, legend))
            return 0.0

    if Platform.is_linux():
        mpstat = sp.Popen(['mpstat', '1', '3'], stdout=sp.PIPE, close_fds=True).communicate()[0]
        # topdog@ip:~$ mpstat 1 3
        # Linux 2.6.32-341-ec2 (ip)   01/19/2012      _x86_64_        (2 CPU)
        #
        # 04:22:41 PM  CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest   %idle
        # 04:22:42 PM  all    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
        # 04:22:43 PM  all    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
        # 04:22:44 PM  all    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
        # Average:     all    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
        #
        # OR
        #
        # Thanks to Mart Visser to spotting this one.
        # blah:/etc/dd-agent# mpstat
        # Linux 2.6.26-2-xen-amd64 (atira)   02/17/2012  _x86_64_
        #
        # 05:27:03 PM  CPU    %user   %nice   %sys %iowait    %irq   %soft  %steal  %idle   intr/s
        # 05:27:03 PM  all    3.59    0.00    0.68    0.69    0.00   0.00    0.01   95.03    43.65
        #
        lines = mpstat.split("\n")
        legend = [l for l in lines if "%usr" in l or "%user" in l]
        avg = [l for l in lines if "Average" in l]
        if len(legend) == 1 and len(avg) == 1:
            headers = [h for h in legend[0].split() if h not in ("AM", "PM")]
            data = avg[0].split()

            # Userland
            # Debian lenny says %user so we look for both
            # One of them will be 0
            cpu_metrics = {
                "%usr": None, "%user": None, "%nice": None,
                "%iowait": None, "%idle": None, "%sys": None,
                "%irq": None, "%soft": None, "%steal": None,
            }

            for cpu_m in cpu_metrics:
                cpu_metrics[cpu_m] = get_value(headers, data, cpu_m, filter_value=110)

            if any([v is None for v in cpu_metrics.values()]):
                self.logger.warning("Invalid mpstat data: %s" % data)

            cpu_user = cpu_metrics["%usr"] + cpu_metrics["%user"] + cpu_metrics["%nice"]
            cpu_system = cpu_metrics["%sys"] + cpu_metrics["%irq"] + cpu_metrics["%soft"]
            cpu_wait = cpu_metrics["%iowait"]
            cpu_idle = cpu_metrics["%idle"]
            cpu_stolen = cpu_metrics["%steal"]

            return format_results(cpu_user, cpu_system, cpu_wait, cpu_idle, cpu_stolen)
        else:
            return False

    elif sys.platform == 'darwin':
        # generate 3 seconds of data
        # ['          disk0           disk1       cpu     load average',
        #  '    KB/t tps  MB/s     KB/t tps  MB/s  us sy id   1m   5m   15m',
        #  '   21.23  13  0.27    17.85   7  0.13  14  7 79  1.04 1.27 1.31',
        #  '    4.00   3  0.01     5.00   8  0.04  12 10 78  1.04 1.27 1.31', '']
        iostats = sp.Popen(['iostat', '-C', '-w', '3', '-c', '2'],
                           stdout=sp.PIPE, close_fds=True).communicate()[0]
        lines = [l for l in iostats.split("\n") if len(l) > 0]
        legend = [l for l in lines if "us" in l]
        if len(legend) == 1:
            headers = legend[0].split()
            data = lines[-1].split()
            cpu_user = get_value(headers, data, "us")
            cpu_sys = get_value(headers, data, "sy")
            cpu_wait = 0
            cpu_idle = get_value(headers, data, "id")
            cpu_st = 0
            return format_results(cpu_user, cpu_sys, cpu_wait, cpu_idle, cpu_st)
        else:
            self.logger.warn("Expected to get at least 4 lines of data from iostat instead of just "
                             + str(iostats[:max(80, len(iostats))]))
            return False

    elif sys.platform.startswith("freebsd"):
        # generate 3 seconds of data
        #        tty            ada0             cd0            pass0             cpu
        # tin  tout  KB/t tps  MB/s   KB/t tps  MB/s   KB/t tps  MB/s  us ni sy in id
        #   0    69 26.71   0  0.01   0.00   0  0.00   0.00   0  0.00   2  0  0  1 97
        #   0    78  0.00   0  0.00   0.00   0  0.00   0.00   0  0.00   0  0  0  0 100
        iostats = sp.Popen(['iostat', '-w', '3', '-c', '2'],
                           stdout=sp.PIPE, close_fds=True).communicate()[0]
        lines = [l for l in iostats.split("\n") if len(l) > 0]
        legend = [l for l in lines if "us" in l]
        if len(legend) == 1:
            headers = legend[0].split()
            data = lines[-1].split()
            cpu_user = get_value(headers, data, "us")
            cpu_nice = get_value(headers, data, "ni")
            cpu_sys = get_value(headers, data, "sy")
            cpu_intr = get_value(headers, data, "in")
            cpu_wait = 0
            cpu_idle = get_value(headers, data, "id")
            cpu_stol = 0
            return format_results(cpu_user + cpu_nice, cpu_sys + cpu_intr,
                                  cpu_wait, cpu_idle, cpu_stol)
        else:
            self.logger.warn("Expected to get at least 4 lines of data from iostat instead of just "
                             + str(iostats[:max(80, len(iostats))]))
            return False

    elif sys.platform == 'sunos5':
        # mpstat -aq 1 2
        # SET minf mjf xcal  intr ithr  csw icsw migr smtx  srw syscl  usr sys  wt idl sze
        # 0 5239   0 12857 22969 5523 14628   73  546 4055    1 146856    5   6   0  89  24 <-- since boot
        # 1 ...
        # SET minf mjf xcal  intr ithr  csw icsw migr smtx  srw syscl  usr sys  wt idl sze
        # 0 20374   0 45634 57792 5786 26767   80  876 20036    2 724475   13  13   0  75  24 <-- past 1s
        # 1 ...
        # http://docs.oracle.com/cd/E23824_01/html/821-1462/mpstat-1m.html
        #
        # Will aggregate over all processor sets
        try:
            mpstat = sp.Popen(['mpstat', '-aq', '1', '2'],
                              stdout=sp.PIPE, close_fds=True).communicate()[0]
            lines = [l for l in mpstat.split("\n") if len(l) > 0]
            # discard the first len(lines)/2 lines
            lines = lines[len(lines) / 2:]
            legend = [l for l in lines if "SET" in l]
            assert len(legend) == 1
            if len(legend) == 1:
                headers = legend[0].split()
                # collect stats for each processor set
                # and aggregate them based on the relative set size
                d_lines = [l for l in lines if "SET" not in l]
                user = [get_value(headers, l.split(), "usr") for l in d_lines]
                kern = [get_value(headers, l.split(), "sys") for l in d_lines]
                wait = [get_value(headers, l.split(), "wt") for l in d_lines]
                idle = [get_value(headers, l.split(), "idl") for l in d_lines]
                size = [get_value(headers, l.split(), "sze") for l in d_lines]
                count = sum(size)
                rel_size = [s / count for s in size]
                dot = lambda v1, v2: reduce(operator.add, map(operator.mul, v1, v2))
                return format_results(dot(user, rel_size),
                                      dot(kern, rel_size),
                                      dot(wait, rel_size),
                                      dot(idle, rel_size),
                                      0.0)
        except Exception:
            self.logger.exception("Cannot compute CPU stats")
            return False
    else:
        self.logger.warn("CPUStats: unsupported platform")
        return False
def check(self, agentConfig):
    if Platform.is_linux():
        try:
            meminfoProc = open('/proc/meminfo', 'r')
            lines = meminfoProc.readlines()
            meminfoProc.close()
        except Exception:
            self.logger.exception('Cannot get memory metrics from /proc/meminfo')
            return False

        # $ cat /proc/meminfo
        # MemTotal:        7995360 kB
        # MemFree:         1045120 kB
        # Buffers:          226284 kB
        # Cached:           775516 kB
        # SwapCached:       248868 kB
        # Active:          1004816 kB
        # Inactive:        1011948 kB
        # Active(anon):     455152 kB
        # Inactive(anon):   584664 kB
        # Active(file):     549664 kB
        # Inactive(file):   427284 kB
        # Unevictable:     4392476 kB
        # Mlocked:         4392476 kB
        # SwapTotal:      11120632 kB
        # SwapFree:       10555044 kB
        # Dirty:              2948 kB
        # Writeback:             0 kB
        # AnonPages:       5203560 kB
        # Mapped:            50520 kB
        # Shmem:             10108 kB
        # Slab:             161300 kB
        # SReclaimable:     136108 kB
        # SUnreclaim:        25192 kB
        # KernelStack:        3160 kB
        # PageTables:        26776 kB
        # NFS_Unstable:          0 kB
        # Bounce:                0 kB
        # WritebackTmp:          0 kB
        # CommitLimit:    15118312 kB
        # Committed_AS:    6703508 kB
        # VmallocTotal:   34359738367 kB
        # VmallocUsed:      400668 kB
        # VmallocChunk:   34359329524 kB
        # HardwareCorrupted:     0 kB
        # HugePages_Total:       0
        # HugePages_Free:        0
        # HugePages_Rsvd:        0
        # HugePages_Surp:        0
        # Hugepagesize:       2048 kB
        # DirectMap4k:       10112 kB
        # DirectMap2M:     8243200 kB

        # We run this several times so one-time compile now
        regexp = re.compile(r'^(\w+):\s+([0-9]+)')
        meminfo = {}

        for line in lines:
            try:
                match = re.search(regexp, line)
                if match is not None:
                    meminfo[match.group(1)] = match.group(2)
            except Exception:
                self.logger.exception("Cannot parse /proc/meminfo")

        memData = {}

        # Physical memory
        # FIXME units are in MB, we should use bytes instead
        try:
            memData['physTotal'] = int(meminfo.get('MemTotal', 0)) / 1024
            memData['physFree'] = int(meminfo.get('MemFree', 0)) / 1024
            memData['physBuffers'] = int(meminfo.get('Buffers', 0)) / 1024
            memData['physCached'] = int(meminfo.get('Cached', 0)) / 1024
            memData['physShared'] = int(meminfo.get('Shmem', 0)) / 1024
            memData['physUsed'] = memData['physTotal'] - memData['physFree']
            # Usable is relative since cached and buffers are actually used to speed things up.
            memData['physUsable'] = memData['physFree'] + memData['physBuffers'] + memData['physCached']

            if memData['physTotal'] > 0:
                memData['physPctUsable'] = float(memData['physUsable']) / float(memData['physTotal'])
        except Exception:
            self.logger.exception('Cannot compute stats from /proc/meminfo')

        # Swap
        # FIXME units are in MB, we should use bytes instead
        try:
            memData['swapTotal'] = int(meminfo.get('SwapTotal', 0)) / 1024
            memData['swapFree'] = int(meminfo.get('SwapFree', 0)) / 1024
            memData['swapUsed'] = memData['swapTotal'] - memData['swapFree']

            if memData['swapTotal'] > 0:
                memData['swapPctFree'] = float(memData['swapFree']) / float(memData['swapTotal'])
        except Exception:
            self.logger.exception('Cannot compute swap stats')

        return memData

    elif sys.platform == 'darwin':
        macV = platform.mac_ver()
        macV_minor_version = int(re.match(r'10\.(\d+)\.?.*', macV[0]).group(1))

        try:
            top = sp.Popen(['top', '-l 1'], stdout=sp.PIPE, close_fds=True).communicate()[0]
            sysctl = sp.Popen(['sysctl', 'vm.swapusage'], stdout=sp.PIPE, close_fds=True).communicate()[0]
        except StandardError:
            self.logger.exception('getMemoryUsage')
            return False

        # Deal with top
        lines = top.split('\n')
        physParts = re.findall(r'([0-9]\d+)', lines[self.topIndex])

        # Deal with sysctl
        swapParts = re.findall(r'([0-9]+\.\d+)', sysctl)

        # Mavericks changes the layout of physical memory format in `top`
        physUsedPartIndex = 3
        physFreePartIndex = 4
        if macV and (macV_minor_version >= 9):
            physUsedPartIndex = 0
            physFreePartIndex = 2

        return {'physUsed': physParts[physUsedPartIndex],
                'physFree': physParts[physFreePartIndex],
                'swapUsed': swapParts[1],
                'swapFree': swapParts[2]}

    elif sys.platform.startswith("freebsd"):
        try:
            sysctl = sp.Popen(['sysctl', 'vm.stats.vm'], stdout=sp.PIPE, close_fds=True).communicate()[0]
        except Exception:
            self.logger.exception('getMemoryUsage')
            return False

        lines = sysctl.split('\n')

        # ...
        # vm.stats.vm.v_page_size: 4096
        # vm.stats.vm.v_page_count: 759884
        # vm.stats.vm.v_wire_count: 122726
        # vm.stats.vm.v_active_count: 109350
        # vm.stats.vm.v_cache_count: 17437
        # vm.stats.vm.v_inactive_count: 479673
        # vm.stats.vm.v_free_count: 30542
        # ...

        # We run this several times so one-time compile now
        regexp = re.compile(r'^vm\.stats\.vm\.(\w+):\s+([0-9]+)')
        meminfo = {}

        for line in lines:
            try:
                match = re.search(regexp, line)
                if match is not None:
                    meminfo[match.group(1)] = match.group(2)
            except Exception:
                self.logger.exception("Cannot parse sysctl vm.stats.vm output")

        memData = {}

        # Physical memory
        try:
            pageSize = int(meminfo.get('v_page_size'))

            memData['physTotal'] = (int(meminfo.get('v_page_count', 0)) * pageSize) / 1048576
            memData['physFree'] = (int(meminfo.get('v_free_count', 0)) * pageSize) / 1048576
            memData['physCached'] = (int(meminfo.get('v_cache_count', 0)) * pageSize) / 1048576
            memData['physUsed'] = ((int(meminfo.get('v_active_count', 0)) +
                                    int(meminfo.get('v_wire_count', 0))) * pageSize) / 1048576
            memData['physUsable'] = ((int(meminfo.get('v_free_count', 0)) +
                                      int(meminfo.get('v_cache_count', 0)) +
                                      int(meminfo.get('v_inactive_count', 0))) * pageSize) / 1048576

            if memData['physTotal'] > 0:
                memData['physPctUsable'] = float(memData['physUsable']) / float(memData['physTotal'])
        except Exception:
            self.logger.exception('Cannot compute stats from sysctl vm.stats.vm output')

        # Swap
        try:
            sysctl = sp.Popen(['swapinfo', '-m'], stdout=sp.PIPE, close_fds=True).communicate()[0]
        except Exception:
            self.logger.exception('getMemoryUsage')
            return False

        lines = sysctl.split('\n')

        # ...
        # Device          1M-blocks     Used    Avail Capacity
        # /dev/ad0s1b           570        0      570     0%
        # ...

        assert "Device" in lines[0]

        try:
            memData['swapTotal'] = 0
            memData['swapFree'] = 0
            memData['swapUsed'] = 0
            for line in lines[1:-1]:
                line = line.split()
                memData['swapTotal'] += int(line[1])
                memData['swapFree'] += int(line[3])
                memData['swapUsed'] += int(line[2])
        except Exception:
            self.logger.exception('Cannot compute stats from swapinfo')

        return memData

    elif sys.platform == 'sunos5':
        try:
            memData = {}
            kmem = sp.Popen(["kstat", "-c", "zone_memory_cap", "-p"],
                            stdout=sp.PIPE, close_fds=True).communicate()[0]

            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:anon_alloc_fail    0
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:anonpgin   0
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:class      zone_memory_cap
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:crtime     16359935.0680834
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:execpgin   185
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:fspgin     2556
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:n_pf_throttle      0
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:n_pf_throttle_usec 0
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:nover      0
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:pagedout   0
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:pgpgin     2741
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:physcap    536870912  <--
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:rss        115544064  <--
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:snaptime   16787393.9439095
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:swap       91828224   <--
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:swapcap    1073741824 <--
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:zonename   53aa9b7e-48ba-4152-a52b-a6368c3d9e7c

            # turn memory_cap:360:zone_name:key value
            # into { "key": value, ...}
            kv = [l.strip().split() for l in kmem.split("\n") if len(l) > 0]
            entries = dict([(k.split(":")[-1], v) for (k, v) in kv])

            # extract rss, physcap, swap, swapcap, turn into MB
            convert = lambda v: int(long(v)) / 2**20
            memData["physTotal"] = convert(entries["physcap"])
            memData["physUsed"] = convert(entries["rss"])
            memData["physFree"] = memData["physTotal"] - memData["physUsed"]
            memData["swapTotal"] = convert(entries["swapcap"])
            memData["swapUsed"] = convert(entries["swap"])
            memData["swapFree"] = memData["swapTotal"] - memData["swapUsed"]

            if memData['swapTotal'] > 0:
                memData['swapPctFree'] = float(memData['swapFree']) / float(memData['swapTotal'])
            return memData
        except Exception:
            self.logger.exception("Cannot compute mem stats from kstat -c zone_memory_cap")
            return False
    else:
        return False