Example #1
0
    def testNetwork(self):
        # FIXME: cx_state to true, but needs sysstat installed
        config = """
init_config:

instances:
    - collect_connection_state: false
      excluded_interfaces:
        - lo
        - lo0
"""
        check, instances = get_check("network", config)

        check.check(instances[0])
        check.get_metrics()

        metric_names = [m[0] for m in check.aggregator.metrics]

        assert "system.net.bytes_rcvd" in metric_names
        assert "system.net.bytes_sent" in metric_names
        if Platform.is_linux():
            assert "system.net.tcp.retrans_segs" in metric_names
            assert "system.net.tcp.in_segs" in metric_names
            assert "system.net.tcp.out_segs" in metric_names
        elif Platform.is_bsd():
            assert "system.net.tcp.retrans_packs" in metric_names
            assert "system.net.tcp.sent_packs" in metric_names
            assert "system.net.tcp.rcv_packs" in metric_names
Example #2
0
    def testNetwork(self):
        # FIXME: cx_state to true, but needs sysstat installed
        config = """
init_config:

instances:
    - collect_connection_state: false
      excluded_interfaces:
        - lo
        - lo0
"""
        check, instances = get_check('network', config)

        check.check(instances[0])
        check.get_metrics()

        metric_names = [m[0] for m in check.aggregator.metrics]

        assert 'system.net.bytes_rcvd' in metric_names
        assert 'system.net.bytes_sent' in metric_names
        if Platform.is_linux():
            assert 'system.net.tcp.retrans_segs' in metric_names
            assert 'system.net.tcp.in_segs' in metric_names
            assert 'system.net.tcp.out_segs' in metric_names
        elif Platform.is_bsd():
            assert 'system.net.tcp.retrans_packs' in metric_names
            assert 'system.net.tcp.sent_packs' in metric_names
            assert 'system.net.tcp.rcv_packs' in metric_names
Example #3
0
    def collect_metrics_psutil(self):
        """Collect and submit disk usage gauges for every mounted partition via psutil.

        Skips excluded disks, zero-sized disks, and mountpoints whose usage
        probe times out; on Windows also emits the legacy latency metrics.
        """
        self._valid_disks = {}
        for part in psutil.disk_partitions(all=True):
            # we check all exclude conditions
            if self._exclude_disk_psutil(part):
                continue

            # Get disk metrics here to be able to exclude on total usage
            try:
                # cap the probe at 5s: disk_usage can hang on stale network mounts
                disk_usage = timeout(5)(psutil.disk_usage)(part.mountpoint)
            except TimeoutException:
                self.log.warn(
                    u"Timeout while retrieving the disk usage of `%s` mountpoint. Skipping...",
                    part.mountpoint)
                continue
            except Exception as e:
                self.log.warn("Unable to get disk metrics for %s: %s",
                              part.mountpoint, e)
                continue
            # Exclude disks with total disk size 0
            if disk_usage.total == 0:
                continue
            # For later, latency metrics
            self._valid_disks[part.device] = (part.fstype, part.mountpoint)
            self.log.debug('Passed: {0}'.format(part.device))

            tags = [part.fstype] if self._tag_by_filesystem else []
            device_name = part.mountpoint if self._use_mount else part.device

            # Note: psutil (0.3.0 to at least 3.1.1) calculates in_use as (used / total)
            #       The problem here is that total includes reserved space the user
            #       doesn't have access to. This causes psutil to calculate a misleadng
            #       percentage for in_use; a lower percentage than df shows.

            # Calculate in_use w/o reserved space; consistent w/ df's Use% metric.
            pmets = self._collect_part_metrics(part, disk_usage)
            used = 'system.disk.used'
            free = 'system.disk.free'
            pmets['system.disk.in_use'] = pmets[used] / (pmets[used] +
                                                         pmets[free])

            # legacy check names c: vs psutil name C:\\
            if Platform.is_win32():
                device_name = device_name.strip('\\').lower()
            for metric_name, metric_value in pmets.iteritems():
                self.gauge(metric_name,
                           metric_value,
                           tags=tags,
                           device_name=device_name)
        # And finally, latency metrics, a legacy gift from the old Windows Check
        if Platform.is_win32():
            self.collect_latency_metrics()
Example #4
0
    def parse_df_output(self,
                        df_output,
                        platform_name,
                        inodes=False,
                        use_mount=False,
                        blacklist_re=None):
        """
        Parse the output of the df command. If use_volume is true the volume
        is used to anchor the metric, otherwise false the mount
        point is used. Returns a tuple of (disk, inode).
        """
        usage_data = []

        # Raw output -> one list of columns per device line.
        devices = self._transform_df_output(df_output, blacklist_re)

        for columns in devices:
            try:
                if use_mount:
                    # Anchor the metric on the mount point instead of the volume.
                    columns[0] = columns[-1]
                # On OS X and FreeBSD the inode counts live in columns 5/6 and
                # the total has to be computed; everywhere else (and for plain
                # disk stats) columns 1-3 already hold Total/Used/Available.
                bsd_like = inodes and (
                    Platform.is_darwin(platform_name)
                    or Platform.is_freebsd(platform_name))
                if bsd_like:
                    # Filesystem blocks Used Available Capacity iused ifree %iused Mounted
                    columns[1] = int(columns[5]) + int(columns[6])  # Total
                    columns[2] = int(columns[5])  # Used
                    columns[3] = int(columns[6])  # Available
                else:
                    columns[1] = int(columns[1])  # Total
                    columns[2] = int(columns[2])  # Used
                    columns[3] = int(columns[3])  # Available
            except IndexError:
                self.logger.exception("Cannot parse %s" % (columns,))

            usage_data.append(columns)

        return usage_data
Example #5
0
    def collect_metrics_psutil(self):
        """Collect disk usage for every partition via psutil and submit gauges.

        Excluded, zero-sized, or unresponsive (usage probe > 5s) partitions
        are skipped; on Windows the legacy latency metrics are also emitted.
        """
        self._valid_disks = {}
        for part in psutil.disk_partitions(all=True):
            # we check all exclude conditions
            if self._exclude_disk_psutil(part):
                continue

            # Get disk metrics here to be able to exclude on total usage
            try:
                # cap the probe: psutil.disk_usage can hang on stale network mounts
                disk_usage = timeout(5)(psutil.disk_usage)(part.mountpoint)
            except TimeoutException:
                self.log.warn(
                    u"Timeout while retrieving the disk usage of `%s` mountpoint. Skipping...",
                    part.mountpoint
                )
                continue
            except Exception as e:
                self.log.warn("Unable to get disk metrics for %s: %s", part.mountpoint, e)
                continue
            # Exclude disks with total disk size 0
            if disk_usage.total == 0:
                continue
            # For later, latency metrics
            self._valid_disks[part.device] = (part.fstype, part.mountpoint)
            self.log.debug('Passed: {0}'.format(part.device))

            tags = [part.fstype] if self._tag_by_filesystem else []
            device_name = part.mountpoint if self._use_mount else part.device

            # Note: psutil (0.3.0 to at least 3.1.1) calculates in_use as (used / total)
            #       The problem here is that total includes reserved space the user
            #       doesn't have access to. This causes psutil to calculate a misleadng
            #       percentage for in_use; a lower percentage than df shows.

            # Calculate in_use w/o reserved space; consistent w/ df's Use% metric.
            pmets = self._collect_part_metrics(part, disk_usage)
            used = 'system.disk.used'
            free = 'system.disk.free'
            pmets['system.disk.in_use'] = pmets[used] / (pmets[used] + pmets[free])

            # legacy check names c: vs psutil name C:\\
            if Platform.is_win32():
                device_name = device_name.strip('\\').lower()
            for metric_name, metric_value in pmets.iteritems():
                self.gauge(metric_name, metric_value,
                           tags=tags, device_name=device_name)
        # And finally, latency metrics, a legacy gift from the old Windows Check
        if Platform.is_win32():
            self.collect_latency_metrics()
Example #6
0
 def testMemory(self):
     """Check that the Memory system check reports the expected keys."""
     global logger
     res = Memory(logger).check({})
     if Platform.is_linux():
         expected = (
             "swapTotal", "swapFree", "swapPctFree", "swapUsed",
             "physTotal", "physFree", "physUsed", "physBuffers",
             "physCached", "physUsable", "physPctUsable", "physShared",
         )
         for key in expected:
             # % metric is only here if total > 0
             if key == "swapPctFree" and res["swapTotal"] == 0:
                 continue
             assert key in res, res
         # totals must be consistent with their parts
         assert res["swapTotal"] == res["swapFree"] + res["swapUsed"]
         assert res["physTotal"] == res["physFree"] + res["physUsed"]
     elif sys.platform == "darwin":
         for key in ("swapFree", "swapUsed", "physFree", "physUsed"):
             assert key in res, res
Example #7
0
    def collect_metrics_psutil(self):
        """Collect disk usage via psutil and submit one gauge per metric.

        Excluded and zero-sized partitions are skipped; usage-probe failures
        are logged at debug level and the partition is skipped.
        """
        self._valid_disks = {}
        for part in psutil.disk_partitions(all=True):
            # skip any partition matching the configured exclusions
            if self._exclude_disk_psutil(part):
                continue
            try:
                disk_usage = psutil.disk_usage(part.mountpoint)
            except Exception, e:
                self.log.debug("Unable to get disk metrics for %s: %s",
                               part.mountpoint, e)
                continue

            # disks with no capacity carry no useful signal
            if disk_usage.total == 0:
                continue
            # remembered for later latency collection
            self._valid_disks[part.device] = (part.fstype, part.mountpoint)
            self.log.debug('Passed: {0}'.format(part.device))

            tags = [part.fstype] if self._tag_by_filesystem else []
            device_name = part.mountpoint if self._use_mount else part.device

            # pct_usage is recomputed from used/(used+free) so reserved space
            # does not skew the percentage (matches df's Use%)
            pmets = self._collect_part_metrics(part, disk_usage)
            used = 'system.disk.used'
            free = 'system.disk.free'
            pmets['system.disk.pct_usage'] = (pmets[used] / (pmets[used] + pmets[free])) * 100

            # legacy check names c: vs psutil name C:\\
            if Platform.is_win32():
                device_name = device_name.strip('\\').lower()
            for metric_name, metric_value in pmets.iteritems():
                self.gauge(metric_name, metric_value,
                           tags=tags, device_name=device_name)
Example #8
0
    def check(self, instance):
        """Collect worker counts for the given gunicorn instance and emit the
        gauges plus a service check (CRITICAL when no worker is alive)."""
        self.log.debug("Running instance: %s", instance)

        if Platform.is_linux():
            # honor a custom procfs mount so psutil reads the right /proc
            psutil.PROCFS_PATH = self.agentConfig.get('procfs_path', '/proc').rstrip('/')

        # Validate the config.
        if not instance or self.PROC_NAME not in instance:
            raise GUnicornCheckError("instance must specify: %s" % self.PROC_NAME)

        # Find the gunicorn master process and classify its children.
        proc_name = instance.get(self.PROC_NAME)
        master_proc = self._get_master_proc_by_name(proc_name)
        working, idle = self._count_workers(master_proc.children())

        # No worker at all means the app is effectively down.
        if working == 0 and idle == 0:
            status = AgentCheck.CRITICAL
        else:
            status = AgentCheck.OK
        msg = "%s working and %s idle workers for %s" % (working, idle, proc_name)
        self.service_check(self.SVC_NAME, status, tags=['app:' + proc_name], message=msg)

        # Submit the data.
        self.log.debug("instance %s procs - working:%s idle:%s" % (proc_name, working, idle))
        self.gauge("gunicorn.workers", working, self.WORKING_TAGS)
        self.gauge("gunicorn.workers", idle, self.IDLE_TAGS)
Example #9
0
    def check(self, instance):
        """Collect metrics for the given gunicorn instance."""
        self.log.debug("Running instance: %s", instance)

        if Platform.is_linux():
            # point psutil at the configured procfs root (no trailing slash)
            procfs_path = self.agentConfig.get('procfs_path', '/proc')
            psutil.PROCFS_PATH = procfs_path.rstrip('/')

        # Fail fast on a config that does not name the master process.
        if not instance or self.PROC_NAME not in instance:
            raise GUnicornCheckError("instance must specify: %s" % self.PROC_NAME)

        # Load the gunicorn master process and inspect its workers.
        proc_name = instance.get(self.PROC_NAME)
        master = self._get_master_proc_by_name(proc_name)
        workers = master.children()
        working, idle = self._count_workers(workers)

        # if no workers are running, alert CRITICAL, otherwise OK
        healthy = working != 0 or idle != 0
        status = AgentCheck.OK if healthy else AgentCheck.CRITICAL
        message = "%s working and %s idle workers for %s" % (working, idle, proc_name)
        self.service_check(self.SVC_NAME, status,
                           tags=['app:' + proc_name], message=message)

        # Submit the data.
        self.log.debug("instance %s procs - working:%s idle:%s" % (proc_name, working, idle))
        self.gauge("gunicorn.workers", working, self.WORKING_TAGS)
        self.gauge("gunicorn.workers", idle, self.IDLE_TAGS)
Example #10
0
    def _host_matches_node(self, primary_addrs):
        """ For < 0.19, check if the current host matches the IP given in the
            cluster nodes check `/_cluster/nodes`. Uses `ip addr` on Linux and
            `ifconfig` on Mac
        """
        cmd = ["ifconfig"] if Platform.is_darwin() else ["ip", "addr"]
        ifaces = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        grepper = subprocess.Popen(
            ["grep", "inet"], stdin=ifaces.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )

        # let grep receive EOF when the lister exits
        ifaces.stdout.close()
        out, err = grepper.communicate()

        # Capture the list of interface IPs: second field of each non-empty
        # "inet ..." line, with any CIDR suffix stripped
        ips = [line.strip().split(" ")[1].split("/")[0]
               for line in out.split("\n") if line.strip()]

        # Check the interface addresses against the primary address
        return primary_addrs in ips
Example #11
0
    def collect_metrics_psutil(self):
        """Submit disk usage gauges for each psutil partition.

        Skips excluded and zero-sized disks; failures to read usage are
        logged at debug level and the partition is skipped.
        """
        self._valid_disks = {}
        for part in psutil.disk_partitions(all=True):
            # we check all exclude conditions
            if self._exclude_disk_psutil(part):
                continue
            # Get disk metrics here to be able to exclude on total usage
            try:
                disk_usage = psutil.disk_usage(part.mountpoint)
            except Exception, e:
                self.log.debug("Unable to get disk metrics for %s: %s",
                               part.mountpoint, e)
                continue
            # Exclude disks with total disk size 0
            if disk_usage.total == 0:
                continue
            # For later, latency metrics
            self._valid_disks[part.device] = (part.fstype, part.mountpoint)
            self.log.debug('Passed: {0}'.format(part.device))

            tags = [part.fstype] if self._tag_by_filesystem else []
            device_name = part.mountpoint if self._use_mount else part.device

            # legacy check names c: vs psutil name C:\\
            if Platform.is_win32():
                device_name = device_name.strip('\\').lower()
            for metric_name, metric_value in self._collect_part_metrics(
                    part, disk_usage).iteritems():
                self.gauge(metric_name,
                           metric_value,
                           tags=tags,
                           device_name=device_name)
Example #12
0
    def _host_matches_node(self, primary_addrs):
        """ For < 0.19, check if the current host matches the IP given in the
            cluster nodes check `/_cluster/nodes`. Uses `ip addr` on Linux and
            `ifconfig` on Mac
        """
        if Platform.is_darwin():
            lister = subprocess.Popen(['ifconfig'], stdout=subprocess.PIPE)
        else:
            lister = subprocess.Popen(['ip', 'addr'], stdout=subprocess.PIPE)
        grep = subprocess.Popen(
            ['grep', 'inet'],
            stdin=lister.stdout,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        lister.stdout.close()
        out, err = grep.communicate()

        # Collect one IP per non-empty "inet ..." line
        ips = []
        for raw_line in out.split("\n"):
            line = raw_line.strip()
            if not line:
                continue
            # second column is "addr" or "addr/prefix"; keep only the address
            ips.append(line.split(' ')[1].split('/')[0])

        # Check the interface addresses against the primary address
        return primary_addrs in ips
Example #13
0
    def collect_metrics_psutil(self):
        """Submit one gauge per partition metric reported by psutil.

        Excluded and zero-sized partitions are skipped; valid disks are
        remembered in self._valid_disks for later latency collection.
        """
        self._valid_disks = {}
        for part in psutil.disk_partitions(all=True):
            # we check all exclude conditions
            if self._exclude_disk_psutil(part):
                continue
            # Get disk metrics here to be able to exclude on total usage
            try:
                disk_usage = psutil.disk_usage(part.mountpoint)
            except Exception, e:
                self.log.debug("Unable to get disk metrics for %s: %s",
                               part.mountpoint, e)
                continue
            # Exclude disks with total disk size 0
            if disk_usage.total == 0:
                continue
            # For later, latency metrics
            self._valid_disks[part.device] = (part.fstype, part.mountpoint)
            self.log.debug('Passed: {0}'.format(part.device))

            tags = [part.fstype] if self._tag_by_filesystem else []
            device_name = part.mountpoint if self._use_mount else part.device

            # legacy check names c: vs psutil name C:\\
            if Platform.is_win32():
                device_name = device_name.strip('\\').lower()
            for metric_name, metric_value in self._collect_part_metrics(part, disk_usage).iteritems():
                self.gauge(metric_name, metric_value,
                           tags=tags, device_name=device_name)
Example #14
0
 def _exclude_disk_psutil(self, part):
     """Return True when this psutil partition should be skipped."""
     # skip cd-rom drives with no disk in it; they may raise
     # ENOENT, pop-up a Windows GUI error for a non-ready
     # partition or just hang
     if Platform.is_win32() and ('cdrom' in part.opts or part.fstype == ''):
         return True
     # and all the other excluded disks
     return self._exclude_disk(part.device, part.fstype)
Example #15
0
 def _exclude_disk_psutil(self, part):
     """Return True when this psutil partition should be skipped."""
     # skip cd-rom drives with no disk in it; they may raise
     # ENOENT, pop-up a Windows GUI error for a non-ready
     # partition or just hang
     is_win_cdrom = Platform.is_win32() and ('cdrom' in part.opts or part.fstype == '')
     # and all the other excluded disks
     return is_win_cdrom or self._exclude_disk(part.device, part.fstype, part.mountpoint)
Example #16
0
 def _get_pickle_path(cls):
     """Return the path of the pickle file used to persist this class's state."""
     filename = cls.__name__ + '.pickle'
     if Platform.is_win32():
         base = os.path.join(_windows_commondata_path(), 'Datadog')
     else:
         base = tempfile.gettempdir()
     return os.path.join(base, filename)
Example #17
0
def get_system_stats():
    """Return a dict of basic host facts.

    Always contains machine, platform, processor and pythonV; adds cpuCores
    and an OS-version entry (nixV / macV / fbsdV / winV) per platform family.
    """
    systemStats = {
        'machine': platform.machine(),
        'platform': sys.platform,
        'processor': platform.processor(),
        'pythonV': platform.python_version(),
    }

    platf = sys.platform

    # Each platform family is tested exactly once; the previous version
    # checked is_linux/is_darwin/is_freebsd twice each with duplicated
    # sysctl code for darwin and freebsd.
    if Platform.is_linux(platf):
        # core count = number of "model name" lines in /proc/cpuinfo
        grep = subprocess.Popen(['grep', 'model name', '/proc/cpuinfo'],
                                stdout=subprocess.PIPE, close_fds=True)
        wc = subprocess.Popen(['wc', '-l'], stdin=grep.stdout,
                              stdout=subprocess.PIPE, close_fds=True)
        systemStats['cpuCores'] = int(wc.communicate()[0])
        systemStats['nixV'] = platform.dist()
    elif Platform.is_darwin(platf):
        # sysctl prints "hw.ncpu: N"; keep the value after the separator
        systemStats['cpuCores'] = int(subprocess.Popen(['sysctl', 'hw.ncpu'], stdout=subprocess.PIPE, close_fds=True).communicate()[0].split(': ')[1])
        systemStats['macV'] = platform.mac_ver()
    elif Platform.is_freebsd(platf):
        # same sysctl output format as on OS X
        systemStats['cpuCores'] = int(subprocess.Popen(['sysctl', 'hw.ncpu'], stdout=subprocess.PIPE, close_fds=True).communicate()[0].split(': ')[1])
        version = platform.uname()[2]
        systemStats['fbsdV'] = ('freebsd', version, '')  # no codename for FreeBSD
    elif Platform.is_win32(platf):
        systemStats['winV'] = platform.win32_ver()

    return systemStats
Example #18
0
    def parse_df_output(self, df_output, platform_name, inodes=False, use_mount=False, blacklist_re=None):
        """
        Parse the output of the df command. If use_volume is true the volume
        is used to anchor the metric, otherwise false the mount
        point is used. Returns a tuple of (disk, inode).
        """
        usage_data = []

        # Transform the raw output into tuples of the df data.
        devices = self._transform_df_output(df_output, blacklist_re)

        # If we want to use the mount point, replace the volume name on each
        # line.
        for parts in devices:
            try:
                if use_mount:
                    parts[0] = parts[-1]
                if inodes:
                    if Platform.is_darwin(platform_name):
                        # Filesystem 512-blocks Used Available Capacity iused ifree %iused  Mounted
                        # Inodes are in position 5, 6 and we need to compute the total
                        # Total
                        parts[1] = int(parts[5]) + int(parts[6]) # Total
                        parts[2] = int(parts[5]) # Used
                        parts[3] = int(parts[6]) # Available
                    elif Platform.is_freebsd(platform_name):
                        # Filesystem 1K-blocks Used Avail Capacity iused ifree %iused Mounted
                        # Inodes are in position 5, 6 and we need to compute the total
                        parts[1] = int(parts[5]) + int(parts[6]) # Total
                        parts[2] = int(parts[5]) # Used
                        parts[3] = int(parts[6]) # Available
                    else:
                        parts[1] = int(parts[1]) # Total
                        parts[2] = int(parts[2]) # Used
                        parts[3] = int(parts[3]) # Available
                else:
                    parts[1] = int(parts[1]) # Total
                    parts[2] = int(parts[2]) # Used
                    parts[3] = int(parts[3]) # Available
            except IndexError:
                # NOTE(review): on a short line the row is still appended below,
                # partially converted — confirm downstream tolerates string fields
                self.logger.exception("Cannot parse %s" % (parts,))

            usage_data.append(parts)

        return usage_data
Example #19
0
def get_system_stats():
    """Return a dict of basic host facts (machine, platform, processor,
    python version, cpu core count and an OS-version entry per platform)."""
    systemStats = {
        'machine': platform.machine(),
        'platform': sys.platform,
        'processor': platform.processor(),
        'pythonV': platform.python_version(),
    }

    platf = sys.platform

    if  Platform.is_linux(platf):
        # core count = number of "model name" lines in /proc/cpuinfo
        grep = subprocess.Popen(['grep', 'model name', '/proc/cpuinfo'], stdout=subprocess.PIPE, close_fds=True)
        wc = subprocess.Popen(['wc', '-l'], stdin=grep.stdout, stdout=subprocess.PIPE, close_fds=True)
        systemStats['cpuCores'] = int(wc.communicate()[0])

    if Platform.is_darwin(platf):
        # sysctl prints "hw.ncpu: N"; keep the value after the separator
        systemStats['cpuCores'] = int(subprocess.Popen(['sysctl', 'hw.ncpu'], stdout=subprocess.PIPE, close_fds=True).communicate()[0].split(': ')[1])

    if Platform.is_freebsd(platf):
        # same sysctl output format as on OS X
        systemStats['cpuCores'] = int(subprocess.Popen(['sysctl', 'hw.ncpu'], stdout=subprocess.PIPE, close_fds=True).communicate()[0].split(': ')[1])

    if Platform.is_linux(platf):
        systemStats['nixV'] = platform.dist()

    elif Platform.is_darwin(platf):
        systemStats['macV'] = platform.mac_ver()

    elif Platform.is_freebsd(platf):
        version = platform.uname()[2]
        systemStats['fbsdV'] = ('freebsd', version, '')  # no codename for FreeBSD

    elif Platform.is_win32(platf):
        systemStats['winV'] = platform.win32_ver()

    return systemStats
Example #20
0
 def _save_logs_path(self):
     """Resolve and cache the configured log file path for each agent process."""
     # Windows configs use a 'windows_' prefix for the per-process log options
     prefix = 'windows_' if Platform.is_windows() else ''
     config = get_logging_config()
     self._collector_log = config.get('{0}collector_log_file'.format(prefix))
     self._forwarder_log = config.get('{0}forwarder_log_file'.format(prefix))
     self._dogstatsd_log = config.get('{0}dogstatsd_log_file'.format(prefix))
     # jmxfetch uses a single, platform-independent option name
     self._jmxfetch_log = config.get('jmxfetch_log_file')
Example #21
0
 def test_collecting_disk_metrics(self):
     """Testing disk stats gathering"""
     if not Platform.is_unix():
         return
     res = Disk(logger).check({})
     # Assert we have disk & inode stats
     assert len(res) == 2
     assert res[0]
     assert res[1]
Example #22
0
 def _save_logs_path(self):
     """Look up and store the log file path for each agent subprocess."""
     prefix = ''
     if Platform.is_windows():
         # per-process log options carry a platform prefix on Windows
         prefix = 'windows_'
     config = get_logging_config()
     for attr, option in (
         ('_collector_log', '{0}collector_log_file'),
         ('_forwarder_log', '{0}forwarder_log_file'),
         ('_dogstatsd_log', '{0}dogstatsd_log_file'),
     ):
         setattr(self, attr, config.get(option.format(prefix)))
     # jmxfetch has a single option name on every platform
     self._jmxfetch_log = config.get('jmxfetch_log_file')
Example #23
0
 def _save_logs_path(self):
     """Cache the configured log file locations for the agent processes."""
     if Platform.is_windows():
         # Windows configs prefix the per-process log options
         prefix = "windows_"
     else:
         prefix = ""
     config = get_logging_config()
     collector_key = "{0}collector_log_file".format(prefix)
     forwarder_key = "{0}forwarder_log_file".format(prefix)
     dogstatsd_key = "{0}dogstatsd_log_file".format(prefix)
     self._collector_log = config.get(collector_key)
     self._forwarder_log = config.get(forwarder_key)
     self._dogstatsd_log = config.get(dogstatsd_key)
     self._jmxfetch_log = config.get("jmxfetch_log_file")
Example #24
0
 def check_user_rights():
     """Warn and ask for confirmation when run without root on unix,
     since some agent logs/configs may not be readable."""
     if Platform.is_unix() and not os.geteuid() == 0:
         log.warning("You are not root, some information won't be collected")
         # Python 2 raw_input; normalise the answer for the comparison below
         choice = raw_input("Are you sure you want to continue [y/N]? ").lower()
         if choice not in ["yes", "y"]:
             print "Aborting"
             sys.exit(1)
         else:
             log.warn("Your user has to have at least read access" " to the logs and conf files of the agent")
Example #25
0
    def _collect_part_metrics(self, part, usage):
        """Build the gauge map for one partition: total/used/free (in kB)
        plus pct_usage, and inode metrics on unix."""
        metrics = {
            self.METRIC_DISK.format(name): getattr(usage, name) / 1024.0
            for name in ['total', 'used', 'free']
        }
        metrics[self.METRIC_DISK.format('pct_usage')] = usage.percent
        if Platform.is_unix():
            metrics.update(self._collect_inodes_metrics(part.mountpoint))

        return metrics
Example #26
0
 def check_user_rights():
     """Warn (and require confirmation) when run without root on unix,
     because some agent logs and config files will not be readable."""
     if Platform.is_unix() and not os.geteuid() == 0:
         log.warning("You are not root, some information won't be collected")
         # Python 2 raw_input; lower-case the answer for the check below
         choice = raw_input('Are you sure you want to continue [y/N]? ').lower()
         if choice not in ['yes', 'y']:
             print 'Aborting'
             sys.exit(1)
         else:
             log.warn('Your user has to have at least read access'
                      ' to the logs and conf files of the agent')
Example #27
0
 def _supervisor_status(self):
     """Print the agent and supervisor status (not implemented on Windows)."""
     if Platform.is_windows():
         print "Windows - status not implemented"
     else:
         # show which binary we are about to query, then run it
         agent_exec = self._get_path_agent_exec()
         print "{0} status".format(agent_exec)
         self._print_output_command([agent_exec, "status"])
         # same for supervisord, with its config file passed explicitly
         supervisor_exec = self._get_path_supervisor_exec()
         print "{0} status".format(supervisor_exec)
         self._print_output_command([supervisor_exec, "-c", self._get_path_supervisor_conf(), "status"])
Example #28
0
    def check(self, instance):
        """Read the instance config, then dispatch to the per-platform
        network metric collector."""
        if instance is None:
            instance = {}

        self._excluded_ifaces = instance.get('excluded_interfaces', [])
        self._collect_cx_state = instance.get('collect_connection_state', False)

        # Optional regex-based interface exclusion
        self._exclude_iface_re = None
        pattern = instance.get('excluded_interface_re', None)
        if pattern:
            self.log.debug("Excluding network devices matching: %s" % pattern)
            self._exclude_iface_re = re.compile(pattern)

        if Platform.is_linux():
            self._check_linux(instance)
        elif Platform.is_bsd():
            self._check_bsd(instance)
        elif Platform.is_solaris():
            self._check_solaris(instance)
Example #29
0
    def collect_metrics_psutil(self):
        """Collect and submit disk gauges for every psutil partition.

        Excluded, zero-sized, or unresponsive (usage probe > 5s) partitions
        are skipped; on Windows the legacy latency metrics are also emitted.
        """
        self._valid_disks = {}
        for part in psutil.disk_partitions(all=True):
            # we check all exclude conditions
            if self._exclude_disk_psutil(part):
                continue

            # Get disk metrics here to be able to exclude on total usage
            try:
                # cap the probe at 5s: disk_usage can hang on stale network mounts
                disk_usage = timeout(5)(psutil.disk_usage)(part.mountpoint)
            except TimeoutException:
                self.log.warn(
                    u"Timeout while retrieving the disk usage of `%s` mountpoint. Skipping...",
                    part.mountpoint)
                continue
            except Exception as e:
                self.log.warn("Unable to get disk metrics for %s: %s",
                              part.mountpoint, e)
                continue
            # Exclude disks with total disk size 0
            if disk_usage.total == 0:
                continue
            # For later, latency metrics
            self._valid_disks[part.device] = (part.fstype, part.mountpoint)
            self.log.debug('Passed: {0}'.format(part.device))

            # both the bare fstype tag (legacy) and the keyed filesystem: tag
            tags = [part.fstype, 'filesystem:{}'.format(part.fstype)
                    ] if self._tag_by_filesystem else []
            device_name = part.mountpoint if self._use_mount else part.device

            # legacy check names c: vs psutil name C:\\
            if Platform.is_win32():
                device_name = device_name.strip('\\').lower()
            for metric_name, metric_value in self._collect_part_metrics(
                    part, disk_usage).iteritems():
                self.gauge(metric_name,
                           metric_value,
                           tags=tags,
                           device_name=device_name)
        # And finally, latency metrics, a legacy gift from the old Windows Check
        if Platform.is_win32():
            self.collect_latency_metrics()
Example #30
0
 def testMemory(self):
     """Verify the Memory check returns the expected keys per platform."""
     global logger
     res = Memory(logger).check({})
     if Platform.is_linux():
         linux_keys = [
             "swapTotal", "swapFree", "swapPctFree", "swapUsed",
             "physTotal", "physFree", "physUsed", "physBuffers",
             "physCached", "physUsable", "physPctUsable", "physShared",
         ]
         for key in linux_keys:
             assert key in res, res
         # totals must equal the sum of their parts
         assert res["swapTotal"] == res["swapFree"] + res["swapUsed"]
         assert res["physTotal"] == res["physFree"] + res["physUsed"]
     elif sys.platform == 'darwin':
         for key in ("swapFree", "swapUsed", "physFree", "physUsed"):
             assert key in res, res
Example #31
0
    def _collect_part_metrics(self, part, usage):
        """Build the disk metric dict (kB values + in_use ratio) for one partition."""
        # For legacy reasons, the standard unit is kB.
        metrics = {
            self.METRIC_DISK.format(attr): getattr(usage, attr) / 1024.0
            for attr in ('total', 'used', 'free')
        }
        # FIXME: 6.x, use percent, a lot more logical than in_use
        metrics[self.METRIC_DISK.format('in_use')] = usage.percent / 100.0
        if Platform.is_unix():
            # Inode stats are only meaningful on unix filesystems.
            metrics.update(self._collect_inodes_metrics(part.mountpoint))

        return metrics
Example #32
0
    def collect_metrics_psutil(self):
        """Report usage gauges for every psutil partition that passes the exclusion rules."""
        self._valid_disks = {}
        for part in psutil.disk_partitions(all=self._all_partitions):
            # Skip anything matching the configured exclusion rules.
            if self._exclude_disk_psutil(part):
                continue
            # Remember the disk so latency metrics can be matched to it later.
            self._valid_disks[part.device] = (part.fstype, part.mountpoint)
            self.log.debug('Passed: {0}'.format(part.device))

            tags = [part.fstype] if self._tag_by_filesystem else []
            if self._use_mount:
                device_name = part.mountpoint
            else:
                device_name = part.device
            # legacy check names c: vs psutil name C:\\
            if Platform.is_win32():
                device_name = device_name.strip('\\').lower()
            part_metrics = self._collect_part_metrics(part)
            for name, value in part_metrics.iteritems():
                self.gauge(name, value, tags=tags, device_name=device_name)
        # And finally, latency metrics, a legacy gift from the old Windows Check
        if Platform.is_win32():
            self.collect_latency_metrics()
Example #33
0
    def check(self, instance):
        """Connect to MySQL and collect server metrics (plus system metrics on Linux)."""
        host, port, user, password, mysql_sock, defaults_file, tags, options = self._get_config(instance)

        # Credentials may come either from the instance or from a defaults file.
        if not defaults_file and (not host or not user):
            raise Exception("Mysql host and user are needed.")

        db = self._connect(host, port, mysql_sock, user, password, defaults_file)

        # Metric collection
        self._collect_metrics(host, db, tags, options)
        if Platform.is_linux():
            self._collect_system_metrics(host, db, tags)
Example #34
0
    def check(self, instance):
        """Run the network check for one instance.

        Reads the per-instance interface-exclusion and connection-state
        options, then dispatches to the platform-specific collector.
        """
        if instance is None:
            instance = {}

        self._excluded_ifaces = instance.get('excluded_interfaces', [])
        self._collect_cx_state = instance.get('collect_connection_state',
                                              False)

        self._exclude_iface_re = None
        exclude_re = instance.get('excluded_interface_re', None)
        if exclude_re:
            # Pass the value lazily (%-args) instead of pre-formatting with
            # `%`, so no work is done when debug logging is disabled.
            self.log.debug("Excluding network devices matching: %s",
                           exclude_re)
            self._exclude_iface_re = re.compile(exclude_re)

        if Platform.is_linux():
            self._check_linux(instance)
        elif Platform.is_bsd():
            self._check_bsd(instance)
        elif Platform.is_solaris():
            self._check_solaris(instance)
Example #35
0
    def check(self, instance):
        """Connect to MySQL and collect server metrics (plus system metrics on unix)."""
        host, port, user, password, mysql_sock, defaults_file, tags, options = self._get_config(instance)

        # A defaults file can stand in for explicit host/user credentials.
        missing_creds = not host or not user
        if missing_creds and not defaults_file:
            raise Exception("Mysql host and user are needed.")

        db = self._connect(host, port, mysql_sock, user, password, defaults_file)

        # Metric collection
        self._collect_metrics(host, db, tags, options)
        if Platform.is_unix():
            self._collect_system_metrics(host, db, tags)
Example #36
0
 def _supervisor_status(self):
     # Print the agent and supervisor process status (for info/flare output).
     # Python 2 `print` statements. Not implemented on Windows, where there
     # is no supervisord.
     if Platform.is_windows():
         print 'Windows - status not implemented'
     else:
         agent_exec = self._get_path_agent_exec()
         print '{0} status'.format(agent_exec)
         self._print_output_command([agent_exec, 'status'])
         supervisor_exec = self._get_path_supervisor_exec()
         print '{0} status'.format(supervisor_exec)
         # Ask supervisord for the status of every managed process.
         self._print_output_command([supervisor_exec,
                                     '-c', self._get_path_supervisor_conf(),
                                     'status'])
Example #37
0
 def _supervisor_status(self):
     # Print the agent and supervisor process status (for info/flare output).
     # Python 2 `print` statements. Not implemented on Windows, where there
     # is no supervisord.
     if Platform.is_windows():
         print 'Windows - status not implemented'
     else:
         agent_exec = self._get_path_agent_exec()
         print '{0} status'.format(agent_exec)
         self._print_output_command([agent_exec, 'status'])
         supervisor_exec = self._get_path_supervisor_exec()
         print '{0} status'.format(supervisor_exec)
         # Ask supervisord for the status of every managed process.
         self._print_output_command([supervisor_exec,
                                     '-c', self._get_path_supervisor_conf(),
                                     'status'])
Example #38
0
 def check(self, instance):
     """Get disk space/inode stats"""
     # Windows and Mac will always have psutil
     # (we have packaged for both of them)
     if not self._psutil():
         # FIXME: implement all_partitions (df -a)
         self.collect_metrics_manually()
         return
     if Platform.is_linux():
         # Honor a non-standard procfs mount point if configured.
         psutil.PROCFS_PATH = self.agentConfig.get('procfs_path', '/proc').rstrip('/')
     self.collect_metrics_psutil()
Example #39
0
    def collect_metrics_psutil(self):
        """Collect usage gauges for every mounted partition reported by psutil."""
        self._valid_disks = {}
        for part in psutil.disk_partitions(all=True):
            # we check all exclude conditions
            if self._exclude_disk_psutil(part):
                continue

            # Get disk metrics here to be able to exclude on total usage
            try:
                disk_usage = timeout(5)(psutil.disk_usage)(part.mountpoint)
            except TimeoutException:
                self.log.warn(
                    u"Timeout while retrieving the disk usage of `%s` mountpoint. Skipping...",
                    part.mountpoint
                )
                continue
            except Exception as e:
                self.log.warn("Unable to get disk metrics for %s: %s", part.mountpoint, e)
                continue
            # Exclude disks with total disk size 0
            if not disk_usage.total:
                continue
            # For later, latency metrics
            self._valid_disks[part.device] = (part.fstype, part.mountpoint)
            self.log.debug('Passed: {0}'.format(part.device))

            tags = [part.fstype] if self._tag_by_filesystem else []
            if self._use_mount:
                device_name = part.mountpoint
            else:
                device_name = part.device

            # legacy check names c: vs psutil name C:\\
            if Platform.is_win32():
                device_name = device_name.strip('\\').lower()
            part_metrics = self._collect_part_metrics(part, disk_usage)
            for name, value in part_metrics.iteritems():
                self.gauge(name, value, tags=tags,
                           device_name=device_name)
        # And finally, latency metrics, a legacy gift from the old Windows Check
        if Platform.is_win32():
            self.collect_latency_metrics()
Example #40
0
    def _add_conf_tar(self):
        """Add the (comment-stripped) agent configuration files to the flare tarball."""
        conf_path = get_config_path()
        log.info("  * {0}".format(conf_path))
        self._tar.add(self._strip_comment(conf_path),
                      os.path.join(self._prefix, "etc", "datadog.conf"))

        if not Platform.is_windows():
            # supervisor.conf lives next to datadog.conf on unix platforms.
            etc_dir = os.path.dirname(get_config_path())
            supervisor_path = os.path.join(etc_dir, "supervisor.conf")
            log.info("  * {0}".format(supervisor_path))
            self._tar.add(self._strip_comment(supervisor_path),
                          os.path.join(self._prefix, "etc", "supervisor.conf"))

        # Both active and shipped-default conf.d files are included.
        yaml_files = glob.glob(os.path.join(get_confd_path(), "*.yaml"))
        yaml_files += glob.glob(os.path.join(get_confd_path(), "*.yaml.default"))
        for file_path in yaml_files:
            self._add_clean_confd(file_path)
Example #41
0
    def testLoad(self):
        """Load check reports raw averages always, normalized ones only with cpuCores."""
        global logger
        load = Load(logger)
        res = load.check({'system_stats': get_system_stats()})
        assert 'system.load.1' in res
        if Platform.is_linux():
            cores = int(get_system_stats().get('cpuCores'))
            assert 'system.load.norm.1' in res
            # norm.1 * cores should reconstruct the raw 1-minute average.
            reconstructed = cores * res['system.load.norm.1']
            assert abs(res['system.load.1'] - reconstructed) <= 0.1, \
                (res['system.load.1'], reconstructed)

        # same test but without cpu count, no normalized load sent.
        res = load.check({})
        assert 'system.load.1' in res
        assert 'system.load.norm.1' not in res
Example #42
0
    def _add_conf_tar(self):
        """Bundle datadog.conf, supervisor.conf and conf.d yaml files into the tar."""
        conf_path = get_config_path()
        log.info("  * {0}".format(conf_path))
        self._tar.add(self._strip_comment(conf_path),
                      os.path.join(self._prefix, 'etc', 'datadog.conf'))

        if not Platform.is_windows():
            # supervisor.conf sits in the same directory as datadog.conf.
            supervisor_path = os.path.join(
                os.path.dirname(get_config_path()), 'supervisor.conf')
            log.info("  * {0}".format(supervisor_path))
            self._tar.add(self._strip_comment(supervisor_path),
                          os.path.join(self._prefix, 'etc', 'supervisor.conf'))

        pattern = os.path.join(get_confd_path(), '*.yaml')
        for file_path in glob.glob(pattern):
            self._add_clean_confd(file_path)
Example #43
0
 def testMemory(self):
     """Memory check exposes the full linux metric set, a subset on OS X."""
     global logger
     res = Memory(logger).check({})
     if Platform.is_linux():
         expected_keys = ("swapTotal", "swapFree", "swapPctFree", "swapUsed",
                          "physTotal", "physFree", "physUsed", "physBuffers",
                          "physCached", "physUsable", "physPctUsable",
                          "physShared")
         for key in expected_keys:
             # % metric is only here if total > 0
             if key == 'swapPctFree' and res['swapTotal'] == 0:
                 continue
             assert key in res, res
         # Totals must equal their free/used split.
         assert res["swapFree"] + res["swapUsed"] == res["swapTotal"]
         assert res["physFree"] + res["physUsed"] == res["physTotal"]
     elif sys.platform == 'darwin':
         for key in ("swapFree", "swapUsed", "physFree", "physUsed"):
             assert key in res, res
Example #44
0
    def check(self, agentConfig):
        """Return the 1/5/15-minute load averages as a metric dict.

        Normalized per-core values (`system.load.norm.*`) are added when
        agentConfig['system_stats']['cpuCores'] is available. Returns False
        when the load cannot be determined on this platform.
        """
        if Platform.is_linux():
            try:
                # /proc/loadavg holds the three averages on its single line;
                # `with` guarantees the handle is closed even on error.
                with open('/proc/loadavg', 'r') as load_avg_proc:
                    uptime = load_avg_proc.readlines()[0]
            except Exception:
                self.logger.exception('Cannot extract load')
                return False

        elif sys.platform in ('darwin',
                              'sunos5') or sys.platform.startswith("freebsd"):
            # Get output from uptime
            try:
                uptime = sp.Popen(['uptime'], stdout=sp.PIPE,
                                  close_fds=True).communicate()[0]
            except Exception:
                self.logger.exception('Cannot extract load')
                return False

        else:
            # Unsupported platform: the original fell through and raised a
            # NameError on `uptime`; fail explicitly instead.
            return False

        # Split out the 3 load average values
        load = [
            res.replace(',', '.')
            for res in re.findall(r'([0-9]+[\.,]\d+)', uptime)
        ]
        # Normalize load by number of cores
        try:
            cores = int(agentConfig.get('system_stats').get('cpuCores'))
            assert cores >= 1, "Cannot determine number of cores"
            # Compute a normalized load, named .load.norm to make it easy to find next to .load
            return {
                'system.load.1': float(load[0]),
                'system.load.5': float(load[1]),
                'system.load.15': float(load[2]),
                'system.load.norm.1': float(load[0]) / cores,
                'system.load.norm.5': float(load[1]) / cores,
                'system.load.norm.15': float(load[2]) / cores,
            }
        except Exception:
            # No normalized load available
            return {
                'system.load.1': float(load[0]),
                'system.load.5': float(load[1]),
                'system.load.15': float(load[2])
            }
Example #45
0
def pid_exists(pid):
    """
    Check if a pid exists.
    Lighter than psutil.pid_exists
    """
    # Delegate to psutil whenever it is available.
    if psutil:
        return psutil.pid_exists(pid)

    if Platform.is_windows():
        import ctypes  # Available from python2.5
        SYNCHRONIZE = 0x100000
        kernel32 = ctypes.windll.kernel32

        handle = kernel32.OpenProcess(SYNCHRONIZE, 0, pid)
        if handle == 0:
            return False
        kernel32.CloseHandle(handle)
        return True

    # Code from psutil._psposix.pid_exists
    # See https://github.com/giampaolo/psutil/blob/master/psutil/_psposix.py
    if pid == 0:
        # Per "man 2 kill", PID 0 targets every process in the caller's
        # process group, so don't probe it; report it as existing.
        return True
    try:
        os.kill(pid, 0)
    except OSError as err:
        if err.errno == errno.ESRCH:
            return False  # ESRCH == No such process
        if err.errno == errno.EPERM:
            return True   # EPERM means a process exists but denies access
        # "man 2 kill" lists only EINVAL, EPERM, ESRCH as possible errors;
        # anything else is unexpected, so surface it.
        raise err
    return True
Example #46
0
    def testLoad(self):
        """Raw load averages are always present; normalized ones need cpuCores."""
        global logger
        load = Load(logger)
        res = load.check({"system_stats": get_system_stats()})
        assert "system.load.1" in res
        if Platform.is_linux():
            cores = int(get_system_stats().get("cpuCores"))
            assert "system.load.norm.1" in res
            # norm.1 scaled back by the core count should match the raw value.
            reconstructed = cores * res["system.load.norm.1"]
            assert abs(res["system.load.1"] - reconstructed) <= 0.1, (
                res["system.load.1"],
                reconstructed,
            )

        # same test but without cpu count, no normalized load sent.
        res = load.check({})
        assert "system.load.1" in res
        assert "system.load.norm.1" not in res
Example #47
0
    def _add_conf_tar(self):
        """Add readable agent config files (comments stripped) to the flare tar."""
        conf_path = get_config_path()
        if self._can_read(conf_path):
            self._tar.add(self._strip_comment(conf_path),
                          os.path.join(self._prefix, 'etc', 'datadog.conf'))

        if not Platform.is_windows():
            # supervisor.conf sits next to datadog.conf on unix platforms.
            supervisor_path = os.path.join(
                os.path.dirname(get_config_path()), 'supervisor.conf')
            if self._can_read(supervisor_path):
                self._tar.add(
                    self._strip_comment(supervisor_path),
                    os.path.join(self._prefix, 'etc', 'supervisor.conf'))

        # Include both active and shipped-default conf.d files.
        confd_yamls = (glob.glob(os.path.join(get_confd_path(), '*.yaml')) +
                       glob.glob(os.path.join(get_confd_path(), '*.yaml.default')))
        for file_path in confd_yamls:
            if self._can_read(file_path, output=False):
                self._add_clean_confd(file_path)
Example #48
0
    def _load_conf(self, instance):
        """Read per-instance disk-check options, honoring legacy datadog.conf keys."""
        self._excluded_filesystems = instance.get('excluded_filesystems', [])
        self._excluded_disks = instance.get('excluded_disks', [])
        tag_by_fs = instance.get('tag_by_filesystem', False)
        self._tag_by_filesystem = _is_affirmative(tag_by_fs)
        # On Windows, we need all_partitions to True by default to collect
        # metrics about remote disks
        # On Linux, we need all_partitions to False to avoid collecting metrics
        # about nodev filesystems
        all_partitions_default = Platform.is_win32()
        self._all_partitions = _is_affirmative(
            instance.get('all_partitions', all_partitions_default))

        # FIXME: 6.x, drop use_mount option in datadog.conf
        self._load_legacy_option(instance, 'use_mount', False,
                                 operation=_is_affirmative)
        # FIXME: 6.x, drop device_blacklist_re option in datadog.conf
        self._load_legacy_option(instance, 'excluded_disk_re', '^$',
                                 legacy_name='device_blacklist_re',
                                 operation=re.compile)
Example #49
0
 def check(self, agentConfig):
     """Return the 1/5/15-minute load averages as a metric dict.

     Adds per-core normalized values when cpuCores is present in
     agentConfig['system_stats']. Returns False when load cannot be
     extracted on this platform.
     """
     if Platform.is_linux():
         try:
             # `with` guarantees the /proc handle is closed on every path.
             with open('/proc/loadavg', 'r') as load_avg_proc:
                 # readlines() provides a list but we want a string
                 uptime = load_avg_proc.readlines()[0]
         except Exception:
             self.logger.exception('Cannot extract load')
             return False

     elif sys.platform in ('darwin', 'sunos5') or sys.platform.startswith("freebsd"):
         # Get output from uptime
         try:
             uptime = sp.Popen(['uptime'],
                               stdout=sp.PIPE,
                               close_fds=True).communicate()[0]
         except Exception:
             self.logger.exception('Cannot extract load')
             return False

     else:
         # Unsupported platform: bail out explicitly instead of hitting a
         # NameError on `uptime` below (the original's failure mode).
         return False

     # Split out the 3 load average values
     load = [res.replace(',', '.') for res in re.findall(r'([0-9]+[\.,]\d+)', uptime)]
     # Normalize load by number of cores
     try:
         cores = int(agentConfig.get('system_stats').get('cpuCores'))
         assert cores >= 1, "Cannot determine number of cores"
         # Compute a normalized load, named .load.norm to make it easy to find next to .load
         return {'system.load.1': float(load[0]),
                 'system.load.5': float(load[1]),
                 'system.load.15': float(load[2]),
                 'system.load.norm.1': float(load[0])/cores,
                 'system.load.norm.5': float(load[1])/cores,
                 'system.load.norm.15': float(load[2])/cores,
                 }
     except Exception:
         # No normalized load available
         return {'system.load.1': float(load[0]),
                 'system.load.5': float(load[1]),
                 'system.load.15': float(load[2])}
Example #50
0
def get_system_stats():
    """Gather basic host facts: machine, platform, processor, python version,
    plus an OS-version entry and (where detectable) the logical CPU count.

    The original ran `Platform.is_linux` twice and duplicated the darwin and
    freebsd sysctl code; the branches are consolidated here with unchanged
    behavior.
    """
    systemStats = {
        "machine": platform.machine(),
        "platform": sys.platform,
        "processor": platform.processor(),
        "pythonV": platform.python_version(),
    }

    platf = sys.platform

    def _sysctl_cpu_count():
        # `sysctl hw.ncpu` prints "hw.ncpu: N" on BSD-like systems.
        out = subprocess.Popen(["sysctl", "hw.ncpu"], stdout=subprocess.PIPE,
                               close_fds=True).communicate()[0]
        return int(out.split(": ")[1])

    if Platform.is_linux(platf):
        # One "model name" line per logical CPU in /proc/cpuinfo.
        grep = subprocess.Popen(["grep", "model name", "/proc/cpuinfo"],
                                stdout=subprocess.PIPE, close_fds=True)
        wc = subprocess.Popen(["wc", "-l"], stdin=grep.stdout,
                              stdout=subprocess.PIPE, close_fds=True)
        systemStats["cpuCores"] = int(wc.communicate()[0])
        systemStats["nixV"] = platform.dist()

    elif Platform.is_darwin(platf):
        systemStats["cpuCores"] = _sysctl_cpu_count()
        systemStats["macV"] = platform.mac_ver()

    elif Platform.is_freebsd(platf):
        systemStats["cpuCores"] = _sysctl_cpu_count()
        version = platform.uname()[2]
        systemStats["fbsdV"] = ("freebsd", version, "")  # no codename for FreeBSD

    elif Platform.is_win32(platf):
        systemStats["winV"] = platform.win32_ver()

    return systemStats
Example #51
0
    def collect_metrics_psutil(self):
        """Emit usage gauges for every mounted partition reported by psutil.

        Python 2 code (`except Exception, e` syntax, dict.iteritems()).
        """
        self._valid_disks = {}
        for part in psutil.disk_partitions(all=True):
            # we check all exclude conditions
            if self._exclude_disk_psutil(part):
                continue
            # Get disk metrics here to be able to exclude on total usage
            try:
                disk_usage = psutil.disk_usage(part.mountpoint)
            except Exception, e:
                self.log.debug("Unable to get disk metrics for %s: %s", part.mountpoint, e)
                continue
            # Exclude disks with total disk size 0
            if disk_usage.total == 0:
                continue
            # For later, latency metrics
            self._valid_disks[part.device] = (part.fstype, part.mountpoint)
            self.log.debug("Passed: {0}".format(part.device))

            tags = [part.fstype] if self._tag_by_filesystem else []
            device_name = part.mountpoint if self._use_mount else part.device

            # Note: psutil (0.3.0 to at least 3.1.1) calculates in_use as (used / total)
            #       The problem here is that total includes reserved space the user
            #       doesn't have access to. This causes psutil to calculate a misleadng
            #       percentage for in_use; a lower percentage than df shows.

            # Calculate in_use w/o reserved space; consistent w/ df's Use% metric.
            pmets = self._collect_part_metrics(part, disk_usage)
            used = "system.disk.used"
            free = "system.disk.free"
            pmets["system.disk.in_use"] = pmets[used] / (pmets[used] + pmets[free])

            # legacy check names c: vs psutil name C:\\
            if Platform.is_win32():
                device_name = device_name.strip("\\").lower()
            for metric_name, metric_value in pmets.iteritems():
                self.gauge(metric_name, metric_value, tags=tags, device_name=device_name)
Example #52
0
class Disk(AgentCheck):
    """ Collects metrics about the machine's disks. """
    # -T for filesystem info
    DF_COMMAND = ['df', '-T']
    # Format templates for emitted metric names.
    METRIC_DISK = 'system.disk.{0}'
    METRIC_INODE = 'system.fs.inodes.{0}'

    def __init__(self, name, init_config, agentConfig, instances=None):
        """Set up the check; only a single configured instance is supported."""
        if instances is not None and len(instances) > 1:
            raise Exception(
                "Disk check only supports one configured instance.")
        AgentCheck.__init__(self,
                            name,
                            init_config,
                            agentConfig,
                            instances=instances)
        # Get the configuration once for all
        self._load_conf(instances[0])

    def check(self, instance):
        """Get disk space/inode stats"""
        # Windows and Mac will always have psutil
        # (we have packaged for both of them)
        if self._psutil():
            self.collect_metrics_psutil()
        else:
            # FIXME: implement all_partitions (df -a)
            self.collect_metrics_manually()

    @classmethod
    def _psutil(cls):
        # True when the optional psutil dependency was importable.
        return psutil is not None

    def _load_conf(self, instance):
        """Read instance options: exclusions, tagging, and legacy datadog.conf keys."""
        self._excluded_filesystems = instance.get('excluded_filesystems', [])
        self._excluded_disks = instance.get('excluded_disks', [])
        self._tag_by_filesystem = _is_affirmative(
            instance.get('tag_by_filesystem', False))
        self._all_partitions = _is_affirmative(
            instance.get('all_partitions', False))

        # Force exclusion of CDROM (iso9660) from disk check
        self._excluded_filesystems.append('iso9660')

        # FIXME: 6.x, drop use_mount option in datadog.conf
        self._load_legacy_option(instance,
                                 'use_mount',
                                 False,
                                 operation=_is_affirmative)
        # FIXME: 6.x, drop device_blacklist_re option in datadog.conf
        self._load_legacy_option(instance,
                                 'excluded_disk_re',
                                 '^$',
                                 legacy_name='device_blacklist_re',
                                 operation=re.compile)

    def _load_legacy_option(self,
                            instance,
                            option,
                            default,
                            legacy_name=None,
                            operation=lambda l: l):
        """Read `option` from the instance, falling back to a deprecated
        datadog.conf key (with a warning), then store operation(value) as
        the attribute self._<option>."""
        value = instance.get(option, default)
        legacy_name = legacy_name or option

        if value == default and legacy_name in self.agentConfig:
            self.log.warn("Using `{0}` in datadog.conf has been deprecated"
                          " in favor of `{1}` in disk.yaml".format(
                              legacy_name, option))
            value = self.agentConfig.get(legacy_name) or default
        setattr(self, '_{0}'.format(option), operation(value))

    def collect_metrics_psutil(self):
        """Emit usage gauges for every mounted partition reported by psutil.

        Python 2 code (`except Exception, e` syntax, dict.iteritems()).
        """
        self._valid_disks = {}
        for part in psutil.disk_partitions(all=True):
            # we check all exclude conditions
            if self._exclude_disk_psutil(part):
                continue
            # Get disk metrics here to be able to exclude on total usage
            try:
                disk_usage = psutil.disk_usage(part.mountpoint)
            except Exception, e:
                self.log.debug("Unable to get disk metrics for %s: %s",
                               part.mountpoint, e)
                continue
            # Exclude disks with total disk size 0
            if disk_usage.total == 0:
                continue
            # For later, latency metrics
            self._valid_disks[part.device] = (part.fstype, part.mountpoint)
            self.log.debug('Passed: {0}'.format(part.device))

            tags = [part.fstype] if self._tag_by_filesystem else []
            device_name = part.mountpoint if self._use_mount else part.device

            # Note: psutil (0.3.0 to at least 3.1.1) calculates in_use as (used / total)
            #       The problem here is that total includes reserved space the user
            #       doesn't have access to. This causes psutil to calculate a misleadng
            #       percentage for in_use; a lower percentage than df shows.

            # Calculate in_use w/o reserved space; consistent w/ df's Use% metric.
            pmets = self._collect_part_metrics(part, disk_usage)
            used = 'system.disk.used'
            free = 'system.disk.free'
            pmets['system.disk.in_use'] = pmets[used] / (pmets[used] +
                                                         pmets[free])

            # legacy check names c: vs psutil name C:\\
            if Platform.is_win32():
                device_name = device_name.strip('\\').lower()
            for metric_name, metric_value in pmets.iteritems():
                self.gauge(metric_name,
                           metric_value,
                           tags=tags,
                           device_name=device_name)
        # And finally, latency metrics, a legacy gift from the old Windows Check
        if Platform.is_win32():
            self.collect_latency_metrics()
Example #53
0
    def check(self, agentConfig):
        """Return an aggregate of CPU stats across all CPUs
        When figures are not available, False is sent back.
        """
        def format_results(us, sy, wa, idle, st):
            data = {
                'cpuUser': us,
                'cpuSystem': sy,
                'cpuWait': wa,
                'cpuIdle': idle,
                'cpuStolen': st
            }
            for key in data.keys():
                if data[key] is None:
                    del data[key]
            return data

        def get_value(legend, data, name, filter_value=None):
            "Using the legend and a metric name, get the value or None from the data line"
            if name in legend:
                value = to_float(data[legend.index(name)])
                if filter_value is not None:
                    if value > filter_value:
                        return None
                return value

            else:
                # FIXME return a float or False, would trigger type error if not python
                self.logger.debug("Cannot extract cpu value %s from %s (%s)" %
                                  (name, data, legend))
                return 0.0

        if Platform.is_linux():
            mpstat = sp.Popen(['mpstat', '1', '3'],
                              stdout=sp.PIPE,
                              close_fds=True).communicate()[0]
            # topdog@ip:~$ mpstat 1 3
            # Linux 2.6.32-341-ec2 (ip)   01/19/2012  _x86_64_  (2 CPU)
            #
            # 04:22:41 PM  CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest   %idle
            # 04:22:42 PM  all    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
            # 04:22:43 PM  all    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
            # 04:22:44 PM  all    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
            # Average:     all    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
            #
            # OR
            #
            # Thanks to Mart Visser to spotting this one.
            # blah:/etc/dd-agent# mpstat
            # Linux 2.6.26-2-xen-amd64 (atira)  02/17/2012  _x86_64_
            #
            # 05:27:03 PM  CPU    %user   %nice   %sys %iowait    %irq   %soft  %steal  %idle   intr/s
            # 05:27:03 PM  all    3.59    0.00    0.68    0.69    0.00   0.00    0.01   95.03    43.65
            #
            lines = mpstat.split("\n")
            legend = [l for l in lines if "%usr" in l or "%user" in l]
            avg = [l for l in lines if "Average" in l]
            if len(legend) == 1 and len(avg) == 1:
                headers = [
                    h for h in legend[0].split() if h not in ("AM", "PM")
                ]
                data = avg[0].split()

                # Userland
                # Debian lenny says %user so we look for both
                # One of them will be 0
                cpu_metrics = {
                    "%usr": None,
                    "%user": None,
                    "%nice": None,
                    "%iowait": None,
                    "%idle": None,
                    "%sys": None,
                    "%irq": None,
                    "%soft": None,
                    "%steal": None,
                }

                for cpu_m in cpu_metrics:
                    cpu_metrics[cpu_m] = get_value(headers,
                                                   data,
                                                   cpu_m,
                                                   filter_value=110)

                if any([v is None for v in cpu_metrics.values()]):
                    self.logger.warning("Invalid mpstat data: %s" % data)

                cpu_user = cpu_metrics["%usr"] + cpu_metrics[
                    "%user"] + cpu_metrics["%nice"]
                cpu_system = cpu_metrics["%sys"] + cpu_metrics[
                    "%irq"] + cpu_metrics["%soft"]
                cpu_wait = cpu_metrics["%iowait"]
                cpu_idle = cpu_metrics["%idle"]
                cpu_stolen = cpu_metrics["%steal"]

                return format_results(cpu_user, cpu_system, cpu_wait, cpu_idle,
                                      cpu_stolen)
            else:
                return False

        elif sys.platform == 'darwin':
            # generate 3 seconds of data
            # ['          disk0           disk1       cpu     load average', '    KB/t tps  MB/s     KB/t tps  MB/s  us sy id   1m   5m   15m', '   21.23  13  0.27    17.85   7  0.13  14  7 79  1.04 1.27 1.31', '    4.00   3  0.01     5.00   8  0.04  12 10 78  1.04 1.27 1.31', '']
            iostats = sp.Popen(['iostat', '-C', '-w', '3', '-c', '2'],
                               stdout=sp.PIPE,
                               close_fds=True).communicate()[0]
            lines = [l for l in iostats.split("\n") if len(l) > 0]
            legend = [l for l in lines if "us" in l]
            if len(legend) == 1:
                headers = legend[0].split()
                data = lines[-1].split()
                cpu_user = get_value(headers, data, "us")
                cpu_sys = get_value(headers, data, "sy")
                cpu_wait = 0
                cpu_idle = get_value(headers, data, "id")
                cpu_st = 0
                return format_results(cpu_user, cpu_sys, cpu_wait, cpu_idle,
                                      cpu_st)
            else:
                self.logger.warn(
                    "Expected to get at least 4 lines of data from iostat instead of just "
                    + str(iostats[:max(80, len(iostats))]))
                return False

        elif sys.platform.startswith("freebsd"):
            # generate 3 seconds of data
            # tty            ada0              cd0            pass0             cpu
            # tin  tout  KB/t tps  MB/s   KB/t tps  MB/s   KB/t tps  MB/s  us ni sy in id
            # 0    69 26.71   0  0.01   0.00   0  0.00   0.00   0  0.00   2  0  0  1 97
            # 0    78  0.00   0  0.00   0.00   0  0.00   0.00   0  0.00   0  0  0  0 100
            iostats = sp.Popen(['iostat', '-w', '3', '-c', '2'],
                               stdout=sp.PIPE,
                               close_fds=True).communicate()[0]
            lines = [l for l in iostats.split("\n") if len(l) > 0]
            legend = [l for l in lines if "us" in l]
            if len(legend) == 1:
                headers = legend[0].split()
                data = lines[-1].split()
                cpu_user = get_value(headers, data, "us")
                cpu_nice = get_value(headers, data, "ni")
                cpu_sys = get_value(headers, data, "sy")
                cpu_intr = get_value(headers, data, "in")
                cpu_wait = 0
                cpu_idle = get_value(headers, data, "id")
                cpu_stol = 0
                return format_results(cpu_user + cpu_nice, cpu_sys + cpu_intr,
                                      cpu_wait, cpu_idle, cpu_stol)

            else:
                self.logger.warn(
                    "Expected to get at least 4 lines of data from iostat instead of just "
                    + str(iostats[:max(80, len(iostats))]))
                return False

        elif sys.platform == 'sunos5':
            # mpstat -aq 1 2
            # SET minf mjf xcal  intr ithr  csw icsw migr smtx  srw syscl  usr sys  wt idl sze
            # 0 5239   0 12857 22969 5523 14628   73  546 4055    1 146856    5   6   0  89  24 <-- since boot
            # 1 ...
            # SET minf mjf xcal  intr ithr  csw icsw migr smtx  srw syscl  usr sys  wt idl sze
            # 0 20374   0 45634 57792 5786 26767   80  876 20036    2 724475   13  13   0  75  24 <-- past 1s
            # 1 ...
            # http://docs.oracle.com/cd/E23824_01/html/821-1462/mpstat-1m.html
            #
            # Will aggregate over all processor sets
            try:
                mpstat = sp.Popen(['mpstat', '-aq', '1', '2'],
                                  stdout=sp.PIPE,
                                  close_fds=True).communicate()[0]
                lines = [l for l in mpstat.split("\n") if len(l) > 0]
                # discard the first len(lines)/2 lines
                lines = lines[len(lines) / 2:]
                legend = [l for l in lines if "SET" in l]
                assert len(legend) == 1
                if len(legend) == 1:
                    headers = legend[0].split()
                    # collect stats for each processor set
                    # and aggregate them based on the relative set size
                    d_lines = [l for l in lines if "SET" not in l]
                    user = [
                        get_value(headers, l.split(), "usr") for l in d_lines
                    ]
                    kern = [
                        get_value(headers, l.split(), "sys") for l in d_lines
                    ]
                    wait = [
                        get_value(headers, l.split(), "wt") for l in d_lines
                    ]
                    idle = [
                        get_value(headers, l.split(), "idl") for l in d_lines
                    ]
                    size = [
                        get_value(headers, l.split(), "sze") for l in d_lines
                    ]
                    count = sum(size)
                    rel_size = [s / count for s in size]
                    dot = lambda v1, v2: reduce(operator.add,
                                                map(operator.mul, v1, v2))
                    return format_results(dot(user, rel_size),
                                          dot(kern, rel_size),
                                          dot(wait, rel_size),
                                          dot(idle, rel_size), 0.0)
            except Exception:
                self.logger.exception("Cannot compute CPU stats")
                return False
        else:
            self.logger.warn("CPUStats: unsupported platform")
            return False
Example #54
0
    def check(self, agentConfig):
        """Collect memory and swap usage metrics for the current platform.

        Dispatches to a platform-specific collector and returns a dict of
        memory stats (units are MB -- see the FIXMEs about moving to bytes),
        or False when the platform is unsupported or the underlying data
        source cannot be read.
        """
        if Platform.is_linux():
            return self._collect_linux_memory()
        elif sys.platform == 'darwin':
            return self._collect_darwin_memory()
        elif sys.platform.startswith("freebsd"):
            return self._collect_freebsd_memory()
        elif sys.platform == 'sunos5':
            return self._collect_sunos5_memory()
        else:
            return False

    def _collect_linux_memory(self):
        """Parse /proc/meminfo into physical and swap memory stats (MB)."""
        try:
            meminfoProc = open('/proc/meminfo', 'r')
            lines = meminfoProc.readlines()
            meminfoProc.close()
        except Exception:
            self.logger.exception(
                'Cannot get memory metrics from /proc/meminfo')
            return False

        # /proc/meminfo lines look like:
        #   MemTotal:        7995360 kB
        #   MemFree:         1045120 kB
        #   SwapTotal:      11120632 kB
        #   HugePages_Total:       0
        # (the HugePages_* entries carry no "kB" suffix, hence the loose
        # regexp below which only captures the name and the number)

        regexp = re.compile(
            r'^(\w+):\s+([0-9]+)'
        )  # We run this several times so one-time compile now
        meminfo = {}

        for line in lines:
            try:
                match = re.search(regexp, line)
                if match is not None:
                    meminfo[match.group(1)] = match.group(2)
            except Exception:
                self.logger.exception("Cannot parse /proc/meminfo")

        memData = {}

        # Physical memory
        # FIXME units are in MB, we should use bytes instead
        try:
            memData['physTotal'] = int(meminfo.get('MemTotal', 0)) / 1024
            memData['physFree'] = int(meminfo.get('MemFree', 0)) / 1024
            memData['physBuffers'] = int(meminfo.get('Buffers', 0)) / 1024
            memData['physCached'] = int(meminfo.get('Cached', 0)) / 1024
            memData['physShared'] = int(meminfo.get('Shmem', 0)) / 1024

            memData[
                'physUsed'] = memData['physTotal'] - memData['physFree']
            # Usable is relative since cached and buffers are actually used to speed things up.
            memData['physUsable'] = memData['physFree'] + memData[
                'physBuffers'] + memData['physCached']

            if memData['physTotal'] > 0:
                memData['physPctUsable'] = float(
                    memData['physUsable']) / float(memData['physTotal'])
        except Exception:
            self.logger.exception(
                'Cannot compute stats from /proc/meminfo')

        # Swap
        # FIXME units are in MB, we should use bytes instead
        try:
            memData['swapTotal'] = int(meminfo.get('SwapTotal', 0)) / 1024
            memData['swapFree'] = int(meminfo.get('SwapFree', 0)) / 1024

            memData[
                'swapUsed'] = memData['swapTotal'] - memData['swapFree']

            if memData['swapTotal'] > 0:
                memData['swapPctFree'] = float(
                    memData['swapFree']) / float(memData['swapTotal'])
        except Exception:
            self.logger.exception('Cannot compute swap stats')

        return memData

    def _collect_darwin_memory(self):
        """Derive physical memory from `top` and swap from `sysctl` (MB)."""
        macV = platform.mac_ver()
        # mac_ver() can yield an empty or unexpected version string; fall
        # back to the modern (10.9+) `top` layout instead of crashing with
        # AttributeError on a failed match.
        macV_match = re.match(r'10\.(\d+)\.?.*', macV[0])
        macV_minor_version = int(macV_match.group(1)) if macV_match else 9

        try:
            top = sp.Popen(['top', '-l 1'], stdout=sp.PIPE,
                           close_fds=True).communicate()[0]
            sysctl = sp.Popen(['sysctl', 'vm.swapusage'],
                              stdout=sp.PIPE,
                              close_fds=True).communicate()[0]
        except Exception:
            self.logger.exception('getMemoryUsage')
            return False

        # Deal with top: the physical-memory line index depends on the
        # agent configuration (self.topIndex)
        lines = top.split('\n')
        physParts = re.findall(r'([0-9]\d+)', lines[self.topIndex])

        # Deal with sysctl
        swapParts = re.findall(r'([0-9]+\.\d+)', sysctl)

        # Mavericks changes the layout of physical memory format in `top`
        physUsedPartIndex = 3
        physFreePartIndex = 4
        if macV_minor_version >= 9:
            physUsedPartIndex = 0
            physFreePartIndex = 2

        return {
            'physUsed': physParts[physUsedPartIndex],
            'physFree': physParts[physFreePartIndex],
            'swapUsed': swapParts[1],
            'swapFree': swapParts[2]
        }

    def _collect_freebsd_memory(self):
        """Derive memory from `sysctl vm.stats.vm` and swap from `swapinfo` (MB)."""
        try:
            sysctl = sp.Popen(['sysctl', 'vm.stats.vm'],
                              stdout=sp.PIPE,
                              close_fds=True).communicate()[0]
        except Exception:
            self.logger.exception('getMemoryUsage')
            return False

        lines = sysctl.split('\n')

        # ...
        # vm.stats.vm.v_page_size: 4096
        # vm.stats.vm.v_page_count: 759884
        # vm.stats.vm.v_wire_count: 122726
        # vm.stats.vm.v_active_count: 109350
        # vm.stats.vm.v_cache_count: 17437
        # vm.stats.vm.v_inactive_count: 479673
        # vm.stats.vm.v_free_count: 30542
        # ...

        # We run this several times so one-time compile now
        regexp = re.compile(r'^vm\.stats\.vm\.(\w+):\s+([0-9]+)')
        meminfo = {}

        for line in lines:
            try:
                match = re.search(regexp, line)
                if match is not None:
                    meminfo[match.group(1)] = match.group(2)
            except Exception:
                self.logger.exception(
                    "Cannot parse sysctl vm.stats.vm output")

        memData = {}

        # Physical memory: page counts * page size, converted to MB
        try:
            pageSize = int(meminfo.get('v_page_size'))

            memData['physTotal'] = (int(meminfo.get('v_page_count', 0)) *
                                    pageSize) / 1048576
            memData['physFree'] = (int(meminfo.get('v_free_count', 0)) *
                                   pageSize) / 1048576
            memData['physCached'] = (int(meminfo.get('v_cache_count', 0)) *
                                     pageSize) / 1048576
            # BUGFIX: the fallback value belongs inside meminfo.get();
            # it was previously passed as the *base* argument of int(),
            # which raised a TypeError whenever the key was missing.
            memData['physUsed'] = (
                (int(meminfo.get('v_active_count', 0)) +
                 int(meminfo.get('v_wire_count', 0))) * pageSize) / 1048576
            memData['physUsable'] = (
                (int(meminfo.get('v_free_count', 0)) +
                 int(meminfo.get('v_cache_count', 0)) +
                 int(meminfo.get('v_inactive_count', 0))) *
                pageSize) / 1048576

            if memData['physTotal'] > 0:
                memData['physPctUsable'] = float(
                    memData['physUsable']) / float(memData['physTotal'])
        except Exception:
            # (error message fixed: this branch reads sysctl, not /proc/meminfo)
            self.logger.exception(
                'Cannot compute stats from sysctl vm.stats.vm')

        # Swap, aggregated over all swap devices
        try:
            sysctl = sp.Popen(['swapinfo', '-m'],
                              stdout=sp.PIPE,
                              close_fds=True).communicate()[0]
        except Exception:
            self.logger.exception('getMemoryUsage')
            return False

        lines = sysctl.split('\n')

        # ...
        # Device          1M-blocks     Used    Avail Capacity
        # /dev/ad0s1b           570        0      570     0%
        # ...

        assert "Device" in lines[0]

        try:
            memData['swapTotal'] = 0
            memData['swapFree'] = 0
            memData['swapUsed'] = 0
            for line in lines[1:-1]:
                line = line.split()
                memData['swapTotal'] += int(line[1])
                memData['swapFree'] += int(line[3])
                memData['swapUsed'] += int(line[2])
        except Exception:
            self.logger.exception('Cannot compute stats from swapinfo')

        return memData

    def _collect_sunos5_memory(self):
        """Derive zone memory stats from `kstat -c zone_memory_cap` (MB)."""
        try:
            memData = {}
            kmem = sp.Popen(["kstat", "-c", "zone_memory_cap", "-p"],
                            stdout=sp.PIPE,
                            close_fds=True).communicate()[0]

            # kstat emits one "key value" pair per line, e.g.:
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:physcap   536870912  <--
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:rss       115544064  <--
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:swap      91828224   <--
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:swapcap   1073741824 <--
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:zonename  53aa9b7e-...

            # turn memory_cap:360:zone_name:key value
            # into { "key": value, ...}
            kv = [
                l.strip().split() for l in kmem.split("\n") if len(l) > 0
            ]
            entries = dict([(k.split(":")[-1], v) for (k, v) in kv])
            # extract rss, physcap, swap, swapcap, turn into MB
            convert = lambda v: int(long(v)) / 2**20
            memData["physTotal"] = convert(entries["physcap"])
            memData["physUsed"] = convert(entries["rss"])
            memData[
                "physFree"] = memData["physTotal"] - memData["physUsed"]
            memData["swapTotal"] = convert(entries["swapcap"])
            memData["swapUsed"] = convert(entries["swap"])
            memData[
                "swapFree"] = memData["swapTotal"] - memData["swapUsed"]

            if memData['swapTotal'] > 0:
                memData['swapPctFree'] = float(
                    memData['swapFree']) / float(memData['swapTotal'])
            return memData
        except Exception:
            self.logger.exception(
                "Cannot compute mem stats from kstat -c zone_memory_cap")
            return False