Example #1
    def get_tags(self, state, c_id):
        """Extract useful tags from docker or platform APIs. These are collected by default."""
        tags = []
        if Platform.is_k8s():
            pod_metadata = state.get_kube_config(c_id, 'metadata')

            if pod_metadata is None:
                log.warning("Failed to fetch pod metadata for container %s."
                            " Kubernetes tags may be missing." % c_id[:12])
                return []
            # get labels
            kube_labels = pod_metadata.get('labels', {})
            for label, value in kube_labels.iteritems():
                tags.append('%s:%s' % (label, value))

            # get replication controller
            created_by = json.loads(pod_metadata.get('annotations', {}).get('kubernetes.io/created-by', '{}'))
            if created_by.get('reference', {}).get('kind') == 'ReplicationController':
                tags.append('kube_replication_controller:%s' % created_by.get('reference', {}).get('name'))

            # get kubernetes namespace
            tags.append('kube_namespace:%s' % pod_metadata.get('namespace'))

        elif Platform.is_swarm():
            c_labels = state.inspect_container(c_id).get('Labels', {})
            swarm_svc = c_labels.get(DockerUtil.SWARM_SVC_LABEL)
            if swarm_svc:
                tags.append('swarm_service:%s' % swarm_svc)

        return tags
Example #2
    def testNetwork(self):
        # FIXME: cx_state to true, but needs sysstat installed
        config = """
init_config:

instances:
    - collect_connection_state: false
      excluded_interfaces:
        - lo
        - lo0
"""
        check, instances = get_check('network', config)

        check.check(instances[0])
        check.get_metrics()

        metric_names = [m[0] for m in check.aggregator.metrics]

        assert 'system.net.bytes_rcvd' in metric_names
        assert 'system.net.bytes_sent' in metric_names
        if Platform.is_linux():
            assert 'system.net.tcp.retrans_segs' in metric_names
            assert 'system.net.tcp.in_segs' in metric_names
            assert 'system.net.tcp.out_segs' in metric_names
        elif Platform.is_bsd():
            assert 'system.net.tcp.retrans_packs' in metric_names
            assert 'system.net.tcp.sent_packs' in metric_names
            assert 'system.net.tcp.rcv_packs' in metric_names
Example #3
    def _get_host_address(self, state, c_id, tpl_var):
        """Extract the container IP from a docker inspect object, or the kubelet API."""
        c_inspect = state.inspect_container(c_id)
        c_id = c_inspect.get('Id', '')
        c_img = self.dockerutil.image_name_extractor(c_inspect)

        networks = c_inspect.get('NetworkSettings', {}).get('Networks') or {}
        ip_dict = {}
        for net_name, net_desc in networks.iteritems():
            ip = net_desc.get('IPAddress')
            if ip:
                ip_dict[net_name] = ip
        ip_addr = self._extract_ip_from_networks(ip_dict, tpl_var)
        if ip_addr:
            return ip_addr

        if Platform.is_k8s():
            # kubernetes case
            log.debug("Couldn't find the IP address for container %s (%s), "
                      "using the kubernetes way." % (c_id[:12], c_img))
            pod_ip = state.get_kube_config(c_id, 'status').get('podIP')
            if pod_ip:
                return pod_ip

        if Platform.is_rancher():
            # try to get the rancher IP address
            log.debug("No IP address was found in container %s (%s) "
                      "trying with the Rancher label" % (c_id[:12], c_img))

            ip_addr = c_inspect.get('Config', {}).get('Labels', {}).get(RANCHER_CONTAINER_IP)
            if ip_addr:
                return ip_addr.split('/')[0]

        log.error("No IP address was found for container %s (%s)" % (c_id[:12], c_img))
        return None
Example #4
    def __init__(self, parent=None):
        QMenu.__init__(self, parent)
        self.options = {}
        system_tray_menu = [
            (self.START, lambda: agent_manager("start")),
            (self.STOP, lambda: agent_manager("stop")),
            (self.RESTART, lambda: agent_manager("restart")),
        ]
        # First the version
        self.addAction(self.ABOUT.format(get_version())).setEnabled(False)
        self.addSeparator()

        for name, action in system_tray_menu:
            self.add_option(name, action)

        # enable or disable mac login
        if Platform.is_mac():
            self.add_option(self.MAC_LOGIN.format(self.enable_or_disable_mac()),
                            lambda: self.enable_or_disable_login())
        elif Platform.is_windows():
            self.add_option(self.FLARE, lambda: thread.start_new_thread(windows_flare, ()))

        # And finally the exit
        self.add_option(self.EXIT, lambda: sys.exit(0))

        self.connect(self, SIGNAL("aboutToShow()"), lambda: self.update_options())
Example #5
    def _get_checks_to_refresh(self, state, c_id):
        """Get the list of checks applied to a container from the identifier_to_checks cache in the config store.
        Use the DATADOG_ID label or the image."""
        inspect = state.inspect_container(c_id)

        # If the container was removed we can't tell which check is concerned
        # so we have to reload everything.
        # Same thing if it's stopped and we're on Kubernetes in auto_conf mode
        # because the pod was deleted and its template could have been in the annotations.
        if not inspect or \
                (not inspect.get('State', {}).get('Running')
                    and Platform.is_k8s() and not self.agentConfig.get('sd_config_backend')):
            self.reload_check_configs = True
            return

        identifier = inspect.get('Config', {}).get('Labels', {}).get(DATADOG_ID) or \
            self.dockerutil.image_name_extractor(inspect)

        platform_kwargs = {}
        if Platform.is_k8s():
            kube_metadata = state.get_kube_config(c_id, 'metadata') or {}
            platform_kwargs = {
                'kube_annotations': kube_metadata.get('annotations'),
                'kube_container_name': state.get_kube_container_name(c_id),
            }

        return self.config_store.get_checks_to_refresh(identifier, **platform_kwargs)
Example #6
    def __init__(self, agentConfig):
        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        self.dockerutil = DockerUtil(config_store=self.config_store)
        self.docker_client = self.dockerutil.client
        if Platform.is_k8s():
            try:
                self.kubeutil = KubeUtil()
            except Exception as ex:
                self.kubeutil = None
                log.error("Couldn't instantiate the kubernetes client, "
                    "subsequent kubernetes calls will fail as well. Error: %s" % str(ex))

        if Platform.is_nomad():
            self.nomadutil = NomadUtil()
        elif Platform.is_ecs_instance():
            self.ecsutil = ECSUtil()

        self.VAR_MAPPING = {
            'host': self._get_host_address,
            'port': self._get_port,
            'tags': self._get_additional_tags,
        }

        AbstractSDBackend.__init__(self, agentConfig)
Example #7
def get_config_path(cfg_path=None, os_name=None):
    # Check if there's an override and if it exists
    if cfg_path is not None and os.path.exists(cfg_path):
        return cfg_path

    # Check if there's a config stored in the current agent directory
    try:
        path = os.path.realpath(__file__)
        path = os.path.dirname(path)
        return _config_path(path)
    except PathNotFound as e:
        pass

    # Check for an OS-specific path, continue on not-found exceptions
    bad_path = ''
    try:
        if Platform.is_windows():
            common_data = _windows_commondata_path()
            return _config_path(os.path.join(common_data, 'Datadog'))
        elif Platform.is_mac():
            return _config_path(MAC_CONFIG_PATH)
        else:
            return _config_path(UNIX_CONFIG_PATH)
    except PathNotFound as e:
        if len(e.args) > 0:
            bad_path = e.args[0]

    # If all searches fail, exit the agent with an error
    sys.stderr.write("Please supply a configuration file at %s or in the directory where "
                     "the Agent is currently deployed.\n" % bad_path)
    sys.exit(3)
Example #8
def initialize_logging(logger_name):
    try:
        logging_config = get_logging_config()

        logging.basicConfig(
            format=get_log_format(logger_name),
            level=logging_config['log_level'] or logging.INFO,
        )

        log_file = logging_config.get('%s_log_file' % logger_name)
        if log_file is not None and not logging_config['disable_file_logging']:
            # make sure the log directory is writeable
            # NOTE: the entire directory needs to be writable so that rotation works
            if os.access(os.path.dirname(log_file), os.R_OK | os.W_OK):
                file_handler = logging.handlers.RotatingFileHandler(log_file, maxBytes=LOGGING_MAX_BYTES, backupCount=1)
                formatter = logging.Formatter(get_log_format(logger_name), get_log_date_format())
                file_handler.setFormatter(formatter)

                root_log = logging.getLogger()
                root_log.addHandler(file_handler)
            else:
                sys.stderr.write("Log file is unwritable: '%s'\n" % log_file)

        # set up syslog
        if logging_config['log_to_syslog']:
            try:
                from logging.handlers import SysLogHandler

                if logging_config['syslog_host'] is not None and logging_config['syslog_port'] is not None:
                    sys_log_addr = (logging_config['syslog_host'], logging_config['syslog_port'])
                else:
                    sys_log_addr = "/dev/log"
                    # Special-case BSDs
                    if Platform.is_darwin():
                        sys_log_addr = "/var/run/syslog"
                    elif Platform.is_freebsd():
                        sys_log_addr = "/var/run/log"

                handler = SysLogHandler(address=sys_log_addr, facility=SysLogHandler.LOG_DAEMON)
                handler.setFormatter(logging.Formatter(get_syslog_format(logger_name), get_log_date_format()))
                root_log = logging.getLogger()
                root_log.addHandler(handler)
            except Exception, e:
                sys.stderr.write("Error setting up syslog: '%s'\n" % str(e))
                traceback.print_exc()

        # Setting up logging in the event viewer for windows
        if get_os() == 'windows' and logging_config['log_to_event_viewer']:
            try:
                from logging.handlers import NTEventLogHandler
                nt_event_handler = NTEventLogHandler(logger_name, get_win32service_file('windows', 'win32service.pyd'), 'Application')
                nt_event_handler.setFormatter(logging.Formatter(get_syslog_format(logger_name), get_log_date_format()))
                nt_event_handler.setLevel(logging.ERROR)
                app_log = logging.getLogger(logger_name)
                app_log.addHandler(nt_event_handler)
            except Exception, e:
                sys.stderr.write("Error setting up Event viewer logging: '%s'\n" % str(e))
                traceback.print_exc()
Example #9
    def test_check(self):
        config = {'instances': self.MYSQL_CONFIG}
        self.run_check_twice(config)

        # Test service check
        self.assertServiceCheck('mysql.can_connect', status=AgentCheck.OK,
                                tags=self.SC_TAGS, count=1)

        # Travis MySQL not running replication - FIX in flavored test.
        self.assertServiceCheck('mysql.replication.slave_running', status=AgentCheck.CRITICAL,
                                tags=self.SC_TAGS, count=1)

        ver = map(lambda x: int(x), self.service_metadata[0]['version'].split("."))
        ver = tuple(ver)

        testable_metrics = (self.STATUS_VARS + self.VARIABLES_VARS + self.INNODB_VARS
                      + self.BINLOG_VARS + self.SYSTEM_METRICS + self.SCHEMA_VARS + self.SYNTHETIC_VARS)

        if ver >= (5, 6, 0):
            testable_metrics.extend(self.PERFORMANCE_VARS)

        # Test metrics
        for mname in testable_metrics:
            # These two are currently not guaranteed outside of a Linux
            # environment.
            if mname == 'mysql.performance.user_time' and not Platform.is_linux():
                continue
            if mname == 'mysql.performance.kernel_time' and not Platform.is_linux():
                continue
            if mname == 'mysql.performance.cpu_time' and Platform.is_windows():
                continue

            if mname == 'mysql.performance.query_run_time.avg':
                self.assertMetric(mname, tags=self.METRIC_TAGS+['schema:testdb'], count=1)
            elif mname == 'mysql.info.schema.size':
                self.assertMetric(mname, tags=self.METRIC_TAGS+['schema:testdb'], count=1)
                self.assertMetric(mname, tags=self.METRIC_TAGS+['schema:information_schema'], count=1)
                self.assertMetric(mname, tags=self.METRIC_TAGS+['schema:performance_schema'], count=1)
            else:
                self.assertMetric(mname, tags=self.METRIC_TAGS, count=1)


        # Assert service metadata
        self.assertServiceMetadata(['version'], count=1)

        # test custom query metrics
        self.assertMetric('alice.age', value=25)
        self.assertMetric('bob.age', value=20)

        # test optional metrics
        self._test_optional_metrics((self.OPTIONAL_REPLICATION_METRICS
                                     + self.OPTIONAL_INNODB_VARS
                                     + self.OPTIONAL_STATUS_VARS
                                     + self.OPTIONAL_STATUS_VARS_5_6_6), 1)

        # Raises when COVERAGE=true and coverage < 100%
        self.coverage_report()
Example #10
    def get_tags(self, state, c_id):
        """Extract useful tags from docker or platform APIs. These are collected by default."""
        c_inspect = state.inspect_container(c_id)
        tags = self.dockerutil.extract_container_tags(c_inspect)

        if Platform.is_k8s():
            pod_metadata = state.get_kube_config(c_id, 'metadata')

            if pod_metadata is None:
                log.warning("Failed to fetch pod metadata for container %s."
                            " Kubernetes tags may be missing." % c_id[:12])
                return []

            # get pod labels
            kube_labels = pod_metadata.get('labels', {})
            for label, value in kube_labels.iteritems():
                tags.append('%s:%s' % (label, value))

            # get kubernetes namespace
            namespace = pod_metadata.get('namespace')
            tags.append('kube_namespace:%s' % namespace)

            # add creator tags
            creator_tags = self.kubeutil.get_pod_creator_tags(pod_metadata)
            tags.extend(creator_tags)

            # add services tags
            if self.kubeutil.collect_service_tag:
                services = self.kubeutil.match_services_for_pod(pod_metadata)
                for s in services:
                    if s is not None:
                        tags.append('kube_service:%s' % s)

        elif Platform.is_swarm():
            c_labels = c_inspect.get('Config', {}).get('Labels', {})
            swarm_svc = c_labels.get(SWARM_SVC_LABEL)
            if swarm_svc:
                tags.append('swarm_service:%s' % swarm_svc)

        elif Platform.is_rancher():
            service_name = c_inspect.get('Config', {}).get('Labels', {}).get(RANCHER_SVC_NAME)
            stack_name = c_inspect.get('Config', {}).get('Labels', {}).get(RANCHER_STACK_NAME)
            container_name = c_inspect.get('Config', {}).get('Labels', {}).get(RANCHER_CONTAINER_NAME)
            if service_name:
                tags.append('rancher_service:%s' % service_name)
            if stack_name:
                tags.append('rancher_stack:%s' % stack_name)
            if container_name:
                tags.append('rancher_container:%s' % container_name)

        if self.metadata_collector.has_detected():
            orch_tags = self.metadata_collector.get_container_tags(co=c_inspect)
            tags.extend(orch_tags)

        return tags
Example #11
    def _get_events(self):
        """Get the list of events."""
        events, changed_container_ids = self.docker_util.get_events()
        if not self._disable_net_metrics:
            self._invalidate_network_mapping_cache(events)
        if changed_container_ids and self._service_discovery:
            get_sd_backend(self.agentConfig).update_checks(changed_container_ids)
        if changed_container_ids:
            self.metadata_collector.invalidate_cache(events)
            if Platform.is_nomad():
                self.nomadutil.invalidate_cache(events)
            elif Platform.is_ecs_instance():
                self.ecsutil.invalidate_cache(events)
        return events
Example #12
    def get_tags(self, state, c_id):
        """Extract useful tags from docker or platform APIs. These are collected by default."""
        tags = []
        if Platform.is_k8s():
            pod_metadata = state.get_kube_config(c_id, 'metadata')

            if pod_metadata is None:
                log.warning("Failed to fetch pod metadata for container %s."
                            " Kubernetes tags may be missing." % c_id[:12])
                return []

            # get pod labels
            kube_labels = pod_metadata.get('labels', {})
            for label, value in kube_labels.iteritems():
                tags.append('%s:%s' % (label, value))

            # get kubernetes namespace
            namespace = pod_metadata.get('namespace')
            tags.append('kube_namespace:%s' % namespace)

            # get created-by
            created_by = json.loads(pod_metadata.get('annotations', {}).get('kubernetes.io/created-by', '{}'))
            creator_kind = created_by.get('reference', {}).get('kind')
            creator_name = created_by.get('reference', {}).get('name')

            # add creator tags
            if creator_name:
                if creator_kind == 'ReplicationController':
                    tags.append('kube_replication_controller:%s' % creator_name)
                elif creator_kind == 'DaemonSet':
                    tags.append('kube_daemon_set:%s' % creator_name)
                elif creator_kind == 'ReplicaSet':
                    tags.append('kube_replica_set:%s' % creator_name)
            else:
                log.debug('creator-name for pod %s is empty, this should not happen' % pod_metadata.get('name'))

            # FIXME haissam: for service and deployment we need to store a list of these guys
            # that we query from the apiserver and to compare their selectors with the pod labels.
            # For service it's straight forward.
            # For deployment we only need to do it if the pod creator is a ReplicaSet.
            # Details: https://kubernetes.io/docs/user-guide/deployments/#selector

        elif Platform.is_swarm():
            c_labels = state.inspect_container(c_id).get('Config', {}).get('Labels', {})
            swarm_svc = c_labels.get(SWARM_SVC_LABEL)
            if swarm_svc:
                tags.append('swarm_service:%s' % swarm_svc)

        return tags
Example #13
def is_my_process(pid):
    """
    Check if the given pid corresponds to a running process and, if psutil
    is available, check whether that process corresponds to the current
    executable
    """
    pid_existence = pid_exists(pid)

    if not psutil or not pid_existence:
        return pid_existence

    if Platform.is_windows():
        # We can't check anything else on Windows
        return True
    else:
        try:
            command = psutil.Process(pid).cmdline() or []
        except psutil.Error:
            # If we can't communicate with the process,
            # it's not an agent one
            return False
        # Check that the second arg contains (agent|dogstatsd).py
        # see http://stackoverflow.com/a/2345265
        exec_name = os.path.basename(inspect.stack()[-1][1]).lower()
        return len(command) > 1 and exec_name in command[1].lower()
Example #14
def get_subprocess_output(command, log, shell=False, stdin=None, output_expected=True):
    """
    Run the given subprocess command and return its output. Raise an Exception
    if an error occurs.
    """

    # Use tempfile, allowing a larger amount of memory. The subprocess.Popen
    # docs warn that the data read is buffered in memory. They suggest not to
    # use subprocess.PIPE if the data size is large or unlimited.
    with nested(tempfile.TemporaryFile(), tempfile.TemporaryFile()) as (stdout_f, stderr_f):
        proc = subprocess.Popen(command,
                                close_fds=not Platform.is_windows(),  # only set to True when on Unix, for WIN compatibility
                                shell=shell,
                                stdin=stdin,
                                stdout=stdout_f,
                                stderr=stderr_f)
        proc.wait()
        stderr_f.seek(0)
        err = stderr_f.read()
        if err:
            log.debug("Error while running {0} : {1}".format(" ".join(command),
                                                             err))

        stdout_f.seek(0)
        output = stdout_f.read()

    if output_expected and output is None:
        raise SubprocessOutputEmptyError("get_subprocess_output expected output but had none.")

    return (output, err, proc.returncode)
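A minimal usage sketch for the helper above; the command and the logger setup are illustrative assumptions, not taken from the surrounding code:

# Hypothetical caller: any standard logging.Logger works for the `log` argument.
import logging

log = logging.getLogger(__name__)
output, err, returncode = get_subprocess_output(['uname', '-r'], log)
if returncode == 0 and output:
    log.info("kernel release: %s", output.strip())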
Example #15
    def check(self, instance):
        btrfs_devices = {}
        excluded_devices = instance.get('excluded_devices', [])

        if Platform.is_linux():
            procfs_path = self.agentConfig.get('procfs_path', '/proc').rstrip('/')
            psutil.PROCFS_PATH = procfs_path

        for p in psutil.disk_partitions():
            if (p.fstype == 'btrfs' and p.device not in btrfs_devices
                    and p.device not in excluded_devices):
                btrfs_devices[p.device] = p.mountpoint

        if len(btrfs_devices) == 0:
            raise Exception("No btrfs device found")

        for device, mountpoint in btrfs_devices.iteritems():
            for flags, total_bytes, used_bytes in self.get_usage(mountpoint):
                replication_type, usage_type = FLAGS_MAPPER[flags]
                tags = [
                    'usage_type:{0}'.format(usage_type),
                    'replication_type:{0}'.format(replication_type),
                ]

                free = total_bytes - used_bytes
                usage = float(used_bytes) / float(total_bytes)

                self.gauge('system.disk.btrfs.total', total_bytes, tags=tags, device_name=device)
                self.gauge('system.disk.btrfs.used', used_bytes, tags=tags, device_name=device)
                self.gauge('system.disk.btrfs.free', free, tags=tags, device_name=device)
                self.gauge('system.disk.btrfs.usage', usage, tags=tags, device_name=device)
Example #16
    def _get_port(self, container_inspect, tpl_var):
        """Extract a port from a container_inspect or the k8s API given a template variable."""
        c_id = container_inspect.get('Id', '')

        try:
            ports = map(lambda x: x.split('/')[0], container_inspect['NetworkSettings']['Ports'].keys())
        except (IndexError, KeyError, AttributeError):
            # try to get ports from the docker API. Works if the image has an EXPOSE instruction
            ports = map(lambda x: x.split('/')[0], container_inspect['Config'].get('ExposedPorts', {}).keys())

            # if it failed, try with the kubernetes API
            if not ports and Platform.is_k8s():
                log.debug("Didn't find the port for container %s (%s), trying the kubernetes way." %
                          (c_id[:12], container_inspect.get('Config', {}).get('Image', '')))
                co_statuses = self._get_kube_config(c_id, 'status').get('containerStatuses', [])
                c_name = None
                for co in co_statuses:
                    if co.get('containerID', '').split('//')[-1] == c_id:
                        c_name = co.get('name')
                        break
                containers = self._get_kube_config(c_id, 'spec').get('containers', [])
                for co in containers:
                    if co.get('name') == c_name:
                        ports = map(lambda x: str(x.get('containerPort')), co.get('ports', []))
        ports = sorted(ports, key=lambda x: int(x))
        return self._extract_port_from_list(ports, tpl_var)
Example #17
    def _header_lines(self, indent, title=None):
        # Don't indent the header
        lines = self._title_lines(title)
        if self.created_seconds_ago() > 120:
            styles = ['red', 'bold']
        else:
            styles = []
        # We color it in red if the status is too old
        fields = [
            (
                style("Status date", *styles),
                style("%s (%ss ago)" % (
                    self.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                    self.created_seconds_ago()), *styles
                )
            )
        ]

        fields += [
            ("Pid", self.created_by_pid),
            ("Platform", platform.platform()),
            ("Python Version", "%s, %s" % (
                platform.python_version(),
                Platform.python_architecture())),
            ("Logs", logger_info()),
        ]

        for key, value in fields:
            l = indent + "%s: %s" % (key, value)
            lines.append(l)
        return lines + [""]
Example #18
    def _get_check_configs(self, state, c_id, identifier):
        """Retrieve configuration templates and fill them with data pulled from docker and tags."""
        platform_kwargs = {}
        if Platform.is_k8s():
            kube_metadata = state.get_kube_config(c_id, 'metadata') or {}
            platform_kwargs = {
                'kube_container_name': state.get_kube_container_name(c_id),
                'kube_annotations': kube_metadata.get('annotations'),
            }
        config_templates = self._get_config_templates(identifier, **platform_kwargs)
        if not config_templates:
            return None

        check_configs = []
        tags = self.get_tags(state, c_id)
        for config_tpl in config_templates:
            source, config_tpl = config_tpl
            check_name, init_config_tpl, instance_tpl, variables = config_tpl

            # insert tags in instance_tpl and process values for template variables
            instance_tpl, var_values = self._fill_tpl(state, c_id, instance_tpl, variables, tags)

            tpl = self._render_template(init_config_tpl or {}, instance_tpl or {}, var_values)
            if tpl and len(tpl) == 2:
                init_config, instance = tpl
                check_configs.append((source, (check_name, init_config, instance)))

        return check_configs
Example #19
    def _get_host_address(self, state, c_id, tpl_var):
        """Extract the container IP from a docker inspect object, or the kubelet API."""
        c_inspect = state.inspect_container(c_id)
        c_id, c_img = c_inspect.get('Id', ''), c_inspect.get('Config', {}).get('Image', '')

        networks = c_inspect.get('NetworkSettings', {}).get('Networks') or {}
        ip_dict = {}
        for net_name, net_desc in networks.iteritems():
            ip = net_desc.get('IPAddress')
            if ip:
                ip_dict[net_name] = ip
        ip_addr = self._extract_ip_from_networks(ip_dict, tpl_var)
        if ip_addr:
            return ip_addr

        # try to get the bridge (default) IP address
        log.debug("No IP address was found in container %s (%s) "
            "networks, trying with the IPAddress field" % (c_id[:12], c_img))
        ip_addr = c_inspect.get('NetworkSettings', {}).get('IPAddress')
        if ip_addr:
            return ip_addr

        if Platform.is_k8s():
            # kubernetes case
            log.debug("Couldn't find the IP address for container %s (%s), "
                      "using the kubernetes way." % (c_id[:12], c_img))
            pod_ip = state.get_kube_config(c_id, 'status').get('podIP')
            if pod_ip:
                return pod_ip

        log.error("No IP address was found for container %s (%s)" % (c_id[:12], c_img))
        return None
Example #20
    def check(self, agentConfig):

        if not Platform.is_linux():
            return False

        try:
            proc_location = agentConfig.get('procfs_path', '/proc').rstrip('/')
            proc_fh = "{}/sys/fs/file-nr".format(proc_location)
            with open(proc_fh, 'r') as file_handle:
                handle_contents = file_handle.readline()
        except Exception:
            self.logger.exception("Cannot extract system file handles stats")
            return False

        handle_metrics = handle_contents.split()

        # https://www.kernel.org/doc/Documentation/sysctl/fs.txt
        allocated_fh = float(handle_metrics[0])
        allocated_unused_fh = float(handle_metrics[1])
        max_fh = float(handle_metrics[2])

        num_used = allocated_fh - allocated_unused_fh
        fh_in_use = num_used / max_fh

        return {
            'system.fs.file_handles.allocated': allocated_fh,
            'system.fs.file_handles.allocated_unused': allocated_unused_fh,
            'system.fs.file_handles.in_use': fh_in_use,
            'system.fs.file_handles.used': num_used,
            'system.fs.file_handles.max': max_fh,
        }
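As a worked illustration of the arithmetic above, assume a hypothetical /proc/sys/fs/file-nr line of "3104 0 98304" (allocated, allocated-but-unused, maximum):

# Illustration only: the file-nr values below are assumed, not read from a real host.
handle_contents = "3104\t0\t98304\n"
handle_metrics = handle_contents.split()        # ['3104', '0', '98304']
allocated_fh = float(handle_metrics[0])         # 3104.0
allocated_unused_fh = float(handle_metrics[1])  # 0.0
max_fh = float(handle_metrics[2])               # 98304.0
num_used = allocated_fh - allocated_unused_fh   # 3104.0 handles in use
fh_in_use = num_used / max_fh                   # ~0.0316, i.e. about 3% of the limit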
Example #21
    def get_configs(self):
        """Get the config for all docker containers running on the host."""
        configs = {}
        state = self._make_fetch_state()
        containers = [(
            self.dockerutil.image_name_extractor(container),
            container.get('Id'), container.get('Labels')
        ) for container in self.docker_client.containers()]

        if Platform.is_k8s():
            self.kubeutil.check_services_cache_freshness()

        for image, cid, labels in containers:
            try:
                # value of the DATADOG_ID tag or the image name if the label is missing
                identifier = self.get_config_id(image, labels)
                check_configs = self._get_check_configs(state, cid, identifier) or []
                for conf in check_configs:
                    source, (check_name, init_config, instance) = conf

                    # build instances list if needed
                    if configs.get(check_name) is None:
                        configs[check_name] = (source, (init_config, [instance]))
                    else:
                        conflict_init_msg = 'Different versions of `init_config` found for check {}. ' \
                            'Keeping the first one found.'
                        if configs[check_name][1][0] != init_config:
                            log.warning(conflict_init_msg.format(check_name))
                        configs[check_name][1][1].append(instance)
            except Exception:
                log.exception('Building config for container %s based on image %s using service '
                              'discovery failed, leaving it alone.' % (cid[:12], image))
        return configs
Example #22
    def _report_performance_metrics(self, containers_by_id):

        containers_without_proc_root = []
        for container in containers_by_id.itervalues():
            if self._is_container_excluded(container) or not self._is_container_running(container):
                continue

            tags = self._get_tags(container, PERFORMANCE)

            try:
                self._report_cgroup_metrics(container, tags)
                if "_proc_root" not in container:
                    containers_without_proc_root.append(DockerUtil.container_name_extractor(container)[0])
                    continue
                self._report_net_metrics(container, tags)
            except BogusPIDException as e:
                self.log.warning('Unable to report cgroup metrics: %s', e)

        if containers_without_proc_root:
            message = "Couldn't find pid directory for containers: {0}. They'll be missing network metrics".format(
                ", ".join(containers_without_proc_root))
            if not Platform.is_k8s():
                self.warning(message)
            else:
                # On kubernetes, this is kind of expected. Network metrics will be collected by the kubernetes integration anyway
                self.log.debug(message)
Example #23
def get_checks():
    checks = {}
    conf_d_directory = get_confd_path()

    for filename in sorted(os.listdir(conf_d_directory)):
        module_name, ext = osp.splitext(filename)
        if Platform.is_windows():
            excluded_checks = EXCLUDED_WINDOWS_CHECKS
        else:
            excluded_checks = EXCLUDED_MAC_CHECKS
        if filename.split(".")[0] in excluded_checks:
            continue
        if ext not in (".yaml", ".example", ".disabled"):
            continue

        agent_check = AgentCheck(filename, ext, conf_d_directory)
        if (
            agent_check.enabled
            or agent_check.module_name not in checks
            or (not agent_check.is_example and not checks[agent_check.module_name].enabled)
        ):
            checks[agent_check.module_name] = agent_check

    checks_list = checks.values()
    checks_list.sort(key=lambda c: c.module_name)

    return checks_list
Example #24
    def upload(self, email=None):
        self._check_size()

        if self._cmdline:
            self._ask_for_confirmation()

        if not email:
            email = self._ask_for_email()

        log.info("Uploading {0} to Datadog Support".format(self._tar_path))
        url = self._url
        if self._case_id:
            url = '{0}/{1}'.format(self._url, str(self._case_id))
        url = "{0}?api_key={1}".format(url, self._api_key)
        requests_options = {
            'data': {
                'case_id': self._case_id,
                'hostname': self._hostname,
                'email': email
            },
            'files': {'flare_file': open(self._tar_path, 'rb')},
            'timeout': self.TIMEOUT
        }
        if Platform.is_windows():
            requests_options['verify'] = os.path.realpath(os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                os.pardir, os.pardir,
                'datadog-cert.pem'
            ))

        self._resp = requests.post(url, **requests_options)
        self._analyse_result()
        return self._case_id
Example #25
    def _add_conf_tar(self):
        conf_path = get_config_path()
        if self._can_read(conf_path):
            self._add_file_tar(
                self._strip_comment(conf_path),
                os.path.join('etc', 'datadog.conf'),
                original_file_path=conf_path
            )

        if not Platform.is_windows():
            supervisor_path = os.path.join(
                os.path.dirname(get_config_path()),
                'supervisor.conf'
            )
            if self._can_read(supervisor_path):
                self._add_file_tar(
                    self._strip_comment(supervisor_path),
                    os.path.join('etc', 'supervisor.conf'),
                    original_file_path=supervisor_path
                )

        for file_path in glob.glob(os.path.join(get_confd_path(), '*.yaml')) +\
                glob.glob(os.path.join(get_confd_path(), '*.yaml.default')):
            if self._can_read(file_path, output=False):
                self._add_clean_confd(file_path)
Example #26
    def _add_conf_tar(self):
        conf_path = get_config_path()
        if self._can_read(conf_path, output=False):
            self._add_clean_conf(
                conf_path,
                'etc',
                self.MAIN_CREDENTIALS
            )

        if not Platform.is_windows():
            supervisor_path = os.path.join(
                os.path.dirname(get_config_path()),
                'supervisor.conf'
            )
            if self._can_read(supervisor_path, output=False):
                self._add_clean_conf(
                    supervisor_path,
                    'etc'
                )

        for file_path in glob.glob(os.path.join(get_confd_path(), '*.yaml')) +\
                glob.glob(os.path.join(get_confd_path(), '*.yaml.default')):
            if self._can_read(file_path, output=False):
                self._add_clean_conf(
                    file_path,
                    os.path.join('etc', 'confd'),
                    self.CHECK_CREDENTIALS
                )
Example #27
    def _host_matches_node(self, primary_addrs):
        """ For < 0.19, check if the current host matches the IP given in the
            cluster nodes check `/_cluster/nodes`. Uses `ip addr` on Linux and
            `ifconfig` on Mac
        """
        if Platform.is_darwin():
            ifaces = subprocess.Popen(['ifconfig'], stdout=subprocess.PIPE)
        else:
            ifaces = subprocess.Popen(['ip', 'addr'], stdout=subprocess.PIPE)
        grepper = subprocess.Popen(
            ['grep', 'inet'], stdin=ifaces.stdout, stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)

        ifaces.stdout.close()
        out, err = grepper.communicate()

        # Capture the list of interface IPs
        ips = []
        for iface in out.split("\n"):
            iface = iface.strip()
            if iface:
                ips.append(iface.split(' ')[1].split('/')[0])

        # Check the interface addresses against the primary address
        return primary_addrs in ips
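For illustration, this is how the split above behaves on a single grep-filtered line (the sample `ip addr` output is assumed, not captured from a host):

# Hypothetical line as produced by `ip addr | grep inet` on Linux.
iface = "inet 192.168.1.10/24 brd 192.168.1.255 scope global eth0"
ip = iface.strip().split(' ')[1].split('/')[0]  # -> '192.168.1.10'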
Example #28
    def __init__(self, name, init_config, agentConfig, instances=None):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        # ad stands for access denied
        # We cache the PIDs getting this error and don't iterate on them
        # more often than `access_denied_cache_duration`
        # This cache is for all PIDs so it's global, but it should
        # be refreshed by instance
        self.last_ad_cache_ts = {}
        self.ad_cache = set()
        self.access_denied_cache_duration = int(
            init_config.get("access_denied_cache_duration", DEFAULT_AD_CACHE_DURATION)
        )

        # By default cache the PID list for a while
        # Sometimes it's not wanted b/c it can mess with no-data monitoring
        # This cache is indexed per instance
        self.last_pid_cache_ts = {}
        self.pid_cache = {}
        self.pid_cache_duration = int(init_config.get("pid_cache_duration", DEFAULT_PID_CACHE_DURATION))

        if Platform.is_linux():
            procfs_path = init_config.get("procfs_path")
            if procfs_path:
                psutil.PROCFS_PATH = procfs_path

        # Process cache, indexed by instance
        self.process_cache = defaultdict(dict)
Example #29
    def init(self):
        try:
            instance = self.instances[0]

            self.docker_util = DockerUtil()
            self.docker_client = self.docker_util.client
            self.docker_gateway = DockerUtil.get_gateway()

            if Platform.is_k8s():
                self.kubeutil = KubeUtil()
            # We configure the check with the right cgroup settings for this host
            # Just needs to be done once
            self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
            self.cgroup_listing_retries = 0
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # Set tagging options
            self.custom_tags = instance.get("tags", [])
            self.collect_labels_as_tags = instance.get("collect_labels_as_tags", [])
            self.kube_labels = {}

            self.use_histogram = _is_affirmative(instance.get('use_histogram', False))
            performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

            self.tag_names = {
                CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: performance_tags,
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
            }

            # Set filtering settings
            if not instance.get("exclude"):
                self._filtering_enabled = False
                if instance.get("include"):
                    self.log.warning("You must specify an exclude section to enable filtering")
            else:
                self._filtering_enabled = True
                include = instance.get("include", [])
                exclude = instance.get("exclude", [])
                self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(include, exclude)
                self.tag_names[FILTERED] = _filtered_tag_names

            # Other options
            self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
            self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
            self.collect_events = _is_affirmative(instance.get('collect_events', True))
            self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
            self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False))
            self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()

            self.ecs_tags = {}

        except Exception as e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
        else:
            self.init_success = True
Example #30
def get_hostname(config=None):
    """
    Get the canonical host name this agent should identify as. This is
    the authoritative source of the host name for the agent.

    Tries, in order:

      * agent config (datadog.conf, "hostname:")
      * 'hostname -f' (on unix)
      * socket.gethostname()
    """
    hostname = None

    # first, try the config
    if config is None:
        from config import get_config
        config = get_config(parse_args=True)
    config_hostname = config.get('hostname')
    if config_hostname and is_valid_hostname(config_hostname):
        return config_hostname

    #Try to get GCE instance name
    if hostname is None:
        gce_hostname = GCE.get_hostname(config)
        if gce_hostname is not None:
            if is_valid_hostname(gce_hostname):
                return gce_hostname
    # then move on to os-specific detection
    if hostname is None:
        def _get_hostname_unix():
            try:
                # try fqdn
                p = subprocess.Popen(['/bin/hostname', '-f'], stdout=subprocess.PIPE)
                out, err = p.communicate()
                if p.returncode == 0:
                    return out.strip()
            except Exception:
                return None

        os_name = get_os()
        if os_name in ['mac', 'freebsd', 'linux', 'solaris']:
            unix_hostname = _get_hostname_unix()
            if unix_hostname and is_valid_hostname(unix_hostname):
                hostname = unix_hostname

    # if we have an ec2 default hostname, see if there's an instance-id available
    if (Platform.is_ecs_instance()) or (hostname is not None and True in [hostname.lower().startswith(p) for p in [u'ip-', u'domu']]):
        instanceid = EC2.get_instance_id(config)
        if instanceid:
            hostname = instanceid

    # fall back on socket.gethostname(), socket.getfqdn() is too unreliable
    if hostname is None:
        try:
            socket_hostname = socket.gethostname()
        except socket.error, e:
            socket_hostname = None
        if socket_hostname and is_valid_hostname(socket_hostname):
            hostname = socket_hostname
Example #31
def get_jmx_status_path():
    if Platform.is_win32():
        path = os.path.join(_windows_commondata_path(), 'Datadog')
    else:
        path = tempfile.gettempdir()
    return path
Example #32
    def _start(self, path_to_java, java_run_opts, jmx_checks, command,
               reporter, tools_jar_path, custom_jar_paths,
               redirect_std_streams):
        if reporter is None:
            statsd_host = self.agent_config.get('bind_host', 'localhost')
            if statsd_host == "0.0.0.0":
                # If statsd is bound to all interfaces, just use localhost for clients
                statsd_host = "localhost"
            statsd_port = self.agent_config.get('dogstatsd_port', "8125")
            reporter = "statsd:%s:%s" % (statsd_host, statsd_port)

        log.info("Starting jmxfetch:")
        try:
            path_to_java = path_to_java or "java"
            java_run_opts = java_run_opts or ""
            path_to_jmxfetch = self._get_path_to_jmxfetch()
            path_to_status_file = JMXFiles.get_status_file_path()

            classpath = path_to_jmxfetch
            if tools_jar_path is not None:
                classpath = r"%s%s%s" % (tools_jar_path, os.pathsep, classpath)
            if custom_jar_paths:
                classpath = r"%s%s%s" % (os.pathsep.join(custom_jar_paths),
                                         os.pathsep, classpath)
            if self.config_jar_path:
                classpath = r"%s%s%s" % (self.config_jar_path, os.pathsep,
                                         classpath)

            subprocess_args = [
                path_to_java,  # Path to the java bin
                '-classpath',
                classpath,
                JMXFETCH_MAIN_CLASS,
                '--check_period',
                str(self.check_frequency *
                    1000),  # Period of the main loop of jmxfetch in ms
                '--conf_directory',
                r"%s" % self.
                confd_path,  # Path of the conf.d directory that will be read by jmxfetch,
                '--log_level',
                JAVA_LOGGING_LEVEL.get(
                    self.logging_config.get("log_level"), "INFO"
                ),  # Log Level: Mapping from Python log level to log4j log levels
                '--log_location',
                r"%s" % self.logging_config.get(
                    'jmxfetch_log_file'),  # Path of the log file
                '--reporter',
                reporter,  # Reporter to use
                '--status_location',
                r"%s" %
                path_to_status_file,  # Path to the status file to write
                command,  # Name of the command
            ]

            if Platform.is_windows():
                # Signal handlers are not supported on Windows:
                # use a file to trigger JMXFetch exit instead
                path_to_exit_file = JMXFiles.get_python_exit_file_path()
                subprocess_args.insert(
                    len(subprocess_args) - 1, '--exit_file_location')
                subprocess_args.insert(
                    len(subprocess_args) - 1, path_to_exit_file)

            if self.service_discovery:
                pipe_path = get_jmx_pipe_path()
                subprocess_args.insert(4, '--tmp_directory')
                subprocess_args.insert(5, pipe_path)
                subprocess_args.insert(4, '--sd_pipe')
                subprocess_args.insert(5, SD_PIPE_NAME)
                subprocess_args.insert(4, '--sd_enabled')

            if self.pool_size:
                subprocess_args.insert(4, '--thread_pool_size')
                subprocess_args.insert(5, self.pool_size)
            if self.reconnection_pool_size:
                subprocess_args.insert(4, '--reconnection_thread_pool_size')
                subprocess_args.insert(5, self.reconnection_pool_size)
            if self.collection_to:
                subprocess_args.insert(4, '--collection_timeout')
                subprocess_args.insert(5, self.collection_to)
            if self.reconnection_to:
                subprocess_args.insert(4, '--reconnection_timeout')
                subprocess_args.insert(5, self.reconnection_to)

            if jmx_checks:
                subprocess_args.insert(4, '--check')
                for check in jmx_checks:
                    subprocess_args.insert(5, check)

            # Specify a maximum memory allocation pool for the JVM
            if "Xmx" not in java_run_opts and "XX:MaxHeapSize" not in java_run_opts:
                java_run_opts += _JVM_DEFAULT_SD_MAX_MEMORY_ALLOCATION if self.service_discovery else _JVM_DEFAULT_MAX_MEMORY_ALLOCATION
            # Specify the initial memory allocation pool for the JVM
            if "Xms" not in java_run_opts and "XX:InitialHeapSize" not in java_run_opts:
                java_run_opts += _JVM_DEFAULT_INITIAL_MEMORY_ALLOCATION

            for opt in java_run_opts.split():
                subprocess_args.insert(1, opt)

            log.info("Running %s" % " ".join(subprocess_args))
            return self.execute(subprocess_args, redirect_std_streams)

        except OSError:
            java_path_msg = "Couldn't launch JMXTerm. Is Java in your PATH ?"
            log.exception(java_path_msg)
            invalid_checks = {}
            for check in jmx_checks:
                check_name = check.split('.')[0]
                check_name = check_name.encode('ascii', 'ignore')
                invalid_checks[check_name] = java_path_msg
            JMXFiles.write_status_file(invalid_checks)
            raise
        except Exception:
            log.info("unable to launch JMXFetch")
            raise
Example #33
    def run(self, config=None):
        """Main loop of the collector"""

        # Gracefully exit on sigterm.
        signal.signal(signal.SIGTERM, self._handle_sigterm)

        if not Platform.is_windows():
            # A SIGUSR1 signals an exit with an autorestart
            signal.signal(signal.SIGUSR1, self._handle_sigusr1)

            # Handle Keyboard Interrupt
            signal.signal(signal.SIGINT, self._handle_sigterm)

            # A SIGHUP signals a configuration reload
            signal.signal(signal.SIGHUP, self._handle_sighup)
        else:
            sdk_integrations = get_sdk_integration_paths()
            for name, path in sdk_integrations.iteritems():
                lib_path = os.path.join(path, 'lib')
                if os.path.exists(lib_path):
                    sys.path.append(lib_path)

        # Save the agent start-up stats.
        CollectorStatus().persist()

        # Intialize the collector.
        if not config:
            try:
                config = get_config(parse_args=True)
            except:
                log.warning("Failed to load configuration")
                sys.exit(2)

        self._agentConfig = self._set_agent_config_hostname(config)
        hostname = get_hostname(self._agentConfig)
        systemStats = get_system_stats(proc_path=self._agentConfig.get(
            'procfs_path', '/proc').rstrip('/'))
        emitters = self._get_emitters()

        # Initialize service discovery
        if self._agentConfig.get('service_discovery'):
            self.sd_backend = get_sd_backend(self._agentConfig)

        if self.sd_backend and _is_affirmative(
                self._agentConfig.get('sd_jmx_enable', False)):
            pipe_path = get_jmx_pipe_path()
            if Platform.is_windows():
                pipe_name = pipe_path.format(pipename=SD_PIPE_NAME)
            else:
                pipe_name = os.path.join(pipe_path, SD_PIPE_NAME)

            if os.access(pipe_path, os.W_OK):
                if not os.path.exists(pipe_name):
                    os.mkfifo(pipe_name)
                self.sd_pipe = os.open(
                    pipe_name, os.O_RDWR)  # RW to avoid blocking (will only W)

                # Initialize Supervisor proxy
                self.supervisor_proxy = self._get_supervisor_socket(
                    self._agentConfig)
            else:
                log.debug(
                    'Unable to create pipe in temporary directory. JMX service discovery disabled.'
                )

        # Load the checks.d checks
        self._checksd = load_check_directory(self._agentConfig, hostname)

        # Load JMX configs if available
        if self._jmx_service_discovery_enabled:
            self.sd_pipe_jmx_configs(hostname)

        # Initialize the Collector
        self.collector = Collector(self._agentConfig, emitters, systemStats,
                                   hostname)

        # In developer mode, the number of runs to be included in a single collector profile
        try:
            self.collector_profile_interval = int(
                self._agentConfig.get('collector_profile_interval',
                                      DEFAULT_COLLECTOR_PROFILE_INTERVAL))
        except ValueError:
            log.warn('collector_profile_interval is invalid. '
                     'Using default value instead (%s).' %
                     DEFAULT_COLLECTOR_PROFILE_INTERVAL)
            self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

        # Configure the watchdog.
        self.check_frequency = int(self._agentConfig['check_freq'])
        watchdog = self._get_watchdog(self.check_frequency)

        # Initialize the auto-restarter
        self.restart_interval = int(
            self._agentConfig.get('restart_interval', RESTART_INTERVAL))
        self.agent_start = time.time()

        self.allow_profiling = _is_affirmative(
            self._agentConfig.get('allow_profiling', True))

        profiled = False
        collector_profiled_runs = 0

        # Run the main loop.
        while self.run_forever:
            # Setup profiling if necessary
            if self.allow_profiling and self.in_developer_mode and not profiled:
                try:
                    profiler = AgentProfiler()
                    profiler.enable_profiling()
                    profiled = True
                except Exception as e:
                    log.warn("Cannot enable profiler: %s" % str(e))

            if self.reload_configs_flag:
                if isinstance(self.reload_configs_flag, set):
                    self.reload_configs(
                        checks_to_reload=self.reload_configs_flag)
                else:
                    self.reload_configs()

            # JMXFetch restarts should prompt re-piping *all* JMX configs
            if self._jmx_service_discovery_enabled and \
                    (not self.reload_configs_flag or isinstance(self.reload_configs_flag, set)):
                try:
                    jmx_launch = JMXFetch._get_jmx_launchtime()
                    if self.last_jmx_piped and self.last_jmx_piped < jmx_launch:
                        self.sd_pipe_jmx_configs(hostname)
                except Exception as e:
                    log.debug("could not stat JMX lunch file: %s", e)

            # Do the work. Pass `configs_reloaded` to let the collector know if it needs to
            # look for the AgentMetrics check and pop it out.
            self.collector.run(
                checksd=self._checksd,
                start_event=self.start_event,
                configs_reloaded=True if self.reload_configs_flag else False)

            self.reload_configs_flag = False

            # Look for change in the config template store.
            # The self.sd_backend.reload_check_configs flag is set
            # to True if a config reload is needed.
            if self._agentConfig.get('service_discovery') and self.sd_backend and \
               not self.sd_backend.reload_check_configs:
                try:
                    self.sd_backend.reload_check_configs = get_config_store(
                        self._agentConfig).crawl_config_template()
                except Exception as e:
                    log.warn(
                        'Something went wrong while looking for config template changes: %s'
                        % str(e))

            # Check if we should run service discovery
            # The `reload_check_configs` flag can be set through the docker_daemon check or
            # using ConfigStore.crawl_config_template
            if self._agentConfig.get('service_discovery') and self.sd_backend and \
               self.sd_backend.reload_check_configs:
                self.reload_configs_flag = self.sd_backend.reload_check_configs
                self.sd_backend.reload_check_configs = False

            if profiled:
                if collector_profiled_runs >= self.collector_profile_interval:
                    try:
                        profiler.disable_profiling()
                        profiled = False
                        collector_profiled_runs = 0
                    except Exception as e:
                        log.warn("Cannot disable profiler: %s" % str(e))

            # Check if we should restart.
            if self.autorestart and self._should_restart():
                self._do_restart()

            # Only plan for next loop if we will continue, otherwise exit quickly.
            if self.run_forever:
                if watchdog:
                    watchdog.reset()
                if profiled:
                    collector_profiled_runs += 1
                log.debug("Sleeping for {0} seconds".format(
                    self.check_frequency))
                time.sleep(self.check_frequency)

        # Now clean-up.
        try:
            CollectorStatus.remove_latest_status()
        except Exception:
            pass

        # Explicitly kill the process, because it might be running as a daemon.
        log.info("Exiting. Bye bye.")
        sys.exit(0)
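A minimal standalone sketch (helper names are illustrative, not the agent's API) of the reload dispatch at the top of the loop above: the reload flag may be the boolean True for a full reload, or a set of check names for a targeted one.

def reload_configs(checks_to_reload=None):
    # Pretend reload: just report what would be reloaded.
    if checks_to_reload is None:
        return "reloaded all checks"
    return "reloaded: %s" % ", ".join(sorted(checks_to_reload))

def handle_reload_flag(flag):
    if not flag:
        return "nothing to reload"
    if isinstance(flag, set):
        return reload_configs(checks_to_reload=flag)
    return reload_configs()

print(handle_reload_flag(False))               # nothing to reload
print(handle_reload_flag(True))                # full reload
print(handle_reload_flag({'redis', 'nginx'}))  # targeted reload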
Example #34
0
import logging
import traceback
from types import ListType, TupleType

# 3p
try:
    import psutil
except ImportError:
    psutil = None
import yaml

# project
from checks import check_status
from util import get_hostname, get_next_id, LaconicFilter, yLoader
from utils.platform import Platform
from utils.profile import pretty_statistics
if Platform.is_windows():
    from utils.debug import run_check  # noqa - windows debug purpose

log = logging.getLogger(__name__)

# Default methods run when collecting info about the agent in developer mode
DEFAULT_PSUTIL_METHODS = ['get_memory_info', 'get_io_counters']

AGENT_METRICS_CHECK_NAME = 'agent_metrics'


# Konstants
class CheckException(Exception):
    pass

Example #35
0
    def test_is_k8s(self):
        os.unsetenv('KUBERNETES_PORT')
        self.assertFalse(Platform.is_k8s())
        os.environ['KUBERNETES_PORT'] = '999'
        self.assertTrue(Platform.is_k8s())
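A plausible sketch (not the actual utils.platform implementation) of a detection helper consistent with the test above, which only toggles the KUBERNETES_PORT environment variable:

import os

def is_k8s():
    # Assume Kubernetes whenever the service env var is present.
    return 'KUBERNETES_PORT' in os.environ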
Example #36
0
    def check(self, agentConfig):
        """Return an aggregate of CPU stats across all CPUs
        When figures are not available, False is sent back.
        """
        def format_results(us, sy, wa, idle, st, guest=None):
            data = {
                'cpuUser': us,
                'cpuSystem': sy,
                'cpuWait': wa,
                'cpuIdle': idle,
                'cpuStolen': st,
                'cpuGuest': guest
            }
            return dict((k, v) for k, v in data.iteritems() if v is not None)

        def get_value(legend, data, name, filter_value=None):
            "Using the legend and a metric name, get the value or None from the data line"
            if name in legend:
                value = to_float(data[legend.index(name)])
                if filter_value is not None:
                    if value > filter_value:
                        return None
                return value

            else:
                # FIXME return a float or False, would trigger type error if not python
                self.logger.debug("Cannot extract cpu value %s from %s (%s)" %
                                  (name, data, legend))
                return 0.0

        try:
            if Platform.is_linux():
                output, _, _ = get_subprocess_output(['mpstat', '1', '3'],
                                                     self.logger)
                mpstat = output.splitlines()
                # topdog@ip:~$ mpstat 1 3
                # Linux 2.6.32-341-ec2 (ip)   01/19/2012  _x86_64_  (2 CPU)
                #
                # 04:22:41 PM  CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest   %idle
                # 04:22:42 PM  all    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
                # 04:22:43 PM  all    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
                # 04:22:44 PM  all    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
                # Average:     all    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
                #
                # OR
                #
                # Thanks to Mart Visser to spotting this one.
                # blah:/etc/dd-agent# mpstat
                # Linux 2.6.26-2-xen-amd64 (atira)  02/17/2012  _x86_64_
                #
                # 05:27:03 PM  CPU    %user   %nice   %sys %iowait    %irq   %soft  %steal  %idle   intr/s
                # 05:27:03 PM  all    3.59    0.00    0.68    0.69    0.00   0.00    0.01   95.03    43.65
                #
                legend = [l for l in mpstat if "%usr" in l or "%user" in l]
                avg = [l for l in mpstat if "Average" in l]
                if len(legend) == 1 and len(avg) == 1:
                    headers = [
                        h for h in legend[0].split() if h not in ("AM", "PM")
                    ]
                    data = avg[0].split()

                    # Userland
                    # Debian lenny says %user so we look for both
                    # One of them will be 0
                    cpu_metrics = {
                        "%usr": None,
                        "%user": None,
                        "%nice": None,
                        "%iowait": None,
                        "%idle": None,
                        "%sys": None,
                        "%irq": None,
                        "%soft": None,
                        "%steal": None,
                        "%guest": None
                    }

                    for cpu_m in cpu_metrics:
                        cpu_metrics[cpu_m] = get_value(headers,
                                                       data,
                                                       cpu_m,
                                                       filter_value=110)

                    if any([v is None for v in cpu_metrics.values()]):
                        self.logger.warning("Invalid mpstat data: %s" % data)
                        return False

                    cpu_user = cpu_metrics["%usr"] + cpu_metrics[
                        "%user"] + cpu_metrics["%nice"]
                    cpu_system = cpu_metrics["%sys"] + cpu_metrics[
                        "%irq"] + cpu_metrics["%soft"]
                    cpu_wait = cpu_metrics["%iowait"]
                    cpu_idle = cpu_metrics["%idle"]
                    cpu_stolen = cpu_metrics["%steal"]
                    cpu_guest = cpu_metrics["%guest"]

                    return format_results(cpu_user, cpu_system, cpu_wait,
                                          cpu_idle, cpu_stolen, cpu_guest)
                else:
                    return False

            elif sys.platform == 'darwin':
                # generate 3 seconds of data
                # ['          disk0           disk1       cpu     load average', '    KB/t tps  MB/s     KB/t tps  MB/s  us sy id   1m   5m   15m', '   21.23  13  0.27    17.85   7  0.13  14  7 79  1.04 1.27 1.31', '    4.00   3  0.01     5.00   8  0.04  12 10 78  1.04 1.27 1.31', '']
                iostats, _, _ = get_subprocess_output(
                    ['iostat', '-C', '-w', '3', '-c', '2'], self.logger)
                lines = [l for l in iostats.splitlines() if len(l) > 0]
                legend = [l for l in lines if "us" in l]
                if len(legend) == 1:
                    headers = legend[0].split()
                    data = lines[-1].split()
                    cpu_user = get_value(headers, data, "us")
                    cpu_sys = get_value(headers, data, "sy")
                    cpu_wait = 0
                    cpu_idle = get_value(headers, data, "id")
                    cpu_st = 0
                    return format_results(cpu_user, cpu_sys, cpu_wait,
                                          cpu_idle, cpu_st)
                else:
                    self.logger.warn(
                        "Expected to get at least 4 lines of data from iostat instead of just "
                        + str(iostats[:min(80, len(iostats))]))
                    return False

            elif sys.platform.startswith("freebsd"):
                # generate 3 seconds of data
                # tty            ada0              cd0            pass0             cpu
                # tin  tout  KB/t tps  MB/s   KB/t tps  MB/s   KB/t tps  MB/s  us ni sy in id
                # 0    69 26.71   0  0.01   0.00   0  0.00   0.00   0  0.00   2  0  0  1 97
                # 0    78  0.00   0  0.00   0.00   0  0.00   0.00   0  0.00   0  0  0  0 100
                iostats, _, _ = get_subprocess_output(
                    ['iostat', '-w', '3', '-c', '2'], self.logger)
                lines = [l for l in iostats.splitlines() if len(l) > 0]
                legend = [l for l in lines if "us" in l]
                if len(legend) == 1:
                    headers = legend[0].split()
                    data = lines[-1].split()
                    cpu_user = get_value(headers, data, "us")
                    cpu_nice = get_value(headers, data, "ni")
                    cpu_sys = get_value(headers, data, "sy")
                    cpu_intr = get_value(headers, data, "in")
                    cpu_wait = 0
                    cpu_idle = get_value(headers, data, "id")
                    cpu_stol = 0
                    return format_results(cpu_user + cpu_nice,
                                          cpu_sys + cpu_intr, cpu_wait,
                                          cpu_idle, cpu_stol)

                else:
                    self.logger.warn(
                        "Expected to get at least 4 lines of data from iostat instead of just "
                        + str(iostats[:min(80, len(iostats))]))
                    return False

            elif sys.platform == 'sunos5':
                # mpstat -aq 1 2
                # SET minf mjf xcal  intr ithr  csw icsw migr smtx  srw syscl  usr sys  wt idl sze
                # 0 5239   0 12857 22969 5523 14628   73  546 4055    1 146856    5   6   0  89  24 <-- since boot
                # 1 ...
                # SET minf mjf xcal  intr ithr  csw icsw migr smtx  srw syscl  usr sys  wt idl sze
                # 0 20374   0 45634 57792 5786 26767   80  876 20036    2 724475   13  13   0  75  24 <-- past 1s
                # 1 ...
                # http://docs.oracle.com/cd/E23824_01/html/821-1462/mpstat-1m.html
                #
                # Will aggregate over all processor sets
                output, _, _ = get_subprocess_output(
                    ['mpstat', '-aq', '1', '2'], self.logger)
                mpstat = output.splitlines()
                lines = [l for l in mpstat if len(l) > 0]
                # discard the first len(lines)/2 lines
                lines = lines[len(lines) / 2:]
                legend = [l for l in lines if "SET" in l]
                assert len(legend) == 1
                if len(legend) == 1:
                    headers = legend[0].split()
                    # collect stats for each processor set
                    # and aggregate them based on the relative set size
                    d_lines = [l for l in lines if "SET" not in l]
                    user = [
                        get_value(headers, l.split(), "usr") for l in d_lines
                    ]
                    kern = [
                        get_value(headers, l.split(), "sys") for l in d_lines
                    ]
                    wait = [
                        get_value(headers, l.split(), "wt") for l in d_lines
                    ]
                    idle = [
                        get_value(headers, l.split(), "idl") for l in d_lines
                    ]
                    size = [
                        get_value(headers, l.split(), "sze") for l in d_lines
                    ]
                    count = sum(size)
                    rel_size = [s / count for s in size]
                    dot = lambda v1, v2: reduce(operator.add,
                                                map(operator.mul, v1, v2))
                    return format_results(dot(user, rel_size),
                                          dot(kern, rel_size),
                                          dot(wait, rel_size),
                                          dot(idle, rel_size), 0.0)
            else:
                self.logger.warn("CPUStats: unsupported platform")
                return False
        except Exception:
            self.logger.exception("Cannot compute CPU stats")
            return False
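A worked sketch of the processor-set aggregation in the sunos5 branch above: each set's values are combined through a dot product with the sets' relative sizes, so larger sets contribute proportionally more to the final figure.

import operator
from functools import reduce  # `reduce` is a builtin on Python 2

def weighted_aggregate(values, sizes):
    count = float(sum(sizes))
    rel_size = [s / count for s in sizes]
    return reduce(operator.add, map(operator.mul, values, rel_size))

# Two processor sets: 24 CPUs at 13% user time, 8 CPUs at 25% user time.
print(weighted_aggregate([13.0, 25.0], [24, 8]))  # -> 16.0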
Example #37
0
def get_config(parse_args=True, cfg_path=None, options=None):
    if parse_args:
        options, _ = get_parsed_args()

    # General config
    agentConfig = {
        'check_freq': DEFAULT_CHECK_FREQUENCY,
        'monitorstatsd_port': 8125,
        'monitorstatsd_target': 'http://localhost:17123',
        'graphite_listen_port': None,
        'hostname': None,
        'listen_port': None,
        'tags': None,
        'use_ec2_instance_id': False,  # DEPRECATED
        'version': get_version(),
        'watchmonitor': True,
        'additional_checksd': '/etc/monitor-agent/checks.d/',
        'bind_host': get_default_bind_host(),
        'statsd_metric_namespace': None,
        'utf8_decoding': False
    }

    if Platform.is_mac():
        agentConfig['additional_checksd'] = '/opt/datadog-agent/etc/checks.d'

    # Config handling
    try:
        # Find the right config file
        path = os.path.realpath(__file__)
        path = os.path.dirname(path)

        config_path = get_config_path(cfg_path, os_name=get_os())
        config = ConfigParser.ConfigParser()
        config.readfp(skip_leading_wsp(open(config_path)))

        # bulk import
        for option in config.options('Main'):
            agentConfig[option] = config.get('Main', option)

        # Store developer mode setting in the agentConfig
        if config.has_option('Main', 'developer_mode'):
            agentConfig['developer_mode'] = _is_affirmative(
                config.get('Main', 'developer_mode'))

        # Allow an override with the --profile option
        if options is not None and options.profile:
            agentConfig['developer_mode'] = True

        # Get check frequency
        if config.has_option("Main", "frequency"):
            agentConfig['check_freq'] = config.get("Main", "frequency")

        #
        # Core config
        #

        # FIXME unnecessarily complex
        agentConfig['use_forwarder'] = False
        if options is not None and options.use_forwarder:
            listen_port = 17123
            if config.has_option('Main', 'listen_port'):
                listen_port = int(config.get('Main', 'listen_port'))
            agentConfig['m_url'] = "http://" + agentConfig[
                'bind_host'] + ":" + str(listen_port)
            agentConfig['use_forwarder'] = True
        elif options is not None and not options.disable_dd and options.m_url:
            agentConfig['m_url'] = options.m_url
        else:
            agentConfig['m_url'] = config.get('Main', 'm_url')
        if agentConfig['m_url'].endswith('/'):
            agentConfig['m_url'] = agentConfig['m_url'][:-1]

        # Extra checks.d path
        # the linux directory is set by default
        if config.has_option('Main', 'additional_checksd'):
            agentConfig['additional_checksd'] = config.get(
                'Main', 'additional_checksd')
        elif get_os() == 'windows':
            # default windows location
            common_path = _windows_commondata_path()
            agentConfig['additional_checksd'] = os.path.join(
                common_path, 'Datamonitor', 'checks.d')

        if config.has_option('Main', 'use_monitorstatsd'):
            agentConfig['use_monitorstatsd'] = config.get(
                'Main', 'use_monitorstatsd').lower() in ("yes", "true")
        else:
            agentConfig['use_monitorstatsd'] = True

        # Concerns only Windows
        if config.has_option('Main', 'use_web_info_page'):
            agentConfig['use_web_info_page'] = config.get(
                'Main', 'use_web_info_page').lower() in ("yes", "true")
        else:
            agentConfig['use_web_info_page'] = True

        # Which API key to use
        agentConfig['api_key'] = config.get('Main', 'api_key')

        # local traffic only? Default to no
        agentConfig['non_local_traffic'] = False
        if config.has_option('Main', 'non_local_traffic'):
            agentConfig['non_local_traffic'] = config.get(
                'Main', 'non_local_traffic').lower() in ("yes", "true")

        # DEPRECATED
        if config.has_option('Main', 'use_ec2_instance_id'):
            use_ec2_instance_id = config.get('Main', 'use_ec2_instance_id')
            # translate yes into True, the rest into False
            agentConfig['use_ec2_instance_id'] = (
                use_ec2_instance_id.lower() == 'yes')

        if config.has_option('Main', 'check_freq'):
            try:
                agentConfig['check_freq'] = int(
                    config.get('Main', 'check_freq'))
            except Exception:
                pass

        # Custom histogram aggregate/percentile metrics
        if config.has_option('Main', 'histogram_aggregates'):
            agentConfig['histogram_aggregates'] = get_histogram_aggregates(
                config.get('Main', 'histogram_aggregates'))

        if config.has_option('Main', 'histogram_percentiles'):
            agentConfig['histogram_percentiles'] = get_histogram_percentiles(
                config.get('Main', 'histogram_percentiles'))

        # Disable Watchmonitor (optionally)
        if config.has_option('Main', 'watchmonitor'):
            if config.get('Main', 'watchmonitor').lower() in ('no', 'false'):
                agentConfig['watchmonitor'] = False

        # Optional graphite listener
        if config.has_option('Main', 'graphite_listen_port'):
            agentConfig['graphite_listen_port'] = \
                int(config.get('Main', 'graphite_listen_port'))
        else:
            agentConfig['graphite_listen_port'] = None

        # monitorstatsd config
        monitorstatsd_defaults = {
            'monitorstatsd_port': 8125,
            'monitorstatsd_target': 'http://' + agentConfig['bind_host'] + ':17123',
        }
        for key, value in monitorstatsd_defaults.iteritems():
            if config.has_option('Main', key):
                agentConfig[key] = config.get('Main', key)
            else:
                agentConfig[key] = value

        # Create app:xxx tags based on monitored apps
        agentConfig['create_dd_check_tags'] = config.has_option('Main', 'create_dd_check_tags') and \
            _is_affirmative(config.get('Main', 'create_dd_check_tags'))

        # Forwarding to external statsd server
        if config.has_option('Main', 'statsd_forward_host'):
            agentConfig['statsd_forward_host'] = config.get(
                'Main', 'statsd_forward_host')
            if config.has_option('Main', 'statsd_forward_port'):
                agentConfig['statsd_forward_port'] = int(
                    config.get('Main', 'statsd_forward_port'))

        # optionally send monitorstatsd data directly to the agent.
        if config.has_option('Main', 'monitorstatsd_use_murl'):
            if _is_affirmative(config.get('Main', 'monitorstatsd_use_murl')):
                agentConfig['monitorstatsd_target'] = agentConfig['m_url']

        # Optional config
        # FIXME not the prettiest code ever...
        if config.has_option('Main', 'use_mount'):
            agentConfig['use_mount'] = _is_affirmative(
                config.get('Main', 'use_mount'))

        if options is not None and options.autorestart:
            agentConfig['autorestart'] = True
        elif config.has_option('Main', 'autorestart'):
            agentConfig['autorestart'] = _is_affirmative(
                config.get('Main', 'autorestart'))

        if config.has_option('Main', 'check_timings'):
            agentConfig['check_timings'] = _is_affirmative(
                config.get('Main', 'check_timings'))

        if config.has_option('Main', 'exclude_process_args'):
            agentConfig['exclude_process_args'] = _is_affirmative(
                config.get('Main', 'exclude_process_args'))

        try:
            filter_device_re = config.get('Main', 'device_blacklist_re')
            agentConfig['device_blacklist_re'] = re.compile(filter_device_re)
        except ConfigParser.NoOptionError:
            pass

        if config.has_option('datamonitor', 'ddforwarder_log'):
            agentConfig['has_datamonitor'] = True

        # monitorstream config
        if config.has_option("Main", "monitorstream_log"):
            # Older version, single log support
            log_path = config.get("Main", "monitorstream_log")
            if config.has_option("Main", "monitorstream_line_parser"):
                agentConfig["monitorstreams"] = ':'.join([
                    log_path,
                    config.get("Main", "monitorstream_line_parser")
                ])
            else:
                agentConfig["monitorstreams"] = log_path

        elif config.has_option("Main", "monitorstreams"):
            agentConfig["monitorstreams"] = config.get("Main",
                                                       "monitorstreams")

        if config.has_option("Main", "nagios_perf_cfg"):
            agentConfig["nagios_perf_cfg"] = config.get(
                "Main", "nagios_perf_cfg")

        if config.has_option("Main", "use_curl_http_client"):
            agentConfig["use_curl_http_client"] = _is_affirmative(
                config.get("Main", "use_curl_http_client"))
        else:
            # Default to False as there are some issues with the curl client and ELB
            agentConfig["use_curl_http_client"] = False

        if config.has_section('WMI'):
            agentConfig['WMI'] = {}
            for key, value in config.items('WMI'):
                agentConfig['WMI'][key] = value

        if (config.has_option("Main", "limit_memory_consumption") and
                config.get("Main", "limit_memory_consumption") is not None):
            agentConfig["limit_memory_consumption"] = int(
                config.get("Main", "limit_memory_consumption"))
        else:
            agentConfig["limit_memory_consumption"] = None

        if config.has_option("Main", "skip_ssl_validation"):
            agentConfig["skip_ssl_validation"] = _is_affirmative(
                config.get("Main", "skip_ssl_validation"))

        agentConfig["collect_instance_metadata"] = True
        if config.has_option("Main", "collect_instance_metadata"):
            agentConfig["collect_instance_metadata"] = _is_affirmative(
                config.get("Main", "collect_instance_metadata"))

        agentConfig["proxy_forbid_method_switch"] = False
        if config.has_option("Main", "proxy_forbid_method_switch"):
            agentConfig["proxy_forbid_method_switch"] = _is_affirmative(
                config.get("Main", "proxy_forbid_method_switch"))

        agentConfig["collect_ec2_tags"] = False
        if config.has_option("Main", "collect_ec2_tags"):
            agentConfig["collect_ec2_tags"] = _is_affirmative(
                config.get("Main", "collect_ec2_tags"))

        agentConfig["utf8_decoding"] = False
        if config.has_option("Main", "utf8_decoding"):
            agentConfig["utf8_decoding"] = _is_affirmative(
                config.get("Main", "utf8_decoding"))

        agentConfig["gce_updated_hostname"] = False
        if config.has_option("Main", "gce_updated_hostname"):
            agentConfig["gce_updated_hostname"] = _is_affirmative(
                config.get("Main", "gce_updated_hostname"))

    except (ConfigParser.NoSectionError, ConfigParser.ParsingError):
        sys.stderr.write('Config file not found or incorrectly formatted.\n')
        sys.exit(2)

    except ConfigParser.NoOptionError as e:
        sys.stderr.write(
            'There are some items missing from your config file, but nothing fatal [%s]\n'
            % e)

    # Storing proxy settings in the agentConfig
    agentConfig['proxy_settings'] = get_proxy(agentConfig)
    if agentConfig.get('ca_certs', None) is None:
        agentConfig['ssl_certificate'] = get_ssl_certificate(
            get_os(), 'datamonitor-cert.pem')
    else:
        agentConfig['ssl_certificate'] = agentConfig['ca_certs']

    # self-updater related config
    if config.has_option('Main', 'updater_interval'):
        agentConfig['interval'] = config.get('Main', 'updater_interval')

    return agentConfig
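A minimal sketch of the affirmative-string convention the parsing above applies to many boolean options (the helper name is illustrative; the real helper may accept more spellings): an option counts as enabled when it reads "yes" or "true", case-insensitively.

def is_affirmative(value):
    # Mirrors the `.lower() in ("yes", "true")` tests used above.
    return str(value).strip().lower() in ("yes", "true")

print(is_affirmative("Yes"))    # True
print(is_affirmative("false"))  # False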
Example #38
0
    def _check_linux(self, instance):
        """
        _check_linux can be run inside a container and still collects the network metrics from the host
        For that procfs_path can be set to something like "/host/proc"
        When a custom procfs_path is set, the collect_connection_state option is ignored
        """
        proc_location = self.agentConfig.get('procfs_path',
                                             '/proc').rstrip('/')

        if Platform.is_containerized() and proc_location != "/proc":
            proc_location = "%s/1" % proc_location

        if self._is_collect_cx_state_runnable(proc_location):
            try:
                self.log.debug("Using `ss` to collect connection state")
                # Try using `ss` for increased performance over `netstat`
                for ip_version in ['4', '6']:
                    for protocol in ['tcp', 'udp']:
                        # Call `ss` for each IP version because there's no built-in way of distinguishing
                        # between the IP versions in the output
                        # Also calls `ss` for each protocol, because on some systems (e.g. Ubuntu 14.04), there is a
                        # bug that prints `tcp` even if it's `udp`
                        output, _, _ = get_subprocess_output([
                            "ss", "-n", "-{0}".format(protocol[0]), "-a",
                            "-{0}".format(ip_version)
                        ], self.log)
                        lines = output.splitlines()

                        # State      Recv-Q Send-Q     Local Address:Port       Peer Address:Port
                        # UNCONN     0      0              127.0.0.1:8125                  *:*
                        # ESTAB      0      0              127.0.0.1:37036         127.0.0.1:8125
                        # UNCONN     0      0        fe80::a00:27ff:fe1c:3c4:123          :::*
                        # TIME-WAIT  0      0          90.56.111.177:56867        46.105.75.4:143
                        # LISTEN     0      0       ::ffff:127.0.0.1:33217  ::ffff:127.0.0.1:7199
                        # ESTAB      0      0       ::ffff:127.0.0.1:58975  ::ffff:127.0.0.1:2181

                        metrics = self._parse_linux_cx_state(
                            lines[1:],
                            self.tcp_states['ss'],
                            0,
                            protocol=protocol,
                            ip_version=ip_version)
                        # Only send the metrics which match the loop iteration's ip version
                        for stat, metric in self.cx_state_gauge.iteritems():
                            if stat[0].endswith(ip_version) and stat[
                                    0].startswith(protocol):
                                self.gauge(metric, metrics.get(metric))

            except OSError:
                self.log.info("`ss` not found: using `netstat` as a fallback")
                output, _, _ = get_subprocess_output(
                    ["netstat", "-n", "-u", "-t", "-a"], self.log)
                lines = output.splitlines()
                # Active Internet connections (w/o servers)
                # Proto Recv-Q Send-Q Local Address           Foreign Address         State
                # tcp        0      0 46.105.75.4:80          79.220.227.193:2032     SYN_RECV
                # tcp        0      0 46.105.75.4:143         90.56.111.177:56867     ESTABLISHED
                # tcp        0      0 46.105.75.4:50468       107.20.207.175:443      TIME_WAIT
                # tcp6       0      0 46.105.75.4:80          93.15.237.188:58038     FIN_WAIT2
                # tcp6       0      0 46.105.75.4:80          79.220.227.193:2029     ESTABLISHED
                # udp        0      0 0.0.0.0:123             0.0.0.0:*
                # udp6       0      0 :::41458                :::*

                metrics = self._parse_linux_cx_state(
                    lines[2:], self.tcp_states['netstat'], 5)
                for metric, value in metrics.iteritems():
                    self.gauge(metric, value)
            except SubprocessOutputEmptyError:
                self.log.exception("Error collecting connection stats.")

        proc_dev_path = "{}/net/dev".format(proc_location)
        with open(proc_dev_path, 'r') as proc:
            lines = proc.readlines()
        # Inter-|   Receive                                                 |  Transmit
        #  face |bytes     packets errs drop fifo frame compressed multicast|bytes       packets errs drop fifo colls carrier compressed
        #     lo:45890956   112797   0    0    0     0          0         0    45890956   112797    0    0    0     0       0          0
        #   eth0:631947052 1042233   0   19    0   184          0      1206  1208625538  1320529    0    0    0     0       0          0
        #   eth1:       0        0   0    0    0     0          0         0           0        0    0    0    0     0       0          0
        for l in lines[2:]:
            cols = l.split(':', 1)
            x = cols[1].split()
            # Filter inactive interfaces
            if self._parse_value(x[0]) or self._parse_value(x[8]):
                iface = cols[0].strip()
                metrics = {
                    'bytes_rcvd': self._parse_value(x[0]),
                    'bytes_sent': self._parse_value(x[8]),
                    'packets_in.count': self._parse_value(x[1]),
                    'packets_in.error': self._parse_value(x[2]) + self._parse_value(x[3]),
                    'packets_out.count': self._parse_value(x[9]),
                    'packets_out.error': self._parse_value(x[10]) + self._parse_value(x[11]),
                }
                self._submit_devicemetrics(iface, metrics)

        netstat_data = {}
        for f in ['netstat', 'snmp']:
            proc_data_path = "{}/net/{}".format(proc_location, f)
            try:
                with open(proc_data_path, 'r') as netstat:
                    while True:
                        n_header = netstat.readline()
                        if not n_header:
                            break  # No more? Abort!
                        n_data = netstat.readline()

                        h_parts = n_header.strip().split(' ')
                        h_values = n_data.strip().split(' ')
                        ns_category = h_parts[0][:-1]
                        netstat_data[ns_category] = {}
                        # Turn the data into a dictionary
                        for idx, hpart in enumerate(h_parts[1:]):
                            netstat_data[ns_category][hpart] = h_values[idx + 1]
            except IOError:
                # On Openshift, /proc/net/snmp is only readable by root
                self.log.debug("Unable to read %s.", proc_data_path)

        nstat_metrics_names = {
            'Tcp': {
                'RetransSegs': 'system.net.tcp.retrans_segs',
                'InSegs': 'system.net.tcp.in_segs',
                'OutSegs': 'system.net.tcp.out_segs',
            },
            'TcpExt': {
                'ListenOverflows': 'system.net.tcp.listen_overflows',
                'ListenDrops': 'system.net.tcp.listen_drops',
                'TCPBacklogDrop': 'system.net.tcp.backlog_drops',
                'TCPRetransFail': 'system.net.tcp.failed_retransmits',
            },
            'Udp': {
                'InDatagrams': 'system.net.udp.in_datagrams',
                'NoPorts': 'system.net.udp.no_ports',
                'InErrors': 'system.net.udp.in_errors',
                'OutDatagrams': 'system.net.udp.out_datagrams',
                'RcvbufErrors': 'system.net.udp.rcv_buf_errors',
                'SndbufErrors': 'system.net.udp.snd_buf_errors',
                'InCsumErrors': 'system.net.udp.in_csum_errors'
            }
        }

        # Submit the known TCP/UDP counters as rates
        for k in nstat_metrics_names:
            for met in nstat_metrics_names[k]:
                if met in netstat_data.get(k, {}):
                    self.rate(nstat_metrics_names[k][met],
                              self._parse_value(netstat_data[k][met]))
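A standalone sketch of the /proc/net/{netstat,snmp} parsing done above: those files alternate a header line with a matching value line, and each pair becomes one nested dictionary keyed by the protocol name (the sample content below is made up).

sample = (
    "Tcp: RetransSegs InSegs OutSegs\n"
    "Tcp: 5 1000 900\n"
)

def parse_proc_net(text):
    data = {}
    lines = text.splitlines()
    for header, values in zip(lines[::2], lines[1::2]):
        h_parts = header.strip().split(' ')
        h_values = values.strip().split(' ')
        category = h_parts[0][:-1]  # drop the trailing ':'
        data[category] = dict(zip(h_parts[1:], h_values[1:]))
    return data

print(parse_proc_net(sample))
# {'Tcp': {'RetransSegs': '5', 'InSegs': '1000', 'OutSegs': '900'}}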
Example #39
0
    def test_relocated_procfs(self):
        from utils.platform import Platform
        import tempfile
        import shutil
        import uuid

        already_linux = Platform.is_linux()
        unique_process_name = str(uuid.uuid4())
        my_procfs = tempfile.mkdtemp()

        def _fake_procfs(arg, root=my_procfs):
            for key, val in arg.iteritems():
                path = os.path.join(root, key)
                if isinstance(val, dict):
                    os.mkdir(path)
                    _fake_procfs(val, path)
                else:
                    with open(path, "w") as f:
                        f.write(str(val))

        _fake_procfs({
            '1': {
                'status': ("Name:\t%s\nThreads:\t1\n") % unique_process_name,
                'stat': ('1 (%s) S 0 1 1 ' + ' 0' * 46) % unique_process_name,
                'cmdline': unique_process_name,
            },
            'stat': ("cpu  13034 0 18596 380856797 2013 2 2962 0 0 0\n"
                     "btime 1448632481\n"),
        })

        config = {
            'init_config': {
                'procfs_path': my_procfs
            },
            'instances': [{
                'name': 'moved_procfs',
                'search_string': [unique_process_name],
                'exact_match': False,
                'ignored_denied_access': True,
                'thresholds': {
                    'warning': [1, 10],
                    'critical': [1, 100]
                },
            }]
        }

        version = int(psutil.__version__.replace(".", ""))
        try:

            def import_mock(name,
                            i_globals={},
                            i_locals={},
                            fromlist=[],
                            level=-1,
                            orig_import=__import__):
                # _psutil_linux and _psutil_posix are the C bindings; use a mock for those
                if name in ('_psutil_linux', '_psutil_posix'
                            ) or level >= 1 and ('_psutil_linux' in fromlist or
                                                 '_psutil_posix' in fromlist):
                    m = MagicMock()
                    # the import system will ask us for our own name
                    m._psutil_linux = m
                    m._psutil_posix = m
                    # there's a version safety check in psutil/__init__.py; this skips it
                    m.version = version
                    return m
                return orig_import(name, i_globals, i_locals, fromlist, level)

            # contextlib.nested is deprecated in favor of with MGR1, MGR2, ... etc, but we have too many mocks to fit on one line and apparently \ line
            # continuation is not flake8 compliant, even when semantically required (as here). Patch is unlikely to throw errors that are suppressed, so
            # the main downside of contextlib is avoided.
            with contextlib.nested(
                    patch('sys.platform', 'linux'),
                    patch('socket.AF_PACKET', create=True),
                    patch('__builtin__.__import__', side_effect=import_mock)):
                if not already_linux:
                    # Reloading psutil fails on linux, but we only need to do so if we didn't start out on a linux platform
                    reload(psutil)
                assert Platform.is_linux()

                self.run_check(
                    config,
                    mocks={'get_pagefault_stats': noop_get_pagefault_stats})
        finally:
            shutil.rmtree(my_procfs)
            if not already_linux:
                # restore the original psutil that doesn't have our mocks
                reload(psutil)
            else:
                psutil.PROCFS_PATH = '/proc'

        expected_tags = self.generate_expected_tags(config['instances'][0])
        self.assertServiceCheckOK('process.up',
                                  count=1,
                                  tags=expected_tags +
                                  ['process:moved_procfs'])

        self.assertMetric('system.processes.number',
                          at_least=1,
                          tags=expected_tags)
        self.assertMetric('system.processes.threads',
                          at_least=1,
                          tags=expected_tags)
        self.assertMetric('system.processes.run_time.avg',
                          at_least=1,
                          tags=expected_tags)
        self.assertMetric('system.processes.run_time.max',
                          at_least=1,
                          tags=expected_tags)
        self.assertMetric('system.processes.run_time.min',
                          at_least=1,
                          tags=expected_tags)

        self.coverage_report()
Example #40
0
    def _check_bsd(self, instance):
        netstat_flags = ['-i', '-b']

        # FreeBSD's netstat truncates device names unless you pass '-W'
        if Platform.is_freebsd():
            netstat_flags.append('-W')

        try:
            output, _, _ = get_subprocess_output(["netstat"] + netstat_flags,
                                                 self.log)
            lines = output.splitlines()
            # Name  Mtu   Network       Address            Ipkts Ierrs     Ibytes    Opkts Oerrs     Obytes  Coll
            # lo0   16384 <Link#1>                        318258     0  428252203   318258     0  428252203     0
            # lo0   16384 localhost   fe80:1::1           318258     -  428252203   318258     -  428252203     -
            # lo0   16384 127           localhost         318258     -  428252203   318258     -  428252203     -
            # lo0   16384 localhost   ::1                 318258     -  428252203   318258     -  428252203     -
            # gif0* 1280  <Link#2>                             0     0          0        0     0          0     0
            # stf0* 1280  <Link#3>                             0     0          0        0     0          0     0
            # en0   1500  <Link#4>    04:0c:ce:db:4e:fa 20801309     0 13835457425 15149389     0 11508790198     0
            # en0   1500  seneca.loca fe80:4::60c:ceff: 20801309     - 13835457425 15149389     - 11508790198     -
            # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
            # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
            # en0   1500  192.168.1     192.168.1.63    20801309     - 13835457425 15149389     - 11508790198     -
            # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
            # p2p0  2304  <Link#5>    06:0c:ce:db:4e:fa        0     0          0        0     0          0     0
            # ham0  1404  <Link#6>    7a:79:05:4d:bf:f5    30100     0    6815204    18742     0    8494811     0
            # ham0  1404  5             5.77.191.245       30100     -    6815204    18742     -    8494811     -
            # ham0  1404  seneca.loca fe80:6::7879:5ff:    30100     -    6815204    18742     -    8494811     -
            # ham0  1404  2620:9b::54 2620:9b::54d:bff5    30100     -    6815204    18742     -    8494811     -

            headers = lines[0].split()

            # Given the irregular structure of the table above, better to parse from the end of each line
            # Verify headers first
            #          -7       -6       -5        -4       -3       -2        -1
            for h in ("Ipkts", "Ierrs", "Ibytes", "Opkts", "Oerrs", "Obytes",
                      "Coll"):
                if h not in headers:
                    self.log.error("%s not found in %s; cannot parse" %
                                   (h, headers))
                    return False

            current = None
            for l in lines[1:]:
                # Another header row, abort now, this is IPv6 land
                if "Name" in l:
                    break

                x = l.split()
                if len(x) == 0:
                    break

                iface = x[0]
                if iface.endswith("*"):
                    iface = iface[:-1]
                if iface == current:
                    # skip multiple lines of same interface
                    continue
                else:
                    current = iface

                # Filter inactive interfaces
                if self._parse_value(x[-5]) or self._parse_value(x[-2]):
                    iface = current
                    metrics = {
                        'bytes_rcvd': self._parse_value(x[-5]),
                        'bytes_sent': self._parse_value(x[-2]),
                        'packets_in.count': self._parse_value(x[-7]),
                        'packets_in.error': self._parse_value(x[-6]),
                        'packets_out.count': self._parse_value(x[-4]),
                        'packets_out.error': self._parse_value(x[-3]),
                    }
                    self._submit_devicemetrics(iface, metrics)
        except SubprocessOutputEmptyError:
            self.log.exception("Error collecting connection stats.")

        try:
            netstat, _, _ = get_subprocess_output(
                ["netstat", "-s", "-p", "tcp"], self.log)
            #3651535 packets sent
            #        972097 data packets (615753248 bytes)
            #        5009 data packets (2832232 bytes) retransmitted
            #        0 resends initiated by MTU discovery
            #        2086952 ack-only packets (471 delayed)
            #        0 URG only packets
            #        0 window probe packets
            #        310851 window update packets
            #        336829 control packets
            #        0 data packets sent after flow control
            #        3058232 checksummed in software
            #        3058232 segments (571218834 bytes) over IPv4
            #        0 segments (0 bytes) over IPv6
            #4807551 packets received
            #        1143534 acks (for 616095538 bytes)
            #        165400 duplicate acks
            #        ...

            self._submit_regexed_values(netstat, BSD_TCP_METRICS)
        except SubprocessOutputEmptyError:
            self.log.exception("Error collecting TCP stats.")
Example #41
0
    def __init__(self, name, init_config, agentConfig, instances=None):
        """
        Initialize a new check.

        :param name: The name of the check
        :param init_config: The config for initializing the check
        :param agentConfig: The global configuration for the agent
        :param instances: A list of configuration objects for each instance.
        """
        from aggregator import MetricsAggregator

        self._enabled_checks.append(name)
        self._enabled_checks = list(set(self._enabled_checks))

        self.name = name
        self.init_config = init_config or {}
        self.agentConfig = agentConfig
        self.in_developer_mode = agentConfig.get('developer_mode') and psutil
        self._internal_profiling_stats = None

        self.hostname = agentConfig.get('checksd_hostname') or get_hostname(
            agentConfig)
        self.log = logging.getLogger('%s.%s' % (__name__, name))
        self.aggregator = MetricsAggregator(
            self.hostname,
            formatter=agent_formatter,
            recent_point_threshold=agentConfig.get('recent_point_threshold',
                                                   None),
            histogram_aggregates=agentConfig.get('histogram_aggregates'),
            histogram_percentiles=agentConfig.get('histogram_percentiles'))

        if Platform.is_linux() and psutil is not None:
            procfs_path = self.agentConfig.get('procfs_path',
                                               '/proc').rstrip('/')
            psutil.PROCFS_PATH = procfs_path

        self.events = []
        self.service_checks = []
        self.instances = instances or []
        self.warnings = []
        self.library_versions = None
        self.last_collection_time = defaultdict(int)
        self._instance_metadata = []
        self.svc_metadata = []
        self.historate_dict = {}

        # Set proxy settings
        self.proxy_settings = get_proxy(self.agentConfig)
        self._use_proxy = False if init_config is None else init_config.get(
            "use_agent_proxy", True)
        self.proxies = {
            "http": None,
            "https": None,
        }
        if self.proxy_settings and self._use_proxy:
            uri = "{host}:{port}".format(host=self.proxy_settings['host'],
                                         port=self.proxy_settings['port'])
            if self.proxy_settings['user'] and self.proxy_settings['password']:
                uri = "{user}:{password}@{uri}".format(
                    user=self.proxy_settings['user'],
                    password=self.proxy_settings['password'],
                    uri=uri)
            self.proxies['http'] = "http://{uri}".format(uri=uri)
            self.proxies['https'] = "https://{uri}".format(uri=uri)
Example #42
0
    def run(self, checksd=None, start_event=True, configs_reloaded=False):
        """
        Collect data from each check and submit their data.
        """
        log.debug("Found {num_checks} checks".format(
            num_checks=len(checksd['initialized_checks'])))
        timer = Timer()
        if not Platform.is_windows():
            cpu_clock = time.clock()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        if checksd:
            self.initialized_checks_d = checksd[
                'initialized_checks']  # is a list of AgentCheck instances
            self.init_failed_checks_d = checksd[
                'init_failed_checks']  # is of type {check_name: {error, traceback}}

        payload = AgentPayload()

        # Find the AgentMetrics check and pop it out
        # This check must run at the end of the loop to collect info on agent performance
        if not self._agent_metrics or configs_reloaded:
            for check in self.initialized_checks_d:
                if check.name == AGENT_METRICS_CHECK_NAME:
                    self._agent_metrics = check
                    self.initialized_checks_d.remove(check)
                    break

        # Initialize payload
        self._build_payload(payload)

        metrics = payload['metrics']
        events = payload['events']
        service_checks = payload['service_checks']

        # Run the system checks. Checks will depend on the OS
        if Platform.is_windows():
            # Win32 system checks
            for check_name in ['memory', 'cpu', 'io', 'proc', 'system']:
                try:
                    metrics.extend(self._win32_system_checks[check_name].check(
                        self.agentConfig))
                except Exception:
                    log.exception('Unable to get %s metrics', check_name)
        else:
            # Unix system checks
            sys_checks = self._unix_system_checks

            for check_name in ['load', 'system', 'cpu', 'file_handles']:
                try:
                    result_check = sys_checks[check_name].check(
                        self.agentConfig)
                    if result_check:
                        payload.update(result_check)
                except Exception:
                    log.exception('Unable to get %s metrics', check_name)

            try:
                memory = sys_checks['memory'].check(self.agentConfig)
            except Exception:
                log.exception('Unable to get memory metrics')
            else:
                if memory:
                    memstats = {
                        'memPhysUsed': memory.get('physUsed'),
                        'memPhysPctUsable': memory.get('physPctUsable'),
                        'memPhysFree': memory.get('physFree'),
                        'memPhysTotal': memory.get('physTotal'),
                        'memPhysUsable': memory.get('physUsable'),
                        'memSwapUsed': memory.get('swapUsed'),
                        'memSwapFree': memory.get('swapFree'),
                        'memSwapPctFree': memory.get('swapPctFree'),
                        'memSwapTotal': memory.get('swapTotal'),
                        'memCached': memory.get('physCached'),
                        'memBuffers': memory.get('physBuffers'),
                        'memShared': memory.get('physShared'),
                        'memSlab': memory.get('physSlab'),
                        'memPageTables': memory.get('physPageTables'),
                        'memSwapCached': memory.get('swapCached')
                    }
                    payload.update(memstats)

            try:
                ioStats = sys_checks['io'].check(self.agentConfig)
            except Exception:
                log.exception('Unable to get io metrics')
            else:
                if ioStats:
                    payload['ioStats'] = ioStats

            try:
                processes = sys_checks['processes'].check(self.agentConfig)
            except Exception:
                log.exception('Unable to get processes metrics')
            else:
                payload.update({'processes': processes})

        # Run old-style checks
        if self._ganglia is not None:
            payload['ganglia'] = self._ganglia.check(self.agentConfig)
        if self._dogstream is not None:
            dogstreamData = self._dogstream.check(self.agentConfig)
            dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
            if dogstreamEvents:
                if 'dogstream' in payload['events']:
                    events['dogstream'].extend(dogstreamEvents)
                else:
                    events['dogstream'] = dogstreamEvents
                del dogstreamData['dogstreamEvents']

            payload.update(dogstreamData)

        # process collector of gohai (compliant with payload of legacy "resources checks")
        if not Platform.is_windows() and self._should_send_additional_data(
                'processes'):
            gohai_processes = self._run_gohai_processes()
            if gohai_processes:
                try:
                    gohai_processes_json = json.loads(gohai_processes)
                    processes_snaps = gohai_processes_json.get('processes')
                    if processes_snaps:
                        processes_payload = {'snaps': [processes_snaps]}

                        payload['resources'] = {
                            'processes': processes_payload,
                            'meta': {
                                'host': self.hostname,
                            }
                        }
                except Exception:
                    log.exception("Error running gohai processes collection")

        # newer-style checks (not checks.d style)
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

        # Use `info` log level for some messages on the first run only, then `debug`
        log_at_first_run = log.info if self._is_first_run() else log.debug

        # checks.d checks
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                return
            log_at_first_run("Running check %s", check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            service_check_count = 0
            check_start_time = time.time()
            check_stats = None

            try:
                # Run the check.
                instance_statuses = check.run()

                # Collect the metrics and events.
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()
                check_stats = check._get_internal_profiling_stats()

                # Collect metadata
                current_check_metadata = check.get_service_metadata()

                # Save metrics & events for the payload.
                metrics.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                # Save the status of the check.
                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)

            except Exception:
                log.exception("Error running check %s" % check.name)

            check_status = CheckStatus(
                check.name,
                instance_statuses,
                metric_count,
                event_count,
                service_check_count,
                service_metadata=current_check_metadata,
                library_versions=check.get_library_info(),
                source_type_name=check.SOURCE_TYPE_NAME or check.name,
                check_stats=check_stats,
                check_version=check.check_version)

            # Service check for Agent checks failures
            service_check_tags = ["check:%s" % check.name]
            if check_status.status == STATUS_OK:
                status = AgentCheck.OK
            elif check_status.status == STATUS_ERROR:
                status = AgentCheck.CRITICAL
            check.service_check('datadog.agent.check_status',
                                status,
                                tags=service_check_tags)

            # Collect the service checks and save them in the payload
            current_check_service_checks = check.get_service_checks()
            if current_check_service_checks:
                service_checks.extend(current_check_service_checks)
            # -1 because the user doesn't care about the service check for check failure
            service_check_count = len(current_check_service_checks) - 1

            # Update the check status with the correct service_check_count
            check_status.service_check_count = service_check_count
            check_statuses.append(check_status)

            check_run_time = time.time() - check_start_time
            log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

            # Instrument check run timings if enabled.
            if self.check_timings:
                metric = 'datadog.agent.check_run_time'
                meta = {'tags': ["check:%s" % check.name]}
                metrics.append((metric, time.time(), check_run_time, meta))

            if hasattr(check, A7_COMPATIBILITY_ATTR) and isinstance(
                    getattr(check, A7_COMPATIBILITY_ATTR), str):
                metric = 'datadog.agent.check_ready'
                status = getattr(check, A7_COMPATIBILITY_ATTR)
                meta = {
                    'tags': [
                        "check_name:%s" % check.name,
                        "agent_version_major:%s" % AGENT_VERSION.split(".")[0],
                        "agent_version_minor:%s" % AGENT_VERSION.split(".")[1],
                        "agent_version_patch:%s" % AGENT_VERSION.split(".")[2],
                        "status:%s" % status
                    ]
                }

                # datadog.agent.check_ready:
                # 0: is not compatible with A7 (or unknown)
                # 1: is compatible with A7
                metrics.append(
                    (metric, time.time(), a7_compatible_to_int(status), meta))
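                # `a7_compatible_to_int` is defined elsewhere in the agent; a
                # minimal sketch of what it might look like (an assumption, not
                # taken from this snippet) would simply map the textual status
                # onto the 0/1 convention documented above, e.g.:
                #
                #     def a7_compatible_to_int(status):
                #         return 1 if status == 'yes' else 0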

        for check_name, info in self.init_failed_checks_d.iteritems():
            if not self.continue_running:
                return
            check_status = CheckStatus(check_name,
                                       None,
                                       None,
                                       None,
                                       None,
                                       check_version=info.get(
                                           'version', 'unknown'),
                                       init_failed_error=info['error'],
                                       init_failed_traceback=info['traceback'])
            check_statuses.append(check_status)

        # Add a service check for the agent
        service_checks.append(
            create_service_check('datadog.agent.up',
                                 AgentCheck.OK,
                                 hostname=self.hostname))

        # Store the metrics and events in the payload.
        payload['metrics'] = metrics
        payload['events'] = events
        payload['service_checks'] = service_checks

        # Populate metadata
        self._populate_payload_metadata(payload, check_statuses, start_event)

        collect_duration = timer.step()

        if self._agent_metrics:
            metric_context = {
                'collection_time': collect_duration,
                'emit_time': self.emit_duration,
            }
            if not Platform.is_windows():
                metric_context['cpu_time'] = time.clock() - cpu_clock

            self._agent_metrics.set_metric_context(payload, metric_context)
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload['metrics'].extend(agent_stats)
            if self.agentConfig.get('developer_mode'):
                log.debug("\n Agent developer mode stats: \n {0}".format(
                    Collector._stats_for_display(agent_stats)))
            # Flush metadata for the Agent Metrics check. Otherwise they'll just accumulate and leak.
            self._agent_metrics.get_service_metadata()

        # Let's send our payload
        emitter_statuses = payload.emit(log, self.agentConfig, self.emitters,
                                        self.continue_running)
        self.emit_duration = timer.step()

        if self._is_first_run():
            # This is not the exact payload sent to the backend as minor post
            # processing is done, but this will give us a good idea of what is sent
            # to the backend.
            data = payload.payload  # deep copy and merge of meta and metric data
            data['apiKey'] = '*************************' + data.get(
                'apiKey', '')[-5:]
            # removing unused keys for the metadata payload
            del data['metrics']
            del data['events']
            del data['service_checks']
            if data.get('processes'):
                data['processes'][
                    'apiKey'] = '*************************' + data[
                        'processes'].get('apiKey', '')[-5:]
            log.debug("Metadata payload: %s", json.dumps(data))

        # Persist the status of the collection run.
        try:
            CollectorStatus(check_statuses, emitter_statuses,
                            self.hostname_metadata_cache).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                     (self.run_count, round(collect_duration,
                                            2), round(self.emit_duration, 2)))
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info(
                    "First flushes done, next flushes will be logged every %s flushes."
                    % FLUSH_LOGGING_PERIOD)
        else:
            log.debug(
                "Finished run #%s. Collection time: %ss. Emit time: %ss" %
                (self.run_count, round(collect_duration,
                                       2), round(self.emit_duration, 2)))

        return payload
Example #43
0
    def test_complex_config(self):
        config = {'instances': self.MYSQL_COMPLEX_CONFIG}
        self.run_check_twice(config)

        # Test service check
        self.assertServiceCheck('mysql.can_connect',
                                status=AgentCheck.OK,
                                tags=self.SC_TAGS,
                                count=1)

        # Travis MySQL not running replication - FIX in flavored test.
        self.assertServiceCheck('mysql.replication.slave_running',
                                status=AgentCheck.CRITICAL,
                                tags=self.SC_TAGS,
                                count=1)

        ver = map(lambda x: int(x),
                  self.service_metadata[0]['version'].split("."))
        ver = tuple(ver)

        testable_metrics = (self.STATUS_VARS + self.VARIABLES_VARS +
                            self.INNODB_VARS + self.BINLOG_VARS +
                            self.SYSTEM_METRICS + self.SCHEMA_VARS +
                            self.SYNTHETIC_VARS)

        if ver >= (5, 6, 0):
            testable_metrics.extend(self.PERFORMANCE_VARS)

        # Test metrics
        for mname in testable_metrics:
            # These two are currently not guaranteed outside of a Linux
            # environment.
            if mname == 'mysql.performance.user_time' and not Platform.is_linux():
                continue
            if mname == 'mysql.performance.kernel_time' and not Platform.is_linux():
                continue
            if mname == 'mysql.performance.cpu_time' and Platform.is_windows():
                continue

            if mname == 'mysql.performance.query_run_time.avg':
                self.assertMetric(mname,
                                  tags=self.METRIC_TAGS + ['schema:testdb'],
                                  count=1)
            elif mname == 'mysql.info.schema.size':
                self.assertMetric(mname,
                                  tags=self.METRIC_TAGS + ['schema:testdb'],
                                  count=1)
                self.assertMetric(mname,
                                  tags=self.METRIC_TAGS +
                                  ['schema:information_schema'],
                                  count=1)
                self.assertMetric(mname,
                                  tags=self.METRIC_TAGS +
                                  ['schema:performance_schema'],
                                  count=1)
            else:
                self.assertMetric(mname, tags=self.METRIC_TAGS, count=1)

        # Assert service metadata
        self.assertServiceMetadata(['version'], count=1)

        # test custom query metrics
        self.assertMetric('alice.age', value=25)
        self.assertMetric('bob.age', value=20)

        # test optional metrics
        self._test_optional_metrics(
            (self.OPTIONAL_REPLICATION_METRICS + self.OPTIONAL_INNODB_VARS +
             self.OPTIONAL_STATUS_VARS + self.OPTIONAL_STATUS_VARS_5_6_6), 1)

        # Raises when COVERAGE=true and coverage < 100%
        self.coverage_report()
Example #44
0
    def check(self, agentConfig):
        if Platform.is_linux():
            proc_location = agentConfig.get('procfs_path', '/proc').rstrip('/')
            try:
                proc_meminfo = "{}/meminfo".format(proc_location)
                with open(proc_meminfo, 'r') as mem_info:
                    lines = mem_info.readlines()
            except Exception:
                self.logger.exception('Cannot get memory metrics from %s',
                                      proc_meminfo)
                return False

            # NOTE: not all of the stats below are present on all systems as
            # not all kernel versions report all of them.
            #
            # $ cat /proc/meminfo
            # MemTotal:        7995360 kB
            # MemFree:         1045120 kB
            # MemAvailable:    1253920 kB
            # Buffers:          226284 kB
            # Cached:           775516 kB
            # SwapCached:       248868 kB
            # Active:          1004816 kB
            # Inactive:        1011948 kB
            # Active(anon):     455152 kB
            # Inactive(anon):   584664 kB
            # Active(file):     549664 kB
            # Inactive(file):   427284 kB
            # Unevictable:     4392476 kB
            # Mlocked:         4392476 kB
            # SwapTotal:      11120632 kB
            # SwapFree:       10555044 kB
            # Dirty:              2948 kB
            # Writeback:             0 kB
            # AnonPages:       5203560 kB
            # Mapped:            50520 kB
            # Shmem:             10108 kB
            # Slab:             161300 kB
            # SReclaimable:     136108 kB
            # SUnreclaim:        25192 kB
            # KernelStack:        3160 kB
            # PageTables:        26776 kB
            # NFS_Unstable:          0 kB
            # Bounce:                0 kB
            # WritebackTmp:          0 kB
            # CommitLimit:    15118312 kB
            # Committed_AS:    6703508 kB
            # VmallocTotal:   34359738367 kB
            # VmallocUsed:      400668 kB
            # VmallocChunk:   34359329524 kB
            # HardwareCorrupted:     0 kB
            # HugePages_Total:       0
            # HugePages_Free:        0
            # HugePages_Rsvd:        0
            # HugePages_Surp:        0
            # Hugepagesize:       2048 kB
            # DirectMap4k:       10112 kB
            # DirectMap2M:     8243200 kB

            regexp = re.compile(
                r'^(\w+):\s+([0-9]+)'
            )  # We run this several times so one-time compile now
            meminfo = {}

            parse_error = False
            for line in lines:
                try:
                    match = re.search(regexp, line)
                    if match is not None:
                        meminfo[match.group(1)] = match.group(2)
                except Exception:
                    parse_error = True
            if parse_error:
                self.logger.error("Error parsing %s", proc_meminfo)

            memData = {}

            # Physical memory
            # FIXME units are in MB, we should use bytes instead
            try:
                memData['physTotal'] = int(meminfo.get('MemTotal', 0)) / 1024
                memData['physFree'] = int(meminfo.get('MemFree', 0)) / 1024
                memData['physBuffers'] = int(meminfo.get('Buffers', 0)) / 1024
                memData['physCached'] = int(meminfo.get('Cached', 0)) / 1024
                memData['physShared'] = int(meminfo.get('Shmem', 0)) / 1024
                memData['physSlab'] = int(meminfo.get('Slab', 0)) / 1024
                memData['physPageTables'] = int(meminfo.get('PageTables',
                                                            0)) / 1024
                memData[
                    'physUsed'] = memData['physTotal'] - memData['physFree']

                if 'MemAvailable' in meminfo:
                    memData['physUsable'] = int(meminfo.get('MemAvailable',
                                                            0)) / 1024
                else:
                    # Usable is relative since cached and buffers are actually used to speed things up.
                    memData['physUsable'] = memData['physFree'] + memData[
                        'physBuffers'] + memData['physCached']

                if memData['physTotal'] > 0:
                    memData['physPctUsable'] = float(
                        memData['physUsable']) / float(memData['physTotal'])
            except Exception:
                self.logger.exception('Cannot compute stats from %s',
                                      proc_meminfo)

            # Swap
            # FIXME units are in MB, we should use bytes instead
            try:
                memData['swapTotal'] = int(meminfo.get('SwapTotal', 0)) / 1024
                memData['swapFree'] = int(meminfo.get('SwapFree', 0)) / 1024
                memData['swapCached'] = int(meminfo.get('SwapCached',
                                                        0)) / 1024

                memData[
                    'swapUsed'] = memData['swapTotal'] - memData['swapFree']

                if memData['swapTotal'] > 0:
                    memData['swapPctFree'] = float(
                        memData['swapFree']) / float(memData['swapTotal'])
            except Exception:
                self.logger.exception('Cannot compute swap stats')

            return memData

        elif sys.platform == 'darwin':
            if psutil is None:
                self.logger.error(
                    "psutil must be installed on MacOS to collect memory metrics"
                )
                return False

            phys_memory = psutil.virtual_memory()
            swap = psutil.swap_memory()
            return {
                'physUsed': phys_memory.used / float(1024**2),
                'physFree': phys_memory.free / float(1024**2),
                'physUsable': phys_memory.available / float(1024**2),
                'physPctUsable': (100 - phys_memory.percent) / 100.0,
                'swapUsed': swap.used / float(1024**2),
                'swapFree': swap.free / float(1024**2)
            }

        elif sys.platform.startswith("freebsd"):
            try:
                output, _, _ = get_subprocess_output(['sysctl', 'vm.stats.vm'],
                                                     self.logger)
                sysctl = output.splitlines()
            except Exception:
                self.logger.exception('getMemoryUsage')
                return False

            # ...
            # vm.stats.vm.v_page_size: 4096
            # vm.stats.vm.v_page_count: 759884
            # vm.stats.vm.v_wire_count: 122726
            # vm.stats.vm.v_active_count: 109350
            # vm.stats.vm.v_cache_count: 17437
            # vm.stats.vm.v_inactive_count: 479673
            # vm.stats.vm.v_free_count: 30542
            # ...

            # We run this several times so one-time compile now
            regexp = re.compile(r'^vm\.stats\.vm\.(\w+):\s+([0-9]+)')
            meminfo = {}

            parse_error = False
            for line in sysctl:
                try:
                    match = re.search(regexp, line)
                    if match is not None:
                        meminfo[match.group(1)] = match.group(2)
                except Exception:
                    parse_error = True
            if parse_error:
                self.logger.error("Error parsing vm.stats.vm output: %s",
                                  sysctl)

            memData = {}

            # Physical memory
            try:
                pageSize = int(meminfo.get('v_page_size'))

                memData['physTotal'] = (int(meminfo.get('v_page_count', 0)) *
                                        pageSize) / 1048576
                memData['physFree'] = (int(meminfo.get('v_free_count', 0)) *
                                       pageSize) / 1048576
                memData['physCached'] = (int(meminfo.get('v_cache_count', 0)) *
                                         pageSize) / 1048576
                memData['physUsed'] = (
                    (int(meminfo.get('v_active_count', 0)) +
                     int(meminfo.get('v_wire_count', 0))) * pageSize) / 1048576
                memData['physUsable'] = (
                    (int(meminfo.get('v_free_count', 0)) +
                     int(meminfo.get('v_cache_count', 0)) +
                     int(meminfo.get('v_inactive_count', 0))) *
                    pageSize) / 1048576

                if memData['physTotal'] > 0:
                    memData['physPctUsable'] = float(
                        memData['physUsable']) / float(memData['physTotal'])
            except Exception:
                self.logger.exception(
                    'Cannot compute stats from sysctl vm.stats.vm output')

            # Swap
            try:
                output, _, _ = get_subprocess_output(['swapinfo', '-m'],
                                                     self.logger)
                sysctl = output.splitlines()
            except Exception:
                self.logger.exception('getMemoryUsage')
                return False

            # ...
            # Device          1M-blocks     Used    Avail Capacity
            # /dev/ad0s1b           570        0      570     0%
            # ...

            assert "Device" in sysctl[0]

            try:
                memData['swapTotal'] = 0
                memData['swapFree'] = 0
                memData['swapUsed'] = 0
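                # swapinfo -m columns: Device, 1M-blocks (total), Used, Avail, Capacity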
                for line in sysctl[1:]:
                    if len(line) > 0:
                        line = line.split()
                        memData['swapTotal'] += int(line[1])
                        memData['swapFree'] += int(line[3])
                        memData['swapUsed'] += int(line[2])
            except Exception:
                self.logger.exception('Cannot compute stats from swapinfo')

            return memData
        elif sys.platform == 'sunos5':
            try:
                memData = {}
                cmd = [
                    "kstat", "-m", "memory_cap", "-c", "zone_memory_cap", "-p"
                ]
                output, _, _ = get_subprocess_output(cmd, self.logger)
                kmem = output.splitlines()

                # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:anon_alloc_fail   0
                # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:anonpgin  0
                # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:class     zone_memory_cap
                # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:crtime    16359935.0680834
                # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:execpgin  185
                # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:fspgin    2556
                # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:n_pf_throttle     0
                # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:n_pf_throttle_usec        0
                # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:nover     0
                # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:pagedout  0
                # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:pgpgin    2741
                # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:physcap   536870912  <--
                # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:rss       115544064  <--
                # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:snaptime  16787393.9439095
                # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:swap      91828224   <--
                # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:swapcap   1073741824 <--
                # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:zonename  53aa9b7e-48ba-4152-a52b-a6368c3d9e7c

                # turn memory_cap:360:zone_name:key value
                # into { "key": value, ...}
                kv = [l.strip().split() for l in kmem if len(l) > 0]
                entries = dict([(k.split(":")[-1], v) for (k, v) in kv])
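                # e.g. "memory_cap:360:<zone>:rss   115544064" -> entries["rss"] == "115544064"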
                # extract rss, physcap, swap, swapcap, turn into MB
                convert = lambda v: int(long(v)) / 2**20
                memData["physTotal"] = convert(entries["physcap"])
                memData["physUsed"] = convert(entries["rss"])
                memData[
                    "physFree"] = memData["physTotal"] - memData["physUsed"]
                memData["swapTotal"] = convert(entries["swapcap"])
                memData["swapUsed"] = convert(entries["swap"])
                memData[
                    "swapFree"] = memData["swapTotal"] - memData["swapUsed"]

                if memData['swapTotal'] > 0:
                    memData['swapPctFree'] = float(
                        memData['swapFree']) / float(memData['swapTotal'])
                return memData
            except Exception:
                self.logger.exception(
                    "Cannot compute mem stats from kstat -c zone_memory_cap")
                return False
        else:
            return False
Example #45
0
# Licensed under Simplified BSD License (see LICENSE)

# stdlib
import time

# 3p
import dns.resolver

# project
from utils.platform import Platform
from checks.network_checks import NetworkCheck, Status

# These imports are necessary because otherwise dynamic type
# resolution will fail on windows without it.
# See more here: https://github.com/rthalley/dnspython/issues/39.
if Platform.is_win32():
    from dns.rdtypes.ANY import *  # noqa
    from dns.rdtypes.IN import *  # noqa
    # for tiny time deltas, time.time on Windows reports the same value
    # of the clock more than once, causing the computed response_time to
    # often be 0; use time.clock, which is more precise, instead.
    time_func = time.clock
else:
    time_func = time.time
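
# A minimal sketch (an assumption, not part of this snippet) of how `time_func`
# would be used to measure a query's response time; the resolver calls are
# standard dnspython, the rest is illustrative:
#
#     resolver = dns.resolver.Resolver()
#     start = time_func()
#     resolver.query('example.com', 'A')
#     response_time = time_func() - start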


class BadConfException(Exception):
    pass


class DNSCheck(NetworkCheck):
Example #46
0
 def set_ssl_validation(self, options):
     if self._config.get('skip_ssl_validation', False):
         options['verify'] = False
     elif Platform.is_windows():
         options['verify'] = get_ssl_certificate('windows',
                                                 'stackstate-cert.pem')
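
     # A minimal sketch (an assumption, not taken from the original source) of
     # how the options dict filled above is typically used: it is handed to an
     # HTTP client such as `requests`, where verify=False disables certificate
     # validation and a file path points at a CA bundle, e.g.:
     #
     #     options = {}
     #     self.set_ssl_validation(options)
     #     requests.get(url, verify=options.get('verify', True))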
Example #47
0
def initialize_logging(logger_name):
    try:
        logging_config = get_logging_config()

        logging.basicConfig(
            format=get_log_format(logger_name),
            level=logging_config['log_level'] or logging.INFO,
        )

        log_file = logging_config.get('%s_log_file' % logger_name)
        if log_file is not None and not logging_config['disable_file_logging']:
            # make sure the log directory is writeable
            # NOTE: the entire directory needs to be writable so that rotation works
            if os.access(os.path.dirname(log_file), os.R_OK | os.W_OK):
                file_handler = logging.handlers.RotatingFileHandler(
                    log_file, maxBytes=LOGGING_MAX_BYTES, backupCount=1)
                formatter = logging.Formatter(get_log_format(logger_name),
                                              get_log_date_format())
                file_handler.setFormatter(formatter)

                root_log = logging.getLogger()
                root_log.addHandler(file_handler)
            else:
                sys.stderr.write("Log file is unwritable: '%s'\n" % log_file)

        # set up syslog
        if logging_config['log_to_syslog']:
            try:
                from logging.handlers import SysLogHandler

                if logging_config['syslog_host'] is not None and logging_config[
                        'syslog_port'] is not None:
                    sys_log_addr = (logging_config['syslog_host'],
                                    logging_config['syslog_port'])
                else:
                    sys_log_addr = "/dev/log"
                    # Special-case BSDs
                    if Platform.is_darwin():
                        sys_log_addr = "/var/run/syslog"
                    elif Platform.is_freebsd():
                        sys_log_addr = "/var/run/log"

                handler = SysLogHandler(address=sys_log_addr,
                                        facility=SysLogHandler.LOG_DAEMON)
                handler.setFormatter(
                    logging.Formatter(get_syslog_format(logger_name),
                                      get_log_date_format()))
                root_log = logging.getLogger()
                root_log.addHandler(handler)
            except Exception as e:
                sys.stderr.write("Error setting up syslog: '%s'\n" % str(e))
                traceback.print_exc()

        # Setting up logging in the event viewer for windows
        if get_os() == 'windows' and logging_config['log_to_event_viewer']:
            try:
                from logging.handlers import NTEventLogHandler
                nt_event_handler = NTEventLogHandler(
                    logger_name,
                    get_win32service_file('windows', 'win32service.pyd'),
                    'Application')
                nt_event_handler.setFormatter(
                    logging.Formatter(get_syslog_format(logger_name),
                                      get_log_date_format()))
                nt_event_handler.setLevel(logging.ERROR)
                app_log = logging.getLogger(logger_name)
                app_log.addHandler(nt_event_handler)
            except Exception as e:
                sys.stderr.write(
                    "Error setting up Event viewer logging: '%s'\n" % str(e))
                traceback.print_exc()

    except Exception as e:
        sys.stderr.write("Couldn't initialize logging: %s\n" % str(e))
        traceback.print_exc()

        # if config fails entirely, enable basic stdout logging as a fallback
        logging.basicConfig(
            format=get_log_format(logger_name),
            level=logging.INFO,
        )

    # re-get the log after logging is initialized
    global log
    log = logging.getLogger(__name__)
Example #48
0
def get_hostname(config=None):
    """
    Get the canonical host name this agent should identify as. This is
    the authoritative source of the host name for the agent.

    Tries, in order:

      * agent config (datadog.conf, "hostname:")
      * 'hostname -f' (on unix)
      * socket.gethostname()
    """
    hostname = None

    # first, try the config
    if config is None:
        from config import get_config
        config = get_config(parse_args=True)
    config_hostname = config.get('hostname')
    if config_hostname and is_valid_hostname(config_hostname):
        return config_hostname

    # Try to get GCE instance name
    if hostname is None:
        gce_hostname = GCE.get_hostname(config)
        if gce_hostname is not None:
            if is_valid_hostname(gce_hostname):
                return gce_hostname

    # Try to get the docker hostname
    if hostname is None and is_dockerized():
        docker_hostname = get_docker_hostname()
        if docker_hostname is not None and is_valid_hostname(docker_hostname):
            return docker_hostname

    # then move on to os-specific detection
    if hostname is None:

        def _get_hostname_unix():
            try:
                # try fqdn
                out, _, rtcode = get_subprocess_output(['/bin/hostname', '-f'],
                                                       log)
                if rtcode == 0:
                    return out.strip()
            except Exception:
                return None

        os_name = get_os()
        if os_name in ['mac', 'freebsd', 'linux', 'solaris']:
            unix_hostname = _get_hostname_unix()
            if unix_hostname and is_valid_hostname(unix_hostname):
                hostname = unix_hostname

    # if we have an ec2 default hostname, see if there's an instance-id available
    if (Platform.is_ecs_instance()) or (hostname is not None and True in [
            hostname.lower().startswith(p) for p in [u'ip-', u'domu']
    ]):
        instanceid = EC2.get_instance_id(config)
        if instanceid:
            hostname = instanceid

    # fall back on socket.gethostname(), socket.getfqdn() is too unreliable
    if hostname is None:
        try:
            socket_hostname = socket.gethostname()
        except socket.error:
            socket_hostname = None
        if socket_hostname and is_valid_hostname(socket_hostname):
            hostname = socket_hostname

    if hostname is None:
        log.critical(
            'Unable to reliably determine host name. You can define one in datadog.conf or in your hosts file'
        )
        raise Exception(
            'Unable to reliably determine host name. You can define one in datadog.conf or in your hosts file'
        )
    else:
        return hostname
Example #49
0
    def _populate_payload_metadata(self,
                                   payload,
                                   check_statuses,
                                   start_event=True):
        """
        Periodically populate the payload with metadata related to the system, host, and/or checks.
        """
        now = time.time()

        # Include system stats on first postback
        if start_event and self._is_first_run():
            payload['systemStats'] = self.agentConfig.get('system_stats', {})
            # Also post an event in the newsfeed
            payload['events']['System'] = [{
                'api_key':
                self.agentConfig['api_key'],
                'host':
                payload['internalHostname'],
                'timestamp':
                now,
                'event_type':
                'Agent Startup',
                'msg_text':
                'Version %s' % get_version()
            }]

        # Periodically send the host metadata.
        if self._should_send_additional_data('host_metadata'):
            # gather metadata with gohai
            try:
                if not Platform.is_windows():
                    command = "gohai"
                else:
                    command = "gohai\gohai.exe"
                gohai_metadata, gohai_err, _ = get_subprocess_output([command],
                                                                     log)
                payload['gohai'] = gohai_metadata
                if gohai_err:
                    log.warning("GOHAI LOG | {0}".format(gohai_err))
            except OSError as e:
                if e.errno == 2:  # file not found, expected when installed from source
                    log.info("gohai file not found")
                else:
                    raise e
            except Exception as e:
                log.warning("gohai command failed with error %s" % str(e))

            payload['systemStats'] = get_system_stats()
            payload['meta'] = self._get_hostname_metadata()

            self.hostname_metadata_cache = payload['meta']
            # Add static tags from the configuration file
            host_tags = []
            if self.agentConfig['tags'] is not None:
                host_tags.extend([
                    unicode(tag.strip())
                    for tag in self.agentConfig['tags'].split(",")
                ])

            if self.agentConfig['collect_ec2_tags']:
                host_tags.extend(EC2.get_tags(self.agentConfig))

            if host_tags:
                payload['host-tags']['system'] = host_tags

            # If required by the user, let's create the dd_check:xxx host tags
            if self.agentConfig['create_dd_check_tags']:
                app_tags_list = [
                    DD_CHECK_TAG.format(c.name)
                    for c in self.initialized_checks_d
                ]
                app_tags_list.extend([
                    DD_CHECK_TAG.format(cname)
                    for cname in JMXFiles.get_jmx_appnames()
                ])

                if 'system' not in payload['host-tags']:
                    payload['host-tags']['system'] = []

                payload['host-tags']['system'].extend(app_tags_list)

            GCE_tags = GCE.get_tags(self.agentConfig)
            if GCE_tags is not None:
                payload['host-tags'][GCE.SOURCE_TYPE_NAME] = GCE_tags

            # Log the metadata on the first run
            if self._is_first_run():
                log.info(
                    "Hostnames: %s, tags: %s" %
                    (repr(self.hostname_metadata_cache), payload['host-tags']))

        # Periodically send extra hosts metadata (e.g. vsphere)
        # This is metadata for hosts other than the one the agent runs on;
        # only some checks make use of it.
        external_host_tags = []
        if self._should_send_additional_data('external_host_tags'):
            for check in self.initialized_checks_d:
                try:
                    getter = getattr(check, 'get_external_host_tags')
                    check_tags = getter()
                    external_host_tags.extend(check_tags)
                except AttributeError:
                    pass

        if external_host_tags:
            payload['external_host_tags'] = external_host_tags

        # Periodically send agent_checks metadata
        if self._should_send_additional_data('agent_checks'):
            # Add agent checks statuses and error/warning messages
            agent_checks = []
            for check in check_statuses:
                if check.instance_statuses is not None:
                    for i, instance_status in enumerate(
                            check.instance_statuses):
                        agent_checks.append((
                            check.name,
                            check.source_type_name,
                            instance_status.instance_id,
                            instance_status.status,
                            # put error message or list of warning messages in the same field
                            # it will be handled by the UI
                            instance_status.error or instance_status.warnings
                            or "",
                            check.service_metadata[i]))
                else:
                    agent_checks.append(
                        (check.name, check.source_type_name, "initialization",
                         check.status, repr(check.init_failed_error)))
            payload['agent_checks'] = agent_checks
            payload[
                'meta'] = self.hostname_metadata_cache  # add hostname metadata
Example #50
0
    def __init__(self, agentConfig, emitters, systemStats, hostname):
        self.emit_duration = None
        self.agentConfig = agentConfig
        self.hostname = hostname
        # system stats is generated by config.get_system_stats
        self.agentConfig['system_stats'] = systemStats
        # agent config is used during checks, system_stats can be accessed through the config
        self.os = get_os()
        self.plugins = None
        self.emitters = emitters
        self.check_timings = agentConfig.get('check_timings')
        self.push_times = {
            'host_metadata': {
                'start': time.time(),
                'interval':
                int(agentConfig.get('metadata_interval', 4 * 60 * 60))
            },
            'external_host_tags': {
                'start': time.time() - 3 * 60,  # Wait for the checks to init
                'interval': int(agentConfig.get('external_host_tags', 5 * 60))
            },
            'agent_checks': {
                'start': time.time(),
                'interval':
                int(agentConfig.get('agent_checks_interval', 10 * 60))
            },
            'processes': {
                'start': time.time(),
                'interval': int(agentConfig.get('processes_interval', 60))
            }
        }
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        self.hostname_metadata_cache = None
        self.initialized_checks_d = []
        self.init_failed_checks_d = {}

        if Platform.is_linux() and psutil is not None:
            procfs_path = agentConfig.get('procfs_path', '/proc').rstrip('/')
            psutil.PROCFS_PATH = procfs_path

        # Unix System Checks
        self._unix_system_checks = {
            'io': u.IO(log),
            'load': u.Load(log),
            'memory': u.Memory(log),
            'processes': u.Processes(log),
            'cpu': u.Cpu(log),
            'system': u.System(log)
        }

        # Win32 System Checks
        self._win32_system_checks = {
            'io': w32.IO(log),
            'proc': w32.Processes(log),
            'memory': w32.Memory(log),
            'network': w32.Network(log),
            'cpu': w32.Cpu(log),
            'system': w32.System(log)
        }

        # Old-style metric checks
        self._ganglia = Ganglia(log) if self.agentConfig.get(
            'ganglia_host', '') != '' else None
        self._dogstream = None if self.agentConfig.get(
            'dogstreams') is None else Dogstreams.init(log, self.agentConfig)

        # Agent performance metrics check
        self._agent_metrics = None

        self._metrics_checks = []

        # Custom metric checks
        for module_spec in [
                s.strip()
                for s in self.agentConfig.get('custom_checks', '').split(',')
        ]:
            if len(module_spec) == 0:
                continue
            try:
                self._metrics_checks.append(
                    modules.load(module_spec, 'Check')(log))
                log.info("Registered custom check %s" % module_spec)
                log.warning(
                    "Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version"
                )
            except Exception:
                log.exception('Unable to load custom check module %s' %
                              module_spec)
Example #51
0
    def run(self, checksd=None, start_event=True, configs_reloaded=False):
        """
        Collect data from each check and submit their data.
        """
        log.debug("Found {num_checks} checks".format(
            num_checks=len(checksd['initialized_checks'])))
        timer = Timer()
        if not Platform.is_windows():
            cpu_clock = time.clock()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        if checksd:
            self.initialized_checks_d = checksd[
                'initialized_checks']  # is a list of AgentCheck instances
            self.init_failed_checks_d = checksd[
                'init_failed_checks']  # is of type {check_name: {error, traceback}}

        payload = AgentPayload()

        # Find the AgentMetrics check and pop it out
        # This check must run at the end of the loop to collect info on agent performance
        if not self._agent_metrics or configs_reloaded:
            for check in self.initialized_checks_d:
                if check.name == AGENT_METRICS_CHECK_NAME:
                    self._agent_metrics = check
                    self.initialized_checks_d.remove(check)
                    break

        # Initialize payload
        self._build_payload(payload)

        metrics = payload['metrics']
        events = payload['events']
        service_checks = payload['service_checks']

        # Run the system checks. Checks will depend on the OS
        if Platform.is_windows():
            # Win32 system checks
            try:
                metrics.extend(self._win32_system_checks['memory'].check(
                    self.agentConfig))
                metrics.extend(self._win32_system_checks['cpu'].check(
                    self.agentConfig))
                metrics.extend(self._win32_system_checks['network'].check(
                    self.agentConfig))
                metrics.extend(self._win32_system_checks['io'].check(
                    self.agentConfig))
                metrics.extend(self._win32_system_checks['proc'].check(
                    self.agentConfig))
            except Exception:
                log.exception('Unable to fetch Windows system metrics.')
        else:
            # Unix system checks
            sys_checks = self._unix_system_checks

            load = sys_checks['load'].check(self.agentConfig)
            payload.update(load)

            system = sys_checks['system'].check(self.agentConfig)
            payload.update(system)

            memory = sys_checks['memory'].check(self.agentConfig)

            if memory:
                memstats = {
                    'memPhysUsed': memory.get('physUsed'),
                    'memPhysPctUsable': memory.get('physPctUsable'),
                    'memPhysFree': memory.get('physFree'),
                    'memPhysTotal': memory.get('physTotal'),
                    'memPhysUsable': memory.get('physUsable'),
                    'memSwapUsed': memory.get('swapUsed'),
                    'memSwapFree': memory.get('swapFree'),
                    'memSwapPctFree': memory.get('swapPctFree'),
                    'memSwapTotal': memory.get('swapTotal'),
                    'memCached': memory.get('physCached'),
                    'memBuffers': memory.get('physBuffers'),
                    'memShared': memory.get('physShared'),
                    'memSlab': memory.get('physSlab'),
                    'memPageTables': memory.get('physPageTables'),
                    'memSwapCached': memory.get('swapCached')
                }
                payload.update(memstats)

            ioStats = sys_checks['io'].check(self.agentConfig)
            if ioStats:
                payload['ioStats'] = ioStats

            processes = sys_checks['processes'].check(self.agentConfig)
            payload.update({'processes': processes})

            cpuStats = sys_checks['cpu'].check(self.agentConfig)
            if cpuStats:
                payload.update(cpuStats)

        # Run old-style checks
        gangliaData = self._ganglia.check(self.agentConfig)
        dogstreamData = self._dogstream.check(self.agentConfig)
        ddforwarderData = self._ddforwarder.check(self.agentConfig)

        if gangliaData is not False and gangliaData is not None:
            payload['ganglia'] = gangliaData

        # dogstream
        if dogstreamData:
            dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
            if dogstreamEvents:
                if 'dogstream' in payload['events']:
                    events['dogstream'].extend(dogstreamEvents)
                else:
                    events['dogstream'] = dogstreamEvents
                del dogstreamData['dogstreamEvents']

            payload.update(dogstreamData)

        # metrics about the forwarder
        if ddforwarderData:
            payload['datadog'] = ddforwarderData

        # Resources checks
        if not Platform.is_windows():
            has_resource = False
            for resources_check in self._resources_checks:
                try:
                    resources_check.check()
                    snaps = resources_check.pop_snapshots()
                    if snaps:
                        has_resource = True
                        res_value = {
                            'snaps': snaps,
                            'format_version':
                            resources_check.get_format_version()
                        }
                        res_format = resources_check.describe_format_if_needed(
                        )
                        if res_format is not None:
                            res_value['format_description'] = res_format
                        payload['resources'][
                            resources_check.RESOURCE_KEY] = res_value
                except Exception:
                    log.exception("Error running resource check %s" %
                                  resources_check.RESOURCE_KEY)

            if has_resource:
                payload['resources']['meta'] = {
                    'api_key': self.agentConfig['api_key'],
                    'host': payload['internalHostname'],
                }

        # newer-style checks (not checks.d style)
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

        # checks.d checks
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                return
            log.info("Running check %s" % check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            service_check_count = 0
            check_start_time = time.time()
            check_stats = None

            try:
                # Run the check.
                instance_statuses = check.run()

                # Collect the metrics and events.
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()
                check_stats = check._get_internal_profiling_stats()

                # Collect metadata
                current_check_metadata = check.get_service_metadata()

                # Save metrics & events for the payload.
                metrics.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                # Save the status of the check.
                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)

            except Exception:
                log.exception("Error running check %s" % check.name)

            check_status = CheckStatus(
                check.name,
                instance_statuses,
                metric_count,
                event_count,
                service_check_count,
                service_metadata=current_check_metadata,
                library_versions=check.get_library_info(),
                source_type_name=check.SOURCE_TYPE_NAME or check.name,
                check_stats=check_stats)

            # Service check for Agent checks failures
            service_check_tags = ["check:%s" % check.name]
            if check_status.status == STATUS_OK:
                status = AgentCheck.OK
            elif check_status.status == STATUS_ERROR:
                status = AgentCheck.CRITICAL
            check.service_check('datadog.agent.check_status',
                                status,
                                tags=service_check_tags)

            # Collect the service checks and save them in the payload
            current_check_service_checks = check.get_service_checks()
            if current_check_service_checks:
                service_checks.extend(current_check_service_checks)
            service_check_count = len(current_check_service_checks)

            # Update the check status with the correct service_check_count
            check_status.service_check_count = service_check_count
            check_statuses.append(check_status)

            check_run_time = time.time() - check_start_time
            log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

            # Instrument check run timings if enabled.
            if self.check_timings:
                metric = 'datadog.agent.check_run_time'
                meta = {'tags': ["check:%s" % check.name]}
                metrics.append((metric, time.time(), check_run_time, meta))

        for check_name, info in self.init_failed_checks_d.iteritems():
            if not self.continue_running:
                return
            check_status = CheckStatus(check_name,
                                       None,
                                       None,
                                       None,
                                       None,
                                       init_failed_error=info['error'],
                                       init_failed_traceback=info['traceback'])
            check_statuses.append(check_status)

        # Add a service check for the agent
        service_checks.append(
            create_service_check('datadog.agent.up',
                                 AgentCheck.OK,
                                 hostname=self.hostname))

        # Store the metrics and events in the payload.
        payload['metrics'] = metrics
        payload['events'] = events
        payload['service_checks'] = service_checks

        # Populate metadata
        self._populate_payload_metadata(payload, check_statuses, start_event)

        collect_duration = timer.step()

        if self._agent_metrics:
            metric_context = {
                'collection_time': collect_duration,
                'emit_time': self.emit_duration,
            }
            if not Platform.is_windows():
                metric_context['cpu_time'] = time.clock() - cpu_clock

            self._agent_metrics.set_metric_context(payload, metric_context)
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload['metrics'].extend(agent_stats)
            if self.agentConfig.get('developer_mode'):
                log.debug("\n Agent developer mode stats: \n {0}".format(
                    Collector._stats_for_display(agent_stats)))

        # Let's send our payload
        emitter_statuses = payload.emit(log, self.agentConfig, self.emitters,
                                        self.continue_running)
        self.emit_duration = timer.step()

        # Persist the status of the collection run.
        try:
            CollectorStatus(check_statuses, emitter_statuses,
                            self.hostname_metadata_cache).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                     (self.run_count, round(collect_duration,
                                            2), round(self.emit_duration, 2)))
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info(
                    "First flushes done, next flushes will be logged every %s flushes."
                    % FLUSH_LOGGING_PERIOD)
        else:
            log.debug(
                "Finished run #%s. Collection time: %ss. Emit time: %ss" %
                (self.run_count, round(collect_duration,
                                       2), round(self.emit_duration, 2)))

        return payload
Example #52
0
    def init(self):
        try:
            # We configure the check with the right cgroup settings for this host
            # Just needs to be done once
            instance = self.instances[0]
            set_docker_settings(self.init_config, instance)

            self.client = get_client()
            self._docker_root = self.init_config.get('docker_root', '/')
            self._mountpoints = get_mountpoints(self._docker_root)
            self.cgroup_listing_retries = 0
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # At first run we'll just collect the events from the latest 60 secs
            self._last_event_collection_ts = int(time.time()) - 60

            # Set tagging options
            self.custom_tags = instance.get("tags", [])
            self.collect_labels_as_tags = instance.get("collect_labels_as_tags", [])
            if self.is_k8s():
                self.collect_labels_as_tags.append("io.kubernetes.pod.name")

            self.use_histogram = _is_affirmative(instance.get('use_histogram', False))
            performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

            self.tag_names = {
                CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: performance_tags,
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)

            }

            # Set filtering settings
            if not instance.get("exclude"):
                self._filtering_enabled = False
                if instance.get("include"):
                    self.log.warning("You must specify an exclude section to enable filtering")
            else:
                self._filtering_enabled = True
                include = instance.get("include", [])
                exclude = instance.get("exclude", [])
                self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(include, exclude)
                self.tag_names[FILTERED] = _filtered_tag_names


            # Other options
            self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
            self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
            self.collect_events = _is_affirmative(instance.get('collect_events', True))
            self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
            self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()

            self.ecs_tags = {}

        except Exception, e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
Example #53
0
 def _make_fetch_state(self):
     return _SDDockerBackendConfigFetchState(
         self.docker_client.inspect_container,
         self.kubeutil.retrieve_pods_list().get('items', [])
         if Platform.is_k8s() else None)
Example #54
0
def agent_status():
    if Platform.is_windows():
        return service_manager_status()
    else:
        return osx_manager_status()
Example #55
0
from config import (get_confd_path, get_config, get_config_path,
                    get_logging_config, get_version)
from util import yLoader
from utils.flare import Flare
from utils.platform import Platform

# Constants describing the agent state
AGENT_RUNNING = 0
AGENT_START_PENDING = 1
AGENT_STOP_PENDING = 2
AGENT_STOPPED = 3
AGENT_UNKNOWN = 4

# Windows management
# Import Windows stuff only on Windows
if Platform.is_windows():
    import pywintypes
    import winerror
    import win32serviceutil
    import win32service

    # project
    from utils.pidfile import PidFile

    WIN_STATUS_TO_AGENT = {
        win32service.SERVICE_RUNNING: AGENT_RUNNING,
        win32service.SERVICE_START_PENDING: AGENT_START_PENDING,
        win32service.SERVICE_STOP_PENDING: AGENT_STOP_PENDING,
        win32service.SERVICE_STOPPED: AGENT_STOPPED,
    }
Example #56
0
    def check(self, agentConfig):
        """Capture io stats.

        @rtype dict
        @return {"device": {"metric": value, "metric": value}, ...}
        """
        io = {}
        try:
            if Platform.is_linux():
                stdout, _, _ = get_subprocess_output(
                    ['iostat', '-d', '1', '2', '-x', '-k'], self.logger)

                #                 Linux 2.6.32-343-ec2 (ip-10-35-95-10)   12/11/2012      _x86_64_        (2 CPU)
                #
                # Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s avgrq-sz avgqu-sz   await  svctm  %util
                # sda1              0.00    17.61    0.26   32.63     4.23   201.04    12.48     0.16    4.81   0.53   1.73
                # sdb               0.00     2.68    0.19    3.84     5.79    26.07    15.82     0.02    4.93   0.22   0.09
                # sdg               0.00     0.13    2.29    3.84   100.53    30.61    42.78     0.05    8.41   0.88   0.54
                # sdf               0.00     0.13    2.30    3.84   100.54    30.61    42.78     0.06    9.12   0.90   0.55
                # md0               0.00     0.00    0.05    3.37     1.41    30.01    18.35     0.00    0.00   0.00   0.00
                #
                # Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s avgrq-sz avgqu-sz   await  svctm  %util
                # sda1              0.00     0.00    0.00   10.89     0.00    43.56     8.00     0.03    2.73   2.73   2.97
                # sdb               0.00     0.00    0.00    2.97     0.00    11.88     8.00     0.00    0.00   0.00   0.00
                # sdg               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00   0.00   0.00
                # sdf               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00   0.00   0.00
                # md0               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00   0.00   0.00
                io.update(self._parse_linux2(stdout))

            elif sys.platform == "sunos5":
                output, _, _ = get_subprocess_output(
                    ["iostat", "-x", "-d", "1", "2"], self.logger)
                iostat = output.splitlines()

                #                   extended device statistics <-- since boot
                # device      r/s    w/s   kr/s   kw/s wait actv  svc_t  %w  %b
                # ramdisk1    0.0    0.0    0.1    0.1  0.0  0.0    0.0   0   0
                # sd0         0.0    0.0    0.0    0.0  0.0  0.0    0.0   0   0
                # sd1        79.9  149.9 1237.6 6737.9  0.0  0.5    2.3   0  11
                #                   extended device statistics <-- past second
                # device      r/s    w/s   kr/s   kw/s wait actv  svc_t  %w  %b
                # ramdisk1    0.0    0.0    0.0    0.0  0.0  0.0    0.0   0   0
                # sd0         0.0    0.0    0.0    0.0  0.0  0.0    0.0   0   0
                # sd1         0.0  139.0    0.0 1850.6  0.0  0.0    0.1   0   1

                # discard the first half of the display (stats since boot)
                lines = [l for l in iostat if len(l) > 0]
                lines = lines[len(lines) / 2:]

                assert "extended device statistics" in lines[0]
                headers = lines[1].split()
                assert "device" in headers
                for l in lines[2:]:
                    cols = l.split()
                    # cols[0] is the device
                    # cols[1:] are the values
                    io[cols[0]] = {}
                    for i in range(1, len(cols)):
                        io[cols[0]][self.xlate(headers[i], "sunos")] = cols[i]

            elif sys.platform.startswith("freebsd"):
                output, _, _ = get_subprocess_output(
                    ["iostat", "-x", "-d", "1", "2"], self.logger)
                iostat = output.splitlines()

                # Be careful!
                # It looks like SunOS, but some columns (wait, svc_t) have a different meaning
                #                        extended device statistics
                # device     r/s   w/s    kr/s    kw/s wait svc_t  %b
                # ad0        3.1   1.3    49.9    18.8    0   0.7   0
                #                         extended device statistics
                # device     r/s   w/s    kr/s    kw/s wait svc_t  %b
                # ad0        0.0   2.0     0.0    31.8    0   0.2   0

                # discard the first half of the display (stats since boot)
                lines = [l for l in iostat if len(l) > 0]
                lines = lines[len(lines) / 2:]

                assert "extended device statistics" in lines[0]
                headers = lines[1].split()
                assert "device" in headers
                for l in lines[2:]:
                    cols = l.split()
                    # cols[0] is the device
                    # cols[1:] are the values
                    io[cols[0]] = {}
                    for i in range(1, len(cols)):
                        io[cols[0]][self.xlate(headers[i],
                                               "freebsd")] = cols[i]
            elif sys.platform == 'darwin':
                iostat, _, _ = get_subprocess_output(
                    ['iostat', '-d', '-c', '2', '-w', '1'], self.logger)
                #          disk0           disk1          <-- number of disks
                #    KB/t tps  MB/s     KB/t tps  MB/s
                #   21.11  23  0.47    20.01   0  0.00
                #    6.67   3  0.02     0.00   0  0.00    <-- line of interest
                io = self._parse_darwin(iostat)
            else:
                return False

            # If we filter devices, do it now.
            device_blacklist_re = agentConfig.get('device_blacklist_re', None)
            if device_blacklist_re:
                filtered_io = {}
                for device, stats in io.iteritems():
                    if not device_blacklist_re.match(device):
                        filtered_io[device] = stats
            else:
                filtered_io = io
            return filtered_io

        except Exception:
            self.logger.exception("Cannot extract IO statistics")
            return False
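The filtering step near the end expects agentConfig to carry a pre-compiled regular expression under 'device_blacklist_re'. A small illustrative configuration; the key name comes from the snippet, the pattern is made up:

import re

# Illustrative only: drop ramdisks and loop devices from the reported IO stats.
agentConfig = {
    'device_blacklist_re': re.compile(r'^(ram|loop)\d*'),
}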
Example #57
0
    def init(self):
        try:
            instance = self.instances[0]

            # We configure the check with the right cgroup settings for this host
            # Just needs to be done once
            self.docker_util = DockerUtil()
            self.docker_client = self.docker_util.client
            if Platform.is_k8s():
                self.kubeutil = KubeUtil()
            self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
            self.cgroup_listing_retries = 0
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # Set tagging options
            self.custom_tags = instance.get("tags", [])
            self.collect_labels_as_tags = instance.get("collect_labels_as_tags", [])
            self.kube_labels = {}

            self.use_histogram = _is_affirmative(instance.get('use_histogram', False))
            performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

            self.tag_names = {
                CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: performance_tags,
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
            }

            # Set filtering settings
            if not instance.get("exclude"):
                self._filtering_enabled = False
                if instance.get("include"):
                    self.log.warning("You must specify an exclude section to enable filtering")
            else:
                self._filtering_enabled = True
                include = instance.get("include", [])
                exclude = instance.get("exclude", [])
                self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(include, exclude)
                self.tag_names[FILTERED] = _filtered_tag_names

            # Other options
            self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
            self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
            self.collect_events = _is_affirmative(instance.get('collect_events', True))
            self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
            self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False))
            self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()

            self.ecs_tags = {}

        except Exception as e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
        else:
            self.init_success = True
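Every option read in init() comes straight from the first configured instance. A hypothetical instance dict covering the keys the snippet parses; the values are illustrative, not the check's defaults:

instance = {
    'tags': ['env:example'],                         # becomes self.custom_tags
    'collect_labels_as_tags': ['com.example.team'],
    'use_histogram': False,
    'performance_tags': ['container_name', 'image_name'],
    'container_tags': ['image_name'],
    'image_tags': ['image_name'],
    'include': ['image:myapp.*'],                    # ignored unless 'exclude' is set
    'exclude': ['image:.*'],
    'collect_images_stats': False,
    'collect_container_size': False,
    'collect_events': True,
    'collect_image_size': False,
    'collect_disk_stats': False,
    'ecs_tags': True,                                # only effective on ECS instances
}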
Example #58
0
    def __init__(self, parent=None):
        log_conf = get_logging_config()

        QSplitter.__init__(self, parent)
        self.setWindowTitle(MAIN_WINDOW_TITLE)
        self.setWindowIcon(get_icon("agent.svg"))

        self.sysTray = SystemTray(self)

        self.connect(self.sysTray,
                     SIGNAL("activated(QSystemTrayIcon::ActivationReason)"),
                     self.__icon_activated)

        checks = get_checks()
        datadog_conf = DatadogConf(get_config_path())
        self.create_logs_files_windows(log_conf)

        listwidget = QListWidget(self)
        listwidget.addItems([
            osp.basename(check.module_name).replace("_", " ").title()
            for check in checks
        ])

        self.properties = PropertiesWidget(self)

        self.setting_button = QPushButton(get_icon("info.png"),
                                          "Logs and Status", self)
        self.menu_button = QPushButton(get_icon("settings.png"), "Actions",
                                       self)
        self.settings = [
            ("Forwarder Logs", lambda: [
                self.properties.set_log_file(self.forwarder_log_file),
                self.show_html(self.properties.group_code,
                               self.properties.html_window, False)
            ]),
            ("Collector Logs", lambda: [
                self.properties.set_log_file(self.collector_log_file),
                self.show_html(self.properties.group_code,
                               self.properties.html_window, False)
            ]),
            ("Dogstatsd Logs", lambda: [
                self.properties.set_log_file(self.dogstatsd_log_file),
                self.show_html(self.properties.group_code,
                               self.properties.html_window, False)
            ]),
            ("JMX Fetch Logs", lambda: [
                self.properties.set_log_file(self.jmxfetch_log_file),
                self.show_html(self.properties.group_code,
                               self.properties.html_window, False)
            ]),
        ]

        if Platform.is_windows():
            self.settings.extend([
                ("Service Logs", lambda: [
                    self.properties.set_log_file(self.service_log_file),
                    self.show_html(self.properties.group_code,
                                   self.properties.html_window, False)
                ]),
            ])

        self.settings.extend([
            ("Agent Status", lambda: [
                self.properties.html_window.setHtml(
                    self.properties.html_window.latest_status()),
                self.show_html(self.properties.group_code,
                               self.properties.html_window, True),
                self.properties.set_status()
            ]),
        ])

        self.agent_settings = QPushButton(get_icon("edit.png"), "Settings",
                                          self)
        self.connect(
            self.agent_settings, SIGNAL("clicked()"), lambda: [
                self.properties.set_datadog_conf(datadog_conf),
                self.show_html(self.properties.group_code,
                               self.properties.html_window, False)
            ])

        self.setting_menu = SettingMenu(self.settings)
        self.connect(
            self.setting_button, SIGNAL("clicked()"),
            lambda: self.setting_menu.popup(
                self.setting_button.mapToGlobal(QPoint(0, 0))))

        self.manager_menu = Menu(self)
        self.connect(
            self.menu_button, SIGNAL("clicked()"),
            lambda: self.manager_menu.popup(
                self.menu_button.mapToGlobal(QPoint(0, 0))))

        holdingBox = QGroupBox("", self)
        Box = QVBoxLayout(self)
        Box.addWidget(self.agent_settings)
        Box.addWidget(self.setting_button)
        Box.addWidget(self.menu_button)
        Box.addWidget(listwidget)
        holdingBox.setLayout(Box)

        self.addWidget(holdingBox)
        self.addWidget(self.properties)

        self.connect(self.properties.enable_button, SIGNAL("clicked()"),
                     lambda: enable_check(self.properties))

        self.connect(self.properties.disable_button, SIGNAL("clicked()"),
                     lambda: disable_check(self.properties))

        self.connect(self.properties.save_button, SIGNAL("clicked()"),
                     lambda: save_file(self.properties))

        self.connect(
            self.properties.refresh_button, SIGNAL("clicked()"), lambda: [
                self.properties.set_log_file(self.properties.current_file),
                self.properties.html_window.setHtml(
                    self.properties.html_window.latest_status())
            ])

        self.connect(
            listwidget, SIGNAL('currentRowChanged(int)'), lambda row: [
                self.properties.set_item(checks[row]),
                self.show_html(self.properties.group_code,
                               self.properties.html_window, False)
            ])

        listwidget.setCurrentRow(0)

        self.setSizes([150, 1])
        self.setStretchFactor(1, 1)
        self.resize(QSize(950, 600))
        self.properties.set_datadog_conf(datadog_conf)

        self.do_refresh()
Example #59
0
    def _get_tags(self, entity=None, tag_type=None):
        """Generate the tags for a given entity (container or image) according to a list of tag names."""
        # Start with custom tags
        tags = list(self.custom_tags)

        # Collect pod names as tags on kubernetes
        if Platform.is_k8s() and KubeUtil.POD_NAME_LABEL not in self.collect_labels_as_tags:
            self.collect_labels_as_tags.append(KubeUtil.POD_NAME_LABEL)

        if entity is not None:
            pod_name = None

            # Get labels as tags
            labels = entity.get("Labels")
            if labels is not None:
                for k in self.collect_labels_as_tags:
                    if k in labels:
                        v = labels[k]
                        if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s():
                            pod_name = v
                            k = "pod_name"
                            if "-" in pod_name:
                                replication_controller = "-".join(
                                    pod_name.split("-")[:-1])
                                if "/" in replication_controller:  # k8s <= 1.1
                                    namespace, replication_controller = replication_controller.split(
                                        "/", 1)

                                elif KubeUtil.NAMESPACE_LABEL in labels:  # k8s >= 1.2
                                    namespace = labels[
                                        KubeUtil.NAMESPACE_LABEL]
                                    pod_name = "{0}/{1}".format(
                                        namespace, pod_name)

                                tags.append("kube_namespace:%s" % namespace)
                                tags.append("kube_replication_controller:%s" %
                                            replication_controller)
                                tags.append("pod_name:%s" % pod_name)

                        elif not v:
                            tags.append(k)

                        else:
                            tags.append("%s:%s" % (k, v))

                    if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s() and k not in labels:
                        tags.append("pod_name:no_pod")

            # Get entity specific tags
            if tag_type is not None:
                tag_names = self.tag_names[tag_type]
                for tag_name in tag_names:
                    tag_value = self._extract_tag_value(entity, tag_name)
                    if tag_value is not None:
                        for t in tag_value:
                            tags.append('%s:%s' % (tag_name, str(t).strip()))

            # Add ECS tags
            if self.collect_ecs_tags:
                entity_id = entity.get("Id")
                if entity_id in self.ecs_tags:
                    ecs_tags = self.ecs_tags[entity_id]
                    tags.extend(ecs_tags)

            # Add kube labels
            if Platform.is_k8s():
                kube_tags = self.kube_labels.get(pod_name)
                if kube_tags:
                    tags.extend(list(kube_tags))

        return tags
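The pod-name branch above relies on string surgery: for k8s <= 1.1 the namespace is embedded in the pod name, and the replication controller is recovered by dropping the trailing suffix. A worked illustration with made-up names:

# Illustrative only (k8s <= 1.1 style label value).
pod_name = "default/web-3412xyz"
replication_controller = "-".join(pod_name.split("-")[:-1])   # "default/web"
namespace, replication_controller = replication_controller.split("/", 1)
# namespace == "default", replication_controller == "web", and the resulting
# tags are kube_namespace:default, kube_replication_controller:web,
# pod_name:default/web-3412xyz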
Example #60
0
    def get_tags(self, state, c_id):
        """Extract useful tags from docker or platform APIs. These are collected by default."""
        c_inspect = state.inspect_container(c_id)
        tags = self.dockerutil.extract_container_tags(
            c_inspect, self.docker_labels_as_tags)

        if Platform.is_k8s():
            if not self.kubeutil.init_success:
                log.warning(
                    "kubelet client not initialized, kubernetes tags will be missing."
                )
                return tags

            pod_metadata = state.get_kube_config(c_id, 'metadata')

            if pod_metadata is None:
                log.warning("Failed to fetch pod metadata for container %s."
                            " Kubernetes tags will be missing." % c_id[:12])
                return tags

            # get pod labels
            kube_labels = pod_metadata.get('labels', {})
            for label, value in kube_labels.iteritems():
                tags.append('%s:%s' % (label, value))

            # get kubernetes namespace
            namespace = pod_metadata.get('namespace')
            tags.append('kube_namespace:%s' % namespace)

            if not self.kubeutil:
                log.warning("The agent can't connect to kubelet, creator and "
                            "service tags will be missing for container %s." %
                            c_id[:12])
            else:
                # add creator tags
                creator_tags = self.kubeutil.get_pod_creator_tags(pod_metadata)
                tags.extend(creator_tags)

                # add services tags
                if self.kubeutil.collect_service_tag:
                    services = self.kubeutil.match_services_for_pod(
                        pod_metadata)
                    for s in services:
                        if s is not None:
                            tags.append('kube_service:%s' % s)

        elif Platform.is_swarm():
            c_labels = c_inspect.get('Config', {}).get('Labels', {})
            swarm_svc = c_labels.get(SWARM_SVC_LABEL)
            if swarm_svc:
                tags.append('swarm_service:%s' % swarm_svc)

        elif Platform.is_rancher():
            c_labels = c_inspect.get('Config', {}).get('Labels', {})
            service_name = c_labels.get(RANCHER_SVC_NAME)
            stack_name = c_labels.get(RANCHER_STACK_NAME)
            container_name = c_labels.get(RANCHER_CONTAINER_NAME)
            if service_name:
                tags.append('rancher_service:%s' % service_name)
            if stack_name:
                tags.append('rancher_stack:%s' % stack_name)
            if container_name:
                tags.append('rancher_container:%s' % container_name)

        if self.metadata_collector.has_detected():
            orch_tags = self.metadata_collector.get_container_tags(
                co=c_inspect)
            tags.extend(orch_tags)

        return tags
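As a closing illustration of the Swarm branch, a tiny hypothetical inspect payload; the exact label key behind SWARM_SVC_LABEL is an assumption here, based on the service-name label Docker Swarm mode typically applies to containers:

# Hypothetical inspect result: the branch above would emit "swarm_service:web".
c_inspect = {
    'Config': {
        'Labels': {
            'com.docker.swarm.service.name': 'web',   # assumed value of SWARM_SVC_LABEL
        }
    }
}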