Example #1
    def check(self, instance):
        if not self.kubeutil.host:
            raise Exception('Unable to get default router and host parameter is not set')

        self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
        enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
        self.enabled_gauges = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges]
        enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
        self.enabled_rates = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_rates]

        self.publish_aliases = _is_affirmative(instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
        self.use_histogram = _is_affirmative(instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
        self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
        self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]

        # master health checks
        if instance.get('enable_master_checks', False):
            self._perform_master_checks(self.kubeutil.master_url_nodes)

        # kubelet health checks
        if instance.get('enable_kubelet_checks', True):
            self._perform_kubelet_checks(self.kubeutil.kube_health_url)

        # kubelet metrics
        self._update_metrics(instance)
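Almost every option in these examples is normalized through `_is_affirmative`, which ships with the agent's config utilities. The snippet below is only a hedged sketch of the semantics the examples rely on (values such as True, 1, "yes", "true" or "1" count as affirmative), not the agent's actual implementation.

# Illustrative sketch only -- the real helper lives in the agent's config module.
def _is_affirmative(value):
    """Return True when a YAML config value looks affirmative."""
    if value is None:
        return False
    if isinstance(value, (bool, int)):
        # booleans and plain numbers keep their own truthiness
        return bool(value)
    # strings coming from YAML are compared case-insensitively
    return str(value).strip().lower() in ('yes', 'true', '1')

assert _is_affirmative('Yes') and _is_affirmative(1)
assert not _is_affirmative('false') and not _is_affirmative(None)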
Example #2
    def check(self, instance):
        host = instance.get('host', '')
        port = instance.get('port', '')
        user = instance.get('username', '')
        password = instance.get('password', '')
        tags = instance.get('tags', [])
        dbname = instance.get('dbname', None)
        relations = instance.get('relations', [])
        ssl = _is_affirmative(instance.get('ssl', False))
        function_metrics = _is_affirmative(instance.get('collect_function_metrics', False))
        # Default value for `count_metrics` is True for backward compatibility
        count_metrics = _is_affirmative(instance.get('collect_count_metrics', True))

        if relations and not dbname:
            self.warning('"dbname" parameter must be set when using the "relations" parameter.')

        if dbname is None:
            dbname = 'postgres'

        key = (host, port, dbname)

        custom_metrics = self._get_custom_metrics(instance.get('custom_metrics', []), key)

        # Clean up tags in case there was a None entry in the instance
        # e.g. if the yaml contains tags: but no actual tags
        if tags is None:
            tags = []
        else:
            tags = list(set(tags))

        # preset tags to the database name
        tags.extend(["db:%s" % dbname])

        self.log.debug("Custom metrics: %s" % custom_metrics)

        # Connection handle; set inside the try block below so we can commit afterwards
        db = None

        # Collect metrics
        try:
            # Check version
            db = self.get_connection(key, host, port, user, password, dbname, ssl)
            version = self._get_version(key, db)
            self.log.debug("Running check against version %s" % version)
            self._collect_stats(key, db, tags, relations, custom_metrics, function_metrics, count_metrics)
        except ShouldRestartException:
            self.log.info("Resetting the connection")
            db = self.get_connection(key, host, port, user, password, dbname, ssl, use_cached=False)
            self._collect_stats(key, db, tags, relations, custom_metrics, function_metrics, count_metrics)

        if db is not None:
            service_check_tags = self._get_service_check_tags(host, port, dbname)
            message = u'Established connection to postgres://%s:%s/%s' % (host, port, dbname)
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK,
                tags=service_check_tags, message=message)
            try:
                # commit to close the current query transaction
                db.commit()
            except Exception as e:
                self.log.warning("Unable to commit: {0}".format(e))
Example #3
    def check(self, instance):
        kube_settings = get_kube_settings()
        if not kube_settings.get("host"):
            raise Exception("Unable to get default router and host parameter is not set")

        self.max_depth = instance.get("max_depth", DEFAULT_MAX_DEPTH)
        enabled_gauges = instance.get("enabled_gauges", DEFAULT_ENABLED_GAUGES)
        self.enabled_gauges = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges]
        enabled_rates = instance.get("enabled_rates", DEFAULT_ENABLED_RATES)
        self.enabled_rates = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_rates]

        self.publish_aliases = _is_affirmative(instance.get("publish_aliases", DEFAULT_PUBLISH_ALIASES))
        self.use_histogram = _is_affirmative(instance.get("use_histogram", DEFAULT_USE_HISTOGRAM))
        self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
        self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]

        # master health checks
        if instance.get("enable_master_checks", False):
            master_url = kube_settings["master_url_nodes"]
            self._perform_master_checks(master_url)

        # kubelet health checks
        if instance.get("enable_kubelet_checks", True):
            kube_health_url = kube_settings["kube_health_url"]
            self._perform_kubelet_checks(kube_health_url)

        # kubelet metrics
        self._update_metrics(instance, kube_settings)
Example #4
    def check(self, instance):
        host = instance.get('host', self.default_router)
        if not host:
            raise Exception('Unable to get default router and host parameter is not set')

        port = instance.get('port', DEFAULT_CADVISOR_PORT)
        method = instance.get('method', DEFAULT_METHOD)
        self.metrics_url = '%s://%s:%d' % (method, host, port)
        self.metrics_cmd = urljoin(self.metrics_url, DEFAULT_METRICS_CMD)
        self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
        enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
        self.enabled_gauges = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges]
        enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
        self.enabled_rates = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_rates]

        self.publish_aliases = _is_affirmative(instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
        self.use_histogram = _is_affirmative(instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
        self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
        self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]

        # master health checks
        if instance.get('enable_master_checks', False):
            master_port = instance.get('master_port', DEFAULT_MASTER_PORT)
            master_host = instance.get('master_host', 'localhost')
            master_url = '%s://%s:%d/nodes' % (method, master_host, master_port)
            self._perform_master_checks(master_url)

        # kubelet health checks
        if instance.get('enable_kubelet_checks', True):
            kubelet_port = instance.get('kubelet_port', DEFAULT_KUBELET_PORT)
            kubelet_url = '%s://%s:%d/healthz' % (method, host, kubelet_port)
            self._perform_kubelet_checks(kubelet_url)

        # kubelet metrics
        self._update_metrics(instance)
Example #5
    def _load_conf(self, instance):
        # Fetches the conf
        method = instance.get('method', 'get')
        data = instance.get('data', {})
        tags = instance.get('tags', [])
        username = instance.get('username')
        password = instance.get('password')
        http_response_status_code = str(instance.get('http_response_status_code', DEFAULT_EXPECTED_CODE))
        timeout = int(instance.get('timeout', 10))
        config_headers = instance.get('headers', {})
        default_headers = _is_affirmative(instance.get("include_default_headers", True))
        if default_headers:
            headers = agent_headers(self.agentConfig)
        else:
            headers = {}
        headers.update(config_headers)
        url = instance.get('url')
        content_match = instance.get('content_match')
        reverse_content_match = _is_affirmative(instance.get('reverse_content_match', False))
        response_time = _is_affirmative(instance.get('collect_response_time', True))
        if not url:
            raise Exception("Bad configuration. You must specify a url")
        include_content = _is_affirmative(instance.get('include_content', False))
        ssl = _is_affirmative(instance.get('disable_ssl_validation', True))
        ssl_expire = _is_affirmative(instance.get('check_certificate_expiration', True))
        instance_ca_certs = instance.get('ca_certs', self.ca_certs)
        weakcipher = _is_affirmative(instance.get('weakciphers', False))
        ignore_ssl_warning = _is_affirmative(instance.get('ignore_ssl_warning', False))
        skip_proxy = _is_affirmative(instance.get('no_proxy', False))
        allow_redirects = _is_affirmative(instance.get('allow_redirects', True))

        return url, username, password, method, data, http_response_status_code, timeout, include_content,\
            headers, response_time, content_match, reverse_content_match, tags, ssl, ssl_expire, instance_ca_certs,\
            weakcipher, ignore_ssl_warning, skip_proxy, allow_redirects
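For readability, here is a hedged sketch of how a calling check might unpack this 20-element tuple; the variable names simply mirror the return order above and are not taken from the actual check.

    # Hypothetical caller -- names mirror the order of the return statement above.
    def check(self, instance):
        (url, username, password, method, data, http_response_status_code, timeout,
         include_content, headers, response_time, content_match, reverse_content_match,
         tags, ssl, ssl_expire, instance_ca_certs, weakcipher, ignore_ssl_warning,
         skip_proxy, allow_redirects) = self._load_conf(instance)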
Example #6
    def check(self, instance):
        self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
        enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
        self.enabled_gauges = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges]
        enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
        self.enabled_rates = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_rates]

        self.publish_aliases = _is_affirmative(instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
        self.use_histogram = _is_affirmative(instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
        self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
        self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]
        # initialized by _filter_containers
        self._filtered_containers = set()

        pods_list = self.kubeutil.retrieve_pods_list()

        # kubelet health checks
        self._perform_kubelet_checks(self.kubeutil.kube_health_url)

        # kubelet metrics
        self._update_metrics(instance, pods_list)

        # kubelet events
        if _is_affirmative(instance.get('collect_events', DEFAULT_COLLECT_EVENTS)):
            try:
                self._process_events(instance, pods_list)
            except Exception as ex:
                self.log.error("Event collection failed: %s" % str(ex))
Example #7
    def check(self, instance):
        name = instance.get('name', None)
        tags = instance.get('tags', [])
        exact_match = _is_affirmative(instance.get('exact_match', True))
        search_string = instance.get('search_string', None)
        ignore_ad = _is_affirmative(instance.get('ignore_denied_access', True))
        pid = instance.get('pid')

        if not isinstance(search_string, list) and pid is None:
            raise KeyError('"search_string" parameter should be a list')

        # FIXME 6.x remove me
        if pid is None:
            if "All" in search_string:
                self.warning('Deprecated: Having "All" in your search_string will '
                             'greatly reduce the performance of the check and '
                             'will be removed in a future version of the agent.')

        if name is None:
            raise KeyError('The "name" of process groups is mandatory')

        if search_string is not None:
            pids = self.find_pids(
                name,
                search_string,
                exact_match,
                ignore_ad=ignore_ad
            )
        elif pid is not None:
            pids = [psutil.Process(pid)]
        else:
            raise ValueError('The "search_string" or "pid" options are required for process identification')

        proc_state = self.get_process_state(name, pids)

        # FIXME 6.x remove the `name` tag
        tags.extend(['process_name:%s' % name, name])

        self.log.debug('ProcessCheck: process %s analysed', name)
        self.gauge('system.processes.number', len(pids), tags=tags)

        for attr, mname in ATTR_TO_METRIC.iteritems():
            vals = [x for x in proc_state[attr] if x is not None]
            # skip []
            if vals:
                if attr == 'run_time':
                    self.gauge('system.processes.%s.avg' % mname, sum(vals)/len(vals), tags=tags)
                    self.gauge('system.processes.%s.max' % mname, max(vals), tags=tags)
                    self.gauge('system.processes.%s.min' % mname, min(vals), tags=tags)

                # FIXME 6.x: change this prefix?
                else:
                    self.gauge('system.processes.%s' % mname, sum(vals), tags=tags)

        for attr, mname in ATTR_TO_METRIC_RATE.iteritems():
            vals = [x for x in proc_state[attr] if x is not None]
            if vals:
                self.rate('system.processes.%s' % mname, sum(vals), tags=tags)

        self._process_service_check(name, len(pids), instance.get('thresholds', None))
Example #8
    def check(self, instance):
        url = instance.get('url')
        username = instance.get('username')
        password = instance.get('password')
        collect_aggregates_only = _is_affirmative(
            instance.get('collect_aggregates_only', True)
        )
        collect_status_metrics = _is_affirmative(
            instance.get('collect_status_metrics', False)
        )
        collect_status_metrics_by_host = _is_affirmative(
            instance.get('collect_status_metrics_by_host', False)
        )
        tag_service_check_by_host = _is_affirmative(
            instance.get('tag_service_check_by_host', False)
        )
        services_incl_filter = instance.get('services_include', [])
        services_excl_filter = instance.get('services_exclude', [])

        self.log.debug('Processing HAProxy data for %s' % url)

        data = self._fetch_data(url, username, password)

        process_events = instance.get('status_check', self.init_config.get('status_check', False))

        self._process_data(
            data, collect_aggregates_only, process_events,
            url=url, collect_status_metrics=collect_status_metrics,
            collect_status_metrics_by_host=collect_status_metrics_by_host,
            tag_service_check_by_host=tag_service_check_by_host,
            services_incl_filter=services_incl_filter,
            services_excl_filter=services_excl_filter
        )
Example #9
    def check(self, instance):
        url = instance.get("url")
        username = instance.get("username")
        password = instance.get("password")
        collect_aggregates_only = _is_affirmative(instance.get("collect_aggregates_only", True))
        collect_status_metrics = _is_affirmative(instance.get("collect_status_metrics", False))

        collect_status_metrics_by_host = _is_affirmative(instance.get("collect_status_metrics_by_host", False))

        count_status_by_service = _is_affirmative(instance.get("count_status_by_service", True))

        tag_service_check_by_host = _is_affirmative(instance.get("tag_service_check_by_host", False))

        services_incl_filter = instance.get("services_include", [])
        services_excl_filter = instance.get("services_exclude", [])

        self.log.debug("Processing HAProxy data for %s" % url)

        data = self._fetch_data(url, username, password)

        process_events = instance.get("status_check", self.init_config.get("status_check", False))

        self._process_data(
            data,
            collect_aggregates_only,
            process_events,
            url=url,
            collect_status_metrics=collect_status_metrics,
            collect_status_metrics_by_host=collect_status_metrics_by_host,
            tag_service_check_by_host=tag_service_check_by_host,
            services_incl_filter=services_incl_filter,
            services_excl_filter=services_excl_filter,
            count_status_by_service=count_status_by_service,
        )
Example #10
    def _load_conf(self, instance):
        # Fetches the conf
        tags = instance.get("tags", [])
        username = instance.get("username")
        password = instance.get("password")
        http_response_status_code = instance.get("http_response_status_code", r"(1|2|3)\d\d")
        timeout = int(instance.get("timeout", 10))
        config_headers = instance.get("headers", {})
        headers = agent_headers(self.agentConfig)
        headers.update(config_headers)
        url = instance.get("url")
        content_match = instance.get("content_match")
        response_time = _is_affirmative(instance.get("collect_response_time", True))
        if not url:
            raise Exception("Bad configuration. You must specify a url")
        include_content = _is_affirmative(instance.get("include_content", False))
        ssl = _is_affirmative(instance.get("disable_ssl_validation", True))
        ssl_expire = _is_affirmative(instance.get("check_certificate_expiration", True))

        return (
            url,
            username,
            password,
            http_response_status_code,
            timeout,
            include_content,
            headers,
            response_time,
            content_match,
            tags,
            ssl,
            ssl_expire,
        )
Example #11
    def check(self, instance):
        instance_name = instance.get('name')
        if instance_name is None:
            raise Exception("Each instance must have a unique name")

        ssl_validation = _is_affirmative(instance.get('ssl_validation', True))

        server = instance.get('server')
        if server is None:
            raise Exception("Each instance must have a server")

        build_conf = instance.get('build_configuration')
        if build_conf is None:
            raise Exception("Each instance must have a build configuration")

        host = instance.get('host_affected') or self.hostname
        tags = instance.get('tags')
        is_deployment = _is_affirmative(instance.get('is_deployment', False))
        basic_http_authentication = _is_affirmative(instance.get('basic_http_authentication', False))

        self._initialize_if_required(instance_name, server, build_conf, ssl_validation, basic_http_authentication)

        # Look for new successful builds
        if basic_http_authentication:
            new_build_url = self.NEW_BUILD_URL_AUTHENTICATED.format(
                server=server,
                build_conf=build_conf,
                since_build=self.last_build_ids[instance_name]
            )
        else:
            new_build_url = self.NEW_BUILD_URL.format(
                server=server,
                build_conf=build_conf,
                since_build=self.last_build_ids[instance_name]
            )

        try:
            resp = requests.get(new_build_url, timeout=self.DEFAULT_TIMEOUT, headers=self.HEADERS, verify=ssl_validation)
            resp.raise_for_status()

            new_builds = resp.json()

            if new_builds["count"] == 0:
                self.log.debug("No new builds found.")
            else:
                self._build_and_send_event(new_builds["build"][0], instance_name, is_deployment, host, tags)
        except requests.exceptions.HTTPError:
            self.log.exception(
                "Couldn't fetch last build, got code {0}"
                .format(resp.status_code)
            )
            raise
        except Exception:
            self.log.exception(
                "Couldn't fetch last build, unhandled exception"
            )
            raise
Example #12
    def _load_conf(self, instance):
        self._excluded_filesystems = instance.get("excluded_filesystems", [])
        self._excluded_disks = instance.get("excluded_disks", [])
        self._tag_by_filesystem = _is_affirmative(instance.get("tag_by_filesystem", False))
        self._all_partitions = _is_affirmative(instance.get("all_partitions", False))

        # FIXME: 6.x, drop use_mount option in datadog.conf
        self._load_legacy_option(instance, "use_mount", False, operation=_is_affirmative)
        # FIXME: 6.x, drop device_blacklist_re option in datadog.conf
        self._load_legacy_option(
            instance, "excluded_disk_re", "^$", legacy_name="device_blacklist_re", operation=re.compile
        )
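`_load_legacy_option` itself does not appear in these examples; the sketch below is only a guess at the shape of such a helper (prefer the instance value, fall back to the legacy datadog.conf key, apply `operation`, and store the result as an attribute). Treat every detail as an assumption.

    # Hypothetical helper -- a plausible shape, not the check's actual implementation.
    def _load_legacy_option(self, instance, option, default, legacy_name=None, operation=lambda v: v):
        value = instance.get(option)
        legacy_name = legacy_name or option
        if value is None and legacy_name in self.agentConfig:
            # fall back to the deprecated datadog.conf setting
            value = self.agentConfig.get(legacy_name)
        if value is None:
            value = default
        setattr(self, '_' + option, operation(value))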
Example #13
    def check(self, instance):
        name = instance.get('name', None)
        tags = instance.get('tags', [])
        exact_match = _is_affirmative(instance.get('exact_match', True))
        search_string = instance.get('search_string', None)
        ignore_ad = _is_affirmative(instance.get('ignore_denied_access', True))
        cpu_check_interval = instance.get('cpu_check_interval', 0.1)

        if not isinstance(search_string, list):
            raise KeyError('"search_string" parameter should be a list')

        # FIXME 6.x remove me
        if "All" in search_string:
            self.warning('Deprecated: Having "All" in your search_string will '
                         'greatly reduce the performance of the check and '
                         'will be removed in a future version of the agent.')

        if name is None:
            raise KeyError('The "name" of process groups is mandatory')

        if search_string is None:
            raise KeyError('The "search_string" is mandatory')

        if not isinstance(cpu_check_interval, (int, long, float)):
            self.warning("cpu_check_interval must be a number. Defaulting to 0.1")
            cpu_check_interval = 0.1

        pids = self.find_pids(
            name,
            search_string,
            exact_match,
            ignore_ad=ignore_ad
        )

        proc_state = self.get_process_state(name, pids, cpu_check_interval)

        # FIXME 6.x remove the `name` tag
        tags.extend(['process_name:%s' % name, name])

        self.log.debug('ProcessCheck: process %s analysed', name)
        self.gauge('system.processes.number', len(pids), tags=tags)

        for attr, mname in ATTR_TO_METRIC.iteritems():
            vals = [x for x in proc_state[attr] if x is not None]
            # skip []
            if vals:
                # FIXME 6.x: change this prefix?
                self.gauge('system.processes.%s' % mname, sum(vals), tags=tags)

        self._process_service_check(name, len(pids), instance.get('thresholds', None))
Example #14
    def check(self, instance):
        # Report image metrics
        if _is_affirmative(instance.get('collect_images_stats', True)):
            self._count_images(instance)

        # Get the list of containers and the index of their names
        containers, ids_to_names = self._get_and_count_containers(instance)

        # Report container metrics from cgroups
        skipped_container_ids = self._report_containers_metrics(containers, instance)

        # Send events from Docker API
        if _is_affirmative(instance.get('collect_events', True)):
            self._process_events(instance, ids_to_names, skipped_container_ids)
Example #15
    def get_instance_proxy(self, instance, uri, proxies=None):
        proxies = proxies if proxies is not None else self.proxies.copy()
        proxies['no'] = get_no_proxy_from_env()

        deprecated_skip = instance.get('no_proxy', None)
        skip = (
            _is_affirmative(instance.get('skip_proxy', not self._use_agent_proxy)) or
            _is_affirmative(deprecated_skip)
        )

        if deprecated_skip is not None:
            self._log_deprecation('no_proxy')

        return config_proxy_skip(proxies, uri, skip)
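A minimal usage sketch, assuming the resolved mapping is handed straight to `requests` (the URL below is a placeholder):

    # Hypothetical caller -- placeholder URL; assumes `requests` is imported at module level.
    def check(self, instance):
        url = instance.get('url', 'http://localhost:8080/status')
        proxies = self.get_instance_proxy(instance, url)
        resp = requests.get(url, proxies=proxies, timeout=self.default_integration_http_timeout)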
Example #16
    def init(self):
        try:
            # We configure the check with the right cgroup settings for this host
            # Just needs to be done once
            instance = self.instances[0]
            set_docker_settings(self.init_config, instance)

            self.client = get_client()
            self._docker_root = self.init_config.get('docker_root', '/')
            self._mountpoints = get_mountpoints(self._docker_root)
            self.cgroup_listing_retries = 0
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # At first run we'll just collect the events from the latest 60 secs
            self._last_event_collection_ts = int(time.time()) - 60

            # Set tagging options
            self.custom_tags = instance.get("tags", [])
            self.collect_labels_as_tags = instance.get("collect_labels_as_tags", [])
            self.tag_names = {
                CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS),
                IMAGE: instance.get("image_tags", DEFAULT_IMAGE_TAGS)
            }

            # Set filtering settings
            if not instance.get("exclude"):
                self._filtering_enabled = False
                if instance.get("include"):
                    self.log.warning("You must specify an exclude section to enable filtering")
            else:
                self._filtering_enabled = True
                include = instance.get("include", [])
                exclude = instance.get("exclude", [])
                self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(include, exclude)
                self.tag_names[FILTERED] = _filtered_tag_names

            # Other options
            self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
            self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
            self.collect_events = _is_affirmative(instance.get('collect_events', True))
            self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
            self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()
        except Exception as e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
Example #17
    def get_instance_config(self, instance):
        url = instance.get('url')
        if url is None:
            raise Exception("An url must be specified in the instance")

        pshard_stats = _is_affirmative(instance.get('pshard_stats', False))

        cluster_stats = _is_affirmative(instance.get('cluster_stats', False))
        if 'is_external' in instance:
            cluster_stats = _is_affirmative(instance.get('is_external', False))

        pending_task_stats = _is_affirmative(instance.get('pending_task_stats', True))
        # Support URLs that have a path in them from the config, for
        # backwards-compatibility.
        parsed = urlparse.urlparse(url)
        if parsed[2] != "":
            url = "%s://%s" % (parsed[0], parsed[1])
        port = parsed.port
        host = parsed.hostname

        custom_tags = instance.get('tags', [])
        service_check_tags = [
            'host:%s' % host,
            'port:%s' % port
        ]
        service_check_tags.extend(custom_tags)

        # Tag by URL so we can differentiate the metrics
        # from multiple instances
        tags = ['url:%s' % url]
        tags.extend(custom_tags)

        timeout = instance.get('timeout') or self.DEFAULT_TIMEOUT

        config = ESInstanceConfig(
            pshard_stats=pshard_stats,
            cluster_stats=cluster_stats,
            password=instance.get('password'),
            service_check_tags=service_check_tags,
            ssl_cert=instance.get('ssl_cert'),
            ssl_key=instance.get('ssl_key'),
            ssl_verify=instance.get('ssl_verify'),
            tags=tags,
            timeout=timeout,
            url=url,
            username=instance.get('username'),
            pending_task_stats=pending_task_stats
        )
        return config
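The path-stripping above is easier to see in isolation; a quick standalone sketch with a made-up URL (Python 2 `urlparse`):

# Standalone illustration of the backwards-compatible URL handling above.
import urlparse

url = 'http://es-node1:9200/some/legacy/path'   # made-up config value
parsed = urlparse.urlparse(url)
if parsed[2] != "":
    # drop the path, keep scheme://netloc only
    url = "%s://%s" % (parsed[0], parsed[1])
print url              # http://es-node1:9200
print parsed.hostname  # es-node1
print parsed.port      # 9200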
Example #18
    def _load_conf(self, instance):
        self._excluded_filesystems = instance.get('excluded_filesystems', [])
        self._excluded_disks = instance.get('excluded_disks', [])
        self._tag_by_filesystem = _is_affirmative(
            instance.get('tag_by_filesystem', False))
        self._all_partitions = _is_affirmative(
            instance.get('all_partitions', False))

        # FIXME: 6.x, drop use_mount option in datadog.conf
        self._load_legacy_option(instance, 'use_mount', False,
                                 operation=_is_affirmative)
        # FIXME: 6.x, drop device_blacklist_re option in datadog.conf
        self._load_legacy_option(instance, 'excluded_disk_re', '^$',
                                 legacy_name='device_blacklist_re',
                                 operation=re.compile)
Example #19
    def _cache_morlist_raw(self, instance):
        """ Initiate the first layer to refresh self.morlist by queueing
        _cache_morlist_raw_atomic on the rootFolder in a recursive/async approach
        """

        i_key = self._instance_key(instance)
        self.log.debug("Caching the morlist for vcenter instance %s" % i_key)
        if i_key in self.morlist_raw and len(self.morlist_raw[i_key]) > 0:
            self.log.debug(
                "Skipping morlist collection now, RAW results "
                "processing not over (latest refresh was {0}s ago)".format(
                    time.time() - self.cache_times[i_key][MORLIST][LAST])
            )
            return
        self.morlist_raw[i_key] = []

        server_instance = self._get_server_instance(instance)
        root_folder = server_instance.content.rootFolder

        instance_tag = "vcenter_server:%s" % instance.get('name')
        regexes = {
            'host_include': instance.get('host_include_only_regex'),
            'vm_include': instance.get('vm_include_only_regex')
        }
        include_only_marked = _is_affirmative(instance.get('include_only_marked', False))
        self.pool.apply_async(
            self._cache_morlist_raw_atomic,
            args=(i_key, 'rootFolder', root_folder, [instance_tag], regexes, include_only_marked)
        )
        self.cache_times[i_key][MORLIST][LAST] = time.time()
Example #20
    def get_instance_config(self, instance):
        url = instance.get("url")
        if url is None:
            raise Exception("An url must be specified in the instance")

        is_external = _is_affirmative(instance.get("is_external", False))

        # Support URLs that have a path in them from the config, for
        # backwards-compatibility.
        parsed = urlparse.urlparse(url)
        if parsed[2] != "":
            url = "%s://%s" % (parsed[0], parsed[1])
        port = parsed.port
        host = parsed.hostname
        service_check_tags = ["host:%s" % host, "port:%s" % port]

        # Tag by URL so we can differentiate the metrics
        # from multiple instances
        tags = ["url:%s" % url]
        tags.extend(instance.get("tags", []))

        timeout = instance.get("timeout") or self.DEFAULT_TIMEOUT

        config = ESInstanceConfig(
            is_external=is_external,
            password=instance.get("password"),
            service_check_tags=service_check_tags,
            tags=tags,
            timeout=timeout,
            url=url,
            username=instance.get("username"),
        )
        return config
Example #21
    def _process_results(self):
        for i in xrange(MAX_LOOP_ITERATIONS):
            try:
                # We want to fetch the result in a non blocking way
                status, msg, sc_name, instance = self.resultsq.get_nowait()
            except Empty:
                break

            instance_name = instance['name']
            if status == FAILURE:
                self.nb_failures += 1
                if self.nb_failures >= self.pool_size - 1:
                    self.nb_failures = 0
                    self.restart_pool()

                # clean failed job
                self._clean_job(instance_name)
                continue

            self.report_as_service_check(sc_name, status, instance, msg)

            # FIXME: 5.3, this has been deprecated before, get rid of events
            # Don't create any event to avoid duplicates with server side
            # service_checks
            skip_event = _is_affirmative(instance.get('skip_event', False))
            if not skip_event:
                self.warning("Using events for service checks is deprecated in favor of monitors and will be removed in future versions of the Datadog Agent.")
                event = None

                if instance_name not in self.statuses:
                    self.statuses[instance_name] = defaultdict(list)

                self.statuses[instance_name][sc_name].append(status)

                window = int(instance.get('window', 1))

                if window > 256:
                    self.log.warning("Maximum window size (256) exceeded, defaulting it to 256")
                    window = 256

                threshold = instance.get('threshold', 1)

                if len(self.statuses[instance_name][sc_name]) > window:
                    self.statuses[instance_name][sc_name].pop(0)

                nb_failures = self.statuses[instance_name][sc_name].count(Status.DOWN)

                if nb_failures >= threshold:
                    if self.notified.get((instance_name, sc_name), Status.UP) != Status.DOWN:
                        event = self._create_status_event(sc_name, status, msg, instance)
                        self.notified[(instance_name, sc_name)] = Status.DOWN
                else:
                    if self.notified.get((instance_name, sc_name), Status.UP) != Status.UP:
                        event = self._create_status_event(sc_name, status, msg, instance)
                        self.notified[(instance_name, sc_name)] = Status.UP

                if event is not None:
                    self.events.append(event)

            self._clean_job(instance_name)
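The window/threshold bookkeeping above is clearer with concrete values; a standalone sketch (window, threshold and the status sequence are made up):

# Standalone sketch of the flap-suppression logic above: an event fires only when
# the number of DOWN results inside the sliding window reaches the threshold,
# and again when a previously-notified instance recovers.
from collections import defaultdict

UP, DOWN = 'UP', 'DOWN'
window, threshold = 3, 2
statuses = defaultdict(list)
notified = {}

def record(name, status):
    statuses[name].append(status)
    if len(statuses[name]) > window:
        statuses[name].pop(0)
    if statuses[name].count(DOWN) >= threshold:
        if notified.get(name, UP) != DOWN:
            notified[name] = DOWN
            return 'alert: %s is DOWN' % name
    elif notified.get(name, UP) != UP:
        notified[name] = UP
        return 'recovery: %s is UP' % name

for s in (UP, DOWN, DOWN, UP, UP):
    print record('web-1', s)   # None, None, alert, None, recovery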
Example #22
    def check(self, instance):
        # Report image metrics
        self.warning('Using the "docker" check is deprecated and will be removed'
        ' in a future version of the agent. Please use the "docker_daemon" one instead')
        if _is_affirmative(instance.get('collect_images_stats', True)):
            self._count_images(instance)

        # Get the list of containers and the index of their names
        containers, ids_to_names = self._get_and_count_containers(instance)

        # Report container metrics from cgroups
        skipped_container_ids = self._report_containers_metrics(containers, instance)

        # Send events from Docker API
        if _is_affirmative(instance.get('collect_events', False)):
            self._process_events(instance, ids_to_names, skipped_container_ids)
Example #23
    def check(self, instance):
        url = instance.get("url")
        username = instance.get("username")
        password = instance.get("password")
        custom_tags = instance.get('tags', [])
        max_queues = int(instance.get("max_queues", MAX_ELEMENTS))
        max_topics = int(instance.get("max_topics", MAX_ELEMENTS))
        max_subscribers = int(instance.get("max_subscribers", MAX_ELEMENTS))
        detailed_queues = instance.get("detailed_queues", [])
        detailed_topics = instance.get("detailed_topics", [])
        detailed_subscribers = instance.get("detailed_subscribers", [])
        suppress_errors = _is_affirmative(instance.get("suppress_errors", False))

        tags = custom_tags + ["url:{0}".format(url)]

        self.log.debug("Processing ActiveMQ data for %s" % url)
        data = self._fetch_data(url, QUEUE_URL, username, password, suppress_errors)
        if data:
            self._process_data(data, "queue", tags, max_queues, detailed_queues)

        data = self._fetch_data(url, TOPIC_URL, username, password, suppress_errors)
        if data:
            self._process_data(data, "topic", tags, max_topics, detailed_topics)

        data = self._fetch_data(url, SUBSCRIBER_URL, username, password, suppress_errors)
        if data:
            self._process_subscriber_data(data, tags, max_subscribers, detailed_subscribers)
Example #24
    def _cache_morlist_raw(self, instance):
        """
        Initiate the first layer to refresh the list of MORs (`self.morlist`).

        Resolve the vCenter `rootFolder` and initiate hosts and virtual machines discovery.

        """

        i_key = self._instance_key(instance)
        self.log.debug("Caching the morlist for vcenter instance %s" % i_key)
        if i_key in self.morlist_raw and len(self.morlist_raw[i_key]) > 0:
            self.log.debug(
                "Skipping morlist collection now, RAW results "
                "processing not over (latest refresh was {0}s ago)".format(
                    time.time() - self.cache_times[i_key][MORLIST][LAST])
            )
            return
        self.morlist_raw[i_key] = []

        server_instance = self._get_server_instance(instance)
        root_folder = server_instance.content.rootFolder

        instance_tag = "vcenter_server:%s" % instance.get('name')
        regexes = {
            'host_include': instance.get('host_include_only_regex'),
            'vm_include': instance.get('vm_include_only_regex')
        }
        include_only_marked = _is_affirmative(instance.get('include_only_marked', False))

        # Discover hosts and virtual machines
        self._discover_mor(i_key, root_folder, [instance_tag], regexes, include_only_marked)

        self.cache_times[i_key][MORLIST][LAST] = time.time()
Example #25
    def _connect(self, instance):
        for e in ("access_id", "access_secret"):
            if e not in instance:
                raise Exception("{0} parameter is required.".format(e))

        s3_settings = {
            "aws_access_key_id": instance.get('access_id', None),
            "aws_secret_access_key": instance.get('access_secret', None),
            "proxy": instance.get('host', 'localhost'),
            "proxy_port": int(instance.get('port', 8080)),
            "is_secure": _is_affirmative(instance.get('is_secure', True))
        }

        if instance.get('s3_root'):
            s3_settings['host'] = instance['s3_root']

        aggregation_key = s3_settings['proxy'] + ":" + str(s3_settings['proxy_port'])

        try:
            s3 = S3Connection(**s3_settings)
        except Exception as e:
            self.log.error("Error connecting to {0}: {1}".format(aggregation_key, e))
            self.service_check(
                self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                tags=["aggregation_key:{0}".format(aggregation_key)],
                message=str(e))
            raise
Example #26
    def _connect(self, instance):
        for e in ("access_id", "access_secret"):
            if e not in instance:
                raise Exception("{0} parameter is required.".format(e))

        s3_settings = {
            "aws_access_key_id": instance.get("access_id", None),
            "aws_secret_access_key": instance.get("access_secret", None),
            "proxy": instance.get("host", "localhost"),
            "proxy_port": int(instance.get("port", 8080)),
            "is_secure": _is_affirmative(instance.get("is_secure", True)),
        }

        if instance.get("s3_root"):
            s3_settings["host"] = instance["s3_root"]

        aggregation_key = s3_settings["proxy"] + ":" + str(s3_settings["proxy_port"])

        try:
            s3 = S3Connection(**s3_settings)
        except Exception as e:
            self.log.error("Error connecting to {0}: {1}".format(aggregation_key, e))
            self.service_check(
                self.SERVICE_CHECK_NAME,
                AgentCheck.CRITICAL,
                tags=["aggregation_key:{0}".format(aggregation_key)],
                message=str(e),
            )
            raise

        tags = instance.get("tags", [])
        tags.append("aggregation_key:{0}".format(aggregation_key))

        return s3, aggregation_key, tags
Example #27
    def _collect_raw(self, ceph_cmd, instance):
        use_sudo = _is_affirmative(instance.get('use_sudo', False))
        ceph_args = []
        if use_sudo:
            test_sudo = os.system('setsid sudo -l < /dev/null')
            if test_sudo != 0:
                raise Exception('The dd-agent user does not have sudo access')
            ceph_args = ['sudo', ceph_cmd]
        else:
            ceph_args = [ceph_cmd]

        args = ceph_args + ['version']
        try:
            output,_,_ = get_subprocess_output(args, self.log)
        except Exception as e:
            raise Exception('Unable to run cmd=%s: %s' % (' '.join(args), str(e)))

        raw = {}
        for cmd in ('mon_status', 'status', 'df detail', 'osd pool stats', 'osd perf', 'health detail'):
            try:
                args = ceph_args + cmd.split() + ['-fjson']
                output,_,_ = get_subprocess_output(args, self.log)
                res = json.loads(output)
            except Exception as e:
                self.log.warning('Unable to parse data from cmd=%s: %s' % (cmd, str(e)))
                continue

            name = cmd.replace(' ', '_')
            raw[name] = res

        return raw
Example #28
    def _get_and_count_containers(self, instance):
        tags = instance.get("tags", [])
        with_size = _is_affirmative(instance.get('collect_container_size', False))

        service_check_name = 'docker.service_up'
        try:
            running_containers = self._get_containers(instance, with_size=with_size)
            all_containers = self._get_containers(instance, get_all=True)
        except (socket.timeout, urllib2.URLError) as e:
            self.service_check(service_check_name, AgentCheck.CRITICAL,
                message="Unable to list Docker containers: {0}".format(e))
            raise Exception("Failed to collect the list of containers. Exception: {0}".format(e))
        self.service_check(service_check_name, AgentCheck.OK)

        running_containers_ids = set([container['Id'] for container in running_containers])

        for container in all_containers:
            container_tags = list(tags)
            for key in DOCKER_TAGS:
                tag = self._make_tag(key, container[key], instance)
                if tag:
                    container_tags.append(tag)
            if container['Id'] in running_containers_ids:
                self.set("docker.containers.running", container['Id'], tags=container_tags)
            else:
                self.set("docker.containers.stopped", container['Id'], tags=container_tags)

        # The index of the names is used to generate and format events
        ids_to_names = {}
        for container in all_containers:
            ids_to_names[container['Id']] = self._get_container_name(container)

        return running_containers, ids_to_names
Example #29
    def __init__(self, *args, **kwargs):
        # `args` order is `name`, `init_config`, `agentConfig` (deprecated), `instances`

        self.metrics = defaultdict(list)

        self.instances = kwargs.get('instances', [])
        self.name = kwargs.get('name', '')
        self.init_config = kwargs.get('init_config', {})
        self.agentConfig = kwargs.get('agentConfig', {})
        self.warnings = []

        if len(args) > 0:
            self.name = args[0]
        if len(args) > 1:
            self.init_config = args[1]
        if len(args) > 2:
            if len(args) > 3 or 'instances' in kwargs:
                # old-style init: the 3rd argument is `agentConfig`
                self.agentConfig = args[2]
                if len(args) > 3:
                    self.instances = args[3]
            else:
                # new-style init: the 3rd argument is `instances`
                self.instances = args[2]

        self.hostname = datadog_agent.get_hostname()  # `self.hostname` is deprecated, use `datadog_agent.get_hostname()` instead

        # the agent5 'AgentCheck' setup a log attribute.
        self.log = logging.getLogger('%s.%s' % (__name__, self.name))

        # Set proxy settings
        self.proxies = get_requests_proxy(self.agentConfig)
        if not self.init_config:
            self._use_agent_proxy = True
        else:
            self._use_agent_proxy = _is_affirmative(
                self.init_config.get("use_agent_proxy", True))

        self.default_integration_http_timeout = float(self.agentConfig.get('default_integration_http_timeout', 9))

        self._deprecations = {
            'increment': [
                False,
                "DEPRECATION NOTICE: `AgentCheck.increment`/`AgentCheck.decrement` are deprecated, please use " +
                "`AgentCheck.gauge` or `AgentCheck.count` instead, with a different metric name",
            ],
            'device_name': [
                False,
                "DEPRECATION NOTICE: `device_name` is deprecated, please use a `device:` tag in the `tags` list instead",
            ],
            'in_developer_mode': [
                False,
                "DEPRECATION NOTICE: `in_developer_mode` is deprecated, please stop using it.",
            ],
            'no_proxy': [
                False,
                "DEPRECATION NOTICE: The `no_proxy` config option has been renamed "
                "to `skip_proxy` and will be removed in a future release.",
            ],
        }
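The positional-argument handling above supports two historical call styles; the standalone sketch below reproduces the same precedence rules, stripped of any agent machinery (function and option values are made up).

# Standalone illustration of the argument parsing above.
def parse_check_args(*args, **kwargs):
    name = kwargs.get('name', '')
    init_config = kwargs.get('init_config', {})
    agent_config = kwargs.get('agentConfig', {})
    instances = kwargs.get('instances', [])
    if len(args) > 0:
        name = args[0]
    if len(args) > 1:
        init_config = args[1]
    if len(args) > 2:
        if len(args) > 3 or 'instances' in kwargs:
            # old-style: the third positional argument is agentConfig
            agent_config = args[2]
            if len(args) > 3:
                instances = args[3]
        else:
            # new-style: the third positional argument is instances
            instances = args[2]
    return name, init_config, agent_config, instances

# new-style call: three positionals, the last one is `instances`
print parse_check_args('my_check', {}, [{'url': 'http://localhost'}])
# old-style call: agentConfig third, instances fourth
print parse_check_args('my_check', {}, {'api_key': '***'}, [{'url': 'http://localhost'}])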
Example #30
    def get_instance_proxy(self, instance, uri, proxies=None):
        proxies = proxies if proxies is not None else self.proxies.copy()
        proxies['no'] = get_no_proxy_from_env()

        deprecated_skip = instance.get('no_proxy', None)
        skip = (
            _is_affirmative(instance.get('skip_proxy', False)) or
            _is_affirmative(deprecated_skip)
        )

        if deprecated_skip is not None:
            self.warning(
                'Deprecation notice: The `no_proxy` config option has been renamed '
                'to `skip_proxy` and will be removed in a future release.'
            )

        return config_proxy_skip(proxies, uri, skip)
Example #31
    def _get_config(self, instance):
        # make sure 'rabbitmq_api_url' is present and get parameters
        base_url = instance.get('rabbitmq_api_url', None)
        if not base_url:
            raise Exception('Missing "rabbitmq_api_url" in RabbitMQ config.')
        if not base_url.endswith('/'):
            base_url += '/'
        username = instance.get('rabbitmq_user', 'guest')
        password = instance.get('rabbitmq_pass', 'guest')
        custom_tags = instance.get('tags', [])
        parsed_url = urlparse.urlparse(base_url)
        if not parsed_url.scheme or "://" not in parsed_url.geturl():
            self.log.warning(
                'The rabbit url did not include a protocol, assuming http')
            # urlparse.urljoin cannot add a protocol to the rest of the url for some reason.
            # This still leaves the potential for errors, but such urls would never have been valid anyway,
            # and it's not likely to be useful to attempt to catch all possible mistakes people could make.
            # urlparse also has a known issue parsing a url with no schema but a port in the host section,
            # mistakenly taking the host for the schema, hence the additional validation
            base_url = 'http://' + base_url
            parsed_url = urlparse.urlparse(base_url)

        ssl_verify = _is_affirmative(instance.get('ssl_verify', True))
        if not ssl_verify and parsed_url.scheme == 'https':
            self.log.warning(
                'Skipping SSL cert validation for %s based on configuration.' %
                (base_url))

        # Limit of queues/nodes to collect metrics from
        max_detailed = {
            EXCHANGE_TYPE: int(instance.get('max_detailed_exchanges', MAX_DETAILED_EXCHANGES)),
            QUEUE_TYPE: int(instance.get('max_detailed_queues', MAX_DETAILED_QUEUES)),
            NODE_TYPE: int(instance.get('max_detailed_nodes', MAX_DETAILED_NODES)),
        }

        # List of queues/nodes to collect metrics from
        specified = {
            EXCHANGE_TYPE: {
                'explicit': instance.get('exchanges', []),
                'regexes': instance.get('exchanges_regexes', []),
            },
            QUEUE_TYPE: {
                'explicit': instance.get('queues', []),
                'regexes': instance.get('queues_regexes', []),
            },
            NODE_TYPE: {
                'explicit': instance.get('nodes', []),
                'regexes': instance.get('nodes_regexes', []),
            },
        }

        for object_type, filters in specified.iteritems():
            for filter_type, filter_objects in filters.iteritems():
                if not isinstance(filter_objects, list):
                    raise TypeError(
                        "{0} / {0}_regexes parameter must be a list".format(object_type))

        auth = (username, password)

        return base_url, max_detailed, specified, auth, ssl_verify, custom_tags
Example #32
    def __init__(self, **kwargs):
        self.docker_util = DockerUtil()
        if 'init_config' in kwargs and 'instance' in kwargs:
            init_config = kwargs.get('init_config', {})
            instance = kwargs.get('instance', {})
        else:
            try:
                config_file_path = get_conf_path(KUBERNETES_CHECK_NAME)
                check_config = check_yaml(config_file_path)
                init_config = check_config['init_config'] or {}
                instance = check_config['instances'][0] or {}
            # kubernetes.yaml was not found
            except IOError as ex:
                log.error(ex.message)
                init_config, instance = {}, {}
            except Exception:
                log.error('Kubernetes configuration file is invalid. '
                          'Trying to connect to kubelet with default settings anyway...')
                init_config, instance = {}, {}

        self.method = instance.get('method', KubeUtil.DEFAULT_METHOD)
        self._node_ip = self._node_name = None  # lazy evaluation
        self.host_name = os.environ.get('HOSTNAME')
        self.pod_name = os.environ.get('KUBERNETES_POD_NAME') or self.host_name
        self.tls_settings = self._init_tls_settings(instance)

        # apiserver
        if 'api_server_url' in instance:
            self.kubernetes_api_root_url = instance.get('api_server_url')
        else:
            master_host = os.environ.get('KUBERNETES_SERVICE_HOST') or self.DEFAULT_MASTER_NAME
            master_port = os.environ.get('KUBERNETES_SERVICE_PORT') or self.DEFAULT_MASTER_PORT
            self.kubernetes_api_root_url = 'https://%s:%s' % (master_host, master_port)

        self.kubernetes_api_url = '%s/api/v1' % self.kubernetes_api_root_url

        # Service mapping helper class
        self._service_mapper = PodServiceMapper(self)
        from config import _is_affirmative
        self.collect_service_tag = _is_affirmative(
            instance.get('collect_service_tags', KubeUtil.DEFAULT_COLLECT_SERVICE_TAG))

        # leader status triggers event collection
        self.is_leader = False
        self.leader_elector = None
        self.leader_lease_duration = instance.get('leader_lease_duration')

        # kubelet
        # If kubelet_api_url is None, init_kubelet didn't succeed yet.
        self.init_success = False
        self.kubelet_api_url = None
        self.init_retry_interval = init_config.get('init_retry_interval', DEFAULT_RETRY_INTERVAL)
        self.last_init_retry = None
        self.left_init_retries = init_config.get('init_retries', DEFAULT_INIT_RETRIES) + 1
        self.init_kubelet(instance)

        self.kube_label_prefix = instance.get('label_to_tag_prefix', KubeUtil.DEFAULT_LABEL_PREFIX)
        self.kube_node_labels = instance.get('node_labels_to_host_tags', {})

        # keep track of the latest k8s event we collected and posted
        # default value is 0 but TTL for k8s events is one hour anyways
        self.last_event_collection_ts = 0
Example #33
    def get_stats(self, instance, base_url, object_type, max_detailed, filters, auth=None):
        """
        instance: the check instance
        base_url: the url of the rabbitmq management api (e.g. http://localhost:15672/api)
        object_type: either QUEUE_TYPE or NODE_TYPE
        max_detailed: the limit of objects to collect for this type
        filters: explicit or regexes filters of specified queues or nodes (specified in the yaml file)
        """

        data = self._get_data(
            urlparse.urljoin(base_url, object_type), auth=auth)
        # Make a copy of this list as we will remove items from it at each
        # iteration
        explicit_filters = list(filters['explicit'])
        regex_filters = filters['regexes']

        """ data is a list of nodes or queues:
        data = [
            {'status': 'running', 'node': 'rabbit@host', 'name': 'queue1', 'consumers': 0, 'vhost': '/', 'backing_queue_status': {'q1': 0, 'q3': 0, 'q2': 0, 'q4': 0, 'avg_ack_egress_rate': 0.0, 'ram_msg_count': 0, 'ram_ack_count': 0, 'len': 0, 'persistent_count': 0, 'target_ram_count': 'infinity', 'next_seq_id': 0, 'delta': ['delta', 'undefined', 0, 'undefined'], 'pending_acks': 0, 'avg_ack_ingress_rate': 0.0, 'avg_egress_rate': 0.0, 'avg_ingress_rate': 0.0}, 'durable': True, 'idle_since': '2013-10-03 13:38:18', 'exclusive_consumer_tag': '', 'arguments': {}, 'memory': 10956, 'policy': '', 'auto_delete': False},
            {'status': 'running', 'node': 'rabbit@host', 'name': 'queue10', 'consumers': 0, 'vhost': '/', 'backing_queue_status': {'q1': 0, 'q3': 0, 'q2': 0, 'q4': 0, 'avg_ack_egress_rate': 0.0, 'ram_msg_count': 0, 'ram_ack_count': 0, 'len': 0, 'persistent_count': 0, 'target_ram_count': 'infinity', 'next_seq_id': 0, 'delta': ['delta', 'undefined', 0, 'undefined'], 'pending_acks': 0, 'avg_ack_ingress_rate': 0.0, 'avg_egress_rate': 0.0, 'avg_ingress_rate': 0.0}, 'durable': True, 'idle_since': '2013-10-03 13:38:18', 'exclusive_consumer_tag': '', 'arguments': {}, 'memory': 10956, 'policy': '', 'auto_delete': False},
            {'status': 'running', 'node': 'rabbit@host', 'name': 'queue11', 'consumers': 0, 'vhost': '/', 'backing_queue_status': {'q1': 0, 'q3': 0, 'q2': 0, 'q4': 0, 'avg_ack_egress_rate': 0.0, 'ram_msg_count': 0, 'ram_ack_count': 0, 'len': 0, 'persistent_count': 0, 'target_ram_count': 'infinity', 'next_seq_id': 0, 'delta': ['delta', 'undefined', 0, 'undefined'], 'pending_acks': 0, 'avg_ack_ingress_rate': 0.0, 'avg_egress_rate': 0.0, 'avg_ingress_rate': 0.0}, 'durable': True, 'idle_since': '2013-10-03 13:38:18', 'exclusive_consumer_tag': '', 'arguments': {}, 'memory': 10956, 'policy': '', 'auto_delete': False},
            ...
        ]
        """
        if len(explicit_filters) > max_detailed:
            raise Exception(
                "The maximum number of %s you can specify is %d." % (object_type, max_detailed))

        # a list of queues/nodes is specified. We process only those
        if explicit_filters or regex_filters:
            matching_lines = []
            for data_line in data:
                name = data_line.get("name")
                if name in explicit_filters:
                    matching_lines.append(data_line)
                    explicit_filters.remove(name)
                    continue

                match_found = False
                for p in regex_filters:
                    match = re.search(p, name)
                    if match:
                        if _is_affirmative(instance.get("tag_families", False)) and match.groups():
                            data_line["queue_family"] = match.groups()[0]
                        matching_lines.append(data_line)
                        match_found = True
                        break

                if match_found:
                    continue

                # Absolute names work only for queues
                if object_type != QUEUE_TYPE:
                    continue
                absolute_name = '%s/%s' % (data_line.get("vhost"), name)
                if absolute_name in explicit_filters:
                    matching_lines.append(data_line)
                    explicit_filters.remove(absolute_name)
                    continue

                for p in regex_filters:
                    match = re.search(p, absolute_name)
                    if match:
                        if _is_affirmative(instance.get("tag_families", False)) and match.groups():
                            data_line["queue_family"] = match.groups()[0]
                        matching_lines.append(data_line)
                        match_found = True
                        break

                if match_found:
                    continue

            data = matching_lines

        # if no filters are specified, check everything according to the limits
        if len(data) > ALERT_THRESHOLD * max_detailed:
            # Post a message on the dogweb stream to warn
            self.alert(base_url, max_detailed, len(data), object_type)

        if len(data) > max_detailed:
            # Display a warning in the info page
            self.warning(
                "Too many queues to fetch. You must choose the %s you are interested in by editing the rabbitmq.yaml configuration file or get in touch with Datadog Support" % object_type)

        for data_line in data[:max_detailed]:
            # We truncate the list of nodes/queues if it's above the limit
            self._get_metrics(data_line, object_type)
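The filtering pass above keeps only the queues or nodes named explicitly or matched by a regex, optionally tagging the first capture group as a queue family. A minimal, self-contained sketch of that matching logic (the filter_queues name and the sample data are illustrative, not taken from the check):

    import re

    def filter_queues(data, explicit_filters, regex_filters, tag_families=False):
        # Keep entries whose "name" or "vhost/name" is listed explicitly or
        # matches one of the regex filters; the first capture group, if any,
        # becomes the queue_family tag.
        matching = []
        for line in data:
            name = line.get("name")
            absolute_name = "%s/%s" % (line.get("vhost"), name)
            if name in explicit_filters or absolute_name in explicit_filters:
                matching.append(line)
                continue
            for pattern in regex_filters:
                match = re.search(pattern, name) or re.search(pattern, absolute_name)
                if match:
                    if tag_families and match.groups():
                        line["queue_family"] = match.groups()[0]
                    matching.append(line)
                    break
        return matching

    queues = [{"name": "orders.high", "vhost": "/"}, {"name": "tmp.1", "vhost": "/"}]
    print(filter_queues(queues, [], [r"^(orders)\."], tag_families=True))
    # only 'orders.high' is kept, tagged with queue_family 'orders'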
Example #34
0
    def body_lines(self):
        # Metadata whitelist
        metadata_whitelist = ['hostname', 'fqdn', 'ipv4', 'instance-id']

        lines = ['Clocks', '======', '']
        try:
            ntp_offset, ntp_styles = get_ntp_info()
            lines.append('  ' + style('NTP offset', *ntp_styles) + ': ' +
                         style('%s s' % round(ntp_offset, 4), *ntp_styles))
        except Exception as e:
            lines.append('  NTP offset: Unknown (%s)' % str(e))
        lines.append('  System UTC time: ' +
                     datetime.datetime.utcnow().__str__())
        lines.append('')

        # Paths to checks.d/conf.d
        lines += ['Paths', '=====', '']

        osname = config.get_os()

        try:
            confd_path = config.get_confd_path(osname)
        except config.PathNotFound:
            confd_path = 'Not found'

        try:
            checksd_path = config.get_checksd_path(osname)
        except config.PathNotFound:
            checksd_path = 'Not found'

        lines.append('  conf.d: ' + confd_path)
        lines.append('  checks.d: ' + checksd_path)
        lines.append('')

        # Hostnames
        lines += ['Hostnames', '=========', '']

        if not self.host_metadata:
            lines.append("  No host information available yet.")
        else:
            for key, host in self.host_metadata.iteritems():
                for whitelist_item in metadata_whitelist:
                    if whitelist_item in key:
                        lines.append("  " + key + ": " + host)
                        break

        lines.append('')

        # Checks.d Status
        lines += ['Checks', '======', '']
        check_statuses = self.check_statuses + get_jmx_status()
        if not check_statuses:
            lines.append("  No checks have run yet.")
        else:
            for cs in check_statuses:
                check_lines = ['  ' + cs.name, '  ' + '-' * len(cs.name)]
                if cs.init_failed_error:
                    check_lines.append(
                        "    - initialize check class [%s]: %s" % (style(
                            STATUS_ERROR, 'red'), repr(cs.init_failed_error)))
                    if self.verbose and cs.init_failed_traceback:
                        check_lines.extend(
                            '      ' + line
                            for line in cs.init_failed_traceback.split('\n'))
                else:
                    for s in cs.instance_statuses:
                        c = 'green'
                        if s.has_warnings():
                            c = 'yellow'
                        if s.has_error():
                            c = 'red'
                        line = "    - instance #%s [%s]" % (s.instance_id,
                                                            style(s.status, c))
                        if s.has_error():
                            line += u": %s" % s.error
                        if s.metric_count is not None:
                            line += " collected %s metrics" % s.metric_count
                        if s.instance_check_stats is not None:
                            line += " Last run duration: %s" % s.instance_check_stats.get(
                                'run_time')

                        check_lines.append(line)

                        if s.has_warnings():
                            for warning in s.warnings:
                                warn = warning.split('\n')
                                if not len(warn):
                                    continue
                                check_lines.append(
                                    u"        %s: %s" %
                                    (style("Warning", 'yellow'), warn[0]))
                                check_lines.extend(u"        %s" % l
                                                   for l in warn[1:])
                        if self.verbose and s.traceback is not None:
                            check_lines.extend(
                                '      ' + line
                                for line in s.traceback.split('\n'))

                    check_lines += [
                        "    - Collected %s metric%s, %s event%s & %s service check%s"
                        % (cs.metric_count, plural(
                            cs.metric_count), cs.event_count,
                           plural(cs.event_count), cs.service_check_count,
                           plural(cs.service_check_count)),
                    ]

                    if cs.check_stats is not None:
                        check_lines += [
                            "    - Stats: %s" %
                            pretty_statistics(cs.check_stats)
                        ]

                    if cs.library_versions is not None:
                        check_lines += ["    - Dependencies:"]
                        for library, version in cs.library_versions.iteritems(
                        ):
                            check_lines += [
                                "        - %s: %s" % (library, version)
                            ]

                    check_lines += [""]

                lines += check_lines

        # Metadata status
        metadata_enabled = _is_affirmative(get_config().get(
            'display_service_metadata', False))

        if metadata_enabled:
            lines += ["", "Service metadata", "================", ""]
            if not check_statuses:
                lines.append("  No checks have run yet.")
            else:
                meta_lines = []
                for cs in check_statuses:
                    # Check title
                    check_line = ['  ' + cs.name, '  ' + '-' * len(cs.name)]
                    instance_lines = []
                    for i, meta in enumerate(cs.service_metadata):
                        if not meta:
                            continue
                        instance_lines += ["    - instance #%s:" % i]
                        for k, v in meta.iteritems():
                            instance_lines += ["        - %s: %s" % (k, v)]
                    if instance_lines:
                        check_line += instance_lines
                        meta_lines += check_line
                if meta_lines:
                    lines += meta_lines
                else:
                    lines.append("  No metadata were collected.")

        # Emitter status
        lines += ["", "Emitters", "========", ""]
        if not self.emitter_statuses:
            lines.append("  No emitters have run yet.")
        else:
            for es in self.emitter_statuses:
                c = 'green'
                if es.has_error():
                    c = 'red'
                line = "  - %s [%s]" % (es.name, style(es.status, c))
                if es.status != STATUS_OK:
                    line += ": %s" % es.error
                lines.append(line)

        return lines
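The counters line above leans on a small plural() helper that is not shown in this snippet; a one-line sketch of what it presumably does:

    def plural(count):
        # "1 metric" vs. "3 metrics"
        return "" if count == 1 else "s"

    print("Collected %s metric%s" % (3, plural(3)))  # Collected 3 metrics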
Example #35
0
    def check(self, instance):
        if not self.kubeutil.init_success:
            if self.kubeutil.left_init_retries > 0:
                self.kubeutil.init_kubelet(instance)
                self.log.warning(
                    "Kubelet client is not initialized, Kubernetes check is paused."
                )
                return
            else:
                raise Exception(
                    "Unable to initialize Kubelet client. Try setting the host parameter. The Kubernetes check failed permanently."
                )

        # Leader election
        self.refresh_leader_status(instance)

        self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
        enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
        self.enabled_gauges = [
            "{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges
        ]
        enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
        self.enabled_rates = [
            "{0}.{1}".format(NAMESPACE, x) for x in enabled_rates
        ]

        self.publish_aliases = _is_affirmative(
            instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
        self.use_histogram = _is_affirmative(
            instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
        self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
        self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]
        # initialized by _filter_containers
        self._filtered_containers = set()

        try:
            pods_list = self.kubeutil.retrieve_pods_list()
        except:
            pods_list = None

        # kubelet health checks
        self._perform_kubelet_checks(self.kubeutil.kube_health_url, instance)

        if pods_list is not None:
            # Will not fail if cAdvisor is not available
            self._update_pods_metrics(instance, pods_list)
            # cAdvisor & kubelet metrics, will fail if port 4194 is not open
            try:
                if int(instance.get('port',
                                    KubeUtil.DEFAULT_CADVISOR_PORT)) > 0:
                    self._update_metrics(instance, pods_list)
            except ConnectionError:
                self.warning(
                    '''Can't access the cAdvisor metrics, performance metrics and'''
                    ''' limits/requests will not be collected. Please set up'''
                    ''' your kubelet with the --cadvisor-port=4194 option, or set port to 0'''
                    ''' in this check's configuration to disable cAdvisor lookup.'''
                )
            except Exception as err:
                self.log.warning("Error while getting performance metrics: %s",
                                 str(err))

        # kubernetes events
        if self.event_retriever is not None:
            try:
                events = self.event_retriever.get_event_array()
                changed_cids = self.kubeutil.process_events(events,
                                                            podlist=pods_list)
                if (changed_cids and self._sd_backend):
                    self._sd_backend.update_checks(changed_cids)
                if events and self._collect_events:
                    self._update_kube_events(instance, pods_list, events)
            except Exception as ex:
                self.log.error("Event collection failed: %s", str(ex))
Example #36
0
    def check(self, instance):
        if 'apache_status_url' not in instance:
            raise Exception("Missing 'apache_status_url' in Apache config")

        url = self.assumed_url.get(instance['apache_status_url'], instance['apache_status_url'])

        connect_timeout = int(instance.get('connect_timeout', 5))
        receive_timeout = int(instance.get('receive_timeout', 15))

        tags = instance.get('tags', [])

        disable_ssl_validation = _is_affirmative(instance.get('disable_ssl_validation', False))

        auth = None
        if 'apache_user' in instance and 'apache_password' in instance:
            auth = (instance['apache_user'], instance['apache_password'])

        # Submit a service check for status page availability.
        parsed_url = urlparse.urlparse(url)
        apache_host = parsed_url.hostname
        apache_port = parsed_url.port or 80
        service_check_name = 'apache.can_connect'
        service_check_tags = ['host:%s' % apache_host, 'port:%s' % apache_port]
        try:
            self.log.debug('apache check initiating request, connect timeout %d receive %d' %
                           (connect_timeout, receive_timeout))
            r = requests.get(url, auth=auth, headers=headers(self.agentConfig),
                             verify=not disable_ssl_validation, timeout=(connect_timeout, receive_timeout))
            r.raise_for_status()

        except Exception as e:
            self.log.warning("Caught exception %s" % str(e))
            self.service_check(service_check_name, AgentCheck.CRITICAL,
                               tags=service_check_tags)
            raise
        else:
            self.service_check(service_check_name, AgentCheck.OK,
                               tags=service_check_tags)
        self.log.debug("apache check succeeded")
        response = r.content
        metric_count = 0
        # Loop through and extract the numerical values
        for line in response.splitlines():
            values = line.split(': ')
            if len(values) == 2: # match
                metric, value = values
                try:
                    value = float(value)
                except ValueError:
                    continue

                # Special case: kBytes => bytes
                if metric == 'Total kBytes':
                    value = value * 1024

                # Send metric as a gauge, if applicable
                if metric in self.GAUGES:
                    metric_count += 1
                    metric_name = self.GAUGES[metric]
                    self.gauge(metric_name, value, tags=tags)

                # Send metric as a rate, if applicable
                if metric in self.RATES:
                    metric_count += 1
                    metric_name = self.RATES[metric]
                    self.rate(metric_name, value, tags=tags)

        if metric_count == 0:
            if self.assumed_url.get(instance['apache_status_url'], None) is None and url[-5:] != '?auto':
                self.assumed_url[instance['apache_status_url']] = '%s?auto' % url
                self.warning("Assuming url was not correct. Trying to add ?auto suffix to the url")
                self.check(instance)
            else:
                raise Exception("No metrics were fetched for this instance. Make sure that %s is the proper url." % instance['apache_status_url'])
Example #37
0
    def get_instance_proxy(self, instance, uri):
        proxies = self.proxies.copy()
        proxies['no'] = get_no_proxy_from_env()

        return config_proxy_skip(
            proxies, uri, _is_affirmative(instance.get('no_proxy', False)))
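config_proxy_skip is expected to return a proxies mapping with the proxy dropped when the instance sets no_proxy or the target host is covered by the environment's no-proxy list; a rough sketch of that idea under those assumptions (this is not the actual utility):

    try:
        from urllib.parse import urlparse   # Python 3
    except ImportError:
        from urlparse import urlparse        # Python 2

    def proxy_skip_sketch(proxies, uri, skip_proxy=False):
        # Illustrative only: bypass the proxy when the instance opts out or
        # the target host appears in the no-proxy list.
        hostname = urlparse(uri).hostname
        no_proxy = [h.strip() for h in proxies.get('no', '').split(',') if h.strip()]
        if skip_proxy or hostname in no_proxy:
            return {}
        return proxies

    print(proxy_skip_sketch({'https': 'http://proxy:3128', 'no': 'internal.host'},
                            'https://internal.host/status'))   # {}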
Example #38
0
    def init(self):
        try:
            instance = self.instances[0]

            self.docker_util = DockerUtil()

            self.docker_client = self.docker_util.client
            self.docker_gateway = DockerUtil.get_gateway()

            self.metadata_collector = MetadataCollector()

            if Platform.is_k8s():
                try:
                    self.kubeutil = KubeUtil()
                except Exception as ex:
                    self.kubeutil = None
                    self.log.error("Couldn't instantiate the kubernetes client, "
                        "subsequent kubernetes calls will fail as well. Error: %s" % str(ex))

            # We configure the check with the right cgroup settings for this host
            # Just needs to be done once
            self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # Set tagging options
            self.custom_tags = instance.get("tags", [])
            self.collect_labels_as_tags = instance.get("collect_labels_as_tags", DEFAULT_LABELS_AS_TAGS)
            self.kube_pod_tags = {}

            self.use_histogram = _is_affirmative(instance.get('use_histogram', False))
            performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

            self.tag_names = {
                CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: performance_tags,
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
            }

            # Set filtering settings
            if self.docker_util.filtering_enabled:
                self.tag_names[FILTERED] = self.docker_util.filtered_tag_names

            # Container network mapping cache
            self.network_mappings = {}

            # get the health check whitelist
            self.whitelist_patterns = None
            health_scs_whitelist = instance.get('health_service_check_whitelist', [])
            if health_scs_whitelist:
                patterns, whitelist_tags = compile_filter_rules(health_scs_whitelist)
                self.whitelist_patterns = set(patterns)
                self.tag_names[HEALTHCHECK] = set(whitelist_tags)


            # Other options
            self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
            self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
            self.collect_container_count = _is_affirmative(instance.get('collect_container_count', False))
            self.collect_volume_count = _is_affirmative(instance.get('collect_volume_count', False))
            self.collect_events = _is_affirmative(instance.get('collect_events', True))
            self.event_attributes_as_tags = instance.get('event_attributes_as_tags', [])
            self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
            self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False))
            self.collect_exit_codes = _is_affirmative(instance.get('collect_exit_codes', False))
            self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()

            self.capped_metrics = instance.get('capped_metrics')

        except Exception as e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
        else:
            self.init_success = True
Example #39
0
                            pretty_statistics(cs.check_stats)
                        ]

                    if cs.library_versions is not None:
                        check_lines += ["    - Dependencies:"]
                        for library, version in cs.library_versions.iteritems(
                        ):
                            check_lines += [
                                "        - %s: %s" % (library, version)
                            ]

                    check_lines += [""]

                lines += check_lines

        metadata_enabled = _is_affirmative(get_config().get(
            'display_service_metadata', False))

        if metadata_enabled:
            lines += ["", "Service metadata", "================", ""]
            if not check_statuses:
                lines.append("  No checks have run yet.")
            else:
                meta_lines = []
                for cs in check_statuses:

                    check_line = ['  ' + cs.name, '  ' + '-' * len(cs.name)]
                    instance_lines = []
                    for i, meta in enumerate(cs.service_metadata):
                        if not meta:
                            continue
                        instance_lines += ["    - instance #%s:" % i]
Example #40
0
    def init(self):
        try:
            instance = self.instances[0]

            self.docker_util = DockerUtil()
            self.docker_client = self.docker_util.client
            if self.is_k8s():
                self.kubeutil = KubeUtil()
            self._mountpoints = self.docker_util.get_mountpoints(
                CGROUP_METRICS)
            self.cgroup_listing_retries = 0
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            self.custom_tags = instance.get("tags", [])
            self.collect_labels_as_tags = instance.get(
                "collect_labels_as_tags", [])
            self.kube_labels = {}

            self.use_histogram = _is_affirmative(
                instance.get('use_histogram', False))
            performance_tags = instance.get("performance_tags",
                                            DEFAULT_PERFORMANCE_TAGS)

            self.tag_names = {
                CONTAINER: instance.get("container_tags",
                                        DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: performance_tags,
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
            }

            if not instance.get("exclude"):
                self._filtering_enabled = False
                if instance.get("include"):
                    self.log.warning(
                        "You must specify an exclude section to enable filtering"
                    )
            else:
                self._filtering_enabled = True
                include = instance.get("include", [])
                exclude = instance.get("exclude", [])
                self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(
                    include, exclude)
                self.tag_names[FILTERED] = _filtered_tag_names

            self.collect_image_stats = _is_affirmative(
                instance.get('collect_images_stats', False))
            self.collect_container_size = _is_affirmative(
                instance.get('collect_container_size', False))
            self.collect_events = _is_affirmative(
                instance.get('collect_events', True))
            self.collect_image_size = _is_affirmative(
                instance.get('collect_image_size', False))
            self.collect_disk_stats = _is_affirmative(
                instance.get('collect_disk_stats', False))
            self.collect_ecs_tags = _is_affirmative(
                instance.get('ecs_tags', True)) and Platform.is_ecs_instance()

            self.ecs_tags = {}

        except Exception as e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
Example #41
0
    def check(self, instance):
        if 'url' not in instance:
            raise Exception('Mesos instance missing "url" value.')

        url = instance['url']
        instance_tags = instance.get('tags', [])
        tasks = instance.get('tasks', [])
        default_timeout = self.init_config.get('default_timeout', 5)
        timeout = float(instance.get('timeout', default_timeout))
        master_port = instance.get("master_port", DEFAULT_MASTER_PORT)
        ssl_verify = not _is_affirmative(
            instance.get('disable_ssl_validation', False))

        state_metrics = self._get_constant_attributes(url, timeout,
                                                      master_port, ssl_verify)
        tags = None

        if state_metrics is None:
            state_metrics = self._get_state(url, timeout, ssl_verify)
        if state_metrics:
            tags = [
                'mesos_pid:{0}'.format(state_metrics['pid']),
                'mesos_node:slave',
            ]
            if self.cluster_name:
                tags.append('mesos_cluster:{0}'.format(self.cluster_name))

            tags += instance_tags
            for task in tasks:
                for framework in state_metrics['frameworks']:
                    for executor in framework['executors']:
                        for t in executor['tasks']:
                            if task.lower() in t['name'].lower(
                            ) and t['slave_id'] == state_metrics['id']:
                                task_tags = ['task_name:' + t['name']] + tags
                                self.service_check(
                                    t['name'] + '.ok',
                                    self.TASK_STATUS[t['state']],
                                    tags=task_tags)
                                for key_name, (
                                        metric_name, metric_func
                                ) in self.TASK_METRICS.iteritems():
                                    metric_func(self,
                                                metric_name,
                                                t['resources'][key_name],
                                                tags=task_tags)

        stats_metrics = self._get_stats(url, timeout, ssl_verify)
        if stats_metrics:
            tags = tags if tags else instance_tags
            metrics = [
                self.SLAVE_TASKS_METRICS, self.SYSTEM_METRICS,
                self.SLAVE_RESOURCE_METRICS, self.SLAVE_EXECUTORS_METRICS,
                self.STATS_METRICS
            ]
            for m in metrics:
                for key_name, (metric_name, metric_func) in m.iteritems():
                    metric_func(self,
                                metric_name,
                                stats_metrics[key_name],
                                tags=tags)

        self.service_check_needed = True
Example #42
0
    def check(self, instance):
        # For calculating lag, we have to fetch offsets from both kafka and
        # zookeeper. There's a potential race condition because whichever one we
        # check first may be outdated by the time we check the other. Better to
        # check consumer offset before checking broker offset because worst case
        # is that it overstates consumer lag a little. Doing it the other way can
        # understate consumer lag to the point of having negative consumer lag,
        # which just creates confusion because it's theoretically impossible.

        # Fetch consumer group offsets from Zookeeper
        zk_hosts_ports = instance.get('zk_connect_str')
        zk_prefix = instance.get('zk_prefix', '')
        zk_interval = int(instance.get('zk_iteration_ival', 0))
        get_kafka_consumer_offsets = _is_affirmative(
            instance.get('kafka_consumer_offsets', zk_hosts_ports is None))

        # If monitor_unlisted_consumer_groups is True, fetch all groups stored in ZK
        consumer_groups = None
        if instance.get('monitor_unlisted_consumer_groups', False):
            consumer_groups = None
        elif 'consumer_groups' in instance:
            consumer_groups = self._read_config(
                instance,
                'consumer_groups',
                cast=self._validate_consumer_groups)

        zk_consumer_offsets = None
        if zk_hosts_ports and \
                self._should_zk(zk_hosts_ports, zk_interval, get_kafka_consumer_offsets):
            zk_consumer_offsets, consumer_groups = self._get_zk_consumer_offsets(
                zk_hosts_ports, consumer_groups, zk_prefix)

        topics = defaultdict(set)
        kafka_consumer_offsets = None

        cli = self._get_kafka_client(instance)
        cli._maybe_refresh_metadata()
        kafka_version = self._get_kafka_version(cli)
        if get_kafka_consumer_offsets:
            # For now, consumer groups are mandatory if not using ZK
            if not zk_hosts_ports and not consumer_groups:
                raise BadKafkaConsumerConfiguration(
                    'Invalid configuration - if you\'re not collecting '
                    'offset from ZK you _must_ specify consumer groups')
            if self._kafka_compatible(kafka_version):
                kafka_consumer_offsets, topics = self._get_kafka_consumer_offsets(
                    instance, consumer_groups)

        if not topics:
            # val = {'consumer_group': {'topic': [0, 1]}}
            for _, tps in consumer_groups.iteritems():
                for topic, partitions in tps.iteritems():
                    topics[topic].update(partitions)

        warn_msg = """ Discovered %s partition contexts - this exceeds the maximum
                       number of contexts permitted by the check. Please narrow your
                       target by specifying in your YAML what consumer groups, topics
                       and partitions you wish to monitor."""
        if zk_consumer_offsets and len(
                zk_consumer_offsets) > self.context_limit:
            self.warning(warn_msg % len(zk_consumer_offsets))
            return
        if kafka_consumer_offsets and len(
                kafka_consumer_offsets) > self.context_limit:
            self.warning(warn_msg % len(kafka_consumer_offsets))
            return

        # Fetch the broker highwater offsets
        try:
            highwater_offsets, topic_partitions_without_a_leader = self._get_broker_offsets(
                instance, topics)
        except Exception:
            self.log.exception(
                'There was a problem collecting the high watermark offsets')
            return

        # Report the broker highwater offset
        for (topic,
             partition), highwater_offset in highwater_offsets.iteritems():
            broker_tags = ['topic:%s' % topic, 'partition:%s' % partition]
            self.gauge('kafka.broker_offset',
                       highwater_offset,
                       tags=broker_tags)

        # Report the consumer group offsets and consumer lag
        if zk_consumer_offsets:
            self._report_consumer_metrics(highwater_offsets,
                                          zk_consumer_offsets,
                                          topic_partitions_without_a_leader,
                                          ['source:zk'])
        if kafka_consumer_offsets:
            self._report_consumer_metrics(highwater_offsets,
                                          kafka_consumer_offsets,
                                          topic_partitions_without_a_leader,
                                          ['source:kafka'])
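Consumer lag is the gap between the broker highwater offset and the consumer's committed offset for the same topic/partition; a toy illustration of that arithmetic, and of why reading the consumer offsets first keeps it non-negative (the data and key layout are made up for the example):

    highwater_offsets = {('orders', 0): 120, ('orders', 1): 80}
    consumer_offsets = {('group1', 'orders', 0): 100, ('group1', 'orders', 1): 80}

    for (group, topic, partition), consumer_offset in consumer_offsets.items():
        lag = highwater_offsets[(topic, partition)] - consumer_offset
        # Reading the broker offset first could yield a negative value here,
        # which is the confusion the opening comment warns about.
        print("%s %s[%d] lag=%d" % (group, topic, partition, lag))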
Example #43
0
    def check(self, instance):
        """Run the Docker check for one instance."""
        if not self.init_success:
            # Initialization can fail if cgroups are not ready or docker daemon is down. So we retry if needed
            # https://github.com/DataDog/dd-agent/issues/1896
            self.init()

            if self.docker_client is None:
                message = "Unable to connect to Docker daemon"
                self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                                   message=message)
                return

            if not self.init_success:
                # Initialization failed, will try later
                return

        try:
            # Report image metrics
            if self.collect_image_stats:
                self._count_and_weigh_images()

            if Platform.is_k8s():
                self.kube_pod_tags = {}
                if self.kubeutil:
                    try:
                        self.kube_pod_tags = self.kubeutil.get_kube_pod_tags()
                    except Exception as e:
                        self.log.warning('Could not retrieve kubernetes labels: %s' % str(e))

            # containers running with custom cgroups?
            custom_cgroups = _is_affirmative(instance.get('custom_cgroups', False))

            # Get the list of containers and the index of their names
            health_service_checks = True if self.whitelist_patterns else False
            containers_by_id = self._get_and_count_containers(custom_cgroups, health_service_checks)
            containers_by_id = self._crawl_container_pids(containers_by_id, custom_cgroups)

            # Send events from Docker API
            if self.collect_events or self._service_discovery or not self._disable_net_metrics or self.collect_exit_codes:
                self._process_events(containers_by_id)

            # Report performance container metrics (cpu, mem, net, io)
            self._report_performance_metrics(containers_by_id)

            if self.collect_container_size:
                self._report_container_size(containers_by_id)

            if self.collect_container_count:
                self._report_container_count(containers_by_id)

            if self.collect_volume_count:
                self._report_volume_count()

            # Collect disk stats from Docker info command
            if self.collect_disk_stats:
                self._report_disk_stats()

            if health_service_checks:
                self._send_container_healthcheck_sc(containers_by_id)
        except:
            self.log.exception("Docker_daemon check failed")
            self.warning("Check failed. Will retry at next iteration")

        if self.capped_metrics:
            self.filter_capped_metrics()
Example #44
0
    def check(self, instance):
        name = instance.get('name', None)
        tags = instance.get('tags', [])
        exact_match = _is_affirmative(instance.get('exact_match', True))
        search_string = instance.get('search_string', None)
        ignore_ad = _is_affirmative(instance.get('ignore_denied_access', True))
        pid = instance.get('pid')

        if not isinstance(search_string, list) and pid is None:
            raise KeyError('"search_string" parameter should be a list')

        # FIXME 6.x remove me
        if pid is None:
            if "All" in search_string:
                self.warning(
                    'Deprecated: Having "All" in your search_string will '
                    'greatly reduce the performance of the check and '
                    'will be removed in a future version of the agent.')

        if name is None:
            raise KeyError('The "name" of process groups is mandatory')

        if search_string is not None:
            pids = self.find_pids(name,
                                  search_string,
                                  exact_match,
                                  ignore_ad=ignore_ad)
        elif pid is not None:
            pids = [psutil.Process(pid)]
        else:
            raise ValueError(
                'The "search_string" or "pid" options are required for process identification'
            )

        proc_state = self.get_process_state(name, pids)

        # FIXME 6.x remove the `name` tag
        tags.extend(['process_name:%s' % name, name])

        self.log.debug('ProcessCheck: process %s analysed', name)
        self.gauge('system.processes.number', len(pids), tags=tags)

        for attr, mname in ATTR_TO_METRIC.iteritems():
            vals = [x for x in proc_state[attr] if x is not None]
            # skip []
            if vals:
                if attr == 'run_time':
                    self.gauge('system.processes.%s.avg' % mname,
                               sum(vals) / len(vals),
                               tags=tags)
                    self.gauge('system.processes.%s.max' % mname,
                               max(vals),
                               tags=tags)
                    self.gauge('system.processes.%s.min' % mname,
                               min(vals),
                               tags=tags)

                # FIXME 6.x: change this prefix?
                else:
                    self.gauge('system.processes.%s' % mname,
                               sum(vals),
                               tags=tags)

        for attr, mname in ATTR_TO_METRIC_RATE.iteritems():
            vals = [x for x in proc_state[attr] if x is not None]
            if vals:
                self.rate('system.processes.%s' % mname, sum(vals), tags=tags)

        self._process_service_check(name, len(pids),
                                    instance.get('thresholds', None))
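_process_service_check is not shown; presumably it turns the process count and the optional thresholds block into a service check status. A plausible sketch under the assumption that thresholds carry [min, max] ranges for warning and critical (the actual mapping may differ):

    def process_status(nb_procs, thresholds=None):
        # Hypothetical mapping: outside the critical range -> CRITICAL,
        # outside the warning range -> WARNING, otherwise OK.
        if not thresholds:
            return 'OK' if nb_procs > 0 else 'CRITICAL'
        crit_low, crit_high = thresholds.get('critical', [1, float('inf')])
        warn_low, warn_high = thresholds.get('warning', [crit_low, crit_high])
        if not crit_low <= nb_procs <= crit_high:
            return 'CRITICAL'
        if not warn_low <= nb_procs <= warn_high:
            return 'WARNING'
        return 'OK'

    print(process_status(0, {'critical': [1, 5], 'warning': [2, 4]}))  # CRITICAL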
Example #45
0
    def check(self, instance):
        # Get properties from conf file
        rm_address = instance.get('resourcemanager_uri')
        if rm_address is None:
            raise Exception(
                'The ResourceManager URL must be specified in the instance configuration'
            )

        collect_task_metrics = _is_affirmative(
            instance.get('collect_task_metrics', False))

        ssl_verify = _is_affirmative(instance.get('ssl_verify', True))

        # Get additional tags from the conf file
        custom_tags = instance.get('tags') or [
        ]  # this handles the case when the YAML `tags` key has an empty value
        tags = list(set(custom_tags)) if custom_tags else []

        # Get the cluster name from the conf file
        cluster_name = instance.get('cluster_name')
        if cluster_name is None:
            self.warning(
                "The cluster_name must be specified in the instance configuration, defaulting to '%s'"
                % (DEFAULT_CUSTER_NAME))
            cluster_name = DEFAULT_CUSTER_NAME

        tags.append('cluster_name:%s' % cluster_name)

        # Get the running MR applications from YARN
        running_apps = self._get_running_app_ids(rm_address,
                                                 ssl_verify=ssl_verify)

        # Report success after gathering all metrics from ResourceManager
        self.service_check(
            YARN_SERVICE_CHECK,
            AgentCheck.OK,
            tags=['url:%s' % rm_address] + custom_tags,
            message='Connection to ResourceManager "%s" was successful' %
            rm_address)

        # Get the applications from the application master
        running_jobs = self._mapreduce_job_metrics(running_apps,
                                                   tags,
                                                   ssl_verify=ssl_verify)

        # Get job counter metrics
        self._mapreduce_job_counters_metrics(running_jobs,
                                             tags,
                                             ssl_verify=ssl_verify)

        # Get task metrics
        if collect_task_metrics:
            self._mapreduce_task_metrics(running_jobs,
                                         tags,
                                         ssl_verify=ssl_verify)

        # Report success after gathering all metrics from Application Master
        if running_jobs:
            job_id, metrics = running_jobs.items()[0]
            am_address = self._get_url_base(metrics['tracking_url'])

            self.service_check(
                MAPREDUCE_SERVICE_CHECK,
                AgentCheck.OK,
                tags=['url:%s' % am_address] + custom_tags,
                message='Connection to ApplicationManager "%s" was successful'
                % am_address)
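_get_url_base is not shown either; given how it is used, it presumably strips the tracking_url down to its scheme and host so the service check tag points at the Application Master. A sketch under that assumption:

    try:
        from urllib.parse import urlsplit   # Python 3
    except ImportError:
        from urlparse import urlsplit        # Python 2

    def get_url_base(url):
        # Keep only scheme://netloc, dropping path and query.
        parts = urlsplit(url)
        return '%s://%s' % (parts.scheme, parts.netloc)

    print(get_url_base('http://rm-host:8088/proxy/application_1/ws/v1/mapreduce'))
    # http://rm-host:8088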
Example #46
0
    def run(self, config=None):
        """Main loop of the collector"""

        # Gracefully exit on sigterm.
        signal.signal(signal.SIGTERM, self._handle_sigterm)

        if not Platform.is_windows():
            # A SIGUSR1 signals an exit with an autorestart
            signal.signal(signal.SIGUSR1, self._handle_sigusr1)

            # Handle Keyboard Interrupt
            signal.signal(signal.SIGINT, self._handle_sigterm)

            # A SIGHUP signals a configuration reload
            signal.signal(signal.SIGHUP, self._handle_sighup)
        else:
            sdk_integrations = get_sdk_integration_paths()
            for name, path in sdk_integrations.iteritems():
                lib_path = os.path.join(path, 'lib')
                if os.path.exists(lib_path):
                    sys.path.append(lib_path)

        # Save the agent start-up stats.
        CollectorStatus().persist()

        # Initialize the collector.
        if not config:
            config = get_config(parse_args=True)

        self._agentConfig = self._set_agent_config_hostname(config)
        hostname = get_hostname(self._agentConfig)
        systemStats = get_system_stats(proc_path=self._agentConfig.get(
            'procfs_path', '/proc').rstrip('/'))
        emitters = self._get_emitters()

        # Initialize service discovery
        if self._agentConfig.get('service_discovery'):
            self.sd_backend = get_sd_backend(self._agentConfig)

        if self.sd_backend and _is_affirmative(
                self._agentConfig.get('sd_jmx_enable', False)):
            pipe_path = get_jmx_pipe_path()
            if Platform.is_windows():
                pipe_name = pipe_path.format(pipename=SD_PIPE_NAME)
            else:
                pipe_name = os.path.join(pipe_path, SD_PIPE_NAME)

            if os.access(pipe_path, os.W_OK):
                if not os.path.exists(pipe_name):
                    os.mkfifo(pipe_name)
                self.sd_pipe = os.open(
                    pipe_name, os.O_RDWR)  # RW to avoid blocking (will only W)

                # Initialize Supervisor proxy
                self.supervisor_proxy = self._get_supervisor_socket(
                    self._agentConfig)
            else:
                log.debug(
                    'Unable to create pipe in temporary directory. JMX service discovery disabled.'
                )

        # Load the checks.d checks
        self._checksd = load_check_directory(self._agentConfig, hostname)

        # Load JMX configs if available
        if self._jmx_service_discovery_enabled:
            self.sd_pipe_jmx_configs(hostname)

        # Initialize the Collector
        self.collector = Collector(self._agentConfig, emitters, systemStats,
                                   hostname)

        # In developer mode, the number of runs to be included in a single collector profile
        try:
            self.collector_profile_interval = int(
                self._agentConfig.get('collector_profile_interval',
                                      DEFAULT_COLLECTOR_PROFILE_INTERVAL))
        except ValueError:
            log.warn('collector_profile_interval is invalid. '
                     'Using default value instead (%s).' %
                     DEFAULT_COLLECTOR_PROFILE_INTERVAL)
            self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

        # Configure the watchdog.
        self.check_frequency = int(self._agentConfig['check_freq'])
        watchdog = self._get_watchdog(self.check_frequency)

        # Initialize the auto-restarter
        self.restart_interval = int(
            self._agentConfig.get('restart_interval', RESTART_INTERVAL))
        self.agent_start = time.time()

        self.allow_profiling = self._agentConfig.get('allow_profiling', True)

        profiled = False
        collector_profiled_runs = 0

        # Run the main loop.
        while self.run_forever:
            # Setup profiling if necessary
            if self.allow_profiling and self.in_developer_mode and not profiled:
                try:
                    profiler = AgentProfiler()
                    profiler.enable_profiling()
                    profiled = True
                except Exception as e:
                    log.warn("Cannot enable profiler: %s" % str(e))

            if self.reload_configs_flag:
                if isinstance(self.reload_configs_flag, set):
                    self.reload_configs(
                        checks_to_reload=self.reload_configs_flag)
                else:
                    self.reload_configs()

            # JMXFetch restarts should prompt re-piping *all* JMX configs
            if self._jmx_service_discovery_enabled and \
                    (not self.reload_configs_flag or isinstance(self.reload_configs_flag, set)):
                try:
                    jmx_launch = JMXFetch._get_jmx_launchtime()
                    if self.last_jmx_piped and self.last_jmx_piped < jmx_launch:
                        self.sd_pipe_jmx_configs(hostname)
                except Exception as e:
                    log.debug("could not stat JMX lunch file: %s", e)

            # Do the work. Pass `configs_reloaded` to let the collector know if it needs to
            # look for the AgentMetrics check and pop it out.
            _, continue_immediately = self.collector.run(
                checksd=self._checksd,
                start_event=self.start_event,
                configs_reloaded=True if self.reload_configs_flag else False)

            self.reload_configs_flag = False

            # Look for change in the config template store.
            # The self.sd_backend.reload_check_configs flag is set
            # to True if a config reload is needed.
            if self._agentConfig.get('service_discovery') and self.sd_backend and \
               not self.sd_backend.reload_check_configs:
                try:
                    self.sd_backend.reload_check_configs = get_config_store(
                        self._agentConfig).crawl_config_template()
                except Exception as e:
                    log.warn(
                        'Something went wrong while looking for config template changes: %s'
                        % str(e))

            # Check if we should run service discovery
            # The `reload_check_configs` flag can be set through the docker_daemon check or
            # using ConfigStore.crawl_config_template
            if self._agentConfig.get('service_discovery') and self.sd_backend and \
               self.sd_backend.reload_check_configs:
                self.reload_configs_flag = self.sd_backend.reload_check_configs
                self.sd_backend.reload_check_configs = False

            if profiled:
                if collector_profiled_runs >= self.collector_profile_interval:
                    try:
                        profiler.disable_profiling()
                        profiled = False
                        collector_profiled_runs = 0
                    except Exception as e:
                        log.warn("Cannot disable profiler: %s" % str(e))

            # Check if we should restart.
            if self.autorestart and self._should_restart():
                self._do_restart()

            # Only plan for next loop if we will continue, otherwise exit quickly.
            if self.run_forever:
                if watchdog:
                    watchdog.reset()
                if profiled:
                    collector_profiled_runs += 1
                if not continue_immediately:
                    log.debug("Sleeping for {0} seconds".format(
                        self.check_frequency))
                    time.sleep(self.check_frequency)
                else:
                    log.debug("Continuing immediately")

        # Now clean-up.
        try:
            CollectorStatus.remove_latest_status()
        except Exception:
            pass

        # Explicitly kill the process, because it might be running as a daemon.
        log.info("Exiting. Bye bye.")
        sys.exit(0)
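The JMX pipe above is opened with os.O_RDWR rather than write-only because opening a FIFO O_WRONLY blocks until a reader attaches; read-write lets the collector keep going even if JMXFetch is not listening yet. A small POSIX-only demonstration of that pattern (the path and payload are illustrative):

    import os
    import tempfile

    pipe_name = os.path.join(tempfile.gettempdir(), 'example_jmx_pipe')
    if not os.path.exists(pipe_name):
        os.mkfifo(pipe_name)

    # O_WRONLY would block here until a reader opened the other end;
    # O_RDWR returns immediately, so the writer never stalls.
    fd = os.open(pipe_name, os.O_RDWR)
    os.write(fd, b'some jmx config payload\n')
    os.close(fd)
    os.remove(pipe_name)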
Example #47
0
    def _check_connectivity_to_master(self, instance):
        url = instance.get('gitlab_url')
        if url is None:
            # Simply ignore this service check if not configured
            return

        parsed_url = urlparse.urlparse(url)
        gitlab_host = parsed_url.hostname
        gitlab_port = parsed_url.port or 80
        service_check_tags = [
            'gitlab_host:%s' % gitlab_host,
            'gitlab_port:%s' % gitlab_port
        ]

        ## Load the ssl configuration
        ssl_params = {
            'ssl_cert_validation':
            _is_affirmative(instance.get('ssl_cert_validation', True)),
            'ssl_ca_certs':
            instance.get('ssl_ca_certs'),
        }

        for key, param in ssl_params.items():
            if param is None:
                del ssl_params[key]

        verify_ssl = ssl_params.get(
            'ssl_ca_certs',
            True) if ssl_params['ssl_cert_validation'] else False

        ## Timeout settings
        timeouts = (int(
            instance.get('connect_timeout',
                         GitlabRunnerCheck.DEFAULT_CONNECT_TIMEOUT)),
                    int(
                        instance.get(
                            'receive_timeout',
                            GitlabRunnerCheck.DEFAULT_RECEIVE_TIMEOUT)))

        ## Auth settings
        auth = None
        if 'gitlab_user' in instance and 'gitlab_password' in instance:
            auth = (instance['gitlab_user'], instance['gitlab_password'])

        try:
            self.log.debug('checking connectivity against %s' % url)
            r = requests.get(url,
                             auth=auth,
                             verify=verify_ssl,
                             timeout=timeouts,
                             headers=headers(self.agentConfig))
            if r.status_code != 200:
                self.service_check(self.MASTER_SERVICE_CHECK_NAME,
                                   PrometheusCheck.CRITICAL,
                                   message="Got %s when hitting %s" %
                                   (r.status_code, url),
                                   tags=service_check_tags)
                raise Exception("Http status code {0} on url {1}".format(
                    r.status_code, url))
            else:
                r.raise_for_status()

        except requests.exceptions.Timeout:
            # If there's a timeout
            self.service_check(self.MASTER_SERVICE_CHECK_NAME,
                               PrometheusCheck.CRITICAL,
                               message="Timeout when hitting %s" % url,
                               tags=service_check_tags)
            raise
        except Exception as e:
            self.service_check(self.MASTER_SERVICE_CHECK_NAME,
                               PrometheusCheck.CRITICAL,
                               message="Error hitting %s. Error: %s" %
                               (url, e.message),
                               tags=service_check_tags)
            raise
        else:
            self.service_check(self.MASTER_SERVICE_CHECK_NAME,
                               PrometheusCheck.OK,
                               tags=service_check_tags)
        self.log.debug("gitlab check succeeded")
Example #48
0
    def check(self, instance):
        """
        Returns a dictionary that looks a lot like what's sent back by
        db.serverStatus()
        """
        def total_seconds(td):
            """
            Returns total seconds of a timedelta in a way that's safe for
            Python < 2.7
            """
            if hasattr(td, 'total_seconds'):
                return td.total_seconds()
            else:
                return (td.microseconds +
                        (td.seconds + td.days * 24 * 3600) * 10**6) / 10.0**6

        if 'server' not in instance:
            raise Exception("Missing 'server' in mongo config")

        # x.509 authentication
        ssl_params = {
            'ssl': instance.get('ssl', None),
            'ssl_keyfile': instance.get('ssl_keyfile', None),
            'ssl_certfile': instance.get('ssl_certfile', None),
            'ssl_cert_reqs': instance.get('ssl_cert_reqs', None),
            'ssl_ca_certs': instance.get('ssl_ca_certs', None)
        }

        for key, param in ssl_params.items():
            if param is None:
                del ssl_params[key]

        server = instance['server']
        username, password, db_name, nodelist, clean_server_name, auth_source = self._parse_uri(
            server, sanitize_username=bool(ssl_params))
        additional_metrics = instance.get('additional_metrics', [])

        # Get the list of metrics to collect
        collect_tcmalloc_metrics = 'tcmalloc' in additional_metrics
        metrics_to_collect = self._get_metrics_to_collect(
            server, additional_metrics)

        # Tagging
        tags = instance.get('tags', [])
        # ...de-dupe tags to avoid a memory leak
        tags = list(set(tags))

        if not db_name:
            self.log.info(
                'No MongoDB database found in URI. Defaulting to admin.')
            db_name = 'admin'

        dbstats_tags = _is_affirmative(instance.get('dbstats_tags', True))
        db_name_tag = db_name if dbstats_tags else hashlib.md5(
            db_name.encode()).hexdigest()
        service_check_tags = ["db:%s" % db_name_tag]
        service_check_tags.extend(tags)

        # ...add the `server` tag to the metrics' tags only
        # (it's added in the backend for service checks)
        tags.append('server:%s' % clean_server_name)

        if nodelist:
            host = nodelist[0][0]
            port = nodelist[0][1]
            service_check_tags = service_check_tags + [
                "host:%s" % host, "port:%s" % port
            ]

        timeout = float(instance.get('timeout', DEFAULT_TIMEOUT)) * 1000
        try:
            cli = pymongo.mongo_client.MongoClient(
                server,
                socketTimeoutMS=timeout,
                connectTimeoutMS=timeout,
                serverSelectionTimeoutMS=timeout,
                read_preference=pymongo.ReadPreference.PRIMARY_PREFERRED,
                **ssl_params)
            # some commands can only go against the admin DB
            admindb = cli['admin']
            db = cli[db_name]
        except Exception:
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.CRITICAL,
                               tags=service_check_tags)
            raise

        # Authenticate
        do_auth = True
        use_x509 = ssl_params and not password

        if not username:
            self.log.debug(u"A username is required to authenticate to `%s`",
                           server)
            do_auth = False

        if do_auth:
            if auth_source:
                self.log.info(
                    "authSource was specified in the the server URL: using '%s' as the authentication database",
                    auth_source)
                self._authenticate(cli[auth_source], username, password,
                                   use_x509, clean_server_name,
                                   service_check_tags)
            else:
                self._authenticate(db, username, password, use_x509,
                                   clean_server_name, service_check_tags)

        try:
            status = db.command('serverStatus',
                                tcmalloc=collect_tcmalloc_metrics)
        except Exception:
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.CRITICAL,
                               tags=service_check_tags)
            raise
        else:
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.OK,
                               tags=service_check_tags)

        if status['ok'] == 0:
            raise Exception(status['errmsg'].__str__())

        ops = db.current_op()
        status['fsyncLocked'] = 1 if ops.get('fsyncLock') else 0

        status['stats'] = db.command('dbstats')
        dbstats = {}
        dbstats[db_name] = {'stats': status['stats']}

        # Handle replica data, if any
        # See
        # http://www.mongodb.org/display/DOCS/Replica+Set+Commands#ReplicaSetCommands-replSetGetStatus  # noqa
        if _is_affirmative(instance.get('replica_check', True)):
            try:
                data = {}
                dbnames = []

                replSet = admindb.command('replSetGetStatus')
                if replSet:
                    primary = None
                    current = None

                    # need a new connection to deal with replica sets
                    setname = replSet.get('set')
                    cli_rs = pymongo.mongo_client.MongoClient(
                        server,
                        socketTimeoutMS=timeout,
                        connectTimeoutMS=timeout,
                        serverSelectionTimeoutMS=timeout,
                        replicaset=setname,
                        read_preference=pymongo.ReadPreference.NEAREST,
                        **ssl_params)

                    if do_auth:
                        if auth_source:
                            self._authenticate(cli_rs[auth_source], username,
                                               password, use_x509, server,
                                               service_check_tags)
                        else:
                            self._authenticate(cli_rs[db_name], username,
                                               password, use_x509, server,
                                               service_check_tags)

                    # Replication set information
                    replset_name = replSet['set']
                    replset_state = self.get_state_name(
                        replSet['myState']).lower()

                    tags.extend([
                        u"replset_name:{0}".format(replset_name),
                        u"replset_state:{0}".format(replset_state),
                    ])

                    # Find nodes: master and current node (ourself)
                    for member in replSet.get('members'):
                        if member.get('self'):
                            current = member
                        if int(member.get('state')) == 1:
                            primary = member

                    # Compute a lag time
                    if current is not None and primary is not None:
                        if 'optimeDate' in primary and 'optimeDate' in current:
                            lag = primary['optimeDate'] - current['optimeDate']
                            data['replicationLag'] = total_seconds(lag)

                    if current is not None:
                        data['health'] = current['health']

                    data['state'] = replSet['myState']

                    if current is not None:
                        total = 0.0
                        cfg = cli_rs['local']['system.replset'].find_one()
                        for member in cfg.get('members'):
                            total += member.get('votes', 1)
                            if member['_id'] == current['_id']:
                                data['votes'] = member.get('votes', 1)
                        data['voteFraction'] = data['votes'] / total

                    status['replSet'] = data

                    # Submit events
                    self._report_replica_set_state(data['state'],
                                                   clean_server_name,
                                                   replset_name,
                                                   self.agentConfig)

            except Exception as e:
                if "OperationFailure" in repr(e) and (
                        "not running with --replSet" in str(e)
                        or "replSetGetStatus" in str(e)):
                    pass
                else:
                    raise e

        # If these keys exist, remove them for now as they cannot be serialized
        try:
            status['backgroundFlushing'].pop('last_finished')
        except KeyError:
            pass
        try:
            status.pop('localTime')
        except KeyError:
            pass

        dbnames = cli.database_names()
        self.gauge('mongodb.dbs', len(dbnames), tags=tags)

        for db_n in dbnames:
            db_aux = cli[db_n]
            dbstats[db_n] = {'stats': db_aux.command('dbstats')}

        # Go through the metrics and save the values
        for metric_name in metrics_to_collect:
            # each metric is of the form: x.y.z with z optional
            # and can be found at status[x][y][z]
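            # e.g. "connections.current" is read as status['connections']['current']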
            value = status

            if metric_name.startswith('stats'):
                continue
            else:
                try:
                    for c in metric_name.split("."):
                        value = value[c]
                except KeyError:
                    continue

            # value is now status[x][y][z]
            if not isinstance(value, (int, long, float)):
                raise TypeError(
                    u"{0} value is a {1}, it should be an int, a float or a long instead."
                    .format(metric_name, type(value)))

            # Submit the metric
            submit_method, metric_name_alias = self._resolve_metric(
                metric_name, metrics_to_collect)
            submit_method(self, metric_name_alias, value, tags=tags)

        for st, value in dbstats.iteritems():
            for metric_name in metrics_to_collect:
                if not metric_name.startswith('stats.'):
                    continue

                try:
                    val = value['stats'][metric_name.split('.')[1]]
                except KeyError:
                    continue

                # value is now status[x][y][z]
                if not isinstance(val, (int, long, float)):
                    raise TypeError(
                        u"{0} value is a {1}, it should be an int, a float or a long instead."
                        .format(metric_name, type(val)))

                # Submit the metric
                st_tag = st if dbstats_tags else hashlib.md5(
                    st.encode()).hexdigest()
                metrics_tags = (
                    tags + [
                        u"cluster:db:{0}".format(
                            st_tag
                        ),  # FIXME 6.0 - keep for backward compatibility
                        u"db:{0}".format(st_tag),
                    ])

                submit_method, metric_name_alias = \
                    self._resolve_metric(metric_name, metrics_to_collect)
                submit_method(self, metric_name_alias, val, tags=metrics_tags)

        if _is_affirmative(instance.get('collections_indexes_stats')):
            mongo_version = cli.server_info().get('version', '0.0')
            if LooseVersion(mongo_version) >= LooseVersion("3.2"):
                self._collect_indexes_stats(instance, db, tags)
            else:
                self.log.error(
                    "'collections_indexes_stats' is only available starting from mongo 3.2: your mongo version is %s",
                    mongo_version)

        # Report the usage metrics for dbs/collections
        if 'top' in additional_metrics:
            try:
                dbtop = db.command('top')
                for ns, ns_metrics in dbtop['totals'].iteritems():
                    if "." not in ns:
                        continue

                    # configure tags for db name and collection name
                    dbname, collname = ns.split(".", 1)
                    dbname = dbname if dbstats_tags else hashlib.md5(
                        dbname.encode()).hexdigest()
                    ns_tags = tags + [
                        "db:%s" % dbname,
                        "collection:%s" % collname
                    ]

                    # iterate over DBTOP metrics
                    for m in self.TOP_METRICS:
                        # each metric is of the form: x.y.z with z optional
                        # and can be found at ns_metrics[x][y][z]
                        value = ns_metrics
                        try:
                            for c in m.split("."):
                                value = value[c]
                        except Exception:
                            continue

                        # value is now status[x][y][z]
                        if not isinstance(value, (int, long, float)):
                            raise TypeError(
                                u"{0} value is a {1}, it should be an int, a float or a long instead."
                                .format(m, type(value)))

                        # Submit the metric
                        submit_method, metric_name_alias = \
                            self._resolve_metric(m, metrics_to_collect, prefix="usage")
                        submit_method(self,
                                      metric_name_alias,
                                      value,
                                      tags=ns_tags)
                        # Keep old incorrect metric
                        if metric_name_alias.endswith('countps'):
                            GAUGE(self,
                                  metric_name_alias[:-2],
                                  value,
                                  tags=ns_tags)
            except Exception as e:
                self.log.warning('Failed to record `top` metrics %s' % str(e))

        if 'local' in dbnames:  # it might not be if we are connecting through mongos
            # Fetch information analogous to Mongo's db.getReplicationInfo()
            localdb = cli['local']

            oplog_data = {}

            for ol_collection_name in ("oplog.rs", "oplog.$main"):
                ol_options = localdb[ol_collection_name].options()
                if ol_options:
                    break

            if ol_options:
                try:
                    oplog_data['logSizeMB'] = round(
                        ol_options['size'] / 2.0**20, 2)

                    oplog = localdb[ol_collection_name]

                    oplog_data['usedSizeMB'] = round(
                        localdb.command("collstats",
                                        ol_collection_name)['size'] / 2.0**20,
                        2)

                    op_asc_cursor = oplog.find({
                        "ts": {
                            "$exists": 1
                        }
                    }).sort("$natural", pymongo.ASCENDING).limit(1)
                    op_dsc_cursor = oplog.find({
                        "ts": {
                            "$exists": 1
                        }
                    }).sort("$natural", pymongo.DESCENDING).limit(1)

                    try:
                        first_timestamp = op_asc_cursor[0]['ts'].as_datetime()
                        last_timestamp = op_dsc_cursor[0]['ts'].as_datetime()
                        oplog_data['timeDiff'] = total_seconds(last_timestamp -
                                                               first_timestamp)
                    except (IndexError, KeyError):
                        # if the oplog collection doesn't have any entries
                        # if an object in the collection doesn't have a ts value, we ignore it
                        pass
                except KeyError:
                    # encountered an error trying to access options.size for the oplog collection
                    self.log.warning(
                        u"Failed to record `ReplicationInfo` metrics.")

            for (m, value) in oplog_data.iteritems():
                submit_method, metric_name_alias = \
                    self._resolve_metric('oplog.%s' % m, metrics_to_collect)
                submit_method(self, metric_name_alias, value, tags=tags)

        else:
            self.log.debug(
                '"local" database not in dbnames. Not collecting ReplicationInfo metrics'
            )

        # get collection level stats
        try:
            # Ensure that you're on the right db
            db = cli[db_name]
            # grab the collections from the configuration
            coll_names = instance.get('collections', [])
            # loop through the collections
            for coll_name in coll_names:
                # grab the stats from the collection
                stats = db.command("collstats", coll_name)
                # loop through the metrics
                db_name_tag = db_name if dbstats_tags else hashlib.md5(
                    db_name.encode()).hexdigest()
                for m in self.collection_metrics_names:
                    coll_tags = tags + [
                        "db:%s" % db_name_tag,
                        "collection:%s" % coll_name
                    ]
                    value = stats.get(m, None)
                    if not value:
                        continue

                    # if it's the index sizes, then it's a dict.
                    if m == 'indexSizes':
                        submit_method, metric_name_alias = \
                            self._resolve_metric('collection.%s' % m, self.COLLECTION_METRICS)
                        # loop through the indexes
                        for (idx, val) in value.iteritems():
                            # we tag the index
                            idx_tags = coll_tags + ["index:%s" % idx]
                            submit_method(self,
                                          metric_name_alias,
                                          val,
                                          tags=idx_tags)
                    else:
                        submit_method, metric_name_alias = \
                            self._resolve_metric('collection.%s' % m, self.COLLECTION_METRICS)
                        submit_method(self,
                                      metric_name_alias,
                                      value,
                                      tags=coll_tags)
        except Exception as e:
            self.log.warning(u"Failed to record `collection` metrics.")
            self.log.exception(e)
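For reference, a minimal sketch of what a metric-resolution helper like `_resolve_metric` could look like, assuming `metrics_to_collect` maps dotted metric names to submit functions and that rate metrics get a "ps" suffix; the helper name, the "mongodb" namespace and the aliasing scheme below are assumptions, not the check's actual implementation.

    # Hypothetical sketch only; not taken from the example above.
    def _resolve_metric_sketch(self, metric_name, metrics_to_collect, prefix=""):
        # Assumption: metrics_to_collect maps "x.y.z" names to submit functions
        # such as the module-level GAUGE/RATE helpers used elsewhere in the check.
        submit_method = metrics_to_collect[metric_name]
        # Assumption: aliases are namespaced under "mongodb." and rates end in "ps".
        parts = ["mongodb"]
        if prefix:
            parts.append(prefix)
        parts.append(metric_name)
        alias = ".".join(parts)
        if submit_method == RATE:
            alias += "ps"
        return submit_method, alias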
Example #49
0
    def check(self, instance):
        """
        Returns a dictionary that looks a lot like what's sent back by
        db.serverStatus()
        """
        if 'server' not in instance:
            raise Exception("Missing 'server' in mongo config")

        server = instance['server']

        ssl_params = {
            'ssl': instance.get('ssl', None),
            'ssl_keyfile': instance.get('ssl_keyfile', None),
            'ssl_certfile': instance.get('ssl_certfile', None),
            'ssl_cert_reqs': instance.get('ssl_cert_reqs', None),
            'ssl_ca_certs': instance.get('ssl_ca_certs', None)
        }

        for key, param in ssl_params.items():
            if param is None:
                del ssl_params[key]

        # The server is configured as a URL, e.g. mongodb://user:pass@server/db
        parsed = pymongo.uri_parser.parse_uri(server)
        username = parsed.get('username')
        password = parsed.get('password')
        db_name = parsed.get('database')
        clean_server_name = server.replace(
            password, "*" * 5) if password is not None else server

        tags = instance.get('tags', [])
        tags.append('server:%s' % clean_server_name)

        # Get the list of metrics to collect
        collect_tcmalloc_metrics = _is_affirmative(
            instance.get('collect_tcmalloc_metrics', False))
        metrics_to_collect = self._get_metrics_to_collect(
            server,
            collect_tcmalloc_metrics=collect_tcmalloc_metrics,
        )

        # de-dupe tags to avoid a memory leak
        tags = list(set(tags))

        if not db_name:
            self.log.info(
                'No MongoDB database found in URI. Defaulting to admin.')
            db_name = 'admin'

        service_check_tags = ["db:%s" % db_name]

        nodelist = parsed.get('nodelist')
        if nodelist:
            host = nodelist[0][0]
            port = nodelist[0][1]
            service_check_tags = service_check_tags + [
                "host:%s" % host, "port:%s" % port
            ]

        do_auth = True
        if username is None or password is None:
            self.log.debug(
                "Mongo: cannot extract username and password from config %s" %
                server)
            do_auth = False

        timeout = float(instance.get('timeout', DEFAULT_TIMEOUT)) * 1000
        try:
            cli = pymongo.mongo_client.MongoClient(
                server,
                socketTimeoutMS=timeout,
                read_preference=pymongo.ReadPreference.PRIMARY_PREFERRED,
                **ssl_params)
            # some commands can only go against the admin DB
            admindb = cli['admin']
            db = cli[db_name]
        except Exception:
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.CRITICAL,
                               tags=service_check_tags)
            raise

        if do_auth and not db.authenticate(username, password):
            message = "Mongo: cannot connect with config %s" % server
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.CRITICAL,
                               tags=service_check_tags,
                               message=message)
            raise Exception(message)

        self.service_check(self.SERVICE_CHECK_NAME,
                           AgentCheck.OK,
                           tags=service_check_tags)

        status = db["$cmd"].find_one({
            "serverStatus": 1,
            "tcmalloc": collect_tcmalloc_metrics
        })
        if status['ok'] == 0:
            raise Exception(status['errmsg'].__str__())

        status['stats'] = db.command('dbstats')
        dbstats = {}
        dbstats[db_name] = {'stats': status['stats']}

        # Handle replica data, if any
        # See
        # http://www.mongodb.org/display/DOCS/Replica+Set+Commands#ReplicaSetCommands-replSetGetStatus  # noqa
        try:
            data = {}
            dbnames = []

            replSet = admindb.command('replSetGetStatus')
            if replSet:
                primary = None
                current = None

                # need a new connection to deal with replica sets
                setname = replSet.get('set')
                cli = pymongo.mongo_client.MongoClient(
                    server,
                    socketTimeoutMS=timeout,
                    replicaset=setname,
                    read_preference=pymongo.ReadPreference.NEAREST,
                    **ssl_params)
                db = cli[db_name]

                if do_auth and not db.authenticate(username, password):
                    message = ("Mongo: cannot connect with config %s" % server)
                    self.service_check(self.SERVICE_CHECK_NAME,
                                       AgentCheck.CRITICAL,
                                       tags=service_check_tags,
                                       message=message)
                    raise Exception(message)

                # find nodes: master and current node (ourself)
                for member in replSet.get('members'):
                    if member.get('self'):
                        current = member
                    if int(member.get('state')) == 1:
                        primary = member

                # If we have both we can compute a lag time
                if current is not None and primary is not None:
                    lag = primary['optimeDate'] - current['optimeDate']
                    # Python 2.7 has this built in, earlier versions don't...
                    if hasattr(lag, 'total_seconds'):
                        data['replicationLag'] = lag.total_seconds()
                    else:
                        data['replicationLag'] = (
                            lag.microseconds +
                            (lag.seconds + lag.days * 24 * 3600) *
                            10**6) / 10.0**6

                if current is not None:
                    data['health'] = current['health']

                data['state'] = replSet['myState']
                self.check_last_state(data['state'], clean_server_name,
                                      self.agentConfig)
                status['replSet'] = data

        except Exception as e:
            if "OperationFailure" in repr(e) and "replSetGetStatus" in str(e):
                pass
            else:
                raise e

        # If these keys exist, remove them for now as they cannot be serialized
        try:
            status['backgroundFlushing'].pop('last_finished')
        except KeyError:
            pass
        try:
            status.pop('localTime')
        except KeyError:
            pass

        dbnames = cli.database_names()
        for db_n in dbnames:
            db_aux = cli[db_n]
            dbstats[db_n] = {'stats': db_aux.command('dbstats')}

        # Go through the metrics and save the values
        for metric_name, submit_method in metrics_to_collect.iteritems():
            # each metric is of the form: x.y.z with z optional
            # and can be found at status[x][y][z]
            value = status

            if metric_name.startswith('stats'):
                continue
            else:
                try:
                    for c in metric_name.split("."):
                        value = value[c]
                except KeyError:
                    continue

            # value is now status[x][y][z]
            if not isinstance(value, (int, long, float)):
                raise TypeError(
                    u"{0} value is a {1}, it should be an int, a float or a long instead."
                    .format(metric_name, type(value)))

            # Submit the metric
            metric_name = self._normalize(metric_name, submit_method)
            submit_method(self, metric_name, value, tags=tags)

        for st, value in dbstats.iteritems():
            for metric_name, submit_method in metrics_to_collect.iteritems():
                if not metric_name.startswith('stats.'):
                    continue

                try:
                    val = value['stats'][metric_name.split('.')[1]]
                except KeyError:
                    continue

                # value is now status[x][y][z]
                if not isinstance(val, (int, long, float)):
                    raise TypeError(
                        u"{0} value is a {1}, it should be an int, a float or a long instead."
                        .format(metric_name, type(val)))

                # Submit the metric
                metric_name = self._normalize(metric_name, submit_method)
                metrics_tags = tags + ['cluster:db:%s' % st]
                submit_method(self, metric_name, val, tags=metrics_tags)
Example #50
0
    def _load_conf(self, instance):
        # Fetches the conf
        method = instance.get('method', 'get')
        data = instance.get('data', {})
        tags = instance.get('tags', [])
        username = instance.get('username')
        password = instance.get('password')
        client_cert = instance.get('client_cert')
        client_key = instance.get('client_key')
        http_response_status_code = str(
            instance.get('http_response_status_code', DEFAULT_EXPECTED_CODE))
        timeout = int(instance.get('timeout', 10))
        config_headers = instance.get('headers', {})
        default_headers = _is_affirmative(
            instance.get("include_default_headers", True))
        if default_headers:
            headers = agent_headers(self.agentConfig)
        else:
            headers = {}
        headers.update(config_headers)
        url = instance.get('url')
        content_match = instance.get('content_match')
        reverse_content_match = _is_affirmative(
            instance.get('reverse_content_match', False))
        response_time = _is_affirmative(
            instance.get('collect_response_time', True))
        if not url:
            raise Exception("Bad configuration. You must specify a url")
        include_content = _is_affirmative(
            instance.get('include_content', False))
        disable_ssl_validation = _is_affirmative(
            instance.get('disable_ssl_validation', True))
        ssl_expire = _is_affirmative(
            instance.get('check_certificate_expiration', True))
        instance_ca_certs = instance.get('ca_certs', self.ca_certs)
        weakcipher = _is_affirmative(instance.get('weakciphers', False))
        ignore_ssl_warning = _is_affirmative(
            instance.get('ignore_ssl_warning', False))
        check_hostname = _is_affirmative(instance.get('check_hostname', True))
        skip_proxy = _is_affirmative(
            instance.get('skip_proxy', instance.get('no_proxy', False)))
        allow_redirects = _is_affirmative(instance.get('allow_redirects',
                                                       True))

        return url, username, password, client_cert, client_key, method, data, http_response_status_code, timeout, include_content,\
            headers, response_time, content_match, reverse_content_match, tags, disable_ssl_validation, ssl_expire, instance_ca_certs,\
            weakcipher, check_hostname, ignore_ssl_warning, skip_proxy, allow_redirects
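Illustrative only: a caller could unpack the tuple returned by `_load_conf` above in the order of its return statement, for example (the surrounding check class is assumed):

    def check(self, instance):
        # Unpack the configuration tuple in the same order as the return
        # statement of _load_conf above (illustrative sketch only).
        url, username, password, client_cert, client_key, method, data, \
            http_response_status_code, timeout, include_content, headers, \
            response_time, content_match, reverse_content_match, tags, \
            disable_ssl_validation, ssl_expire, instance_ca_certs, weakcipher, \
            check_hostname, ignore_ssl_warning, skip_proxy, allow_redirects = \
            self._load_conf(instance)
        # ... the rest of the check would use these values ...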
Example #51
0
    def check(self, instance):
        host = instance.get('host', '')
        port = instance.get('port', '')
        if port != '':
            port = int(port)
        user = instance.get('username', '')
        password = instance.get('password', '')
        tags = instance.get('tags', [])
        dbname = instance.get('dbname', None)
        relations = instance.get('relations', [])
        ssl = _is_affirmative(instance.get('ssl', False))
        function_metrics = _is_affirmative(
            instance.get('collect_function_metrics', False))
        # Default value for `count_metrics` is True for backward compatibility
        count_metrics = _is_affirmative(
            instance.get('collect_count_metrics', True))
        database_size_metrics = _is_affirmative(
            instance.get('collect_database_size_metrics', True))
        collect_default_db = _is_affirmative(
            instance.get('collect_default_database', False))

        if relations and not dbname:
            self.warning(
                '"dbname" parameter must be set when using the "relations" parameter.'
            )

        if dbname is None:
            dbname = 'postgres'

        key = (host, port, dbname)

        custom_metrics = self._get_custom_metrics(
            instance.get('custom_metrics', []), key)

        # Clean up tags in case there was a None entry in the instance
        # e.g. if the yaml contains tags: but no actual tags
        if tags is None:
            tags = []
        else:
            tags = list(set(tags))

        # preset tags to the database name
        tags.extend(["db:%s" % dbname])

        self.log.debug("Custom metrics: %s" % custom_metrics)

        # preset tags to the database name
        db = None

        connect_fct, interface_error, programming_error = self._get_pg_attrs(
            instance)

        # Collect metrics
        try:
            # Check version
            db = self.get_connection(key, host, port, user, password, dbname,
                                     ssl, connect_fct, tags)
            version = self._get_version(key, db)
            self.log.debug("Running check against version %s" % version)
            self._collect_stats(key, db, tags, relations, custom_metrics,
                                function_metrics, count_metrics,
                                database_size_metrics, collect_default_db,
                                interface_error, programming_error)
        except ShouldRestartException:
            self.log.info("Resetting the connection")
            db = self.get_connection(key,
                                     host,
                                     port,
                                     user,
                                     password,
                                     dbname,
                                     ssl,
                                     connect_fct,
                                     tags,
                                     use_cached=False)
            self._collect_stats(key, db, tags, relations, custom_metrics,
                                function_metrics, count_metrics,
                                database_size_metrics, collect_default_db,
                                interface_error, programming_error)

        if db is not None:
            service_check_tags = self._get_service_check_tags(host, port, tags)
            message = u'Established connection to postgres://%s:%s/%s' % (
                host, port, dbname)
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.OK,
                               tags=service_check_tags,
                               message=message)
            try:
                # commit to close the current query transaction
                db.commit()
            except Exception as e:
                self.log.warning("Unable to commit: {0}".format(e))
Example #52
0
    def enabled(cls, agent_config):
        return _is_affirmative(agent_config.get(
            'dogstatsd6_enable', False)) and cls._get_dsd6_path() is not None
Example #53
0
    def __init__(self, instance=None):
        self.docker_util = DockerUtil()
        if instance is None:
            try:
                config_file_path = get_conf_path(KUBERNETES_CHECK_NAME)
                check_config = check_yaml(config_file_path)
                instance = check_config['instances'][0]
            # kubernetes.yaml was not found
            except IOError as ex:
                log.error(ex.message)

                instance = {}
            except Exception:
                log.error('Kubernetes configuration file is invalid. '
                          'Trying to connect to the kubelet with default settings anyway...')
                instance = {}

        self.method = instance.get('method', KubeUtil.DEFAULT_METHOD)
        self._node_ip = self._node_name = None  # lazy evaluation
        self.host_name = os.environ.get('HOSTNAME')
        self.tls_settings = self._init_tls_settings(instance)

        # apiserver
        if 'api_server_url' in instance:
            self.kubernetes_api_root_url = instance.get('api_server_url')
        else:
            master_host = os.environ.get('KUBERNETES_SERVICE_HOST') or self.DEFAULT_MASTER_NAME
            master_port = os.environ.get('KUBERNETES_SERVICE_PORT') or self.DEFAULT_MASTER_PORT
            self.kubernetes_api_root_url = 'https://%s:%s' % (master_host, master_port)

        self.kubernetes_api_url = '%s/api/v1' % self.kubernetes_api_root_url

        # leader status triggers event collection
        self.is_leader = False
        self.leader_elector = None
        self.leader_lease_duration = instance.get('lease_duration')

        # kubelet
        try:
            self.kubelet_api_url = self._locate_kubelet(instance)
            if not self.kubelet_api_url:
                raise Exception("Couldn't find a method to connect to kubelet.")
        except Exception as ex:
            log.error("Kubernetes check exiting, cannot run without access to kubelet.")
            raise ex

        # Service mapping helper class
        self._service_mapper = PodServiceMapper(self)

        self.kubelet_host = self.kubelet_api_url.split(':')[1].lstrip('/')
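        # e.g. a kubelet_api_url of "https://10.0.0.1:10250" yields a kubelet_host of "10.0.0.1"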
        self.pods_list_url = urljoin(self.kubelet_api_url, KubeUtil.PODS_LIST_PATH)
        self.kube_health_url = urljoin(self.kubelet_api_url, KubeUtil.KUBELET_HEALTH_PATH)
        self.kube_label_prefix = instance.get('label_to_tag_prefix', KubeUtil.DEFAULT_LABEL_PREFIX)
        self.kube_node_labels = instance.get('node_labels_to_host_tags', {})

        # cadvisor
        self.cadvisor_port = instance.get('port', KubeUtil.DEFAULT_CADVISOR_PORT)
        self.cadvisor_url = '%s://%s:%d' % (self.method, self.kubelet_host, self.cadvisor_port)
        self.metrics_url = urljoin(self.cadvisor_url, KubeUtil.METRICS_PATH)
        self.machine_info_url = urljoin(self.cadvisor_url, KubeUtil.MACHINE_INFO_PATH)

        try:
            self.self_namespace = self.get_self_namespace()
        except Exception:
            log.warning("Failed to get the agent pod namespace, defaulting to default.")
            self.self_namespace = DEFAULT_NAMESPACE

        from config import _is_affirmative
        self.collect_service_tag = _is_affirmative(instance.get('collect_service_tags', KubeUtil.DEFAULT_COLLECT_SERVICE_TAG))

        # keep track of the latest k8s event we collected and posted
        # default value is 0 but TTL for k8s events is one hour anyways
        self.last_event_collection_ts = 0
Example #54
0
    def _process_results(self):
        for i in xrange(MAX_LOOP_ITERATIONS):
            try:
                # We want to fetch the result in a non blocking way
                status, msg, sc_name, instance = self.resultsq.get_nowait()
            except Empty:
                break

            instance_name = instance['name']
            if status == FAILURE:
                self.nb_failures += 1
                if self.nb_failures >= self.pool_size - 1:
                    self.nb_failures = 0
                    self.restart_pool()

                # clean failed job
                self._clean_job(instance_name)
                continue

            self.report_as_service_check(sc_name, status, instance, msg)

            # FIXME: 5.3, this has been deprecated before, get rid of events
            # Don't create any event to avoid duplicates with server side
            # service_checks
            skip_event = _is_affirmative(instance.get('skip_event', False))
            if not skip_event:
                self.warning(
                    "Using events for service checks is deprecated in favor of monitors and will be removed in future versions of the Datadog Agent."
                )
                event = None

                if instance_name not in self.statuses:
                    self.statuses[instance_name] = defaultdict(list)

                self.statuses[instance_name][sc_name].append(status)

                window = int(instance.get('window', 1))

                if window > 256:
                    self.log.warning(
                        "Maximum window size (256) exceeded, defaulting it to 256"
                    )
                    window = 256

                threshold = instance.get('threshold', 1)

                if len(self.statuses[instance_name][sc_name]) > window:
                    self.statuses[instance_name][sc_name].pop(0)

                nb_failures = self.statuses[instance_name][sc_name].count(
                    Status.DOWN)

                if nb_failures >= threshold:
                    if self.notified.get(
                        (instance_name, sc_name), Status.UP) != Status.DOWN:
                        event = self._create_status_event(
                            sc_name, status, msg, instance)
                        self.notified[(instance_name, sc_name)] = Status.DOWN
                else:
                    if self.notified.get(
                        (instance_name, sc_name), Status.UP) != Status.UP:
                        event = self._create_status_event(
                            sc_name, status, msg, instance)
                        self.notified[(instance_name, sc_name)] = Status.UP

                if event is not None:
                    self.events.append(event)

            self._clean_job(instance_name)
Example #55
0
    def check(self, instance):
        if 'url' not in instance:
            raise Exception('etcd instance missing "url" value.')

        # Load values from the instance config
        url = instance['url']
        instance_tags = instance.get('tags', [])

        # Load the ssl configuration
        ssl_params = {
            'ssl_keyfile':
            instance.get('ssl_keyfile'),
            'ssl_certfile':
            instance.get('ssl_certfile'),
            'ssl_cert_validation':
            _is_affirmative(instance.get('ssl_cert_validation', True)),
            'ssl_ca_certs':
            instance.get('ssl_ca_certs'),
        }

        for key, param in ssl_params.items():
            if param is None:
                del ssl_params[key]

        # Get a copy of tags for the CRIT statuses
        critical_tags = list(instance_tags)

        # Append the instance's URL in case there is more than one instance,
        # so that they can be told apart
        instance_tags.append("url:{0}".format(url))
        timeout = float(instance.get('timeout', self.DEFAULT_TIMEOUT))
        is_leader = False

        # Gather self health status
        sc_state = AgentCheck.UNKNOWN
        health_status = self._get_health_status(url, ssl_params, timeout,
                                                critical_tags)
        if health_status is not None:
            sc_state = AgentCheck.OK if self._is_healthy(
                health_status) else AgentCheck.CRITICAL
        self.service_check(self.HEALTH_SERVICE_CHECK_NAME,
                           sc_state,
                           tags=instance_tags)

        # Gather self metrics
        self_response = self._get_self_metrics(url, ssl_params, timeout,
                                               critical_tags)
        if self_response is not None:
            if self_response['state'] == 'StateLeader':
                is_leader = True
                instance_tags.append('etcd_state:leader')
            else:
                instance_tags.append('etcd_state:follower')

            for key in self.SELF_RATES:
                if key in self_response:
                    self.rate(self.SELF_RATES[key],
                              self_response[key],
                              tags=instance_tags)
                else:
                    self.log.warn("Missing key {0} in stats.".format(key))

            for key in self.SELF_GAUGES:
                if key in self_response:
                    self.gauge(self.SELF_GAUGES[key],
                               self_response[key],
                               tags=instance_tags)
                else:
                    self.log.warn("Missing key {0} in stats.".format(key))

        # Gather store metrics
        store_response = self._get_store_metrics(url, ssl_params, timeout,
                                                 critical_tags)
        if store_response is not None:
            for key in self.STORE_RATES:
                if key in store_response:
                    self.rate(self.STORE_RATES[key],
                              store_response[key],
                              tags=instance_tags)
                else:
                    self.log.warn("Missing key {0} in stats.".format(key))

            for key in self.STORE_GAUGES:
                if key in store_response:
                    self.gauge(self.STORE_GAUGES[key],
                               store_response[key],
                               tags=instance_tags)
                else:
                    self.log.warn("Missing key {0} in stats.".format(key))

        # Gather leader metrics
        if is_leader:
            leader_response = self._get_leader_metrics(url, ssl_params,
                                                       timeout, critical_tags)
            if leader_response is not None and len(
                    leader_response.get("followers", {})) > 0:
                # Get the followers
                followers = leader_response.get("followers")
                for fol in followers:
                    # counts
                    for key in self.LEADER_COUNTS:
                        self.rate(self.LEADER_COUNTS[key],
                                  followers[fol].get("counts").get(key),
                                  tags=instance_tags +
                                  ['follower:{0}'.format(fol)])
                    # latency
                    for key in self.LEADER_LATENCY:
                        self.gauge(self.LEADER_LATENCY[key],
                                   followers[fol].get("latency").get(key),
                                   tags=instance_tags +
                                   ['follower:{0}'.format(fol)])

        # Service check
        if self_response is not None and store_response is not None:
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.OK,
                               tags=instance_tags)
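A minimal sketch of what the `_is_healthy` helper used above might do, assuming the etcd health endpoint returns a JSON document such as {"health": "true"}; this is an assumption, not the check's actual code.

    # Hypothetical helper, not from the example above: consider the node
    # healthy when the parsed /health payload reports "health" as true.
    @staticmethod
    def _is_healthy(health_status):
        return health_status.get('health') in (True, 'true')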
Example #56
0
    def init(self):
        try:
            # We configure the check with the right cgroup settings for this host
            # Just needs to be done once
            instance = self.instances[0]
            set_docker_settings(self.init_config, instance)

            self.client = get_client()
            self._docker_root = self.init_config.get('docker_root', '/')
            self._mountpoints = get_mountpoints(self._docker_root)
            self.cgroup_listing_retries = 0
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # At first run we'll just collect the events from the latest 60 secs
            self._last_event_collection_ts = int(time.time()) - 60

            # Set tagging options
            self.custom_tags = instance.get("tags", [])
            self.collect_labels_as_tags = instance.get(
                "collect_labels_as_tags", [])
            self.kube_labels = {}

            self.use_histogram = _is_affirmative(
                instance.get('use_histogram', False))
            performance_tags = instance.get("performance_tags",
                                            DEFAULT_PERFORMANCE_TAGS)

            self.tag_names = {
                CONTAINER: instance.get("container_tags",
                                        DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: performance_tags,
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
            }

            # Set filtering settings
            if not instance.get("exclude"):
                self._filtering_enabled = False
                if instance.get("include"):
                    self.log.warning(
                        "You must specify an exclude section to enable filtering"
                    )
            else:
                self._filtering_enabled = True
                include = instance.get("include", [])
                exclude = instance.get("exclude", [])
                self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(
                    include, exclude)
                self.tag_names[FILTERED] = _filtered_tag_names

            # Other options
            self.collect_image_stats = _is_affirmative(
                instance.get('collect_images_stats', False))
            self.collect_container_size = _is_affirmative(
                instance.get('collect_container_size', False))
            self.collect_events = _is_affirmative(
                instance.get('collect_events', True))
            self.collect_image_size = _is_affirmative(
                instance.get('collect_image_size', False))
            self.collect_disk_stats = _is_affirmative(
                instance.get('collect_disk_stats', False))
            self.collect_ecs_tags = _is_affirmative(
                instance.get('ecs_tags', True)) and Platform.is_ecs_instance()

            self.ecs_tags = {}

        except Exception, e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
Example #57
0
    def check(self, instance):
        if 'url' not in instance:
            raise Exception('Mesos instance missing "url" value.')

        url = instance['url']
        instance_tags = instance.get('tags', [])
        if instance_tags is None:
            instance_tags = []
        default_timeout = self.init_config.get('default_timeout', 5)
        timeout = float(instance.get('timeout', default_timeout))
        ssl_verify = not _is_affirmative(
            instance.get('disable_ssl_validation', False))

        state_metrics = self._check_leadership(url, timeout, ssl_verify,
                                               instance_tags)
        if state_metrics:
            tags = [
                'mesos_pid:{0}'.format(state_metrics['pid']),
                'mesos_node:master',
            ]
            if 'cluster' in state_metrics:
                tags.append('mesos_cluster:{0}'.format(
                    state_metrics['cluster']))

            tags += instance_tags

            if self.leader:
                self.GAUGE('mesos.cluster.total_frameworks',
                           len(state_metrics['frameworks']),
                           tags=tags)

                for framework in state_metrics['frameworks']:
                    framework_tags = ['framework_name:' + framework['name']
                                      ] + tags
                    self.GAUGE('mesos.framework.total_tasks',
                               len(framework['tasks']),
                               tags=framework_tags)
                    resources = framework['used_resources']
                    for key_name, (
                            metric_name,
                            metric_func) in self.FRAMEWORK_METRICS.iteritems():
                        metric_func(self,
                                    metric_name,
                                    resources[key_name],
                                    tags=framework_tags)

                role_metrics = self._get_master_roles(url, timeout, ssl_verify,
                                                      instance_tags)
                if role_metrics is not None:
                    for role in role_metrics['roles']:
                        role_tags = ['mesos_role:' + role['name']] + tags
                        self.GAUGE('mesos.role.frameworks.count',
                                   len(role['frameworks']),
                                   tags=role_tags)
                        self.GAUGE('mesos.role.weight',
                                   role['weight'],
                                   tags=role_tags)
                        for key_name, (
                                metric_name, metric_func
                        ) in self.ROLE_RESOURCES_METRICS.iteritems():
                            metric_func(self,
                                        metric_name,
                                        role['resources'][key_name],
                                        tags=role_tags)

            stats_metrics = self._get_master_stats(url, timeout, ssl_verify,
                                                   instance_tags)
            if stats_metrics is not None:
                metrics = [self.SYSTEM_METRICS]
                if self.leader:
                    metrics += [
                        self.CLUSTER_TASKS_METRICS,
                        self.CLUSTER_SLAVES_METRICS,
                        self.CLUSTER_RESOURCES_METRICS,
                        self.CLUSTER_REGISTRAR_METRICS,
                        self.CLUSTER_FRAMEWORK_METRICS, self.STATS_METRICS
                    ]
                for m in metrics:
                    for key_name, (metric_name, metric_func) in m.iteritems():
                        if key_name in stats_metrics:
                            metric_func(self,
                                        metric_name,
                                        stats_metrics[key_name],
                                        tags=tags)

        self.service_check_needed = True
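For context, the metric maps iterated above (FRAMEWORK_METRICS, ROLE_RESOURCES_METRICS, and so on) are assumed to map a key in the Mesos payload to a (metric name, submit function) pair, roughly as sketched below; the exact keys and metric names are illustrative, not taken from the check.

    # Illustrative shape only; the real maps in the check may differ.
    FRAMEWORK_METRICS = {
        'cpus': ('mesos.framework.cpu', GAUGE),
        'mem': ('mesos.framework.mem', GAUGE),
        'disk': ('mesos.framework.disk', GAUGE),
    }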
Example #58
0
    def check(self, instance):
        name = instance.get('name', None)
        tags = instance.get('tags', [])
        exact_match = _is_affirmative(instance.get('exact_match', True))
        search_string = instance.get('search_string', None)
        ignore_ad = _is_affirmative(instance.get('ignore_denied_access', True))
        pid = instance.get('pid')
        pid_file = instance.get('pid_file')
        collect_children = _is_affirmative(
            instance.get('collect_children', False))

        if self._conflicting_procfs:
            self.warning(
                'The `procfs_path` defined in `process.yaml` is different from the one defined in '
                '`datadog.conf`. This is currently not supported by the Agent. Defaulting to the '
                'value defined in `datadog.conf`: {}'.format(
                    psutil.PROCFS_PATH))
        elif self._deprecated_init_procfs:
            self.warning(
                'DEPRECATION NOTICE: Specifying `procfs_path` in `process.yaml` is deprecated. '
                'Please specify it in `datadog.conf` instead')

        if not isinstance(search_string,
                          list) and pid is None and pid_file is None:
            raise ValueError(
                '"search_string" or "pid" or "pid_file" parameter is required')

        # FIXME 6.x remove me
        if search_string is not None:
            if "All" in search_string:
                self.warning(
                    'Deprecated: Having "All" in your search_string will'
                    'greatly reduce the performance of the check and '
                    'will be removed in a future version of the agent.')

        if name is None:
            raise KeyError('The "name" of process groups is mandatory')

        if search_string is not None:
            pids = self.find_pids(name,
                                  search_string,
                                  exact_match,
                                  ignore_ad=ignore_ad)
        elif pid is not None:
            # we use Process(pid) as a means to search, if pid not found
            # psutil.NoSuchProcess is raised.
            pids = self._get_pid_set(pid)
        elif pid_file is not None:
            try:
                with open(pid_file, 'r') as file_pid:
                    pid_line = file_pid.readline().strip()
                    pids = self._get_pid_set(int(pid_line))
            except IOError as e:
                # pid file doesn't exist, assuming the process is not running
                self.log.debug('Unable to find pid file: %s', e)
                pids = set()
        else:
            raise ValueError(
                'The "search_string" or "pid" options are required for process identification'
            )

        if collect_children:
            pids.update(self._get_child_processes(pids))
        proc_state = self.get_process_state(name, pids)

        # FIXME 6.x remove the `name` tag
        tags.extend(['process_name:%s' % name, name])

        self.log.debug('ProcessCheck: process %s analysed', name)
        self.gauge('system.processes.number', len(pids), tags=tags)

        if len(pids) == 0:
            self.warning("No matching process '%s' was found" % name)

        for attr, mname in ATTR_TO_METRIC.iteritems():
            vals = [x for x in proc_state[attr] if x is not None]
            # skip []
            if vals:
                if attr == 'run_time':
                    self.gauge('system.processes.%s.avg' % mname,
                               sum(vals) / len(vals),
                               tags=tags)
                    self.gauge('system.processes.%s.max' % mname,
                               max(vals),
                               tags=tags)
                    self.gauge('system.processes.%s.min' % mname,
                               min(vals),
                               tags=tags)

                # FIXME 6.x: change this prefix?
                else:
                    self.gauge('system.processes.%s' % mname,
                               sum(vals),
                               tags=tags)

        for attr, mname in ATTR_TO_METRIC_RATE.iteritems():
            vals = [x for x in proc_state[attr] if x is not None]
            if vals:
                self.rate('system.processes.%s' % mname, sum(vals), tags=tags)

        self._process_service_check(name, len(pids),
                                    instance.get('thresholds', None), tags)
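A minimal sketch of a `_get_pid_set` helper consistent with how it is used above: return a set containing the pid when the process exists and an empty set otherwise; an assumption, not the check's actual implementation.

    # Hypothetical helper: psutil.NoSuchProcess is raised when the pid does
    # not exist, matching the comment in the example above.
    def _get_pid_set(self, pid):
        try:
            return set([psutil.Process(pid).pid])
        except psutil.NoSuchProcess:
            return set()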
Example #59
0
    def __init__(self, name, init_config, agentConfig, instances=None):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        # Cache connections
        self.connections = {}
        self.failed_connections = {}
        self.instances_metrics = {}
        self.instances_per_type_metrics = defaultdict(dict)
        self.existing_databases = None
        self.do_check = {}
        self.proc_type_mapping = {
            'gauge': self.gauge,
            'rate': self.rate,
            'histogram': self.histogram
        }

        self.connector = init_config.get('connector', 'adodbapi')
        if not self.connector.lower() in self.valid_connectors:
            self.log.error(
                "Invalid database connector %s, defaulting to adodbapi" %
                self.connector)
            self.connector = 'adodbapi'

        # Pre-process the list of metrics to collect
        self.custom_metrics = init_config.get('custom_metrics', [])
        for instance in instances:
            try:
                instance_key = self._conn_key(instance, self.DEFAULT_DB_KEY)
                self.do_check[instance_key] = True

                # check to see if the database exists before we try any connections to it
                with self.open_managed_db_connections(
                        instance, None, db_name=self.DEFAULT_DATABASE):
                    db_exists, context = self._check_db_exists(instance)

                if db_exists:
                    if instance.get('stored_procedure') is None:
                        with self.open_managed_db_connections(
                                instance, self.DEFAULT_DB_KEY):
                            self._make_metric_list_to_collect(
                                instance, self.custom_metrics)
                else:
                    # How much do we care that the DB doesn't exist?
                    ignore = _is_affirmative(
                        instance.get("ignore_missing_database", False))
                    if ignore is not None and ignore:
                        # not much : we expect it. leave checks disabled
                        self.do_check[instance_key] = False
                        self.log.warning(
                            "Database %s does not exist. Disabling checks for this instance."
                            % (context))
                    else:
                        # yes we do. Keep trying
                        self.log.error(
                            "Database %s does not exist. Fix issue and restart agent"
                            % (context))

            except SQLConnectionError:
                self.log.exception("Skipping SQL Server instance")
                continue
            except Exception as e:
                self.log.exception("INitialization exception %s", str(e))
                continue