def check(self, instance):
    if not self.kubeutil.host:
        raise Exception('Unable to get default router and host parameter is not set')

    self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
    enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
    self.enabled_gauges = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges]
    enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
    self.enabled_rates = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_rates]

    self.publish_aliases = _is_affirmative(instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
    self.use_histogram = _is_affirmative(instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
    self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
    self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]

    # master health checks
    if instance.get('enable_master_checks', False):
        self._perform_master_checks(self.kubeutil.master_url_nodes)

    # kubelet health checks
    if instance.get('enable_kubelet_checks', True):
        self._perform_kubelet_checks(self.kubeutil.kube_health_url)

    # kubelet metrics
    self._update_metrics(instance)
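# The checks in this file lean on the agent's `_is_affirmative` config helper.
# A minimal sketch of it, assuming it matches the behavior of the helper in
# dd-agent's config module (truthy ints/bools, or strings like "yes"/"true"/"1");
# the real implementation may handle more cases.
def _is_affirmative(value):
    if value is None:
        return False
    # int or bool
    if isinstance(value, int):
        return bool(value)
    # string forms commonly used in YAML configs
    return str(value).lower() in ('yes', 'true', '1')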
def check(self, instance):
    host = instance.get('host', '')
    port = instance.get('port', '')
    user = instance.get('username', '')
    password = instance.get('password', '')
    tags = instance.get('tags', [])
    dbname = instance.get('dbname', None)
    relations = instance.get('relations', [])
    ssl = _is_affirmative(instance.get('ssl', False))
    function_metrics = _is_affirmative(instance.get('collect_function_metrics', False))
    # Default value for `count_metrics` is True for backward compatibility
    count_metrics = _is_affirmative(instance.get('collect_count_metrics', True))

    if relations and not dbname:
        self.warning('"dbname" parameter must be set when using the "relations" parameter.')

    if dbname is None:
        dbname = 'postgres'

    key = (host, port, dbname)

    custom_metrics = self._get_custom_metrics(instance.get('custom_metrics', []), key)

    # Clean up tags in case there was a None entry in the instance
    # e.g. if the yaml contains tags: but no actual tags
    if tags is None:
        tags = []
    else:
        tags = list(set(tags))

    # preset tags to the database name
    tags.extend(["db:%s" % dbname])

    self.log.debug("Custom metrics: %s" % custom_metrics)

    db = None

    # Collect metrics
    try:
        # Check version
        db = self.get_connection(key, host, port, user, password, dbname, ssl)
        version = self._get_version(key, db)
        self.log.debug("Running check against version %s" % version)
        self._collect_stats(key, db, tags, relations, custom_metrics, function_metrics, count_metrics)
    except ShouldRestartException:
        self.log.info("Resetting the connection")
        db = self.get_connection(key, host, port, user, password, dbname, ssl, use_cached=False)
        self._collect_stats(key, db, tags, relations, custom_metrics, function_metrics, count_metrics)

    if db is not None:
        service_check_tags = self._get_service_check_tags(host, port, dbname)
        message = u'Established connection to postgres://%s:%s/%s' % (host, port, dbname)
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK,
                           tags=service_check_tags, message=message)
        try:
            # commit to close the current query transaction
            db.commit()
        except Exception as e:
            self.log.warning("Unable to commit: {0}".format(e))
def check(self, instance):
    kube_settings = get_kube_settings()
    if not kube_settings.get("host"):
        raise Exception("Unable to get default router and host parameter is not set")

    self.max_depth = instance.get("max_depth", DEFAULT_MAX_DEPTH)
    enabled_gauges = instance.get("enabled_gauges", DEFAULT_ENABLED_GAUGES)
    self.enabled_gauges = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges]
    enabled_rates = instance.get("enabled_rates", DEFAULT_ENABLED_RATES)
    self.enabled_rates = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_rates]

    self.publish_aliases = _is_affirmative(instance.get("publish_aliases", DEFAULT_PUBLISH_ALIASES))
    self.use_histogram = _is_affirmative(instance.get("use_histogram", DEFAULT_USE_HISTOGRAM))
    self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
    self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]

    # master health checks
    if instance.get("enable_master_checks", False):
        master_url = kube_settings["master_url_nodes"]
        self._perform_master_checks(master_url)

    # kubelet health checks
    if instance.get("enable_kubelet_checks", True):
        kube_health_url = kube_settings["kube_health_url"]
        self._perform_kubelet_checks(kube_health_url)

    # kubelet metrics
    self._update_metrics(instance, kube_settings)
def check(self, instance):
    host = instance.get('host', self.default_router)
    if not host:
        raise Exception('Unable to get default router and host parameter is not set')

    port = instance.get('port', DEFAULT_CADVISOR_PORT)
    method = instance.get('method', DEFAULT_METHOD)
    self.metrics_url = '%s://%s:%d' % (method, host, port)
    self.metrics_cmd = urljoin(self.metrics_url, DEFAULT_METRICS_CMD)
    self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
    enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
    self.enabled_gauges = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges]
    enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
    self.enabled_rates = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_rates]

    self.publish_aliases = _is_affirmative(instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
    self.use_histogram = _is_affirmative(instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
    self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
    self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]

    # master health checks
    if instance.get('enable_master_checks', False):
        master_port = instance.get('master_port', DEFAULT_MASTER_PORT)
        master_host = instance.get('master_host', 'localhost')
        master_url = '%s://%s:%d/nodes' % (method, master_host, master_port)
        self._perform_master_checks(master_url)

    # kubelet health checks
    if instance.get('enable_kubelet_checks', True):
        kubelet_port = instance.get('kubelet_port', DEFAULT_KUBELET_PORT)
        kubelet_url = '%s://%s:%d/healthz' % (method, host, kubelet_port)
        self._perform_kubelet_checks(kubelet_url)

    # kubelet metrics
    self._update_metrics(instance)
def _load_conf(self, instance):
    # Fetches the conf
    method = instance.get('method', 'get')
    data = instance.get('data', {})
    tags = instance.get('tags', [])
    username = instance.get('username')
    password = instance.get('password')
    http_response_status_code = str(instance.get('http_response_status_code', DEFAULT_EXPECTED_CODE))
    timeout = int(instance.get('timeout', 10))
    config_headers = instance.get('headers', {})
    default_headers = _is_affirmative(instance.get("include_default_headers", True))
    if default_headers:
        headers = agent_headers(self.agentConfig)
    else:
        headers = {}
    headers.update(config_headers)
    url = instance.get('url')
    content_match = instance.get('content_match')
    reverse_content_match = _is_affirmative(instance.get('reverse_content_match', False))
    response_time = _is_affirmative(instance.get('collect_response_time', True))
    if not url:
        raise Exception("Bad configuration. You must specify a url")
    include_content = _is_affirmative(instance.get('include_content', False))
    ssl = _is_affirmative(instance.get('disable_ssl_validation', True))
    ssl_expire = _is_affirmative(instance.get('check_certificate_expiration', True))
    instance_ca_certs = instance.get('ca_certs', self.ca_certs)
    weakcipher = _is_affirmative(instance.get('weakciphers', False))
    ignore_ssl_warning = _is_affirmative(instance.get('ignore_ssl_warning', False))
    skip_proxy = _is_affirmative(instance.get('no_proxy', False))
    allow_redirects = _is_affirmative(instance.get('allow_redirects', True))

    return url, username, password, method, data, http_response_status_code, timeout, include_content,\
        headers, response_time, content_match, reverse_content_match, tags, ssl, ssl_expire, instance_ca_certs,\
        weakcipher, ignore_ssl_warning, skip_proxy, allow_redirects
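# Design note (illustrative, not from the original source): _load_conf returns
# a 20-element tuple, which is easy to mis-unpack at the call site. A namedtuple
# with the same fields would keep call sites readable; the field names below
# simply mirror the locals built above.
from collections import namedtuple

HTTPCheckConf = namedtuple('HTTPCheckConf', [
    'url', 'username', 'password', 'method', 'data', 'http_response_status_code',
    'timeout', 'include_content', 'headers', 'response_time', 'content_match',
    'reverse_content_match', 'tags', 'ssl', 'ssl_expire', 'instance_ca_certs',
    'weakcipher', 'ignore_ssl_warning', 'skip_proxy', 'allow_redirects',
])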
def check(self, instance):
    self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
    enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
    self.enabled_gauges = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges]
    enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
    self.enabled_rates = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_rates]

    self.publish_aliases = _is_affirmative(instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
    self.use_histogram = _is_affirmative(instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
    self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
    self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]
    # initialized by _filter_containers
    self._filtered_containers = set()

    pods_list = self.kubeutil.retrieve_pods_list()

    # kubelet health checks
    self._perform_kubelet_checks(self.kubeutil.kube_health_url)

    # kubelet metrics
    self._update_metrics(instance, pods_list)

    # kubelet events
    if _is_affirmative(instance.get('collect_events', DEFAULT_COLLECT_EVENTS)):
        try:
            self._process_events(instance, pods_list)
        except Exception as ex:
            self.log.error("Event collection failed: %s" % str(ex))
def check(self, instance):
    name = instance.get('name', None)
    tags = instance.get('tags', [])
    exact_match = _is_affirmative(instance.get('exact_match', True))
    search_string = instance.get('search_string', None)
    ignore_ad = _is_affirmative(instance.get('ignore_denied_access', True))
    pid = instance.get('pid')

    if not isinstance(search_string, list) and pid is None:
        raise KeyError('"search_string" parameter should be a list')

    # FIXME 6.x remove me
    if pid is None:
        if "All" in search_string:
            self.warning('Deprecated: Having "All" in your search_string will '
                         'greatly reduce the performance of the check and '
                         'will be removed in a future version of the agent.')

    if name is None:
        raise KeyError('The "name" of process groups is mandatory')

    if search_string is not None:
        pids = self.find_pids(
            name,
            search_string,
            exact_match,
            ignore_ad=ignore_ad
        )
    elif pid is not None:
        pids = [psutil.Process(pid)]
    else:
        raise ValueError('The "search_string" or "pid" options are required for process identification')

    proc_state = self.get_process_state(name, pids)

    # FIXME 6.x remove the `name` tag
    tags.extend(['process_name:%s' % name, name])

    self.log.debug('ProcessCheck: process %s analysed', name)
    self.gauge('system.processes.number', len(pids), tags=tags)

    for attr, mname in ATTR_TO_METRIC.iteritems():
        vals = [x for x in proc_state[attr] if x is not None]
        # skip []
        if vals:
            if attr == 'run_time':
                self.gauge('system.processes.%s.avg' % mname, sum(vals) / len(vals), tags=tags)
                self.gauge('system.processes.%s.max' % mname, max(vals), tags=tags)
                self.gauge('system.processes.%s.min' % mname, min(vals), tags=tags)
            # FIXME 6.x: change this prefix?
            else:
                self.gauge('system.processes.%s' % mname, sum(vals), tags=tags)

    for attr, mname in ATTR_TO_METRIC_RATE.iteritems():
        vals = [x for x in proc_state[attr] if x is not None]
        if vals:
            self.rate('system.processes.%s' % mname, sum(vals), tags=tags)

    self._process_service_check(name, len(pids), instance.get('thresholds', None))
def check(self, instance):
    url = instance.get('url')
    username = instance.get('username')
    password = instance.get('password')
    collect_aggregates_only = _is_affirmative(
        instance.get('collect_aggregates_only', True)
    )
    collect_status_metrics = _is_affirmative(
        instance.get('collect_status_metrics', False)
    )
    collect_status_metrics_by_host = _is_affirmative(
        instance.get('collect_status_metrics_by_host', False)
    )
    tag_service_check_by_host = _is_affirmative(
        instance.get('tag_service_check_by_host', False)
    )
    services_incl_filter = instance.get('services_include', [])
    services_excl_filter = instance.get('services_exclude', [])

    self.log.debug('Processing HAProxy data for %s' % url)

    data = self._fetch_data(url, username, password)

    process_events = instance.get('status_check', self.init_config.get('status_check', False))

    self._process_data(
        data, collect_aggregates_only, process_events,
        url=url, collect_status_metrics=collect_status_metrics,
        collect_status_metrics_by_host=collect_status_metrics_by_host,
        tag_service_check_by_host=tag_service_check_by_host,
        services_incl_filter=services_incl_filter,
        services_excl_filter=services_excl_filter
    )
def check(self, instance): url = instance.get("url") username = instance.get("username") password = instance.get("password") collect_aggregates_only = _is_affirmative(instance.get("collect_aggregates_only", True)) collect_status_metrics = _is_affirmative(instance.get("collect_status_metrics", False)) collect_status_metrics_by_host = _is_affirmative(instance.get("collect_status_metrics_by_host", False)) count_status_by_service = _is_affirmative(instance.get("count_status_by_service", True)) tag_service_check_by_host = _is_affirmative(instance.get("tag_service_check_by_host", False)) services_incl_filter = instance.get("services_include", []) services_excl_filter = instance.get("services_exclude", []) self.log.debug("Processing HAProxy data for %s" % url) data = self._fetch_data(url, username, password) process_events = instance.get("status_check", self.init_config.get("status_check", False)) self._process_data( data, collect_aggregates_only, process_events, url=url, collect_status_metrics=collect_status_metrics, collect_status_metrics_by_host=collect_status_metrics_by_host, tag_service_check_by_host=tag_service_check_by_host, services_incl_filter=services_incl_filter, services_excl_filter=services_excl_filter, count_status_by_service=count_status_by_service, )
def _load_conf(self, instance):
    # Fetches the conf
    tags = instance.get("tags", [])
    username = instance.get("username")
    password = instance.get("password")
    http_response_status_code = instance.get("http_response_status_code", "(1|2|3)\d\d")
    timeout = int(instance.get("timeout", 10))
    config_headers = instance.get("headers", {})
    headers = agent_headers(self.agentConfig)
    headers.update(config_headers)
    url = instance.get("url")
    content_match = instance.get("content_match")
    response_time = _is_affirmative(instance.get("collect_response_time", True))
    if not url:
        raise Exception("Bad configuration. You must specify a url")
    include_content = _is_affirmative(instance.get("include_content", False))
    ssl = _is_affirmative(instance.get("disable_ssl_validation", True))
    ssl_expire = _is_affirmative(instance.get("check_certificate_expiration", True))

    return (
        url,
        username,
        password,
        http_response_status_code,
        timeout,
        include_content,
        headers,
        response_time,
        content_match,
        tags,
        ssl,
        ssl_expire,
    )
def check(self, instance):
    instance_name = instance.get('name')
    if instance_name is None:
        raise Exception("Each instance must have a unique name")

    ssl_validation = _is_affirmative(instance.get('ssl_validation', True))

    server = instance.get('server')
    if server is None:
        raise Exception("Each instance must have a server")

    build_conf = instance.get('build_configuration')
    if build_conf is None:
        raise Exception("Each instance must have a build configuration")

    host = instance.get('host_affected') or self.hostname
    tags = instance.get('tags')
    is_deployment = _is_affirmative(instance.get('is_deployment', False))
    basic_http_authentication = _is_affirmative(instance.get('basic_http_authentication', False))

    self._initialize_if_required(instance_name, server, build_conf, ssl_validation, basic_http_authentication)

    # Look for new successful builds
    if basic_http_authentication:
        new_build_url = self.NEW_BUILD_URL_AUTHENTICATED.format(
            server=server,
            build_conf=build_conf,
            since_build=self.last_build_ids[instance_name]
        )
    else:
        new_build_url = self.NEW_BUILD_URL.format(
            server=server,
            build_conf=build_conf,
            since_build=self.last_build_ids[instance_name]
        )

    try:
        resp = requests.get(new_build_url, timeout=self.DEFAULT_TIMEOUT,
                            headers=self.HEADERS, verify=ssl_validation)
        resp.raise_for_status()

        new_builds = resp.json()

        if new_builds["count"] == 0:
            self.log.debug("No new builds found.")
        else:
            self._build_and_send_event(new_builds["build"][0], instance_name, is_deployment, host, tags)
    except requests.exceptions.HTTPError:
        self.log.exception("Couldn't fetch last build, got code {0}".format(resp.status_code))
        raise
    except Exception:
        self.log.exception("Couldn't fetch last build, unhandled exception")
        raise
def _load_conf(self, instance): self._excluded_filesystems = instance.get("excluded_filesystems", []) self._excluded_disks = instance.get("excluded_disks", []) self._tag_by_filesystem = _is_affirmative(instance.get("tag_by_filesystem", False)) self._all_partitions = _is_affirmative(instance.get("all_partitions", False)) # FIXME: 6.x, drop use_mount option in datadog.conf self._load_legacy_option(instance, "use_mount", False, operation=_is_affirmative) # FIXME: 6.x, drop device_blacklist_re option in datadog.conf self._load_legacy_option( instance, "excluded_disk_re", "^$", legacy_name="device_blacklist_re", operation=re.compile )
def check(self, instance):
    name = instance.get('name', None)
    tags = instance.get('tags', [])
    exact_match = _is_affirmative(instance.get('exact_match', True))
    search_string = instance.get('search_string', None)
    ignore_ad = _is_affirmative(instance.get('ignore_denied_access', True))
    cpu_check_interval = instance.get('cpu_check_interval', 0.1)

    if not isinstance(search_string, list):
        raise KeyError('"search_string" parameter should be a list')

    # FIXME 6.x remove me
    if "All" in search_string:
        self.warning('Deprecated: Having "All" in your search_string will '
                     'greatly reduce the performance of the check and '
                     'will be removed in a future version of the agent.')

    if name is None:
        raise KeyError('The "name" of process groups is mandatory')

    if search_string is None:
        raise KeyError('The "search_string" is mandatory')

    if not isinstance(cpu_check_interval, (int, long, float)):
        self.warning("cpu_check_interval must be a number. Defaulting to 0.1")
        cpu_check_interval = 0.1

    pids = self.find_pids(
        name,
        search_string,
        exact_match,
        ignore_ad=ignore_ad
    )
    proc_state = self.get_process_state(name, pids, cpu_check_interval)

    # FIXME 6.x remove the `name` tag
    tags.extend(['process_name:%s' % name, name])

    self.log.debug('ProcessCheck: process %s analysed', name)
    self.gauge('system.processes.number', len(pids), tags=tags)

    for attr, mname in ATTR_TO_METRIC.iteritems():
        vals = [x for x in proc_state[attr] if x is not None]
        # skip []
        if vals:
            # FIXME 6.x: change this prefix?
            self.gauge('system.processes.%s' % mname, sum(vals), tags=tags)

    self._process_service_check(name, len(pids), instance.get('thresholds', None))
def check(self, instance):
    # Report image metrics
    if _is_affirmative(instance.get('collect_images_stats', True)):
        self._count_images(instance)

    # Get the list of containers and the index of their names
    containers, ids_to_names = self._get_and_count_containers(instance)

    # Report container metrics from cgroups
    skipped_container_ids = self._report_containers_metrics(containers, instance)

    # Send events from Docker API
    if _is_affirmative(instance.get('collect_events', True)):
        self._process_events(instance, ids_to_names, skipped_container_ids)
def get_instance_proxy(self, instance, uri, proxies=None):
    proxies = proxies if proxies is not None else self.proxies.copy()
    proxies['no'] = get_no_proxy_from_env()

    deprecated_skip = instance.get('no_proxy', None)
    skip = (
        _is_affirmative(instance.get('skip_proxy', not self._use_agent_proxy)) or
        _is_affirmative(deprecated_skip)
    )

    if deprecated_skip is not None:
        self._log_deprecation('no_proxy')

    return config_proxy_skip(proxies, uri, skip)
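# Hypothetical sketch of `config_proxy_skip` (an assumption about the agent
# helper, not its actual source): when `skip` is set, blank out the proxy
# mapping so `requests` connects directly; otherwise return the proxies
# unchanged and let the 'no' entry handle no_proxy matching for the URI.
def config_proxy_skip(proxies, uri, skip=False):
    if skip:
        return {'http': None, 'https': None}
    return proxies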
def init(self):
    try:
        # We configure the check with the right cgroup settings for this host
        # Just needs to be done once
        instance = self.instances[0]
        set_docker_settings(self.init_config, instance)

        self.client = get_client()
        self._docker_root = self.init_config.get('docker_root', '/')
        self._mountpoints = get_mountpoints(self._docker_root)
        self.cgroup_listing_retries = 0
        self._latest_size_query = 0
        self._filtered_containers = set()
        self._disable_net_metrics = False

        # At first run we'll just collect the events from the latest 60 secs
        self._last_event_collection_ts = int(time.time()) - 60

        # Set tagging options
        self.custom_tags = instance.get("tags", [])
        self.collect_labels_as_tags = instance.get("collect_labels_as_tags", [])
        self.tag_names = {
            CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
            PERFORMANCE: instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS),
            IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
        }

        # Set filtering settings
        if not instance.get("exclude"):
            self._filtering_enabled = False
            if instance.get("include"):
                self.log.warning("You must specify an exclude section to enable filtering")
        else:
            self._filtering_enabled = True
            include = instance.get("include", [])
            exclude = instance.get("exclude", [])
            self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(include, exclude)
            self.tag_names[FILTERED] = _filtered_tag_names

        # Other options
        self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
        self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
        self.collect_events = _is_affirmative(instance.get('collect_events', True))
        self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
        self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()
    except Exception as e:
        self.log.critical(e)
        self.warning("Initialization failed. Will retry at next iteration")
def get_instance_config(self, instance):
    url = instance.get('url')
    if url is None:
        raise Exception("An url must be specified in the instance")

    pshard_stats = _is_affirmative(instance.get('pshard_stats', False))

    cluster_stats = _is_affirmative(instance.get('cluster_stats', False))
    if 'is_external' in instance:
        cluster_stats = _is_affirmative(instance.get('is_external', False))

    pending_task_stats = _is_affirmative(instance.get('pending_task_stats', True))

    # Support URLs that have a path in them from the config, for
    # backwards-compatibility.
    parsed = urlparse.urlparse(url)
    if parsed[2] != "":
        url = "%s://%s" % (parsed[0], parsed[1])
    port = parsed.port
    host = parsed.hostname

    custom_tags = instance.get('tags', [])
    service_check_tags = [
        'host:%s' % host,
        'port:%s' % port
    ]
    service_check_tags.extend(custom_tags)

    # Tag by URL so we can differentiate the metrics
    # from multiple instances
    tags = ['url:%s' % url]
    tags.extend(custom_tags)

    timeout = instance.get('timeout') or self.DEFAULT_TIMEOUT

    config = ESInstanceConfig(
        pshard_stats=pshard_stats,
        cluster_stats=cluster_stats,
        password=instance.get('password'),
        service_check_tags=service_check_tags,
        ssl_cert=instance.get('ssl_cert'),
        ssl_key=instance.get('ssl_key'),
        ssl_verify=instance.get('ssl_verify'),
        tags=tags,
        timeout=timeout,
        url=url,
        username=instance.get('username'),
        pending_task_stats=pending_task_stats
    )
    return config
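# Illustrative example of the path-stripping above (the hostname is made up).
# On Python 2 this is `urlparse`; on Python 3 the same API lives in
# `urllib.parse`.
import urlparse

parsed = urlparse.urlparse('http://es.example.com:9200/some/path')
# parsed[0] == 'http', parsed[1] == 'es.example.com:9200', parsed[2] == '/some/path'
# so the reconstructed url drops the path:
assert "%s://%s" % (parsed[0], parsed[1]) == 'http://es.example.com:9200'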
def _load_conf(self, instance):
    self._excluded_filesystems = instance.get('excluded_filesystems', [])
    self._excluded_disks = instance.get('excluded_disks', [])
    self._tag_by_filesystem = _is_affirmative(
        instance.get('tag_by_filesystem', False))
    self._all_partitions = _is_affirmative(
        instance.get('all_partitions', False))

    # FIXME: 6.x, drop use_mount option in datadog.conf
    self._load_legacy_option(instance, 'use_mount', False,
                             operation=_is_affirmative)
    # FIXME: 6.x, drop device_blacklist_re option in datadog.conf
    self._load_legacy_option(instance, 'excluded_disk_re', '^$',
                             legacy_name='device_blacklist_re',
                             operation=re.compile)
def _cache_morlist_raw(self, instance):
    """
    Initiate the first layer to refresh self.morlist by queueing
    _cache_morlist_raw_atomic on the rootFolder in a recursive/async approach
    """
    i_key = self._instance_key(instance)
    self.log.debug("Caching the morlist for vcenter instance %s" % i_key)
    if i_key in self.morlist_raw and len(self.morlist_raw[i_key]) > 0:
        self.log.debug(
            "Skipping morlist collection now, RAW results "
            "processing not over (latest refresh was {0}s ago)".format(
                time.time() - self.cache_times[i_key][MORLIST][LAST])
        )
        return
    self.morlist_raw[i_key] = []

    server_instance = self._get_server_instance(instance)
    root_folder = server_instance.content.rootFolder

    instance_tag = "vcenter_server:%s" % instance.get('name')
    regexes = {
        'host_include': instance.get('host_include_only_regex'),
        'vm_include': instance.get('vm_include_only_regex')
    }
    include_only_marked = _is_affirmative(instance.get('include_only_marked', False))

    self.pool.apply_async(
        self._cache_morlist_raw_atomic,
        args=(i_key, 'rootFolder', root_folder, [instance_tag], regexes, include_only_marked)
    )
    self.cache_times[i_key][MORLIST][LAST] = time.time()
def get_instance_config(self, instance): url = instance.get("url") if url is None: raise Exception("An url must be specified in the instance") is_external = _is_affirmative(instance.get("is_external", False)) # Support URLs that have a path in them from the config, for # backwards-compatibility. parsed = urlparse.urlparse(url) if parsed[2] != "": url = "%s://%s" % (parsed[0], parsed[1]) port = parsed.port host = parsed.hostname service_check_tags = ["host:%s" % host, "port:%s" % port] # Tag by URL so we can differentiate the metrics # from multiple instances tags = ["url:%s" % url] tags.extend(instance.get("tags", [])) timeout = instance.get("timeout") or self.DEFAULT_TIMEOUT config = ESInstanceConfig( is_external=is_external, password=instance.get("password"), service_check_tags=service_check_tags, tags=tags, timeout=timeout, url=url, username=instance.get("username"), ) return config
def _process_results(self):
    for i in xrange(MAX_LOOP_ITERATIONS):
        try:
            # We want to fetch the result in a non blocking way
            status, msg, sc_name, instance = self.resultsq.get_nowait()
        except Empty:
            break

        instance_name = instance['name']
        if status == FAILURE:
            self.nb_failures += 1
            if self.nb_failures >= self.pool_size - 1:
                self.nb_failures = 0
                self.restart_pool()
            # clean failed job
            self._clean_job(instance_name)
            continue

        self.report_as_service_check(sc_name, status, instance, msg)

        # FIXME: 5.3, this has been deprecated before, get rid of events
        # Don't create any event to avoid duplicates with server side
        # service_checks
        skip_event = _is_affirmative(instance.get('skip_event', False))
        if not skip_event:
            self.warning("Using events for service checks is deprecated in favor "
                         "of monitors and will be removed in future versions of "
                         "the Datadog Agent.")
            event = None

            if instance_name not in self.statuses:
                self.statuses[instance_name] = defaultdict(list)

            self.statuses[instance_name][sc_name].append(status)

            window = int(instance.get('window', 1))

            if window > 256:
                self.log.warning("Maximum window size (256) exceeded, defaulting it to 256")
                window = 256

            threshold = instance.get('threshold', 1)

            if len(self.statuses[instance_name][sc_name]) > window:
                self.statuses[instance_name][sc_name].pop(0)

            nb_failures = self.statuses[instance_name][sc_name].count(Status.DOWN)

            if nb_failures >= threshold:
                if self.notified.get((instance_name, sc_name), Status.UP) != Status.DOWN:
                    event = self._create_status_event(sc_name, status, msg, instance)
                    self.notified[(instance_name, sc_name)] = Status.DOWN
            else:
                if self.notified.get((instance_name, sc_name), Status.UP) != Status.UP:
                    event = self._create_status_event(sc_name, status, msg, instance)
                    self.notified[(instance_name, sc_name)] = Status.UP

            if event is not None:
                self.events.append(event)

        self._clean_job(instance_name)
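# Standalone sketch of the windowed alerting logic above (names are
# illustrative): keep only the last `window` statuses and flag DOWN once
# `threshold` failures accumulate in that window.
from collections import deque

def should_alert(history, new_status, window=5, threshold=3, down_marker='DOWN'):
    history.append(new_status)
    while len(history) > window:
        history.popleft()
    return list(history).count(down_marker) >= threshold

# e.g. three failures within the window trip the alert:
h = deque()
for status in ['UP', 'DOWN', 'DOWN', 'DOWN']:
    tripped = should_alert(h, status)
# tripped is now True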
def check(self, instance): # Report image metrics self.warning('Using the "docker" check is deprecated and will be removed' ' in a future version of the agent. Please use the "docker_daemon" one instead') if _is_affirmative(instance.get('collect_images_stats', True)): self._count_images(instance) # Get the list of containers and the index of their names containers, ids_to_names = self._get_and_count_containers(instance) # Report container metrics from cgroups skipped_container_ids = self._report_containers_metrics(containers, instance) # Send events from Docker API if _is_affirmative(instance.get('collect_events', False)): self._process_events(instance, ids_to_names, skipped_container_ids)
def check(self, instance): url = instance.get("url") username = instance.get("username") password = instance.get("password") custom_tags = instance.get('tags', []) max_queues = int(instance.get("max_queues", MAX_ELEMENTS)) max_topics = int(instance.get("max_topics", MAX_ELEMENTS)) max_subscribers = int(instance.get("max_subscribers", MAX_ELEMENTS)) detailed_queues = instance.get("detailed_queues", []) detailed_topics = instance.get("detailed_topics", []) detailed_subscribers = instance.get("detailed_subscribers", []) suppress_errors = _is_affirmative(instance.get("suppress_errors", False)) tags = custom_tags + ["url:{0}".format(url)] self.log.debug("Processing ActiveMQ data for %s" % url) data = self._fetch_data(url, QUEUE_URL, username, password, suppress_errors) if data: self._process_data(data, "queue", tags, max_queues, detailed_queues) data = self._fetch_data(url, TOPIC_URL, username, password, suppress_errors) if data: self._process_data(data, "topic", tags, max_topics, detailed_topics) data = self._fetch_data(url, SUBSCRIBER_URL, username, password, suppress_errors) if data: self._process_subscriber_data(data, tags, max_subscribers, detailed_subscribers)
def _cache_morlist_raw(self, instance):
    """
    Initiate the first layer to refresh the list of MORs (`self.morlist`).

    Resolve the vCenter `rootFolder` and initiate hosts and virtual machines discovery.
    """
    i_key = self._instance_key(instance)
    self.log.debug("Caching the morlist for vcenter instance %s" % i_key)
    if i_key in self.morlist_raw and len(self.morlist_raw[i_key]) > 0:
        self.log.debug(
            "Skipping morlist collection now, RAW results "
            "processing not over (latest refresh was {0}s ago)".format(
                time.time() - self.cache_times[i_key][MORLIST][LAST])
        )
        return
    self.morlist_raw[i_key] = []

    server_instance = self._get_server_instance(instance)
    root_folder = server_instance.content.rootFolder

    instance_tag = "vcenter_server:%s" % instance.get('name')
    regexes = {
        'host_include': instance.get('host_include_only_regex'),
        'vm_include': instance.get('vm_include_only_regex')
    }
    include_only_marked = _is_affirmative(instance.get('include_only_marked', False))

    # Discover hosts and virtual machines
    self._discover_mor(i_key, root_folder, [instance_tag], regexes, include_only_marked)
    self.cache_times[i_key][MORLIST][LAST] = time.time()
def _connect(self, instance): for e in ("access_id", "access_secret"): if e not in instance: raise Exception("{0} parameter is required.".format(e)) s3_settings = { "aws_access_key_id": instance.get('access_id', None), "aws_secret_access_key": instance.get('access_secret', None), "proxy": instance.get('host', 'localhost'), "proxy_port": int(instance.get('port', 8080)), "is_secure": _is_affirmative(instance.get('is_secure', True)) } if instance.get('s3_root'): s3_settings['host'] = instance['s3_root'] aggregation_key = s3_settings['proxy'] + ":" + str(s3_settings['proxy_port']) try: s3 = S3Connection(**s3_settings) except Exception, e: self.log.error("Error connecting to {0}: {1}".format(aggregation_key, e)) self.service_check( self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=["aggregation_key:{0}".format(aggregation_key)], message=str(e)) raise
def _connect(self, instance): for e in ("access_id", "access_secret"): if e not in instance: raise Exception("{0} parameter is required.".format(e)) s3_settings = { "aws_access_key_id": instance.get("access_id", None), "aws_secret_access_key": instance.get("access_secret", None), "proxy": instance.get("host", "localhost"), "proxy_port": int(instance.get("port", 8080)), "is_secure": _is_affirmative(instance.get("is_secure", True)), } if instance.get("s3_root"): s3_settings["host"] = instance["s3_root"] aggregation_key = s3_settings["proxy"] + ":" + str(s3_settings["proxy_port"]) try: s3 = S3Connection(**s3_settings) except Exception as e: self.log.error("Error connecting to {0}: {1}".format(aggregation_key, e)) self.service_check( self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=["aggregation_key:{0}".format(aggregation_key)], message=str(e), ) raise tags = instance.get("tags", []) tags.append("aggregation_key:{0}".format(aggregation_key)) return s3, aggregation_key, tags
def _collect_raw(self, ceph_cmd, instance):
    use_sudo = _is_affirmative(instance.get('use_sudo', False))

    ceph_args = []
    if use_sudo:
        test_sudo = os.system('setsid sudo -l < /dev/null')
        if test_sudo != 0:
            raise Exception('The dd-agent user does not have sudo access')
        ceph_args = ['sudo', ceph_cmd]
    else:
        ceph_args = [ceph_cmd]

    args = ceph_args + ['version']
    try:
        output, _, _ = get_subprocess_output(args, self.log)
    except Exception as e:
        raise Exception('Unable to run cmd=%s: %s' % (' '.join(args), str(e)))

    raw = {}
    for cmd in ('mon_status', 'status', 'df detail', 'osd pool stats', 'osd perf', 'health detail'):
        try:
            args = ceph_args + cmd.split() + ['-fjson']
            output, _, _ = get_subprocess_output(args, self.log)
            res = json.loads(output)
        except Exception as e:
            self.log.warning('Unable to parse data from cmd=%s: %s' % (cmd, str(e)))
            continue

        name = cmd.replace(' ', '_')
        raw[name] = res

    return raw
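# A minimal stand-in for `get_subprocess_output`, assuming it returns an
# (stdout, stderr, returncode) triple as the unpacking above suggests; the
# agent's real helper adds logging and error handling around this.
import subprocess

def get_subprocess_output(args, log):
    proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate()
    return out, err, proc.returncode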
def _get_and_count_containers(self, instance): tags = instance.get("tags", []) with_size = _is_affirmative(instance.get('collect_container_size', False)) service_check_name = 'docker.service_up' try: running_containers = self._get_containers(instance, with_size=with_size) all_containers = self._get_containers(instance, get_all=True) except (socket.timeout, urllib2.URLError) as e: self.service_check(service_check_name, AgentCheck.CRITICAL, message="Unable to list Docker containers: {0}".format(e)) raise Exception("Failed to collect the list of containers. Exception: {0}".format(e)) self.service_check(service_check_name, AgentCheck.OK) running_containers_ids = set([container['Id'] for container in running_containers]) for container in all_containers: container_tags = list(tags) for key in DOCKER_TAGS: tag = self._make_tag(key, container[key], instance) if tag: container_tags.append(tag) if container['Id'] in running_containers_ids: self.set("docker.containers.running", container['Id'], tags=container_tags) else: self.set("docker.containers.stopped", container['Id'], tags=container_tags) # The index of the names is used to generate and format events ids_to_names = {} for container in all_containers: ids_to_names[container['Id']] = self._get_container_name(container) return running_containers, ids_to_names
def __init__(self, *args, **kwargs):
    # `args` order is `name`, `init_config`, `agentConfig` (deprecated), `instances`
    self.metrics = defaultdict(list)
    self.instances = kwargs.get('instances', [])
    self.name = kwargs.get('name', '')
    self.init_config = kwargs.get('init_config', {})
    self.agentConfig = kwargs.get('agentConfig', {})
    self.warnings = []

    if len(args) > 0:
        self.name = args[0]
    if len(args) > 1:
        self.init_config = args[1]
    if len(args) > 2:
        if len(args) > 3 or 'instances' in kwargs:
            # old-style init: the 3rd argument is `agentConfig`
            self.agentConfig = args[2]
            if len(args) > 3:
                self.instances = args[3]
        else:
            # new-style init: the 3rd argument is `instances`
            self.instances = args[2]

    # `self.hostname` is deprecated, use `datadog_agent.get_hostname()` instead
    self.hostname = datadog_agent.get_hostname()

    # the agent5 'AgentCheck' setup a log attribute.
    self.log = logging.getLogger('%s.%s' % (__name__, self.name))

    # Set proxy settings
    self.proxies = get_requests_proxy(self.agentConfig)
    if not self.init_config:
        self._use_agent_proxy = True
    else:
        self._use_agent_proxy = _is_affirmative(
            self.init_config.get("use_agent_proxy", True))

    self.default_integration_http_timeout = float(self.agentConfig.get('default_integration_http_timeout', 9))

    self._deprecations = {
        'increment': [
            False,
            "DEPRECATION NOTICE: `AgentCheck.increment`/`AgentCheck.decrement` are deprecated, please use " +
            "`AgentCheck.gauge` or `AgentCheck.count` instead, with a different metric name",
        ],
        'device_name': [
            False,
            "DEPRECATION NOTICE: `device_name` is deprecated, please use a `device:` tag in the `tags` list instead",
        ],
        'in_developer_mode': [
            False,
            "DEPRECATION NOTICE: `in_developer_mode` is deprecated, please stop using it.",
        ],
        'no_proxy': [
            False,
            "DEPRECATION NOTICE: The `no_proxy` config option has been renamed "
            "to `skip_proxy` and will be removed in a future release.",
        ],
    }
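# Illustrative only: the two call styles the constructor above accepts
# (the check name and configs below are made up).
#
#   old-style (agent5): name, init_config, agentConfig, instances
#       MyCheck('my_check', {}, {'api_key': '...'}, [{'host': 'localhost'}])
#
#   new-style: name, init_config, instances
#       MyCheck('my_check', {}, [{'host': 'localhost'}])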
def get_instance_proxy(self, instance, uri, proxies=None):
    proxies = proxies if proxies is not None else self.proxies.copy()
    proxies['no'] = get_no_proxy_from_env()

    deprecated_skip = instance.get('no_proxy', None)
    skip = (
        _is_affirmative(instance.get('skip_proxy', False)) or
        _is_affirmative(deprecated_skip)
    )

    if deprecated_skip is not None:
        self.warning(
            'Deprecation notice: The `no_proxy` config option has been renamed '
            'to `skip_proxy` and will be removed in a future release.'
        )

    return config_proxy_skip(proxies, uri, skip)
def _get_config(self, instance):
    # make sure 'rabbitmq_api_url' is present and get parameters
    base_url = instance.get('rabbitmq_api_url', None)
    if not base_url:
        raise Exception('Missing "rabbitmq_api_url" in RabbitMQ config.')
    if not base_url.endswith('/'):
        base_url += '/'
    username = instance.get('rabbitmq_user', 'guest')
    password = instance.get('rabbitmq_pass', 'guest')
    custom_tags = instance.get('tags', [])
    parsed_url = urlparse.urlparse(base_url)
    if not parsed_url.scheme or "://" not in parsed_url.geturl():
        self.log.warning('The rabbit url did not include a protocol, assuming http')
        # urlparse.urljoin cannot add a protocol to the rest of the url for some reason.
        # This still leaves the potential for errors, but such urls would never have been valid, either
        # and it's not likely to be useful to attempt to catch all possible mistakes people could make.
        # urlparse also has a known issue parsing urls with no schema, but a port in the host section,
        # mistakenly taking the host for the schema, hence the additional validation
        base_url = 'http://' + base_url
        parsed_url = urlparse.urlparse(base_url)

    ssl_verify = _is_affirmative(instance.get('ssl_verify', True))
    if not ssl_verify and parsed_url.scheme == 'https':
        self.log.warning('Skipping SSL cert validation for %s based on configuration.' % (base_url))

    # Limit of queues/nodes to collect metrics from
    max_detailed = {
        EXCHANGE_TYPE: int(instance.get('max_detailed_exchanges', MAX_DETAILED_EXCHANGES)),
        QUEUE_TYPE: int(instance.get('max_detailed_queues', MAX_DETAILED_QUEUES)),
        NODE_TYPE: int(instance.get('max_detailed_nodes', MAX_DETAILED_NODES)),
    }

    # List of queues/nodes to collect metrics from
    specified = {
        EXCHANGE_TYPE: {
            'explicit': instance.get('exchanges', []),
            'regexes': instance.get('exchanges_regexes', []),
        },
        QUEUE_TYPE: {
            'explicit': instance.get('queues', []),
            'regexes': instance.get('queues_regexes', []),
        },
        NODE_TYPE: {
            'explicit': instance.get('nodes', []),
            'regexes': instance.get('nodes_regexes', []),
        },
    }

    for object_type, filters in specified.iteritems():
        for filter_type, filter_objects in filters.iteritems():
            if type(filter_objects) != list:
                raise TypeError(
                    "{0} / {0}_regexes parameter must be a list".format(object_type))

    auth = (username, password)

    return base_url, max_detailed, specified, auth, ssl_verify, custom_tags
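# Illustrative instance showing the keys `_get_config` consumes (all values
# are examples only); in the agent this would come from rabbitmq.yaml:
example_instance = {
    'rabbitmq_api_url': 'http://localhost:15672/api/',
    'rabbitmq_user': 'guest',
    'rabbitmq_pass': 'guest',
    'ssl_verify': True,
    'max_detailed_queues': 200,
    'queues': ['my_queue'],
    'queues_regexes': [r'worker_\d+'],
    'tags': ['env:dev'],
}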
def __init__(self, **kwargs):
    self.docker_util = DockerUtil()
    if 'init_config' in kwargs and 'instance' in kwargs:
        init_config = kwargs.get('init_config', {})
        instance = kwargs.get('instance', {})
    else:
        try:
            config_file_path = get_conf_path(KUBERNETES_CHECK_NAME)
            check_config = check_yaml(config_file_path)
            init_config = check_config['init_config'] or {}
            instance = check_config['instances'][0] or {}
        # kubernetes.yaml was not found
        except IOError as ex:
            log.error(ex.message)
            init_config, instance = {}, {}
        except Exception:
            log.error('Kubernetes configuration file is invalid. '
                      'Trying to connect to kubelet with default settings anyway...')
            init_config, instance = {}, {}

    self.method = instance.get('method', KubeUtil.DEFAULT_METHOD)
    self._node_ip = self._node_name = None  # lazy evaluation
    self.host_name = os.environ.get('HOSTNAME')
    self.pod_name = os.environ.get('KUBERNETES_POD_NAME') or self.host_name
    self.tls_settings = self._init_tls_settings(instance)

    # apiserver
    if 'api_server_url' in instance:
        self.kubernetes_api_root_url = instance.get('api_server_url')
    else:
        master_host = os.environ.get('KUBERNETES_SERVICE_HOST') or self.DEFAULT_MASTER_NAME
        master_port = os.environ.get('KUBERNETES_SERVICE_PORT') or self.DEFAULT_MASTER_PORT
        self.kubernetes_api_root_url = 'https://%s:%s' % (master_host, master_port)

    self.kubernetes_api_url = '%s/api/v1' % self.kubernetes_api_root_url

    # Service mapping helper class
    self._service_mapper = PodServiceMapper(self)
    from config import _is_affirmative
    self.collect_service_tag = _is_affirmative(instance.get('collect_service_tags',
                                                            KubeUtil.DEFAULT_COLLECT_SERVICE_TAG))

    # leader status triggers event collection
    self.is_leader = False
    self.leader_elector = None
    self.leader_lease_duration = instance.get('leader_lease_duration')

    # kubelet
    # If kubelet_api_url is None, init_kubelet didn't succeed yet.
    self.init_success = False
    self.kubelet_api_url = None
    self.init_retry_interval = init_config.get('init_retry_interval', DEFAULT_RETRY_INTERVAL)
    self.last_init_retry = None
    self.left_init_retries = init_config.get('init_retries', DEFAULT_INIT_RETRIES) + 1
    self.init_kubelet(instance)

    self.kube_label_prefix = instance.get('label_to_tag_prefix', KubeUtil.DEFAULT_LABEL_PREFIX)
    self.kube_node_labels = instance.get('node_labels_to_host_tags', {})

    # keep track of the latest k8s event we collected and posted
    # default value is 0 but TTL for k8s events is one hour anyways
    self.last_event_collection_ts = 0
def get_stats(self, instance, base_url, object_type, max_detailed, filters, auth=None):
    """
    instance: the check instance
    base_url: the url of the rabbitmq management api (e.g. http://localhost:15672/api)
    object_type: either QUEUE_TYPE or NODE_TYPE
    max_detailed: the limit of objects to collect for this type
    filters: explicit or regexes filters of specified queues or nodes (specified in the yaml file)
    """
    data = self._get_data(urlparse.urljoin(base_url, object_type), auth=auth)

    # Make a copy of this list as we will remove items from it at each
    # iteration
    explicit_filters = list(filters['explicit'])
    regex_filters = filters['regexes']

    """ data is a list of nodes or queues:
    data = [
        {'status': 'running', 'node': 'rabbit@host', 'name': 'queue1', 'consumers': 0, 'vhost': '/',
         'backing_queue_status': {'q1': 0, 'q3': 0, 'q2': 0, 'q4': 0, 'avg_ack_egress_rate': 0.0,
                                  'ram_msg_count': 0, 'ram_ack_count': 0, 'len': 0, 'persistent_count': 0,
                                  'target_ram_count': 'infinity', 'next_seq_id': 0,
                                  'delta': ['delta', 'undefined', 0, 'undefined'], 'pending_acks': 0,
                                  'avg_ack_ingress_rate': 0.0, 'avg_egress_rate': 0.0, 'avg_ingress_rate': 0.0},
         'durable': True, 'idle_since': '2013-10-03 13:38:18', 'exclusive_consumer_tag': '',
         'arguments': {}, 'memory': 10956, 'policy': '', 'auto_delete': False},
        {'status': 'running', 'node': 'rabbit@host', 'name': 'queue10', 'consumers': 0, 'vhost': '/',
         'backing_queue_status': {'q1': 0, 'q3': 0, 'q2': 0, 'q4': 0, 'avg_ack_egress_rate': 0.0,
                                  'ram_msg_count': 0, 'ram_ack_count': 0, 'len': 0, 'persistent_count': 0,
                                  'target_ram_count': 'infinity', 'next_seq_id': 0,
                                  'delta': ['delta', 'undefined', 0, 'undefined'], 'pending_acks': 0,
                                  'avg_ack_ingress_rate': 0.0, 'avg_egress_rate': 0.0, 'avg_ingress_rate': 0.0},
         'durable': True, 'idle_since': '2013-10-03 13:38:18', 'exclusive_consumer_tag': '',
         'arguments': {}, 'memory': 10956, 'policy': '', 'auto_delete': False},
        {'status': 'running', 'node': 'rabbit@host', 'name': 'queue11', 'consumers': 0, 'vhost': '/',
         'backing_queue_status': {'q1': 0, 'q3': 0, 'q2': 0, 'q4': 0, 'avg_ack_egress_rate': 0.0,
                                  'ram_msg_count': 0, 'ram_ack_count': 0, 'len': 0, 'persistent_count': 0,
                                  'target_ram_count': 'infinity', 'next_seq_id': 0,
                                  'delta': ['delta', 'undefined', 0, 'undefined'], 'pending_acks': 0,
                                  'avg_ack_ingress_rate': 0.0, 'avg_egress_rate': 0.0, 'avg_ingress_rate': 0.0},
         'durable': True, 'idle_since': '2013-10-03 13:38:18', 'exclusive_consumer_tag': '',
         'arguments': {}, 'memory': 10956, 'policy': '', 'auto_delete': False},
        ...
    ]
    """
    if len(explicit_filters) > max_detailed:
        raise Exception("The maximum number of %s you can specify is %d." % (object_type, max_detailed))

    # a list of queues/nodes is specified. We process only those
    if explicit_filters or regex_filters:
        matching_lines = []
        for data_line in data:
            name = data_line.get("name")
            if name in explicit_filters:
                matching_lines.append(data_line)
                explicit_filters.remove(name)
                continue

            match_found = False
            for p in regex_filters:
                match = re.search(p, name)
                if match:
                    if _is_affirmative(instance.get("tag_families", False)) and match.groups():
                        data_line["queue_family"] = match.groups()[0]
                    matching_lines.append(data_line)
                    match_found = True
                    break

            if match_found:
                continue

            # Absolute names work only for queues
            if object_type != QUEUE_TYPE:
                continue
            absolute_name = '%s/%s' % (data_line.get("vhost"), name)
            if absolute_name in explicit_filters:
                matching_lines.append(data_line)
                explicit_filters.remove(absolute_name)
                continue

            for p in regex_filters:
                match = re.search(p, absolute_name)
                if match:
                    if _is_affirmative(instance.get("tag_families", False)) and match.groups():
                        data_line["queue_family"] = match.groups()[0]
                    matching_lines.append(data_line)
                    match_found = True
                    break

            if match_found:
                continue

        data = matching_lines

    # if no filters are specified, check everything according to the limits
    if len(data) > ALERT_THRESHOLD * max_detailed:
        # Post a message on the dogweb stream to warn
        self.alert(base_url, max_detailed, len(data), object_type)

    if len(data) > max_detailed:
        # Display a warning in the info page
        self.warning("Too many queues to fetch. You must choose the %s you are interested in "
                     "by editing the rabbitmq.yaml configuration file or get in touch with "
                     "Datadog Support" % object_type)

    for data_line in data[:max_detailed]:
        # We truncate the list of nodes/queues if it's above the limit
        self._get_metrics(data_line, object_type)
def body_lines(self):
    # Metadata whitelist
    metadata_whitelist = ['hostname', 'fqdn', 'ipv4', 'instance-id']

    lines = ['Clocks', '======', '']
    try:
        ntp_offset, ntp_styles = get_ntp_info()
        lines.append('  ' + style('NTP offset', *ntp_styles) + ': ' +
                     style('%s s' % round(ntp_offset, 4), *ntp_styles))
    except Exception as e:
        lines.append('  NTP offset: Unknown (%s)' % str(e))
    lines.append('  System UTC time: ' + datetime.datetime.utcnow().__str__())
    lines.append('')

    # Paths to checks.d/conf.d
    lines += ['Paths', '=====', '']
    osname = config.get_os()
    try:
        confd_path = config.get_confd_path(osname)
    except config.PathNotFound:
        confd_path = 'Not found'
    try:
        checksd_path = config.get_checksd_path(osname)
    except config.PathNotFound:
        checksd_path = 'Not found'
    lines.append('  conf.d: ' + confd_path)
    lines.append('  checks.d: ' + checksd_path)
    lines.append('')

    # Hostnames
    lines += ['Hostnames', '=========', '']
    if not self.host_metadata:
        lines.append("  No host information available yet.")
    else:
        for key, host in self.host_metadata.iteritems():
            for whitelist_item in metadata_whitelist:
                if whitelist_item in key:
                    lines.append("  " + key + ": " + host)
                    break
    lines.append('')

    # Checks.d Status
    lines += ['Checks', '======', '']
    check_statuses = self.check_statuses + get_jmx_status()
    if not check_statuses:
        lines.append("  No checks have run yet.")
    else:
        for cs in check_statuses:
            check_lines = ['  ' + cs.name, '  ' + '-' * len(cs.name)]
            if cs.init_failed_error:
                check_lines.append("    - initialize check class [%s]: %s" %
                                   (style(STATUS_ERROR, 'red'), repr(cs.init_failed_error)))
                if self.verbose and cs.init_failed_traceback:
                    check_lines.extend('      ' + line
                                       for line in cs.init_failed_traceback.split('\n'))
            else:
                for s in cs.instance_statuses:
                    c = 'green'
                    if s.has_warnings():
                        c = 'yellow'
                    if s.has_error():
                        c = 'red'
                    line = "    - instance #%s [%s]" % (s.instance_id, style(s.status, c))
                    if s.has_error():
                        line += u": %s" % s.error
                    if s.metric_count is not None:
                        line += " collected %s metrics" % s.metric_count
                    if s.instance_check_stats is not None:
                        line += " Last run duration: %s" % s.instance_check_stats.get('run_time')
                    check_lines.append(line)

                    if s.has_warnings():
                        for warning in s.warnings:
                            warn = warning.split('\n')
                            if not len(warn):
                                continue
                            check_lines.append(u"        %s: %s" % (style("Warning", 'yellow'), warn[0]))
                            check_lines.extend(u"        %s" % l for l in warn[1:])
                    if self.verbose and s.traceback is not None:
                        check_lines.extend('      ' + line for line in s.traceback.split('\n'))

                check_lines += [
                    "    - Collected %s metric%s, %s event%s & %s service check%s" % (
                        cs.metric_count, plural(cs.metric_count),
                        cs.event_count, plural(cs.event_count),
                        cs.service_check_count, plural(cs.service_check_count)),
                ]

                if cs.check_stats is not None:
                    check_lines += [
                        "    - Stats: %s" % pretty_statistics(cs.check_stats)
                    ]

                if cs.library_versions is not None:
                    check_lines += ["    - Dependencies:"]
                    for library, version in cs.library_versions.iteritems():
                        check_lines += ["        - %s: %s" % (library, version)]

            check_lines += [""]
            lines += check_lines

    # Metadata status
    metadata_enabled = _is_affirmative(get_config().get('display_service_metadata', False))

    if metadata_enabled:
        lines += ["", "Service metadata", "================", ""]
        if not check_statuses:
            lines.append("  No checks have run yet.")
        else:
            meta_lines = []
            for cs in check_statuses:
                # Check title
                check_line = ['  ' + cs.name, '  ' + '-' * len(cs.name)]
                instance_lines = []
                for i, meta in enumerate(cs.service_metadata):
                    if not meta:
                        continue
                    instance_lines += ["    - instance #%s:" % i]
                    for k, v in meta.iteritems():
                        instance_lines += ["        - %s: %s" % (k, v)]
                if instance_lines:
                    check_line += instance_lines
                    meta_lines += check_line
            if meta_lines:
                lines += meta_lines
            else:
                lines.append("  No metadata were collected.")

    # Emitter status
    lines += ["", "Emitters", "========", ""]
    if not self.emitter_statuses:
        lines.append("  No emitters have run yet.")
    else:
        for es in self.emitter_statuses:
            c = 'green'
            if es.has_error():
                c = 'red'
            line = "  - %s [%s]" % (es.name, style(es.status, c))
            if es.status != STATUS_OK:
                line += ": %s" % es.error
            lines.append(line)

    return lines
def check(self, instance):
    if not self.kubeutil.init_success:
        if self.kubeutil.left_init_retries > 0:
            self.kubeutil.init_kubelet(instance)
            self.log.warning("Kubelet client is not initialized, Kubernetes check is paused.")
            return
        else:
            raise Exception("Unable to initialize Kubelet client. Try setting the host parameter. "
                            "The Kubernetes check failed permanently.")

    # Leader election
    self.refresh_leader_status(instance)

    self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
    enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
    self.enabled_gauges = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges]
    enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
    self.enabled_rates = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_rates]

    self.publish_aliases = _is_affirmative(instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
    self.use_histogram = _is_affirmative(instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
    self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
    self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]
    # initialized by _filter_containers
    self._filtered_containers = set()

    try:
        pods_list = self.kubeutil.retrieve_pods_list()
    except Exception:
        pods_list = None

    # kubelet health checks
    self._perform_kubelet_checks(self.kubeutil.kube_health_url, instance)

    if pods_list is not None:
        # Will not fail if cAdvisor is not available
        self._update_pods_metrics(instance, pods_list)
        # cAdvisor & kubelet metrics, will fail if port 4194 is not open
        try:
            if int(instance.get('port', KubeUtil.DEFAULT_CADVISOR_PORT)) > 0:
                self._update_metrics(instance, pods_list)
        except ConnectionError:
            self.warning("Can't access the cAdvisor metrics, performance metrics and"
                         " limits/requests will not be collected. Please setup"
                         " your kubelet with the --cadvisor-port=4194 option, or set port to 0"
                         " in this check's configuration to disable cAdvisor lookup.")
        except Exception as err:
            self.log.warning("Error while getting performance metrics: %s", str(err))

    # kubernetes events
    if self.event_retriever is not None:
        try:
            events = self.event_retriever.get_event_array()
            changed_cids = self.kubeutil.process_events(events, podlist=pods_list)
            if (changed_cids and self._sd_backend):
                self._sd_backend.update_checks(changed_cids)
            if events and self._collect_events:
                self._update_kube_events(instance, pods_list, events)
        except Exception as ex:
            self.log.error("Event collection failed: %s", str(ex))
def check(self, instance):
    if 'apache_status_url' not in instance:
        raise Exception("Missing 'apache_status_url' in Apache config")

    url = self.assumed_url.get(instance['apache_status_url'], instance['apache_status_url'])

    connect_timeout = int(instance.get('connect_timeout', 5))
    receive_timeout = int(instance.get('receive_timeout', 15))

    tags = instance.get('tags', [])
    disable_ssl_validation = _is_affirmative(instance.get('disable_ssl_validation', False))

    auth = None
    if 'apache_user' in instance and 'apache_password' in instance:
        auth = (instance['apache_user'], instance['apache_password'])

    # Submit a service check for status page availability.
    parsed_url = urlparse.urlparse(url)
    apache_host = parsed_url.hostname
    apache_port = parsed_url.port or 80
    service_check_name = 'apache.can_connect'
    service_check_tags = ['host:%s' % apache_host, 'port:%s' % apache_port]
    try:
        self.log.debug('apache check initiating request, connect timeout %d receive %d' %
                       (connect_timeout, receive_timeout))
        r = requests.get(url, auth=auth, headers=headers(self.agentConfig),
                         verify=not disable_ssl_validation,
                         timeout=(connect_timeout, receive_timeout))
        r.raise_for_status()
    except Exception as e:
        self.log.warning("Caught exception %s" % str(e))
        self.service_check(service_check_name, AgentCheck.CRITICAL, tags=service_check_tags)
        raise
    else:
        self.service_check(service_check_name, AgentCheck.OK, tags=service_check_tags)
        self.log.debug("apache check succeeded")

    response = r.content
    metric_count = 0
    # Loop through and extract the numerical values
    for line in response.splitlines():
        values = line.split(': ')
        if len(values) == 2:  # match
            metric, value = values
            try:
                value = float(value)
            except ValueError:
                continue

            # Special case: kBytes => bytes
            if metric == 'Total kBytes':
                value = value * 1024

            # Send metric as a gauge, if applicable
            if metric in self.GAUGES:
                metric_count += 1
                metric_name = self.GAUGES[metric]
                self.gauge(metric_name, value, tags=tags)

            # Send metric as a rate, if applicable
            if metric in self.RATES:
                metric_count += 1
                metric_name = self.RATES[metric]
                self.rate(metric_name, value, tags=tags)

    if metric_count == 0:
        if self.assumed_url.get(instance['apache_status_url'], None) is None and url[-5:] != '?auto':
            self.assumed_url[instance['apache_status_url']] = '%s?auto' % url
            self.warning("Assuming url was not correct. Trying to add ?auto suffix to the url")
            self.check(instance)
        else:
            raise Exception("No metrics were fetched for this instance. Make sure that %s is the proper url."
                            % instance['apache_status_url'])
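# For reference, the Apache mod_status "?auto" payload parsed above looks
# like the following (values illustrative):
#
#   Total Accesses: 12345
#   Total kBytes: 6789
#   Uptime: 3600
#   ReqPerSec: 3.42
#   BytesPerSec: 1930.5
#   BusyWorkers: 5
#   IdleWorkers: 45
#
# Each "metric: value" line is split on ': ', routed through the GAUGES/RATES
# maps, and "Total kBytes" is converted to bytes before submission.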
def get_instance_proxy(self, instance, uri):
    proxies = self.proxies.copy()
    proxies['no'] = get_no_proxy_from_env()

    return config_proxy_skip(
        proxies,
        uri,
        _is_affirmative(instance.get('no_proxy', False))
    )
def init(self):
    try:
        instance = self.instances[0]

        self.docker_util = DockerUtil()
        self.docker_client = self.docker_util.client
        self.docker_gateway = DockerUtil.get_gateway()
        self.metadata_collector = MetadataCollector()

        if Platform.is_k8s():
            try:
                self.kubeutil = KubeUtil()
            except Exception as ex:
                self.kubeutil = None
                self.log.error("Couldn't instantiate the kubernetes client, "
                               "subsequent kubernetes calls will fail as well. Error: %s" % str(ex))

        # We configure the check with the right cgroup settings for this host
        # Just needs to be done once
        self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
        self._latest_size_query = 0
        self._filtered_containers = set()
        self._disable_net_metrics = False

        # Set tagging options
        self.custom_tags = instance.get("tags", [])
        self.collect_labels_as_tags = instance.get("collect_labels_as_tags", DEFAULT_LABELS_AS_TAGS)
        self.kube_pod_tags = {}

        self.use_histogram = _is_affirmative(instance.get('use_histogram', False))
        performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

        self.tag_names = {
            CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
            PERFORMANCE: performance_tags,
            IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
        }

        # Set filtering settings
        if self.docker_util.filtering_enabled:
            self.tag_names[FILTERED] = self.docker_util.filtered_tag_names

        # Container network mapping cache
        self.network_mappings = {}

        # get the health check whitelist
        self.whitelist_patterns = None
        health_scs_whitelist = instance.get('health_service_check_whitelist', [])
        if health_scs_whitelist:
            patterns, whitelist_tags = compile_filter_rules(health_scs_whitelist)
            self.whitelist_patterns = set(patterns)
            self.tag_names[HEALTHCHECK] = set(whitelist_tags)

        # Other options
        self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
        self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
        self.collect_container_count = _is_affirmative(instance.get('collect_container_count', False))
        self.collect_volume_count = _is_affirmative(instance.get('collect_volume_count', False))
        self.collect_events = _is_affirmative(instance.get('collect_events', True))
        self.event_attributes_as_tags = instance.get('event_attributes_as_tags', [])
        self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
        self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False))
        self.collect_exit_codes = _is_affirmative(instance.get('collect_exit_codes', False))
        self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()

        self.capped_metrics = instance.get('capped_metrics')
    except Exception as e:
        self.log.critical(e)
        self.warning("Initialization failed. Will retry at next iteration")
    else:
        self.init_success = True
                pretty_statistics(cs.check_stats)
            ]
            if cs.library_versions is not None:
                check_lines += [" - Dependencies:"]
                for library, version in cs.library_versions.iteritems():
                    check_lines += [" - %s: %s" % (library, version)]
            check_lines += [""]
            lines += check_lines

        metadata_enabled = _is_affirmative(get_config().get('display_service_metadata', False))
        if metadata_enabled:
            lines += ["", "Service metadata", "================", ""]
            if not check_statuses:
                lines.append(" No checks have run yet.")
            else:
                meta_lines = []
                for cs in check_statuses:
                    check_line = [' ' + cs.name, ' ' + '-' * len(cs.name)]
                    instance_lines = []
                    for i, meta in enumerate(cs.service_metadata):
                        if not meta:
                            continue
                        instance_lines += [" - instance #%s:" % i]
def init(self):
    try:
        instance = self.instances[0]

        self.docker_util = DockerUtil()
        self.docker_client = self.docker_util.client
        if self.is_k8s():
            self.kubeutil = KubeUtil()

        self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
        self.cgroup_listing_retries = 0
        self._latest_size_query = 0
        self._filtered_containers = set()
        self._disable_net_metrics = False

        # Set tagging options
        self.custom_tags = instance.get("tags", [])
        self.collect_labels_as_tags = instance.get("collect_labels_as_tags", [])
        self.kube_labels = {}

        self.use_histogram = _is_affirmative(instance.get('use_histogram', False))
        performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

        self.tag_names = {
            CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
            PERFORMANCE: performance_tags,
            IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
        }

        # Set filtering settings
        if not instance.get("exclude"):
            self._filtering_enabled = False
            if instance.get("include"):
                self.log.warning("You must specify an exclude section to enable filtering")
        else:
            self._filtering_enabled = True
            include = instance.get("include", [])
            exclude = instance.get("exclude", [])
            self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(include, exclude)
            self.tag_names[FILTERED] = _filtered_tag_names

        # Other options
        self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
        self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
        self.collect_events = _is_affirmative(instance.get('collect_events', True))
        self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
        self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False))
        self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()
        self.ecs_tags = {}
    except Exception as e:
        self.log.critical(e)
        self.warning("Initialization failed. Will retry at next iteration")
def check(self, instance): if 'url' not in instance: raise Exception('Mesos instance missing "url" value.') url = instance['url'] instance_tags = instance.get('tags', []) tasks = instance.get('tasks', []) default_timeout = self.init_config.get('default_timeout', 5) timeout = float(instance.get('timeout', default_timeout)) master_port = instance.get("master_port", DEFAULT_MASTER_PORT) ssl_verify = not _is_affirmative( instance.get('disable_ssl_validation', False)) state_metrics = self._get_constant_attributes(url, timeout, master_port, ssl_verify) tags = None if state_metrics is None: state_metrics = self._get_state(url, timeout, ssl_verify) if state_metrics: tags = [ 'mesos_pid:{0}'.format(state_metrics['pid']), 'mesos_node:slave', ] if self.cluster_name: tags.append('mesos_cluster:{0}'.format(self.cluster_name)) tags += instance_tags for task in tasks: for framework in state_metrics['frameworks']: for executor in framework['executors']: for t in executor['tasks']: if task.lower() in t['name'].lower( ) and t['slave_id'] == state_metrics['id']: task_tags = ['task_name:' + t['name']] + tags self.service_check( t['name'] + '.ok', self.TASK_STATUS[t['state']], tags=task_tags) for key_name, ( metric_name, metric_func ) in self.TASK_METRICS.iteritems(): metric_func(self, metric_name, t['resources'][key_name], tags=task_tags) stats_metrics = self._get_stats(url, timeout, ssl_verify) if stats_metrics: tags = tags if tags else instance_tags metrics = [ self.SLAVE_TASKS_METRICS, self.SYSTEM_METRICS, self.SLAVE_RESOURCE_METRICS, self.SLAVE_EXECUTORS_METRICS, self.STATS_METRICS ] for m in metrics: for key_name, (metric_name, metric_func) in m.iteritems(): metric_func(self, metric_name, stats_metrics[key_name], tags=tags) self.service_check_needed = True
def check(self, instance):
    # For calculating lag, we have to fetch offsets from both kafka and
    # zookeeper. There's a potential race condition because whichever one we
    # check first may be outdated by the time we check the other. Better to
    # check consumer offsets before checking broker offsets because the worst
    # case is that we overstate consumer lag a little. Doing it the other way
    # can understate consumer lag to the point of having negative consumer
    # lag, which just creates confusion because it's theoretically impossible.

    # Fetch consumer group offsets from Zookeeper
    zk_hosts_ports = instance.get('zk_connect_str')
    zk_prefix = instance.get('zk_prefix', '')
    zk_interval = int(instance.get('zk_iteration_ival', 0))
    get_kafka_consumer_offsets = _is_affirmative(
        instance.get('kafka_consumer_offsets', zk_hosts_ports is None))

    # If monitor_unlisted_consumer_groups is True, fetch all groups stored in ZK
    consumer_groups = None
    if instance.get('monitor_unlisted_consumer_groups', False):
        consumer_groups = None
    elif 'consumer_groups' in instance:
        consumer_groups = self._read_config(instance, 'consumer_groups',
                                            cast=self._validate_consumer_groups)

    zk_consumer_offsets = None
    if zk_hosts_ports and \
            self._should_zk(zk_hosts_ports, zk_interval, get_kafka_consumer_offsets):
        zk_consumer_offsets, consumer_groups = self._get_zk_consumer_offsets(
            zk_hosts_ports, consumer_groups, zk_prefix)

    topics = defaultdict(set)
    kafka_consumer_offsets = None

    cli = self._get_kafka_client(instance)
    cli._maybe_refresh_metadata()
    kafka_version = self._get_kafka_version(cli)

    if get_kafka_consumer_offsets:
        # For now, consumer groups are mandatory if not using ZK
        if not zk_hosts_ports and not consumer_groups:
            raise BadKafkaConsumerConfiguration(
                'Invalid configuration - if you\'re not collecting '
                'offsets from ZK you _must_ specify consumer groups')
        if self._kafka_compatible(kafka_version):
            kafka_consumer_offsets, topics = self._get_kafka_consumer_offsets(
                instance, consumer_groups)

    if not topics:
        # val = {'consumer_group': {'topic': [0, 1]}}
        for _, tps in consumer_groups.iteritems():
            for topic, partitions in tps.iteritems():
                topics[topic].update(partitions)

    warn_msg = """ Discovered %s partition contexts - this exceeds the maximum
                   number of contexts permitted by the check. Please narrow your
                   target by specifying in your YAML what consumer groups, topics
                   and partitions you wish to monitor."""
    if zk_consumer_offsets and len(zk_consumer_offsets) > self.context_limit:
        self.warning(warn_msg % len(zk_consumer_offsets))
        return
    if kafka_consumer_offsets and len(kafka_consumer_offsets) > self.context_limit:
        self.warning(warn_msg % len(kafka_consumer_offsets))
        return

    # Fetch the broker highwater offsets
    try:
        highwater_offsets, topic_partitions_without_a_leader = self._get_broker_offsets(
            instance, topics)
    except Exception:
        self.log.exception('There was a problem collecting the high watermark offsets')
        return

    # Report the broker highwater offset
    for (topic, partition), highwater_offset in highwater_offsets.iteritems():
        broker_tags = ['topic:%s' % topic, 'partition:%s' % partition]
        self.gauge('kafka.broker_offset', highwater_offset, tags=broker_tags)

    # Report the consumer group offsets and consumer lag
    if zk_consumer_offsets:
        self._report_consumer_metrics(highwater_offsets, zk_consumer_offsets,
                                      topic_partitions_without_a_leader, ['source:zk'])
    if kafka_consumer_offsets:
        self._report_consumer_metrics(highwater_offsets, kafka_consumer_offsets,
                                      topic_partitions_without_a_leader, ['source:kafka'])
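# A hedged sketch of how consumer lag falls out of the two offset fetches
# above: lag is the broker highwater offset minus the group's committed
# offset, per (topic, partition). The function name and dict shapes are
# assumptions for illustration; the real reporting happens inside
# _report_consumer_metrics, which is not shown here.
def consumer_lag(highwater_offsets, consumer_offsets):
    lag = {}
    for (group, topic, partition), consumer_offset in consumer_offsets.iteritems():
        highwater = highwater_offsets.get((topic, partition))
        if highwater is not None:
            # Fetching consumer offsets first means a racing producer can only
            # inflate this difference, never drive it negative.
            lag[(group, topic, partition)] = highwater - consumer_offset
    return lag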
def check(self, instance): """Run the Docker check for one instance.""" if not self.init_success: # Initialization can fail if cgroups are not ready or docker daemon is down. So we retry if needed # https://github.com/DataDog/dd-agent/issues/1896 self.init() if self.docker_client is None: message = "Unable to connect to Docker daemon" self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, message=message) return if not self.init_success: # Initialization failed, will try later return try: # Report image metrics if self.collect_image_stats: self._count_and_weigh_images() if Platform.is_k8s(): self.kube_pod_tags = {} if self.kubeutil: try: self.kube_pod_tags = self.kubeutil.get_kube_pod_tags() except Exception as e: self.log.warning('Could not retrieve kubernetes labels: %s' % str(e)) # containers running with custom cgroups? custom_cgroups = _is_affirmative(instance.get('custom_cgroups', False)) # Get the list of containers and the index of their names health_service_checks = True if self.whitelist_patterns else False containers_by_id = self._get_and_count_containers(custom_cgroups, health_service_checks) containers_by_id = self._crawl_container_pids(containers_by_id, custom_cgroups) # Send events from Docker API if self.collect_events or self._service_discovery or not self._disable_net_metrics or self.collect_exit_codes: self._process_events(containers_by_id) # Report performance container metrics (cpu, mem, net, io) self._report_performance_metrics(containers_by_id) if self.collect_container_size: self._report_container_size(containers_by_id) if self.collect_container_count: self._report_container_count(containers_by_id) if self.collect_volume_count: self._report_volume_count() # Collect disk stats from Docker info command if self.collect_disk_stats: self._report_disk_stats() if health_service_checks: self._send_container_healthcheck_sc(containers_by_id) except: self.log.exception("Docker_daemon check failed") self.warning("Check failed. Will retry at next iteration") if self.capped_metrics: self.filter_capped_metrics()
def check(self, instance):
    name = instance.get('name', None)
    tags = instance.get('tags', [])
    exact_match = _is_affirmative(instance.get('exact_match', True))
    search_string = instance.get('search_string', None)
    ignore_ad = _is_affirmative(instance.get('ignore_denied_access', True))
    pid = instance.get('pid')

    if not isinstance(search_string, list) and pid is None:
        raise KeyError('"search_string" parameter should be a list')

    # FIXME 6.x remove me
    if pid is None:
        if "All" in search_string:
            self.warning('Deprecated: Having "All" in your search_string will '
                         'greatly reduce the performance of the check and '
                         'will be removed in a future version of the agent.')

    if name is None:
        raise KeyError('The "name" of process groups is mandatory')

    if search_string is not None:
        pids = self.find_pids(name, search_string, exact_match, ignore_ad=ignore_ad)
    elif pid is not None:
        pids = [psutil.Process(pid)]
    else:
        raise ValueError('The "search_string" or "pid" options are required for process identification')

    proc_state = self.get_process_state(name, pids)

    # FIXME 6.x remove the `name` tag
    tags.extend(['process_name:%s' % name, name])

    self.log.debug('ProcessCheck: process %s analysed', name)
    self.gauge('system.processes.number', len(pids), tags=tags)

    for attr, mname in ATTR_TO_METRIC.iteritems():
        vals = [x for x in proc_state[attr] if x is not None]
        # skip [] (no values collected for this attribute)
        if vals:
            if attr == 'run_time':
                self.gauge('system.processes.%s.avg' % mname, sum(vals) / len(vals), tags=tags)
                self.gauge('system.processes.%s.max' % mname, max(vals), tags=tags)
                self.gauge('system.processes.%s.min' % mname, min(vals), tags=tags)
            # FIXME 6.x: change this prefix?
            else:
                self.gauge('system.processes.%s' % mname, sum(vals), tags=tags)

    for attr, mname in ATTR_TO_METRIC_RATE.iteritems():
        vals = [x for x in proc_state[attr] if x is not None]
        if vals:
            self.rate('system.processes.%s' % mname, sum(vals), tags=tags)

    self._process_service_check(name, len(pids), instance.get('thresholds', None))
def check(self, instance):
    # Get properties from conf file
    rm_address = instance.get('resourcemanager_uri')
    if rm_address is None:
        raise Exception('The ResourceManager URL must be specified in the instance configuration')
    collect_task_metrics = _is_affirmative(instance.get('collect_task_metrics', False))
    ssl_verify = _is_affirmative(instance.get('ssl_verify', True))

    # Get additional tags from the conf file
    # (`or []` handles the case when the YAML `tags` key has an empty value)
    custom_tags = instance.get('tags') or []
    tags = list(set(custom_tags)) if custom_tags else []

    # Get the cluster name from the conf file
    cluster_name = instance.get('cluster_name')
    if cluster_name is None:
        self.warning("The cluster_name must be specified in the instance configuration, "
                     "defaulting to '%s'" % (DEFAULT_CUSTER_NAME))
        cluster_name = DEFAULT_CUSTER_NAME
    tags.append('cluster_name:%s' % cluster_name)

    # Get the running MR applications from YARN
    running_apps = self._get_running_app_ids(rm_address, ssl_verify=ssl_verify)

    # Report success after gathering all metrics from the ResourceManager
    self.service_check(YARN_SERVICE_CHECK, AgentCheck.OK,
                       tags=['url:%s' % rm_address] + custom_tags,
                       message='Connection to ResourceManager "%s" was successful' % rm_address)

    # Get the applications from the application master
    running_jobs = self._mapreduce_job_metrics(running_apps, tags, ssl_verify=ssl_verify)

    # Get job counter metrics
    self._mapreduce_job_counters_metrics(running_jobs, tags, ssl_verify=ssl_verify)

    # Get task metrics
    if collect_task_metrics:
        self._mapreduce_task_metrics(running_jobs, tags, ssl_verify=ssl_verify)

    # Report success after gathering all metrics from the Application Master
    if running_jobs:
        job_id, metrics = running_jobs.items()[0]
        am_address = self._get_url_base(metrics['tracking_url'])
        self.service_check(MAPREDUCE_SERVICE_CHECK, AgentCheck.OK,
                           tags=['url:%s' % am_address] + custom_tags,
                           message='Connection to ApplicationManager "%s" was successful' % am_address)
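# _get_url_base is not shown in this snippet; given how it is applied to
# `tracking_url`, a plausible minimal implementation keeps only the scheme
# and network location. This is an assumption for illustration, not the
# check's actual helper.
from urlparse import urlparse

def get_url_base(url):
    parsed = urlparse(url)
    return '%s://%s' % (parsed.scheme, parsed.netloc)

# e.g. get_url_base('http://rm.example:8088/proxy/application_1/') returns 'http://rm.example:8088'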
def run(self, config=None):
    """Main loop of the collector"""
    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    if not Platform.is_windows():
        # A SIGUSR1 signals an exit with an autorestart
        signal.signal(signal.SIGUSR1, self._handle_sigusr1)
        # Handle Keyboard Interrupt
        signal.signal(signal.SIGINT, self._handle_sigterm)
        # A SIGHUP signals a configuration reload
        signal.signal(signal.SIGHUP, self._handle_sighup)
    else:
        sdk_integrations = get_sdk_integration_paths()
        for name, path in sdk_integrations.iteritems():
            lib_path = os.path.join(path, 'lib')
            if os.path.exists(lib_path):
                sys.path.append(lib_path)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)

    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    systemStats = get_system_stats(
        proc_path=self._agentConfig.get('procfs_path', '/proc').rstrip('/'))
    emitters = self._get_emitters()

    # Initialize service discovery
    if self._agentConfig.get('service_discovery'):
        self.sd_backend = get_sd_backend(self._agentConfig)

    if self.sd_backend and _is_affirmative(self._agentConfig.get('sd_jmx_enable', False)):
        pipe_path = get_jmx_pipe_path()
        if Platform.is_windows():
            pipe_name = pipe_path.format(pipename=SD_PIPE_NAME)
        else:
            pipe_name = os.path.join(pipe_path, SD_PIPE_NAME)

        if os.access(pipe_path, os.W_OK):
            if not os.path.exists(pipe_name):
                os.mkfifo(pipe_name)
            # open RW to avoid blocking (we will only write)
            self.sd_pipe = os.open(pipe_name, os.O_RDWR)

            # Initialize Supervisor proxy
            self.supervisor_proxy = self._get_supervisor_socket(self._agentConfig)
        else:
            log.debug('Unable to create pipe in temporary directory. '
                      'JMX service discovery disabled.')

    # Load the checks.d checks
    self._checksd = load_check_directory(self._agentConfig, hostname)

    # Load JMX configs if available
    if self._jmx_service_discovery_enabled:
        self.sd_pipe_jmx_configs(hostname)

    # Initialize the Collector
    self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile
    try:
        self.collector_profile_interval = int(
            self._agentConfig.get('collector_profile_interval',
                                  DEFAULT_COLLECTOR_PROFILE_INTERVAL))
    except ValueError:
        log.warn('collector_profile_interval is invalid. '
                 'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL)
        self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

    # Configure the watchdog.
    self.check_frequency = int(self._agentConfig['check_freq'])
    watchdog = self._get_watchdog(self.check_frequency)

    # Initialize the auto-restarter
    self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    self.allow_profiling = self._agentConfig.get('allow_profiling', True)

    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        # Setup profiling if necessary
        if self.allow_profiling and self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        if self.reload_configs_flag:
            if isinstance(self.reload_configs_flag, set):
                self.reload_configs(checks_to_reload=self.reload_configs_flag)
            else:
                self.reload_configs()

        # JMXFetch restarts should prompt re-piping *all* JMX configs
        if self._jmx_service_discovery_enabled and \
                (not self.reload_configs_flag or isinstance(self.reload_configs_flag, set)):
            try:
                jmx_launch = JMXFetch._get_jmx_launchtime()
                if self.last_jmx_piped and self.last_jmx_piped < jmx_launch:
                    self.sd_pipe_jmx_configs(hostname)
            except Exception as e:
                log.debug("could not stat JMX launch file: %s", e)

        # Do the work. Pass `configs_reloaded` to let the collector know if it needs to
        # look for the AgentMetrics check and pop it out.
        _, continue_immediately = self.collector.run(
            checksd=self._checksd,
            start_event=self.start_event,
            configs_reloaded=True if self.reload_configs_flag else False)

        self.reload_configs_flag = False

        # Look for changes in the config template store.
        # The self.sd_backend.reload_check_configs flag is set
        # to True if a config reload is needed.
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
                not self.sd_backend.reload_check_configs:
            try:
                self.sd_backend.reload_check_configs = get_config_store(
                    self._agentConfig).crawl_config_template()
            except Exception as e:
                log.warn('Something went wrong while looking for config template changes: %s' % str(e))

        # Check if we should run service discovery.
        # The `reload_check_configs` flag can be set through the docker_daemon check or
        # using ConfigStore.crawl_config_template
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
                self.sd_backend.reload_check_configs:
            self.reload_configs_flag = self.sd_backend.reload_check_configs
            self.sd_backend.reload_check_configs = False

        if profiled:
            if collector_profiled_runs >= self.collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for the next loop if we will continue, otherwise exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            if not continue_immediately:
                log.debug("Sleeping for {0} seconds".format(self.check_frequency))
                time.sleep(self.check_frequency)
            else:
                log.debug("Continuing immediately")

    # Now clean-up.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
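# A minimal, self-contained distillation of the loop's pacing above, under
# assumed interfaces: collector.run() returns (payload, continue_immediately),
# watchdog.reset() proves liveness, and should_run() gates the loop. This is
# a sketch of the control flow, not the collector's actual API.
import time

def run_loop(collector, watchdog, check_frequency, should_run):
    while should_run():
        _, continue_immediately = collector.run()
        if watchdog:
            watchdog.reset()  # prove liveness so the watchdog doesn't kill us
        if not continue_immediately:
            time.sleep(check_frequency)  # pace the next collection cycle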
def _check_connectivity_to_master(self, instance): url = instance.get('gitlab_url') if url is None: # Simply ignore this service check if not configured return parsed_url = urlparse.urlparse(url) gitlab_host = parsed_url.hostname gitlab_port = parsed_url.port or 80 service_check_tags = [ 'gitlab_host:%s' % gitlab_host, 'gitlab_port:%s' % gitlab_port ] ## Load the ssl configuration ssl_params = { 'ssl_cert_validation': _is_affirmative(instance.get('ssl_cert_validation', True)), 'ssl_ca_certs': instance.get('ssl_ca_certs'), } for key, param in ssl_params.items(): if param is None: del ssl_params[key] verify_ssl = ssl_params.get( 'ssl_ca_certs', True) if ssl_params['ssl_cert_validation'] else False ## Timeout settings timeouts = (int( instance.get('connect_timeout', GitlabRunnerCheck.DEFAULT_CONNECT_TIMEOUT)), int( instance.get( 'receive_timeout', GitlabRunnerCheck.DEFAULT_RECEIVE_TIMEOUT))) ## Auth settings auth = None if 'gitlab_user' in instance and 'gitlab_password' in instance: auth = (instance['gitlab_user'], instance['gitlab_password']) try: self.log.debug('checking connectivity against %s' % url) r = requests.get(url, auth=auth, verify=verify_ssl, timeout=timeouts, headers=headers(self.agentConfig)) if r.status_code != 200: self.service_check(self.MASTER_SERVICE_CHECK_NAME, PrometheusCheck.CRITICAL, message="Got %s when hitting %s" % (r.status_code, url), tags=service_check_tags) raise Exception("Http status code {0} on url {1}".format( r.status_code, url)) else: r.raise_for_status() except requests.exceptions.Timeout: # If there's a timeout self.service_check(self.MASTER_SERVICE_CHECK_NAME, PrometheusCheck.CRITICAL, message="Timeout when hitting %s" % url, tags=service_check_tags) raise except Exception as e: self.service_check(self.MASTER_SERVICE_CHECK_NAME, PrometheusCheck.CRITICAL, message="Error hitting %s. Error: %s" % (url, e.message), tags=service_check_tags) raise else: self.service_check(self.MASTER_SERVICE_CHECK_NAME, PrometheusCheck.OK, tags=service_check_tags) self.log.debug("gitlab check succeeded")
def check(self, instance): """ Returns a dictionary that looks a lot like what's sent back by db.serverStatus() """ def total_seconds(td): """ Returns total seconds of a timedelta in a way that's safe for Python < 2.7 """ if hasattr(td, 'total_seconds'): return td.total_seconds() else: return (lag.microseconds + (lag.seconds + lag.days * 24 * 3600) * 10**6) / 10.0**6 if 'server' not in instance: raise Exception("Missing 'server' in mongo config") # x.509 authentication ssl_params = { 'ssl': instance.get('ssl', None), 'ssl_keyfile': instance.get('ssl_keyfile', None), 'ssl_certfile': instance.get('ssl_certfile', None), 'ssl_cert_reqs': instance.get('ssl_cert_reqs', None), 'ssl_ca_certs': instance.get('ssl_ca_certs', None) } for key, param in ssl_params.items(): if param is None: del ssl_params[key] server = instance['server'] username, password, db_name, nodelist, clean_server_name, auth_source = self._parse_uri( server, sanitize_username=bool(ssl_params)) additional_metrics = instance.get('additional_metrics', []) # Get the list of metrics to collect collect_tcmalloc_metrics = 'tcmalloc' in additional_metrics metrics_to_collect = self._get_metrics_to_collect( server, additional_metrics) # Tagging tags = instance.get('tags', []) # ...de-dupe tags to avoid a memory leak tags = list(set(tags)) if not db_name: self.log.info( 'No MongoDB database found in URI. Defaulting to admin.') db_name = 'admin' dbstats_tags = _is_affirmative(instance.get('dbstats_tags', True)) db_name_tag = db_name if dbstats_tags else hashlib.md5( db_name.encode()).hexdigest() service_check_tags = ["db:%s" % db_name_tag] service_check_tags.extend(tags) # ...add the `server` tag to the metrics' tags only # (it's added in the backend for service checks) tags.append('server:%s' % clean_server_name) if nodelist: host = nodelist[0][0] port = nodelist[0][1] service_check_tags = service_check_tags + [ "host:%s" % host, "port:%s" % port ] timeout = float(instance.get('timeout', DEFAULT_TIMEOUT)) * 1000 try: cli = pymongo.mongo_client.MongoClient( server, socketTimeoutMS=timeout, connectTimeoutMS=timeout, serverSelectionTimeoutMS=timeout, read_preference=pymongo.ReadPreference.PRIMARY_PREFERRED, **ssl_params) # some commands can only go against the admin DB admindb = cli['admin'] db = cli[db_name] except Exception: self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=service_check_tags) raise # Authenticate do_auth = True use_x509 = ssl_params and not password if not username: self.log.debug(u"A username is required to authenticate to `%s`", server) do_auth = False if do_auth: if auth_source: self.log.info( "authSource was specified in the the server URL: using '%s' as the authentication database", auth_source) self._authenticate(cli[auth_source], username, password, use_x509, clean_server_name, service_check_tags) else: self._authenticate(db, username, password, use_x509, clean_server_name, service_check_tags) try: status = db.command('serverStatus', tcmalloc=collect_tcmalloc_metrics) except Exception: self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=service_check_tags) raise else: self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=service_check_tags) if status['ok'] == 0: raise Exception(status['errmsg'].__str__()) ops = db.current_op() status['fsyncLocked'] = 1 if ops.get('fsyncLock') else 0 status['stats'] = db.command('dbstats') dbstats = {} dbstats[db_name] = {'stats': status['stats']} # Handle replica data, if any # See # 
http://www.mongodb.org/display/DOCS/Replica+Set+Commands#ReplicaSetCommands-replSetGetStatus # noqa if _is_affirmative(instance.get('replica_check', True)): try: data = {} dbnames = [] replSet = admindb.command('replSetGetStatus') if replSet: primary = None current = None # need a new connection to deal with replica sets setname = replSet.get('set') cli_rs = pymongo.mongo_client.MongoClient( server, socketTimeoutMS=timeout, connectTimeoutMS=timeout, serverSelectionTimeoutMS=timeout, replicaset=setname, read_preference=pymongo.ReadPreference.NEAREST, **ssl_params) if do_auth: if auth_source: self._authenticate(cli_rs[auth_source], username, password, use_x509, server, service_check_tags) else: self._authenticate(cli_rs[db_name], username, password, use_x509, server, service_check_tags) # Replication set information replset_name = replSet['set'] replset_state = self.get_state_name( replSet['myState']).lower() tags.extend([ u"replset_name:{0}".format(replset_name), u"replset_state:{0}".format(replset_state), ]) # Find nodes: master and current node (ourself) for member in replSet.get('members'): if member.get('self'): current = member if int(member.get('state')) == 1: primary = member # Compute a lag time if current is not None and primary is not None: if 'optimeDate' in primary and 'optimeDate' in current: lag = primary['optimeDate'] - current['optimeDate'] data['replicationLag'] = total_seconds(lag) if current is not None: data['health'] = current['health'] data['state'] = replSet['myState'] if current is not None: total = 0.0 cfg = cli_rs['local']['system.replset'].find_one() for member in cfg.get('members'): total += member.get('votes', 1) if member['_id'] == current['_id']: data['votes'] = member.get('votes', 1) data['voteFraction'] = data['votes'] / total status['replSet'] = data # Submit events self._report_replica_set_state(data['state'], clean_server_name, replset_name, self.agentConfig) except Exception as e: if "OperationFailure" in repr(e) and ( "not running with --replSet" in str(e) or "replSetGetStatus" in str(e)): pass else: raise e # If these keys exist, remove them for now as they cannot be serialized try: status['backgroundFlushing'].pop('last_finished') except KeyError: pass try: status.pop('localTime') except KeyError: pass dbnames = cli.database_names() self.gauge('mongodb.dbs', len(dbnames), tags=tags) for db_n in dbnames: db_aux = cli[db_n] dbstats[db_n] = {'stats': db_aux.command('dbstats')} # Go through the metrics and save the values for metric_name in metrics_to_collect: # each metric is of the form: x.y.z with z optional # and can be found at status[x][y][z] value = status if metric_name.startswith('stats'): continue else: try: for c in metric_name.split("."): value = value[c] except KeyError: continue # value is now status[x][y][z] if not isinstance(value, (int, long, float)): raise TypeError( u"{0} value is a {1}, it should be an int, a float or a long instead." .format(metric_name, type(value))) # Submit the metric submit_method, metric_name_alias = self._resolve_metric( metric_name, metrics_to_collect) submit_method(self, metric_name_alias, value, tags=tags) for st, value in dbstats.iteritems(): for metric_name in metrics_to_collect: if not metric_name.startswith('stats.'): continue try: val = value['stats'][metric_name.split('.')[1]] except KeyError: continue # value is now status[x][y][z] if not isinstance(val, (int, long, float)): raise TypeError( u"{0} value is a {1}, it should be an int, a float or a long instead." 
.format(metric_name, type(val))) # Submit the metric st_tag = st if dbstats_tags else hashlib.md5( st.encode()).hexdigest() metrics_tags = ( tags + [ u"cluster:db:{0}".format( st_tag ), # FIXME 6.0 - keep for backward compatibility u"db:{0}".format(st_tag), ]) submit_method, metric_name_alias = \ self._resolve_metric(metric_name, metrics_to_collect) submit_method(self, metric_name_alias, val, tags=metrics_tags) if _is_affirmative(instance.get('collections_indexes_stats')): mongo_version = cli.server_info().get('version', '0.0') if LooseVersion(mongo_version) >= LooseVersion("3.2"): self._collect_indexes_stats(instance, db, tags) else: self.log.error( "'collections_indexes_stats' is only available starting from mongo 3.2: your mongo version is %s", mongo_version) # Report the usage metrics for dbs/collections if 'top' in additional_metrics: try: dbtop = db.command('top') for ns, ns_metrics in dbtop['totals'].iteritems(): if "." not in ns: continue # configure tags for db name and collection name dbname, collname = ns.split(".", 1) dbname = dbname if dbstats_tags else hashlib.md5( dbname.encode()).hexdigest() ns_tags = tags + [ "db:%s" % dbname, "collection:%s" % collname ] # iterate over DBTOP metrics for m in self.TOP_METRICS: # each metric is of the form: x.y.z with z optional # and can be found at ns_metrics[x][y][z] value = ns_metrics try: for c in m.split("."): value = value[c] except Exception: continue # value is now status[x][y][z] if not isinstance(value, (int, long, float)): raise TypeError( u"{0} value is a {1}, it should be an int, a float or a long instead." .format(m, type(value))) # Submit the metric submit_method, metric_name_alias = \ self._resolve_metric(m, metrics_to_collect, prefix="usage") submit_method(self, metric_name_alias, value, tags=ns_tags) # Keep old incorrect metric if metric_name_alias.endswith('countps'): GAUGE(self, metric_name_alias[:-2], value, tags=ns_tags) except Exception as e: self.log.warning('Failed to record `top` metrics %s' % str(e)) if 'local' in dbnames: # it might not be if we are connectiing through mongos # Fetch information analogous to Mongo's db.getReplicationInfo() localdb = cli['local'] oplog_data = {} for ol_collection_name in ("oplog.rs", "oplog.$main"): ol_options = localdb[ol_collection_name].options() if ol_options: break if ol_options: try: oplog_data['logSizeMB'] = round( ol_options['size'] / 2.0**20, 2) oplog = localdb[ol_collection_name] oplog_data['usedSizeMB'] = round( localdb.command("collstats", ol_collection_name)['size'] / 2.0**20, 2) op_asc_cursor = oplog.find({ "ts": { "$exists": 1 } }).sort("$natural", pymongo.ASCENDING).limit(1) op_dsc_cursor = oplog.find({ "ts": { "$exists": 1 } }).sort("$natural", pymongo.DESCENDING).limit(1) try: first_timestamp = op_asc_cursor[0]['ts'].as_datetime() last_timestamp = op_dsc_cursor[0]['ts'].as_datetime() oplog_data['timeDiff'] = total_seconds(last_timestamp - first_timestamp) except (IndexError, KeyError): # if the oplog collection doesn't have any entries # if an object in the collection doesn't have a ts value, we ignore it pass except KeyError: # encountered an error trying to access options.size for the oplog collection self.log.warning( u"Failed to record `ReplicationInfo` metrics.") for (m, value) in oplog_data.iteritems(): submit_method, metric_name_alias = \ self._resolve_metric('oplog.%s' % m, metrics_to_collect) submit_method(self, metric_name_alias, value, tags=tags) else: self.log.debug( '"local" database not in dbnames. 
Not collecting ReplicationInfo metrics' ) # get collection level stats try: # Ensure that you're on the right db db = cli[db_name] # grab the collections from the configutation coll_names = instance.get('collections', []) # loop through the collections for coll_name in coll_names: # grab the stats from the collection stats = db.command("collstats", coll_name) # loop through the metrics db_name_tag = db_name if dbstats_tags else hashlib.md5( db_name.encode()).hexdigest() for m in self.collection_metrics_names: coll_tags = tags + [ "db:%s" % db_name_tag, "collection:%s" % coll_name ] value = stats.get(m, None) if not value: continue # if it's the index sizes, then it's a dict. if m == 'indexSizes': submit_method, metric_name_alias = \ self._resolve_metric('collection.%s' % m, self.COLLECTION_METRICS) # loop through the indexes for (idx, val) in value.iteritems(): # we tag the index idx_tags = coll_tags + ["index:%s" % idx] submit_method(self, metric_name_alias, val, tags=idx_tags) else: submit_method, metric_name_alias = \ self._resolve_metric('collection.%s' % m, self.COLLECTION_METRICS) submit_method(self, metric_name_alias, value, tags=coll_tags) except Exception as e: self.log.warning(u"Failed to record `collection` metrics.") self.log.exception(e)
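# The dotted-path walk used repeatedly above, isolated as a standalone sketch:
# a metric name such as 'connections.current' indexes into the serverStatus
# document one key at a time. The helper name is hypothetical.
def resolve_dotted(document, metric_name):
    value = document
    for key in metric_name.split('.'):
        value = value[key]  # raises KeyError when the path is absent
    return value

# e.g. resolve_dotted({'connections': {'current': 42}}, 'connections.current') == 42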
def check(self, instance): """ Returns a dictionary that looks a lot like what's sent back by db.serverStatus() """ if 'server' not in instance: raise Exception("Missing 'server' in mongo config") server = instance['server'] ssl_params = { 'ssl': instance.get('ssl', None), 'ssl_keyfile': instance.get('ssl_keyfile', None), 'ssl_certfile': instance.get('ssl_certfile', None), 'ssl_cert_reqs': instance.get('ssl_cert_reqs', None), 'ssl_ca_certs': instance.get('ssl_ca_certs', None) } for key, param in ssl_params.items(): if param is None: del ssl_params[key] # Configuration a URL, mongodb://user:pass@server/db parsed = pymongo.uri_parser.parse_uri(server) username = parsed.get('username') password = parsed.get('password') db_name = parsed.get('database') clean_server_name = server.replace( password, "*" * 5) if password is not None else server tags = instance.get('tags', []) tags.append('server:%s' % clean_server_name) # Get the list of metrics to collect collect_tcmalloc_metrics = _is_affirmative( instance.get('collect_tcmalloc_metrics', False)) metrics_to_collect = self._get_metrics_to_collect( server, collect_tcmalloc_metrics=collect_tcmalloc_metrics, ) # de-dupe tags to avoid a memory leak tags = list(set(tags)) if not db_name: self.log.info( 'No MongoDB database found in URI. Defaulting to admin.') db_name = 'admin' service_check_tags = ["db:%s" % db_name] nodelist = parsed.get('nodelist') if nodelist: host = nodelist[0][0] port = nodelist[0][1] service_check_tags = service_check_tags + [ "host:%s" % host, "port:%s" % port ] do_auth = True if username is None or password is None: self.log.debug( "Mongo: cannot extract username and password from config %s" % server) do_auth = False timeout = float(instance.get('timeout', DEFAULT_TIMEOUT)) * 1000 try: cli = pymongo.mongo_client.MongoClient( server, socketTimeoutMS=timeout, read_preference=pymongo.ReadPreference.PRIMARY_PREFERRED, **ssl_params) # some commands can only go against the admin DB admindb = cli['admin'] db = cli[db_name] except Exception: self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=service_check_tags) raise if do_auth and not db.authenticate(username, password): message = "Mongo: cannot connect with config %s" % server self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=service_check_tags, message=message) raise Exception(message) self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=service_check_tags) status = db["$cmd"].find_one({ "serverStatus": 1, "tcmalloc": collect_tcmalloc_metrics }) if status['ok'] == 0: raise Exception(status['errmsg'].__str__()) status['stats'] = db.command('dbstats') dbstats = {} dbstats[db_name] = {'stats': status['stats']} # Handle replica data, if any # See # http://www.mongodb.org/display/DOCS/Replica+Set+Commands#ReplicaSetCommands-replSetGetStatus # noqa try: data = {} dbnames = [] replSet = admindb.command('replSetGetStatus') if replSet: primary = None current = None # need a new connection to deal with replica sets setname = replSet.get('set') cli = pymongo.mongo_client.MongoClient( server, socketTimeoutMS=timeout, replicaset=setname, read_preference=pymongo.ReadPreference.NEAREST, **ssl_params) db = cli[db_name] if do_auth and not db.authenticate(username, password): message = ("Mongo: cannot connect with config %s" % server) self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=service_check_tags, message=message) raise Exception(message) # find nodes: master and current node (ourself) for member in replSet.get('members'): 
                if member.get('self'):
                    current = member
                if int(member.get('state')) == 1:
                    primary = member

            # If we have both we can compute a lag time
            if current is not None and primary is not None:
                lag = primary['optimeDate'] - current['optimeDate']
                # Python 2.7 has this built in, python < 2.7 doesn't...
                if hasattr(lag, 'total_seconds'):
                    data['replicationLag'] = lag.total_seconds()
                else:
                    data['replicationLag'] = (
                        lag.microseconds +
                        (lag.seconds + lag.days * 24 * 3600) * 10**6) / 10.0**6

            if current is not None:
                data['health'] = current['health']

            data['state'] = replSet['myState']
            self.check_last_state(data['state'], clean_server_name, self.agentConfig)
            status['replSet'] = data
    except Exception as e:
        if "OperationFailure" in repr(e) and "replSetGetStatus" in str(e):
            pass
        else:
            raise e

    # If these keys exist, remove them for now as they cannot be serialized
    try:
        status['backgroundFlushing'].pop('last_finished')
    except KeyError:
        pass
    try:
        status.pop('localTime')
    except KeyError:
        pass

    dbnames = cli.database_names()
    for db_n in dbnames:
        db_aux = cli[db_n]
        dbstats[db_n] = {'stats': db_aux.command('dbstats')}

    # Go through the metrics and save the values
    for metric_name, submit_method in metrics_to_collect.iteritems():
        # each metric is of the form: x.y.z with z optional
        # and can be found at status[x][y][z]
        value = status
        if metric_name.startswith('stats'):
            continue
        else:
            try:
                for c in metric_name.split("."):
                    value = value[c]
            except KeyError:
                continue

        # value is now status[x][y][z]
        if not isinstance(value, (int, long, float)):
            raise TypeError(
                u"{0} value is a {1}, it should be an int, a float or a long instead."
                .format(metric_name, type(value)))

        # Submit the metric
        metric_name = self._normalize(metric_name, submit_method)
        submit_method(self, metric_name, value, tags=tags)

    for st, value in dbstats.iteritems():
        for metric_name, submit_method in metrics_to_collect.iteritems():
            if not metric_name.startswith('stats.'):
                continue

            try:
                val = value['stats'][metric_name.split('.')[1]]
            except KeyError:
                continue

            # val is now value['stats'][x]
            if not isinstance(val, (int, long, float)):
                raise TypeError(
                    u"{0} value is a {1}, it should be an int, a float or a long instead."
                    .format(metric_name, type(val)))

            # Submit the metric (`val`, the extracted number, not the whole
            # dbstats entry held in `value`)
            metric_name = self._normalize(metric_name, submit_method)
            metrics_tags = tags + ['cluster:db:%s' % st]
            submit_method(self, metric_name, val, tags=metrics_tags)
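# A worked example of the pre-2.7 fallback above, checked against the
# built-in (runnable on Python 2.7+, where both paths agree):
from datetime import timedelta

lag_example = timedelta(days=1, seconds=2, microseconds=500000)
manual = (lag_example.microseconds +
          (lag_example.seconds + lag_example.days * 24 * 3600) * 10**6) / 10.0**6
assert manual == lag_example.total_seconds() == 86402.5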
def _load_conf(self, instance): # Fetches the conf method = instance.get('method', 'get') data = instance.get('data', {}) tags = instance.get('tags', []) username = instance.get('username') password = instance.get('password') client_cert = instance.get('client_cert') client_key = instance.get('client_key') http_response_status_code = str( instance.get('http_response_status_code', DEFAULT_EXPECTED_CODE)) timeout = int(instance.get('timeout', 10)) config_headers = instance.get('headers', {}) default_headers = _is_affirmative( instance.get("include_default_headers", True)) if default_headers: headers = agent_headers(self.agentConfig) else: headers = {} headers.update(config_headers) url = instance.get('url') content_match = instance.get('content_match') reverse_content_match = _is_affirmative( instance.get('reverse_content_match', False)) response_time = _is_affirmative( instance.get('collect_response_time', True)) if not url: raise Exception("Bad configuration. You must specify a url") include_content = _is_affirmative( instance.get('include_content', False)) disable_ssl_validation = _is_affirmative( instance.get('disable_ssl_validation', True)) ssl_expire = _is_affirmative( instance.get('check_certificate_expiration', True)) instance_ca_certs = instance.get('ca_certs', self.ca_certs) weakcipher = _is_affirmative(instance.get('weakciphers', False)) ignore_ssl_warning = _is_affirmative( instance.get('ignore_ssl_warning', False)) check_hostname = _is_affirmative(instance.get('check_hostname', True)) skip_proxy = _is_affirmative( instance.get('skip_proxy', instance.get('no_proxy', False))) allow_redirects = _is_affirmative(instance.get('allow_redirects', True)) return url, username, password, client_cert, client_key, method, data, http_response_status_code, timeout, include_content,\ headers, response_time, content_match, reverse_content_match, tags, disable_ssl_validation, ssl_expire, instance_ca_certs,\ weakcipher, check_hostname, ignore_ssl_warning, skip_proxy, allow_redirects
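# A hypothetical instance illustrating the options _load_conf parses above;
# every key is one the method reads, but the values are illustrative only.
example_http_instance = {
    'url': 'https://app.example.com/healthz',
    'method': 'get',
    'timeout': 10,
    'http_response_status_code': '200',
    'headers': {'X-Request-Source': 'agent'},
    'include_default_headers': True,
    'content_match': 'OK',
    'reverse_content_match': False,
    'collect_response_time': True,
    'check_certificate_expiration': True,
    'check_hostname': True,
    'allow_redirects': True,
    'tags': ['service:example'],
}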
def check(self, instance): host = instance.get('host', '') port = instance.get('port', '') if port != '': port = int(port) user = instance.get('username', '') password = instance.get('password', '') tags = instance.get('tags', []) dbname = instance.get('dbname', None) relations = instance.get('relations', []) ssl = _is_affirmative(instance.get('ssl', False)) function_metrics = _is_affirmative( instance.get('collect_function_metrics', False)) # Default value for `count_metrics` is True for backward compatibility count_metrics = _is_affirmative( instance.get('collect_count_metrics', True)) database_size_metrics = _is_affirmative( instance.get('collect_database_size_metrics', True)) collect_default_db = _is_affirmative( instance.get('collect_default_database', False)) if relations and not dbname: self.warning( '"dbname" parameter must be set when using the "relations" parameter.' ) if dbname is None: dbname = 'postgres' key = (host, port, dbname) custom_metrics = self._get_custom_metrics( instance.get('custom_metrics', []), key) # Clean up tags in case there was a None entry in the instance # e.g. if the yaml contains tags: but no actual tags if tags is None: tags = [] else: tags = list(set(tags)) # preset tags to the database name tags.extend(["db:%s" % dbname]) self.log.debug("Custom metrics: %s" % custom_metrics) # preset tags to the database name db = None connect_fct, interface_error, programming_error = self._get_pg_attrs( instance) # Collect metrics try: # Check version db = self.get_connection(key, host, port, user, password, dbname, ssl, connect_fct, tags) version = self._get_version(key, db) self.log.debug("Running check against version %s" % version) self._collect_stats(key, db, tags, relations, custom_metrics, function_metrics, count_metrics, database_size_metrics, collect_default_db, interface_error, programming_error) except ShouldRestartException: self.log.info("Resetting the connection") db = self.get_connection(key, host, port, user, password, dbname, ssl, connect_fct, tags, use_cached=False) self._collect_stats(key, db, tags, relations, custom_metrics, function_metrics, count_metrics, database_size_metrics, collect_default_db, interface_error, programming_error) if db is not None: service_check_tags = self._get_service_check_tags(host, port, tags) message = u'Established connection to postgres://%s:%s/%s' % ( host, port, dbname) self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=service_check_tags, message=message) try: # commit to close the current query transaction db.commit() except Exception as e: self.log.warning("Unable to commit: {0}".format(e))
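# A minimal Postgres instance exercising the options read above (values are
# hypothetical). Note that `relations` requires `dbname`, as the warning in
# the check enforces, and that (host, port, dbname) forms the key under which
# connections and custom metrics are cached.
example_pg_instance = {
    'host': 'db1.example.com',
    'port': '5432',
    'username': 'datadog',
    'password': 'hunter2',
    'dbname': 'orders',
    'relations': ['orders_items'],
    'collect_function_metrics': False,
    'collect_count_metrics': True,
    'collect_database_size_metrics': True,
    'collect_default_database': False,
    'tags': ['env:example'],
}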
def enabled(cls, agent_config): return _is_affirmative(agent_config.get( 'dogstatsd6_enable', False)) and cls._get_dsd6_path() is not None
def __init__(self, instance=None):
    self.docker_util = DockerUtil()
    if instance is None:
        try:
            config_file_path = get_conf_path(KUBERNETES_CHECK_NAME)
            check_config = check_yaml(config_file_path)
            instance = check_config['instances'][0]
        # kubernetes.yaml was not found
        except IOError as ex:
            log.error(ex.message)
            instance = {}
        except Exception:
            log.error('Kubernetes configuration file is invalid. '
                      'Trying to connect to the kubelet with default settings anyway...')
            instance = {}

    self.method = instance.get('method', KubeUtil.DEFAULT_METHOD)
    self._node_ip = self._node_name = None  # lazy evaluation
    self.host_name = os.environ.get('HOSTNAME')
    self.tls_settings = self._init_tls_settings(instance)

    # apiserver
    if 'api_server_url' in instance:
        self.kubernetes_api_root_url = instance.get('api_server_url')
    else:
        master_host = os.environ.get('KUBERNETES_SERVICE_HOST') or self.DEFAULT_MASTER_NAME
        master_port = os.environ.get('KUBERNETES_SERVICE_PORT') or self.DEFAULT_MASTER_PORT
        self.kubernetes_api_root_url = 'https://%s:%s' % (master_host, master_port)

    self.kubernetes_api_url = '%s/api/v1' % self.kubernetes_api_root_url

    # leader status triggers event collection
    self.is_leader = False
    self.leader_elector = None
    self.leader_lease_duration = instance.get('lease_duration')

    # kubelet
    try:
        self.kubelet_api_url = self._locate_kubelet(instance)
        if not self.kubelet_api_url:
            raise Exception("Couldn't find a method to connect to kubelet.")
    except Exception as ex:
        log.error("Kubernetes check exiting, cannot run without access to kubelet.")
        raise ex

    # Service mapping helper class
    self._service_mapper = PodServiceMapper(self)

    self.kubelet_host = self.kubelet_api_url.split(':')[1].lstrip('/')
    self.pods_list_url = urljoin(self.kubelet_api_url, KubeUtil.PODS_LIST_PATH)
    self.kube_health_url = urljoin(self.kubelet_api_url, KubeUtil.KUBELET_HEALTH_PATH)
    self.kube_label_prefix = instance.get('label_to_tag_prefix', KubeUtil.DEFAULT_LABEL_PREFIX)
    self.kube_node_labels = instance.get('node_labels_to_host_tags', {})

    # cadvisor
    self.cadvisor_port = instance.get('port', KubeUtil.DEFAULT_CADVISOR_PORT)
    self.cadvisor_url = '%s://%s:%d' % (self.method, self.kubelet_host, self.cadvisor_port)
    self.metrics_url = urljoin(self.cadvisor_url, KubeUtil.METRICS_PATH)
    self.machine_info_url = urljoin(self.cadvisor_url, KubeUtil.MACHINE_INFO_PATH)

    try:
        self.self_namespace = self.get_self_namespace()
    except Exception:
        log.warning("Failed to get the agent pod namespace, defaulting to default.")
        self.self_namespace = DEFAULT_NAMESPACE

    from config import _is_affirmative
    self.collect_service_tag = _is_affirmative(instance.get('collect_service_tags',
                                                            KubeUtil.DEFAULT_COLLECT_SERVICE_TAG))

    # keep track of the latest k8s event we collected and posted
    # default value is 0 but TTL for k8s events is one hour anyways
    self.last_event_collection_ts = 0
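# A worked example of the string surgery used above to extract the kubelet
# host from its API URL (the URL itself is hypothetical):
kubelet_api_url = 'https://10.0.0.1:10250'
kubelet_host = kubelet_api_url.split(':')[1].lstrip('/')  # '//10.0.0.1' -> '10.0.0.1'
assert kubelet_host == '10.0.0.1'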
def _process_results(self): for i in xrange(MAX_LOOP_ITERATIONS): try: # We want to fetch the result in a non blocking way status, msg, sc_name, instance = self.resultsq.get_nowait() except Empty: break instance_name = instance['name'] if status == FAILURE: self.nb_failures += 1 if self.nb_failures >= self.pool_size - 1: self.nb_failures = 0 self.restart_pool() # clean failed job self._clean_job(instance_name) continue self.report_as_service_check(sc_name, status, instance, msg) # FIXME: 5.3, this has been deprecated before, get rid of events # Don't create any event to avoid duplicates with server side # service_checks skip_event = _is_affirmative(instance.get('skip_event', False)) if not skip_event: self.warning( "Using events for service checks is deprecated in favor of monitors and will be removed in future versions of the Datadog Agent." ) event = None if instance_name not in self.statuses: self.statuses[instance_name] = defaultdict(list) self.statuses[instance_name][sc_name].append(status) window = int(instance.get('window', 1)) if window > 256: self.log.warning( "Maximum window size (256) exceeded, defaulting it to 256" ) window = 256 threshold = instance.get('threshold', 1) if len(self.statuses[instance_name][sc_name]) > window: self.statuses[instance_name][sc_name].pop(0) nb_failures = self.statuses[instance_name][sc_name].count( Status.DOWN) if nb_failures >= threshold: if self.notified.get( (instance_name, sc_name), Status.UP) != Status.DOWN: event = self._create_status_event( sc_name, status, msg, instance) self.notified[(instance_name, sc_name)] = Status.DOWN else: if self.notified.get( (instance_name, sc_name), Status.UP) != Status.UP: event = self._create_status_event( sc_name, status, msg, instance) self.notified[(instance_name, sc_name)] = Status.UP if event is not None: self.events.append(event) self._clean_job(instance_name)
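# A hedged distillation of the windowed flap damping above: keep the last
# `window` statuses for one (instance, service check) pair and flag a
# transition only once `threshold` failures accumulate. Names and the
# stand-alone shape are illustrative; the real logic also tracks the
# previously notified state to avoid emitting duplicate events.
def crosses_threshold(history, new_status, window, threshold, down_status):
    history.append(new_status)
    if len(history) > window:
        history.pop(0)  # discard the oldest status outside the window
    return history.count(down_status) >= threshold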
def check(self, instance): if 'url' not in instance: raise Exception('etcd instance missing "url" value.') # Load values from the instance config url = instance['url'] instance_tags = instance.get('tags', []) # Load the ssl configuration ssl_params = { 'ssl_keyfile': instance.get('ssl_keyfile'), 'ssl_certfile': instance.get('ssl_certfile'), 'ssl_cert_validation': _is_affirmative(instance.get('ssl_cert_validation', True)), 'ssl_ca_certs': instance.get('ssl_ca_certs'), } for key, param in ssl_params.items(): if param is None: del ssl_params[key] # Get a copy of tags for the CRIT statuses critical_tags = list(instance_tags) # Append the instance's URL in case there are more than one, that # way they can tell the difference! instance_tags.append("url:{0}".format(url)) timeout = float(instance.get('timeout', self.DEFAULT_TIMEOUT)) is_leader = False # Gather self health status sc_state = AgentCheck.UNKNOWN health_status = self._get_health_status(url, ssl_params, timeout, critical_tags) if health_status is not None: sc_state = AgentCheck.OK if self._is_healthy( health_status) else AgentCheck.CRITICAL self.service_check(self.HEALTH_SERVICE_CHECK_NAME, sc_state, tags=instance_tags) # Gather self metrics self_response = self._get_self_metrics(url, ssl_params, timeout, critical_tags) if self_response is not None: if self_response['state'] == 'StateLeader': is_leader = True instance_tags.append('etcd_state:leader') else: instance_tags.append('etcd_state:follower') for key in self.SELF_RATES: if key in self_response: self.rate(self.SELF_RATES[key], self_response[key], tags=instance_tags) else: self.log.warn("Missing key {0} in stats.".format(key)) for key in self.SELF_GAUGES: if key in self_response: self.gauge(self.SELF_GAUGES[key], self_response[key], tags=instance_tags) else: self.log.warn("Missing key {0} in stats.".format(key)) # Gather store metrics store_response = self._get_store_metrics(url, ssl_params, timeout, critical_tags) if store_response is not None: for key in self.STORE_RATES: if key in store_response: self.rate(self.STORE_RATES[key], store_response[key], tags=instance_tags) else: self.log.warn("Missing key {0} in stats.".format(key)) for key in self.STORE_GAUGES: if key in store_response: self.gauge(self.STORE_GAUGES[key], store_response[key], tags=instance_tags) else: self.log.warn("Missing key {0} in stats.".format(key)) # Gather leader metrics if is_leader: leader_response = self._get_leader_metrics(url, ssl_params, timeout, critical_tags) if leader_response is not None and len( leader_response.get("followers", {})) > 0: # Get the followers followers = leader_response.get("followers") for fol in followers: # counts for key in self.LEADER_COUNTS: self.rate(self.LEADER_COUNTS[key], followers[fol].get("counts").get(key), tags=instance_tags + ['follower:{0}'.format(fol)]) # latency for key in self.LEADER_LATENCY: self.gauge(self.LEADER_LATENCY[key], followers[fol].get("latency").get(key), tags=instance_tags + ['follower:{0}'.format(fol)]) # Service check if self_response is not None and store_response is not None: self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=instance_tags)
def init(self):
    try:
        # We configure the check with the right cgroup settings for this host
        # Just needs to be done once
        instance = self.instances[0]
        set_docker_settings(self.init_config, instance)

        self.client = get_client()
        self._docker_root = self.init_config.get('docker_root', '/')
        self._mountpoints = get_mountpoints(self._docker_root)
        self.cgroup_listing_retries = 0
        self._latest_size_query = 0
        self._filtered_containers = set()
        self._disable_net_metrics = False

        # At first run we'll just collect the events from the latest 60 secs
        self._last_event_collection_ts = int(time.time()) - 60

        # Set tagging options
        self.custom_tags = instance.get("tags", [])
        self.collect_labels_as_tags = instance.get("collect_labels_as_tags", [])
        self.kube_labels = {}

        self.use_histogram = _is_affirmative(instance.get('use_histogram', False))
        performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

        self.tag_names = {
            CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
            PERFORMANCE: performance_tags,
            IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
        }

        # Set filtering settings
        if not instance.get("exclude"):
            self._filtering_enabled = False
            if instance.get("include"):
                self.log.warning("You must specify an exclude section to enable filtering")
        else:
            self._filtering_enabled = True
            include = instance.get("include", [])
            exclude = instance.get("exclude", [])
            self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(include, exclude)
            self.tag_names[FILTERED] = _filtered_tag_names

        # Other options
        self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
        self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
        self.collect_events = _is_affirmative(instance.get('collect_events', True))
        self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
        self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False))
        self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()
        self.ecs_tags = {}
    except Exception as e:
        self.log.critical(e)
        self.warning("Initialization failed. Will retry at next iteration")
def check(self, instance): if 'url' not in instance: raise Exception('Mesos instance missing "url" value.') url = instance['url'] instance_tags = instance.get('tags', []) if instance_tags is None: instance_tags = [] default_timeout = self.init_config.get('default_timeout', 5) timeout = float(instance.get('timeout', default_timeout)) ssl_verify = not _is_affirmative( instance.get('disable_ssl_validation', False)) state_metrics = self._check_leadership(url, timeout, ssl_verify, instance_tags) if state_metrics: tags = [ 'mesos_pid:{0}'.format(state_metrics['pid']), 'mesos_node:master', ] if 'cluster' in state_metrics: tags.append('mesos_cluster:{0}'.format( state_metrics['cluster'])) tags += instance_tags if self.leader: self.GAUGE('mesos.cluster.total_frameworks', len(state_metrics['frameworks']), tags=tags) for framework in state_metrics['frameworks']: framework_tags = ['framework_name:' + framework['name'] ] + tags self.GAUGE('mesos.framework.total_tasks', len(framework['tasks']), tags=framework_tags) resources = framework['used_resources'] for key_name, ( metric_name, metric_func) in self.FRAMEWORK_METRICS.iteritems(): metric_func(self, metric_name, resources[key_name], tags=framework_tags) role_metrics = self._get_master_roles(url, timeout, ssl_verify, instance_tags) if role_metrics is not None: for role in role_metrics['roles']: role_tags = ['mesos_role:' + role['name']] + tags self.GAUGE('mesos.role.frameworks.count', len(role['frameworks']), tags=role_tags) self.GAUGE('mesos.role.weight', role['weight'], tags=role_tags) for key_name, ( metric_name, metric_func ) in self.ROLE_RESOURCES_METRICS.iteritems(): metric_func(self, metric_name, role['resources'][key_name], tags=role_tags) stats_metrics = self._get_master_stats(url, timeout, ssl_verify, instance_tags) if stats_metrics is not None: metrics = [self.SYSTEM_METRICS] if self.leader: metrics += [ self.CLUSTER_TASKS_METRICS, self.CLUSTER_SLAVES_METRICS, self.CLUSTER_RESOURCES_METRICS, self.CLUSTER_REGISTRAR_METRICS, self.CLUSTER_FRAMEWORK_METRICS, self.STATS_METRICS ] for m in metrics: for key_name, (metric_name, metric_func) in m.iteritems(): if key_name in stats_metrics: metric_func(self, metric_name, stats_metrics[key_name], tags=tags) self.service_check_needed = True
def check(self, instance):
    name = instance.get('name', None)
    tags = instance.get('tags', [])
    exact_match = _is_affirmative(instance.get('exact_match', True))
    search_string = instance.get('search_string', None)
    ignore_ad = _is_affirmative(instance.get('ignore_denied_access', True))
    pid = instance.get('pid')
    pid_file = instance.get('pid_file')
    collect_children = _is_affirmative(instance.get('collect_children', False))

    if self._conflicting_procfs:
        self.warning('The `procfs_path` defined in `process.yaml` is different from the one defined in '
                     '`datadog.conf`. This is currently not supported by the Agent. Defaulting to the '
                     'value defined in `datadog.conf`: {}'.format(psutil.PROCFS_PATH))
    elif self._deprecated_init_procfs:
        self.warning('DEPRECATION NOTICE: Specifying `procfs_path` in `process.yaml` is deprecated. '
                     'Please specify it in `datadog.conf` instead')

    if not isinstance(search_string, list) and pid is None and pid_file is None:
        raise ValueError('"search_string" or "pid" or "pid_file" parameter is required')

    # FIXME 6.x remove me
    if search_string is not None:
        if "All" in search_string:
            self.warning('Deprecated: Having "All" in your search_string will '
                         'greatly reduce the performance of the check and '
                         'will be removed in a future version of the agent.')

    if name is None:
        raise KeyError('The "name" of process groups is mandatory')

    if search_string is not None:
        pids = self.find_pids(name, search_string, exact_match, ignore_ad=ignore_ad)
    elif pid is not None:
        # we use Process(pid) as a means to search; if pid is not found,
        # psutil.NoSuchProcess is raised.
        pids = self._get_pid_set(pid)
    elif pid_file is not None:
        try:
            with open(pid_file, 'r') as file_pid:
                pid_line = file_pid.readline().strip()
                pids = self._get_pid_set(int(pid_line))
        except IOError as e:
            # pid file doesn't exist, assuming the process is not running
            self.log.debug('Unable to find pid file: %s', e)
            pids = set()
    else:
        raise ValueError('The "search_string" or "pid" options are required for process identification')

    if collect_children:
        pids.update(self._get_child_processes(pids))

    proc_state = self.get_process_state(name, pids)

    # FIXME 6.x remove the `name` tag
    tags.extend(['process_name:%s' % name, name])

    self.log.debug('ProcessCheck: process %s analysed', name)
    self.gauge('system.processes.number', len(pids), tags=tags)

    if len(pids) == 0:
        self.warning("No matching process '%s' was found" % name)

    for attr, mname in ATTR_TO_METRIC.iteritems():
        vals = [x for x in proc_state[attr] if x is not None]
        # skip [] (no values collected for this attribute)
        if vals:
            if attr == 'run_time':
                self.gauge('system.processes.%s.avg' % mname, sum(vals) / len(vals), tags=tags)
                self.gauge('system.processes.%s.max' % mname, max(vals), tags=tags)
                self.gauge('system.processes.%s.min' % mname, min(vals), tags=tags)
            # FIXME 6.x: change this prefix?
            else:
                self.gauge('system.processes.%s' % mname, sum(vals), tags=tags)

    for attr, mname in ATTR_TO_METRIC_RATE.iteritems():
        vals = [x for x in proc_state[attr] if x is not None]
        if vals:
            self.rate('system.processes.%s' % mname, sum(vals), tags=tags)

    self._process_service_check(name, len(pids), instance.get('thresholds', None), tags)
def __init__(self, name, init_config, agentConfig, instances=None):
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)

    # Cache connections
    self.connections = {}
    self.failed_connections = {}
    self.instances_metrics = {}
    self.instances_per_type_metrics = defaultdict(dict)
    self.existing_databases = None
    self.do_check = {}
    self.proc_type_mapping = {
        'gauge': self.gauge,
        'rate': self.rate,
        'histogram': self.histogram
    }

    self.connector = init_config.get('connector', 'adodbapi')
    if not self.connector.lower() in self.valid_connectors:
        self.log.error("Invalid database connector %s, defaulting to adodbapi" % self.connector)
        self.connector = 'adodbapi'

    # Pre-process the list of metrics to collect
    self.custom_metrics = init_config.get('custom_metrics', [])
    for instance in instances:
        try:
            instance_key = self._conn_key(instance, self.DEFAULT_DB_KEY)
            self.do_check[instance_key] = True

            # check to see if the database exists before we try any connections to it
            with self.open_managed_db_connections(instance, None, db_name=self.DEFAULT_DATABASE):
                db_exists, context = self._check_db_exists(instance)

            if db_exists:
                if instance.get('stored_procedure') is None:
                    with self.open_managed_db_connections(instance, self.DEFAULT_DB_KEY):
                        self._make_metric_list_to_collect(instance, self.custom_metrics)
            else:
                # How much do we care that the DB doesn't exist?
                if _is_affirmative(instance.get("ignore_missing_database", False)):
                    # not much: we expect it, so leave the checks disabled
                    self.do_check[instance_key] = False
                    self.log.warning("Database %s does not exist. Disabling checks for this instance." % (context))
                else:
                    # yes we do. Keep trying
                    self.log.error("Database %s does not exist. Fix issue and restart agent" % (context))
        except SQLConnectionError:
            self.log.exception("Skipping SQL Server instance")
            continue
        except Exception as e:
            self.log.exception("Initialization exception %s", str(e))
            continue