def _load_conf(self, instance):
    """Read the disk check options from ``instance`` and store them on ``self``.

    :param instance: mapping holding the configuration of one check instance.

    The filesystem exclusion list is copied before ``'iso9660'`` is added to
    it, so the list held by ``instance`` is never modified.
    """
    # Work on a private copy: appending to the list owned by ``instance``
    # would grow the configuration by one 'iso9660' entry on every call.
    self._excluded_filesystems = list(
        instance.get('excluded_filesystems') or [])
    self._excluded_disks = instance.get('excluded_disks', [])
    self._excluded_mountpoint_re = re.compile(
        instance.get('excluded_mountpoint_re', '^$'))
    self._tag_by_filesystem = _is_affirmative(
        instance.get('tag_by_filesystem', False))
    self._all_partitions = _is_affirmative(
        instance.get('all_partitions', False))
    self._device_tag_re = instance.get('device_tag_re', {})
    self._custom_tags = instance.get('tags', [])
    self._service_check_rw = _is_affirmative(
        instance.get('service_check_rw', False))

    # Force exclusion of CDROM (iso9660) from disk check
    if 'iso9660' not in self._excluded_filesystems:
        self._excluded_filesystems.append('iso9660')

    # FIXME: 6.x, drop use_mount option in datadog.conf
    self._load_legacy_option(instance, 'use_mount', False,
                             operation=_is_affirmative)
    # FIXME: 6.x, drop device_blacklist_re option in datadog.conf
    self._load_legacy_option(instance, 'excluded_disk_re', '^$',
                             legacy_name='device_blacklist_re',
                             operation=re.compile)
def check_health_v1(self, config, tags):
    """Query ``/sys/health`` and submit the unsealed/initialized service checks.

    :param config: mapping providing at least ``api_url``; also forwarded to
        ``self.access_api``.
    :param tags: list of tags; it is extended in place with the cluster name
        and version reported by the endpoint, when present.
    """
    health_url = config['api_url'] + '/sys/health'
    payload = self.access_api(health_url, config, tags).json()

    # Only non-empty values from the response become tags.
    for field, tag_template in (('cluster_name', 'cluster_name:{}'),
                                ('version', 'vault_version:{}')):
        value = payload.get(field)
        if value:
            tags.append(tag_template.format(value))

    sealed = _is_affirmative(payload.get('sealed'))
    self.service_check(
        self.SERVICE_CHECK_UNSEALED,
        AgentCheck.CRITICAL if sealed else AgentCheck.OK,
        tags=tags)

    is_initialized = _is_affirmative(payload.get('initialized'))
    self.service_check(
        self.SERVICE_CHECK_INITIALIZED,
        AgentCheck.OK if is_initialized else AgentCheck.CRITICAL,
        tags=tags)
def check(self, instance):
    """Collect statistics for the directory configured in ``instance``.

    :param instance: mapping of options; ``directory`` is mandatory, the
        other keys fall back to the defaults read below.
    :raises Exception: when ``directory`` is missing, or when the directory
        does not exist and ``ignore_missing`` is not affirmative.
    """
    try:
        directory = instance['directory']
    except KeyError:
        raise Exception('DirectoryCheck: missing `directory` in config')

    target = abspath(directory)
    name = instance.get('name', directory)
    pattern = instance.get('pattern')
    recursive = _is_affirmative(instance.get('recursive', False))
    dirtagname = instance.get('dirtagname', 'name')
    filetagname = instance.get('filetagname', 'filename')
    filegauges = _is_affirmative(instance.get('filegauges', False))
    countonly = _is_affirmative(instance.get('countonly', False))
    ignore_missing = _is_affirmative(instance.get('ignore_missing', False))
    custom_tags = instance.get('tags', [])

    if exists(target):
        self._get_stats(target, name, dirtagname, filetagname, filegauges,
                        pattern, recursive, countonly, custom_tags)
        return

    # The same text is either logged or raised, depending on ignore_missing.
    missing = ('DirectoryCheck: the directory `{}` does not exist. Skipping.'
               .format(target))
    if not ignore_missing:
        raise Exception(missing)
    self.log.info(missing)
def check(self, instance):
    """Look for a new build on the configured server and send an event for it.

    :param instance: mapping of options; ``name``, ``server`` and
        ``build_configuration`` are mandatory.
    :raises Exception: when a mandatory option is missing; any error raised
        while fetching or processing the builds is logged and re-raised.
    """
    instance_name = instance.get('name')
    if instance_name is None:
        raise Exception("Each instance must have a unique name")

    ssl_validation = _is_affirmative(instance.get('ssl_validation', True))

    server = instance.get('server')
    # Test the value that was read. Comparing the literal 'server' with None
    # is never true, so a missing server used to slip through.
    if server is None:
        raise Exception("Each instance must have a server")

    build_conf = instance.get('build_configuration')
    if build_conf is None:
        raise Exception("Each instance must have a build configuration")

    host = instance.get('host_affected') or self.hostname
    tags = instance.get('tags')
    is_deployment = _is_affirmative(instance.get('is_deployment', False))
    basic_http_authentication = _is_affirmative(
        instance.get('basic_http_authentication', False))

    self._initialize_if_required(instance_name, server, build_conf,
                                 ssl_validation, basic_http_authentication)

    # Look for new successful builds
    if basic_http_authentication:
        new_build_url = self.NEW_BUILD_URL_AUTHENTICATED.format(
            server=server,
            build_conf=build_conf,
            since_build=self.last_build_ids[instance_name])
    else:
        new_build_url = self.NEW_BUILD_URL.format(
            server=server,
            build_conf=build_conf,
            since_build=self.last_build_ids[instance_name])

    try:
        resp = requests.get(new_build_url,
                            timeout=self.DEFAULT_TIMEOUT,
                            headers=self.HEADERS,
                            verify=ssl_validation)
        resp.raise_for_status()

        new_builds = resp.json()

        if new_builds["count"] == 0:
            self.log.debug("No new builds found.")
        else:
            # Only the first entry of the "build" list is reported.
            self._build_and_send_event(new_builds["build"][0],
                                       instance_name, is_deployment, host,
                                       tags)
    except requests.exceptions.HTTPError:
        self.log.exception(
            "Couldn't fetch last build, got code {0}".format(
                resp.status_code))
        raise
    except Exception:
        self.log.exception(
            "Couldn't fetch last build, unhandled exception")
        raise
def check(self, instance):
    """Collect statistics for the directory configured in ``instance``.

    :param instance: mapping of options; ``directory`` is mandatory, the
        other keys fall back to the defaults read below.
    :raises Exception: when ``directory`` is missing, or when the directory
        does not exist and ``ignore_missing`` is not affirmative.
    """
    if "directory" not in instance:
        raise Exception('DirectoryCheck: missing "directory" in config')

    directory = instance["directory"]
    abs_directory = abspath(directory)
    name = instance.get("name", directory)
    pattern = instance.get("pattern", "*")
    recursive = _is_affirmative(instance.get("recursive", False))
    dirtagname = instance.get("dirtagname", "name")
    filetagname = instance.get("filetagname", "filename")
    filegauges = _is_affirmative(instance.get("filegauges", False))
    countonly = _is_affirmative(instance.get("countonly", False))
    ignore_missing = _is_affirmative(instance.get("ignore_missing", False))
    custom_tags = instance.get("tags", [])

    if not exists(abs_directory):
        # The messages are written as plain literals: backslash
        # continuations inside a string literal leak stray characters
        # into the emitted text.
        if ignore_missing:
            self.log.info(
                "DirectoryCheck: the directory (%s) does not exist. "
                "Skipping.", abs_directory)
            return

        raise Exception(
            "DirectoryCheck: the directory (%s) does not exist"
            % abs_directory)

    self._get_stats(abs_directory, name, dirtagname, filetagname,
                    filegauges, pattern, recursive, countonly, custom_tags)
def check(self, instance):
    """Look for a new build on the configured server and send an event for it.

    :param instance: mapping of options; ``name``, ``server`` and
        ``build_configuration`` are mandatory.
    :raises Exception: when a mandatory option is missing; any error raised
        while fetching or processing the builds is logged and re-raised.
    """
    name = instance.get("name")
    if name is None:
        raise Exception("Each instance must have a unique name")

    raw_server = instance.get("server")
    if raw_server is None:
        raise Exception("Each instance must have a server")

    # Check the server URL for HTTP or HTTPS designation,
    # fall back to http:// if no scheme present (allows for backwards compatibility).
    server_url = self._normalize_server_url(raw_server)

    configuration = instance.get("build_configuration")
    if configuration is None:
        raise Exception("Each instance must have a build configuration")

    affected_host = instance.get("host_affected") or self.hostname
    event_tags = instance.get("tags")
    deployment = _is_affirmative(instance.get("is_deployment", False))
    use_basic_auth = _is_affirmative(
        instance.get("basic_http_authentication", False))

    self._initialize_if_required(name, server_url, configuration,
                                 use_basic_auth)

    # Look for new successful builds
    url_template = (self.NEW_BUILD_URL_AUTHENTICATED
                    if use_basic_auth else self.NEW_BUILD_URL)
    builds_url = url_template.format(
        server=server_url,
        build_conf=configuration,
        since_build=self.last_build_ids[name])

    try:
        response = self.http.get(builds_url)
        response.raise_for_status()

        found = response.json()

        if found["count"] != 0:
            # Only the first entry of the "build" list is reported.
            self._build_and_send_event(found["build"][0], name, deployment,
                                       affected_host, event_tags)
        else:
            self.log.debug("No new builds found.")
    except requests.exceptions.HTTPError:
        self.log.exception("Couldn't fetch last build, got code {}".format(
            response.status_code))
        raise
    except Exception:
        self.log.exception(
            "Couldn't fetch last build, unhandled exception")
        raise
def check(self, instance):
    """Read the instance options and run the four YARN metric collectors.

    :param instance: mapping of options for one check instance. Missing
        options fall back to the defaults read below; a missing
        ``cluster_name`` triggers a warning and ``DEFAULT_CLUSTER_NAME``.
    """
    # Get properties from conf file
    rm_address = instance.get('resourcemanager_uri', DEFAULT_RM_URI)
    app_tags = instance.get('application_tags', {})
    queue_blacklist = instance.get('queue_blacklist', [])

    # isinstance() also accepts dict subclasses, which an exact type
    # comparison would wrongly reject.
    if not isinstance(app_tags, dict):
        self.log.error(
            "application_tags is incorrect: {} is not a dictionary".format(
                app_tags))
        app_tags = {}

    # Keep only the tags whose source key is allowed. ``items()`` exists on
    # both Python 2 and 3, whereas ``iteritems()`` is Python 2 only.
    app_tags = {
        dd_prefix: yarn_key
        for dd_prefix, yarn_key in app_tags.items()
        if yarn_key in self._ALLOWED_APPLICATION_TAGS
    }

    # Collected by default
    app_tags['app_name'] = 'name'

    # Authenticate our connection to endpoint if required
    username = instance.get('username')
    password = instance.get('password')
    auth = None
    if username is not None and password is not None:
        auth = (username, password)

    # Option to disable verifying ssl certificate
    ssl_verify = _is_affirmative(instance.get('ssl_verify', True))

    # Get additional tags from the conf file
    custom_tags = instance.get('tags', [])
    tags = list(set(custom_tags))

    # Get the cluster name from the conf file
    cluster_name = instance.get('cluster_name')
    if cluster_name is None:
        self.warning(
            "The cluster_name must be specified in the instance configuration, "
            "defaulting to '{}'".format(DEFAULT_CLUSTER_NAME)
        )
        cluster_name = DEFAULT_CLUSTER_NAME

    tags.append('cluster_name:{}'.format(cluster_name))

    # Get metrics from the Resource Manager
    self._yarn_cluster_metrics(rm_address, auth, ssl_verify, tags)
    if _is_affirmative(
            instance.get('collect_app_metrics', DEFAULT_COLLECT_APP_METRICS)):
        self._yarn_app_metrics(rm_address, auth, ssl_verify, app_tags, tags)
    self._yarn_node_metrics(rm_address, auth, ssl_verify, tags)
    self._yarn_scheduler_metrics(rm_address, auth, ssl_verify, tags,
                                 queue_blacklist)
def _load_conf(self, instance):
    """Extract the settings of one instance and create its SNMP engine.

    :param instance: mapping of options; ``ip_address`` is mandatory.
    :returns: the tuple ``(snmp_engine, mib_view_controller, ip_address,
        tags, metrics, timeout, retries, enforce_constraints)``.
    :raises KeyError: when ``ip_address`` is missing from ``instance``.
    """
    tags = instance.get("tags", [])
    ip_address = instance["ip_address"]

    # Copy before extending: extending the list owned by ``instance`` would
    # add the global metrics to the configuration again on every call.
    metrics = list(instance.get('metrics') or [])
    if _is_affirmative(instance.get('use_global_metrics', True)):
        metrics.extend(self.init_config.get('global_metrics', []))

    timeout = int(instance.get('timeout', self.DEFAULT_TIMEOUT))
    retries = int(instance.get('retries', self.DEFAULT_RETRIES))
    enforce_constraints = _is_affirmative(
        instance.get('enforce_mib_constraints', True))

    snmp_engine, mib_view_controller = self.create_snmp_engine(
        self.mibs_path)

    return (snmp_engine, mib_view_controller, ip_address, tags, metrics,
            timeout, retries, enforce_constraints)
def check(self, instance):
    """Process the Prometheus endpoint, then run the health endpoint checks.

    :param instance: mapping of options; ``prometheus_endpoint`` is
        mandatory.
    :raises CheckException: when ``prometheus_endpoint`` is not configured.

    A connection error while processing the endpoint is reported through a
    CRITICAL service check and does not prevent the health checks from
    running.
    """
    # Metrics collection
    endpoint = instance.get('prometheus_endpoint')
    if endpoint is None:
        raise CheckException(
            "Unable to find prometheus_endpoint in config file.")

    # By default we send the buckets
    send_buckets = _is_affirmative(
        instance.get('send_histograms_buckets', True))
    custom_tags = instance.get('tags', [])

    try:
        self.process(endpoint,
                     send_histograms_buckets=send_buckets,
                     instance=instance)
        self.service_check(self.PROMETHEUS_SERVICE_CHECK_NAME,
                           PrometheusCheck.OK,
                           tags=custom_tags)
    except requests.exceptions.ConnectionError as e:
        # Unable to connect to the metrics endpoint.
        # Format the exception itself: the ``message`` attribute does not
        # exist on Python 3 exceptions and would raise inside this handler.
        self.service_check(
            self.PROMETHEUS_SERVICE_CHECK_NAME,
            PrometheusCheck.CRITICAL,
            message="Unable to retrieve Prometheus metrics from endpoint "
                    "{}: {}".format(endpoint, e),
            tags=custom_tags,
        )

    # Service check to check Gitlab's health endpoints
    for check_type in self.ALLOWED_SERVICE_CHECKS:
        self._check_health_endpoint(instance, check_type, custom_tags)
def _cache_morlist_raw(self, instance):
    """
    Initiate the first layer to refresh the list of MORs (`self.morlist`).

    Nothing is scheduled while a previous raw result for this instance is
    still pending for any resource type. Otherwise the raw list is reset,
    the discovery job is handed to `self.pool` and the refresh time is
    recorded in `self.cache_config`.
    """
    i_key = self._instance_key(instance)
    self.log.debug("Caching the morlist for vcenter instance %s" % i_key)

    for resource_type in RESOURCE_TYPE_METRICS:
        if i_key not in self.morlist_raw:
            continue
        if not len(self.morlist_raw[i_key].get(resource_type, [])) > 0:
            continue
        last = self.cache_config.get_last(CacheConfig.Morlist, i_key)
        elapsed = time.time() - last
        self.log.debug(
            "Skipping morlist collection now, RAW results "
            "processing not over (latest refresh was {}s ago)".format(
                elapsed))
        return

    self.morlist_raw[i_key] = {}

    instance_tag = "vcenter_server:%s" % instance.get('name')
    regexes = dict(
        host_include=instance.get('host_include_only_regex'),
        vm_include=instance.get('vm_include_only_regex'),
    )
    include_only_marked = _is_affirmative(
        instance.get('include_only_marked', False))

    # Discover hosts and virtual machines
    job_args = (instance, [instance_tag], regexes, include_only_marked)
    self.pool.apply_async(self._cache_morlist_raw_atomic, args=job_args)

    self.cache_config.set_last(CacheConfig.Morlist, i_key, time.time())
def _verify_ssl(self, instance):
    """Return the SSL verification setting derived from ``instance``.

    :returns: ``False`` when ``ssl_cert_validation`` is not affirmative,
        otherwise the value of ``ssl_ca_certs`` (``True`` when unset).
    """
    # Load the ssl configuration
    validate = _is_affirmative(instance.get('ssl_cert_validation', True))
    ca_certs = instance.get('ssl_ca_certs', True)
    if not validate:
        return False
    return ca_certs
def check(self, instance):
    """Process the status and/or ping URL configured in ``instance``.

    :param instance: mapping of options; at least one of ``status_url`` and
        ``ping_url`` must be set.
    :raises BadConfigError: when neither URL is configured.

    An error while processing the status URL is logged and does not prevent
    the ping URL from being processed.
    """
    status_url = instance.get('status_url')
    ping_url = instance.get('ping_url')
    ping_reply = instance.get('ping_reply')
    user = instance.get('user')
    password = instance.get('password')
    tags = instance.get('tags', [])
    http_host = instance.get('http_host')
    timeout = instance.get('timeout', DEFAULT_TIMEOUT)
    disable_ssl_validation = _is_affirmative(
        instance.get('disable_ssl_validation', False))

    # Credentials are only used when both parts are provided.
    auth = (user, password) if user and password else None

    if status_url is None and ping_url is None:
        raise BadConfigError(
            "No status_url or ping_url specified for this instance")

    pool = None
    if status_url is not None:
        try:
            pool = self._process_status(status_url, auth, tags, http_host,
                                        timeout, disable_ssl_validation)
        except Exception as e:
            self.log.error("Error running php_fpm check: {}".format(e))

    if ping_url is None:
        return
    self._process_ping(ping_url, ping_reply, auth, tags, pool, http_host,
                       timeout, disable_ssl_validation)
def check_leader_v1(self, config, tags):
    """Query ``/sys/leader``, tag leadership and report leader changes.

    :param config: mapping providing ``api_url``, ``leader`` and
        ``detect_leader``; ``leader`` is updated with the current leader
        address when detection is enabled and an address was returned.
    :param tags: list of tags; an ``is_leader`` tag is appended in place.
    """
    leader_url = config['api_url'] + '/sys/leader'
    payload = self.access_api(leader_url, config, tags).json()

    leading = _is_affirmative(payload.get('is_self'))
    tags.append('is_leader:{}'.format('true' if leading else 'false'))

    current_leader = payload.get('leader_address')
    previous_leader = config['leader']

    if not (config['detect_leader'] and current_leader):
        return

    # No event on the first observation (previous leader still unknown).
    leader_changed = (previous_leader is not None
                      and current_leader != previous_leader)
    if leader_changed:
        text = 'Leader changed from `{}` to `{}`.'.format(
            previous_leader, current_leader)
        self.event({
            'timestamp': timestamp(),
            'event_type': self.EVENT_LEADER_CHANGE,
            'msg_title': 'Leader change',
            'msg_text': text,
            'alert_type': 'info',
            'source_type_name': self.CHECK_NAME,
            'host': self.hostname,
            'tags': tags,
        })
    config['leader'] = current_leader
def _collect_raw(self, ceph_cmd, ceph_cluster, instance):
    """Run a fixed set of ceph sub-commands and return their parsed JSON.

    :param ceph_cmd: command used to invoke ceph.
    :param ceph_cluster: value passed to the ``--cluster`` option.
    :param instance: mapping of options; ``use_sudo`` prefixes the command
        with ``sudo`` after checking that sudo is usable.
    :returns: dict mapping each sub-command (spaces replaced by
        underscores) to its decoded output; failing commands are skipped
        with a warning.
    :raises Exception: when ``use_sudo`` is set but the sudo test fails.
    """
    if _is_affirmative(instance.get('use_sudo', False)):
        if os.system('setsid sudo -l < /dev/null') != 0:
            raise Exception('The dd-agent user does not have sudo access')
        base_cmd = 'sudo {}'.format(ceph_cmd)
    else:
        base_cmd = ceph_cmd
    base_cmd = '{} --cluster {}'.format(base_cmd, ceph_cluster)

    sub_commands = ('mon_status', 'status', 'df detail', 'osd pool stats',
                    'osd perf', 'health detail')
    raw = {}
    for cmd in sub_commands:
        try:
            args = '{} {} -fjson'.format(base_cmd, cmd)
            output, _, _ = get_subprocess_output(args.split(), self.log)
            res = json.loads(output)
        except Exception as e:
            self.log.warning('Unable to parse data from cmd=%s: %s' %
                             (cmd, str(e)))
        else:
            raw[cmd.replace(' ', '_')] = res

    return raw
def _cache_morlist_raw(self, instance):
    """
    Initiate the first layer to refresh the list of MORs (`self.morlist`).

    Nothing is done while a previous raw result for this instance is still
    pending for any resource type. Otherwise the raw list is reset,
    `self._discover_mor` is called and the refresh time is stored in
    `self.cache_times`.
    """
    i_key = self._instance_key(instance)
    self.log.debug("Caching the morlist for vcenter instance %s" % i_key)

    for resource_type in RESOURCE_TYPE_MAP:
        if i_key not in self.morlist_raw:
            continue
        if not len(self.morlist_raw[i_key].get(resource_type, [])) > 0:
            continue
        elapsed = time.time() - self.cache_times[i_key][MORLIST][LAST]
        self.log.debug(
            "Skipping morlist collection now, RAW results "
            "processing not over (latest refresh was {0}s ago)".format(
                elapsed))
        return

    self.morlist_raw[i_key] = {}

    instance_tag = "vcenter_server:%s" % instance.get('name')
    regexes = dict(
        host_include=instance.get('host_include_only_regex'),
        vm_include=instance.get('vm_include_only_regex'),
    )
    include_only_marked = _is_affirmative(
        instance.get('include_only_marked', False))

    # Discover hosts and virtual machines
    self._discover_mor(instance, [instance_tag], regexes,
                       include_only_marked)

    self.cache_times[i_key][MORLIST][LAST] = time.time()
def check(self, instance):
    """Fetch and process queue, topic and subscriber data for one instance.

    :param instance: mapping of options; the ``max_*`` limits default to
        ``MAX_ELEMENTS`` and the ``detailed_*`` lists to empty lists.

    Each of the three data sets is fetched in turn and only processed when
    the fetch returned something truthy.
    """
    url = instance.get("url")
    username = instance.get("username")
    password = instance.get("password")
    custom_tags = instance.get('tags', [])
    max_queues = int(instance.get("max_queues", MAX_ELEMENTS))
    max_topics = int(instance.get("max_topics", MAX_ELEMENTS))
    max_subscribers = int(instance.get("max_subscribers", MAX_ELEMENTS))
    detailed_queues = instance.get("detailed_queues", [])
    detailed_topics = instance.get("detailed_topics", [])
    detailed_subscribers = instance.get("detailed_subscribers", [])
    suppress_errors = _is_affirmative(
        instance.get("suppress_errors", False))

    tags = custom_tags + ["url:{0}".format(url)]

    self.log.debug("Processing ActiveMQ data for %s" % url)

    # (endpoint, handler) pairs, processed in this order.
    stages = (
        (QUEUE_URL,
         lambda found: self._process_data(found, "queue", tags, max_queues,
                                          detailed_queues)),
        (TOPIC_URL,
         lambda found: self._process_data(found, "topic", tags, max_topics,
                                          detailed_topics)),
        (SUBSCRIBER_URL,
         lambda found: self._process_subscriber_data(
             found, tags, max_subscribers, detailed_subscribers)),
    )
    for endpoint, handle in stages:
        data = self._fetch_data(url, endpoint, username, password,
                                suppress_errors)
        if data:
            handle(data)
def get_config(self, instance):
    """Return the cached configuration for ``instance``, building it once.

    :param instance: mapping of options; ``api_url`` is mandatory.
    :returns: the configuration dict, or ``None`` (after logging an error)
        when ``api_url`` is missing.

    The API version is taken from the last character of ``api_url``. An
    unknown version triggers a warning and falls back to
    ``self.DEFAULT_API_VERSION``.
    """
    instance_id = hash_mutable(instance)
    config = self.config.get(instance_id)
    if config is None:
        config = {}

        try:
            api_url = instance['api_url']
        except KeyError:
            self.log.error(
                'Vault configuration setting `api_url` is required')
            return

        # Slicing yields '' for an empty URL instead of raising IndexError;
        # '' is then handled like any other unknown version.
        api_version = api_url[-1:]
        version_spec = self.api_versions.get(api_version)
        if version_spec is None:
            default_version = self.DEFAULT_API_VERSION
            self.log.warning(
                'Unknown Vault API version `{}`, using version '
                '`{}`'.format(api_version, default_version))
            # The warning reports DEFAULT_API_VERSION as a version
            # identifier, so it is resolved through the version table
            # rather than indexed directly. A ready-made spec (a dict) is
            # still accepted as-is.
            if isinstance(default_version, dict):
                version_spec = default_version
            else:
                version_spec = self.api_versions[default_version]

        config['api_url'] = api_url
        config['api'] = version_spec['functions']

        client_token = instance.get('client_token')
        config['headers'] = {
            'X-Vault-Token': client_token
        } if client_token else None

        username = instance.get('username')
        password = instance.get('password')
        config['auth'] = (username,
                          password) if username and password else None

        config['ssl_verify'] = _is_affirmative(
            instance.get('ssl_verify', True))
        config['proxies'] = self.get_instance_proxy(
            instance, config['api_url'])
        config['timeout'] = int(instance.get('timeout', 20))
        config['tags'] = instance.get('tags', [])

        # Keep track of the previous cluster leader to detect changes.
        config['leader'] = None
        config['detect_leader'] = _is_affirmative(
            instance.get('detect_leader'))

        self.config[instance_id] = config

    return config
def __init__(self, name, init_config, agentConfig, instances=None):
    """Initialise the check and warn about disabled SSL validation.

    A warning is logged for every instance whose URL uses the ``https``
    scheme while ``disable_ssl_validation`` is affirmative.
    """
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self.cluster_name = None

    for conf in instances or []:
        url = conf.get('url', '')
        scheme = urlparse(url).scheme
        validation_disabled = _is_affirmative(
            conf.get('disable_ssl_validation', False))
        if validation_disabled and scheme == 'https':
            self.log.warning(
                'Skipping SSL cert validation for %s based on configuration.'
                % url)
def _get_pg_attrs(self, instance):
    """Return ``(connect, InterfaceError, ProgrammingError)`` for a driver.

    psycopg2 is used when ``use_psycopg2`` is affirmative and the module is
    available; otherwise pg8000 is used, with an error logged when psycopg2
    was requested but is ``None``.
    """
    wants_psycopg2 = _is_affirmative(instance.get('use_psycopg2', False))

    if wants_psycopg2 and psycopg2 is not None:
        return (psycopg2_connect, psycopg2.InterfaceError,
                psycopg2.ProgrammingError)

    if wants_psycopg2:
        self.log.error("Unable to import psycopg2, falling back to pg8000")

    # Let's use pg8000
    return pg8000.connect, pg8000.InterfaceError, pg8000.ProgrammingError
def _should_process(self, data_dict, collect_aggregates_only):
    """Tell whether ``data_dict`` must be processed.

    * affirmative ``collect_aggregates_only``: only aggregates are processed
    * ``'both'`` (case-insensitive): everything is processed
    * otherwise: everything except rows whose ``svname`` is the backend
    """
    if _is_affirmative(collect_aggregates_only):
        return self._is_aggregate(data_dict)

    mode = str(collect_aggregates_only).lower()
    return mode == 'both' or data_dict['svname'] != Services.BACKEND
def from_instance(instance):
    """
    Build an ``ESInstanceConfig`` from an instance dictionary.

    Raises ``ConfigurationError`` when no URL is configured.
    """
    url = instance.get('url')
    if not url:
        raise ConfigurationError("A URL must be specified in the instance")

    pshard_stats = _is_affirmative(instance.get('pshard_stats', False))
    pshard_graceful_to = _is_affirmative(
        instance.get('pshard_graceful_timeout', False))
    index_stats = _is_affirmative(instance.get('index_stats', False))
    cluster_stats = _is_affirmative(instance.get('cluster_stats', False))
    # When present, `is_external` takes precedence over `cluster_stats`.
    if 'is_external' in instance:
        cluster_stats = _is_affirmative(instance.get('is_external', False))
    pending_task_stats = _is_affirmative(
        instance.get('pending_task_stats', True))
    admin_forwarder = _is_affirmative(instance.get('admin_forwarder', False))

    # Support URLs that have a path in them from the config, for
    # backwards-compatibility.
    parts = urlparse.urlparse(url)
    if parts.path and not admin_forwarder:
        url = '{}://{}'.format(parts.scheme, parts.netloc)
    port = parts.port
    host = parts.hostname

    custom_tags = instance.get('tags', [])
    service_check_tags = ['host:{}'.format(host), 'port:{}'.format(port)]
    service_check_tags += list(custom_tags)

    # Tag by URL so we can differentiate the metrics
    # from multiple instances
    tags = ['url:{}'.format(url)] + list(custom_tags)

    timeout = instance.get('timeout') or DEFAULT_TIMEOUT

    return ESInstanceConfig(
        admin_forwarder=admin_forwarder,
        pshard_stats=pshard_stats,
        pshard_graceful_to=pshard_graceful_to,
        cluster_stats=cluster_stats,
        index_stats=index_stats,
        password=instance.get('password'),
        service_check_tags=service_check_tags,
        health_tags=[],
        ssl_cert=instance.get('ssl_cert'),
        ssl_key=instance.get('ssl_key'),
        ssl_verify=instance.get('ssl_verify'),
        tags=tags,
        timeout=timeout,
        url=url,
        username=instance.get('username'),
        pending_task_stats=pending_task_stats,
    )
def get_instance_config(self, instance):
    """Build an ``ESInstanceConfig`` from an instance dictionary.

    :raises Exception: when ``url`` is missing from ``instance``.
    """
    url = instance.get('url')
    if url is None:
        raise Exception("A URL must be specified in the instance")

    pshard_stats = _is_affirmative(instance.get('pshard_stats', False))
    pshard_graceful_to = _is_affirmative(
        instance.get('pshard_graceful_timeout', False))
    index_stats = _is_affirmative(instance.get('index_stats', False))
    cluster_stats = _is_affirmative(instance.get('cluster_stats', False))
    # When present, `is_external` takes precedence over `cluster_stats`.
    if 'is_external' in instance:
        cluster_stats = _is_affirmative(instance.get('is_external', False))
    pending_task_stats = _is_affirmative(
        instance.get('pending_task_stats', True))
    admin_forwarder = _is_affirmative(instance.get('admin_forwarder', False))

    # Support URLs that have a path in them from the config, for
    # backwards-compatibility.
    parts = urlparse.urlparse(url)
    if parts.path != "" and not admin_forwarder:
        url = "%s://%s" % (parts.scheme, parts.netloc)
    port = parts.port
    host = parts.hostname

    custom_tags = instance.get('tags', [])
    service_check_tags = ['host:%s' % host, 'port:%s' % port]
    service_check_tags += list(custom_tags)

    # Tag by URL so we can differentiate the metrics
    # from multiple instances
    tags = ['url:%s' % url] + list(custom_tags)

    timeout = instance.get('timeout') or self.DEFAULT_TIMEOUT

    return ESInstanceConfig(
        admin_forwarder=admin_forwarder,
        pshard_stats=pshard_stats,
        pshard_graceful_to=pshard_graceful_to,
        cluster_stats=cluster_stats,
        index_stats=index_stats,
        password=instance.get('password'),
        service_check_tags=service_check_tags,
        health_tags=[],
        ssl_cert=instance.get('ssl_cert'),
        ssl_key=instance.get('ssl_key'),
        ssl_verify=instance.get('ssl_verify'),
        tags=tags,
        timeout=timeout,
        url=url,
        username=instance.get('username'),
        pending_task_stats=pending_task_stats,
    )
def check(self, instance):
    """Report task service checks, task metrics and stats metrics.

    :param instance: mapping of options; ``url`` is mandatory.
    :raises Exception: when ``url`` is missing from ``instance``.

    State data comes from ``_get_constant_attributes`` and, when that
    returns ``None``, from ``_get_state``. Stats metrics are tagged with the
    state-derived tags when available, otherwise with the instance tags.
    """
    if 'url' not in instance:
        raise Exception('Mesos instance missing "url" value.')

    url = instance['url']
    instance_tags = instance.get('tags', [])
    if instance_tags is None:
        instance_tags = []
    tasks = instance.get('tasks', [])
    default_timeout = self.init_config.get('default_timeout', 5)
    timeout = float(instance.get('timeout', default_timeout))
    master_port = instance.get("master_port", DEFAULT_MASTER_PORT)
    ssl_verify = not _is_affirmative(
        instance.get('disable_ssl_validation', False))

    state = self._get_constant_attributes(url, timeout, master_port,
                                          ssl_verify, instance_tags)
    tags = None

    if state is None:
        state = self._get_state(url, timeout, ssl_verify, instance_tags)

    if state:
        tags = ['mesos_pid:{0}'.format(state['pid']), 'mesos_node:slave']
        if self.cluster_name:
            tags.append('mesos_cluster:{0}'.format(self.cluster_name))
        tags += instance_tags

        for task in tasks:
            for framework in state['frameworks']:
                for executor in framework['executors']:
                    for t in executor['tasks']:
                        # Substring match on the name, restricted to the
                        # tasks whose slave_id equals the state's id.
                        if task.lower() not in t['name'].lower():
                            continue
                        if not t['slave_id'] == state['id']:
                            continue
                        task_tags = ['task_name:' + t['name']] + tags
                        self.service_check(t['name'] + '.ok',
                                           self.TASK_STATUS[t['state']],
                                           tags=task_tags)
                        for key_name, (metric_name, metric_func) in \
                                iteritems(self.TASK_METRICS):
                            metric_func(self, metric_name,
                                        t['resources'][key_name],
                                        tags=task_tags)

    stats = self._get_stats(url, timeout, ssl_verify, instance_tags)
    if stats:
        if not tags:
            tags = instance_tags
        metric_groups = [
            self.SLAVE_TASKS_METRICS,
            self.SYSTEM_METRICS,
            self.SLAVE_RESOURCE_METRICS,
            self.SLAVE_EXECUTORS_METRICS,
            self.STATS_METRICS,
        ]
        for group in metric_groups:
            for key_name, (metric_name, metric_func) in iteritems(group):
                if key_name not in stats:
                    continue
                metric_func(self, metric_name, stats[key_name], tags=tags)

    self.service_check_needed = True
def _load_conf(self, instance):
    """Extract the settings of one instance and its command generator.

    :param instance: mapping of options; ``ip_address`` and ``name`` are
        mandatory.
    :returns: the tuple ``(cmd_generator, ip_address, tags, metrics,
        timeout, retries, enforce_constraints)``.

    The command generator is created on first use and then reused from
    ``self.generators``, keyed by the instance name.
    """
    tags = instance.get("tags", [])
    ip_address = instance["ip_address"]
    metrics = instance.get('metrics', [])
    timeout = int(instance.get('timeout', self.DEFAULT_TIMEOUT))
    retries = int(instance.get('retries', self.DEFAULT_RETRIES))
    enforce_constraints = _is_affirmative(
        instance.get('enforce_mib_constraints', True))

    key = instance['name']
    generator = self.generators.get(key, None)
    if not generator:
        generator = self.create_command_generator(
            self.mibs_path, self.ignore_nonincreasing_oid)
        self.generators[key] = generator

    return (generator, ip_address, tags, metrics, timeout, retries,
            enforce_constraints)
def __init__(self, name, init_config, agentConfig, instances):
    """Name unnamed instances, load the init options and initialise the base.

    :param init_config: mapping of global options, or ``None``; when
        ``None`` every option read here keeps its default.
    :param instances: iterable of instance mappings; an instance without a
        ``name`` gets one from ``self._get_instance_key``.
    """
    for instance in instances or []:
        if 'name' not in instance:
            instance['name'] = self._get_instance_key(instance)

    # Defaults, used as-is when there is no init_config. All reads happen
    # under the same None guard so a missing init_config cannot raise.
    oid_batch_size = DEFAULT_OID_BATCH_SIZE
    self.mibs_path = None
    self.ignore_nonincreasing_oid = False
    if init_config is not None:
        # Set OID batch size
        oid_batch_size = init_config.get("oid_batch_size",
                                         DEFAULT_OID_BATCH_SIZE)
        # Load Custom MIB directory
        self.mibs_path = init_config.get("mibs_folder")
        self.ignore_nonincreasing_oid = _is_affirmative(
            init_config.get("ignore_nonincreasing_oid", False))
    self.oid_batch_size = int(oid_batch_size)

    NetworkCheck.__init__(self, name, init_config, agentConfig, instances)
def get_instance_config(self, instance):
    """Extract the connection settings of one instance.

    :param instance: mapping of options; ``url`` is mandatory.
    :returns: the tuple ``(url, auth, acs_url, ssl_verify, group, tags,
        timeout)``.
    :raises Exception: when ``url`` is missing from ``instance``.
    """
    if 'url' not in instance:
        raise Exception('Marathon instance missing "url" value.')

    # Load values from the instance config
    url = instance['url']
    user = instance.get('user')
    password = instance.get('password')
    acs_url = instance.get('acs_url')

    # Credentials are only used when both parts are provided.
    has_credentials = user is not None and password is not None
    auth = (user, password) if has_credentials else None

    validation_disabled = _is_affirmative(
        instance.get('disable_ssl_validation', False))
    ssl_verify = not validation_disabled
    group = instance.get('group', None)
    tags = instance.get('tags', [])

    fallback_timeout = self.init_config.get('default_timeout',
                                            self.DEFAULT_TIMEOUT)
    timeout = float(instance.get('timeout', fallback_timeout))

    return url, auth, acs_url, ssl_verify, group, tags, timeout
def _connect(self, instance):
    """Open the S3 connection described by ``instance``.

    :param instance: mapping of options; ``access_id`` and
        ``access_secret`` are mandatory.
    :returns: the tuple ``(s3, aggregation_key, tags, metrics)``.
    :raises Exception: when a mandatory option is missing; a connection
        failure is logged, reported as a CRITICAL service check and
        re-raised.
    """
    for required in ("access_id", "access_secret"):
        if required not in instance:
            raise Exception("{0} parameter is required.".format(required))

    s3_settings = {
        "aws_access_key_id": instance.get('access_id', None),
        "aws_secret_access_key": instance.get('access_secret', None),
        "proxy": instance.get('host', 'localhost'),
        "proxy_port": int(instance.get('port', 8080)),
        "is_secure": _is_affirmative(instance.get('is_secure', True))
    }

    if instance.get('s3_root'):
        s3_settings['host'] = instance['s3_root']

    aggregation_key = s3_settings['proxy'] + ":" + str(
        s3_settings['proxy_port'])

    # Copy the configured tags: appending to the list owned by ``instance``
    # would add one more aggregation_key tag on every call.
    tags = list(instance.get("tags") or [])
    tags.append("aggregation_key:{0}".format(aggregation_key))

    try:
        s3 = S3Connection(**s3_settings)
    except Exception as e:
        self.log.error("Error connecting to {0}: {1}".format(
            aggregation_key, e))
        self.service_check(self.SERVICE_CHECK_NAME,
                           AgentCheck.CRITICAL,
                           tags=tags,
                           message=str(e))
        raise

    metrics = instance.get("metrics", [])

    return s3, aggregation_key, tags, metrics
def _cache_morlist_raw(self, instance):
    """
    Initiate the first layer to refresh the list of MORs (`self.morlist`).

    Nothing is scheduled while `self.mor_objects_queue` still holds objects
    for this instance and any resource type. Otherwise the discovery job is
    handed to `self.pool` and the refresh time is recorded in
    `self.cache_config`.
    """
    i_key = self._instance_key(instance)
    self.log.debug("Caching the morlist for vcenter instance %s" % i_key)

    # If the queue is not completely empty, don't do anything
    for resource_type in RESOURCE_TYPE_METRICS:
        if not self.mor_objects_queue.contains(i_key):
            continue
        if not self.mor_objects_queue.size(i_key, resource_type):
            continue
        last = self.cache_config.get_last(CacheConfig.Morlist, i_key)
        elapsed = time.time() - last
        self.log.debug(
            "Skipping morlist collection: the objects queue for the "
            "resource type '{}' is still being processed "
            "(latest refresh was {}s ago)".format(resource_type, elapsed))
        return

    instance_tag = "vcenter_server:%s" % instance.get('name')
    regexes = dict(
        host_include=instance.get('host_include_only_regex'),
        vm_include=instance.get('vm_include_only_regex'),
    )
    include_only_marked = _is_affirmative(
        instance.get('include_only_marked', False))

    # Discover hosts and virtual machines
    job_args = (instance, [instance_tag], regexes, include_only_marked)
    self.pool.apply_async(self._cache_morlist_raw_atomic, args=job_args)

    self.cache_config.set_last(CacheConfig.Morlist, i_key, time.time())
def _psutil_config_to_stats(self, instance):
    """
    Reads the `process_metrics` option (from `instance`, falling back to
    `init_config`) and collects the stats of every active entry.

    :returns: a `(stats, names_to_metric_types)` tuple, where
        `names_to_metric_types` maps each statistic name to the configured
        metric type (`GAUGE` when unset). Both are empty dicts when no
        entry is active.

    NOTE(review): when no metrics are configured at all, a bare `{}` is
    returned instead of a tuple. That shape is kept for backward
    compatibility; confirm how callers consume it.
    """
    process_metrics = instance.get(
        'process_metrics', self.init_config.get('process_metrics', None))
    if not process_metrics:
        self.log.error('No metrics configured for AgentMetrics check!')
        return {}

    active = [(p['name'], p.get('type', GAUGE)) for p in process_metrics
              if _is_affirmative(p.get('active'))]
    if not active:
        # Unpacking zip(*[]) into two names raises ValueError, so the
        # "nothing active" case is answered directly.
        return {}, {}

    methods = tuple(method for method, _ in active)
    names_to_metric_types = {
        AgentMetrics._get_statistic_name_from_method(method): metric_type
        for method, metric_type in active
    }

    stats = AgentMetrics._collect_internal_stats(methods)
    return stats, names_to_metric_types
def _check_connectivity_to_master(self, instance, tags):
    """Submit a service check reflecting whether ``gitlab_url`` answers 200.

    :param instance: mapping of options; nothing is done when
        ``gitlab_url`` is not configured.
    :param tags: tags appended to the host/port tags of the service check.
    :raises Exception: on timeout, request error or non-200 status, after
        exactly one CRITICAL service check has been submitted.
    """
    url = instance.get('gitlab_url')
    if url is None:
        # Simply ignore this service check if not configured
        return

    parsed_url = urlparse(url)
    gitlab_host = parsed_url.hostname
    # An explicit port always wins; the scheme only selects the default.
    # Previously an https URL was always tagged with 443.
    default_port = 443 if parsed_url.scheme == 'https' else 80
    gitlab_port = parsed_url.port or default_port
    service_check_tags = [
        'gitlab_host:{}'.format(gitlab_host),
        'gitlab_port:{}'.format(gitlab_port)
    ]
    service_check_tags.extend(tags)

    # Load the ssl configuration
    ssl_cert_validation = _is_affirmative(
        instance.get('ssl_cert_validation', True))
    ssl_ca_certs = instance.get('ssl_ca_certs', True)
    verify_ssl = ssl_ca_certs if ssl_cert_validation else False

    # Timeout settings
    timeouts = (
        int(instance.get('connect_timeout',
                         GitlabRunnerCheck.DEFAULT_CONNECT_TIMEOUT)),
        int(instance.get('receive_timeout',
                         GitlabRunnerCheck.DEFAULT_RECEIVE_TIMEOUT)),
    )

    # Auth settings
    auth = None
    if 'gitlab_user' in instance and 'gitlab_password' in instance:
        auth = (instance['gitlab_user'], instance['gitlab_password'])

    try:
        self.log.debug("checking connectivity against {}".format(url))
        r = requests.get(url,
                         auth=auth,
                         verify=verify_ssl,
                         timeout=timeouts,
                         headers=headers(self.agentConfig))
    except requests.exceptions.Timeout:
        # If there's a timeout
        self.service_check(
            self.MASTER_SERVICE_CHECK_NAME,
            OpenMetricsBaseCheck.CRITICAL,
            message="Timeout when hitting {}".format(url),
            tags=service_check_tags,
        )
        raise
    except Exception as e:
        self.service_check(
            self.MASTER_SERVICE_CHECK_NAME,
            OpenMetricsBaseCheck.CRITICAL,
            message="Error hitting {}. Error: {}".format(url, e),
            tags=service_check_tags,
        )
        raise

    # The status is checked outside the try block so that the exception
    # raised below is not caught by the generic handler above, which used
    # to submit a second CRITICAL service check for the same failure.
    if r.status_code != 200:
        self.service_check(
            self.MASTER_SERVICE_CHECK_NAME,
            OpenMetricsBaseCheck.CRITICAL,
            message="Got {} when hitting {}".format(r.status_code, url),
            tags=service_check_tags,
        )
        raise Exception("Http status code {} on url {}".format(
            r.status_code, url))

    self.service_check(self.MASTER_SERVICE_CHECK_NAME,
                       OpenMetricsBaseCheck.OK,
                       tags=service_check_tags)
    self.log.debug("gitlab check succeeded")