def __init__(self, name, init_config, agentConfig, instances):
    """Set up per-service status tracking and start the worker pool."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    # Last observed status and last notification sent, keyed per service.
    self.notified = {}
    self.statuses = {}
    # Spin up the thread pool used to run the service checks.
    self.start_pool()
def __init__(self, name, init_config, agentConfig, instances=None):
    """Expand every base stat key with each percentile/aggregate suffix."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    suffixes = ["mean", "median", "95", "99", "100"]
    self.keys.extend(base + "_" + suffix
                     for suffix in suffixes
                     for base in self.stat_keys)
    # Sentinel: no coordinated-redirects total has been observed yet.
    self.prev_coord_redirs_total = -1
def __init__(self, name, init_config, agentConfig, instances=None):
    """Single-instance Kubernetes check; requires a reachable kubelet host."""
    if instances and len(instances) > 1:
        raise Exception('Kubernetes check only supports one configured instance.')
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self.kubeutil = KubeUtil()
    # Without a resolvable host there is nothing to query.
    if not self.kubeutil.host:
        raise Exception('Unable to get default router and host parameter is not set')
def __init__(self, name, init_config, agentConfig, instances=None):
    """Initialize the caches that throttle expensive per-PID lookups."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)

    # "ad" = access denied. PIDs that raised this error are cached and not
    # re-examined more often than `access_denied_cache_duration`. The cache
    # covers all PIDs (global) but is refreshed per instance.
    self.ad_cache = set()
    self.last_ad_cache_ts = {}
    self.access_denied_cache_duration = int(
        init_config.get('access_denied_cache_duration', DEFAULT_AD_CACHE_DURATION)
    )

    # PID-list cache, indexed per instance. Caching is on by default but can
    # be undesirable since it may interfere with no-data monitoring.
    self.pid_cache = {}
    self.last_pid_cache_ts = {}
    self.pid_cache_duration = int(
        init_config.get('pid_cache_duration', DEFAULT_PID_CACHE_DURATION)
    )

    # Process-object cache, one dict per instance.
    self.process_cache = defaultdict(dict)
def __init__(self, name, init_config, agentConfig, instances=None):
    """Generic Prometheus scraper setup: type table, parsing regexes, and
    the hooks that child checks override."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    # message.type is the index in this list
    # see: https://github.com/prometheus/client_model/blob/master/ruby/lib/prometheus/client/model/metrics.pb.rb
    self.METRIC_TYPES = ['counter', 'gauge', 'summary', 'untyped', 'histogram']
    # Regexes for the Prometheus text exposition format. Do not overwrite.
    self.metrics_pattern = re.compile(r'^(\w+)(.*)\s+([0-9.+eE,]+)$')
    self.lbl_pattern = re.compile(r'(\w+)="(.*?)"')
    # Prefix for every submitted metric; hardcoded by the child check class.
    self.NAMESPACE = ''
    # Prometheus metric name -> datadog metric name. Empty here; the child
    # check fills it so its metrics are not counted as custom metrics.
    self.metrics_mapper = {}
    # Label names listed here are renamed to the mapped value when tagging.
    self.labels_mapper = {}
    # Label names that must not be turned into tags at submission time.
    self.exclude_labels = []
def __init__(self, name, init_config, agentConfig, instances):
    """Set up vCenter connection state and the two-layer MOR caches."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self.time_started = time.time()
    self.pool_started = False
    self.exceptionq = Queue()

    # Connections open to vCenter instances
    self.server_instances = {}

    # Per-instance cache timestamps and refresh intervals.
    morlist_interval = init_config.get('refresh_morlist_interval',
                                       REFRESH_MORLIST_INTERVAL)
    metadata_interval = init_config.get('refresh_metrics_metadata_interval',
                                        REFRESH_METRICS_METADATA_INTERVAL)
    self.cache_times = {
        self._instance_key(instance): {
            MORLIST: {LAST: 0, INTERVAL: morlist_interval},
            METRICS_METADATA: {LAST: 0, INTERVAL: metadata_interval},
        }
        for instance in self.instances
    }

    # First cache layer: raw entities pulled from the vSphere tree.
    self.morlist_raw = {}
    # Second layer, processed from the first one.
    self.morlist = {}
    # Metrics metadata: perfCounterId -> {name, group, description}.
    self.metrics_metadata = {}
    self.latest_event_query = {}
def __init__(self, name, init_config, agentConfig, instances=None):
    """Build an HTTP opener that can also talk over Unix sockets."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    # A configured timeout of 0 (or none) falls back to the default.
    configured_timeout = int(init_config.get('socket_timeout', 0))
    UnixHTTPConnection.socket_timeout = configured_timeout or DEFAULT_SOCKET_TIMEOUT
    self.url_opener = urllib2.build_opener(UnixSocketHandler())
def __init__(self, name, init_config, agentConfig, instances=None):
    """Single-instance disk check; configuration is read once up front."""
    if instances is not None and len(instances) > 1:
        raise Exception("Disk check only supports one configured instance.")
    AgentCheck.__init__(self, name, init_config, agentConfig, instances=instances)
    # NOTE(review): assumes a non-empty `instances` list is always passed
    # despite the None default — confirm against the caller.
    self._load_conf(instances[0])
def __init__(self, name, init_config, agentConfig):
    """Keep one JMX connector per instance plus the collected metrics."""
    AgentCheck.__init__(self, name, init_config, agentConfig)
    self.init_config = init_config
    self.jmx_metrics = []
    # One JMX connector per configured instance.
    self.jmxs = {}
def __init__(self, name, init_config, agentConfig, instances):
    """Track per-service statuses; the worker pool starts lazily."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    # Last known status and notification bookkeeping, per service.
    self.notified = {}
    self.statuses = {}
    self.nb_failures = 0
    self.pool_started = False
def __init__(self, name, init_config, agentConfig, instances):
    """Start the worker pool immediately; no-data results are allowed."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances,
                        allow_no_data=True)
    # Per-service status and notification bookkeeping.
    self.notified = {}
    self.statuses = {}
    self.start_pool()
    self.nb_failures = 0
def __init__(self, name, init_config, agentConfig, instances=None):
    """Parse the configured counters once, at construction time."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    # Counters that apply to every job.
    self.general_counters = self._parse_general_counters(init_config)
    # Counters scoped to specific jobs.
    self.job_specific_counters = self._parse_job_specific_counters(init_config)
def __init__(self, name, init_config, agentConfig, instances=None):
    """Initialize the SNMP command generator, honoring a custom MIB folder."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    # The custom MIB directory is optional.
    mibs_folder = init_config.get("mibs_folder") if init_config is not None else None
    SnmpCheck.create_command_generator(mibs_folder)
def __init__(self, name, init_config, agentConfig, instances=None):
    """Warn up-front about any HTTPS instance with SSL validation disabled.

    Idiom fix: the warning now uses lazy logging arguments instead of eager
    `%` formatting; the rendered message is unchanged.
    """
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    for instance in instances or []:
        url = instance.get('url', '')
        parsed_url = urlparse(url)
        ssl_verify = not _is_affirmative(instance.get('disable_ssl_validation', False))
        # Only worth flagging when the scheme is actually https.
        if not ssl_verify and parsed_url.scheme == 'https':
            self.log.warning('Skipping SSL cert validation for %s based on configuration.', url)
def __init__(self, name, init_config, agentConfig, instances=None):
    """Consul check; only a single configured instance is supported."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    if instances is not None and len(instances) > 1:
        raise Exception("Consul check only supports one configured instance.")
    # Cached agent config and leader bookkeeping, filled on first run.
    self._last_known_leader = None
    self._last_config_fetch_time = None
    self._local_config = None
def __init__(self, name, init_config, agentConfig, instances=None):
    """MongoDB-style check state shared across runs."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    # Per-instance list of metrics to collect.
    self.metrics_to_collect_by_instance = {}
    # Last replica-set state seen for each member.
    self._last_state_by_server = {}
def __init__(self, name, init_config, agentConfig, instances=None):
    """Single-instance Docker check; heavy setup is deferred to init()."""
    if instances is not None and len(instances) > 1:
        raise Exception("Docker check only supports one configured instance.")
    AgentCheck.__init__(self, name, init_config, agentConfig, instances=instances)
    # init() is expected to flip this once it completes successfully.
    self.init_success = False
    self.init()
def __init__(self, name, init_config, agentConfig, instances=None):
    """Postgres check: connection cache plus per-instance metric caches."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self.dbs = {}
    self.versions = {}
    # Per-instance metric caches.
    self.bgw_metrics = {}
    self.instance_metrics = {}
    self.replication_metrics = {}
    self.db_bgw_metrics = []
    self.db_instance_metrics = []
def __init__(self, *args, **kwargs):
    """Build the Google API service client from the configured key file."""
    AgentCheck.__init__(self, *args, **kwargs)
    key_file_location = self.init_config.get('key_file_location')
    self.log.info('key_file_location: %s' % key_file_location)
    self.service = self.get_service(
        self.apiName, self.version, self.scope, key_file_location)
def __init__(self, name, init_config, agentConfig):
    """JMX connector bookkeeping, including per-instance reconnect counts."""
    AgentCheck.__init__(self, name, init_config, agentConfig)
    self.init_config = init_config
    self.jmx_metrics = []
    # One JMX connector per instance.
    self.jmxs = {}
    # How many times a new JMX connector was opened for each instance.
    self.jmx_connections_watcher = {}
def __init__(self, name, init_config, agentConfig, instances=None):
    """SNMP setup: optional MIB folder plus the non-increasing-OID toggle."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    if init_config is not None:
        mibs_folder = init_config.get("mibs_folder")
        skip_nonincreasing = _is_affirmative(init_config.get("ignore_nonincreasing_oid", False))
    else:
        mibs_folder = None
        skip_nonincreasing = False
    SnmpCheck.create_command_generator(mibs_folder, skip_nonincreasing)
def __init__(self, name, init_config, agentConfig, instances=None):
    """Single-instance Docker check with service-discovery detection."""
    if instances is not None and len(instances) > 1:
        raise Exception("Docker check only supports one configured instance.")
    AgentCheck.__init__(self, name, init_config, agentConfig, instances=instances)
    # init() is expected to flip this once it completes successfully.
    self.init_success = False
    self.init()
    # Service discovery is active only when enabled AND backed by docker.
    sd_enabled = agentConfig.get('service_discovery')
    sd_backend = agentConfig.get('service_discovery_backend')
    self._service_discovery = sd_enabled and sd_backend == 'docker'
def __init__(self, name, init_config, agentConfig):
    """Probe for the redis module and record whether the check is usable.

    Consistency fix: the sibling variant of this constructor records the
    import result in `self.enabled`; this one logged "Skipping check" but
    recorded nothing, leaving no way to actually skip. The flag is now set
    here too (a backward-compatible attribute addition).
    """
    AgentCheck.__init__(self, name, init_config, agentConfig)
    try:
        import redis  # noqa: F401
        self.enabled = True
    except ImportError:
        self.enabled = False
        self.log.error('redisdb.yaml exists but redis module can not be imported. Skipping check.')
    # Per-connection bookkeeping, persisted across runs.
    self.previous_total_commands = {}
    self.connections = {}
def __init__(self, name, init_config, agentConfig):
    """Resolve cgroup mountpoints and prepare the Unix-socket HTTP opener."""
    AgentCheck.__init__(self, name, init_config, agentConfig)
    docker_root = init_config.get('docker_root', '/')
    # Map each cgroup controller to its mountpoint under docker_root.
    self._mountpoints = {
        metric["cgroup"]: self._find_cgroup(metric["cgroup"], docker_root)
        for metric in CGROUP_METRICS
    }
    self._last_event_collection_ts = defaultdict(lambda: None)
    self.url_opener = urllib2.build_opener(UnixSocketHandler())
    self.should_get_size = True
    self._cgroup_filename_pattern = None
def __init__(self, name, init_config, agentConfig, instances=None):
    """Single-instance Kubernetes check; passes the instance to KubeUtil."""
    if instances is not None and len(instances) > 1:
        raise Exception('Kubernetes check only supports one configured instance.')
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    # Hand the (single) instance config to KubeUtil when one was given.
    self.kubeutil = KubeUtil(instance=instances[0] if instances is not None else None)
    if not self.kubeutil.host:
        raise Exception('Unable to retrieve Docker hostname and host parameter is not set')
def __init__(self, name, init_config, agentConfig, instances=None):
    """MongoDB-style check state plus the short collection-metric names.

    Idiom fix: the original iterated `.iteritems()` while discarding the
    value and appended in a manual loop; iterating the dict directly yields
    the same keys in the same order, so behavior is unchanged.
    """
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    # Members' last replica set states
    self._last_state_by_server = {}
    # List of metrics to collect per instance
    self.metrics_to_collect_by_instance = {}
    # Second dotted component of each collection metric name.
    self.collection_metrics_names = [
        key.split('.')[1] for key in self.COLLECTION_METRICS
    ]
def __init__(self, name, init_config, agentConfig):
    """Register custom metrics from conf.d/sqlserver.yaml; cache connections."""
    AgentCheck.__init__(self, name, init_config, agentConfig)
    for row in init_config.get("custom_metrics", []):
        metric_type = row["type"]
        if metric_type not in VALID_METRIC_TYPES:
            # NOTE(review): invalid-type rows are still appended below —
            # presumably intentional (warn but keep); confirm.
            self.log.error("%s has an invalid metric type: %s" % (row["name"], metric_type))
        self.METRICS.append(
            (row["name"], metric_type, row["counter_name"],
             row.get("instance_name", ""), row.get("tag_by", None))
        )
    # Cache connections
    self.connections = {}
def __init__(self, name, init_config, agentConfig):
    """Probe for the redis module; the check is disabled when it's missing."""
    AgentCheck.__init__(self, name, init_config, agentConfig)
    try:
        import redis  # noqa: F401
    except ImportError:
        self.enabled = False
        self.log.error('redisdb.yaml exists but redis module can not be imported. Skipping check.')
    else:
        self.enabled = True
    # Per-connection bookkeeping, persisted across runs.
    self.previous_total_commands = {}
    self.connections = {}
def __init__(self, name, init_config, agentConfig):
    """Register custom metrics from conf.d/sqlserver.yaml; cache connections."""
    AgentCheck.__init__(self, name, init_config, agentConfig)
    # Load any custom metrics from conf.d/sqlserver.yaml
    for entry in init_config.get('custom_metrics', []):
        metric_name, metric_type = entry['name'], entry['type']
        if metric_type not in VALID_METRIC_TYPES:
            # NOTE(review): invalid-type rows are still appended below —
            # presumably intentional (warn but keep); confirm.
            self.log.error('%s has an invalid metric type: %s' % (metric_name, metric_type))
        self.METRICS.append(
            (metric_name, metric_type, entry['counter_name'],
             entry.get('instance_name', ''), entry.get('tag_by', None))
        )
    # Cache connections
    self.connections = {}
def __init__(self, name, init_config, agentConfig, instances = None):
    """Build, per instance, the list of typed metrics to collect.

    Combines the check's built-in METRICS with any `custom_metrics` from
    init_config; metrics whose SQL type cannot be resolved are skipped with
    a warning instead of failing the whole constructor.

    Bug fix: the custom-metrics warning previously logged the stale loop
    variable `name` (left over from the built-in loop, which also shadowed
    this method's `name` parameter) instead of the failing custom metric's
    own name. The loop variable is renamed and the warning now uses
    `row['name']`.
    """
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    # Cache connections
    self.connections = {}
    self.instances_metrics = {}
    for instance in instances:
        metrics_to_collect = []
        # Built-in metrics: resolve each counter's SQL type up front.
        for metric_name, counter_name, instance_name in self.METRICS:
            try:
                sql_type, base_name = self.get_sql_type(instance, counter_name)
                metrics_to_collect.append(self.typed_metric(metric_name,
                                                            counter_name,
                                                            base_name,
                                                            None,
                                                            sql_type,
                                                            instance_name,
                                                            None))
            except Exception:
                self.log.warning("Can't load the metric %s, ignoring", metric_name, exc_info=True)
                continue

        # Load any custom metrics from conf.d/sqlserver.yaml
        for row in init_config.get('custom_metrics', []):
            user_type = row.get('type')
            if user_type is not None and user_type not in VALID_METRIC_TYPES:
                # NOTE(review): the row is still used below despite the
                # invalid type — presumably intentional; confirm.
                self.log.error('%s has an invalid metric type: %s' \
                    % (row['name'], user_type))
            sql_type = None
            try:
                if user_type is None:
                    sql_type, base_name = self.get_sql_type(instance, row['counter_name'])
            except Exception:
                # Fixed: log this row's name, not the previous loop's.
                self.log.warning("Can't load the metric %s, ignoring", row['name'], exc_info=True)
                continue
            # NOTE(review): when user_type is set, base_name is whatever the
            # built-in loop last assigned (or unbound if METRICS was empty) —
            # presumably unused by typed_metric in that case; confirm.
            metrics_to_collect.append(self.typed_metric(row['name'],
                                                        row['counter_name'],
                                                        base_name,
                                                        user_type,
                                                        sql_type,
                                                        row.get('instance_name', ''),
                                                        row.get('tag_by', None)))

        instance_key = self._conn_key(instance)
        self.instances_metrics[instance_key] = metrics_to_collect
def __init__(self, name, init_config, agentConfig, instances, counter_list):
    """Set up PDH counter objects for every instance.

    counter_list entries are (counterset, instance_name, counter_name,
    dd_metric_name, metric_type) tuples; each becomes a WinPDHCounter bound
    to the submission method named by its type. Instance-level
    `additional_metrics` entries follow the same shape.
    """
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self._countersettypes = {}
    self._counters = {}
    # Per-instance (keyed by hash_mutable(instance)) metric entries and tags.
    self._metrics = {}
    self._tags = {}
    try:
        for instance in instances:
            key = hash_mutable(instance)
            cfg_tags = instance.get('tags')
            if cfg_tags is not None:
                # Tags must be a list; anything else is a config error.
                if not isinstance(cfg_tags, list):
                    self.log.error("Tags must be configured as a list")
                    raise ValueError("Tags must be type list, not %s" % str(type(cfg_tags)))
                self._tags[key] = list(cfg_tags)
            remote_machine = None
            host = instance.get('host')
            self._metrics[key] = []
            # A host other than "." means we query a remote machine and must
            # establish the SMB connection (\\host\c$) first.
            if host is not None and host != ".":
                try:
                    remote_machine = host
                    username = instance.get('username')
                    password = instance.get('password')
                    nr = win32wnet.NETRESOURCE()
                    nr.lpRemoteName = r"\\%s\c$" % remote_machine
                    nr.dwType = 0
                    nr.lpLocalName = None
                    win32wnet.WNetAddConnection2(nr, password, username, 0)
                except Exception as e:
                    # NOTE(review): this `return` aborts __init__ entirely,
                    # skipping any remaining instances — confirm intended.
                    self.log.error("Failed to make remote connection %s" % str(e))
                    return
            # list of the metrics. Each entry is itself an entry,
            # which is the pdh name, datadog metric name, type, and the
            # pdh counter object
            for counterset, inst_name, counter_name, dd_name, mtype in counter_list:
                # mtype names the submission method (e.g. "gauge", "rate").
                m = getattr(self, mtype.lower())
                obj = WinPDHCounter(counterset, counter_name, self.log, inst_name, machine_name=remote_machine)
                entry = [inst_name, dd_name, m, obj]
                self.log.debug("entry: %s" % str(entry))
                self._metrics[key].append(entry)
            # get any additional metrics in the instance
            addl_metrics = instance.get('additional_metrics')
            if addl_metrics is not None:
                for counterset, inst_name, counter_name, dd_name, mtype in addl_metrics:
                    # "none"/""/"*"/"all" all mean: no specific counter instance.
                    if inst_name.lower() == "none" or len(inst_name) == 0 or inst_name == "*" or inst_name.lower() == "all":
                        inst_name = None
                    m = getattr(self, mtype.lower())
                    obj = WinPDHCounter(counterset, counter_name, self.log, inst_name, machine_name=remote_machine)
                    entry = [inst_name, dd_name, m, obj]
                    self.log.debug("additional metric entry: %s" % str(entry))
                    self._metrics[key].append(entry)
    except Exception as e:
        # Log at debug and re-raise so the agent reports the failure.
        self.log.debug("Exception in PDH init: %s", str(e))
        raise
def __init__(self, name, init_config, agentConfig, instances=None):
    """Per-instance high-water marks, persisted across check runs."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self.high_watermarks = {}
def __init__(self, name, init_config, agentConfig, instances=None):
    """Single-instance Kubernetes check; capture kube settings up front."""
    if instances is not None and len(instances) > 1:
        raise Exception(
            'Kubernetes check only supports one configured instance.')
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    # NOTE(review): assumes a non-empty `instances` list is always passed
    # despite the None default — confirm against the caller.
    self.kube_settings = set_kube_settings(instances[0])
def __init__(self, name, init_config, agentConfig):
    """Host status must persist across all check runs."""
    AgentCheck.__init__(self, name, init_config, agentConfig)
    self.host_status = {}
def __init__(self, name, init_config, agentConfig, instances=None):
    """Shared setup for Prometheus-based checks.

    Child checks override the upper-case/mapper attributes; the
    underscore-prefixed attributes hold internal label-join state.
    """
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)

    # message.type is the index in this list
    # see: https://github.com/prometheus/client_model/blob/master/ruby/lib/prometheus/client/model/metrics.pb.rb
    self.METRIC_TYPES = ['counter', 'gauge', 'summary', 'untyped', 'histogram']

    # Prefix prepended to every metric; hardcoded by the child check class.
    self.NAMESPACE = ''
    # Prometheus metric name -> datadog metric name. Empty here; the child
    # check fills it so its metrics are not counted as custom metrics.
    self.metrics_mapper = {}
    # Label names listed here are renamed to the mapped value when tagging.
    self.labels_mapper = {}
    # Label names that must not be turned into tags at submission time.
    self.exclude_labels = []
    # Metrics skipped silently (duplicates or very high cardinality),
    # without the 'Unable to handle metric' debug line in the logs.
    self.ignore_metrics = []
    # Prometheus metric name -> forced type, overriding the payload's type
    # (useful for untyped metrics). Filled by the child check so those
    # metrics are not counted as custom metrics.
    self.type_overrides = {}

    # 1:1 label enrichment from a target metric to all metrics matching the
    # label, e.g.:
    # self.label_joins = {
    #     'kube_pod_info': {
    #         'label_to_match': 'pod',
    #         'labels_to_get': ['node', 'host_ip']
    #     }
    # }
    self.label_joins = {}
    # Additional label info to add for a specific label value, e.g.:
    # self._label_mapping = {
    #     'pod': {'dd-agent-9s1l1': [("node", "yolo"), ("host_ip", "yey")]}
    # }
    self._label_mapping = {}
    # Label values seen during the run, used to prune _label_mapping
    # entries that are no longer in use, e.g.:
    # self._active_label_mapping = {'pod': {'dd-agent-9s1l1': True}}
    self._active_label_mapping = {}
    # Labels watched for enrichment.
    self._watched_labels = set()
    self._dry_run = True

    # Metrics may come from different hosts; this names the label that
    # carries the hostname to submit under.
    self.label_to_hostname = None

    # Client TLS material: ssl_cert may contain both the certificate and
    # the private key; if it holds only the certificate, ssl_private_key
    # must point at the key. The key must be UNENCRYPTED (Requests does not
    # support encrypted keys). ssl_ca_cert is the trusted CA bundle used
    # for custom certificates.
    self.ssl_cert = None
    self.ssl_private_key = None
    self.ssl_ca_cert = None
def __init__(self, name, init_config, agentConfig):
    """Connection cache, persisted across check runs."""
    AgentCheck.__init__(self, name, init_config, agentConfig)
    self.connections = {}
def __init__(self, name, init_config, agentConfig, instances=None):
    """Read the ZooKeeper and Kafka connection timeouts from init_config."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances=instances)
    self.kafka_timeout = int(init_config.get('kafka_timeout', DEFAULT_KAFKA_TIMEOUT))
    self.zk_timeout = int(init_config.get('zk_timeout', DEFAULT_ZK_TIMEOUT))
def __init__(self, name, init_config, agentConfig, instances=None):
    """Replica-set state history and index-rate bookkeeping across runs."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self.idx_rates = {}
    self._last_state_by_server = {}
def __init__(self, name, init_config, agentConfig, instances=None):
    """Read the default timeout, falling back to the class default."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self.default_timeout = init_config.get('default_timeout',
                                           self.DEFAULT_TIMEOUT)
def __init__(self, name, init_config, agentConfig, instances=None):
    """Cluster status must persist across all check runs."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self.cluster_status = {}
def __init__(self, name, init_config, agentConfig, instances=None):
    """Databases excluded from collection, persisted across runs."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self.db_blacklist = {}
def __init__(self, name, init_config, agentConfig, instances=None):
    """Cache of URLs assumed per instance, persisted across runs."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self.assumed_url = {}
def __init__(self, name, init_config, agentConfig, instances=None):
    """Cache the detected MySQL version (and the >= 5.0.2 flag) per server."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self.greater_502 = {}
    self.mysql_version = {}
def __init__(self, name, init_config, agentConfig):
    """Host status must persist across all check runs."""
    AgentCheck.__init__(self, name, init_config, agentConfig)
    # Two-level default dict: every unseen key resolves to None.
    self.host_status = defaultdict(lambda: defaultdict(lambda: None))
def __init__(self, name, init_config, agentConfig):
    """Last replica-set state per server, persisted across runs."""
    AgentCheck.__init__(self, name, init_config, agentConfig)
    self._last_state_by_server = {}
def __init__(self, name, init_config, agentConfig, instances=None):
    """Network check; only a single configured instance is supported."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances=instances)
    # Guard runs after the parent constructor, as in the original.
    if instances is not None and len(instances) > 1:
        raise Exception("Network check only supports one configured instance.")
def __init__(self, name, init_config, agentConfig):
    """Last-seen timestamps and cached WMI connections."""
    AgentCheck.__init__(self, name, init_config, agentConfig)
    self.wmi_conns = {}
    self.last_ts = {}
def __init__(self, name, init_config, agentConfig, instances=None):
    """One ConsulCheckInstanceState per instance, created on first access."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    # The class itself is the factory: called with no args on a miss.
    self._instance_states = defaultdict(ConsulCheckInstanceState)
def __init__(self, name, init_config, agentConfig, instances=None):
    """Connection cache plus per-key high-water timestamps (default 0)."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self.last_timestamp_seen = defaultdict(int)
    self.connections = {}
def __init__(self, name, init_config, agentConfig, instances=None):
    """Plain pass-through to the AgentCheck constructor."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
def __init__(self, name, init_config, agentConfig, instances=None):
    """Locate the nodetool binary (configurable; /usr/bin/nodetool default)."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self.nodetool_cmd = init_config.get("nodetool", "/usr/bin/nodetool")
def __init__(self, name, init_config, agentConfig, instances=None):
    """Expand stat keys with percentile suffixes; init the redirect counter.

    Idiom fix: a list comprehension was used purely for its `append` side
    effect (building and discarding a throwaway list); an explicit loop is
    the idiomatic form and matches the sibling variant of this constructor.
    """
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    for k in ["mean", "median", "95", "99", "100"]:
        for m in self.stat_keys:
            self.keys.append(m + "_" + k)
    # Sentinel: no coordinated-redirects total has been observed yet.
    self.prev_coord_redirs_total = -1
def __init__(self, *args, **kwargs):
    """Buffers for the collector payload and per-metric contexts."""
    AgentCheck.__init__(self, *args, **kwargs)
    self._metric_context = {}
    self._collector_payload = {}
def __init__(self, name, init_config, agentConfig, instances=None):
    """Track which alerts were already raised, to avoid duplicates."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self.already_alerted = []
def __init__(self, name, init_config, agentConfig, instances=None):
    """Per-instance cleanup timestamps, persisted across runs."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self.instance_cleanup_times = {}
def __init__(self, name, init_config, agentConfig):
    """High-water marks, persisted across check runs."""
    AgentCheck.__init__(self, name, init_config, agentConfig)
    self.high_watermarks = {}
def __init__(self, name, init_config, agentConfig, instances=None):
    """Last GC count per key (defaults to 0), persisted across runs."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self._last_gc_count = defaultdict(int)
def __init__(self, name, init_config, agentConfig, instances):
    """Cache of WMI connections, persisted across runs."""
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self.wmi_conns = {}
def __init__(self, name, init_config, agentConfig):
    """Last observed state; -1 means nothing has been seen yet."""
    AgentCheck.__init__(self, name, init_config, agentConfig)
    self._last_state = -1
def __init__(self, name, init_config, agentConfig, instances=None):
    """Set up connection caches and pre-compute the metric list per instance.

    Instances whose database is missing are disabled when
    `ignore_missing_database` is set; otherwise they stay enabled so the
    connection keeps being retried and the error stays visible.

    Fixes: "INitialization" typo in the final exception log message, and
    the redundant `ignore is not None and ignore` test — `_is_affirmative`
    returns a bool, and `if ignore:` is exactly equivalent even for None.
    """
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)

    # Cache connections
    self.connections = {}
    self.failed_connections = {}
    self.instances_metrics = {}
    self.instances_per_type_metrics = defaultdict(dict)
    self.existing_databases = None
    self.do_check = {}
    # Submission method per metric type name.
    self.proc_type_mapping = {
        'gauge': self.gauge,
        'rate': self.rate,
        'histogram': self.histogram
    }

    # Fall back to adodbapi when an unknown connector is configured.
    self.connector = init_config.get('connector', 'adodbapi')
    if not self.connector.lower() in self.valid_connectors:
        self.log.error("Invalid database connector %s, defaulting to adodbapi" % self.connector)
        self.connector = 'adodbapi'

    # Pre-process the list of metrics to collect
    self.custom_metrics = init_config.get('custom_metrics', [])
    for instance in instances:
        try:
            instance_key = self._conn_key(instance, self.DEFAULT_DB_KEY)
            self.do_check[instance_key] = True

            # check to see if the database exists before we try any connections to it
            with self.open_managed_db_connections(instance, None, db_name=self.DEFAULT_DATABASE):
                db_exists, context = self._check_db_exists(instance)

            if db_exists:
                if instance.get('stored_procedure') is None:
                    with self.open_managed_db_connections(instance, self.DEFAULT_DB_KEY):
                        self._make_metric_list_to_collect(instance, self.custom_metrics)
            else:
                # How much do we care that the DB doesn't exist?
                if _is_affirmative(instance.get("ignore_missing_database", False)):
                    # not much: we expect it. leave checks disabled
                    self.do_check[instance_key] = False
                    self.log.warning("Database %s does not exist. Disabling checks for this instance." % (context))
                else:
                    # yes we do. Keep trying
                    self.log.error("Database %s does not exist. Fix issue and restart agent" % (context))
        except SQLConnectionError:
            self.log.exception("Skipping SQL Server instance")
            continue
        except Exception as e:
            self.log.exception("Initialization exception %s", str(e))
            continue