def __init__(self, name, init_config, agentConfig, instances=None):
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self._countersettypes = {}
    self._counters = {}
    self._metrics = {}
    self._tags = {}

    try:
        for instance in instances:
            key = hash_mutable(instance)

            counterset = instance.get('countersetname')
            cfg_tags = instance.get('tags')
            if cfg_tags is not None:
                self._tags[key] = list(cfg_tags)

            metrics = instance.get('metrics')
            # list of the metrics. Each entry is itself a list, containing the
            # pdh name, datadog metric name, type, and the pdh counter object
            self._metrics[key] = []
            for inst_name, dd_name, mtype in metrics:
                m = getattr(self, mtype.lower())
                obj = WinPDHCounter(counterset, inst_name, self.log)
                entry = [inst_name, dd_name, m, obj]
                self.log.debug("entry: %s" % str(entry))
                self._metrics[key].append(entry)
    except Exception as e:
        self.log.debug("Exception in PDH init: %s", str(e))
        raise
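# A minimal sketch of the instance shape the constructor above expects:
# 'countersetname' names the PDH counter set, and each 'metrics' entry is
# [pdh_counter_name, datadog_metric_name, metric_type], where the type names
# an AgentCheck method such as 'gauge'. All names below are hypothetical.
example_instance = {
    'countersetname': 'Processor',
    'tags': ['env:example'],
    'metrics': [
        ['% Processor Time', 'example.proc.time', 'gauge'],
        ['Interrupts/sec', 'example.proc.interrupts', 'rate'],
    ],
}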
def test_network_latency_checks(self):
    self.check = load_check(self.CHECK_NAME, MOCK_CONFIG_NETWORK_LATENCY_CHECKS,
                            self.DEFAULT_AGENT_CONFIG)

    mocks = self._get_consul_mocks()

    # We start out as the leader, and stay that way
    instance_hash = hash_mutable(MOCK_CONFIG_NETWORK_LATENCY_CHECKS['instances'][0])
    self.check._instance_states[instance_hash].last_known_leader = self.mock_get_cluster_leader_A(None)

    self.run_check(MOCK_CONFIG_NETWORK_LATENCY_CHECKS, mocks=mocks)

    latency = [m for m in self.metrics if m[0].startswith('consul.net.')]
    latency.sort()
    # Make sure we have the expected number of metrics
    self.assertEquals(19, len(latency))

    # Only 3 dc-latency metrics since we only do source = self
    dc = [m for m in latency if '.dc.latency.' in m[0]]
    self.assertEquals(3, len(dc))
    self.assertEquals(1.6746410750238774, dc[0][2])

    # 16 latency metrics (2 nodes * 8 metrics each)
    node = [m for m in latency if '.node.latency.' in m[0]]
    self.assertEquals(16, len(node))
    self.assertEquals(0.26577747932995816, node[0][2])
def check(self, instance): # Connect to the WMI provider host = instance.get('host', "localhost") provider = instance.get('provider') user = instance.get('username', "") password = instance.get('password', "") instance_tags = instance.get('tags', []) sites = instance.get('sites', ['_Total']) is_2008 = _is_affirmative(instance.get('is_2008', False)) instance_hash = hash_mutable(instance) instance_key = self._get_instance_key(host, self.NAMESPACE, self.CLASS, instance_hash) filters = map(lambda x: {"Name": tuple(('=', x))}, sites) metrics_by_property, properties = self._get_wmi_properties( instance_key, self.METRICS, []) if is_2008: for idx, prop in enumerate(properties): if prop == "TotalBytesTransferred": properties[idx] = "TotalBytesTransfered" break wmi_sampler = self._get_wmi_sampler(instance_key, self.CLASS, properties, filters=filters, host=host, namespace=self.NAMESPACE, provider=provider, username=user, password=password) # Sample, extract & submit metrics try: wmi_sampler.sample() metrics = self._extract_metrics(wmi_sampler, sites, instance_tags) except TimeoutException: self.log.warning( u"[IIS] WMI query timed out." u" class={wmi_class} - properties={wmi_properties} -" u" filters={filters} - tags={instance_tags}".format( wmi_class=self.CLASS, wmi_properties=properties, filters=filters, instance_tags=instance_tags)) except pythoncom.com_error as e: if '0x80041017' in str(e): self.warning( u"You may be running IIS6/7 which reports metrics a " u"little differently. Try enabling the is_2008 flag for this instance." ) raise e else: self._submit_events(wmi_sampler, sites) self._submit_metrics(metrics, metrics_by_property)
def check(self, instance): """ Fetch WMI metrics. """ # Connection information host = instance.get('host', "localhost") namespace = instance.get('namespace', "root\\cimv2") provider = instance.get('provider') username = instance.get('username', "") password = instance.get('password', "") # WMI instance wmi_class = instance.get('class') metrics = instance.get('metrics') filters = instance.get('filters') tag_by = instance.get('tag_by', "") tag_queries = instance.get('tag_queries', []) constant_tags = instance.get('constant_tags') # Create or retrieve an existing WMISampler instance_hash = hash_mutable(instance) instance_key = self._get_instance_key(host, namespace, wmi_class, instance_hash) metric_name_and_type_by_property, properties = \ self._get_wmi_properties(instance_key, metrics, tag_queries) wmi_sampler = self._get_wmi_sampler( instance_key, wmi_class, properties, tag_by=tag_by, filters=filters, host=host, namespace=namespace, provider=provider, username=username, password=password, ) # Sample, extract & submit metrics try: wmi_sampler.sample() metrics = self._extract_metrics(wmi_sampler, tag_by, tag_queries, constant_tags) except TimeoutException: self.log.warning( u"WMI query timed out." u" class={wmi_class} - properties={wmi_properties} -" u" filters={filters} - tag_queries={tag_queries}".format( wmi_class=wmi_class, wmi_properties=properties, filters=filters, tag_queries=tag_queries)) else: self._submit_metrics(metrics, metric_name_and_type_by_property)
def check(self, instance): # Connect to the WMI provider host = instance.get('host', "localhost") provider = instance.get('provider') user = instance.get('username', "") password = instance.get('password', "") instance_tags = instance.get('tags', []) sites = instance.get('sites', ['_Total']) is_2008 = _is_affirmative(instance.get('is_2008', False)) instance_hash = hash_mutable(instance) instance_key = self._get_instance_key(host, self.NAMESPACE, self.CLASS, instance_hash) filters = map(lambda x: {"Name": tuple(('=', x))}, sites) metrics_by_property, properties = self._get_wmi_properties(instance_key, self.METRICS, []) if is_2008: for idx, prop in enumerate(properties): if prop == "TotalBytesTransferred": properties[idx] = "TotalBytesTransfered" break wmi_sampler = self._get_wmi_sampler( instance_key, self.CLASS, properties, filters=filters, host=host, namespace=self.NAMESPACE, provider=provider, username=user, password=password ) # Sample, extract & submit metrics try: wmi_sampler.sample() metrics = self._extract_metrics(wmi_sampler, sites, instance_tags) except TimeoutException: self.log.warning( u"[IIS] WMI query timed out." u" class={wmi_class} - properties={wmi_properties} -" u" filters={filters} - tags={instance_tags}".format( wmi_class=self.CLASS, wmi_properties=properties, filters=filters, instance_tags=instance_tags ) ) except pythoncom.com_error as e: if '0x80041017' in str(e): self.warning( u"You may be running IIS6/7 which reports metrics a " u"little differently. Try enabling the is_2008 flag for this instance." ) raise e else: self._submit_events(wmi_sampler, sites) self._submit_metrics(metrics, metrics_by_property)
def check(self, instance):
    key = hash_mutable(instance)
    for inst_name, dd_name, metric_func, counter in self._metrics[key]:
        vals = counter.get_all_values()
        for instance_name, val in vals.iteritems():
            tags = []
            if key in self._tags:
                tags = list(self._tags[key])

            if not counter.is_single_instance():
                tag = "instance:%s" % instance_name
                tags.append(tag)
            metric_func(dd_name, val, tags)
def check(self, instance): # Connect to the WMI provider host = instance.get('host', "localhost") user = instance.get('username', "") password = instance.get('password', "") services = instance.get('services', []) custom_tags = instance.get('tags', []) instance_hash = hash_mutable(instance) instance_key = self._get_instance_key(host, self.NAMESPACE, self.CLASS, instance_hash) tags = [] if (host == "localhost" or host == ".") else [u'host:{0}'.format(host)] tags.extend(custom_tags) if len(services) == 0: raise Exception('No services defined in windows_service.yaml') properties = ["Name", "State"] if "ALL" in services: self.log.debug("tracking all services") filters = None else: filters = map( lambda x: {"Name": tuple(('LIKE', x)) if '%' in x else tuple(('=', x))}, services) wmi_sampler = self._get_wmi_sampler(instance_key, self.CLASS, properties, filters=filters, host=host, namespace=self.NAMESPACE, username=user, password=password) try: # Sample, extract & submit metrics wmi_sampler.sample() except TimeoutException: self.log.warning( u"[WinService] WMI query timed out." u" class={wmi_class} - properties={wmi_properties} -" u" filters={filters} - tags={tags}".format( wmi_class=self.CLASS, wmi_properties=properties, filters=filters, tags=tags)) else: self._process_services(wmi_sampler, services, tags)
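# Illustrative 'services' values for the check above: an entry containing '%'
# becomes a WQL LIKE filter, any other entry an equality filter, and the
# literal "ALL" disables filtering entirely. Service names are placeholders.
example_service_instance = {
    'host': '.',         # "." or "localhost" => no host: tag is added
    'services': [
        'wuauserv',      # exact match: Name = 'wuauserv'
        'SQL%',          # wildcard match: Name LIKE 'SQL%'
    ],
}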
def check(self, instance): """ Fetch WMI metrics. """ # Connection information host = instance.get('host', "localhost") namespace = instance.get('namespace', "root\\cimv2") provider = instance.get('provider') username = instance.get('username', "") password = instance.get('password', "") # WMI instance wmi_class = instance.get('class') metrics = instance.get('metrics') filters = instance.get('filters') tag_by = instance.get('tag_by', "") tag_queries = instance.get('tag_queries', []) constant_tags = instance.get('constant_tags') # Create or retrieve an existing WMISampler instance_hash = hash_mutable(instance) instance_key = self._get_instance_key(host, namespace, wmi_class, instance_hash) metric_name_and_type_by_property, properties = \ self._get_wmi_properties(instance_key, metrics, tag_queries) wmi_sampler = self._get_wmi_sampler( instance_key, wmi_class, properties, tag_by=tag_by, filters=filters, host=host, namespace=namespace, provider=provider, username=username, password=password, ) # Sample, extract & submit metrics try: wmi_sampler.sample() metrics = self._extract_metrics(wmi_sampler, tag_by, tag_queries, constant_tags) except TimeoutException: self.log.warning( u"WMI query timed out." u" class={wmi_class} - properties={wmi_properties} -" u" filters={filters} - tag_queries={tag_queries}".format( wmi_class=wmi_class, wmi_properties=properties, filters=filters, tag_queries=tag_queries ) ) else: self._submit_metrics(metrics, metric_name_and_type_by_property)
def test_new_leader_event(self):
    self.check = load_check(self.CHECK_NAME, MOCK_CONFIG_LEADER_CHECK,
                            self.DEFAULT_AGENT_CONFIG)
    instance_hash = hash_mutable(MOCK_CONFIG_LEADER_CHECK['instances'][0])
    self.check._instance_states[instance_hash].last_known_leader = 'My Old Leader'

    mocks = self._get_consul_mocks()
    mocks['_get_cluster_leader'] = self.mock_get_cluster_leader_B

    self.run_check(MOCK_CONFIG_LEADER_CHECK, mocks=mocks)
    self.assertEqual(len(self.events), 1)

    event = self.events[0]
    self.assertEqual(event['event_type'], 'consul.new_leader')
    self.assertIn('prev_consul_leader:My Old Leader', event['tags'])
    self.assertIn('curr_consul_leader:My New Leader', event['tags'])
def test_self_leader_event(self):
    self.check = load_check(self.CHECK_NAME, MOCK_CONFIG_SELF_LEADER_CHECK,
                            self.DEFAULT_AGENT_CONFIG)
    instance_hash = hash_mutable(MOCK_CONFIG_SELF_LEADER_CHECK['instances'][0])
    self.check._instance_states[instance_hash].last_known_leader = 'My Old Leader'

    mocks = self._get_consul_mocks()

    our_url = self.mock_get_cluster_leader_A(None)
    other_url = self.mock_get_cluster_leader_B(None)

    # We become the leader
    mocks['_get_cluster_leader'] = self.mock_get_cluster_leader_A
    self.run_check(MOCK_CONFIG_SELF_LEADER_CHECK, mocks=mocks)
    self.assertEqual(len(self.events), 1)
    self.assertEqual(our_url, self.check._instance_states[instance_hash].last_known_leader)
    event = self.events[0]
    self.assertEqual(event['event_type'], 'consul.new_leader')
    self.assertIn('prev_consul_leader:My Old Leader', event['tags'])
    self.assertIn('curr_consul_leader:%s' % our_url, event['tags'])

    # We are already the leader, no new events
    self.run_check(MOCK_CONFIG_SELF_LEADER_CHECK, mocks=mocks)
    self.assertEqual(len(self.events), 0)

    # We lose the leadership, no new events
    mocks['_get_cluster_leader'] = self.mock_get_cluster_leader_B
    self.run_check(MOCK_CONFIG_SELF_LEADER_CHECK, mocks=mocks)
    self.assertEqual(len(self.events), 0)
    self.assertEqual(other_url, self.check._instance_states[instance_hash].last_known_leader)

    # We regain the leadership
    mocks['_get_cluster_leader'] = self.mock_get_cluster_leader_A
    self.run_check(MOCK_CONFIG_SELF_LEADER_CHECK, mocks=mocks)
    self.assertEqual(len(self.events), 1)
    self.assertEqual(our_url, self.check._instance_states[instance_hash].last_known_leader)
    event = self.events[0]
    self.assertEqual(event['event_type'], 'consul.new_leader')
    self.assertIn('prev_consul_leader:%s' % other_url, event['tags'])
    self.assertIn('curr_consul_leader:%s' % our_url, event['tags'])
def check(self, instance): self.log.debug("PDHBaseCheck: check()") key = hash_mutable(instance) for inst_name, dd_name, metric_func, counter in self._metrics[key]: try: vals = counter.get_all_values() for instance_name, val in vals.iteritems(): tags = [] if key in self._tags: tags = list(self._tags[key]) if not counter.is_single_instance(): tag = "instance:%s" % instance_name tags.append(tag) metric_func(dd_name, val, tags) except Exception as e: # don't give up on all of the metrics because one failed self.log.error("Failed to get data for %s %s: %s" % (inst_name, dd_name, str(e))) pass
def check(self, instance): # Connect to the WMI provider host = instance.get('host', "localhost") user = instance.get('username', "") password = instance.get('password', "") instance_tags = instance.get('tags', []) sites = instance.get('sites', ['_Total']) instance_hash = hash_mutable(instance) instance_key = self._get_instance_key(host, self.NAMESPACE, self.CLASS, instance_hash) filters = map(lambda x: {"Name": tuple(('=', x))}, sites) metrics_by_property, properties = self._get_wmi_properties( instance_key, self.METRICS, []) wmi_sampler = self._get_wmi_sampler(instance_key, self.CLASS, properties, filters=filters, host=host, namespace=self.NAMESPACE, username=user, password=password) # Sample, extract & submit metrics try: wmi_sampler.sample() metrics = self._extract_metrics(wmi_sampler, sites, instance_tags) except TimeoutException: self.log.warning( u"[IIS] WMI query timed out." u" class={wmi_class} - properties={wmi_properties} -" u" filters={filters} - tags={instance_tags}".format( wmi_class=self.CLASS, wmi_properties=properties, filters=filters, instance_tags=instance_tags)) else: self._submit_events(wmi_sampler, sites) self._submit_metrics(metrics, metrics_by_property)
def check(self, instance): # Connect to the WMI provider host = instance.get('host', "localhost") user = instance.get('username', "") password = instance.get('password', "") instance_tags = instance.get('tags', []) sites = instance.get('sites', ['_Total']) instance_hash = hash_mutable(instance) instance_key = self._get_instance_key(host, self.NAMESPACE, self.CLASS, instance_hash) filters = map(lambda x: {"Name": tuple(('=', x))}, sites) metrics_by_property, properties = self._get_wmi_properties(instance_key, self.METRICS, []) wmi_sampler = self._get_wmi_sampler( instance_key, self.CLASS, properties, filters=filters, host=host, namespace=self.NAMESPACE, username=user, password=password ) # Sample, extract & submit metrics try: wmi_sampler.sample() metrics = self._extract_metrics(wmi_sampler, sites, instance_tags) except TimeoutException: self.log.warning( u"[IIS] WMI query timed out." u" class={wmi_class} - properties={wmi_properties} -" u" filters={filters} - tags={instance_tags}".format( wmi_class=self.CLASS, wmi_properties=properties, filters=filters, instance_tags=instance_tags ) ) else: self._submit_events(wmi_sampler, sites) self._submit_metrics(metrics, metrics_by_property)
def test_self_leader_event(self):
    self.check = load_check(self.CHECK_NAME, MOCK_CONFIG_SELF_LEADER_CHECK,
                            self.DEFAULT_AGENT_CONFIG)
    instance_hash = hash_mutable(MOCK_CONFIG_SELF_LEADER_CHECK['instances'][0])
    self.check._instance_states[instance_hash].last_known_leader = 'My Old Leader'

    mocks = self._get_consul_mocks()

    our_url = self.mock_get_cluster_leader_A(None)
    other_url = self.mock_get_cluster_leader_B(None)

    # We become the leader
    mocks['_get_cluster_leader'] = self.mock_get_cluster_leader_A
    self.run_check(MOCK_CONFIG_SELF_LEADER_CHECK, mocks=mocks)
    self.assertEqual(len(self.events), 1)
    self.assertEqual(our_url, self.check._instance_states[instance_hash].last_known_leader)
    event = self.events[0]
    self.assertEqual(event['event_type'], 'consul.new_leader')
    self.assertIn('prev_consul_leader:My Old Leader', event['tags'])
    self.assertIn('curr_consul_leader:%s' % our_url, event['tags'])

    # We are already the leader, no new events
    self.run_check(MOCK_CONFIG_SELF_LEADER_CHECK, mocks=mocks)
    self.assertEqual(len(self.events), 0)

    # We lose the leadership, no new events
    mocks['_get_cluster_leader'] = self.mock_get_cluster_leader_B
    self.run_check(MOCK_CONFIG_SELF_LEADER_CHECK, mocks=mocks)
    self.assertEqual(len(self.events), 0)
    self.assertEqual(other_url, self.check._instance_states[instance_hash].last_known_leader)

    # We regain the leadership
    mocks['_get_cluster_leader'] = self.mock_get_cluster_leader_A
    self.run_check(MOCK_CONFIG_SELF_LEADER_CHECK, mocks=mocks)
    self.assertEqual(len(self.events), 1)
    self.assertEqual(our_url, self.check._instance_states[instance_hash].last_known_leader)
    event = self.events[0]
    self.assertEqual(event['event_type'], 'consul.new_leader')
    self.assertIn('prev_consul_leader:%s' % other_url, event['tags'])
    self.assertIn('curr_consul_leader:%s' % our_url, event['tags'])
def check(self, instance): host = instance.get('host', "localhost") user = instance.get('username', "") password = instance.get('password', "") services = instance.get('services', []) instance_hash = hash_mutable(instance) instance_key = self._get_instance_key(host, self.NAMESPACE, self.CLASS, instance_hash) tags = [] if (host == "localhost" or host == ".") else [u'host:{0}'.format(host)] if len(services) == 0: raise Exception('No services defined in windows_service.yaml') properties = ["Name", "State"] filters = map(lambda x: {"Name": tuple(('=', x))}, services) wmi_sampler = self._get_wmi_sampler(instance_key, self.CLASS, properties, filters=filters, host=host, namespace=self.NAMESPACE, username=user, password=password) try: wmi_sampler.sample() except TimeoutException: self.log.warning( u"[WinService] WMI query timed out." u" class={wmi_class} - properties={wmi_properties} -" u" filters={filters} - tags={tags}".format( wmi_class=self.CLASS, wmi_properties=properties, filters=filters, tags=tags)) else: self._process_services(wmi_sampler, services, tags)
def check(self, instance):
    # Connect to the WMI provider
    host = instance.get('host', "localhost")
    user = instance.get('username', "")
    password = instance.get('password', "")
    services = instance.get('services', [])

    instance_hash = hash_mutable(instance)
    instance_key = self._get_instance_key(host, self.NAMESPACE, self.CLASS, instance_hash)

    tags = [] if (host == "localhost" or host == ".") else [u'host:{0}'.format(host)]

    if len(services) == 0:
        raise Exception('No services defined in windows_service.yaml')

    properties = ["Name", "State"]
    filters = map(lambda x: {"Name": tuple(('=', x))}, services)
    wmi_sampler = self._get_wmi_sampler(
        instance_key,
        self.CLASS, properties,
        filters=filters,
        host=host, namespace=self.NAMESPACE,
        username=user, password=password
    )

    try:
        # Sample, extract & submit metrics
        wmi_sampler.sample()
    except TimeoutException:
        self.log.warning(
            u"[WinService] WMI query timed out."
            u" class={wmi_class} - properties={wmi_properties} -"
            u" filters={filters} - tags={tags}".format(
                wmi_class=self.CLASS, wmi_properties=properties,
                filters=filters, tags=tags
            )
        )
    else:
        self._process_services(wmi_sampler, services, tags)
def check(self, instance):
    # Instance state is mutable, any changes to it will be reflected in self._instance_states
    instance_state = self._instance_states[hash_mutable(instance)]

    self._check_for_leader_change(instance, instance_state)

    peers = self.get_peers_in_cluster(instance)
    main_tags = []
    agent_dc = self._get_agent_datacenter(instance, instance_state)

    if agent_dc is not None:
        main_tags.append('consul_datacenter:{0}'.format(agent_dc))

    for tag in instance.get('tags', []):
        main_tags.append(tag)

    if not self._is_instance_leader(instance, instance_state):
        self.gauge("consul.peers", len(peers), tags=main_tags + ["mode:follower"])
        self.log.debug("This consul agent is not the cluster leader. "
                       "Skipping service and catalog checks for this instance")
        return
    else:
        self.gauge("consul.peers", len(peers), tags=main_tags + ["mode:leader"])

    service_check_tags = ['consul_url:{0}'.format(instance.get('url'))]
    perform_catalog_checks = instance.get('catalog_checks',
                                          self.init_config.get('catalog_checks'))
    perform_network_latency_checks = instance.get('network_latency_checks',
                                                  self.init_config.get('network_latency_checks'))

    try:
        # Make service checks from health checks for all services in catalog
        health_state = self.consul_request(instance, '/v1/health/state/any')

        sc = {}
        # compute the highest status level (OK < WARNING < CRITICAL) of a check
        # among all the nodes it is running on
        for check in health_state:
            sc_id = '{0}/{1}/{2}'.format(check['CheckID'], check.get('ServiceID', ''),
                                         check.get('ServiceName', ''))
            status = self.STATUS_SC.get(check['Status'])
            if status is None:
                status = AgentCheck.UNKNOWN

            if sc_id not in sc:
                tags = ["check:{0}".format(check["CheckID"])]
                if check["ServiceName"]:
                    tags.append("service:{0}".format(check["ServiceName"]))
                if check["ServiceID"]:
                    tags.append("consul_service_id:{0}".format(check["ServiceID"]))
                sc[sc_id] = {'status': status, 'tags': tags}

            elif self.STATUS_SEVERITY[status] > self.STATUS_SEVERITY[sc[sc_id]['status']]:
                sc[sc_id]['status'] = status

        for s in sc.values():
            self.service_check(self.HEALTH_CHECK, s['status'], tags=main_tags + s['tags'])

    except Exception as e:
        self.log.error(e)
        self.service_check(self.CONSUL_CHECK, AgentCheck.CRITICAL, tags=service_check_tags)
    else:
        self.service_check(self.CONSUL_CHECK, AgentCheck.OK, tags=service_check_tags)

    if perform_catalog_checks:
        # Collect node by service, and service by node counts for a whitelist of services
        services = self.get_services_in_cluster(instance)
        service_whitelist = instance.get('service_whitelist',
                                         self.init_config.get('service_whitelist', []))
        max_services = instance.get('max_services',
                                    self.init_config.get('max_services', self.MAX_SERVICES))

        services = self._cull_services_list(services, service_whitelist, max_services)

        # {node_id: {"up": 0, "passing": 0, "warning": 0, "critical": 0}}
        nodes_to_service_status = defaultdict(lambda: defaultdict(int))

        for service in services:
            # For every service in the cluster, gauge the following:
            # `consul.catalog.nodes_up` : # of Nodes registered with that service
            # `consul.catalog.nodes_passing` : # of Nodes with service status `passing` from those registered
            # `consul.catalog.nodes_warning` : # of Nodes with service status `warning` from those registered
            # `consul.catalog.nodes_critical` : # of Nodes with service status `critical` from those registered

            service_tags = self._get_service_tags(service, services[service])

            nodes_with_service = self.get_nodes_with_service(instance, service)

            # {'up': 0, 'passing': 0, 'warning': 0, 'critical': 0}
            node_status = defaultdict(int)

            for node in nodes_with_service:
                # The node_id is n['Node']['Node']
                node_id = node.get('Node', {}).get("Node")

                # An additional service is registered on this node. Bump up the counter
                nodes_to_service_status[node_id]["up"] += 1

                # If there is no Check for the node then Consul and dd-agent consider it up
                if 'Checks' not in node:
                    node_status['passing'] += 1
                    node_status['up'] += 1
                else:
                    found_critical = False
                    found_warning = False
                    found_serf_health = False

                    for check in node['Checks']:
                        if check['CheckID'] == 'serfHealth':
                            found_serf_health = True

                            # For backwards compatibility, the "up" node_status is computed
                            # based on the total # of nodes 'running' as part of the service.
                            # If the serfHealth is `critical` it means the Consul agent isn't
                            # even responding, and we don't register the node as `up`
                            if check['Status'] != 'critical':
                                node_status["up"] += 1
                                continue

                        if check['Status'] == 'critical':
                            found_critical = True
                            break
                        elif check['Status'] == 'warning':
                            found_warning = True
                            # Keep looping in case there is a critical status

                    # Increment the counters based on what was found in Checks
                    # `critical` checks override `warning`s, and if neither are found,
                    # register the node as `passing`
                    if found_critical:
                        node_status['critical'] += 1
                        nodes_to_service_status[node_id]["critical"] += 1
                    elif found_warning:
                        node_status['warning'] += 1
                        nodes_to_service_status[node_id]["warning"] += 1
                    else:
                        if not found_serf_health:
                            # We have not found a serfHealth check for this node, which is
                            # unexpected. If we get here assume this node's status is "up",
                            # since we register it as 'passing'
                            node_status['up'] += 1

                        node_status['passing'] += 1
                        nodes_to_service_status[node_id]["passing"] += 1

            for status_key in self.STATUS_SC:
                status_value = node_status[status_key]
                self.gauge(
                    '{0}.nodes_{1}'.format(self.CONSUL_CATALOG_CHECK, status_key),
                    status_value,
                    tags=main_tags + service_tags
                )

        for node, service_status in nodes_to_service_status.iteritems():
            # For every node discovered for whitelisted services, gauge the following:
            # `consul.catalog.services_up` : Total services registered on node
            # `consul.catalog.services_passing` : Total passing services on node
            # `consul.catalog.services_warning` : Total warning services on node
            # `consul.catalog.services_critical` : Total critical services on node
            node_tags = ['consul_node_id:{0}'.format(node)]
            self.gauge('{0}.services_up'.format(self.CONSUL_CATALOG_CHECK),
                       len(services),
                       tags=main_tags + node_tags)

            for status_key in self.STATUS_SC:
                status_value = service_status[status_key]
                self.gauge(
                    '{0}.services_{1}'.format(self.CONSUL_CATALOG_CHECK, status_key),
                    status_value,
                    tags=main_tags + node_tags
                )

    if perform_network_latency_checks:
        self.check_network_latency(instance, agent_dc, main_tags)
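# A standalone sketch of the severity-merge rule used in the health loop above,
# with illustrative status keys and ranks (the real check maps Consul statuses
# to AgentCheck constants via STATUS_SC / STATUS_SEVERITY).
STATUS_SEVERITY = {'passing': 0, 'warning': 1, 'critical': 2}

def merge_status(current, incoming):
    # A stored service-check status is only replaced by a strictly more
    # severe one (OK < WARNING < CRITICAL).
    if STATUS_SEVERITY[incoming] > STATUS_SEVERITY[current]:
        return incoming
    return current

assert merge_status('passing', 'warning') == 'warning'
assert merge_status('critical', 'warning') == 'critical'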
def __init__(self, name, init_config, agentConfig, instances, counter_list):
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self._countersettypes = {}
    self._counters = {}
    self._metrics = {}
    self._tags = {}

    try:
        for instance in instances:
            key = hash_mutable(instance)

            cfg_tags = instance.get('tags')
            if cfg_tags is not None:
                if not isinstance(cfg_tags, list):
                    self.log.error("Tags must be configured as a list")
                    raise ValueError("Tags must be type list, not %s" % str(type(cfg_tags)))
                self._tags[key] = list(cfg_tags)

            remote_machine = None
            host = instance.get('host')
            self._metrics[key] = []
            if host is not None and host != ".":
                try:
                    remote_machine = host

                    username = instance.get('username')
                    password = instance.get('password')
                    nr = win32wnet.NETRESOURCE()
                    nr.lpRemoteName = r"\\%s\c$" % remote_machine
                    nr.dwType = 0
                    nr.lpLocalName = None
                    win32wnet.WNetAddConnection2(nr, password, username, 0)
                except Exception as e:
                    self.log.error("Failed to make remote connection %s" % str(e))
                    return

            ## counter_data_types allows the precision with which counters are queried
            ## to be configured on a per-metric basis. In the metric instance, precision
            ## should be specified as
            ## counter_data_types:
            ## - iis.httpd_request_method.get,int
            ## - iis.net.bytes_rcvd,float
            ##
            ## the above would query the counter associated with iis.httpd_request_method.get
            ## as an integer (LONG) and iis.net.bytes_rcvd as a double
            datatypes = {}
            precisions = instance.get('counter_data_types')
            if precisions is not None:
                if not isinstance(precisions, list):
                    self.log.warning("incorrect type for counter_data_type %s" % str(precisions))
                else:
                    for p in precisions:
                        k, v = p.split(",")
                        v = v.lower().strip()
                        if v in int_types:
                            self.log.info("Setting datatype for %s to integer" % k)
                            datatypes[k] = DATA_TYPE_INT
                        elif v in double_types:
                            self.log.info("Setting datatype for %s to double" % k)
                            datatypes[k] = DATA_TYPE_DOUBLE
                        else:
                            self.log.warning("Unknown data type %s" % str(v))

            # list of the metrics. Each entry is itself a list, containing the
            # pdh name, datadog metric name, type, and the pdh counter object
            for counterset, inst_name, counter_name, dd_name, mtype in counter_list:
                m = getattr(self, mtype.lower())
                precision = datatypes.get(dd_name)
                obj = WinPDHCounter(counterset, counter_name, self.log, inst_name,
                                    machine_name=remote_machine, precision=precision)
                entry = [inst_name, dd_name, m, obj]
                self.log.debug("entry: %s" % str(entry))
                self._metrics[key].append(entry)

            # get any additional metrics in the instance
            addl_metrics = instance.get('additional_metrics')
            if addl_metrics is not None:
                for counterset, inst_name, counter_name, dd_name, mtype in addl_metrics:
                    if inst_name.lower() == "none" or len(inst_name) == 0 or \
                            inst_name == "*" or inst_name.lower() == "all":
                        inst_name = None
                    m = getattr(self, mtype.lower())
                    precision = datatypes.get(dd_name)
                    obj = WinPDHCounter(counterset, counter_name, self.log, inst_name,
                                        machine_name=remote_machine, precision=precision)
                    entry = [inst_name, dd_name, m, obj]
                    self.log.debug("additional metric entry: %s" % str(entry))
                    self._metrics[key].append(entry)

    except Exception as e:
        self.log.debug("Exception in PDH init: %s", str(e))
        raise
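# A hedged example of the per-metric precision configuration parsed above; the
# metric names come from the constructor's own comment, the host is a placeholder.
example_instance = {
    'host': '.',
    'counter_data_types': [
        'iis.httpd_request_method.get,int',   # query this counter as an integer (LONG)
        'iis.net.bytes_rcvd,float',           # query this counter as a double
    ],
}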
def __init__(self, name, init_config, agentConfig, instances, counter_list):
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self._countersettypes = {}
    self._counters = {}
    self._metrics = {}
    self._tags = {}

    try:
        for instance in instances:
            key = hash_mutable(instance)

            cfg_tags = instance.get('tags')
            if cfg_tags is not None:
                if not isinstance(cfg_tags, list):
                    self.log.error("Tags must be configured as a list")
                    raise ValueError("Tags must be type list, not %s" % str(type(cfg_tags)))
                self._tags[key] = list(cfg_tags)

            remote_machine = None
            host = instance.get('host')
            self._metrics[key] = []
            if host is not None and host != ".":
                try:
                    remote_machine = host

                    username = instance.get('username')
                    password = instance.get('password')
                    nr = win32wnet.NETRESOURCE()
                    nr.lpRemoteName = r"\\%s\c$" % remote_machine
                    nr.dwType = 0
                    nr.lpLocalName = None
                    win32wnet.WNetAddConnection2(nr, password, username, 0)
                except Exception as e:
                    self.log.error("Failed to make remote connection %s" % str(e))
                    return

            ## counter_data_types allows the precision with which counters are queried
            ## to be configured on a per-metric basis. In the metric instance, precision
            ## should be specified as
            ## counter_data_types:
            ## - iis.httpd_request_method.get,int
            ## - iis.net.bytes_rcvd,float
            ##
            ## the above would query the counter associated with iis.httpd_request_method.get
            ## as an integer (LONG) and iis.net.bytes_rcvd as a double
            datatypes = {}
            precisions = instance.get('counter_data_types')
            if precisions is not None:
                if not isinstance(precisions, list):
                    self.log.warning("incorrect type for counter_data_type %s" % str(precisions))
                else:
                    for p in precisions:
                        k, v = p.split(",")
                        v = v.lower().strip()
                        if v in int_types:
                            self.log.info("Setting datatype for %s to integer" % k)
                            datatypes[k] = DATA_TYPE_INT
                        elif v in double_types:
                            self.log.info("Setting datatype for %s to double" % k)
                            datatypes[k] = DATA_TYPE_DOUBLE
                        else:
                            self.log.warning("Unknown data type %s" % str(v))

            # list of the metrics. Each entry is itself a list, containing the
            # pdh name, datadog metric name, type, and the pdh counter object
            for counterset, inst_name, counter_name, dd_name, mtype in counter_list:
                m = getattr(self, mtype.lower())
                precision = datatypes.get(dd_name)
                obj = WinPDHCounter(counterset, counter_name, self.log, inst_name,
                                    machine_name=remote_machine, precision=precision)
                entry = [inst_name, dd_name, m, obj]
                self.log.debug("entry: %s" % str(entry))
                self._metrics[key].append(entry)

            # get any additional metrics in the instance
            addl_metrics = instance.get('additional_metrics')
            if addl_metrics is not None:
                for counterset, inst_name, counter_name, dd_name, mtype in addl_metrics:
                    if inst_name.lower() == "none" or len(inst_name) == 0 or \
                            inst_name == "*" or inst_name.lower() == "all":
                        inst_name = None
                    m = getattr(self, mtype.lower())
                    precision = datatypes.get(dd_name)
                    obj = WinPDHCounter(counterset, counter_name, self.log, inst_name,
                                        machine_name=remote_machine, precision=precision)
                    entry = [inst_name, dd_name, m, obj]
                    self.log.debug("additional metric entry: %s" % str(entry))
                    self._metrics[key].append(entry)

    except Exception as e:
        self.log.debug("Exception in PDH init: %s", str(e))
        raise
def check(self, instance): # Connect to the WMI provider host = instance.get('host', "localhost") username = instance.get('username', "") password = instance.get('password', "") instance_tags = instance.get('tags', []) notify = instance.get('notify', []) user = instance.get('user') ltypes = instance.get('type', []) source_names = instance.get('source_name', []) log_files = instance.get('log_file', []) event_ids = instance.get('event_id', []) message_filters = instance.get('message_filters', []) event_format = instance.get('event_format') instance_hash = hash_mutable(instance) instance_key = self._get_instance_key(host, self.NAMESPACE, self.EVENT_CLASS, instance_hash) # Store the last timestamp by instance if instance_key not in self.last_ts: self.last_ts[instance_key] = datetime.utcnow() return # Event properties event_properties = list(self.EVENT_PROPERTIES) if event_format is not None: event_properties.extend(list(set(self.EXTRA_EVENT_PROPERTIES) & set(event_format))) else: event_properties.extend(self.EXTRA_EVENT_PROPERTIES) # Event filters query = {} filters = [] last_ts = self.last_ts[instance_key] query['TimeGenerated'] = ('>=', self._dt_to_wmi(last_ts)) if user: query['User'] = ('=', user) if ltypes: query['Type'] = [] for ltype in ltypes: query['Type'].append(('=', ltype)) if source_names: query['SourceName'] = [] for source_name in source_names: query['SourceName'].append(('=', source_name)) if log_files: query['LogFile'] = [] for log_file in log_files: query['LogFile'].append(('=', log_file)) if event_ids: query['EventCode'] = [] for event_id in event_ids: query['EventCode'].append(('=', event_id)) if message_filters: query['NOT Message'] = [] query['Message'] = [] for filt in message_filters: if filt[0] == '-': query['NOT Message'].append(('LIKE', filt[1:])) else: query['Message'].append(('LIKE', filt)) filters.append(query) wmi_sampler = self._get_wmi_sampler( instance_key, self.EVENT_CLASS, event_properties, filters=filters, host=host, namespace=self.NAMESPACE, username=username, password=password, and_props=['Message'] ) try: wmi_sampler.sample() except TimeoutException: self.log.warning( u"[Win32EventLog] WMI query timed out." u" class={wmi_class} - properties={wmi_properties} -" u" filters={filters} - tags={tags}".format( wmi_class=self.EVENT_CLASS, wmi_properties=event_properties, filters=filters, tags=instance_tags ) ) else: for ev in wmi_sampler: # for local events we dont need to specify a hostname hostname = None if (host == "localhost" or host == ".") else host log_ev = LogEvent( ev, self.log, hostname, instance_tags, notify, self._tag_event_id, event_format ) # Since WQL only compares on the date and NOT the time, we have to # do a secondary check to make sure events are after the last # timestamp if log_ev.is_after(last_ts): self.event(log_ev.to_event_dict()) else: self.log.debug('Skipping event after %s. ts=%s' % (last_ts, log_ev.timestamp)) # Update the last time checked self.last_ts[instance_key] = datetime.utcnow()
def check(self, instance):
    sites = instance.get('sites')
    if sites is None:
        expected_sites = set()
    else:
        expected_sites = set(sites)
    # _Total should always be in the list of expected sites; we always
    # report _Total
    if "_Total" not in expected_sites:
        expected_sites.add("_Total")

    self.log.debug("expected sites is %s" % str(expected_sites))
    key = hash_mutable(instance)
    for inst_name, dd_name, metric_func, counter in self._metrics[key]:
        try:
            try:
                vals = counter.get_all_values()
            except Exception as e:
                self.log.error("Failed to get_all_values %s %s" % (inst_name, dd_name))
                continue

            for sitename, val in vals.iteritems():
                tags = []
                if key in self._tags:
                    tags = self._tags[key]

                try:
                    if not counter.is_single_instance():
                        # Skip any sites we don't specifically want.
                        if not sites:
                            tags.append("site:{0}".format(self.normalize(sitename)))
                        # always report total
                        elif sitename == "_Total":
                            tags.append("site:{0}".format(self.normalize(sitename)))
                        elif sitename not in sites:
                            continue
                        else:
                            tags.append("site:{0}".format(self.normalize(sitename)))
                except Exception as e:
                    self.log.error("Caught exception %s setting tags" % str(e))

                try:
                    metric_func(dd_name, val, tags)
                except Exception as e:
                    self.log.error("metric_func: %s %s %s" % (dd_name, str(val), str(e)))

                if dd_name == "iis.uptime":
                    uptime = int(val)
                    status = AgentCheck.CRITICAL if uptime == 0 else AgentCheck.OK
                    self.service_check(self.SERVICE_CHECK, status,
                                       tags=['site:{0}'.format(self.normalize(sitename))])
                    if sitename in expected_sites:
                        self.log.debug("Removing %s from expected sites" % sitename)
                        expected_sites.remove(sitename)
                    else:
                        self.log.warning("site not in expected_sites %s" % sitename)

        except Exception as e:
            # don't give up on all of the metrics because one failed
            self.log.error("IIS Failed to get metric data for %s %s: %s" % (inst_name, dd_name, str(e)))

    for site in expected_sites:
        self.service_check(self.SERVICE_CHECK, AgentCheck.CRITICAL,
                           tags=['site:{0}'.format(self.normalize(site))])
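# Sketch of the site handling above, assuming 'sites' is a plain list of IIS
# site names: _Total is always reported, listed sites are tagged and tracked,
# and any other site in the PDH values is skipped. Site names are made up.
example_iis_instance = {
    'sites': ['Default Web Site', 'ExampleSite'],
    # effective expected_sites == {'Default Web Site', 'ExampleSite', '_Total'};
    # any expected site whose uptime counter never shows up gets a CRITICAL
    # service check at the end of the run.
}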
def check(self, instance): # Connect to the WMI provider host = instance.get('host', "localhost") username = instance.get('username', "") password = instance.get('password', "") instance_tags = instance.get('tags', []) notify = instance.get('notify', []) user = instance.get('user') ltypes = instance.get('type', []) source_names = instance.get('source_name', []) log_files = instance.get('log_file', []) event_ids = instance.get('event_id', []) message_filters = instance.get('message_filters', []) instance_hash = hash_mutable(instance) instance_key = self._get_instance_key(host, self.NAMESPACE, self.CLASS, instance_hash) # Store the last timestamp by instance if instance_key not in self.last_ts: self.last_ts[instance_key] = datetime.utcnow() return query = {} filters = [] last_ts = self.last_ts[instance_key] query['TimeGenerated'] = ('>=', self._dt_to_wmi(last_ts)) if user: query['User'] = ('=', user) if ltypes: query['Type'] = [] for ltype in ltypes: query['Type'].append(('=', ltype)) if source_names: query['SourceName'] = [] for source_name in source_names: query['SourceName'].append(('=', source_name)) if log_files: query['LogFile'] = [] for log_file in log_files: query['LogFile'].append(('=', log_file)) if event_ids: query['EventCode'] = [] for event_id in event_ids: query['EventCode'].append(('=', event_id)) if message_filters: query['NOT Message'] = [] query['Message'] = [] for filt in message_filters: if filt[0] == '-': query['NOT Message'].append(('LIKE', filt[1:])) else: query['Message'].append(('LIKE', filt)) filters.append(query) wmi_sampler = self._get_wmi_sampler(instance_key, self.CLASS, self.EVENT_PROPERTIES, filters=filters, host=host, namespace=self.NAMESPACE, username=username, password=password, and_props=['Message']) try: wmi_sampler.sample() except TimeoutException: self.log.warning( u"[Win32EventLog] WMI query timed out." u" class={wmi_class} - properties={wmi_properties} -" u" filters={filters} - tags={tags}".format( wmi_class=self.CLASS, wmi_properties=self.EVENT_PROPERTIES, filters=filters, tags=instance_tags)) else: for ev in wmi_sampler: # for local events we dont need to specify a hostname hostname = None if (host == "localhost" or host == ".") else host log_ev = LogEvent(ev, hostname, instance_tags, notify, self.init_config.get('tag_event_id', False)) # Since WQL only compares on the date and NOT the time, we have to # do a secondary check to make sure events are after the last # timestamp if log_ev.is_after(last_ts): self.event(log_ev.to_event_dict()) else: self.log.debug('Skipping event after %s. ts=%s' % (last_ts, log_ev.timestamp)) # Update the last time checked self.last_ts[instance_key] = datetime.utcnow()
def __init__(self, name, init_config, agentConfig, instances, counter_list):
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self._countersettypes = {}
    self._counters = {}
    self._metrics = {}
    self._tags = {}

    try:
        for instance in instances:
            key = hash_mutable(instance)

            cfg_tags = instance.get('tags')
            if cfg_tags is not None:
                if not isinstance(cfg_tags, list):
                    self.log.error("Tags must be configured as a list")
                    raise ValueError("Tags must be type list, not %s" % str(type(cfg_tags)))
                self._tags[key] = list(cfg_tags)

            remote_machine = None
            host = instance.get('host')
            self._metrics[key] = []
            if host is not None and host != ".":
                try:
                    remote_machine = host

                    username = instance.get('username')
                    password = instance.get('password')
                    nr = win32wnet.NETRESOURCE()
                    nr.lpRemoteName = r"\\%s\c$" % remote_machine
                    nr.dwType = 0
                    nr.lpLocalName = None
                    win32wnet.WNetAddConnection2(nr, password, username, 0)
                except Exception as e:
                    self.log.error("Failed to make remote connection %s" % str(e))
                    return

            # list of the metrics. Each entry is itself a list, containing the
            # pdh name, datadog metric name, type, and the pdh counter object
            for counterset, inst_name, counter_name, dd_name, mtype in counter_list:
                m = getattr(self, mtype.lower())
                obj = WinPDHCounter(counterset, counter_name, self.log, inst_name,
                                    machine_name=remote_machine)
                entry = [inst_name, dd_name, m, obj]
                self.log.debug("entry: %s" % str(entry))
                self._metrics[key].append(entry)

            # get any additional metrics in the instance
            addl_metrics = instance.get('additional_metrics')
            if addl_metrics is not None:
                for counterset, inst_name, counter_name, dd_name, mtype in addl_metrics:
                    if inst_name.lower() == "none" or len(inst_name) == 0 or \
                            inst_name == "*" or inst_name.lower() == "all":
                        inst_name = None
                    m = getattr(self, mtype.lower())
                    obj = WinPDHCounter(counterset, counter_name, self.log, inst_name,
                                        machine_name=remote_machine)
                    entry = [inst_name, dd_name, m, obj]
                    self.log.debug("additional metric entry: %s" % str(entry))
                    self._metrics[key].append(entry)

    except Exception as e:
        self.log.debug("Exception in PDH init: %s", str(e))
        raise
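# Hedged example instance for remote collection as handled above: a 'host'
# other than "." triggers a WNetAddConnection2 to \\host\c$ with the given
# credentials before the counters are opened. Each 'additional_metrics' row is
# [counterset, instance name ("none"/""/"*"/"all" => no specific instance),
# counter name, datadog metric name, metric type]. Host, credentials, and
# metric names are placeholders.
example_remote_instance = {
    'host': 'REMOTE-HOST-01',
    'username': r'EXAMPLEDOMAIN\agent',
    'password': 'changeme',
    'additional_metrics': [
        ['Processor', 'none', '% Processor Time', 'example.proc.time', 'gauge'],
    ],
}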
def check(self, instance):
    # Instance state is mutable, any changes to it will be reflected in self._instance_states
    instance_state = self._instance_states[hash_mutable(instance)]

    self._check_for_leader_change(instance, instance_state)

    peers = self.get_peers_in_cluster(instance)
    main_tags = []
    agent_dc = self._get_agent_datacenter(instance, instance_state)

    if agent_dc is not None:
        main_tags.append('consul_datacenter:{0}'.format(agent_dc))

    for tag in instance.get('tags', []):
        main_tags.append(tag)

    if not self._is_instance_leader(instance, instance_state):
        self.gauge("consul.peers", len(peers), tags=main_tags + ["mode:follower"])
        self.log.debug("This consul agent is not the cluster leader. "
                       "Skipping service and catalog checks for this instance")
        return
    else:
        self.gauge("consul.peers", len(peers), tags=main_tags + ["mode:leader"])

    service_check_tags = ['consul_url:{0}'.format(instance.get('url'))]
    perform_catalog_checks = instance.get('catalog_checks',
                                          self.init_config.get('catalog_checks'))
    perform_network_latency_checks = instance.get('network_latency_checks',
                                                  self.init_config.get('network_latency_checks'))

    try:
        # Make service checks from health checks for all services in catalog
        health_state = self.consul_request(instance, '/v1/health/state/any')

        sc = {}
        # compute the highest status level (OK < WARNING < CRITICAL) of a check
        # among all the nodes it is running on
        for check in health_state:
            sc_id = '{0}/{1}/{2}'.format(check['CheckID'], check.get('ServiceID', ''),
                                         check.get('ServiceName', ''))
            status = self.STATUS_SC.get(check['Status'])
            if status is None:
                status = AgentCheck.UNKNOWN

            if sc_id not in sc:
                tags = ["check:{0}".format(check["CheckID"])]
                if check["ServiceName"]:
                    tags.append("service:{0}".format(check["ServiceName"]))
                if check["ServiceID"]:
                    tags.append("consul_service_id:{0}".format(check["ServiceID"]))
                sc[sc_id] = {'status': status, 'tags': tags}

            elif self.STATUS_SEVERITY[status] > self.STATUS_SEVERITY[sc[sc_id]['status']]:
                sc[sc_id]['status'] = status

        for s in sc.values():
            self.service_check(self.HEALTH_CHECK, s['status'], tags=main_tags + s['tags'])

    except Exception as e:
        self.log.error(e)
        self.service_check(self.CONSUL_CHECK, AgentCheck.CRITICAL, tags=service_check_tags)
    else:
        self.service_check(self.CONSUL_CHECK, AgentCheck.OK, tags=service_check_tags)

    if perform_catalog_checks:
        # Collect node by service, and service by node counts for a whitelist of services
        services = self.get_services_in_cluster(instance)
        service_whitelist = instance.get('service_whitelist',
                                         self.init_config.get('service_whitelist', []))
        max_services = instance.get('max_services',
                                    self.init_config.get('max_services', self.MAX_SERVICES))

        services = self._cull_services_list(services, service_whitelist, max_services)

        # {node_id: {"up": 0, "passing": 0, "warning": 0, "critical": 0}}
        nodes_to_service_status = defaultdict(lambda: defaultdict(int))

        for service in services:
            # For every service in the cluster, gauge the following:
            # `consul.catalog.nodes_up` : # of Nodes registered with that service
            # `consul.catalog.nodes_passing` : # of Nodes with service status `passing` from those registered
            # `consul.catalog.nodes_warning` : # of Nodes with service status `warning` from those registered
            # `consul.catalog.nodes_critical` : # of Nodes with service status `critical` from those registered

            service_tags = ['consul_service_id:{0}'.format(service)]

            nodes_with_service = self.get_nodes_with_service(instance, service)

            # {'up': 0, 'passing': 0, 'warning': 0, 'critical': 0}
            node_status = defaultdict(int)

            for node in nodes_with_service:
                # The node_id is n['Node']['Node']
                node_id = node.get('Node', {}).get("Node")

                # An additional service is registered on this node. Bump up the counter
                nodes_to_service_status[node_id]["up"] += 1

                # If there is no Check for the node then Consul and dd-agent consider it up
                if 'Checks' not in node:
                    node_status['passing'] += 1
                    node_status['up'] += 1
                else:
                    found_critical = False
                    found_warning = False
                    found_serf_health = False

                    for check in node['Checks']:
                        if check['CheckID'] == 'serfHealth':
                            found_serf_health = True

                            # For backwards compatibility, the "up" node_status is computed
                            # based on the total # of nodes 'running' as part of the service.
                            # If the serfHealth is `critical` it means the Consul agent isn't
                            # even responding, and we don't register the node as `up`
                            if check['Status'] != 'critical':
                                node_status["up"] += 1
                                continue

                        if check['Status'] == 'critical':
                            found_critical = True
                            break
                        elif check['Status'] == 'warning':
                            found_warning = True
                            # Keep looping in case there is a critical status

                    # Increment the counters based on what was found in Checks
                    # `critical` checks override `warning`s, and if neither are found,
                    # register the node as `passing`
                    if found_critical:
                        node_status['critical'] += 1
                        nodes_to_service_status[node_id]["critical"] += 1
                    elif found_warning:
                        node_status['warning'] += 1
                        nodes_to_service_status[node_id]["warning"] += 1
                    else:
                        if not found_serf_health:
                            # We have not found a serfHealth check for this node, which is
                            # unexpected. If we get here assume this node's status is "up",
                            # since we register it as 'passing'
                            node_status['up'] += 1

                        node_status['passing'] += 1
                        nodes_to_service_status[node_id]["passing"] += 1

            for status_key in self.STATUS_SC:
                status_value = node_status[status_key]
                self.gauge(
                    '{0}.nodes_{1}'.format(self.CONSUL_CATALOG_CHECK, status_key),
                    status_value,
                    tags=main_tags + service_tags
                )

        for node, service_status in nodes_to_service_status.iteritems():
            # For every node discovered for whitelisted services, gauge the following:
            # `consul.catalog.services_up` : Total services registered on node
            # `consul.catalog.services_passing` : Total passing services on node
            # `consul.catalog.services_warning` : Total warning services on node
            # `consul.catalog.services_critical` : Total critical services on node
            node_tags = ['consul_node_id:{0}'.format(node)]
            self.gauge('{0}.services_up'.format(self.CONSUL_CATALOG_CHECK),
                       len(services),
                       tags=main_tags + node_tags)

            for status_key in self.STATUS_SC:
                status_value = service_status[status_key]
                self.gauge(
                    '{0}.services_{1}'.format(self.CONSUL_CATALOG_CHECK, status_key),
                    status_value,
                    tags=main_tags + node_tags
                )

    if perform_network_latency_checks:
        self.check_network_latency(instance, agent_dc, main_tags)