Example #1
0
    def __init__(self, name, init_config, agentConfig, instances=None):
        """
        Base constructor for PDH-backed checks.

        For every configured instance, pre-builds one WinPDHCounter per
        configured metric so that check() only has to read values.

        :param name: check name
        :param init_config: the check's init_config section
        :param agentConfig: agent-wide configuration
        :param instances: list of instance config dicts; each may define
            'countersetname', 'tags' (a list of tag strings) and 'metrics'
            (a list of [pdh_counter_name, dd_metric_name, metric_type]).
        """
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
        self._countersettypes = {}
        self._counters = {}
        self._metrics = {}
        self._tags = {}

        try:
            for instance in instances:
                key = hash_mutable(instance)
                counterset = instance.get('countersetname')

                cfg_tags = instance.get('tags')
                if cfg_tags is not None:
                    # FIX: the old code did `cfg_tags.join(",")`, which calls
                    # str.join on the tag *list* (AttributeError) and would
                    # then have split the result into single characters via
                    # list(). Just keep a copy of the configured tag list.
                    self._tags[key] = list(cfg_tags) if cfg_tags else []

                metrics = instance.get('metrics')
                # list of the metrics.  Each entry is itself an entry,
                # which is the pdh name, datadog metric name, type, and the
                # pdh counter object
                self._metrics[key] = []
                for inst_name, dd_name, mtype in metrics:
                    # mtype names the submission method (e.g. 'gauge').
                    m = getattr(self, mtype.lower())
                    obj = WinPDHCounter(counterset, inst_name, self.log)
                    entry = [inst_name, dd_name, m, obj]
                    self.log.debug("entry: %s" % str(entry))
                    self._metrics[key].append(entry)

        except Exception as e:
            self.log.debug("Exception in PDH init: %s", str(e))
            raise
Example #2
0
    def test_network_latency_checks(self):
        """Network latency (RTT) metrics are derived from Serf coordinates
        when this agent is the cluster leader."""
        self.check = load_check(self.CHECK_NAME, MOCK_CONFIG_NETWORK_LATENCY_CHECKS,
                                self.DEFAULT_AGENT_CONFIG)

        mocks = self._get_consul_mocks()

        # We start out as the leader, and stay that way
        instance_hash = hash_mutable(MOCK_CONFIG_NETWORK_LATENCY_CHECKS['instances'][0])
        self.check._instance_states[instance_hash].last_known_leader = self.mock_get_cluster_leader_A(None)

        self.run_check(MOCK_CONFIG_NETWORK_LATENCY_CHECKS, mocks=mocks)

        latency = [m for m in self.metrics if m[0].startswith('consul.net.')]
        latency.sort()
        # Make sure we have the expected number of metrics
        # FIX: assertEquals is a deprecated alias of assertEqual (removed in
        # Python 3.12); use the canonical name throughout.
        self.assertEqual(19, len(latency))

        # Only 3 dc-latency metrics since we only do source = self
        dc = [m for m in latency if '.dc.latency.' in m[0]]
        self.assertEqual(3, len(dc))
        self.assertEqual(1.6746410750238774, dc[0][2])

        # 16 latency metrics, 2 nodes * 8 metrics each
        node = [m for m in latency if '.node.latency.' in m[0]]
        self.assertEqual(16, len(node))
        self.assertEqual(0.26577747932995816, node[0][2])
Example #3
0
    def test_network_latency_checks(self):
        """Network latency (RTT) metrics are derived from Serf coordinates
        when this agent is the cluster leader."""
        self.check = load_check(self.CHECK_NAME, MOCK_CONFIG_NETWORK_LATENCY_CHECKS,
                                self.DEFAULT_AGENT_CONFIG)

        mocks = self._get_consul_mocks()

        # We start out as the leader, and stay that way
        instance_hash = hash_mutable(MOCK_CONFIG_NETWORK_LATENCY_CHECKS['instances'][0])
        self.check._instance_states[instance_hash].last_known_leader = self.mock_get_cluster_leader_A(None)

        self.run_check(MOCK_CONFIG_NETWORK_LATENCY_CHECKS, mocks=mocks)

        latency = [m for m in self.metrics if m[0].startswith('consul.net.')]
        latency.sort()
        # Make sure we have the expected number of metrics
        # FIX: assertEquals is a deprecated alias of assertEqual (removed in
        # Python 3.12); use the canonical name throughout.
        self.assertEqual(19, len(latency))

        # Only 3 dc-latency metrics since we only do source = self
        dc = [m for m in latency if '.dc.latency.' in m[0]]
        self.assertEqual(3, len(dc))
        self.assertEqual(1.6746410750238774, dc[0][2])

        # 16 latency metrics, 2 nodes * 8 metrics each
        node = [m for m in latency if '.node.latency.' in m[0]]
        self.assertEqual(16, len(node))
        self.assertEqual(0.26577747932995816, node[0][2])
Example #4
0
    def check(self, instance):
        """
        Sample IIS per-site WMI counters and submit events and metrics.

        On Server 2008 the counter property is spelled 'TotalBytesTransfered'
        (sic), so the property list is patched when is_2008 is set.
        """
        # Connect to the WMI provider
        host = instance.get('host', "localhost")
        provider = instance.get('provider')
        user = instance.get('username', "")
        password = instance.get('password', "")
        instance_tags = instance.get('tags', [])
        sites = instance.get('sites', ['_Total'])
        is_2008 = _is_affirmative(instance.get('is_2008', False))

        # hash_mutable keys the cached sampler for this (mutable) instance.
        instance_hash = hash_mutable(instance)
        instance_key = self._get_instance_key(host, self.NAMESPACE, self.CLASS,
                                              instance_hash)
        # One equality filter per configured site name.
        filters = map(lambda x: {"Name": tuple(('=', x))}, sites)

        metrics_by_property, properties = self._get_wmi_properties(
            instance_key, self.METRICS, [])

        if is_2008:
            for idx, prop in enumerate(properties):
                if prop == "TotalBytesTransferred":
                    properties[idx] = "TotalBytesTransfered"
                    break

        wmi_sampler = self._get_wmi_sampler(instance_key,
                                            self.CLASS,
                                            properties,
                                            filters=filters,
                                            host=host,
                                            namespace=self.NAMESPACE,
                                            provider=provider,
                                            username=user,
                                            password=password)

        # Sample, extract & submit metrics
        try:
            wmi_sampler.sample()

            metrics = self._extract_metrics(wmi_sampler, sites, instance_tags)
        except TimeoutException:
            self.log.warning(
                u"[IIS] WMI query timed out."
                u" class={wmi_class} - properties={wmi_properties} -"
                u" filters={filters} - tags={instance_tags}".format(
                    wmi_class=self.CLASS,
                    wmi_properties=properties,
                    filters=filters,
                    instance_tags=instance_tags))
        except pythoncom.com_error as e:
            if '0x80041017' in str(e):
                self.warning(
                    u"You may be running IIS6/7 which reports metrics a "
                    u"little differently. Try enabling the is_2008 flag for this instance."
                )
            # FIX: bare `raise` re-raises the active exception with its
            # original traceback; `raise e` discarded it (Python 2).
            raise
        else:
            self._submit_events(wmi_sampler, sites)
            self._submit_metrics(metrics, metrics_by_property)
Example #5
0
    def check(self, instance):
        """
        Fetch WMI metrics.

        Samples the configured WMI class through a cached WMISampler and
        submits the extracted metric values; a query timeout is logged as a
        warning and the run is skipped.
        """
        # Connection settings
        host = instance.get('host', "localhost")
        namespace = instance.get('namespace', "root\\cimv2")
        provider = instance.get('provider')
        username = instance.get('username', "")
        password = instance.get('password', "")

        # What to query and how to tag it
        wmi_class = instance.get('class')
        metric_cfg = instance.get('metrics')
        filters = instance.get('filters')
        tag_by = instance.get('tag_by', "")
        tag_queries = instance.get('tag_queries', [])
        constant_tags = instance.get('constant_tags')

        # A stable hash of the (mutable) instance dict keys the cached sampler.
        instance_key = self._get_instance_key(
            host, namespace, wmi_class, hash_mutable(instance))

        metric_defs, properties = self._get_wmi_properties(
            instance_key, metric_cfg, tag_queries)

        sampler = self._get_wmi_sampler(
            instance_key,
            wmi_class,
            properties,
            tag_by=tag_by,
            filters=filters,
            host=host,
            namespace=namespace,
            provider=provider,
            username=username,
            password=password,
        )

        # Sample, then extract & submit on success only.
        try:
            sampler.sample()
            extracted = self._extract_metrics(sampler, tag_by, tag_queries,
                                              constant_tags)
        except TimeoutException:
            self.log.warning(
                u"WMI query timed out."
                u" class={wmi_class} - properties={wmi_properties} -"
                u" filters={filters} - tag_queries={tag_queries}".format(
                    wmi_class=wmi_class,
                    wmi_properties=properties,
                    filters=filters,
                    tag_queries=tag_queries))
        else:
            self._submit_metrics(extracted, metric_defs)
Example #6
0
    def check(self, instance):
        """
        Sample IIS per-site WMI counters and submit events and metrics.

        'TotalBytesTransfered' (sic) is the property spelling on Server 2008,
        hence the is_2008 rewrite below.
        """
        # Connect to the WMI provider
        host = instance.get('host', "localhost")
        provider = instance.get('provider')
        user = instance.get('username', "")
        password = instance.get('password', "")
        instance_tags = instance.get('tags', [])
        sites = instance.get('sites', ['_Total'])
        is_2008 = _is_affirmative(instance.get('is_2008', False))

        instance_hash = hash_mutable(instance)
        instance_key = self._get_instance_key(host, self.NAMESPACE, self.CLASS, instance_hash)
        # One equality filter per configured site name.
        filters = map(lambda x: {"Name": tuple(('=', x))}, sites)

        metrics_by_property, properties = self._get_wmi_properties(instance_key, self.METRICS, [])

        if is_2008:
            for idx, prop in enumerate(properties):
                if prop == "TotalBytesTransferred":
                    properties[idx] = "TotalBytesTransfered"
                    break

        wmi_sampler = self._get_wmi_sampler(
            instance_key,
            self.CLASS, properties,
            filters=filters,
            host=host, namespace=self.NAMESPACE, provider=provider,
            username=user, password=password
        )

        # Sample, extract & submit metrics
        try:
            wmi_sampler.sample()

            metrics = self._extract_metrics(wmi_sampler, sites, instance_tags)
        except TimeoutException:
            self.log.warning(
                u"[IIS] WMI query timed out."
                u" class={wmi_class} - properties={wmi_properties} -"
                u" filters={filters} - tags={instance_tags}".format(
                    wmi_class=self.CLASS, wmi_properties=properties,
                    filters=filters, instance_tags=instance_tags
                )
            )
        except pythoncom.com_error as e:
            if '0x80041017' in str(e):
                self.warning(
                    u"You may be running IIS6/7 which reports metrics a "
                    u"little differently. Try enabling the is_2008 flag for this instance."
                )
            # FIX: bare `raise` keeps the original traceback; `raise e` reset it.
            raise
        else:
            self._submit_events(wmi_sampler, sites)
            self._submit_metrics(metrics, metrics_by_property)
Example #7
0
    def check(self, instance):
        """Read every pre-built PDH counter for this instance and submit its
        values, tagging multi-instance counters with the instance name."""
        key = hash_mutable(instance)
        for inst_name, dd_name, metric_func, counter in self._metrics[key]:
            vals = counter.get_all_values()
            # FIX: the inner loop used to rebind `key`, shadowing the instance
            # hash computed above, so the per-instance tag lookup (and the
            # instance tag itself) was wrong after the first value.
            for instance_name, val in vals.iteritems():
                tags = []
                if key in self._tags:
                    # FIX: copy the stored list; appending below must not
                    # mutate the tags cached in self._tags.
                    tags = list(self._tags[key])

                if not counter.is_single_instance():
                    # FIX: datadog tags use ':' as separator, not '='.
                    tag = "instance:%s" % instance_name
                    tags.append(tag)
                metric_func(dd_name, val, tags)
    def check(self, instance):
        """Sample Windows service states over WMI and report them."""
        # Connection parameters
        host = instance.get('host', "localhost")
        user = instance.get('username', "")
        password = instance.get('password', "")
        services = instance.get('services', [])
        custom_tags = instance.get('tags', [])

        instance_key = self._get_instance_key(
            host, self.NAMESPACE, self.CLASS, hash_mutable(instance))

        # Tag with the remote host unless we are looking at the local box.
        if host in ("localhost", "."):
            tags = []
        else:
            tags = [u'host:{0}'.format(host)]
        tags.extend(custom_tags)

        if not services:
            raise Exception('No services defined in windows_service.yaml')

        properties = ["Name", "State"]
        if "ALL" in services:
            self.log.debug("tracking all services")
            filters = None
        else:
            # A '%' in a service name requests a wildcard (SQL LIKE) match.
            filters = [{"Name": ('LIKE', svc) if '%' in svc else ('=', svc)}
                       for svc in services]

        sampler = self._get_wmi_sampler(
            instance_key,
            self.CLASS,
            properties,
            filters=filters,
            host=host,
            namespace=self.NAMESPACE,
            username=user,
            password=password)

        try:
            # Sample, then report each tracked service's state.
            sampler.sample()
        except TimeoutException:
            self.log.warning(
                u"[WinService] WMI query timed out."
                u" class={wmi_class} - properties={wmi_properties} -"
                u" filters={filters} - tags={tags}".format(
                    wmi_class=self.CLASS,
                    wmi_properties=properties,
                    filters=filters,
                    tags=tags))
        else:
            self._process_services(sampler, services, tags)
Example #9
0
    def check(self, instance):
        """
        Fetch WMI metrics.

        Builds (or retrieves) a cached WMISampler for this instance, samples
        the configured WMI class once, then extracts and submits the
        configured metrics. A WMI query timeout is logged as a warning and
        the run is skipped instead of raising.
        """
        # Connection information
        host = instance.get('host', "localhost")
        namespace = instance.get('namespace', "root\\cimv2")
        provider = instance.get('provider')
        username = instance.get('username', "")
        password = instance.get('password', "")

        # WMI instance
        wmi_class = instance.get('class')
        metrics = instance.get('metrics')
        filters = instance.get('filters')
        tag_by = instance.get('tag_by', "")
        tag_queries = instance.get('tag_queries', [])
        constant_tags = instance.get('constant_tags')

        # Create or retrieve an existing WMISampler
        # hash_mutable gives a stable key for this (mutable) instance dict so
        # the sampler survives across check runs.
        instance_hash = hash_mutable(instance)
        instance_key = self._get_instance_key(host, namespace, wmi_class, instance_hash)

        metric_name_and_type_by_property, properties = \
            self._get_wmi_properties(instance_key, metrics, tag_queries)

        wmi_sampler = self._get_wmi_sampler(
            instance_key,
            wmi_class, properties,
            tag_by=tag_by, filters=filters,
            host=host, namespace=namespace, provider=provider,
            username=username, password=password,
        )

        # Sample, extract & submit metrics
        try:
            wmi_sampler.sample()
            # NOTE: `metrics` is rebound here from the config list to the
            # extracted metric values.
            metrics = self._extract_metrics(wmi_sampler, tag_by, tag_queries, constant_tags)
        except TimeoutException:
            self.log.warning(
                u"WMI query timed out."
                u" class={wmi_class} - properties={wmi_properties} -"
                u" filters={filters} - tag_queries={tag_queries}".format(
                    wmi_class=wmi_class, wmi_properties=properties,
                    filters=filters, tag_queries=tag_queries
                )
            )
        else:
            self._submit_metrics(metrics, metric_name_and_type_by_property)
Example #10
0
    def test_new_leader_event(self):
        """A leader change must produce a single consul.new_leader event
        tagged with the previous and current leaders."""
        self.check = load_check(self.CHECK_NAME, MOCK_CONFIG_LEADER_CHECK, self.DEFAULT_AGENT_CONFIG)

        # Pretend we previously saw a different leader.
        state_key = hash_mutable(MOCK_CONFIG_LEADER_CHECK['instances'][0])
        self.check._instance_states[state_key].last_known_leader = 'My Old Leader'

        # The mocked cluster now reports leader B.
        mocks = self._get_consul_mocks()
        mocks['_get_cluster_leader'] = self.mock_get_cluster_leader_B
        self.run_check(MOCK_CONFIG_LEADER_CHECK, mocks=mocks)

        self.assertEqual(1, len(self.events))
        leader_event = self.events[0]
        self.assertEqual('consul.new_leader', leader_event['event_type'])
        self.assertIn('prev_consul_leader:My Old Leader', leader_event['tags'])
        self.assertIn('curr_consul_leader:My New Leader', leader_event['tags'])
Example #11
0
    def test_new_leader_event(self):
        """A leader change must produce a single consul.new_leader event
        tagged with the previous and current leaders."""
        self.check = load_check(self.CHECK_NAME, MOCK_CONFIG_LEADER_CHECK, self.DEFAULT_AGENT_CONFIG)
        instance_hash = hash_mutable(MOCK_CONFIG_LEADER_CHECK['instances'][0])
        # Seed the stored state so the mocked leader (B) registers as a change.
        self.check._instance_states[instance_hash].last_known_leader = 'My Old Leader'

        mocks = self._get_consul_mocks()
        mocks['_get_cluster_leader'] = self.mock_get_cluster_leader_B

        self.run_check(MOCK_CONFIG_LEADER_CHECK, mocks=mocks)
        self.assertEqual(len(self.events), 1)

        event = self.events[0]
        self.assertEqual(event['event_type'], 'consul.new_leader')
        self.assertIn('prev_consul_leader:My Old Leader', event['tags'])
        self.assertIn('curr_consul_leader:My New Leader', event['tags'])
Example #12
0
    def test_self_leader_event(self):
        """With self_leader_check enabled, an event fires only when THIS agent
        gains leadership; losing it updates state without emitting an event."""
        self.check = load_check(self.CHECK_NAME, MOCK_CONFIG_SELF_LEADER_CHECK,
                                self.DEFAULT_AGENT_CONFIG)
        instance_hash = hash_mutable(
            MOCK_CONFIG_SELF_LEADER_CHECK['instances'][0])
        # Seed the stored state so the first run registers a leader change.
        self.check._instance_states[
            instance_hash].last_known_leader = 'My Old Leader'

        mocks = self._get_consul_mocks()

        our_url = self.mock_get_cluster_leader_A(None)
        other_url = self.mock_get_cluster_leader_B(None)

        # We become the leader
        mocks['_get_cluster_leader'] = self.mock_get_cluster_leader_A
        self.run_check(MOCK_CONFIG_SELF_LEADER_CHECK, mocks=mocks)
        self.assertEqual(len(self.events), 1)
        self.assertEqual(
            our_url,
            self.check._instance_states[instance_hash].last_known_leader)
        event = self.events[0]
        self.assertEqual(event['event_type'], 'consul.new_leader')
        self.assertIn('prev_consul_leader:My Old Leader', event['tags'])
        self.assertIn('curr_consul_leader:%s' % our_url, event['tags'])

        # We are already the leader, no new events
        self.run_check(MOCK_CONFIG_SELF_LEADER_CHECK, mocks=mocks)
        self.assertEqual(len(self.events), 0)

        # We lose the leader, no new events
        mocks['_get_cluster_leader'] = self.mock_get_cluster_leader_B
        self.run_check(MOCK_CONFIG_SELF_LEADER_CHECK, mocks=mocks)
        self.assertEqual(len(self.events), 0)
        self.assertEqual(
            other_url,
            self.check._instance_states[instance_hash].last_known_leader)

        # We regain the leadership
        mocks['_get_cluster_leader'] = self.mock_get_cluster_leader_A
        self.run_check(MOCK_CONFIG_SELF_LEADER_CHECK, mocks=mocks)
        self.assertEqual(len(self.events), 1)
        self.assertEqual(
            our_url,
            self.check._instance_states[instance_hash].last_known_leader)
        event = self.events[0]
        self.assertEqual(event['event_type'], 'consul.new_leader')
        self.assertIn('prev_consul_leader:%s' % other_url, event['tags'])
        self.assertIn('curr_consul_leader:%s' % our_url, event['tags'])
Example #13
0
    def check(self, instance):
        """Read every pre-built PDH counter for this instance and submit its
        values, tagging multi-instance counters with the instance name."""
        self.log.debug("PDHBaseCheck: check()")
        key = hash_mutable(instance)
        for inst_name, dd_name, metric_func, counter in self._metrics[key]:
            try:
                vals = counter.get_all_values()
                for instance_name, val in vals.iteritems():
                    tags = []
                    if key in self._tags:
                        # copy: appending below must not mutate the stored list
                        tags = list(self._tags[key])

                    if not counter.is_single_instance():
                        tag = "instance:%s" % instance_name
                        tags.append(tag)
                    metric_func(dd_name, val, tags)
            except Exception as e:
                # don't give up on all of the metrics because one failed
                # (lazy %-args defer formatting; the redundant trailing `pass`
                # after the log call was removed)
                self.log.error("Failed to get data for %s %s: %s",
                               inst_name, dd_name, e)
Example #14
0
    def check(self, instance):
        """Sample IIS per-site WMI counters and submit events and metrics.
        A WMI query timeout is logged and the run is skipped."""
        # Connect to the WMI provider
        host = instance.get('host', "localhost")
        user = instance.get('username', "")
        password = instance.get('password', "")
        instance_tags = instance.get('tags', [])
        sites = instance.get('sites', ['_Total'])

        # hash_mutable keys the cached sampler for this (mutable) instance.
        instance_hash = hash_mutable(instance)
        instance_key = self._get_instance_key(host, self.NAMESPACE, self.CLASS,
                                              instance_hash)
        # One equality filter per configured site name.
        filters = map(lambda x: {"Name": tuple(('=', x))}, sites)

        metrics_by_property, properties = self._get_wmi_properties(
            instance_key, self.METRICS, [])

        wmi_sampler = self._get_wmi_sampler(instance_key,
                                            self.CLASS,
                                            properties,
                                            filters=filters,
                                            host=host,
                                            namespace=self.NAMESPACE,
                                            username=user,
                                            password=password)

        # Sample, extract & submit metrics
        try:
            wmi_sampler.sample()

            metrics = self._extract_metrics(wmi_sampler, sites, instance_tags)
        except TimeoutException:
            self.log.warning(
                u"[IIS] WMI query timed out."
                u" class={wmi_class} - properties={wmi_properties} -"
                u" filters={filters} - tags={instance_tags}".format(
                    wmi_class=self.CLASS,
                    wmi_properties=properties,
                    filters=filters,
                    instance_tags=instance_tags))
        else:
            self._submit_events(wmi_sampler, sites)
            self._submit_metrics(metrics, metrics_by_property)
Example #15
0
    def check(self, instance):
        """Read every pre-built PDH counter for this instance and submit its
        values; a failing counter is logged and the rest still run."""
        self.log.debug("PDHBaseCheck: check()")
        key = hash_mutable(instance)
        for inst_name, dd_name, metric_func, counter in self._metrics[key]:
            try:
                vals = counter.get_all_values()
                for instance_name, val in vals.iteritems():
                    tags = []
                    if key in self._tags:
                        # copy: appending below must not mutate the stored list
                        tags = list(self._tags[key])

                    if not counter.is_single_instance():
                        tag = "instance:%s" % instance_name
                        tags.append(tag)
                    metric_func(dd_name, val, tags)
            except Exception as e:
                # don't give up on all of the metrics because one failed
                # (lazy %-args defer formatting; the redundant trailing `pass`
                # after the log call was removed)
                self.log.error("Failed to get data for %s %s: %s",
                               inst_name, dd_name, e)
Example #16
0
    def check(self, instance):
        """Sample IIS per-site counters over WMI and submit events/metrics."""
        # Connection parameters
        host = instance.get('host', "localhost")
        user = instance.get('username', "")
        password = instance.get('password', "")
        instance_tags = instance.get('tags', [])
        sites = instance.get('sites', ['_Total'])

        # A stable hash of the (mutable) instance keys the cached sampler.
        instance_key = self._get_instance_key(
            host, self.NAMESPACE, self.CLASS, hash_mutable(instance))
        # One equality filter per configured site name.
        filters = [{"Name": ('=', site)} for site in sites]

        metrics_by_property, properties = self._get_wmi_properties(
            instance_key, self.METRICS, [])

        sampler = self._get_wmi_sampler(
            instance_key,
            self.CLASS,
            properties,
            filters=filters,
            host=host,
            namespace=self.NAMESPACE,
            username=user,
            password=password)

        # Sample, then extract & submit on success only.
        try:
            sampler.sample()
            metrics = self._extract_metrics(sampler, sites, instance_tags)
        except TimeoutException:
            self.log.warning(
                u"[IIS] WMI query timed out."
                u" class={wmi_class} - properties={wmi_properties} -"
                u" filters={filters} - tags={instance_tags}".format(
                    wmi_class=self.CLASS,
                    wmi_properties=properties,
                    filters=filters,
                    instance_tags=instance_tags))
        else:
            self._submit_events(sampler, sites)
            self._submit_metrics(metrics, metrics_by_property)
Example #17
0
    def test_self_leader_event(self):
        """With self_leader_check enabled, an event fires only when THIS agent
        gains leadership; losing it updates state without emitting an event."""
        self.check = load_check(self.CHECK_NAME, MOCK_CONFIG_SELF_LEADER_CHECK, self.DEFAULT_AGENT_CONFIG)
        instance_hash = hash_mutable(MOCK_CONFIG_SELF_LEADER_CHECK['instances'][0])
        # Seed the stored state so the first run registers a leader change.
        self.check._instance_states[instance_hash].last_known_leader = 'My Old Leader'

        mocks = self._get_consul_mocks()

        our_url = self.mock_get_cluster_leader_A(None)
        other_url = self.mock_get_cluster_leader_B(None)

        # We become the leader
        mocks['_get_cluster_leader'] = self.mock_get_cluster_leader_A
        self.run_check(MOCK_CONFIG_SELF_LEADER_CHECK, mocks=mocks)
        self.assertEqual(len(self.events), 1)
        self.assertEqual(our_url, self.check._instance_states[instance_hash].last_known_leader)
        event = self.events[0]
        self.assertEqual(event['event_type'], 'consul.new_leader')
        self.assertIn('prev_consul_leader:My Old Leader', event['tags'])
        self.assertIn('curr_consul_leader:%s' % our_url, event['tags'])

        # We are already the leader, no new events
        self.run_check(MOCK_CONFIG_SELF_LEADER_CHECK, mocks=mocks)
        self.assertEqual(len(self.events), 0)

        # We lose the leader, no new events
        mocks['_get_cluster_leader'] = self.mock_get_cluster_leader_B
        self.run_check(MOCK_CONFIG_SELF_LEADER_CHECK, mocks=mocks)
        self.assertEqual(len(self.events), 0)
        self.assertEqual(other_url, self.check._instance_states[instance_hash].last_known_leader)

        # We regain the leadership
        mocks['_get_cluster_leader'] = self.mock_get_cluster_leader_A
        self.run_check(MOCK_CONFIG_SELF_LEADER_CHECK, mocks=mocks)
        self.assertEqual(len(self.events), 1)
        self.assertEqual(our_url, self.check._instance_states[instance_hash].last_known_leader)
        event = self.events[0]
        self.assertEqual(event['event_type'], 'consul.new_leader')
        self.assertIn('prev_consul_leader:%s' % other_url, event['tags'])
        self.assertIn('curr_consul_leader:%s' % our_url, event['tags'])
Example #18
0
    def check(self, instance):
        """Check the state of the configured Windows services via WMI."""
        host = instance.get('host', "localhost")
        user = instance.get('username', "")
        password = instance.get('password', "")
        services = instance.get('services', [])

        instance_key = self._get_instance_key(
            host, self.NAMESPACE, self.CLASS, hash_mutable(instance))

        # Tag with the remote host unless we are looking at the local box.
        if host in ("localhost", "."):
            tags = []
        else:
            tags = [u'host:{0}'.format(host)]

        if not services:
            raise Exception('No services defined in windows_service.yaml')

        properties = ["Name", "State"]
        # One equality filter per configured service name.
        filters = [{"Name": ('=', svc)} for svc in services]
        sampler = self._get_wmi_sampler(
            instance_key,
            self.CLASS,
            properties,
            filters=filters,
            host=host,
            namespace=self.NAMESPACE,
            username=user,
            password=password)

        try:
            # Sample, then report each tracked service's state.
            sampler.sample()
        except TimeoutException:
            self.log.warning(
                u"[WinService] WMI query timed out."
                u" class={wmi_class} - properties={wmi_properties} -"
                u" filters={filters} - tags={tags}".format(
                    wmi_class=self.CLASS,
                    wmi_properties=properties,
                    filters=filters,
                    tags=tags))
        else:
            self._process_services(sampler, services, tags)
Example #19
0
    def check(self, instance):
        """Check the state of the configured Windows services via WMI.
        A WMI query timeout is logged and the run is skipped."""
        # Connect to the WMI provider
        host = instance.get('host', "localhost")
        user = instance.get('username', "")
        password = instance.get('password', "")
        services = instance.get('services', [])

        # hash_mutable keys the cached sampler for this (mutable) instance.
        instance_hash = hash_mutable(instance)
        instance_key = self._get_instance_key(host, self.NAMESPACE, self.CLASS, instance_hash)
        # Only tag with the host when it is not the local machine.
        tags = [] if (host == "localhost" or host == ".") else [u'host:{0}'.format(host)]

        if len(services) == 0:
            raise Exception('No services defined in windows_service.yaml')

        properties = ["Name", "State"]
        # One equality filter per configured service name.
        filters = map(lambda x: {"Name": tuple(('=', x))}, services)
        wmi_sampler = self._get_wmi_sampler(
            instance_key,
            self.CLASS, properties,
            filters=filters,
            host=host, namespace=self.NAMESPACE,
            username=user, password=password
        )

        try:
            # Sample, extract & submit metrics
            wmi_sampler.sample()
        except TimeoutException:
            self.log.warning(
                u"[WinService] WMI query timed out."
                u" class={wmi_class} - properties={wmi_properties} -"
                u" filters={filters} - tags={tags}".format(
                    wmi_class=self.CLASS, wmi_properties=properties,
                    filters=filters, tags=tags
                )
            )
        else:
            self._process_services(wmi_sampler, services, tags)
Example #20
0
    def check(self, instance):
        # Instance state is mutable, any changes to it will be reflected in self._instance_states
        instance_state = self._instance_states[hash_mutable(instance)]

        self._check_for_leader_change(instance, instance_state)

        peers = self.get_peers_in_cluster(instance)
        main_tags = []
        agent_dc = self._get_agent_datacenter(instance, instance_state)

        if agent_dc is not None:
            main_tags.append('consul_datacenter:{0}'.format(agent_dc))

        for tag in instance.get('tags', []):
            main_tags.append(tag)

        if not self._is_instance_leader(instance, instance_state):
            self.gauge("consul.peers",
                       len(peers),
                       tags=main_tags + ["mode:follower"])
            self.log.debug(
                "This consul agent is not the cluster leader." +
                "Skipping service and catalog checks for this instance")
            return
        else:
            self.gauge("consul.peers",
                       len(peers),
                       tags=main_tags + ["mode:leader"])

        service_check_tags = ['consul_url:{0}'.format(instance.get('url'))]
        perform_catalog_checks = instance.get(
            'catalog_checks', self.init_config.get('catalog_checks'))
        perform_network_latency_checks = instance.get(
            'network_latency_checks',
            self.init_config.get('network_latency_checks'))

        try:
            # Make service checks from health checks for all services in catalog
            health_state = self.consul_request(instance,
                                               '/v1/health/state/any')

            sc = {}
            # compute the highest status level (OK < WARNING < CRITICAL) of a check among all the nodes it is running on.
            for check in health_state:
                sc_id = '{0}/{1}/{2}'.format(check['CheckID'],
                                             check.get('ServiceID', ''),
                                             check.get('ServiceName', ''))
                status = self.STATUS_SC.get(check['Status'])
                if status is None:
                    status = AgentCheck.UNKNOWN

                if sc_id not in sc:
                    tags = ["check:{0}".format(check["CheckID"])]
                    if check["ServiceName"]:
                        tags.append("service:{0}".format(check["ServiceName"]))
                    if check["ServiceID"]:
                        tags.append("consul_service_id:{0}".format(
                            check["ServiceID"]))
                    sc[sc_id] = {'status': status, 'tags': tags}

                elif self.STATUS_SEVERITY[status] > self.STATUS_SEVERITY[
                        sc[sc_id]['status']]:
                    sc[sc_id]['status'] = status

            for s in sc.values():
                self.service_check(self.HEALTH_CHECK,
                                   s['status'],
                                   tags=main_tags + s['tags'])

        except Exception as e:
            self.log.error(e)
            self.service_check(self.CONSUL_CHECK,
                               AgentCheck.CRITICAL,
                               tags=service_check_tags)
        else:
            self.service_check(self.CONSUL_CHECK,
                               AgentCheck.OK,
                               tags=service_check_tags)

        if perform_catalog_checks:
            # Collect node by service, and service by node counts for a whitelist of services

            services = self.get_services_in_cluster(instance)
            service_whitelist = instance.get(
                'service_whitelist',
                self.init_config.get('service_whitelist', []))
            max_services = instance.get(
                'max_services',
                self.init_config.get('max_services', self.MAX_SERVICES))

            services = self._cull_services_list(services, service_whitelist,
                                                max_services)

            # {node_id: {"up: 0, "passing": 0, "warning": 0, "critical": 0}
            nodes_to_service_status = defaultdict(lambda: defaultdict(int))

            for service in services:
                # For every service in the cluster,
                # Gauge the following:
                # `consul.catalog.nodes_up` : # of Nodes registered with that service
                # `consul.catalog.nodes_passing` : # of Nodes with service status `passing` from those registered
                # `consul.catalog.nodes_warning` : # of Nodes with service status `warning` from those registered
                # `consul.catalog.nodes_critical` : # of Nodes with service status `critical` from those registered

                service_tags = self._get_service_tags(service,
                                                      services[service])

                nodes_with_service = self.get_nodes_with_service(
                    instance, service)

                # {'up': 0, 'passing': 0, 'warning': 0, 'critical': 0}
                node_status = defaultdict(int)

                for node in nodes_with_service:
                    # The node_id is n['Node']['Node']
                    node_id = node.get('Node', {}).get("Node")

                    # An additional service is registered on this node. Bump up the counter
                    nodes_to_service_status[node_id]["up"] += 1

                    # If there is no Check for the node then Consul and dd-agent consider it up
                    if 'Checks' not in node:
                        node_status['passing'] += 1
                        node_status['up'] += 1
                    else:
                        found_critical = False
                        found_warning = False
                        found_serf_health = False

                        for check in node['Checks']:
                            if check['CheckID'] == 'serfHealth':
                                found_serf_health = True

                                # For backwards compatibility, the "up" node_status is computed
                                # based on the total # of nodes 'running' as part of the service.

                                # If the serfHealth is `critical` it means the Consul agent isn't even responding,
                                # and we don't register the node as `up`
                                if check['Status'] != 'critical':
                                    node_status["up"] += 1
                                    continue

                            if check['Status'] == 'critical':
                                found_critical = True
                                break
                            elif check['Status'] == 'warning':
                                found_warning = True
                                # Keep looping in case there is a critical status

                        # Increment the counters based on what was found in Checks
                        # `critical` checks override `warning`s, and if neither are found, register the node as `passing`
                        if found_critical:
                            node_status['critical'] += 1
                            nodes_to_service_status[node_id]["critical"] += 1
                        elif found_warning:
                            node_status['warning'] += 1
                            nodes_to_service_status[node_id]["warning"] += 1
                        else:
                            if not found_serf_health:
                                # We have not found a serfHealth check for this node, which is unexpected
                                # If we get here assume this node's status is "up", since we register it as 'passing'
                                node_status['up'] += 1

                            node_status['passing'] += 1
                            nodes_to_service_status[node_id]["passing"] += 1

                for status_key in self.STATUS_SC:
                    status_value = node_status[status_key]
                    self.gauge('{0}.nodes_{1}'.format(
                        self.CONSUL_CATALOG_CHECK, status_key),
                               status_value,
                               tags=main_tags + service_tags)

            for node, service_status in nodes_to_service_status.iteritems():
                # For every node discovered for whitelisted services, gauge the following:
                # `consul.catalog.services_up` : Total services registered on node
                # `consul.catalog.services_passing` : Total passing services on node
                # `consul.catalog.services_warning` : Total warning services on node
                # `consul.catalog.services_critical` : Total critical services on node

                node_tags = ['consul_node_id:{0}'.format(node)]
                self.gauge('{0}.services_up'.format(self.CONSUL_CATALOG_CHECK),
                           len(services),
                           tags=main_tags + node_tags)

                for status_key in self.STATUS_SC:
                    status_value = service_status[status_key]
                    self.gauge('{0}.services_{1}'.format(
                        self.CONSUL_CATALOG_CHECK, status_key),
                               status_value,
                               tags=main_tags + node_tags)

        if perform_network_latency_checks:
            self.check_network_latency(instance, agent_dc, main_tags)
Example #21
0
    def __init__(self, name, init_config, agentConfig, instances,
                 counter_list):
        """Set up PDH counter objects for every configured instance.

        counter_list entries are 5-tuples of
        (counterset, instance_name, counter_name, dd_metric_name, metric_type)
        registered for every instance; each instance may add more counters
        via its 'additional_metrics' option.

        Raises ValueError if an instance's 'tags' option is not a list;
        any other failure during counter creation is logged and re-raised.
        """
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
        self._countersettypes = {}
        self._counters = {}
        # per-instance-key list of [inst_name, dd_name, metric_func, WinPDHCounter]
        self._metrics = {}
        # per-instance-key list of configured tags
        self._tags = {}

        try:
            for instance in instances:
                key = hash_mutable(instance)

                cfg_tags = instance.get('tags')
                if cfg_tags is not None:
                    if not isinstance(cfg_tags, list):
                        self.log.error("Tags must be configured as a list")
                        raise ValueError("Tags must be type list, not %s" %
                                         str(type(cfg_tags)))
                    self._tags[key] = list(cfg_tags)

                remote_machine = None
                host = instance.get('host')
                self._metrics[key] = []
                if host is not None and host != ".":
                    try:
                        remote_machine = host

                        username = instance.get('username')
                        password = instance.get('password')
                        nr = win32wnet.NETRESOURCE()
                        nr.lpRemoteName = r"\\%s\c$" % remote_machine
                        nr.dwType = 0
                        nr.lpLocalName = None
                        win32wnet.WNetAddConnection2(nr, password, username, 0)

                    except Exception as e:
                        self.log.error("Failed to make remote connection %s" %
                                       str(e))
                        # BUGFIX: this was a bare `return`, which aborted
                        # __init__ entirely and silently left every remaining
                        # instance unconfigured. Skip only the broken one.
                        continue

                # counter_data_types allows the precision with which counters
                # are queried to be configured on a per-metric basis. In the
                # metric instance, precision should be specified as
                # counter_data_types:
                # - iis.httpd_request_method.get,int
                # - iis.net.bytes_rcvd,float
                #
                # the above would query the counter associated with
                # iis.httpd_request_method.get as an integer (LONG) and
                # iis.net.bytes_rcvd as a double
                datatypes = {}
                precisions = instance.get('counter_data_types')
                if precisions is not None:
                    if not isinstance(precisions, list):
                        self.log.warning(
                            "incorrect type for counter_data_type %s" %
                            str(precisions))
                    else:
                        for p in precisions:
                            k, v = p.split(",")
                            v = v.lower().strip()
                            if v in int_types:
                                self.log.info(
                                    "Setting datatype for %s to integer" % k)
                                datatypes[k] = DATA_TYPE_INT
                            elif v in double_types:
                                self.log.info(
                                    "Setting datatype for %s to double" % k)
                                datatypes[k] = DATA_TYPE_DOUBLE
                            else:
                                self.log.warning("Unknown data type %s" %
                                                 str(v))

                # list of the metrics.  Each entry is itself an entry,
                # which is the pdh name, datadog metric name, type, and the
                # pdh counter object

                for counterset, inst_name, counter_name, dd_name, mtype in counter_list:
                    m = getattr(self, mtype.lower())

                    precision = datatypes.get(dd_name)

                    obj = WinPDHCounter(counterset,
                                        counter_name,
                                        self.log,
                                        inst_name,
                                        machine_name=remote_machine,
                                        precision=precision)

                    entry = [inst_name, dd_name, m, obj]
                    self.log.debug("entry: %s" % str(entry))
                    self._metrics[key].append(entry)

                # get any additional metrics in the instance
                addl_metrics = instance.get('additional_metrics')
                if addl_metrics is not None:
                    for counterset, inst_name, counter_name, dd_name, mtype in addl_metrics:
                        # "none"/""/"*"/"all" all mean "every counter instance"
                        if not inst_name or inst_name == "*" or \
                                inst_name.lower() in ("none", "all"):
                            inst_name = None
                        m = getattr(self, mtype.lower())

                        precision = datatypes.get(dd_name)

                        obj = WinPDHCounter(counterset,
                                            counter_name,
                                            self.log,
                                            inst_name,
                                            machine_name=remote_machine,
                                            precision=precision)
                        entry = [inst_name, dd_name, m, obj]
                        self.log.debug("additional metric entry: %s" %
                                       str(entry))
                        self._metrics[key].append(entry)

        except Exception as e:
            self.log.debug("Exception in PDH init: %s", str(e))
            raise
Example #22
0
    def __init__(self, name, init_config, agentConfig, instances, counter_list):
        """Set up PDH counter objects for every configured instance.

        counter_list entries are 5-tuples of
        (counterset, instance_name, counter_name, dd_metric_name, metric_type);
        each instance may register extra counters via 'additional_metrics'.

        Raises ValueError if an instance's 'tags' option is not a list;
        any other failure during counter creation is logged and re-raised.
        """
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
        self._countersettypes = {}
        self._counters = {}
        # per-instance-key list of [inst_name, dd_name, metric_func, WinPDHCounter]
        self._metrics = {}
        # per-instance-key list of configured tags
        self._tags = {}

        try:
            for instance in instances:
                key = hash_mutable(instance)

                cfg_tags = instance.get('tags')
                if cfg_tags is not None:
                    if not isinstance(cfg_tags, list):
                        self.log.error("Tags must be configured as a list")
                        raise ValueError("Tags must be type list, not %s" % str(type(cfg_tags)))
                    self._tags[key] = list(cfg_tags)

                remote_machine = None
                host = instance.get('host')
                self._metrics[key] = []
                if host is not None and host != ".":
                    try:
                        remote_machine = host

                        username = instance.get('username')
                        password = instance.get('password')
                        nr = win32wnet.NETRESOURCE()
                        nr.lpRemoteName = r"\\%s\c$" % remote_machine
                        nr.dwType = 0
                        nr.lpLocalName = None
                        win32wnet.WNetAddConnection2(nr, password, username, 0)

                    except Exception as e:
                        self.log.error("Failed to make remote connection %s" % str(e))
                        # BUGFIX: this was a bare `return`, which aborted
                        # __init__ entirely and silently left every remaining
                        # instance unconfigured. Skip only the broken one.
                        continue

                # counter_data_types allows the precision with which counters are queried
                # to be configured on a per-metric basis. In the metric instance, precision
                # should be specified as
                # counter_data_types:
                # - iis.httpd_request_method.get,int
                # - iis.net.bytes_rcvd,float
                #
                # the above would query the counter associated with iis.httpd_request_method.get
                # as an integer (LONG) and iis.net.bytes_rcvd as a double
                datatypes = {}
                precisions = instance.get('counter_data_types')
                if precisions is not None:
                    if not isinstance(precisions, list):
                        self.log.warning("incorrect type for counter_data_type %s" % str(precisions))
                    else:
                        for p in precisions:
                            k, v = p.split(",")
                            v = v.lower().strip()
                            if v in int_types:
                                self.log.info("Setting datatype for %s to integer" % k)
                                datatypes[k] = DATA_TYPE_INT
                            elif v in double_types:
                                self.log.info("Setting datatype for %s to double" % k)
                                datatypes[k] = DATA_TYPE_DOUBLE
                            else:
                                self.log.warning("Unknown data type %s" % str(v))

                # list of the metrics.  Each entry is itself an entry,
                # which is the pdh name, datadog metric name, type, and the
                # pdh counter object

                for counterset, inst_name, counter_name, dd_name, mtype in counter_list:
                    m = getattr(self, mtype.lower())

                    precision = datatypes.get(dd_name)

                    obj = WinPDHCounter(counterset, counter_name, self.log, inst_name, machine_name=remote_machine, precision=precision)
                    entry = [inst_name, dd_name, m, obj]
                    self.log.debug("entry: %s" % str(entry))
                    self._metrics[key].append(entry)

                # get any additional metrics in the instance
                addl_metrics = instance.get('additional_metrics')
                if addl_metrics is not None:
                    for counterset, inst_name, counter_name, dd_name, mtype in addl_metrics:
                        # "none"/""/"*"/"all" all mean "every counter instance"
                        if not inst_name or inst_name == "*" or inst_name.lower() in ("none", "all"):
                            inst_name = None
                        m = getattr(self, mtype.lower())

                        precision = datatypes.get(dd_name)

                        obj = WinPDHCounter(counterset, counter_name, self.log, inst_name, machine_name=remote_machine, precision=precision)
                        entry = [inst_name, dd_name, m, obj]
                        self.log.debug("additional metric entry: %s" % str(entry))
                        self._metrics[key].append(entry)

        except Exception as e:
            self.log.debug("Exception in PDH init: %s", str(e))
            raise
Example #23
0
    def check(self, instance):
        """Poll Win32 event-log records via WMI and forward new ones as events.

        The first run for an instance only primes its last-seen timestamp and
        returns; subsequent runs query records generated since that timestamp,
        applying the configured user/type/source/logfile/event-id/message
        filters, and emit each matching record with self.event().
        """
        # Connect to the WMI provider
        host = instance.get('host', "localhost")
        username = instance.get('username', "")
        password = instance.get('password', "")
        instance_tags = instance.get('tags', [])
        notify = instance.get('notify', [])

        user = instance.get('user')
        ltypes = instance.get('type', [])
        source_names = instance.get('source_name', [])
        log_files = instance.get('log_file', [])
        event_ids = instance.get('event_id', [])
        message_filters = instance.get('message_filters', [])
        event_format = instance.get('event_format')

        instance_hash = hash_mutable(instance)
        instance_key = self._get_instance_key(host, self.NAMESPACE, self.EVENT_CLASS, instance_hash)

        # Store the last timestamp by instance
        if instance_key not in self.last_ts:
            self.last_ts[instance_key] = datetime.utcnow()
            return

        # Event properties
        event_properties = list(self.EVENT_PROPERTIES)

        if event_format is not None:
            # only fetch the extra properties the user's format actually uses
            event_properties.extend(list(set(self.EXTRA_EVENT_PROPERTIES) & set(event_format)))
        else:
            event_properties.extend(self.EXTRA_EVENT_PROPERTIES)

        # Event filters
        query = {}
        filters = []
        last_ts = self.last_ts[instance_key]
        query['TimeGenerated'] = ('>=', self._dt_to_wmi(last_ts))
        if user:
            query['User'] = ('=', user)
        if ltypes:
            query['Type'] = []
            for ltype in ltypes:
                query['Type'].append(('=', ltype))
        if source_names:
            query['SourceName'] = []
            for source_name in source_names:
                query['SourceName'].append(('=', source_name))
        if log_files:
            query['LogFile'] = []
            for log_file in log_files:
                query['LogFile'].append(('=', log_file))
        if event_ids:
            query['EventCode'] = []
            for event_id in event_ids:
                query['EventCode'].append(('=', event_id))
        if message_filters:
            query['NOT Message'] = []
            query['Message'] = []
            for filt in message_filters:
                # a leading '-' excludes the pattern; startswith() is safe on
                # an empty filter string, where filt[0] raised IndexError
                if filt.startswith('-'):
                    query['NOT Message'].append(('LIKE', filt[1:]))
                else:
                    query['Message'].append(('LIKE', filt))

        filters.append(query)

        wmi_sampler = self._get_wmi_sampler(
            instance_key,
            self.EVENT_CLASS, event_properties,
            filters=filters,
            host=host, namespace=self.NAMESPACE,
            username=username, password=password,
            and_props=['Message']
        )

        try:
            wmi_sampler.sample()
        except TimeoutException:
            self.log.warning(
                u"[Win32EventLog] WMI query timed out."
                u" class={wmi_class} - properties={wmi_properties} -"
                u" filters={filters} - tags={tags}".format(
                    wmi_class=self.EVENT_CLASS, wmi_properties=event_properties,
                    filters=filters, tags=instance_tags
                )
            )
        else:
            for ev in wmi_sampler:
                # for local events we dont need to specify a hostname
                hostname = None if (host == "localhost" or host == ".") else host
                log_ev = LogEvent(
                    ev, self.log, hostname, instance_tags, notify,
                    self._tag_event_id, event_format
                )

                # Since WQL only compares on the date and NOT the time, we have to
                # do a secondary check to make sure events are after the last
                # timestamp
                if log_ev.is_after(last_ts):
                    self.event(log_ev.to_event_dict())
                else:
                    self.log.debug('Skipping event after %s. ts=%s' % (last_ts, log_ev.timestamp))

            # Update the last time checked
            self.last_ts[instance_key] = datetime.utcnow()
Example #24
0
    def check(self, instance):
        """Collect per-site IIS PDH metrics and emit site service checks.

        Reports every counter registered for this instance in __init__,
        tagging multi-instance counters with the (normalized) site name.
        '_Total' is always reported; any expected site with no uptime sample
        this run is service-checked CRITICAL at the end.
        """
        sites = instance.get('sites')
        if sites is None:
            expected_sites = set()
        else:
            expected_sites = set(sites)
        # _Total should always be in the list of expected sites; we always
        # report _Total
        if "_Total" not in expected_sites:
            expected_sites.add("_Total")

        self.log.debug("expected sites is %s" % str(expected_sites))
        key = hash_mutable(instance)
        for inst_name, dd_name, metric_func, counter in self._metrics[key]:
            try:
                try:
                    vals = counter.get_all_values()
                except Exception as e:
                    self.log.error("Failed to get_all_values %s %s" % (inst_name, dd_name))
                    continue

                for sitename, val in vals.iteritems():
                    tags = []
                    if key in self._tags:
                        # BUGFIX: copy the stored tag list. Aliasing it and
                        # appending site tags below mutated self._tags[key],
                        # so tags accumulated across sites and check runs.
                        tags = list(self._tags[key])

                    try:
                        if not counter.is_single_instance():
                            # Skip any sites we don't specifically want.
                            if not sites:
                                tags.append("site:{0}".format(self.normalize(sitename)))
                            # always report total
                            elif sitename == "_Total":
                                tags.append("site:{0}".format(self.normalize(sitename)))
                            elif sitename not in sites:
                                continue
                            else:
                                tags.append("site:{0}".format(self.normalize(sitename)))
                    except Exception as e:
                        self.log.error("Caught exception %s setting tags" % str(e))

                    try:
                        metric_func(dd_name, val, tags)
                    except Exception as e:
                        self.log.error("metric_func: %s %s %s" % (dd_name, str(val), str(e)))

                    if dd_name == "iis.uptime":
                        # an uptime of zero means the site is stopped
                        uptime = int(val)
                        status = AgentCheck.CRITICAL if uptime == 0 else AgentCheck.OK
                        self.service_check(self.SERVICE_CHECK, status, tags=['site:{0}'.format(self.normalize(sitename))])
                        if sitename in expected_sites:
                            self.log.debug("Removing %s from expected sites" % sitename)
                            expected_sites.remove(sitename)
                        else:
                            self.log.warning("site not in expected_sites %s" % sitename)

            except Exception as e:
                # don't give up on all of the metrics because one failed
                self.log.error("IIS Failed to get metric data for %s %s: %s" % (inst_name, dd_name, str(e)))

        # any site we never saw an uptime sample for is considered down
        for site in expected_sites:
            self.service_check(self.SERVICE_CHECK, AgentCheck.CRITICAL,
                            tags=['site:{0}'.format(self.normalize(site))])
Example #25
0
    def check(self, instance):
        """Poll Win32 event-log records via WMI and forward new ones as events.

        The first run for an instance only primes its last-seen timestamp and
        returns; subsequent runs query records generated since that timestamp,
        applying the configured user/type/source/logfile/event-id/message
        filters, and emit each matching record with self.event().
        """
        # Connect to the WMI provider
        host = instance.get('host', "localhost")
        username = instance.get('username', "")
        password = instance.get('password', "")
        instance_tags = instance.get('tags', [])
        notify = instance.get('notify', [])

        user = instance.get('user')
        ltypes = instance.get('type', [])
        source_names = instance.get('source_name', [])
        log_files = instance.get('log_file', [])
        event_ids = instance.get('event_id', [])
        message_filters = instance.get('message_filters', [])

        instance_hash = hash_mutable(instance)
        instance_key = self._get_instance_key(host, self.NAMESPACE, self.CLASS,
                                              instance_hash)

        # Store the last timestamp by instance
        if instance_key not in self.last_ts:
            self.last_ts[instance_key] = datetime.utcnow()
            return

        query = {}
        filters = []
        last_ts = self.last_ts[instance_key]
        query['TimeGenerated'] = ('>=', self._dt_to_wmi(last_ts))
        if user:
            query['User'] = ('=', user)
        if ltypes:
            query['Type'] = []
            for ltype in ltypes:
                query['Type'].append(('=', ltype))
        if source_names:
            query['SourceName'] = []
            for source_name in source_names:
                query['SourceName'].append(('=', source_name))
        if log_files:
            query['LogFile'] = []
            for log_file in log_files:
                query['LogFile'].append(('=', log_file))
        if event_ids:
            query['EventCode'] = []
            for event_id in event_ids:
                query['EventCode'].append(('=', event_id))
        if message_filters:
            query['NOT Message'] = []
            query['Message'] = []
            for filt in message_filters:
                # a leading '-' excludes the pattern; startswith() is safe on
                # an empty filter string, where filt[0] raised IndexError
                if filt.startswith('-'):
                    query['NOT Message'].append(('LIKE', filt[1:]))
                else:
                    query['Message'].append(('LIKE', filt))

        filters.append(query)

        wmi_sampler = self._get_wmi_sampler(instance_key,
                                            self.CLASS,
                                            self.EVENT_PROPERTIES,
                                            filters=filters,
                                            host=host,
                                            namespace=self.NAMESPACE,
                                            username=username,
                                            password=password,
                                            and_props=['Message'])

        try:
            wmi_sampler.sample()
        except TimeoutException:
            self.log.warning(
                u"[Win32EventLog] WMI query timed out."
                u" class={wmi_class} - properties={wmi_properties} -"
                u" filters={filters} - tags={tags}".format(
                    wmi_class=self.CLASS,
                    wmi_properties=self.EVENT_PROPERTIES,
                    filters=filters,
                    tags=instance_tags))
        else:
            for ev in wmi_sampler:
                # for local events we dont need to specify a hostname
                hostname = None if (host == "localhost"
                                    or host == ".") else host
                log_ev = LogEvent(ev, hostname, instance_tags, notify,
                                  self.init_config.get('tag_event_id', False))

                # Since WQL only compares on the date and NOT the time, we have to
                # do a secondary check to make sure events are after the last
                # timestamp
                if log_ev.is_after(last_ts):
                    self.event(log_ev.to_event_dict())
                else:
                    self.log.debug('Skipping event after %s. ts=%s' %
                                   (last_ts, log_ev.timestamp))

            # Update the last time checked
            self.last_ts[instance_key] = datetime.utcnow()
Example #26
0
    def __init__(self, name, init_config, agentConfig, instances,
                 counter_list):
        """Set up PDH counter objects for every configured instance.

        counter_list entries are 5-tuples of
        (counterset, instance_name, counter_name, dd_metric_name, metric_type)
        registered for every instance; each instance may add more counters
        via its 'additional_metrics' option.

        Raises ValueError if an instance's 'tags' option is not a list;
        any other failure during counter creation is logged and re-raised.
        """
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
        self._countersettypes = {}
        self._counters = {}
        # per-instance-key list of [inst_name, dd_name, metric_func, WinPDHCounter]
        self._metrics = {}
        # per-instance-key list of configured tags
        self._tags = {}

        try:
            for instance in instances:
                key = hash_mutable(instance)

                cfg_tags = instance.get('tags')
                if cfg_tags is not None:
                    if not isinstance(cfg_tags, list):
                        self.log.error("Tags must be configured as a list")
                        raise ValueError("Tags must be type list, not %s" %
                                         str(type(cfg_tags)))
                    self._tags[key] = list(cfg_tags)

                remote_machine = None
                host = instance.get('host')
                self._metrics[key] = []
                if host is not None and host != ".":
                    try:
                        remote_machine = host

                        username = instance.get('username')
                        password = instance.get('password')
                        nr = win32wnet.NETRESOURCE()
                        nr.lpRemoteName = r"\\%s\c$" % remote_machine
                        nr.dwType = 0
                        nr.lpLocalName = None
                        win32wnet.WNetAddConnection2(nr, password, username, 0)

                    except Exception as e:
                        self.log.error("Failed to make remote connection %s" %
                                       str(e))
                        # BUGFIX: this was a bare `return`, which aborted
                        # __init__ entirely and silently left every remaining
                        # instance unconfigured. Skip only the broken one.
                        continue

                # list of the metrics.  Each entry is itself an entry,
                # which is the pdh name, datadog metric name, type, and the
                # pdh counter object

                for counterset, inst_name, counter_name, dd_name, mtype in counter_list:
                    m = getattr(self, mtype.lower())
                    obj = WinPDHCounter(counterset,
                                        counter_name,
                                        self.log,
                                        inst_name,
                                        machine_name=remote_machine)
                    entry = [inst_name, dd_name, m, obj]
                    self.log.debug("entry: %s" % str(entry))
                    self._metrics[key].append(entry)

                # get any additional metrics in the instance
                addl_metrics = instance.get('additional_metrics')
                if addl_metrics is not None:
                    for counterset, inst_name, counter_name, dd_name, mtype in addl_metrics:
                        # "none"/""/"*"/"all" all mean "every counter instance"
                        if not inst_name or inst_name == "*" or \
                                inst_name.lower() in ("none", "all"):
                            inst_name = None
                        m = getattr(self, mtype.lower())
                        obj = WinPDHCounter(counterset,
                                            counter_name,
                                            self.log,
                                            inst_name,
                                            machine_name=remote_machine)
                        entry = [inst_name, dd_name, m, obj]
                        self.log.debug("additional metric entry: %s" %
                                       str(entry))
                        self._metrics[key].append(entry)

        except Exception as e:
            self.log.debug("Exception in PDH init: %s", str(e))
            raise
Example #27
0
    def check(self, instance):
        """Run one collection cycle for a Consul instance.

        Emits:
          * ``consul.peers`` gauge, tagged ``mode:leader`` or ``mode:follower``
          * the ``CONSUL_CHECK`` service check (OK/CRITICAL) plus one
            ``HEALTH_CHECK`` service check per catalog health check
          * catalog gauges (nodes per service, services per node) when
            ``catalog_checks`` is enabled
          * network latency metrics when ``network_latency_checks`` is enabled

        Health/catalog collection only runs on the cluster leader so the
        metrics are reported exactly once per cluster.
        """
        # Instance state is mutable, any changes to it will be reflected in
        # self._instance_states
        instance_state = self._instance_states[hash_mutable(instance)]

        self._check_for_leader_change(instance, instance_state)

        peers = self.get_peers_in_cluster(instance)
        main_tags = []
        agent_dc = self._get_agent_datacenter(instance, instance_state)

        if agent_dc is not None:
            main_tags.append('consul_datacenter:{0}'.format(agent_dc))

        for tag in instance.get('tags', []):
            main_tags.append(tag)

        if not self._is_instance_leader(instance, instance_state):
            self.gauge("consul.peers", len(peers), tags=main_tags + ["mode:follower"])
            # NOTE: fixed missing space between the two concatenated sentences.
            self.log.debug("This consul agent is not the cluster leader. "
                           "Skipping service and catalog checks for this instance")
            return
        else:
            self.gauge("consul.peers", len(peers), tags=main_tags + ["mode:leader"])

        service_check_tags = ['consul_url:{0}'.format(instance.get('url'))]
        # Per-instance settings fall back to init_config when unset.
        perform_catalog_checks = instance.get('catalog_checks',
                                              self.init_config.get('catalog_checks'))
        perform_network_latency_checks = instance.get('network_latency_checks',
                                                      self.init_config.get('network_latency_checks'))

        try:
            # Make service checks from health checks for all services in catalog
            health_state = self.consul_request(instance, '/v1/health/state/any')

            sc = {}
            # Compute the highest status level (OK < WARNING < CRITICAL) of a
            # check among all the nodes it is running on.
            for check in health_state:
                sc_id = '{0}/{1}/{2}'.format(check['CheckID'], check.get('ServiceID', ''), check.get('ServiceName', ''))
                status = self.STATUS_SC.get(check['Status'])
                if status is None:
                    # Unrecognized Consul status string.
                    status = AgentCheck.UNKNOWN

                if sc_id not in sc:
                    tags = ["check:{0}".format(check["CheckID"])]
                    if check["ServiceName"]:
                        tags.append("service:{0}".format(check["ServiceName"]))
                    if check["ServiceID"]:
                        tags.append("consul_service_id:{0}".format(check["ServiceID"]))
                    sc[sc_id] = {'status': status, 'tags': tags}

                elif self.STATUS_SEVERITY[status] > self.STATUS_SEVERITY[sc[sc_id]['status']]:
                    # Keep only the most severe status seen for this check id.
                    sc[sc_id]['status'] = status

            for s in sc.values():
                self.service_check(self.HEALTH_CHECK, s['status'], tags=main_tags+s['tags'])

        except Exception as e:
            self.log.error(e)
            self.service_check(self.CONSUL_CHECK, AgentCheck.CRITICAL,
                               tags=service_check_tags)
        else:
            self.service_check(self.CONSUL_CHECK, AgentCheck.OK,
                               tags=service_check_tags)

        if perform_catalog_checks:
            # Collect node by service, and service by node counts for a whitelist of services

            services = self.get_services_in_cluster(instance)
            service_whitelist = instance.get('service_whitelist',
                                             self.init_config.get('service_whitelist', []))
            max_services = instance.get('max_services',
                                        self.init_config.get('max_services', self.MAX_SERVICES))

            services = self._cull_services_list(services, service_whitelist, max_services)

            # {node_id: {"up: 0, "passing": 0, "warning": 0, "critical": 0}
            nodes_to_service_status = defaultdict(lambda: defaultdict(int))

            for service in services:
                # For every service in the cluster,
                # Gauge the following:
                # `consul.catalog.nodes_up` : # of Nodes registered with that service
                # `consul.catalog.nodes_passing` : # of Nodes with service status `passing` from those registered
                # `consul.catalog.nodes_warning` : # of Nodes with service status `warning` from those registered
                # `consul.catalog.nodes_critical` : # of Nodes with service status `critical` from those registered

                service_tags = ['consul_service_id:{0}'.format(service)]

                nodes_with_service = self.get_nodes_with_service(instance, service)

                # {'up': 0, 'passing': 0, 'warning': 0, 'critical': 0}
                node_status = defaultdict(int)

                for node in nodes_with_service:
                    # The node_id is n['Node']['Node']
                    node_id = node.get('Node', {}).get("Node")

                    # An additional service is registered on this node. Bump up the counter
                    nodes_to_service_status[node_id]["up"] += 1

                    # If there is no Check for the node then Consul and dd-agent consider it up
                    if 'Checks' not in node:
                        node_status['passing'] += 1
                        node_status['up'] += 1
                    else:
                        found_critical = False
                        found_warning = False
                        found_serf_health = False

                        for check in node['Checks']:
                            if check['CheckID'] == 'serfHealth':
                                found_serf_health = True

                                # For backwards compatibility, the "up" node_status is computed
                                # based on the total # of nodes 'running' as part of the service.

                                # If the serfHealth is `critical` it means the Consul agent isn't even responding,
                                # and we don't register the node as `up`
                                if check['Status'] != 'critical':
                                    node_status["up"] += 1
                                    continue

                            if check['Status'] == 'critical':
                                found_critical = True
                                break
                            elif check['Status'] == 'warning':
                                found_warning = True
                                # Keep looping in case there is a critical status

                        # Increment the counters based on what was found in Checks
                        # `critical` checks override `warning`s, and if neither are found, register the node as `passing`
                        if found_critical:
                            node_status['critical'] += 1
                            nodes_to_service_status[node_id]["critical"] += 1
                        elif found_warning:
                            node_status['warning'] += 1
                            nodes_to_service_status[node_id]["warning"] += 1
                        else:
                            if not found_serf_health:
                                # We have not found a serfHealth check for this node, which is unexpected
                                # If we get here assume this node's status is "up", since we register it as 'passing'
                                node_status['up'] += 1

                            node_status['passing'] += 1
                            nodes_to_service_status[node_id]["passing"] += 1

                for status_key in self.STATUS_SC:
                    status_value = node_status[status_key]
                    self.gauge(
                        '{0}.nodes_{1}'.format(self.CONSUL_CATALOG_CHECK, status_key),
                        status_value,
                        tags=main_tags+service_tags
                    )

            for node, service_status in nodes_to_service_status.iteritems():
                # For every node discovered for whitelisted services, gauge the following:
                # `consul.catalog.services_up` : Total services registered on node
                # `consul.catalog.services_passing` : Total passing services on node
                # `consul.catalog.services_warning` : Total warning services on node
                # `consul.catalog.services_critical` : Total critical services on node

                node_tags = ['consul_node_id:{0}'.format(node)]
                # BUG FIX: this previously gauged len(services) — the
                # cluster-wide whitelisted-service count, identical for every
                # node — while the per-node "up" counter accumulated above was
                # never read. Report the per-node count, matching the
                # documented semantic of `consul.catalog.services_up`.
                self.gauge('{0}.services_up'.format(self.CONSUL_CATALOG_CHECK),
                           service_status['up'],
                           tags=main_tags+node_tags)

                for status_key in self.STATUS_SC:
                    status_value = service_status[status_key]
                    self.gauge(
                        '{0}.services_{1}'.format(self.CONSUL_CATALOG_CHECK, status_key),
                        status_value,
                        tags=main_tags+node_tags
                    )

        if perform_network_latency_checks:
            self.check_network_latency(instance, agent_dc, main_tags)