def test_discovery_devices_monitored_count(read_mock, aggregator):
    read_mock.return_value = '["192.168.0.1","192.168.0.2"]'
    host = socket.gethostbyname(common.HOST)
    network = ipaddress.ip_network(u'{}/29'.format(host), strict=False).with_prefixlen
    check_tags = [
        'autodiscovery_subnet:{}'.format(to_native_string(network)),
    ]
    network_tags = ['network:{}'.format(network)]
    instance = {
        'name': 'snmp_conf',
        # Make sure the check handles bytes
        'network_address': to_native_string(network),
        'port': common.PORT,
        'community_string': 'public',
        'retries': 0,
        'discovery_interval': 0,
    }
    init_config = {
        'profiles': {
            'profile1': {'definition': {'metrics': common.SUPPORTED_METRIC_TYPES, 'sysobjectid': '1.3.6.1.4.1.8072.*'}}
        }
    }

    check = SnmpCheck('snmp', init_config, [instance])
    check.check(instance)
    check._running = False

    aggregator.assert_metric('snmp.discovered_devices_count', tags=network_tags)
    for device_ip in ['192.168.0.1', '192.168.0.2']:
        tags = check_tags + ['snmp_device:{}'.format(device_ip)]
        aggregator.assert_metric('snmp.devices_monitored', metric_type=aggregator.GAUGE, value=1, count=1, tags=tags)

    common.assert_common_check_run_metrics(aggregator, network_tags)
    aggregator.assert_all_metrics_covered()

def test_discovery(aggregator):
    host = socket.gethostbyname(common.HOST)
    network = ipaddress.ip_network(u'{}/29'.format(host), strict=False).with_prefixlen
    check_tags = [
        'snmp_device:{}'.format(host),
        'snmp_profile:profile1',
        'autodiscovery_subnet:{}'.format(to_native_string(network)),
    ]
    instance = {
        'name': 'snmp_conf',
        # Make sure the check handles bytes
        'network_address': to_native_string(network),
        'port': common.PORT,
        'community_string': 'public',
        'retries': 0,
        'discovery_interval': 0,
    }
    init_config = {
        'profiles': {
            'profile1': {
                'definition': {
                    'metrics': common.SUPPORTED_METRIC_TYPES,
                    'sysobjectid': '1.3.6.1.4.1.8072.*',
                }
            }
        }
    }

    check = SnmpCheck('snmp', init_config, [instance])
    try:
        for _ in range(30):
            check.check(instance)
            if len(aggregator.metric_names) > 1:
                break
            time.sleep(1)
            aggregator.reset()
    finally:
        check._running = False
        del check  # This is what the Agent would do when unscheduling the check.

    for metric in common.SUPPORTED_METRIC_TYPES:
        metric_name = "snmp." + metric['name']
        aggregator.assert_metric(metric_name, tags=check_tags, count=1)

    aggregator.assert_metric('snmp.sysUpTimeInstance')
    aggregator.assert_metric('snmp.discovered_devices_count', tags=['network:{}'.format(network)])
    aggregator.assert_metric('snmp.devices_monitored', metric_type=aggregator.GAUGE, tags=check_tags)
    aggregator.assert_all_metrics_covered()

def get_version(db):
    with closing(db.cursor()) as cursor:
        cursor.execute('SELECT VERSION()')
        result = cursor.fetchone()

        # Version might include a build, a flavor, or both
        # e.g. 4.1.26-log, 4.1.26-MariaDB, 10.0.1-MariaDB-mariadb1precise-log
        # See http://dev.mysql.com/doc/refman/4.1/en/information-functions.html#function_version
        # https://mariadb.com/kb/en/library/version/
        # and https://mariadb.com/kb/en/library/server-system-variables/#version
        raw_version = to_native_string(result[0])
        parts = raw_version.split('-')
        version, flavor, build = [parts[0], '', '']
        for data in parts:
            if data == "MariaDB":
                flavor = "MariaDB"
            if data != "MariaDB" and flavor == '':
                flavor = "MySQL"
            if data in BUILDS:
                build = data
        if build == '':
            build = 'unspecified'
        return MySQLVersion(version, flavor, build)

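# For reference, a minimal sketch of how the parsing above behaves on the
# sample strings from the comment, assuming 'log' is among the values in the
# module-level BUILDS constant and MySQLVersion is a (version, flavor, build)
# tuple (both assumptions about the surrounding module):
#
#   '4.1.26-log'                         -> MySQLVersion('4.1.26', 'MySQL', 'log')
#   '4.1.26-MariaDB'                     -> MySQLVersion('4.1.26', 'MariaDB', 'unspecified')
#   '10.0.1-MariaDB-mariadb1precise-log' -> MySQLVersion('10.0.1', 'MariaDB', 'log')
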
def _create_event(self, status, tags=None):
    hostname = to_native_string(self.hostname)
    if status == "red":
        alert_type = "error"
        msg_title = "{} is {}".format(hostname, status)
    elif status == "yellow":
        alert_type = "warning"
        msg_title = "{} is {}".format(hostname, status)
    else:
        # then it should be green
        alert_type = "success"
        msg_title = "{} recovered as {}".format(hostname, status)

    msg = "ElasticSearch: {} just reported as {}".format(hostname, status)

    return {
        'timestamp': int(time.time()),
        'event_type': 'elasticsearch',
        'host': hostname,
        'msg_text': msg,
        'msg_title': msg_title,
        'alert_type': alert_type,
        'source_type_name': "elasticsearch",
        'event_object': hostname,
        'tags': tags,
    }

def get_pcf_channel_metrics(self, queue_manager):
    args = {pymqi.CMQCFC.MQCACH_CHANNEL_NAME: pymqi.ensure_bytes('*')}

    try:
        pcf = pymqi.PCFExecute(queue_manager)
        response = pcf.MQCMD_INQUIRE_CHANNEL(args)
    except pymqi.MQMIError as e:
        self.log.warning("Error getting CHANNEL stats %s", e)
    else:
        channels = len(response)
        mname = '{}.channel.channels'.format(metrics.METRIC_PREFIX)
        self.gauge(mname, channels, tags=self.config.tags_no_channel)
        for channel_info in response:
            channel_name = to_native_string(channel_info[pymqi.CMQCFC.MQCACH_CHANNEL_NAME]).strip()
            channel_tags = self.config.tags_no_channel + ["channel:{}".format(channel_name)]
            self._submit_metrics_from_properties(channel_info, metrics.channel_metrics(), channel_tags)

    # Check specific channels
    # If a channel is not discoverable, a user may want to check it specifically.
    # Specific channels are checked first to send channel metrics and `ibm_mq.channel` service checks
    # at the same time, but the end result is the same in any order.
    for channel in self.config.channels:
        self._submit_channel_status(queue_manager, channel, self.config.tags_no_channel)

    # Grab all the discoverable channels
    self._submit_channel_status(queue_manager, '*', self.config.tags_no_channel)

def __init__(self, check, config, connection_args):
    self.collection_interval = float(
        config.activity_config.get("collection_interval", MySQLActivity.DEFAULT_COLLECTION_INTERVAL)
    )
    if self.collection_interval <= 0:
        self.collection_interval = MySQLActivity.DEFAULT_COLLECTION_INTERVAL
    super(MySQLActivity, self).__init__(
        check,
        run_sync=is_affirmative(config.activity_config.get("run_sync", False)),
        enabled=is_affirmative(config.activity_config.get("enabled", True)),
        expected_db_exceptions=(pymysql.err.OperationalError, pymysql.err.InternalError),
        min_collection_interval=config.min_collection_interval,
        dbms="mysql",
        rate_limit=1 / float(self.collection_interval),
        job_name="query-activity",
        shutdown_callback=self._close_db_conn,
    )
    self._check = check
    self._config = config
    self._log = check.log
    self._connection_args = connection_args
    self._db = None
    self._db_version = None
    self._obfuscator_options = to_native_string(json.dumps(self._config.obfuscator_options))

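# A quick worked example of the rate limit computed above, assuming the base
# job runner interprets `rate_limit` as runs per second (which the 1/interval
# conversion suggests):
#   collection_interval = 10.0  ->  rate_limit = 1 / 10.0 = 0.1 runs/second,
#   i.e. the activity job runs at most once every 10 seconds.
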
def _collect_metadata(self, overview_response):
    version = to_native_string(overview_response['rabbitmq_version'])
    if version:
        # Rabbit versions follow semantic versioning https://www.rabbitmq.com/changelog.html
        self.set_metadata('version', version)
        self.log.debug("found rabbitmq version %s", version)
    else:
        self.log.warning("could not retrieve rabbitmq version information")

def test_external_host_tags(aggregator, realtime_instance):
    realtime_instance['collect_tags'] = True
    check = VSphereCheck('vsphere', {}, [realtime_instance])
    config = VSphereConfig(realtime_instance, MagicMock())
    check.api = MockedAPI(config)
    check.api_rest = VSphereRestAPI(config, MagicMock())
    with check.tags_cache.update():
        check.refresh_tags_cache()
    with check.infrastructure_cache.update():
        check.refresh_infrastructure_cache()

    fixture_file = os.path.join(HERE, 'fixtures', 'host_tags_values.json')
    with open(fixture_file, 'r') as f:
        expected_tags = json.load(f)

    check.set_external_tags = MagicMock()
    check.submit_external_host_tags()
    submitted_tags = check.set_external_tags.mock_calls[0].args[0]
    submitted_tags.sort(key=lambda x: x[0])
    for ex, sub in zip(expected_tags, submitted_tags):
        ex_host, sub_host = ex[0], sub[0]
        ex_tags, sub_tags = ex[1]['vsphere'], sub[1]['vsphere']
        ex_tags = [to_native_string(t) for t in ex_tags]  # json library loads data in unicode, let's convert back to native
        assert ex_host == sub_host
        assert ex_tags == sub_tags

    check.config.excluded_host_tags = ['vsphere_host']
    check.set_external_tags = MagicMock()
    check.submit_external_host_tags()
    submitted_tags = check.set_external_tags.mock_calls[0].args[0]
    submitted_tags.sort(key=lambda x: x[0])
    for ex, sub in zip(expected_tags, submitted_tags):
        ex_host, sub_host = ex[0], sub[0]
        ex_tags, sub_tags = ex[1]['vsphere'], sub[1]['vsphere']
        ex_tags = [to_native_string(t) for t in ex_tags if 'vsphere_host:' not in t]
        assert ex_host == sub_host
        assert ex_tags == sub_tags

    check.set_external_tags = MagicMock()
    check.submit_external_host_tags()

def dd_environment():
    with terraform_run(os.path.join(get_here(), 'terraform')) as outputs:
        kubeconfig = to_native_string(outputs['kubeconfig']['value'])
        with port_forward(kubeconfig, 'linkerd', 'linkerd-controller', 4191) as (ip, port):
            instance = {
                'prometheus_url': 'http://{}:{}/metrics'.format(ip, port),
                'metrics': [LINKERD_FIXTURE_METRICS],
                'type_overrides': LINKERD_FIXTURE_TYPES,
            }
            yield instance

def test_encoding(self, aggregator, msg_text):
    check = AgentCheck()
    event = {
        'event_type': 'new.event',
        'msg_title': 'new test event',
        'aggregation_key': 'test.event',
        'msg_text': msg_text,
        'tags': ['∆', u'Ω-bar'],
        'timestamp': 1,
    }
    check.event(event)
    aggregator.assert_event(to_native_string(msg_text), tags=['∆', 'Ω-bar'])

def _submit_channel_status(self, queue_manager, search_channel_name, tags, channels_to_skip=None):
    """Submit channel status

    Note: Error 3065 (MQRCCF_CHL_STATUS_NOT_FOUND) might indicate that the channel has not been used.
    More info: https://www.ibm.com/support/knowledgecenter/SSFKSJ_7.1.0/com.ibm.mq.doc/fm16690_.htm

    :param search_channel_name: might contain wildcard characters
    """
    channels_to_skip = channels_to_skip or []
    search_channel_tags = tags + ["channel:{}".format(search_channel_name)]
    try:
        args = {pymqi.CMQCFC.MQCACH_CHANNEL_NAME: pymqi.ensure_bytes(search_channel_name)}
        pcf = pymqi.PCFExecute(queue_manager)
        response = pcf.MQCMD_INQUIRE_CHANNEL_STATUS(args)
        self.service_check(self.CHANNEL_SERVICE_CHECK, AgentCheck.OK, search_channel_tags)
    except pymqi.MQMIError as e:
        self.service_check(self.CHANNEL_SERVICE_CHECK, AgentCheck.CRITICAL, search_channel_tags)
        if e.comp == pymqi.CMQC.MQCC_FAILED and e.reason == pymqi.CMQCFC.MQRCCF_CHL_STATUS_NOT_FOUND:
            self.log.debug("Channel status not found for channel %s: %s", search_channel_name, e)
        else:
            self.log.warning("Error getting CHANNEL status for channel %s: %s", search_channel_name, e)
    else:
        for channel_info in response:
            channel_name = to_native_string(channel_info[pymqi.CMQCFC.MQCACH_CHANNEL_NAME]).strip()
            if channel_name in channels_to_skip:
                continue
            channel_tags = tags + ["channel:{}".format(channel_name)]

            self._submit_metrics_from_properties(channel_info, metrics.channel_status_metrics(), channel_tags)

            channel_status = channel_info[pymqi.CMQCFC.MQIACH_CHANNEL_STATUS]
            self._submit_channel_count(channel_name, channel_status, channel_tags)
            self._submit_status_check(channel_name, channel_status, channel_tags)

def _get_version(self, queue_manager):
    pcf = pymqi.PCFExecute(queue_manager)
    resp = pcf.MQCMD_INQUIRE_Q_MGR({pymqi.CMQCFC.MQIACF_Q_MGR_ATTRS: [pymqi.CMQC.MQCA_VERSION]})
    try:
        version = to_native_string(resp[0][pymqi.CMQC.MQCA_VERSION])
        self.log.debug("IBM MQ version from response: %s", version)
    except Exception as e:
        self.log.debug("Error collecting IBM MQ version: %s", e)
        return None
    if version is None:
        return None
    return self._parse_version(version)

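# For context (an assumption about the attribute format, not taken from this
# module): MQCA_VERSION typically comes back as an eight-digit VVRRMMFF string,
# e.g. b'09020000' -> '09020000', which _parse_version (not shown here) would
# interpret as version 9.2.0.0.
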
def _normalize_tags_type(self, tags, device_name=None, metric_name=None):
    if self.disable_generic_tags:
        return super(SnowflakeCheck, self)._normalize_tags_type(tags, device_name, metric_name)

    # If disable_generic_tags is not enabled, for each generic tag we emit both the generic and the non-generic
    # version to ease transition.
    normalized_tags = []
    for tag in tags:
        if tag is not None:
            try:
                tag = to_native_string(tag)
            except UnicodeError:
                self.log.warning('Encoding error with tag `%s` for metric `%s`, ignoring tag', tag, metric_name)
                continue
            normalized_tags.extend(list({tag, self.degeneralise_tag(tag)}))
    return normalized_tags

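# A hedged illustration of the dual emission above, assuming the base
# AgentCheck.degeneralise_tag prefixes generic tag names with the check name
# (the tag values here are made up):
#   tags = ['cluster:snowflake-prod', 'warehouse:compute_wh']
#   -> ['cluster:snowflake-prod', 'snowflake_cluster:snowflake-prod',
#       'warehouse:compute_wh']   # 'warehouse' is not a generic tag name
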
def _get_version(self, queue_manager):
    pcf = pymqi.PCFExecute(
        queue_manager, response_wait_interval=self.config.timeout, convert=self.config.convert_endianness
    )
    resp = pcf.MQCMD_INQUIRE_Q_MGR({pymqi.CMQCFC.MQIACF_Q_MGR_ATTRS: [pymqi.CMQC.MQCA_VERSION]})
    pcf.disconnect()
    try:
        version = to_native_string(resp[0][pymqi.CMQC.MQCA_VERSION])
        self.log.debug("IBM MQ version from response: %s", version)
    except Exception as e:
        self.log.debug("Error collecting IBM MQ version: %s", e)
        return None
    if version is None:
        return None
    return self._parse_version(version)

def _discover_queues(self, queue_manager, mq_pattern_filter):
    queues = []

    for queue_type in SUPPORTED_QUEUE_TYPES:
        args = {pymqi.CMQC.MQCA_Q_NAME: pymqi.ensure_bytes(mq_pattern_filter), pymqi.CMQC.MQIA_Q_TYPE: queue_type}
        try:
            pcf = pymqi.PCFExecute(queue_manager)
            response = pcf.MQCMD_INQUIRE_Q(args)
        except pymqi.MQMIError as e:
            self.warning("Error discovering queue: %s", e)
        else:
            for queue_info in response:
                queue = queue_info[pymqi.CMQC.MQCA_Q_NAME]
                queues.append(to_native_string(queue).strip())

    return queues

def _process_service_check(
    self, data, url, tag_by_host=False, services_incl_filter=None, services_excl_filter=None, custom_tags=None
):
    '''
    Report a service check, tagged by the service and the backend.
    Statuses are defined in `STATUS_TO_SERVICE_CHECK` mapping.
    '''
    custom_tags = [] if custom_tags is None else custom_tags
    service_name = data['pxname']
    status = data['status']
    haproxy_hostname = to_native_string(self.hostname)
    check_hostname = haproxy_hostname if tag_by_host else ''

    if self._is_service_excl_filtered(service_name, services_incl_filter, services_excl_filter):
        return

    if status in Services.STATUS_TO_SERVICE_CHECK:
        service_check_tags = ["haproxy_service:%s" % service_name]
        service_check_tags.extend(custom_tags)
        self._handle_legacy_service_tag(service_check_tags, service_name)

        hostname = data['svname']
        if data['back_or_front'] == Services.BACKEND:
            service_check_tags.append('backend:%s' % hostname)

        status = Services.STATUS_TO_SERVICE_CHECK[status]
        message = "%s reported %s:%s %s" % (haproxy_hostname, service_name, hostname, status)
        self.service_check(
            self.SERVICE_CHECK_NAME, status, message=message, hostname=check_hostname, tags=service_check_tags
        )

def get_parent_tags_recursively(mor, infrastructure_data):
    """Go up the resources hierarchy from the given mor. Note that a host running a VM is not considered to be a
    parent of that VM.

    rootFolder(vim.Folder):
      - vm(vim.Folder):
          VM1-1
          VM1-2
      - host(vim.Folder):
          HOST1
          HOST2
    """
    mor_props = infrastructure_data.get(mor)
    parent = mor_props.get('parent')
    if parent:
        tags = []
        parent_props = infrastructure_data.get(parent, {})
        parent_name = to_native_string(parent_props.get('name', 'unknown'))
        if isinstance(parent, vim.HostSystem):
            tags.append('vsphere_host:{}'.format(parent_name))
        elif isinstance(parent, vim.Folder):
            tags.append('vsphere_folder:{}'.format(parent_name))
        elif isinstance(parent, vim.ComputeResource):
            if isinstance(parent, vim.ClusterComputeResource):
                tags.append('vsphere_cluster:{}'.format(parent_name))
            tags.append('vsphere_compute:{}'.format(parent_name))
        elif isinstance(parent, vim.Datacenter):
            tags.append('vsphere_datacenter:{}'.format(parent_name))
        elif isinstance(parent, vim.Datastore):
            tags.append('vsphere_datastore:{}'.format(parent_name))

        parent_tags = get_parent_tags_recursively(parent, infrastructure_data)
        parent_tags.extend(tags)
        return parent_tags
    return []

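# A small sketch of the recursion with a hypothetical inventory (names are
# made up): VM 'VM1-1' lives in folder 'vm', which lives in datacenter 'DC0'
# whose properties carry no 'parent' entry. Tags come back root-first:
#   get_parent_tags_recursively(vm_mor, infrastructure_data)
#   -> ['vsphere_datacenter:DC0', 'vsphere_folder:vm']
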
def refresh_infrastructure_cache(self):
    """Fetch the complete infrastructure, generate tags for each monitored resources and store all of that
    into the infrastructure_cache. It also computes the resource `hostname` property to be used when
    submitting metrics for this mor."""

    self.log.debug("Refreshing the infrastructure cache...")
    t0 = Timer()
    infrastructure_data = self.api.get_infrastructure()
    self.gauge(
        "datadog.vsphere.refresh_infrastructure_cache.time",
        t0.total(),
        tags=self.config.base_tags,
        raw=True,
        hostname=self._hostname,
    )
    self.log.debug("Infrastructure cache refreshed in %.3f seconds.", t0.total())

    for mor, properties in iteritems(infrastructure_data):
        if not isinstance(mor, tuple(self.config.collected_resource_types)):
            # Do nothing for the resource types we do not collect
            continue
        if is_resource_excluded_by_filters(mor, infrastructure_data, self.config.resource_filters):
            # The resource does not match the specified patterns
            continue

        mor_name = to_native_string(properties.get("name", "unknown"))
        mor_type_str = MOR_TYPE_AS_STRING[type(mor)]
        hostname = None
        tags = []

        if isinstance(mor, vim.VirtualMachine):
            power_state = properties.get("runtime.powerState")
            if power_state != vim.VirtualMachinePowerState.poweredOn:
                # Skipping because the VM is not powered on
                # TODO: Sometimes VM are "poweredOn" but "disconnected" and thus have no metrics
                self.log.debug("Skipping VM %s in state %s", mor_name, to_native_string(power_state))
                continue

            # Hosts are not considered as parents of the VMs they run, we use the `runtime.host` property
            # to get the name of the ESXi host
            runtime_host = properties.get("runtime.host")
            runtime_host_props = infrastructure_data.get(runtime_host, {})
            runtime_hostname = to_native_string(runtime_host_props.get("name", "unknown"))
            tags.append('vsphere_host:{}'.format(runtime_hostname))

            if self.config.use_guest_hostname:
                hostname = properties.get("guest.hostName", mor_name)
            else:
                hostname = mor_name
        elif isinstance(mor, vim.HostSystem):
            hostname = mor_name
        else:
            tags.append('vsphere_{}:{}'.format(mor_type_str, mor_name))

        tags.extend(get_parent_tags_recursively(mor, infrastructure_data))
        tags.append('vsphere_type:{}'.format(mor_type_str))
        mor_payload = {"tags": tags}

        if hostname:
            mor_payload['hostname'] = hostname

        self.infrastructure_cache.set_mor_data(mor, mor_payload)

def _collect_metrics_async(self, instance, query_specs):
    """Task that collects the metrics listed in the morlist for one MOR"""
    # ## <TEST-INSTRUMENTATION>
    t = Timer()
    # ## </TEST-INSTRUMENTATION>
    i_key = self._instance_key(instance)
    server_instance = self._get_server_instance(instance)
    perfManager = server_instance.content.perfManager
    results = perfManager.QueryPerf(query_specs)
    if results:
        for mor_perfs in results:
            mor_name = str(mor_perfs.entity)
            try:
                mor = self.mor_cache.get_mor(i_key, mor_name)
            except MorNotFoundError:
                self.log.error(
                    "Trying to get metrics from object %s deleted from the cache, skipping. "
                    "Consider increasing the parameter `clean_morlist_interval` to avoid that",
                    mor_name,
                )
                continue

            for result in mor_perfs.value:
                counter_id = result.id.counterId
                if not self.metadata_cache.contains(i_key, counter_id):
                    self.log.debug(
                        "Skipping value for counter %s, because there is no metadata about it",
                        ensure_unicode(counter_id),
                    )
                    continue

                # Metric types are absolute, delta, and rate
                metric_name = self.metadata_cache.get_metadata(i_key, result.id.counterId).get('name')

                if self.in_compatibility_mode(instance):
                    if metric_name not in ALL_METRICS:
                        self.log.debug("Skipping unknown `%s` metric.", ensure_unicode(metric_name))
                        continue

                if not result.value:
                    self.log.debug("Skipping `%s` metric because the value is empty", ensure_unicode(metric_name))
                    continue

                instance_name = result.id.instance or "none"

                # Get the most recent value that isn't negative
                valid_values = [v for v in result.value if v >= 0]
                if not valid_values:
                    continue
                value = self._transform_value(instance, result.id.counterId, valid_values[-1])

                hostname = mor['hostname']

                tags = ['instance:{}'.format(ensure_unicode(instance_name))]
                if not hostname:  # no host tags available
                    tags.extend(mor['tags'])
                else:
                    hostname = to_native_string(hostname)

                if self.excluded_host_tags:
                    tags.extend(mor["excluded_host_tags"])

                tags.extend(instance.get('tags', []))

                # vsphere "rates" should be submitted as gauges (rate is
                # precomputed).
                self.gauge("vsphere.{}".format(ensure_unicode(metric_name)), value, hostname=hostname, tags=tags)

    # ## <TEST-INSTRUMENTATION>
    custom_tags = instance.get('tags', []) + ['instance:{}'.format(i_key)]
    self.histogram('datadog.agent.vsphere.metric_colection.time', t.total(), tags=custom_tags)

def submit_metrics_callback(self, query_results):
    """
    Callback of the collection of metrics. This is run in the main thread!

    `query_results` currently contain results of one resource type in practice, but this function is generic
    and can handle results with mixed resource types.
    """

    # `have_instance_value` is used later to avoid collecting aggregated metrics
    # when instance metrics are collected.
    have_instance_value = defaultdict(set)
    for results_per_mor in query_results:
        resource_type = type(results_per_mor.entity)
        metadata = self.metrics_metadata_cache.get_metadata(resource_type)
        for result in results_per_mor.value:
            metric_name = metadata.get(result.id.counterId)
            if result.id.instance:
                have_instance_value[resource_type].add(metric_name)

    for results_per_mor in query_results:
        mor_props = self.infrastructure_cache.get_mor_props(results_per_mor.entity)
        if mor_props is None:
            self.log.debug(
                "Skipping results for mor %s because the integration is not yet aware of it. If this is a problem"
                " you can increase the value of 'refresh_infrastructure_cache_interval'.",
                results_per_mor.entity,
            )
            continue
        resource_type = type(results_per_mor.entity)
        metadata = self.metrics_metadata_cache.get_metadata(resource_type)
        for result in results_per_mor.value:
            metric_name = metadata.get(result.id.counterId)
            if not metric_name:
                # Fail-safe
                self.log.debug(
                    "Skipping value for counter %s, because the integration doesn't have metadata about it. If this"
                    " is a problem you can increase the value of 'refresh_metrics_metadata_cache_interval'",
                    result.id.counterId,
                )
                continue

            if not result.value:
                self.log.debug("Skipping metric %s because the value is empty", to_native_string(metric_name))
                continue

            # Get the most recent value that isn't negative
            valid_values = [v for v in result.value if v >= 0]
            if not valid_values:
                self.log.debug(
                    "Skipping metric %s because the value returned by vCenter"
                    " is negative (i.e. the metric is not yet available).",
                    to_native_string(metric_name),
                )
                continue

            tags = []
            if should_collect_per_instance_values(self.config, metric_name, resource_type) and (
                metric_name in have_instance_value[resource_type]
            ):
                instance_value = result.id.instance
                # When collecting per instance values, it's possible that both aggregated metric and per instance
                # metrics are received. In that case, the metric with no instance value is skipped.
                if not instance_value:
                    continue
                instance_tag_key = get_mapped_instance_tag(metric_name)
                tags.append('{}:{}'.format(instance_tag_key, instance_value))

            vsphere_tags = self.tags_cache.get_mor_tags(results_per_mor.entity)
            mor_tags = mor_props['tags'] + vsphere_tags

            if resource_type in HISTORICAL_RESOURCES:
                # Tags are attached to the metrics
                tags.extend(mor_tags)
                hostname = None
            else:
                # Tags are (mostly) submitted as external host tags.
                hostname = to_native_string(mor_props.get('hostname'))
                if self.config.excluded_host_tags:
                    tags.extend([t for t in mor_tags if t.split(":", 1)[0] in self.config.excluded_host_tags])

            tags.extend(self.config.base_tags)

            value = valid_values[-1]
            if metric_name in PERCENT_METRICS:
                # Convert the percentage to a float.
                value /= 100.0

            # vSphere "rates" should be submitted as gauges (rate is precomputed).
            self.gauge(to_native_string(metric_name), value, hostname=hostname, tags=tags)

def format_metric_name(counter):
    return "{}.{}.{}".format(
        to_native_string(counter.groupInfo.key),
        to_native_string(counter.nameInfo.key),
        SHORT_ROLLUP[str(counter.rollupType)],
    )
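
# A brief hedged example of the names this produces, assuming SHORT_ROLLUP
# abbreviates the pyVmomi rollup types (e.g. 'average' -> 'avg',
# 'maximum' -> 'max'), which is an assumption about the surrounding module:
#   a counter with groupInfo.key='cpu', nameInfo.key='usage',
#   rollupType='average' would be formatted as 'cpu.usage.avg'.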