class VSphereCheck(AgentCheck):
    """ Get performance metrics from a vCenter server and upload them to Datadog
    References: http://pubs.vmware.com/vsphere-51/index.jsp#com.vmware.wssdk.apiref.doc/vim.PerformanceManager.html

    *_atomic jobs perform one single task asynchronously in the ThreadPool, we
    don't know exactly when they will finish, but we reap them if they're stuck.
    The other calls are performed synchronously.
    """

    SERVICE_CHECK_NAME = 'vcenter.can_connect'

    def __init__(self, name, init_config, agentConfig, instances):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
        self.time_started = time.time()
        self.pool_started = False
        # Worker threads push unhandled exceptions here; drained in check()
        self.exceptionq = Queue()

        # Connections open to vCenter instances
        self.server_instances = {}

        # Event configuration
        self.event_config = {}

        # Caching resources, timeouts
        self.cache_times = {}
        for instance in self.instances:
            i_key = self._instance_key(instance)
            self.cache_times[i_key] = {
                MORLIST: {
                    LAST: 0,
                    INTERVAL: init_config.get('refresh_morlist_interval',
                                              REFRESH_MORLIST_INTERVAL)
                },
                METRICS_METADATA: {
                    LAST: 0,
                    INTERVAL: init_config.get('refresh_metrics_metadata_interval',
                                              REFRESH_METRICS_METADATA_INTERVAL)
                }
            }
            self.event_config[i_key] = instance.get('event_config')

        # First layer of cache (get entities from the tree)
        self.morlist_raw = {}
        # Second layer, processed from the first one
        self.morlist = {}
        # Metrics metadata, basically perfCounterId -> {name, group, description}
        self.metrics_metadata = {}
        # Per-instance timestamp of the last event fetched (see _query_event)
        self.latest_event_query = {}

    def stop(self):
        """Shut the check down by tearing down the worker thread pool."""
        self.stop_pool()

    def start_pool(self):
        """Create the worker pool ('threads_count' threads) and reset job bookkeeping."""
        self.log.info("Starting Thread Pool")
        self.pool_size = int(
            self.init_config.get('threads_count', DEFAULT_SIZE_POOL))
        self.pool = Pool(self.pool_size)
        self.pool_started = True
        self.jobs_status = {}

    def stop_pool(self):
        """Terminate and join the worker pool if it was started; no-op otherwise."""
        self.log.info("Stopping Thread Pool")
        if self.pool_started:
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            # After join() no worker thread should remain alive
            assert self.pool.get_nworkers() == 0
        self.pool_started = False

    def restart_pool(self):
        """Tear down and recreate the worker pool (used to reap stuck jobs)."""
        self.stop_pool()
        self.start_pool()

    def _clean(self):
        """Restart the pool if any recorded job has been running longer than JOB_TIMEOUT."""
        now = time.time()
        # TODO: use that
        for name in self.jobs_status.keys():
            start_time = self.jobs_status[name]
            if now - start_time > JOB_TIMEOUT:
                self.log.critical("Restarting Pool. One check is stuck.")
                self.restart_pool()
                break

    def _query_event(self, instance):
        """Fetch vCenter events newer than the last query and emit them as Datadog events."""
        i_key = self._instance_key(instance)
        last_time = self.latest_event_query.get(i_key)

        server_instance = self._get_server_instance(instance)
        event_manager = server_instance.content.eventManager

        # Be sure we don't duplicate any event, never query the "past"
        if not last_time:
            last_time = self.latest_event_query[i_key] = \
                event_manager.latestEvent.createdTime + timedelta(seconds=1)

        query_filter = vim.event.EventFilterSpec()
        time_filter = vim.event.EventFilterSpec.ByTime(
            beginTime=self.latest_event_query[i_key])
        query_filter.time = time_filter

        try:
            new_events = event_manager.QueryEvents(query_filter)
            self.log.debug("Got {0} events from vCenter event manager".format(
                len(new_events)))
            for event in new_events:
                normalized_event = VSphereEvent(event, self.event_config[i_key])
                # Can return None if the event if filtered out
                event_payload = normalized_event.get_datadog_payload()
                if event_payload is not None:
                    self.event(event_payload)
                # Advance the watermark past the newest event we processed
                last_time = event.createdTime + timedelta(seconds=1)
        except Exception as e:
            # Don't get stuck on a failure to fetch an event
            # Ignore them for next pass
            self.log.warning("Unable to fetch Events %s", e)
            last_time = event_manager.latestEvent.createdTime + timedelta(
                seconds=1)

        self.latest_event_query[i_key] = last_time

    def _instance_key(self, instance):
        """Return the unique per-instance cache key (the mandatory 'name' config field)."""
        i_key = instance.get('name')
        if i_key is None:
            raise Exception("Must define a unique 'name' per vCenter instance")
        return i_key

    def _should_cache(self, instance, entity):
        """True when the cache for `entity` (MORLIST or METRICS_METADATA) has expired."""
        i_key = self._instance_key(instance)
        now = time.time()
        return now - self.cache_times[i_key][entity][LAST] > self.cache_times[
            i_key][entity][INTERVAL]

    def _get_server_instance(self, instance):
        """Return a (cached) live connection to this instance's vCenter.

        Emits the vcenter.can_connect service check (CRITICAL + raise on
        connect/keep-alive failure, OK otherwise).
        """
        i_key = self._instance_key(instance)

        service_check_tags = [
            'vcenter_server:{0}'.format(instance.get('name')),
            'vcenter_host:{0}'.format(instance.get('host')),
        ]

        # Check for ssl configs and generate an appropriate ssl context object
        ssl_verify = instance.get('ssl_verify', True)
        ssl_capath = instance.get('ssl_capath', None)
        if not ssl_verify:
            context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
            context.verify_mode = ssl.CERT_NONE
        elif ssl_capath:
            context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
            context.verify_mode = ssl.CERT_REQUIRED
            context.load_verify_locations(capath=ssl_capath)

        # If both configs are used, log a message explaining the default
        if not ssl_verify and ssl_capath:
            self.log.debug("Your configuration is incorrectly attempting to "
                           "specify both a CA path, and to disable SSL "
                           "verification. You cannot do both. Proceeding with "
                           "disabling ssl verification.")

        if i_key not in self.server_instances:
            try:
                # Object returned by SmartConnect is a ServerInstance
                # https://www.vmware.com/support/developer/vc-sdk/visdk2xpubs/ReferenceGuide/vim.ServiceInstance.html
                server_instance = connect.SmartConnect(
                    host=instance.get('host'),
                    user=instance.get('username'),
                    pwd=instance.get('password'),
                    sslContext=context if not ssl_verify or ssl_capath else None)
            except Exception as e:
                err_msg = "Connection to %s failed: %s" % (
                    instance.get('host'), e)
                self.service_check(self.SERVICE_CHECK_NAME,
                                   AgentCheck.CRITICAL,
                                   tags=service_check_tags,
                                   message=err_msg)
                raise Exception(err_msg)

            self.server_instances[i_key] = server_instance

        # Test if the connection is working
        try:
            self.server_instances[i_key].RetrieveContent()
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.OK,
                               tags=service_check_tags)
        except Exception as e:
            err_msg = "Connection to %s died unexpectedly: %s" % (
                instance.get('host'), e)
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.CRITICAL,
                               tags=service_check_tags,
                               message=err_msg)
            raise Exception(err_msg)

        return self.server_instances[i_key]

    def _compute_needed_metrics(self, instance, available_metrics):
        """ Compare the available metrics for one MOR we have computed and
        intersect them with the set of metrics we want to report
        """
        if instance.get('all_metrics', False):
            return available_metrics

        i_key = self._instance_key(instance)
        wanted_metrics = []
        # Get only the basic metrics
        for metric in available_metrics:
            # No cache yet, skip it for now
            if (i_key not in self.metrics_metadata
                    or metric.counterId not in self.metrics_metadata[i_key]):
                continue
            if self.metrics_metadata[i_key][
                    metric.counterId]['name'] in BASIC_METRICS:
                wanted_metrics.append(metric)

        return wanted_metrics

    def get_external_host_tags(self):
        """ Returns a list of tags for every host that is detected by the vSphere
        integration.
        List of pairs (hostname, list_of_tags)
        """
        self.log.info("Sending external_host_tags now")
        external_host_tags = []
        for instance in self.instances:
            i_key = self._instance_key(instance)
            mor_list = self.morlist[i_key].items()
            for mor_name, mor in mor_list:
                external_host_tags.append((mor['hostname'], {
                    SOURCE_TYPE: mor['tags']
                }))

        return external_host_tags

    @atomic_method
    def _cache_morlist_raw_atomic(self, i_key, obj_type, obj, tags,
                                  regexes=None, include_only_marked=False):
        """ Compute tags for a single node in the vCenter rootFolder
        and queue other such jobs for children nodes.
        Usual hierarchy:
        rootFolder
        - datacenter1
          - compute_resource1 == cluster
            - host1
            - host2
            - host3
          - compute_resource2
            - host5
              - vm1
              - vm2
        If it's a node we want to query metric for, queue it in self.morlist_raw
        that will be processed by another job.
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        self.log.debug("job_atomic: Exploring MOR {0} (type={1})".format(
            obj, obj_type))
        ### </TEST-INSTRUMENTATION>

        # Copy so sibling jobs don't see tags appended for this subtree
        tags_copy = deepcopy(tags)

        if obj_type == 'rootFolder':
            for datacenter in obj.childEntity:
                # Skip non-datacenter
                if not hasattr(datacenter, 'hostFolder'):
                    continue
                self.pool.apply_async(self._cache_morlist_raw_atomic,
                                      args=(i_key, 'datacenter', datacenter,
                                            tags_copy, regexes,
                                            include_only_marked))

        elif obj_type == 'datacenter':
            dc_tag = "vsphere_datacenter:%s" % obj.name
            tags_copy.append(dc_tag)
            for compute_resource in obj.hostFolder.childEntity:
                # Skip non-compute resource
                if not hasattr(compute_resource, 'host'):
                    continue
                self.pool.apply_async(self._cache_morlist_raw_atomic,
                                      args=(i_key, 'compute_resource',
                                            compute_resource, tags_copy,
                                            regexes, include_only_marked))

        elif obj_type == 'compute_resource':
            if obj.__class__ == vim.ClusterComputeResource:
                cluster_tag = "vsphere_cluster:%s" % obj.name
                tags_copy.append(cluster_tag)
            for host in obj.host:
                # Skip non-host
                if not hasattr(host, 'vm'):
                    continue
                self.pool.apply_async(self._cache_morlist_raw_atomic,
                                      args=(i_key, 'host', host, tags_copy,
                                            regexes, include_only_marked))

        elif obj_type == 'host':
            if regexes and regexes.get('host_include') is not None:
                match = re.search(regexes['host_include'], obj.name)
                if not match:
                    self.log.debug(
                        u"Filtered out VM {0} because of host_include_only_regex"
                        .format(obj.name))
                    return
            watched_mor = dict(mor_type='host',
                               mor=obj,
                               hostname=obj.name,
                               tags=tags_copy + ['vsphere_type:host'])
            self.morlist_raw[i_key].append(watched_mor)

            host_tag = "vsphere_host:%s" % obj.name
            tags_copy.append(host_tag)
            for vm in obj.vm:
                # Only collect from running VMs
                if vm.runtime.powerState != 'poweredOn':
                    continue
                self.pool.apply_async(self._cache_morlist_raw_atomic,
                                      args=(i_key, 'vm', vm, tags_copy,
                                            regexes, include_only_marked))

        elif obj_type == 'vm':
            if regexes and regexes.get('vm_include') is not None:
                match = re.search(regexes['vm_include'], obj.name)
                if not match:
                    self.log.debug(
                        u"Filtered out VM {0} because of vm_include_only_regex"
                        .format(obj.name))
                    return
            # Also, if include_only_marked is true, then check if there exists a
            # custom field with the value DatadogMonitored
            if include_only_marked:
                monitored = False
                for field in obj.customValue:
                    if field.value == VM_MONITORING_FLAG:
                        monitored = True
                        break  # we shall monitor
                if not monitored:
                    self.log.debug(
                        u"Filtered out VM {0} because of include_only_marked".
                        format(obj.name))
                    return
            watched_mor = dict(mor_type='vm',
                               mor=obj,
                               hostname=obj.name,
                               tags=tags_copy + ['vsphere_type:vm'])
            self.morlist_raw[i_key].append(watched_mor)

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.morlist_raw_atomic.time',
                       t.total())
        ### </TEST-INSTRUMENTATION>

    def _cache_morlist_raw(self, instance):
        """ Initiate the first layer to refresh self.morlist by queueing
        _cache_morlist_raw_atomic on the rootFolder in a recursive/asncy approach
        """
        i_key = self._instance_key(instance)
        self.log.debug("Caching the morlist for vcenter instance %s" % i_key)
        # Don't restart a crawl while the previous batch is still draining
        if i_key in self.morlist_raw and len(self.morlist_raw[i_key]) > 0:
            self.log.debug(
                "Skipping morlist collection now, RAW results "
                "processing not over (latest refresh was {0}s ago)".format(
                    time.time() - self.cache_times[i_key][MORLIST][LAST]))
            return
        self.morlist_raw[i_key] = []

        server_instance = self._get_server_instance(instance)
        root_folder = server_instance.content.rootFolder

        instance_tag = "vcenter_server:%s" % instance.get('name')
        regexes = {
            'host_include': instance.get('host_include_only_regex'),
            'vm_include': instance.get('vm_include_only_regex')
        }
        include_only_marked = _is_affirmative(
            instance.get('include_only_marked', False))
        self.pool.apply_async(self._cache_morlist_raw_atomic,
                              args=(i_key, 'rootFolder', root_folder,
                                    [instance_tag], regexes,
                                    include_only_marked))
        self.cache_times[i_key][MORLIST][LAST] = time.time()

    @atomic_method
    def _cache_morlist_process_atomic(self, instance, mor):
        """ Process one item of the self.morlist_raw list by querying the available
        metrics for this MOR and then putting it in self.morlist
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        ### </TEST-INSTRUMENTATION>
        i_key = self._instance_key(instance)
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        self.log.debug("job_atomic: Querying available metrics"
                       " for MOR {0} (type={1})".format(
                           mor['mor'], mor['mor_type']))

        available_metrics = perfManager.QueryAvailablePerfMetric(
            mor['mor'], intervalId=REAL_TIME_INTERVAL)

        mor['metrics'] = self._compute_needed_metrics(instance,
                                                      available_metrics)

        mor_name = str(mor['mor'])
        if mor_name in self.morlist[i_key]:
            # Was already here last iteration
            self.morlist[i_key][mor_name]['metrics'] = mor['metrics']
        else:
            self.morlist[i_key][mor_name] = mor

        self.morlist[i_key][mor_name]['last_seen'] = time.time()

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.morlist_process_atomic.time',
                       t.total())
        ### </TEST-INSTRUMENTATION>

    def _cache_morlist_process(self, instance):
        """ Empties the self.morlist_raw by popping items and running asynchronously
        the _cache_morlist_process_atomic operation that will get the available
        metrics for this MOR and put it in self.morlist
        """
        i_key = self._instance_key(instance)
        if i_key not in self.morlist:
            self.morlist[i_key] = {}

        batch_size = self.init_config.get('batch_morlist_size',
                                          BATCH_MORLIST_SIZE)

        for i in xrange(batch_size):
            try:
                mor = self.morlist_raw[i_key].pop()
                self.pool.apply_async(self._cache_morlist_process_atomic,
                                      args=(instance, mor))
            except (IndexError, KeyError):
                self.log.debug("No more work to process in morlist_raw")
                return

    def _vacuum_morlist(self, instance):
        """ Check if self.morlist doesn't have some old MORs that are gone, ie
        we cannot get any metrics from them anyway (or =0)
        """
        i_key = self._instance_key(instance)
        morlist = self.morlist[i_key].items()

        for mor_name, mor in morlist:
            last_seen = mor['last_seen']
            if (time.time() - last_seen) > 2 * REFRESH_MORLIST_INTERVAL:
                del self.morlist[i_key][mor_name]

    def _cache_metrics_metadata(self, instance):
        """ Get from the server instance, all the performance counters metadata
        meaning name/group/description... attached with the corresponding ID
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        ### </TEST-INSTRUMENTATION>

        i_key = self._instance_key(instance)
        self.log.info(
            "Warming metrics metadata cache for instance {0}".format(i_key))
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        new_metadata = {}
        for counter in perfManager.perfCounter:
            d = dict(
                name="%s.%s" % (counter.groupInfo.key, counter.nameInfo.key),
                unit=counter.unitInfo.key,
                instance_tag='instance'  # FIXME: replace by what we want to tag!
            )
            new_metadata[counter.key] = d
        self.cache_times[i_key][METRICS_METADATA][LAST] = time.time()

        self.log.info(
            "Finished metadata collection for instance {0}".format(i_key))
        # Reset metadata
        self.metrics_metadata[i_key] = new_metadata

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.metric_metadata_collection.time',
                       t.total())
        ### </TEST-INSTRUMENTATION>

    def _transform_value(self, instance, counter_id, value):
        """ Given the counter_id, look up for the metrics metadata to check the vsphere
        type of the counter and apply pre-reporting transformation if needed.
        """
        i_key = self._instance_key(instance)
        if counter_id in self.metrics_metadata[i_key]:
            unit = self.metrics_metadata[i_key][counter_id]['unit']
            if unit == 'percent':
                return float(value) / 100

        # Defaults to return the value without transformation
        return value

    @atomic_method
    def _collect_metrics_atomic(self, instance, mor):
        """ Task that collects the metrics listed in the morlist for one MOR
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        ### </TEST-INSTRUMENTATION>
        i_key = self._instance_key(instance)
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        query = vim.PerformanceManager.QuerySpec(maxSample=1,
                                                 entity=mor['mor'],
                                                 metricId=mor['metrics'],
                                                 intervalId=20,
                                                 format='normal')
        results = perfManager.QueryPerf(querySpec=[query])
        if results:
            for result in results[0].value:
                if result.id.counterId not in self.metrics_metadata[i_key]:
                    self.log.debug(
                        "Skipping this metric value, because there is no metadata about it"
                    )
                    continue
                instance_name = result.id.instance or "none"
                value = self._transform_value(instance, result.id.counterId,
                                              result.value[0])

                # Metric types are absolute, delta, and rate
                if ALL_METRICS[self.metrics_metadata[i_key][
                        result.id.counterId]['name']]['s_type'] == 'rate':
                    record_metric = self.rate
                else:
                    record_metric = self.gauge
                record_metric(
                    "vsphere.%s" %
                    self.metrics_metadata[i_key][result.id.counterId]['name'],
                    value,
                    hostname=mor['hostname'],
                    tags=['instance:%s' % instance_name])

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.metric_colection.time',
                       t.total())
        ### </TEST-INSTRUMENTATION>

    def collect_metrics(self, instance):
        """ Calls asynchronously _collect_metrics_atomic on all MORs, as the
        job queue is processed the Aggregator will receive the metrics.
        """
        i_key = self._instance_key(instance)
        if i_key not in self.morlist:
            self.log.debug(
                "Not collecting metrics for this instance, nothing to do yet: {0}"
                .format(i_key))
            return

        mors = self.morlist[i_key].items()
        self.log.debug("Collecting metrics of %d mors" % len(mors))

        vm_count = 0

        for mor_name, mor in mors:
            if mor['mor_type'] == 'vm':
                vm_count += 1
            if 'metrics' not in mor:
                # self.log.debug("Skipping entity %s collection because we didn't cache its metrics yet" % mor['hostname'])
                continue

            self.pool.apply_async(self._collect_metrics_atomic,
                                  args=(instance, mor))

        self.gauge('vsphere.vm.count',
                   vm_count,
                   tags=["vcenter_server:%s" % instance.get('name')])

    def check(self, instance):
        """Entry point: refresh caches, schedule collection jobs, drain events,
        and surface any exception raised by a worker thread."""
        if not self.pool_started:
            self.start_pool()
        ### <TEST-INSTRUMENTATION>
        self.gauge('datadog.agent.vsphere.queue_size',
                   self.pool._workq.qsize(),
                   tags=['instant:initial'])
        ### </TEST-INSTRUMENTATION>

        # First part: make sure our object repository is neat & clean
        if self._should_cache(instance, METRICS_METADATA):
            self._cache_metrics_metadata(instance)

        if self._should_cache(instance, MORLIST):
            self._cache_morlist_raw(instance)
        self._cache_morlist_process(instance)
        self._vacuum_morlist(instance)

        # Second part: do the job
        self.collect_metrics(instance)
        self._query_event(instance)

        # For our own sanity
        self._clean()

        thread_crashed = False
        try:
            while True:
                self.log.critical(self.exceptionq.get_nowait())
                thread_crashed = True
        except Empty:
            pass
        if thread_crashed:
            self.stop_pool()
            raise Exception("One thread in the pool crashed, check the logs")

        ### <TEST-INSTRUMENTATION>
        self.gauge('datadog.agent.vsphere.queue_size',
                   self.pool._workq.qsize(),
                   tags=['instant:final'])
class VSphereCheck(AgentCheck):
    """Get performance metrics from a vCenter server and upload them.

    This is the "datamonitor" variant of the vSphere check: internal
    timing metrics are emitted under the ``datamonitor.agent.vsphere.*``
    namespace and VM metrics are tagged with the guest IP when available.

    *_atomic jobs perform one single task asynchronously in the ThreadPool;
    we don't know exactly when they will finish, but we reap them if they're
    stuck. The other calls are performed synchronously.
    """

    SERVICE_CHECK_NAME = 'vcenter.can_connect'

    def __init__(self, name, init_config, agentConfig, instances):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
        self.time_started = time.time()
        self.pool_started = False
        # Worker threads push unhandled exceptions here; drained in check()
        self.exceptionq = Queue()

        # Connections open to vCenter instances
        self.server_instances = {}
        # Event configuration, per instance
        self.event_config = {}

        # Caching resources, timeouts
        self.cache_times = {}
        for instance in self.instances:
            i_key = self._instance_key(instance)
            self.cache_times[i_key] = {
                MORLIST: {
                    LAST: 0,
                    INTERVAL: init_config.get('refresh_morlist_interval',
                                              REFRESH_MORLIST_INTERVAL)
                },
                METRICS_METADATA: {
                    LAST: 0,
                    INTERVAL: init_config.get('refresh_metrics_metadata_interval',
                                              REFRESH_METRICS_METADATA_INTERVAL)
                }
            }
            self.event_config[i_key] = instance.get('event_config')

        # First layer of cache (entities discovered from the tree)
        self.morlist_raw = {}
        # Second layer, processed from the first one
        self.morlist = {}
        # Metrics metadata, basically perfCounterId -> {name, unit, ...}
        self.metrics_metadata = {}
        # Per-instance timestamp of the last event fetched
        self.latest_event_query = {}

    def stop(self):
        """Shut the check down by tearing down the worker thread pool."""
        self.stop_pool()

    def start_pool(self):
        """Create the worker pool ('threads_count' threads) and reset job bookkeeping."""
        self.log.info("Starting Thread Pool")
        self.pool_size = int(self.init_config.get('threads_count', DEFAULT_SIZE_POOL))
        self.pool = Pool(self.pool_size)
        self.pool_started = True
        self.jobs_status = {}

    def stop_pool(self):
        """Terminate and join the worker pool if it was started; no-op otherwise."""
        self.log.info("Stopping Thread Pool")
        if self.pool_started:
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            # After join() no worker thread should remain alive
            assert self.pool.get_nworkers() == 0
        self.pool_started = False

    def restart_pool(self):
        """Tear down and recreate the worker pool (used to reap stuck jobs)."""
        self.stop_pool()
        self.start_pool()

    def _clean(self):
        """Restart the pool if any recorded job has run longer than JOB_TIMEOUT."""
        now = time.time()
        for name in self.jobs_status.keys():
            start_time = self.jobs_status[name]
            if now - start_time > JOB_TIMEOUT:
                self.log.critical("Restarting Pool. One check is stuck.")
                self.restart_pool()
                break

    def _query_event(self, instance):
        """Fetch vCenter events newer than the last query and emit them."""
        i_key = self._instance_key(instance)
        last_time = self.latest_event_query.get(i_key)

        server_instance = self._get_server_instance(instance)
        event_manager = server_instance.content.eventManager

        # Be sure we don't duplicate any event, never query the "past"
        if not last_time:
            last_time = self.latest_event_query[i_key] = \
                event_manager.latestEvent.createdTime + timedelta(seconds=1)

        query_filter = vim.event.EventFilterSpec()
        time_filter = vim.event.EventFilterSpec.ByTime(beginTime=self.latest_event_query[i_key])
        query_filter.time = time_filter

        try:
            new_events = event_manager.QueryEvents(query_filter)
            self.log.debug("Got {0} events from vCenter event manager".format(len(new_events)))
            for event in new_events:
                normalized_event = VSphereEvent(event, self.event_config[i_key])
                # Can return None if the event is filtered out
                event_payload = normalized_event.get_datamonitor_payload()
                if event_payload is not None:
                    self.event(event_payload)
                # Advance the watermark past the newest event we processed
                last_time = event.createdTime + timedelta(seconds=1)
        except Exception as e:
            # Don't get stuck on a failure to fetch an event; retry next pass
            self.log.warning("Unable to fetch Events %s", e)
            last_time = event_manager.latestEvent.createdTime + timedelta(seconds=1)

        self.latest_event_query[i_key] = last_time

    def _instance_key(self, instance):
        """Return the unique per-instance cache key (the mandatory 'name' field)."""
        i_key = instance.get('name')
        if i_key is None:
            raise Exception("Must define a unique 'name' per vCenter instance")
        return i_key

    def _should_cache(self, instance, entity):
        """True when the cache for `entity` (MORLIST or METRICS_METADATA) has expired."""
        i_key = self._instance_key(instance)
        now = time.time()
        return now - self.cache_times[i_key][entity][LAST] > self.cache_times[i_key][entity][INTERVAL]

    def _get_server_instance(self, instance):
        """Return a (cached) live connection to this instance's vCenter.

        Emits the vcenter.can_connect service check: CRITICAL and raise on
        connect/keep-alive failure, OK otherwise.
        """
        i_key = self._instance_key(instance)

        service_check_tags = [
            'vcenter_server:{0}'.format(instance.get('name')),
            'vcenter_host:{0}'.format(instance.get('host')),
        ]

        # Check for ssl configs and generate an appropriate ssl context object
        ssl_verify = instance.get('ssl_verify', True)
        ssl_capath = instance.get('ssl_capath', None)
        if not ssl_verify:
            context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
            context.verify_mode = ssl.CERT_NONE
        elif ssl_capath:
            context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
            context.verify_mode = ssl.CERT_REQUIRED
            context.load_verify_locations(capath=ssl_capath)

        # If both configs are used, log a message explaining the default
        if not ssl_verify and ssl_capath:
            self.log.debug("Your configuration is incorrectly attempting to "
                           "specify both a CA path, and to disable SSL "
                           "verification. You cannot do both. Proceeding with "
                           "disabling ssl verification.")

        if i_key not in self.server_instances:
            try:
                server_instance = connect.SmartConnect(
                    host=instance.get('host'),
                    user=instance.get('username'),
                    pwd=instance.get('password'),
                    sslContext=context if not ssl_verify or ssl_capath else None
                )
            except Exception as e:
                err_msg = "Connection to %s failed: %s" % (instance.get('host'), e)
                self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                                   tags=service_check_tags, message=err_msg)
                raise Exception(err_msg)

            self.server_instances[i_key] = server_instance

        # Test if the connection is working
        try:
            self.server_instances[i_key].RetrieveContent()
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK,
                               tags=service_check_tags)
        except Exception as e:
            err_msg = "Connection to %s died unexpectedly: %s" % (instance.get('host'), e)
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                               tags=service_check_tags, message=err_msg)
            raise Exception(err_msg)

        return self.server_instances[i_key]

    def _compute_needed_metrics(self, instance, available_metrics):
        """Intersect the metrics available on one MOR with the set we report.

        Returns all of `available_metrics` when 'all_metrics' is set on the
        instance, otherwise only those whose name is in BASIC_METRICS.
        """
        if instance.get('all_metrics', False):
            return available_metrics

        i_key = self._instance_key(instance)
        wanted_metrics = []
        for metric in available_metrics:
            # No metadata cache yet, skip it for now
            if (i_key not in self.metrics_metadata
                    or metric.counterId not in self.metrics_metadata[i_key]):
                continue
            if self.metrics_metadata[i_key][metric.counterId]['name'] in BASIC_METRICS:
                wanted_metrics.append(metric)

        return wanted_metrics

    def get_external_host_tags(self):
        """Return (hostname, {SOURCE_TYPE: tags}) pairs for every cached MOR."""
        self.log.info("Sending external_host_tags now")
        external_host_tags = []
        for instance in self.instances:
            i_key = self._instance_key(instance)
            mor_list = self.morlist[i_key].items()
            for mor_name, mor in mor_list:
                external_host_tags.append((mor['hostname'], {SOURCE_TYPE: mor['tags']}))

        return external_host_tags

    @atomic_method
    def _cache_morlist_raw_atomic(self, i_key, obj_type, obj, tags, regexes=None):
        """Compute tags for one node of the vCenter rootFolder tree and queue
        jobs for its children; hosts and powered-on VMs (subject to the
        include regexes) are appended to self.morlist_raw.
        """
        t = Timer()
        self.log.debug("job_atomic: Exploring MOR {0} (type={1})".format(obj, obj_type))
        # Copy so sibling jobs don't see tags appended for this subtree
        tags_copy = deepcopy(tags)

        if obj_type == 'rootFolder':
            for datacenter in obj.childEntity:
                # Skip non-datacenter
                if not hasattr(datacenter, 'hostFolder'):
                    continue
                self.pool.apply_async(
                    self._cache_morlist_raw_atomic,
                    args=(i_key, 'datacenter', datacenter, tags_copy, regexes)
                )

        elif obj_type == 'datacenter':
            dc_tag = "vsphere_datacenter:%s" % obj.name
            tags_copy.append(dc_tag)
            for compute_resource in obj.hostFolder.childEntity:
                # Skip non-compute resource
                if not hasattr(compute_resource, 'host'):
                    continue
                self.pool.apply_async(
                    self._cache_morlist_raw_atomic,
                    args=(i_key, 'compute_resource', compute_resource, tags_copy, regexes)
                )

        elif obj_type == 'compute_resource':
            if obj.__class__ == vim.ClusterComputeResource:
                cluster_tag = "vsphere_cluster:%s" % obj.name
                tags_copy.append(cluster_tag)
            for host in obj.host:
                # Skip non-host
                if not hasattr(host, 'vm'):
                    continue
                self.pool.apply_async(
                    self._cache_morlist_raw_atomic,
                    args=(i_key, 'host', host, tags_copy, regexes)
                )

        elif obj_type == 'host':
            if regexes and regexes.get('host_include') is not None:
                match = re.search(regexes['host_include'], obj.name)
                if not match:
                    self.log.debug(u"Filtered out VM {0} because of host_include_only_regex".format(obj.name))
                    return
            watched_mor = dict(mor_type='host', mor=obj, hostname=obj.name,
                               tags=tags_copy + ['vsphere_type:host'])
            self.morlist_raw[i_key].append(watched_mor)

            host_tag = "vsphere_host:%s" % obj.name
            tags_copy.append(host_tag)
            for vm in obj.vm:
                # Only collect from running VMs
                if vm.runtime.powerState != 'poweredOn':
                    continue
                self.pool.apply_async(
                    self._cache_morlist_raw_atomic,
                    args=(i_key, 'vm', vm, tags_copy, regexes)
                )

        elif obj_type == 'vm':
            if regexes and regexes.get('vm_include') is not None:
                match = re.search(regexes['vm_include'], obj.name)
                if not match:
                    self.log.debug(u"Filtered out VM {0} because of vm_include_only_regex".format(obj.name))
                    return
            watched_mor = dict(mor_type='vm', mor=obj, hostname=obj.name,
                               tags=tags_copy + ['vsphere_type:vm'])
            self.morlist_raw[i_key].append(watched_mor)

        self.histogram('datamonitor.agent.vsphere.morlist_raw_atomic.time', t.total())

    def _cache_morlist_raw(self, instance):
        """Kick off the recursive/async crawl of the rootFolder that fills
        self.morlist_raw; skipped while a previous crawl is still draining.
        """
        i_key = self._instance_key(instance)
        self.log.debug("Caching the morlist for vcenter instance %s" % i_key)
        if i_key in self.morlist_raw and len(self.morlist_raw[i_key]) > 0:
            self.log.debug(
                "Skipping morlist collection now, RAW results "
                "processing not over (latest refresh was {0}s ago)".format(
                    time.time() - self.cache_times[i_key][MORLIST][LAST])
            )
            return
        self.morlist_raw[i_key] = []

        server_instance = self._get_server_instance(instance)
        root_folder = server_instance.content.rootFolder

        instance_tag = "vcenter_server:%s" % instance.get('name')
        regexes = {
            'host_include': instance.get('host_include_only_regex'),
            'vm_include': instance.get('vm_include_only_regex')
        }
        self.pool.apply_async(
            self._cache_morlist_raw_atomic,
            args=(i_key, 'rootFolder', root_folder, [instance_tag], regexes)
        )
        self.cache_times[i_key][MORLIST][LAST] = time.time()

    @atomic_method
    def _cache_morlist_process_atomic(self, instance, mor):
        """Process one item of self.morlist_raw: query its available metrics
        and store/update the MOR in self.morlist.
        """
        t = Timer()
        i_key = self._instance_key(instance)
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        self.log.debug(
            "job_atomic: Querying available metrics"
            " for MOR {0} (type={1})".format(mor['mor'], mor['mor_type'])
        )

        available_metrics = perfManager.QueryAvailablePerfMetric(
            mor['mor'], intervalId=REAL_TIME_INTERVAL)

        mor['metrics'] = self._compute_needed_metrics(instance, available_metrics)

        mor_name = str(mor['mor'])
        if mor_name in self.morlist[i_key]:
            # Was already here last iteration, just refresh the metric list
            self.morlist[i_key][mor_name]['metrics'] = mor['metrics']
        else:
            self.morlist[i_key][mor_name] = mor

        self.morlist[i_key][mor_name]['last_seen'] = time.time()

        self.histogram('datamonitor.agent.vsphere.morlist_process_atomic.time', t.total())

    def _cache_morlist_process(self, instance):
        """Pop up to batch_morlist_size items from self.morlist_raw and queue
        _cache_morlist_process_atomic for each.
        """
        i_key = self._instance_key(instance)
        if i_key not in self.morlist:
            self.morlist[i_key] = {}

        batch_size = self.init_config.get('batch_morlist_size', BATCH_MORLIST_SIZE)

        for i in xrange(batch_size):
            try:
                mor = self.morlist_raw[i_key].pop()
                self.pool.apply_async(self._cache_morlist_process_atomic,
                                      args=(instance, mor))
            except (IndexError, KeyError):
                self.log.debug("No more work to process in morlist_raw")
                return

    def _vacuum_morlist(self, instance):
        """Drop MORs not seen for more than 2 * REFRESH_MORLIST_INTERVAL."""
        i_key = self._instance_key(instance)
        morlist = self.morlist[i_key].items()

        for mor_name, mor in morlist:
            last_seen = mor['last_seen']
            if (time.time() - last_seen) > 2 * REFRESH_MORLIST_INTERVAL:
                del self.morlist[i_key][mor_name]

    def _cache_metrics_metadata(self, instance):
        """Rebuild the perfCounterId -> {name, unit, instance_tag} metadata map."""
        t = Timer()
        i_key = self._instance_key(instance)
        self.log.info("Warming metrics metadata cache for instance {0}".format(i_key))
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        new_metadata = {}
        for counter in perfManager.perfCounter:
            d = dict(
                name="%s.%s" % (counter.groupInfo.key, counter.nameInfo.key),
                unit=counter.unitInfo.key,
                instance_tag='instance'  # FIXME: replace by what we want to tag!
            )
            new_metadata[counter.key] = d
        self.cache_times[i_key][METRICS_METADATA][LAST] = time.time()

        self.log.info("Finished metadata collection for instance {0}".format(i_key))
        # Atomically swap in the fresh metadata
        self.metrics_metadata[i_key] = new_metadata

        self.histogram('datamonitor.agent.vsphere.metric_metadata_collection.time', t.total())

    def _transform_value(self, instance, counter_id, value):
        """Apply pre-reporting transformation based on the counter's unit
        (percent counters come back as integer hundredths)."""
        i_key = self._instance_key(instance)
        if counter_id in self.metrics_metadata[i_key]:
            unit = self.metrics_metadata[i_key][counter_id]['unit']
            if unit == 'percent':
                return float(value) / 100

        # Defaults to return the value without transformation
        return value

    @atomic_method
    def _collect_metrics_atomic(self, instance, mor):
        """Collect the metrics listed in the morlist for one MOR.

        Each value is emitted as a rate or gauge (per ALL_METRICS s_type),
        tagged with the metric instance and, for VMs with running guest
        tools, the guest IP.
        """
        t = Timer()
        i_key = self._instance_key(instance)
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        query = vim.PerformanceManager.QuerySpec(maxSample=1,
                                                 entity=mor['mor'],
                                                 metricId=mor['metrics'],
                                                 intervalId=20,
                                                 format='normal')
        results = perfManager.QueryPerf(querySpec=[query])

        # BUGFIX: the previous implementation re-fetched the whole vCenter
        # inventory once per metric value (O(results * all VMs)) and ended up
        # tagging every metric with the IP of whichever VM was enumerated
        # last, not the MOR being collected. It also compared against the
        # string "None" while guest.ipAddress can be the None object.
        # Resolve the IP once, from this MOR only, and guard missing values.
        ip = "unknown"
        if mor['mor_type'] == 'vm':
            try:
                guest_ip = mor['mor'].summary.guest.ipAddress
            except Exception:
                # Guest tools not running / summary unavailable — TODO confirm
                guest_ip = None
            if guest_ip:
                ip = guest_ip
        self.log.debug("Get VM ip {} by VMtools".format(ip))

        if results:
            for result in results[0].value:
                if result.id.counterId not in self.metrics_metadata[i_key]:
                    self.log.debug("Skipping this metric value, because there is no metadata about it")
                    continue
                instance_name = result.id.instance or "none"
                value = self._transform_value(instance, result.id.counterId, result.value[0])

                # Metric types are absolute, delta, and rate
                if ALL_METRICS[self.metrics_metadata[i_key][result.id.counterId]['name']]['s_type'] == 'rate':
                    record_metric = self.rate
                else:
                    record_metric = self.gauge

                tags = ['instance:%s' % instance_name, 'type:VM']
                if ip != "unknown":
                    tags.append('ip:%s' % ip)
                record_metric(
                    "vsphere.%s" % self.metrics_metadata[i_key][result.id.counterId]['name'],
                    value,
                    hostname=mor['hostname'],
                    tags=tags
                )

        self.histogram('datamonitor.agent.vsphere.metric_colection.time', t.total())

    def collect_metrics(self, instance):
        """Queue _collect_metrics_atomic for every cached MOR and report the
        running-VM count; the Aggregator receives the metrics as the job
        queue is processed.
        """
        i_key = self._instance_key(instance)
        if i_key not in self.morlist:
            self.log.debug("Not collecting metrics for this instance, nothing to do yet: {0}".format(i_key))
            return

        mors = self.morlist[i_key].items()
        self.log.debug("Collecting metrics of %d mors" % len(mors))

        vm_count = 0

        for mor_name, mor in mors:
            if mor['mor_type'] == 'vm':
                vm_count += 1
            if 'metrics' not in mor:
                # Metrics for this MOR not cached yet; next run will have them
                continue

            self.pool.apply_async(self._collect_metrics_atomic,
                                  args=(instance, mor))

        self.gauge('vsphere.vm.count', vm_count,
                   tags=["vcenter_server:%s" % instance.get('name')])

    def check(self, instance):
        """Entry point: refresh caches, schedule collection jobs, drain events,
        and surface any exception raised by a worker thread."""
        if not self.pool_started:
            self.start_pool()
        self.gauge('datamonitor.agent.vsphere.queue_size',
                   self.pool._workq.qsize(), tags=['instant:initial'])

        # First part: make sure our object repository is neat & clean
        if self._should_cache(instance, METRICS_METADATA):
            self._cache_metrics_metadata(instance)

        if self._should_cache(instance, MORLIST):
            self._cache_morlist_raw(instance)
        self._cache_morlist_process(instance)
        self._vacuum_morlist(instance)

        # Second part: do the job
        self.collect_metrics(instance)
        self._query_event(instance)

        # For our own sanity
        self._clean()

        thread_crashed = False
        try:
            while True:
                self.log.critical(self.exceptionq.get_nowait())
                thread_crashed = True
        except Empty:
            pass
        if thread_crashed:
            self.stop_pool()
            raise Exception("One thread in the pool crashed, check the logs")

        self.gauge('datamonitor.agent.vsphere.queue_size',
                   self.pool._workq.qsize(), tags=['instant:final'])
class VSphereCheck(AgentCheck):
    """ Get performance metrics from a vCenter server and upload them to Datadog
    References:
        http://pubs.vmware.com/vsphere-51/index.jsp#com.vmware.wssdk.apiref.doc/vim.PerformanceManager.html

    *_atomic jobs perform one single task asynchronously in the ThreadPool, we
    don't know exactly when they will finish, but we reap them if they're stuck.
    The other calls are performed synchronously.
    """

    SERVICE_CHECK_NAME = 'vcenter.can_connect'

    def __init__(self, name, init_config, agentConfig, instances):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
        self.time_started = time.time()
        self.pool_started = False
        # Queue drained in check() to surface worker-thread exceptions
        self.exceptionq = Queue()

        # Connections open to vCenter instances
        self.server_instances = {}

        # Event configuration
        self.event_config = {}

        # Caching resources, timeouts (per-instance LAST/INTERVAL bookkeeping)
        self.cache_times = {}
        for instance in self.instances:
            i_key = self._instance_key(instance)
            self.cache_times[i_key] = {
                MORLIST: {
                    LAST: 0,
                    INTERVAL: init_config.get('refresh_morlist_interval', REFRESH_MORLIST_INTERVAL)
                },
                METRICS_METADATA: {
                    LAST: 0,
                    INTERVAL: init_config.get('refresh_metrics_metadata_interval', REFRESH_METRICS_METADATA_INTERVAL)
                }
            }
            self.event_config[i_key] = instance.get('event_config')

        # First layer of cache (get entities from the tree)
        self.morlist_raw = {}
        # Second layer, processed from the first one
        self.morlist = {}
        # Metrics metadata, basically perfCounterId -> {name, group, description}
        self.metrics_metadata = {}
        self.latest_event_query = {}

    def stop(self):
        """ Shut the thread pool down when the check is stopped. """
        self.stop_pool()

    def start_pool(self):
        """ Create the worker pool ('threads_count' in init_config, defaulting
        to DEFAULT_SIZE_POOL) and reset job bookkeeping. """
        self.log.info("Starting Thread Pool")
        self.pool_size = int(self.init_config.get('threads_count', DEFAULT_SIZE_POOL))
        self.pool = Pool(self.pool_size)
        self.pool_started = True
        self.jobs_status = {}

    def stop_pool(self):
        """ Terminate and join the worker pool, if one was started. """
        self.log.info("Stopping Thread Pool")
        if self.pool_started:
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            assert self.pool.get_nworkers() == 0
            self.pool_started = False

    def restart_pool(self):
        """ Tear the pool down and bring a fresh one up. """
        self.stop_pool()
        self.start_pool()

    def _clean(self):
        """ Restart the pool if any tracked job has been running longer than
        JOB_TIMEOUT (reaps stuck *_atomic jobs). """
        now = time.time()
        # TODO: use that
        for name in self.jobs_status.keys():
            start_time = self.jobs_status[name]
            if now - start_time > JOB_TIMEOUT:
                self.log.critical("Restarting Pool. One check is stuck.")
                self.restart_pool()
                break

    def _query_event(self, instance):
        """ Fetch new events from the vCenter event manager since the last
        query and forward them to Datadog via self.event(). """
        i_key = self._instance_key(instance)
        last_time = self.latest_event_query.get(i_key)
        server_instance = self._get_server_instance(instance)
        event_manager = server_instance.content.eventManager

        # Be sure we don't duplicate any event, never query the "past"
        if not last_time:
            last_time = self.latest_event_query[i_key] = \
                event_manager.latestEvent.createdTime + timedelta(seconds=1)

        query_filter = vim.event.EventFilterSpec()
        time_filter = vim.event.EventFilterSpec.ByTime(beginTime=self.latest_event_query[i_key])
        query_filter.time = time_filter

        try:
            new_events = event_manager.QueryEvents(query_filter)
            self.log.debug("Got {0} events from vCenter event manager".format(len(new_events)))
            for event in new_events:
                normalized_event = VSphereEvent(event, self.event_config[i_key])
                # Can return None if the event if filtered out
                event_payload = normalized_event.get_datadog_payload()
                if event_payload is not None:
                    self.event(event_payload)
                last_time = event.createdTime + timedelta(seconds=1)
        except Exception as e:
            # Don't get stuck on a failure to fetch an event
            # Ignore them for next pass
            self.log.warning("Unable to fetch Events %s", e)
            last_time = event_manager.latestEvent.createdTime + timedelta(seconds=1)

        self.latest_event_query[i_key] = last_time

    def _instance_key(self, instance):
        """ Return the mandatory unique 'name' of an instance, used as the key
        of every per-instance cache. """
        i_key = instance.get('name')
        if i_key is None:
            raise Exception("Must define a unique 'name' per vCenter instance")
        return i_key

    def _should_cache(self, instance, entity):
        """ True when the cache for `entity` (MORLIST or METRICS_METADATA) is
        older than its configured refresh interval. """
        i_key = self._instance_key(instance)
        now = time.time()
        return now - self.cache_times[i_key][entity][LAST] > self.cache_times[i_key][entity][INTERVAL]

    def _get_server_instance(self, instance):
        """ Return a cached, verified connection to the vCenter of `instance`,
        creating it on first use. Emits the vcenter.can_connect service check
        and raises on connection failure. """
        i_key = self._instance_key(instance)

        service_check_tags = [
            'vcenter_server:{0}'.format(instance.get('name')),
            'vcenter_host:{0}'.format(instance.get('host')),
        ]

        # Check for ssl configs and generate an appropriate ssl context object
        ssl_verify = instance.get('ssl_verify', True)
        ssl_capath = instance.get('ssl_capath', None)
        if not ssl_verify:
            context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
            context.verify_mode = ssl.CERT_NONE
        elif ssl_capath:
            context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
            context.verify_mode = ssl.CERT_REQUIRED
            context.load_verify_locations(capath=ssl_capath)

        # If both configs are used, log a message explaining the default
        if not ssl_verify and ssl_capath:
            self.log.debug("Your configuration is incorrectly attempting to "
                           "specify both a CA path, and to disable SSL "
                           "verification. You cannot do both. Proceeding with "
                           "disabling ssl verification.")

        if i_key not in self.server_instances:
            try:
                server_instance = connect.SmartConnect(
                    host=instance.get('host'),
                    user=instance.get('username'),
                    pwd=instance.get('password'),
                    # `context` is only evaluated when one of the two ssl
                    # options was set, so it is always bound here
                    sslContext=context if not ssl_verify or ssl_capath else None
                )
            except Exception as e:
                err_msg = "Connection to %s failed: %s" % (instance.get('host'), e)
                self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=service_check_tags, message=err_msg)
                raise Exception(err_msg)
            self.server_instances[i_key] = server_instance

        # Test if the connection is working
        try:
            self.server_instances[i_key].RetrieveContent()
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=service_check_tags)
        except Exception as e:
            err_msg = "Connection to %s died unexpectedly: %s" % (instance.get('host'), e)
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=service_check_tags, message=err_msg)
            raise Exception(err_msg)

        return self.server_instances[i_key]

    def _compute_needed_metrics(self, instance, available_metrics):
        """ Compare the available metrics for one MOR we have computed and
        intersect them with the set of metrics we want to report """
        if instance.get('all_metrics', False):
            return available_metrics

        i_key = self._instance_key(instance)
        wanted_metrics = []
        # Get only the basic metrics
        for metric in available_metrics:
            # No cache yet, skip it for now
            if (i_key not in self.metrics_metadata or metric.counterId not in self.metrics_metadata[i_key]):
                continue
            if self.metrics_metadata[i_key][metric.counterId]['name'] in BASIC_METRICS:
                wanted_metrics.append(metric)

        return wanted_metrics

    def get_external_host_tags(self):
        """ Returns a list of tags for every host that is detected by the vSphere
        integration.
        List of pairs (hostname, list_of_tags)
        """
        self.log.info("Sending external_host_tags now")
        external_host_tags = []
        for instance in self.instances:
            i_key = self._instance_key(instance)
            mor_list = self.morlist[i_key].items()
            for mor_name, mor in mor_list:
                external_host_tags.append((mor['hostname'], {SOURCE_TYPE: mor['tags']}))

        return external_host_tags

    @atomic_method
    def _cache_morlist_raw_atomic(self, i_key, obj_type, obj, tags, regexes=None):
        """ Compute tags for a single node in the vCenter rootFolder
        and queue other such jobs for children nodes.
        Usual hierarchy:
        rootFolder
        - datacenter1
          - compute_resource1 == cluster
            - host1
            - host2
            - host3
          - compute_resource2
            - host5
              - vm1
              - vm2
        If it's a node we want to query metric for, queue it in self.morlist_raw
        that will be processed by another job.
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        self.log.debug("job_atomic: Exploring MOR {0} (type={1})".format(obj, obj_type))
        ### </TEST-INSTRUMENTATION>
        # Each child job gets its own copy so sibling branches don't share tags
        tags_copy = deepcopy(tags)

        if obj_type == 'rootFolder':
            for datacenter in obj.childEntity:
                # Skip non-datacenter
                if not hasattr(datacenter, 'hostFolder'):
                    continue
                self.pool.apply_async(
                    self._cache_morlist_raw_atomic,
                    args=(i_key, 'datacenter', datacenter, tags_copy, regexes)
                )
        elif obj_type == 'datacenter':
            dc_tag = "vsphere_datacenter:%s" % obj.name
            tags_copy.append(dc_tag)
            for compute_resource in obj.hostFolder.childEntity:
                # Skip non-compute resource
                if not hasattr(compute_resource, 'host'):
                    continue
                self.pool.apply_async(
                    self._cache_morlist_raw_atomic,
                    args=(i_key, 'compute_resource', compute_resource, tags_copy, regexes)
                )
        elif obj_type == 'compute_resource':
            if obj.__class__ == vim.ClusterComputeResource:
                cluster_tag = "vsphere_cluster:%s" % obj.name
                tags_copy.append(cluster_tag)
            for host in obj.host:
                # Skip non-host
                if not hasattr(host, 'vm'):
                    continue
                self.pool.apply_async(
                    self._cache_morlist_raw_atomic,
                    args=(i_key, 'host', host, tags_copy, regexes)
                )
        elif obj_type == 'host':
            if regexes and regexes.get('host_include') is not None:
                match = re.search(regexes['host_include'], obj.name)
                if not match:
                    # NOTE(review): message says "VM" but this filters a host
                    self.log.debug(u"Filtered out VM {0} because of host_include_only_regex".format(obj.name))
                    return
            watched_mor = dict(mor_type='host', mor=obj, hostname=obj.name, tags=tags_copy+['vsphere_type:host'])
            self.morlist_raw[i_key].append(watched_mor)

            host_tag = "vsphere_host:%s" % obj.name
            tags_copy.append(host_tag)
            for vm in obj.vm:
                # Only powered-on VMs are worth collecting metrics for
                if vm.runtime.powerState != 'poweredOn':
                    continue
                self.pool.apply_async(
                    self._cache_morlist_raw_atomic,
                    args=(i_key, 'vm', vm, tags_copy, regexes)
                )
        elif obj_type == 'vm':
            if regexes and regexes.get('vm_include') is not None:
                match = re.search(regexes['vm_include'], obj.name)
                if not match:
                    self.log.debug(u"Filtered out VM {0} because of vm_include_only_regex".format(obj.name))
                    return
            watched_mor = dict(mor_type='vm', mor=obj, hostname=obj.name, tags=tags_copy+['vsphere_type:vm'])
            self.morlist_raw[i_key].append(watched_mor)

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.morlist_raw_atomic.time', t.total())
        ### </TEST-INSTRUMENTATION>

    def _cache_morlist_raw(self, instance):
        """ Initiate the first layer to refresh self.morlist by queueing
        _cache_morlist_raw_atomic on the rootFolder in a recursive/asncy approach
        """
        i_key = self._instance_key(instance)
        self.log.debug("Caching the morlist for vcenter instance %s" % i_key)
        # Don't kick off a new traversal while the previous RAW batch is still
        # being drained by _cache_morlist_process
        if i_key in self.morlist_raw and len(self.morlist_raw[i_key]) > 0:
            self.log.debug(
                "Skipping morlist collection now, RAW results "
                "processing not over (latest refresh was {0}s ago)".format(
                    time.time() - self.cache_times[i_key][MORLIST][LAST])
            )
            return
        self.morlist_raw[i_key] = []

        server_instance = self._get_server_instance(instance)
        root_folder = server_instance.content.rootFolder

        instance_tag = "vcenter_server:%s" % instance.get('name')
        regexes = {
            'host_include': instance.get('host_include_only_regex'),
            'vm_include': instance.get('vm_include_only_regex')
        }
        self.pool.apply_async(
            self._cache_morlist_raw_atomic,
            args=(i_key, 'rootFolder', root_folder, [instance_tag], regexes)
        )
        self.cache_times[i_key][MORLIST][LAST] = time.time()

    @atomic_method
    def _cache_morlist_process_atomic(self, instance, mor):
        """ Process one item of the self.morlist_raw list by querying the available
        metrics for this MOR and then putting it in self.morlist
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        ### </TEST-INSTRUMENTATION>
        i_key = self._instance_key(instance)
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        self.log.debug(
            "job_atomic: Querying available metrics"
            " for MOR {0} (type={1})".format(mor['mor'], mor['mor_type'])
        )

        available_metrics = perfManager.QueryAvailablePerfMetric(
            mor['mor'], intervalId=REAL_TIME_INTERVAL)

        mor['metrics'] = self._compute_needed_metrics(instance, available_metrics)
        mor_name = str(mor['mor'])

        if mor_name in self.morlist[i_key]:
            # Was already here last iteration
            self.morlist[i_key][mor_name]['metrics'] = mor['metrics']
        else:
            self.morlist[i_key][mor_name] = mor

        self.morlist[i_key][mor_name]['last_seen'] = time.time()

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.morlist_process_atomic.time', t.total())
        ### </TEST-INSTRUMENTATION>

    def _cache_morlist_process(self, instance):
        """ Empties the self.morlist_raw by popping items and running asynchronously
        the _cache_morlist_process_atomic operation that will get the available
        metrics for this MOR and put it in self.morlist
        """
        i_key = self._instance_key(instance)
        if i_key not in self.morlist:
            self.morlist[i_key] = {}

        batch_size = self.init_config.get('batch_morlist_size', BATCH_MORLIST_SIZE)

        for i in xrange(batch_size):
            try:
                mor = self.morlist_raw[i_key].pop()
                self.pool.apply_async(self._cache_morlist_process_atomic, args=(instance, mor))
            except (IndexError, KeyError):
                self.log.debug("No more work to process in morlist_raw")
                return

    def _vacuum_morlist(self, instance):
        """ Check if self.morlist doesn't have some old MORs that are gone, ie
        we cannot get any metrics from them anyway (or =0)
        """
        i_key = self._instance_key(instance)
        morlist = self.morlist[i_key].items()

        for mor_name, mor in morlist:
            last_seen = mor['last_seen']
            if (time.time() - last_seen) > 2 * REFRESH_MORLIST_INTERVAL:
                del self.morlist[i_key][mor_name]

    def _cache_metrics_metadata(self, instance):
        """ Get from the server instance, all the performance counters metadata
        meaning name/group/description... attached with the corresponding ID
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        ### </TEST-INSTRUMENTATION>
        i_key = self._instance_key(instance)
        self.log.info("Warming metrics metadata cache for instance {0}".format(i_key))

        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        new_metadata = {}
        for counter in perfManager.perfCounter:
            d = dict(
                name="%s.%s" % (counter.groupInfo.key, counter.nameInfo.key),
                unit=counter.unitInfo.key,
                instance_tag='instance'  # FIXME: replace by what we want to tag!
            )
            new_metadata[counter.key] = d
        self.cache_times[i_key][METRICS_METADATA][LAST] = time.time()

        self.log.info("Finished metadata collection for instance {0}".format(i_key))
        # Reset metadata
        self.metrics_metadata[i_key] = new_metadata

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.metric_metadata_collection.time', t.total())
        ### </TEST-INSTRUMENTATION>

    def _transform_value(self, instance, counter_id, value):
        """ Given the counter_id, look up for the metrics metadata to check the
        vsphere type of the counter and apply pre-reporting transformation if
        needed.
        """
        i_key = self._instance_key(instance)
        if counter_id in self.metrics_metadata[i_key]:
            unit = self.metrics_metadata[i_key][counter_id]['unit']
            if unit == 'percent':
                return float(value) / 100

        # Defaults to return the value without transformation
        return value

    @atomic_method
    def _collect_metrics_atomic(self, instance, mor):
        """ Task that collects the metrics listed in the morlist for one MOR
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        ### </TEST-INSTRUMENTATION>
        i_key = self._instance_key(instance)
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        query = vim.PerformanceManager.QuerySpec(maxSample=1,
                                                 entity=mor['mor'],
                                                 metricId=mor['metrics'],
                                                 intervalId=20,
                                                 format='normal')
        results = perfManager.QueryPerf(querySpec=[query])
        if results:
            for result in results[0].value:
                if result.id.counterId not in self.metrics_metadata[i_key]:
                    self.log.debug("Skipping this metric value, because there is no metadata about it")
                    continue
                instance_name = result.id.instance or "none"
                value = self._transform_value(instance, result.id.counterId, result.value[0])

                # Metric types are absolute, delta, and rate
                if ALL_METRICS[self.metrics_metadata[i_key][result.id.counterId]['name']]['s_type'] == 'rate':
                    record_metric = self.rate
                else:
                    record_metric = self.gauge
                record_metric(
                    "vsphere.%s" % self.metrics_metadata[i_key][result.id.counterId]['name'],
                    value,
                    hostname=mor['hostname'],
                    tags=['instance:%s' % instance_name]
                )
        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.metric_colection.time', t.total())
        ### </TEST-INSTRUMENTATION>

    def collect_metrics(self, instance):
        """ Calls asynchronously _collect_metrics_atomic on all MORs, as the
        job queue is processed the Aggregator will receive the metrics.
        """
        i_key = self._instance_key(instance)
        if i_key not in self.morlist:
            self.log.debug("Not collecting metrics for this instance, nothing to do yet: {0}".format(i_key))
            return

        mors = self.morlist[i_key].items()
        self.log.debug("Collecting metrics of %d mors" % len(mors))

        vm_count = 0

        for mor_name, mor in mors:
            if mor['mor_type'] == 'vm':
                vm_count += 1
            if 'metrics' not in mor:
                # self.log.debug("Skipping entity %s collection because we didn't cache its metrics yet" % mor['hostname'])
                continue

            self.pool.apply_async(self._collect_metrics_atomic, args=(instance, mor))

        self.gauge('vsphere.vm.count', vm_count, tags=["vcenter_server:%s" % instance.get('name')])

    def check(self, instance):
        """ Main entry point: refresh caches when stale, queue metric
        collection jobs, forward events, and surface worker crashes. """
        if not self.pool_started:
            self.start_pool()
        ### <TEST-INSTRUMENTATION>
        self.gauge('datadog.agent.vsphere.queue_size', self.pool._workq.qsize(), tags=['instant:initial'])
        ### </TEST-INSTRUMENTATION>

        # First part: make sure our object repository is neat & clean
        if self._should_cache(instance, METRICS_METADATA):
            self._cache_metrics_metadata(instance)

        if self._should_cache(instance, MORLIST):
            self._cache_morlist_raw(instance)
        self._cache_morlist_process(instance)
        self._vacuum_morlist(instance)

        # Second part: do the job
        self.collect_metrics(instance)
        self._query_event(instance)

        # For our own sanity
        self._clean()

        thread_crashed = False
        try:
            while True:
                self.log.critical(self.exceptionq.get_nowait())
                thread_crashed = True
        except Empty:
            pass

        if thread_crashed:
            self.stop_pool()
            raise Exception("One thread in the pool crashed, check the logs")

        ### <TEST-INSTRUMENTATION>
        self.gauge('datadog.agent.vsphere.queue_size', self.pool._workq.qsize(), tags=['instant:final'])
        ### </TEST-INSTRUMENTATION>
class NetworkCheck(AgentCheck):
    SOURCE_TYPE_NAME = 'servicecheck'
    SERVICE_CHECK_PREFIX = 'network_check'

    STATUS_TO_SERVICE_CHECK = {
        Status.UP: AgentCheck.OK,
        Status.WARNING: AgentCheck.WARNING,
        Status.DOWN: AgentCheck.CRITICAL
    }

    """
    Services checks inherits from this class.
    This class should never be directly instanciated.

    Work flow:
        The main agent loop will call the check function for each instance for
        each iteration of the loop.
        The check method will make an asynchronous call to the _process method in
        one of the thread initiated in the thread pool created in this class constructor.
        The _process method will call the _check method of the inherited class
        which will perform the actual check.

        The _check method must return a tuple which first element is either
            Status.UP or Status.DOWN.
            The second element is a short error message that will be displayed
            when the service turns down.
    """

    def __init__(self, name, init_config, agentConfig, instances):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        # A dictionary to keep track of service statuses
        self.statuses = {}
        self.notified = {}
        self.nb_failures = 0
        self.pool_started = False

        # Make sure every instance has a name that we use as a unique key
        # to keep track of statuses
        names = []
        for inst in instances:
            if 'name' not in inst:
                raise Exception("All instances should have a 'name' parameter,"
                                " error on instance: {0}".format(inst))
            if inst['name'] in names:
                raise Exception("Duplicate names for instances with name {0}"
                                .format(inst['name']))
            # Fix: record the name, otherwise the duplicate check above can
            # never trigger (names stayed empty)
            names.append(inst['name'])

    def stop(self):
        """ Stop the worker pool and mark it as not started. """
        self.stop_pool()
        self.pool_started = False

    def start_pool(self):
        # The pool size should be the minimum between the number of instances
        # and the DEFAULT_SIZE_POOL. It can also be overridden by the 'threads_count'
        # parameter in the init_config of the check
        self.log.info("Starting Thread Pool")
        default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
        self.pool_size = int(self.init_config.get('threads_count', default_size))

        self.pool = Pool(self.pool_size)

        self.resultsq = Queue()
        self.jobs_status = {}
        self.pool_started = True

    def stop_pool(self):
        """ Terminate and join the worker pool, if one was started. """
        self.log.info("Stopping Thread Pool")
        if self.pool_started:
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            assert self.pool.get_nworkers() == 0

    def restart_pool(self):
        """ Tear the pool down and bring a fresh one up. """
        self.stop_pool()
        self.start_pool()

    def check(self, instance):
        """ Queue an asynchronous _process job for this instance, unless one
        is already running for it. """
        if not self.pool_started:
            self.start_pool()
        if threading.activeCount() > 5 * self.pool_size + 5:
            # On Windows the agent runs on multiple threads so we need to have an offset of 5 in case the pool_size is 1
            raise Exception("Thread number (%s) is exploding. Skipping this check" % threading.activeCount())
        self._process_results()
        self._clean()
        name = instance.get('name', None)
        if name is None:
            self.log.error('Each service check must have a name')
            return

        if name not in self.jobs_status:
            # A given instance should be processed one at a time
            self.jobs_status[name] = time.time()
            self.pool.apply_async(self._process, args=(instance,))
        else:
            self.log.error("Instance: %s skipped because it's already running." % name)

    def _process(self, instance):
        """ Worker-side wrapper: run _check and push its result(s) on the
        results queue; on any error push the FAILURE sentinel. """
        try:
            statuses = self._check(instance)

            if isinstance(statuses, tuple):
                # Assume the check only returns one service check
                status, msg = statuses
                self.resultsq.put((status, msg, None, instance))

            elif isinstance(statuses, list):
                for status in statuses:
                    sc_name, status, msg = status
                    self.resultsq.put((status, msg, sc_name, instance))

        except Exception:
            result = (FAILURE, FAILURE, FAILURE, FAILURE)
            self.resultsq.put(result)

    def _process_results(self):
        """ Drain the results queue (non blocking), report service checks,
        maintain the UP/DOWN event logic and release finished jobs. """
        for i in range(MAX_LOOP_ITERATIONS):
            try:
                # We want to fetch the result in a non blocking way
                status, msg, sc_name, instance = self.resultsq.get_nowait()
            except Empty:
                break

            if status == FAILURE:
                self.nb_failures += 1
                if self.nb_failures >= self.pool_size - 1:
                    self.nb_failures = 0
                    self.restart_pool()
                continue

            self.report_as_service_check(sc_name, status, instance, msg)

            # FIXME: 5.3, this has been deprecated before, get rid of events
            # Don't create any event to avoid duplicates with server side
            # service_checks
            skip_event = _is_affirmative(instance.get('skip_event', False))
            instance_name = instance['name']
            if not skip_event:
                self.warning("Using events for service checks is deprecated in favor of monitors and will be removed in future versions of the Datadog Agent.")
                event = None

                if instance_name not in self.statuses:
                    self.statuses[instance_name] = defaultdict(list)

                self.statuses[instance_name][sc_name].append(status)

                window = int(instance.get('window', 1))

                if window > 256:
                    self.log.warning("Maximum window size (256) exceeded, defaulting it to 256")
                    window = 256

                threshold = instance.get('threshold', 1)

                if len(self.statuses[instance_name][sc_name]) > window:
                    self.statuses[instance_name][sc_name].pop(0)

                nb_failures = self.statuses[instance_name][sc_name].count(Status.DOWN)

                if nb_failures >= threshold:
                    if self.notified.get((instance_name, sc_name), Status.UP) != Status.DOWN:
                        event = self._create_status_event(sc_name, status, msg, instance)
                        self.notified[(instance_name, sc_name)] = Status.DOWN
                else:
                    if self.notified.get((instance_name, sc_name), Status.UP) != Status.UP:
                        event = self._create_status_event(sc_name, status, msg, instance)
                        self.notified[(instance_name, sc_name)] = Status.UP

                if event is not None:
                    self.events.append(event)

            # The job is finished here, this instance can be re processed
            if instance_name in self.jobs_status:
                del self.jobs_status[instance_name]

    def _check(self, instance):
        """This function should be implemented by inherited classes"""
        raise NotImplementedError

    def _clean(self):
        """ Restart the pool when a job has been running longer than TIMEOUT. """
        now = time.time()
        for name in self.jobs_status.keys():
            start_time = self.jobs_status[name]
            if now - start_time > TIMEOUT:
                self.log.critical("Restarting Pool. One check is stuck: %s" % name)
                self.restart_pool()
                break
class ServicesCheck(AgentCheck):
    SOURCE_TYPE_NAME = 'servicecheck'

    """
    Services checks inherits from this class.
    This class should never be directly instanciated.

    Work flow:
        The main agent loop will call the check function for each instance for
        each iteration of the loop.
        The check method will make an asynchronous call to the _process method in
        one of the thread initiated in the thread pool created in this class constructor.
        The _process method will call the _check method of the inherited class
        which will perform the actual check.

        The _check method must return a tuple which first element is either
            Status.UP or Status.DOWN.
            The second element is a short error message that will be displayed
            when the service turns down.
    """

    def __init__(self, name, init_config, agentConfig, instances):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        # A dictionary to keep track of service statuses
        self.statuses = {}
        self.notified = {}
        # NOTE(review): the pool is started eagerly from the constructor here,
        # unlike the NetworkCheck variants which start it lazily in check()
        self.start_pool()
        self.nb_failures = 0

    def stop(self):
        # Shut the worker pool down when the check is stopped
        self.stop_pool()

    def start_pool(self):
        # The pool size should be the minimum between the number of instances
        # and the DEFAULT_SIZE_POOL. It can also be overridden by the 'threads_count'
        # parameter in the init_config of the check
        self.log.info("Starting Thread Pool")
        default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
        self.pool_size = int(self.init_config.get('threads_count', default_size))
        self.pool = Pool(self.pool_size)

        # Queue of (status, msg, name, instance) tuples filled by _process
        self.resultsq = Queue()
        # Maps instance name -> start time of its in-flight job
        self.jobs_status = {}

    def stop_pool(self):
        # Terminate and join the worker pool (no pool_started guard in this
        # legacy variant)
        self.log.info("Stopping Thread Pool")
        self.pool.terminate()
        self.pool.join()
        self.jobs_status.clear()
        assert self.pool.get_nworkers() == 0

    def restart_pool(self):
        # Tear the pool down and bring a fresh one up
        self.stop_pool()
        self.start_pool()

    def check(self, instance):
        # Queue an asynchronous _process job for this instance, unless one is
        # already running for it
        if threading.activeCount() > 5 * self.pool_size:
            raise Exception("Thread number (%s) is exploding. Skipping this check" % threading.activeCount())
        self._process_results()
        self._clean()
        name = instance.get('name', None)
        if name is None:
            self.log.error('Each service check must have a name')
            return

        if name not in self.jobs_status:
            # A given instance should be processed one at a time
            self.jobs_status[name] = time.time()
            self.pool.apply_async(self._process, args=(instance,))
        else:
            self.log.error("Instance: %s skipped because it's already running." % name)

    def _process(self, instance):
        # Worker-side wrapper: run _check and push its (status, msg) result on
        # the results queue; on any error push the FAILURE sentinel instead
        name = instance.get('name', None)
        try:
            status, msg = self._check(instance)
            result = (status, msg, name, instance)
            # We put the results in the result queue
            self.resultsq.put(result)
        # Python 2 syntax (kept): this file predates `except ... as e`
        except Exception, e:
            result = (FAILURE, FAILURE, FAILURE, FAILURE)
            self.resultsq.put(result)
class NetworkCheck(AgentCheck):
    SOURCE_TYPE_NAME = 'servicecheck'
    SERVICE_CHECK_PREFIX = 'network_check'

    STATUS_TO_SERVICE_CHECK = {
        Status.UP: AgentCheck.OK,
        Status.WARNING: AgentCheck.WARNING,
        Status.CRITICAL: AgentCheck.CRITICAL,
        Status.DOWN: AgentCheck.CRITICAL,
    }

    """
    Services checks inherits from this class.
    This class should never be directly instanciated.

    Work flow:
        The main agent loop will call the check function for each instance for
        each iteration of the loop.
        The check method will make an asynchronous call to the _process method in
        one of the thread initiated in the thread pool created in this class constructor.
        The _process method will call the _check method of the inherited class
        which will perform the actual check.

        The _check method must return a tuple which first element is either
            Status.UP or Status.DOWN.
            The second element is a short error message that will be displayed
            when the service turns down.
    """

    def __init__(self, name, init_config, agentConfig, instances):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        # A dictionary to keep track of service statuses
        self.statuses = {}
        self.notified = {}
        self.nb_failures = 0
        self.pool_started = False

        # Make sure every instance has a name that we use as a unique key
        # to keep track of statuses
        names = []
        for inst in instances:
            name = inst.get('name', None)
            if not name:
                raise Exception("All instances should have a 'name' parameter,"
                                " error on instance: {0}".format(inst))
            if name in names:
                raise Exception(
                    "Duplicate names for instances with name {0}".format(
                        inst['name']))
            # Fix: record the name, otherwise the duplicate check above can
            # never trigger (names stayed empty)
            names.append(name)

    def stop(self):
        """ Stop the worker pool and mark it as not started. """
        self.stop_pool()
        self.pool_started = False

    def start_pool(self):
        # The pool size should be the minimum between the number of instances
        # and the DEFAULT_SIZE_POOL. It can also be overridden by the 'threads_count'
        # parameter in the init_config of the check
        self.log.info("Starting Thread Pool")
        default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
        self.pool_size = int(self.init_config.get('threads_count', default_size))

        self.pool = Pool(self.pool_size)

        self.resultsq = Queue()
        self.jobs_status = {}
        # AsyncResult objects, inspected in _clean_job to log worker exceptions
        self.jobs_results = {}
        self.pool_started = True

    def stop_pool(self):
        """ Terminate and join the worker pool, if one was started. """
        self.log.info("Stopping Thread Pool")
        if self.pool_started:
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            assert self.pool.get_nworkers() == 0

    def restart_pool(self):
        """ Tear the pool down and bring a fresh one up. """
        self.stop_pool()
        self.start_pool()

    def check(self, instance):
        """ Queue an asynchronous _process job for this instance, unless one
        is already running for it. """
        if not self.pool_started:
            self.start_pool()
        if threading.activeCount() > 5 * self.pool_size + 5:
            # On Windows the agent runs on multiple threads so we need to have an offset of 5 in case the pool_size is 1
            raise Exception("Thread number (%s) is exploding. Skipping this check" % threading.activeCount())
        self._process_results()
        self._clean()
        name = instance.get('name', None)
        if name is None:
            self.log.error('Each service check must have a name')
            return

        if name not in self.jobs_status:
            # A given instance should be processed one at a time
            self.jobs_status[name] = time.time()
            self.jobs_results[name] = self.pool.apply_async(self._process, args=(instance,))
        else:
            self.log.error("Instance: %s skipped because it's already running." % name)

    def _process(self, instance):
        """ Worker-side wrapper: run _check and push its result(s) on the
        results queue; on any error push the FAILURE sentinel (keeping the
        instance so the failed job can be cleaned). """
        try:
            statuses = self._check(instance)

            if isinstance(statuses, tuple):
                # Assume the check only returns one service check
                status, msg = statuses
                self.resultsq.put((status, msg, None, instance))

            elif isinstance(statuses, list):
                for status in statuses:
                    sc_name, status, msg = status
                    self.resultsq.put((status, msg, sc_name, instance))

        except Exception:
            result = (FAILURE, FAILURE, FAILURE, instance)
            self.resultsq.put(result)

    def _process_results(self):
        """ Drain the results queue (non blocking), report service checks,
        maintain the UP/DOWN event logic and release finished jobs. """
        for i in xrange(MAX_LOOP_ITERATIONS):
            try:
                # We want to fetch the result in a non blocking way
                status, msg, sc_name, instance = self.resultsq.get_nowait()
            except Empty:
                break

            instance_name = instance['name']
            if status == FAILURE:
                self.nb_failures += 1
                if self.nb_failures >= self.pool_size - 1:
                    self.nb_failures = 0
                    self.restart_pool()

                # clean failed job
                self._clean_job(instance_name)
                continue

            self.report_as_service_check(sc_name, status, instance, msg)

            # FIXME: 5.3, this has been deprecated before, get rid of events
            # Don't create any event to avoid duplicates with server side
            # service_checks
            skip_event = _is_affirmative(instance.get('skip_event', False))
            if not skip_event:
                self.warning("Using events for service checks is deprecated in favor of monitors and will be removed in future versions of the Datadog Agent.")
                event = None

                if instance_name not in self.statuses:
                    self.statuses[instance_name] = defaultdict(list)

                self.statuses[instance_name][sc_name].append(status)

                window = int(instance.get('window', 1))

                if window > 256:
                    self.log.warning("Maximum window size (256) exceeded, defaulting it to 256")
                    window = 256

                threshold = instance.get('threshold', 1)

                if len(self.statuses[instance_name][sc_name]) > window:
                    self.statuses[instance_name][sc_name].pop(0)

                nb_failures = self.statuses[instance_name][sc_name].count(Status.DOWN)

                if nb_failures >= threshold:
                    if self.notified.get((instance_name, sc_name), Status.UP) != Status.DOWN:
                        event = self._create_status_event(sc_name, status, msg, instance)
                        self.notified[(instance_name, sc_name)] = Status.DOWN
                else:
                    if self.notified.get((instance_name, sc_name), Status.UP) != Status.UP:
                        event = self._create_status_event(sc_name, status, msg, instance)
                        self.notified[(instance_name, sc_name)] = Status.UP

                if event is not None:
                    self.events.append(event)

            self._clean_job(instance_name)

    def _clean_job(self, instance_name):
        # The job is finished here, this instance can be re processed
        if instance_name in self.jobs_status:
            self.log.debug("Instance: %s cleaned from jobs status." % instance_name)
            del self.jobs_status[instance_name]

        # if an exception happened, log it
        if instance_name in self.jobs_results:
            self.log.debug("Instance: %s cleaned from jobs results." % instance_name)
            ret = self.jobs_results[instance_name].get()
            if isinstance(ret, Exception):
                self.log.exception("Exception in worker thread: {0}".format(ret))
            del self.jobs_results[instance_name]

    def _check(self, instance):
        """This function should be implemented by inherited classes"""
        raise NotImplementedError

    def _clean(self):
        """ Restart the pool when a job has been running longer than TIMEOUT. """
        now = time.time()
        for name, start_time in self.jobs_status.iteritems():
            if now - start_time > TIMEOUT:
                self.log.critical("Restarting Pool. One check is stuck: %s" % name)
                self.restart_pool()
                break
class NetworkCheck(AgentCheck):
    """
    Services checks inherits from this class.
    This class should never be directly instanciated.

    Work flow:
        The main agent loop will call the check function for each instance for
        each iteration of the loop.
        The check method will make an asynchronous call to the _process method in
        one of the thread initiated in the thread pool created in this class
        constructor.
        The _process method will call the _check method of the inherited class
        which will perform the actual check.

        The _check method must return a tuple which first element is either
            Status.UP or Status.DOWN.
            The second element is a short error message that will be displayed
            when the service turns down.
    """

    SOURCE_TYPE_NAME = 'servicecheck'
    SERVICE_CHECK_PREFIX = 'network_check'

    # Total number of worker threads expected across ALL NetworkCheck
    # subclasses; used by check() to detect thread leaks.
    _global_current_pool_size = 0

    STATUS_TO_SERVICE_CHECK = {
        Status.UP: AgentCheck.OK,
        Status.WARNING: AgentCheck.WARNING,
        Status.CRITICAL: AgentCheck.CRITICAL,
        Status.DOWN: AgentCheck.CRITICAL,
    }

    def __init__(self, name, init_config, agentConfig, instances):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        # A dictionary to keep track of service statuses
        self.statuses = {}
        self.notified = {}
        self.nb_failures = 0
        self.pool_size = 0
        self.pool_started = False

        # Make sure every instance has a name that we use as a unique key
        # to keep track of statuses
        names = []
        for inst in instances:
            inst_name = inst.get('name', None)
            if not inst_name:
                raise Exception("All instances should have a 'name' parameter,"
                                " error on instance: {0}".format(inst))
            if inst_name in names:
                raise Exception("Duplicate names for instances with name {0}"
                                .format(inst_name))
            names.append(inst_name)

    def stop(self):
        self.stop_pool()
        self.pool_started = False

    def start_pool(self):
        # The pool size should be the minimum between the number of instances
        # and the DEFAULT_SIZE_POOL. It can also be overridden by the
        # 'threads_count' parameter in the init_config of the check
        self.log.info("Starting Thread Pool")
        default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
        self.pool_size = int(self.init_config.get('threads_count', default_size))

        # To keep track on the total number of threads we should have running
        NetworkCheck._global_current_pool_size += self.pool_size

        self.pool = Pool(self.pool_size)

        self.resultsq = Queue()
        self.jobs_status = {}
        self.jobs_results = {}
        self.pool_started = True

    def stop_pool(self):
        self.log.info("Stopping Thread Pool")
        if self.pool_started:
            # Only give back the threads we actually started: the previous
            # version decremented unconditionally, so calling stop() twice
            # drove _global_current_pool_size negative and skewed the
            # thread-explosion guard in check().
            NetworkCheck._global_current_pool_size -= self.pool_size
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            assert self.pool.get_nworkers() == 0
        # Mark the pool as stopped so a second stop_pool() is a no-op.
        self.pool_started = False

    def restart_pool(self):
        self.stop_pool()
        self.start_pool()

    def check(self, instance):
        """Schedule the asynchronous processing of one instance.

        Guards against thread explosion, reaps finished/stuck jobs, and skips
        instances that are still being processed from a previous iteration.
        """
        if not self.pool_started:
            self.start_pool()
        if threading.activeCount() > 5 * NetworkCheck._global_current_pool_size + 6:
            # On Windows the agent runs on multiple threads because of WMI so we need an offset of 6
            raise Exception("Thread number (%s) is exploding. Skipping this check" % threading.activeCount())
        self._process_results()
        self._clean()
        name = instance.get('name', None)
        if name is None:
            self.log.error('Each service check must have a name')
            return

        if name not in self.jobs_status:
            # A given instance should be processed one at a time
            self.jobs_status[name] = time.time()
            self.jobs_results[name] = self.pool.apply_async(self._process, args=(instance,))
        else:
            self.log.error("Instance: %s skipped because it's already running." % name)

    def _process(self, instance):
        """Worker-thread entry point: run `_check` and queue its results."""
        try:
            statuses = self._check(instance)
            if isinstance(statuses, tuple):
                # Assume the check only returns one service check
                status, msg = statuses
                self.resultsq.put((status, msg, None, instance))
            elif isinstance(statuses, list):
                for status in statuses:
                    sc_name, status, msg = status
                    self.resultsq.put((status, msg, sc_name, instance))
        except Exception:
            self.log.exception(
                u"Failed to process instance '%s'.", instance.get('name', u"")
            )
            result = (FAILURE, FAILURE, FAILURE, instance)
            self.resultsq.put(result)

    def _process_results(self):
        """Drain the results queue (non-blocking) and report service checks."""
        for i in xrange(MAX_LOOP_ITERATIONS):
            try:
                # We want to fetch the result in a non blocking way
                status, msg, sc_name, instance = self.resultsq.get_nowait()
            except Empty:
                break

            instance_name = instance['name']
            if status == FAILURE:
                self.nb_failures += 1
                # If (almost) every worker failed, recycle the whole pool.
                if self.nb_failures >= self.pool_size - 1:
                    self.nb_failures = 0
                    self.restart_pool()

                # clean failed job
                self._clean_job(instance_name)
                continue

            self.report_as_service_check(sc_name, status, instance, msg)
            self._clean_job(instance_name)

    def _clean_job(self, instance_name):
        """Forget the bookkeeping for a finished job so the instance can be
        scheduled again, logging any exception stored in its async result."""
        # The job is finished here, this instance can be re processed
        if instance_name in self.jobs_status:
            self.log.debug("Instance: %s cleaned from jobs status." % instance_name)
            del self.jobs_status[instance_name]

        # if an exception happened, log it
        if instance_name in self.jobs_results:
            self.log.debug("Instance: %s cleaned from jobs results." % instance_name)
            ret = self.jobs_results[instance_name].get()
            if isinstance(ret, Exception):
                self.log.exception("Exception in worker thread: {0}".format(ret))
            del self.jobs_results[instance_name]

    def _check(self, instance):
        """This function should be implemented by inherited classes"""
        raise NotImplementedError

    def _clean(self):
        """Restart the pool if any scheduled job has exceeded TIMEOUT."""
        now = time.time()
        for name, start_time in self.jobs_status.iteritems():
            if now - start_time > TIMEOUT:
                self.log.critical("Restarting Pool. One check is stuck: %s" % name)
                self.restart_pool()
                break
class VSphereCheck(AgentCheck):
    """
    Get performance metrics from a vCenter server and upload them to Datadog
    References:
        http://pubs.vmware.com/vsphere-51/index.jsp#com.vmware.wssdk.apiref.doc/vim.PerformanceManager.html

    *_atomic jobs perform one single task asynchronously in the ThreadPool, we
    don't know exactly when they will finish, but we reap them if they're stuck.
    The other calls are performed synchronously.
    """

    SERVICE_CHECK_NAME = 'vcenter.can_connect'

    def __init__(self, name, init_config, agentConfig, instances):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
        self.time_started = time.time()
        self.pool_started = False
        # Exceptions raised by atomic jobs end up here (see check()).
        self.exceptionq = Queue()

        # Connections open to vCenter instances
        self.server_instances = {}

        # Event configuration
        self.event_config = {}

        # Caching resources, timeouts
        self.cache_times = {}
        for instance in self.instances:
            i_key = self._instance_key(instance)
            self.cache_times[i_key] = {
                MORLIST: {
                    LAST: 0,
                    INTERVAL: init_config.get('refresh_morlist_interval',
                                              REFRESH_MORLIST_INTERVAL)
                },
                METRICS_METADATA: {
                    LAST: 0,
                    INTERVAL: init_config.get('refresh_metrics_metadata_interval',
                                              REFRESH_METRICS_METADATA_INTERVAL)
                }
            }

            self.event_config[i_key] = instance.get('event_config')

        # managed entity raw view
        self.registry = {}
        # First layer of cache (get entities from the tree)
        self.morlist_raw = {}
        # Second layer, processed from the first one
        self.morlist = {}
        # Metrics metadata, basically perfCounterId -> {name, group, description}
        self.metrics_metadata = {}
        self.latest_event_query = {}

    def stop(self):
        self.stop_pool()

    def start_pool(self):
        self.log.info("Starting Thread Pool")
        self.pool_size = int(self.init_config.get('threads_count', DEFAULT_SIZE_POOL))
        self.pool = Pool(self.pool_size)
        self.pool_started = True
        self.jobs_status = {}

    def stop_pool(self):
        self.log.info("Stopping Thread Pool")
        if self.pool_started:
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            assert self.pool.get_nworkers() == 0
        self.pool_started = False

    def restart_pool(self):
        self.stop_pool()
        self.start_pool()

    def _clean(self):
        """Restart the pool if any scheduled job has exceeded JOB_TIMEOUT."""
        now = time.time()
        # TODO: use that
        for name in self.jobs_status.keys():
            start_time = self.jobs_status[name]
            if now - start_time > JOB_TIMEOUT:
                self.log.critical("Restarting Pool. One check is stuck.")
                self.restart_pool()
                break

    def _query_event(self, instance):
        """Fetch new vCenter events since the last query and forward the ones
        that survive `event_config` filtering as Datadog events."""
        i_key = self._instance_key(instance)
        last_time = self.latest_event_query.get(i_key)

        server_instance = self._get_server_instance(instance)
        event_manager = server_instance.content.eventManager

        # Be sure we don't duplicate any event, never query the "past"
        if not last_time:
            last_time = self.latest_event_query[i_key] = \
                event_manager.latestEvent.createdTime + timedelta(seconds=1)

        query_filter = vim.event.EventFilterSpec()
        time_filter = vim.event.EventFilterSpec.ByTime(beginTime=self.latest_event_query[i_key])
        query_filter.time = time_filter

        try:
            new_events = event_manager.QueryEvents(query_filter)
            self.log.debug("Got {0} events from vCenter event manager".format(len(new_events)))
            for event in new_events:
                normalized_event = VSphereEvent(event, self.event_config[i_key])
                # Can return None if the event if filtered out
                event_payload = normalized_event.get_datadog_payload()
                if event_payload is not None:
                    self.event(event_payload)
                last_time = event.createdTime + timedelta(seconds=1)
        except Exception as e:
            # Don't get stuck on a failure to fetch an event
            # Ignore them for next pass
            self.log.warning("Unable to fetch Events %s", e)
            last_time = event_manager.latestEvent.createdTime + timedelta(seconds=1)

        self.latest_event_query[i_key] = last_time

    def _instance_key(self, instance):
        """Return the unique key identifying this vCenter instance."""
        i_key = instance.get('name')
        if i_key is None:
            raise Exception("Must define a unique 'name' per vCenter instance")
        return i_key

    def _should_cache(self, instance, entity):
        """True if the cache for `entity` (MORLIST / METRICS_METADATA) has
        outlived its configured refresh interval."""
        i_key = self._instance_key(instance)
        now = time.time()
        return now - self.cache_times[i_key][entity][LAST] > self.cache_times[i_key][entity][INTERVAL]

    def _get_server_instance(self, instance):
        """Return a live (cached) connection to the vCenter server, emitting
        the `vcenter.can_connect` service check along the way.

        Raises if the connection cannot be established or has died.
        """
        i_key = self._instance_key(instance)

        service_check_tags = [
            'vcenter_server:{0}'.format(instance.get('name')),
            'vcenter_host:{0}'.format(instance.get('host')),
        ]

        # Check for ssl configs and generate an appropriate ssl context object
        ssl_verify = instance.get('ssl_verify', True)
        ssl_capath = instance.get('ssl_capath', None)
        if not ssl_verify:
            context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
            context.verify_mode = ssl.CERT_NONE
        elif ssl_capath:
            context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
            context.verify_mode = ssl.CERT_REQUIRED
            context.load_verify_locations(capath=ssl_capath)

        # If both configs are used, log a message explaining the default
        if not ssl_verify and ssl_capath:
            self.log.debug("Your configuration is incorrectly attempting to "
                           "specify both a CA path, and to disable SSL "
                           "verification. You cannot do both. Proceeding with "
                           "disabling ssl verification.")

        if i_key not in self.server_instances:
            try:
                # Object returned by SmartConnect is a ServerInstance
                # https://www.vmware.com/support/developer/vc-sdk/visdk2xpubs/ReferenceGuide/vim.ServiceInstance.html
                server_instance = connect.SmartConnect(
                    host=instance.get('host'),
                    user=instance.get('username'),
                    pwd=instance.get('password'),
                    # `context` is only defined when one of the branches above
                    # ran; the condition below mirrors those branches exactly.
                    sslContext=context if not ssl_verify or ssl_capath else None
                )
            except Exception as e:
                err_msg = "Connection to %s failed: %s" % (instance.get('host'), e)
                self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                                   tags=service_check_tags, message=err_msg)
                raise Exception(err_msg)

            self.server_instances[i_key] = server_instance

        # Test if the connection is working
        try:
            self.server_instances[i_key].RetrieveContent()
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK,
                               tags=service_check_tags)
        except Exception as e:
            err_msg = "Connection to %s died unexpectedly: %s" % (instance.get('host'), e)
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                               tags=service_check_tags, message=err_msg)
            raise Exception(err_msg)

        return self.server_instances[i_key]

    def _compute_needed_metrics(self, instance, available_metrics):
        """ Compare the available metrics for one MOR we have computed and intersect them
        with the set of metrics we want to report
        """
        if instance.get('all_metrics', False):
            return available_metrics

        i_key = self._instance_key(instance)
        wanted_metrics = []
        # Get only the basic metrics
        for metric in available_metrics:
            # No cache yet, skip it for now
            if (i_key not in self.metrics_metadata
                    or metric.counterId not in self.metrics_metadata[i_key]):
                continue
            if self.metrics_metadata[i_key][metric.counterId]['name'] in BASIC_METRICS:
                wanted_metrics.append(metric)

        return wanted_metrics

    def get_external_host_tags(self):
        """ Returns a list of tags for every host that is detected by the vSphere
        integration.
        List of pairs (hostname, list_of_tags)
        """
        self.log.debug(u"Sending external_host_tags now")
        external_host_tags = []
        for instance in self.instances:
            i_key = self._instance_key(instance)
            mor_by_mor_name = self.morlist.get(i_key)

            if not mor_by_mor_name:
                self.log.warning(
                    u"Unable to extract hosts' tags for `%s` vSphere instance."
                    u"Is the check failing on this instance?", instance
                )
                continue

            for mor in mor_by_mor_name.itervalues():
                if mor['hostname']:  # some mor's have a None hostname
                    external_host_tags.append((mor['hostname'], {SOURCE_TYPE: mor['tags']}))

        return external_host_tags

    def _discover_mor(self, instance, tags, regexes=None, include_only_marked=False):
        """
        Explore vCenter infrastructure to discover hosts, virtual machines
        and compute their associated tags.

        Start with the vCenter `rootFolder` and proceed recursively,
        queueing other such jobs for children nodes.

        Example topology:
            ```
            rootFolder
                - datacenter1
                    - compute_resource1 == cluster
                        - host1
                        - host2
                        - host3
                    - compute_resource2
                        - host5
                            - vm1
                            - vm2
            ```

        If it's a node we want to query metric for, queue it in `self.morlist_raw`
        that will be processed by another job.
        """
        def _get_parent_tags(mor):
            # Walk up the parent chain, collecting one vsphere_* tag per level.
            tags = []
            if mor.parent:
                tag = []
                if isinstance(mor.parent, vim.HostSystem):
                    tag.append(u'vsphere_host:{}'.format(mor.parent.name))
                elif isinstance(mor.parent, vim.Folder):
                    tag.append(u'vsphere_folder:{}'.format(mor.parent.name))
                elif isinstance(mor.parent, vim.ComputeResource):
                    if isinstance(mor.parent, vim.ClusterComputeResource):
                        tag.append(u'vsphere_cluster:{}'.format(mor.parent.name))
                    tag.append(u'vsphere_compute:{}'.format(mor.parent.name))
                elif isinstance(mor.parent, vim.Datacenter):
                    tag.append(u'vsphere_datacenter:{}'.format(mor.parent.name))

                tags = _get_parent_tags(mor.parent)
                if tag:
                    tags.extend(tag)

            return tags

        def _get_all_objs(content, vimtype, regexes=None, include_only_marked=False, tags=None):
            """
            Get all the vsphere objects associated with a given type
            """
            # Avoid the mutable-default-argument pitfall; callers always pass
            # `tags` explicitly, so this is purely defensive.
            if tags is None:
                tags = []
            obj_list = []
            container = content.viewManager.CreateContainerView(
                content.rootFolder, [RESOURCE_TYPE_MAP[vimtype]], True)
            for c in container.view:
                instance_tags = []
                if not self._is_excluded(c, regexes, include_only_marked):
                    hostname = c.name
                    if c.parent:
                        instance_tags += _get_parent_tags(c)

                    vsphere_type = None
                    if isinstance(c, vim.VirtualMachine):
                        vsphere_type = u'vsphere_type:vm'
                        # Powered-off VMs have no metrics to collect.
                        if c.runtime.powerState == vim.VirtualMachinePowerState.poweredOff:
                            continue
                        host = c.runtime.host.name
                        instance_tags.append(u'vsphere_host:{}'.format(host))
                    elif isinstance(c, vim.HostSystem):
                        vsphere_type = u'vsphere_type:host'
                    elif isinstance(c, vim.Datastore):
                        vsphere_type = u'vsphere_type:datastore'
                        instance_tags.append(u'vsphere_datastore:{}'.format(c.name))
                        hostname = None
                    elif isinstance(c, vim.Datacenter):
                        vsphere_type = u'vsphere_type:datacenter'
                        hostname = None

                    if vsphere_type:
                        instance_tags.append(vsphere_type)
                    obj_list.append(dict(mor_type=vimtype, mor=c, hostname=hostname,
                                         tags=tags + instance_tags))

            return obj_list

        # @atomic_method
        def build_resource_registry(instance, tags, regexes=None, include_only_marked=False):
            i_key = self._instance_key(instance)
            server_instance = self._get_server_instance(instance)
            if i_key not in self.morlist_raw:
                self.morlist_raw[i_key] = {}
            for resource in sorted(RESOURCE_TYPE_MAP):
                self.morlist_raw[i_key][resource] = _get_all_objs(
                    server_instance.RetrieveContent(),
                    resource,
                    regexes,
                    include_only_marked,
                    tags
                )

        # collect...
        self.pool.apply_async(
            build_resource_registry,
            args=(instance, tags, regexes, include_only_marked)
        )

    @staticmethod
    def _is_excluded(obj, regexes, include_only_marked):
        """
        Return `True` if the given host or virtual machine is excluded by the user configuration,
        i.e. violates any of the following rules:
        * Do not match the corresponding `*_include_only` regular expressions
        * Is "non-labeled" while `include_only_marked` is enabled (virtual machine only)
        """
        # Host
        if isinstance(obj, vim.HostSystem):
            # Based on `host_include_only_regex`
            if regexes and regexes.get('host_include') is not None:
                match = re.search(regexes['host_include'], obj.name)
                if not match:
                    return True

        # VirtualMachine
        elif isinstance(obj, vim.VirtualMachine):
            # Based on `vm_include_only_regex`
            if regexes and regexes.get('vm_include') is not None:
                match = re.search(regexes['vm_include'], obj.name)
                if not match:
                    return True

            # Based on `include_only_marked`
            if include_only_marked:
                monitored = False
                for field in obj.customValue:
                    if field.value == VM_MONITORING_FLAG:
                        monitored = True
                        break  # we shall monitor
                if not monitored:
                    return True

        return False

    def _cache_morlist_raw(self, instance):
        """
        Initiate the first layer to refresh the list of MORs (`self.morlist`).

        Resolve the vCenter `rootFolder` and initiate hosts and virtual machines
        discovery.
        """
        i_key = self._instance_key(instance)
        self.log.debug("Caching the morlist for vcenter instance %s" % i_key)
        for resource_type in RESOURCE_TYPE_MAP:
            # Don't kick off a new discovery while RAW results from the last
            # one are still being consumed by _cache_morlist_process.
            if i_key in self.morlist_raw and len(self.morlist_raw[i_key].get(resource_type, [])) > 0:
                self.log.debug(
                    "Skipping morlist collection now, RAW results "
                    "processing not over (latest refresh was {0}s ago)".format(
                        time.time() - self.cache_times[i_key][MORLIST][LAST])
                )
                return
        self.morlist_raw[i_key] = {}

        instance_tag = "vcenter_server:%s" % instance.get('name')
        regexes = {
            'host_include': instance.get('host_include_only_regex'),
            'vm_include': instance.get('vm_include_only_regex')
        }
        include_only_marked = _is_affirmative(instance.get('include_only_marked', False))

        # Discover hosts and virtual machines
        self._discover_mor(instance, [instance_tag], regexes, include_only_marked)

        self.cache_times[i_key][MORLIST][LAST] = time.time()

    @atomic_method
    def _cache_morlist_process_atomic(self, instance, mor):
        """ Process one item of the self.morlist_raw list by querying the available
        metrics for this MOR and then putting it in self.morlist
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        ### </TEST-INSTRUMENTATION>
        i_key = self._instance_key(instance)
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        self.log.debug(
            "job_atomic: Querying available metrics"
            " for MOR {0} (type={1})".format(mor['mor'], mor['mor_type'])
        )

        mor['interval'] = REAL_TIME_INTERVAL if mor['mor_type'] in REALTIME_RESOURCES else None

        available_metrics = perfManager.QueryAvailablePerfMetric(
            mor['mor'], intervalId=mor['interval'])

        mor['metrics'] = self._compute_needed_metrics(instance, available_metrics)

        mor_name = str(mor['mor'])
        if mor_name in self.morlist[i_key]:
            # Was already here last iteration
            self.morlist[i_key][mor_name]['metrics'] = mor['metrics']
        else:
            self.morlist[i_key][mor_name] = mor

        self.morlist[i_key][mor_name]['last_seen'] = time.time()

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.morlist_process_atomic.time', t.total())
        ### </TEST-INSTRUMENTATION>

    def _cache_morlist_process(self, instance):
        """ Empties the self.morlist_raw by popping items and running asynchronously
        the _cache_morlist_process_atomic operation that will get the available
        metrics for this MOR and put it in self.morlist
        """
        i_key = self._instance_key(instance)
        if i_key not in self.morlist:
            self.morlist[i_key] = {}

        batch_size = self.init_config.get('batch_morlist_size', BATCH_MORLIST_SIZE)

        processed = 0
        for resource_type in RESOURCE_TYPE_MAP:
            for i in xrange(batch_size):
                try:
                    mor = self.morlist_raw[i_key][resource_type].pop()
                    self.pool.apply_async(self._cache_morlist_process_atomic, args=(instance, mor))

                    processed += 1
                    if processed == batch_size:
                        break
                except (IndexError, KeyError):
                    self.log.debug("No more work to process in morlist_raw")
                    break

            if processed == batch_size:
                break
        return

    def _vacuum_morlist(self, instance):
        """ Check if self.morlist doesn't have some old MORs that are gone, ie
        we cannot get any metrics from them anyway (or =0)
        """
        i_key = self._instance_key(instance)
        morlist = self.morlist[i_key].items()

        for mor_name, mor in morlist:
            last_seen = mor['last_seen']
            if (time.time() - last_seen) > 2 * REFRESH_MORLIST_INTERVAL:
                del self.morlist[i_key][mor_name]

    def _cache_metrics_metadata(self, instance):
        """ Get from the server instance, all the performance counters metadata
        meaning name/group/description... attached with the corresponding ID
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        ### </TEST-INSTRUMENTATION>

        i_key = self._instance_key(instance)
        self.log.info("Warming metrics metadata cache for instance {0}".format(i_key))
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        new_metadata = {}
        for counter in perfManager.perfCounter:
            d = dict(
                name="%s.%s" % (counter.groupInfo.key, counter.nameInfo.key),
                unit=counter.unitInfo.key,
                instance_tag='instance'  # FIXME: replace by what we want to tag!
            )
            new_metadata[counter.key] = d
        self.cache_times[i_key][METRICS_METADATA][LAST] = time.time()

        self.log.info("Finished metadata collection for instance {0}".format(i_key))
        # Reset metadata
        self.metrics_metadata[i_key] = new_metadata

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.metric_metadata_collection.time', t.total())
        ### </TEST-INSTRUMENTATION>

    def _transform_value(self, instance, counter_id, value):
        """ Given the counter_id, look up for the metrics metadata to check the vsphere
        type of the counter and apply pre-reporting transformation if needed.
        """
        i_key = self._instance_key(instance)
        if counter_id in self.metrics_metadata[i_key]:
            unit = self.metrics_metadata[i_key][counter_id]['unit']
            if unit == 'percent':
                return float(value) / 100

        # Defaults to return the value without transformation
        return value

    @atomic_method
    def _collect_metrics_atomic(self, instance, mor):
        """ Task that collects the metrics listed in the morlist for one MOR
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        ### </TEST-INSTRUMENTATION>
        i_key = self._instance_key(instance)
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        query = vim.PerformanceManager.QuerySpec(maxSample=1,
                                                 entity=mor['mor'],
                                                 metricId=mor['metrics'],
                                                 intervalId=mor['interval'],
                                                 format='normal')
        results = perfManager.QueryPerf(querySpec=[query])
        if results:
            for result in results[0].value:
                if result.id.counterId not in self.metrics_metadata[i_key]:
                    self.log.debug("Skipping this metric value, because there is no metadata about it")
                    continue
                instance_name = result.id.instance or "none"
                value = self._transform_value(instance, result.id.counterId, result.value[0])

                # Metric types are absolute, delta, and rate
                metric_name = self.metrics_metadata[i_key][result.id.counterId]['name']

                if metric_name not in ALL_METRICS:
                    self.log.debug(u"Skipping unknown `%s` metric.", metric_name)
                    continue

                tags = ['instance:%s' % instance_name]
                if not mor['hostname']:  # no host tags available
                    tags.extend(mor['tags'])

                # vsphere "rates" should be submitted as gauges (rate is
                # precomputed).
                # BUGFIX: the computed `tags` list (which carries the MOR's
                # vsphere_* tags for hostname-less entities) was previously
                # discarded in favor of a re-hardcoded instance tag.
                self.gauge(
                    "vsphere.%s" % metric_name,
                    value,
                    hostname=mor['hostname'],
                    tags=tags
                )

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.metric_colection.time', t.total())
        ### </TEST-INSTRUMENTATION>

    def collect_metrics(self, instance):
        """ Calls asynchronously _collect_metrics_atomic on all MORs, as the
        job queue is processed the Aggregator will receive the metrics.
        """
        i_key = self._instance_key(instance)
        if i_key not in self.morlist:
            self.log.debug("Not collecting metrics for this instance, nothing to do yet: {0}".format(i_key))
            return

        mors = self.morlist[i_key].items()
        self.log.debug("Collecting metrics of %d mors" % len(mors))

        vm_count = 0

        for mor_name, mor in mors:
            if mor['mor_type'] == 'vm':
                vm_count += 1
            if 'metrics' not in mor or not mor['metrics']:
                # self.log.debug("Skipping entity %s collection because we didn't cache its metrics yet" % mor['hostname'])
                continue

            self.pool.apply_async(self._collect_metrics_atomic, args=(instance, mor))

        self.gauge('vsphere.vm.count', vm_count, tags=["vcenter_server:%s" % instance.get('name')])

    def check(self, instance):
        if not self.pool_started:
            self.start_pool()
        ### <TEST-INSTRUMENTATION>
        self.gauge('datadog.agent.vsphere.queue_size', self.pool._workq.qsize(), tags=['instant:initial'])
        ### </TEST-INSTRUMENTATION>

        # First part: make sure our object repository is neat & clean
        if self._should_cache(instance, METRICS_METADATA):
            self._cache_metrics_metadata(instance)

        if self._should_cache(instance, MORLIST):
            self._cache_morlist_raw(instance)
        self._cache_morlist_process(instance)
        self._vacuum_morlist(instance)

        # Second part: do the job
        self.collect_metrics(instance)
        self._query_event(instance)

        # For our own sanity
        self._clean()

        thread_crashed = False
        try:
            while True:
                self.log.critical(self.exceptionq.get_nowait())
                thread_crashed = True
        except Empty:
            pass

        if thread_crashed:
            self.stop_pool()
            raise Exception("One thread in the pool crashed, check the logs")

        ### <TEST-INSTRUMENTATION>
        self.gauge('datadog.agent.vsphere.queue_size', self.pool._workq.qsize(), tags=['instant:final'])
        ### </TEST-INSTRUMENTATION>
class NetworkCheck(AgentCheck):
    """Base class for asynchronous service checks.

    `check()` schedules `_process` in the thread pool; `_process` runs the
    subclass's `_check` and pushes results onto `resultsq`, which the next
    `check()` call drains via `_process_results`.
    """

    SOURCE_TYPE_NAME = 'servicecheck'
    SERVICE_CHECK_PREFIX = 'network_check'

    STATUS_TO_SERVICE_CHECK = {
        Status.UP: AgentCheck.OK,
        Status.WARNING: AgentCheck.WARNING,
        Status.CRITICAL: AgentCheck.CRITICAL,
        Status.DOWN: AgentCheck.CRITICAL,
    }

    def __init__(self, name, init_config, agentConfig, instances):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        # A dictionary to keep track of service statuses
        self.statuses = {}
        self.notified = {}
        self.nb_failures = 0
        # Initialized here so check()'s thread guard never hits an unset
        # attribute, even if the pool was never started.
        self.pool_size = 0
        self.pool_started = False

        # Make sure every instance has a name that we use as a unique key
        # to keep track of statuses
        names = []
        for inst in instances:
            inst_name = inst.get('name', None)
            if not inst_name:
                raise Exception("All instances should have a 'name' parameter,"
                                " error on instance: {0}".format(inst))
            if inst_name in names:
                raise Exception(
                    "Duplicate names for instances with name {0}".format(
                        inst['name']))
            # BUGFIX: the name was never recorded, so the duplicate check
            # above could never fire.
            names.append(inst_name)

    def stop(self):
        self.stop_pool()
        self.pool_started = False

    def start_pool(self):
        # Pool size: min(instance count, DEFAULT_SIZE_POOL), overridable via
        # the 'threads_count' init_config parameter.
        self.log.info("Starting Thread Pool")
        default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
        self.pool_size = int(self.init_config.get('threads_count', default_size))
        self.pool = Pool(self.pool_size)

        self.resultsq = Queue()
        self.jobs_status = {}
        self.jobs_results = {}
        self.pool_started = True

    def stop_pool(self):
        self.log.info("Stopping Thread Pool")
        if self.pool_started:
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            assert self.pool.get_nworkers() == 0

    def restart_pool(self):
        self.stop_pool()
        self.start_pool()

    def check(self, instance):
        """Schedule the asynchronous processing of one instance, reaping
        finished/stuck jobs first and skipping still-running instances."""
        if not self.pool_started:
            self.start_pool()
        if threading.activeCount() > 5 * self.pool_size + 5:
            raise Exception(
                "Thread number (%s) is exploding. Skipping this check" %
                threading.activeCount())
        self._process_results()
        self._clean()
        name = instance.get('name', None)
        if name is None:
            self.log.error('Each service check must have a name')
            return

        if name not in self.jobs_status:
            # A given instance should be processed one at a time
            self.jobs_status[name] = time.time()
            self.jobs_results[name] = self.pool.apply_async(self._process,
                                                            args=(instance, ))
        else:
            self.log.error(
                "Instance: %s skipped because it's already running." % name)

    def _process(self, instance):
        """Worker-thread entry point: run `_check` and queue its results."""
        try:
            statuses = self._check(instance)
            if isinstance(statuses, tuple):
                # Assume the check only returns one service check
                status, msg = statuses
                self.resultsq.put((status, msg, None, instance))
            elif isinstance(statuses, list):
                for status in statuses:
                    sc_name, status, msg = status
                    self.resultsq.put((status, msg, sc_name, instance))
        except Exception:
            # Log before signaling the failure instead of dropping the
            # traceback silently.
            self.log.exception(
                u"Failed to process instance '%s'.", instance.get('name', u"")
            )
            result = (FAILURE, FAILURE, FAILURE, instance)
            self.resultsq.put(result)

    def _process_results(self):
        """Drain the results queue (non-blocking), report service checks and
        maintain the deprecated event-based notification state."""
        for i in xrange(MAX_LOOP_ITERATIONS):
            try:
                # Fetch results without blocking the main thread.
                status, msg, sc_name, instance = self.resultsq.get_nowait()
            except Empty:
                break

            instance_name = instance['name']
            if status == FAILURE:
                self.nb_failures += 1
                # If (almost) every worker failed, recycle the whole pool.
                if self.nb_failures >= self.pool_size - 1:
                    self.nb_failures = 0
                    self.restart_pool()

                # clean failed job
                self._clean_job(instance_name)
                continue

            self.report_as_service_check(sc_name, status, instance, msg)

            skip_event = _is_affirmative(instance.get('skip_event', False))
            if not skip_event:
                self.warning("Using events for service checks is deprecated in favor of monitors and will be removed in future versions of the Monitor Agent.")
                event = None

                if instance_name not in self.statuses:
                    self.statuses[instance_name] = defaultdict(list)

                self.statuses[instance_name][sc_name].append(status)

                window = int(instance.get('window', 1))

                if window > 256:
                    self.log.warning("Maximum window size (256) exceeded, defaulting it to 256")
                    window = 256

                threshold = instance.get('threshold', 1)

                # Keep only the last `window` statuses per (instance, check).
                if len(self.statuses[instance_name][sc_name]) > window:
                    self.statuses[instance_name][sc_name].pop(0)

                nb_failures = self.statuses[instance_name][sc_name].count(Status.DOWN)

                # Emit a transition event only when the notified state flips.
                if nb_failures >= threshold:
                    if self.notified.get((instance_name, sc_name), Status.UP) != Status.DOWN:
                        event = self._create_status_event(sc_name, status, msg, instance)
                        self.notified[(instance_name, sc_name)] = Status.DOWN
                else:
                    if self.notified.get((instance_name, sc_name), Status.UP) != Status.UP:
                        event = self._create_status_event(sc_name, status, msg, instance)
                        self.notified[(instance_name, sc_name)] = Status.UP

                if event is not None:
                    self.events.append(event)

            self._clean_job(instance_name)

    def _clean_job(self, instance_name):
        """Forget the bookkeeping for a finished job so the instance can be
        scheduled again, logging any exception stored in its async result."""
        if instance_name in self.jobs_status:
            self.log.debug("Instance: %s cleaned from jobs status." % instance_name)
            del self.jobs_status[instance_name]
        if instance_name in self.jobs_results:
            self.log.debug("Instance: %s cleaned from jobs results." % instance_name)
            ret = self.jobs_results[instance_name].get()
            if isinstance(ret, Exception):
                self.log.exception(
                    "Exception in worker thread: {0}".format(ret))
            del self.jobs_results[instance_name]

    def _check(self, instance):
        """This function should be implemented by inherited classes"""
        raise NotImplementedError

    def _clean(self):
        """Restart the pool if any scheduled job has exceeded TIMEOUT."""
        now = time.time()
        for name, start_time in self.jobs_status.iteritems():
            if now - start_time > TIMEOUT:
                self.log.critical("Restarting Pool. One check is stuck: %s" % name)
                self.restart_pool()
                break