def refresh_metrics_metadata_cache(self):
        """Request the list of counters (metrics) from vSphere and store them in a cache.

        For each collected resource type, keeps only the counters whose metric name
        is allowed for that mor type and not excluded by the configured metric
        filters, and stores a mapping {counter key -> metric name} in the cache.
        """
        self.log.debug(
            "Refreshing the metrics metadata cache. Collecting all counters metadata for collection_level=%d",
            self.config.collection_level,
        )
        t0 = Timer()
        counters = self.api.get_perf_counter_by_level(
            self.config.collection_level)
        self.gauge(
            "datadog.vsphere.refresh_metrics_metadata_cache.time",
            t0.total(),
            tags=self.config.base_tags,
            raw=True,
            hostname=self._hostname,
        )
        self.log.debug("Collected %d counters metadata in %.3f seconds.",
                       len(counters), t0.total())

        # format_metric_name only depends on the counter itself, so compute it
        # once per counter instead of once per (resource type, counter) pair.
        named_counters = [(c, format_metric_name(c)) for c in counters]

        for mor_type in self.config.collected_resource_types:
            allowed_for_type = ALLOWED_METRICS_FOR_MOR[mor_type]
            metadata = {
                c.key: metric_name
                for c, metric_name in named_counters
                if metric_name in allowed_for_type
                and not is_metric_excluded_by_filters(
                    metric_name, mor_type, self.config.metric_filters)
            }
            self.metrics_metadata_cache.set_metadata(mor_type, metadata)
    def collect_events(self):
        """Fetch new events from the vCenter event manager, submit the ones that
        pass filtering as Datadog events, then advance the event-query watermark.

        Both the event fetch and the watermark refresh are best-effort: a vCenter
        failure is logged and retried on the next check run.
        """
        self.log.debug("Starting events collection.")
        try:
            t0 = Timer()
            new_events = self.api.get_new_events(
                start_time=self.latest_event_query)
            self.gauge(
                'datadog.vsphere.collect_events.time',
                t0.total(),
                tags=self.config.base_tags,
                raw=True,
                hostname=self._hostname,
            )
            self.log.debug("Got %s new events from the vCenter event manager",
                           len(new_events))
            event_config = {'collect_vcenter_alarms': True}
            for event in new_events:
                normalized_event = VSphereEvent(event, event_config,
                                                self.config.base_tags)
                # Can return None if the event is filtered out
                event_payload = normalized_event.get_datadog_payload()
                if event_payload is not None:
                    self.event(event_payload)
        except Exception as e:
            # Don't get stuck on a failure to fetch an event
            # Ignore them for next pass
            self.log.warning("Unable to fetch Events %s", e)

        try:
            # Bug fix: this call used to run unguarded; an API failure here would
            # propagate and abort the whole check run, contradicting the
            # best-effort intent of the except above. On failure, keep the
            # previous watermark and retry next pass.
            self.latest_event_query = self.api.get_latest_event_timestamp(
            ) + timedelta(seconds=1)
        except Exception as e:
            self.log.warning("Unable to fetch the latest event timestamp %s", e)
# Example #3
    def collect_tags(self, infrastructure_data):
        # type: (InfrastructureData) -> ResourceTags
        """Query the REST API for the tags attached to every monitored resource.

        Returns the tags keyed by mor, or an empty mapping when the REST client
        is unavailable or the query fails.
        """
        if not self.api_rest:
            return {}

        # Tag collection is cheaper on a reduced set of mors, so apply every
        # configured filter up-front — except tag-based ones, which cannot be
        # evaluated before the tags themselves are known.
        non_tag_filters = [flt for flt in self._config.resource_filters if not isinstance(flt, TagFilter)]
        collected_types = tuple(self._config.collected_resource_types)
        eligible_mors = [
            mor
            for mor, _ in iteritems(infrastructure_data)
            if isinstance(mor, collected_types)
            and is_resource_collected_by_filters(mor, infrastructure_data, non_tag_filters)
        ]

        timer = Timer()
        try:
            mor_tags = self.api_rest.get_resource_tags_for_mors(eligible_mors)
        except Exception as e:
            self.log.error("Failed to collect tags: %s", e)
            return {}

        self.gauge(
            'datadog.vsphere.query_tags.time',
            timer.total(),
            tags=self._config.base_tags,
            raw=True,
            hostname=self._hostname,
        )

        return mor_tags
# Example #4
    def _cache_metrics_metadata(self, instance):
        """Warm the metadata cache with every performance counter's name and unit,
        keyed by counter ID, and record the list of metric IDs to collect.
        """
        # ## <TEST-INSTRUMENTATION>
        timer = Timer()
        # ## </TEST-INSTRUMENTATION>

        i_key = self._instance_key(instance)
        self.metadata_cache.init_instance(i_key)
        self.log.info("Warming metrics metadata cache for instance %s", i_key)
        perf_manager = self._get_server_instance(instance).content.perfManager

        metadata = {}
        metric_ids = []
        if self.in_compatibility_mode(instance, log_warning=True):
            # Legacy behaviour: the set of metrics to collect comes from our constants.
            collect_everything = instance.get("all_metrics")
            for counter in perf_manager.perfCounter:
                name = self.format_metric_name(counter, compatibility=True)
                metadata[counter.key] = {'name': name, 'unit': counter.unitInfo.key}
                # Build the list of metrics we will want to collect
                if collect_everything or name in BASIC_METRICS:
                    metric_ids.append(vim.PerformanceManager.MetricId(counterId=counter.key, instance="*"))
        else:
            # New behaviour: collect every counter available at the configured level.
            level = instance.get("collection_level", 1)
            for counter in perf_manager.QueryPerfCounterByLevel(level):
                metadata[counter.key] = {
                    "name": self.format_metric_name(counter),
                    "unit": counter.unitInfo.key,
                }
                metric_ids.append(vim.PerformanceManager.MetricId(counterId=counter.key, instance="*"))

        self.log.info("Finished metadata collection for instance %s", i_key)
        # Replace any previously cached metadata wholesale.
        self.metadata_cache.set_metadata(i_key, metadata)
        self.metadata_cache.set_metric_ids(i_key, metric_ids)

        self.cache_config.set_last(CacheConfig.Metadata, i_key, time.time())

        # ## <TEST-INSTRUMENTATION>
        self.histogram(
            'datadog.agent.vsphere.metric_metadata_collection.time',
            timer.total(),
            tags=instance.get('tags', []) + ['instance:{}'.format(i_key)],
        )
 def query_metrics_wrapper(self, query_specs):
     """Instrumented pass-through to VSphereAPI.query_metrics.
     Warning: called in threads
     """
     timer = Timer()
     results = self.api.query_metrics(query_specs)
     self.histogram(
         'datadog.vsphere.query_metrics.time',
         timer.total(),
         tags=self.config.base_tags,
         raw=True,
     )
     return results
# Example #6
 def query_metrics_wrapper(self, query_specs):
     # type: (List[vim.PerformanceManager.QuerySpec]) -> List[vim.PerformanceManager.EntityMetricBase]
     """Thin wrapper around VSphereAPI.query_metrics that reports the query duration.
     Warning: called in threads
     """
     elapsed = Timer()
     response = self.api.query_metrics(query_specs)
     self.histogram(
         'datadog.vsphere.query_metrics.time', elapsed.total(), tags=self.config.base_tags, raw=True
     )
     return response
# Example #7
    def collect_events(self):
        # type: () -> None
        """Fetch new events from the vCenter event manager, submit the ones that
        pass filtering, then move the query watermark just past the newest event
        seen (or to the collection start time when nothing was reported).
        """
        self.log.debug("Starting events collection (query start time: %s).",
                       self.latest_event_query)
        latest_event_time = None
        collect_start_time = get_current_datetime()
        try:
            t0 = Timer()
            new_events = self.api.get_new_events(
                start_time=self.latest_event_query)
            self.gauge(
                'datadog.vsphere.collect_events.time',
                t0.total(),
                tags=self.config.base_tags,
                raw=True,
                hostname=self._hostname,
            )
            self.log.debug("Got %s new events from the vCenter event manager",
                           len(new_events))
            event_config = {'collect_vcenter_alarms': True}
            for event in new_events:
                self.log.debug("Processing event with id:%s, type:%s: msg:%s",
                               event.key, type(event),
                               event.fullFormattedMessage)
                # get_datadog_payload returns None when the event is filtered out.
                payload = VSphereEvent(event, event_config, self.config.base_tags).get_datadog_payload()
                if payload is not None:
                    self.log.debug("Submit event with id:%s, type:%s: msg:%s",
                                   event.key, type(event),
                                   event.fullFormattedMessage)
                    self.event(payload)
                # Track the newest creation time seen, filtered or not.
                if latest_event_time is None or event.createdTime > latest_event_time:
                    latest_event_time = event.createdTime
        except Exception as e:
            # Don't get stuck on a failure to fetch an event
            # Ignore them for next pass
            self.log.warning("Unable to fetch Events %s", e)

        if latest_event_time is None:
            # Safeguard in case no events are reported OR something bad happened
            # (which might happen again indefinitely): use the collection start.
            self.latest_event_query = collect_start_time
        else:
            self.latest_event_query = latest_event_time + dt.timedelta(seconds=1)
 def refresh_tags_cache(self):
     """Collect the tags of every monitored resource through the REST API and
     replace the content of the tags cache with them. No-op when the REST
     client is unavailable; a failed query is logged and leaves the cache untouched.
     """
     if not self.api_rest:
         return
     timer = Timer()
     try:
         all_tags = self.api_rest.get_resource_tags()
     except Exception as e:
         self.log.error("Failed to collect tags: %s", e)
         return
     self.gauge(
         'datadog.vsphere.query_tags.time',
         timer.total(),
         tags=self.config.base_tags,
         raw=True,
     )
     self.tags_cache.set_all_tags(all_tags)
# Example #9
    def _collect_metrics_async(self, instance, query_specs):
        """Task that collects the metrics listed in the morlist for one MOR.

        Runs one QueryPerf call for the whole `query_specs` batch, then submits
        one gauge per (counter, counter-instance) value. A value is skipped when
        its MOR was evicted from the cache, when no metadata exists for the
        counter, when (in compatibility mode) the metric is not in ALL_METRICS,
        or when vSphere returned no value for it.
        """
        # ## <TEST-INSTRUMENTATION>
        t = Timer()
        # ## </TEST-INSTRUMENTATION>
        i_key = self._instance_key(instance)
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager
        results = perfManager.QueryPerf(query_specs)
        if results:
            for mor_perfs in results:
                mor_name = str(mor_perfs.entity)
                try:
                    mor = self.mor_cache.get_mor(i_key, mor_name)
                except MorNotFoundError:
                    # The MOR was removed from the cache between listing and querying.
                    self.log.error(
                        "Trying to get metrics from object %s deleted from the cache, skipping. "
                        "Consider increasing the parameter `clean_morlist_interval` to avoid that",
                        mor_name,
                    )
                    continue

                for result in mor_perfs.value:
                    counter_id = result.id.counterId
                    # Without metadata we cannot even name the metric — skip.
                    if not self.metadata_cache.contains(i_key, counter_id):
                        self.log.debug(
                            "Skipping value for counter %s, because there is no metadata about it",
                            ensure_unicode(counter_id),
                        )
                        continue

                    # Metric types are absolute, delta, and rate
                    metric_name = self.metadata_cache.get_metadata(
                        i_key, result.id.counterId).get('name')

                    if self.in_compatibility_mode(instance):
                        if metric_name not in ALL_METRICS:
                            self.log.debug("Skipping unknown `%s` metric.",
                                           ensure_unicode(metric_name))
                            continue

                    if not result.value:
                        self.log.debug(
                            "Skipping `%s` metric because the value is empty",
                            ensure_unicode(metric_name))
                        continue

                    # "none" stands in for the aggregate, instance-less value.
                    instance_name = result.id.instance or "none"
                    value = self._transform_value(instance,
                                                  result.id.counterId,
                                                  result.value[0])

                    hostname = mor['hostname']

                    tags = [
                        'instance:{}'.format(ensure_unicode(instance_name))
                    ]
                    if not hostname:  # no host tags available
                        tags.extend(mor['tags'])
                    else:
                        hostname = to_string(hostname)

                    tags.extend(instance.get('tags', []))

                    # vsphere "rates" should be submitted as gauges (rate is
                    # precomputed).
                    self.gauge("vsphere.{}".format(
                        ensure_unicode(metric_name)),
                               value,
                               hostname=hostname,
                               tags=tags)

        # ## <TEST-INSTRUMENTATION>
        custom_tags = instance.get('tags', []) + ['instance:{}'.format(i_key)]
        # NOTE(review): "metric_colection" is misspelled, but renaming it would
        # change the emitted metric name — keep as-is.
        self.histogram('datadog.agent.vsphere.metric_colection.time',
                       t.total(),
                       tags=custom_tags)
# Example #10
    def refresh_infrastructure_cache(self):
        # type: () -> None
        """Fetch the complete infrastructure, generate tags for each monitored resources and store all of that
        into the infrastructure_cache. It also computes the resource `hostname` property to be used when submitting
        metrics for this mor."""
        self.log.debug("Refreshing the infrastructure cache...")
        t0 = Timer()
        infrastructure_data = self.api.get_infrastructure()
        self.gauge(
            "datadog.vsphere.refresh_infrastructure_cache.time",
            t0.total(),
            tags=self.config.base_tags,
            raw=True,
            hostname=self._hostname,
        )
        self.log.debug("Infrastructure cache refreshed in %.3f seconds.",
                       t0.total())
        self.log.debug("Infrastructure cache: %s", infrastructure_data)

        # Tags must be cached before the filtering loop below, because the
        # resource filters read them back via get_mor_tags.
        all_tags = {}
        if self.config.should_collect_tags:
            all_tags = self.collect_tags(infrastructure_data)
        self.infrastructure_cache.set_all_tags(all_tags)

        for mor, properties in iteritems(infrastructure_data):
            if not isinstance(mor, tuple(
                    self.config.collected_resource_types)):
                # Do nothing for the resource types we do not collect
                continue

            if not is_resource_collected_by_filters(
                    mor, infrastructure_data, self.config.resource_filters,
                    self.infrastructure_cache.get_mor_tags(mor)):
                # The resource does not match the specified whitelist/blacklist patterns.
                continue

            mor_name = to_string(properties.get("name", "unknown"))
            mor_type_str = MOR_TYPE_AS_STRING[type(mor)]
            hostname = None
            tags = []

            if isinstance(mor, vim.VirtualMachine):
                power_state = properties.get("runtime.powerState")
                if power_state != vim.VirtualMachinePowerState.poweredOn:
                    # Skipping because the VM is not powered on
                    # TODO: Sometimes VM are "poweredOn" but "disconnected" and thus have no metrics
                    self.log.debug("Skipping VM %s in state %s", mor_name,
                                   to_string(power_state))
                    continue

                # Hosts are not considered as parents of the VMs they run, we use the `runtime.host` property
                # to get the name of the ESXi host
                runtime_host = properties.get("runtime.host")
                runtime_host_props = infrastructure_data[
                    runtime_host] if runtime_host else {}
                runtime_hostname = to_string(
                    runtime_host_props.get("name", "unknown"))
                tags.append('vsphere_host:{}'.format(runtime_hostname))

                if self.config.use_guest_hostname:
                    hostname = properties.get("guest.hostName", mor_name)
                else:
                    hostname = mor_name
            elif isinstance(mor, vim.HostSystem):
                hostname = mor_name
            else:
                # Non-VM, non-host resources are identified by a tag rather
                # than a hostname.
                tags.append('vsphere_{}:{}'.format(mor_type_str, mor_name))

            tags.extend(get_parent_tags_recursively(mor, infrastructure_data))
            tags.append('vsphere_type:{}'.format(mor_type_str))

            # Attach tags from fetched attributes.
            tags.extend(properties.get('attributes', []))

            mor_payload = {"tags": tags}  # type: Dict[str, Any]

            # Only VMs and hosts get a hostname; metrics for other resource
            # types are submitted without one.
            if hostname:
                mor_payload['hostname'] = hostname

            self.infrastructure_cache.set_mor_props(mor, mor_payload)