Example #1
    def start_pool(self):
        self.log.info("Starting Thread Pool")
        self.pool_size = int(self.init_config.get('threads_count', DEFAULT_SIZE_POOL))

        self.pool = Pool(self.pool_size)
        self.pool_started = True
        self.jobs_status = {}
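
This first variant is the simplest: read an optional 'threads_count' from init_config, fall back to a module-level DEFAULT_SIZE_POOL, and build the pool. As a rough, self-contained sketch of the same lifecycle, using Python's multiprocessing.pool.ThreadPool as a stand-in for the agent's Pool class (DEFAULT_SIZE_POOL and init_config are assumed values here):

from multiprocessing.pool import ThreadPool

DEFAULT_SIZE_POOL = 6  # assumed default, mirroring the snippets

def run_job(name):
    return "ran %s" % name

init_config = {'threads_count': 4}
pool_size = int(init_config.get('threads_count', DEFAULT_SIZE_POOL))
pool = ThreadPool(pool_size)

result = pool.apply_async(run_job, args=('check-1',))
print(result.get(timeout=5))  # -> ran check-1

pool.close()  # graceful: let queued jobs finish (the agent uses terminate() to reap stuck ones)
pool.join()
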
Example #2
    def start_pool(self):
        # The pool size should be the minimum between the number of instances
        # and the DEFAULT_SIZE_POOL. It can also be overridden by the 'threads_count'
        # parameter in the init_config of the check
        default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
        pool_size = int(self.init_config.get('threads_count', default_size))

        self.pool = Pool(pool_size)

        self.resultsq = Queue()
        self.jobs_status = {}
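
Unlike Example #1, this version caps the default pool size at the number of instances, so a check with two instances does not spawn DEFAULT_SIZE_POOL idle threads. A minimal sketch of just the sizing rule (names assumed):

DEFAULT_SIZE_POOL = 6  # assumed default

def compute_pool_size(instance_count, init_config):
    # Default to one thread per instance, capped at DEFAULT_SIZE_POOL;
    # an explicit 'threads_count' in init_config overrides the default.
    default_size = min(instance_count, DEFAULT_SIZE_POOL)
    return int(init_config.get('threads_count', default_size))

print(compute_pool_size(2, {}))                      # -> 2
print(compute_pool_size(50, {}))                     # -> 6
print(compute_pool_size(50, {'threads_count': 10}))  # -> 10
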
Example #3
    def start_pool(self):

        self.log.info("Starting Thread Pool")
        default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
        self.pool_size = int(
            self.init_config.get('threads_count', default_size))

        self.pool = Pool(self.pool_size)

        self.resultsq = Queue()
        self.jobs_status = {}
        self.jobs_results = {}
        self.pool_started = True
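
The jobs_status mapping initialized here (instance name -> start timestamp) is what later lets a _clean() pass detect and reap stuck jobs, as seen in the larger examples below. A minimal sketch of that reaping logic, with JOB_TIMEOUT assumed:

import time

JOB_TIMEOUT = 30  # assumed timeout, in seconds
jobs_status = {'check-1': time.time() - 60}  # this job started 60s ago

def restart_pool():
    print("restarting pool")

def clean():
    now = time.time()
    for name, start_time in list(jobs_status.items()):
        if now - start_time > JOB_TIMEOUT:
            # The job exceeded its budget: assume it is stuck and
            # recycle the whole pool rather than leak a worker thread.
            restart_pool()
            break

clean()  # -> restarting pool
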
Example #4
    def start_pool(self):
        # The pool size should be the minimum between the number of instances
        # and the DEFAULT_SIZE_POOL. It can also be overridden by the 'threads_count'
        # parameter in the init_config of the check
        self.log.info("Starting Thread Pool")
        default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
        self.pool_size = int(self.init_config.get('threads_count', default_size))

        # To keep track of the total number of threads we should have running
        NetworkCheck._global_current_pool_size += self.pool_size

        self.pool = Pool(self.pool_size)

        self.resultsq = Queue()
        self.jobs_status = {}
        self.jobs_results = {}
        self.pool_started = True
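
This variant additionally accounts for threads at the class level: NetworkCheck._global_current_pool_size is shared by every check instance in the process, so it approximates the total number of worker threads the agent has started. A sketch of that accounting (PoolAccounting is a hypothetical stand-in):

class PoolAccounting(object):
    # Class attribute: shared by every instance in the process.
    _global_current_pool_size = 0

    def start_pool(self, pool_size):
        self.pool_size = pool_size
        PoolAccounting._global_current_pool_size += pool_size

    def stop_pool(self):
        PoolAccounting._global_current_pool_size -= self.pool_size

a, b = PoolAccounting(), PoolAccounting()
a.start_pool(4)
b.start_pool(2)
print(PoolAccounting._global_current_pool_size)  # -> 6
a.stop_pool()
print(PoolAccounting._global_current_pool_size)  # -> 2
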
Example #5
class VSphereCheck(AgentCheck):
    """ Get performance metrics from a vCenter server and upload them to Datadog
    References:
        http://pubs.vmware.com/vsphere-51/index.jsp#com.vmware.wssdk.apiref.doc/vim.PerformanceManager.html

    *_atomic jobs perform one single task asynchronously in the ThreadPool; we
    don't know exactly when they will finish, but we reap them if they're stuck.
    The other calls are performed synchronously.
    """

    SERVICE_CHECK_NAME = 'vcenter.can_connect'

    def __init__(self, name, init_config, agentConfig, instances):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
        self.time_started = time.time()
        self.pool_started = False
        self.exceptionq = Queue()

        # Connections open to vCenter instances
        self.server_instances = {}

        # Event configuration
        self.event_config = {}
        # Caching resources, timeouts
        self.cache_times = {}
        for instance in self.instances:
            i_key = self._instance_key(instance)
            self.cache_times[i_key] = {
                MORLIST: {
                    LAST: 0,
                    INTERVAL: init_config.get('refresh_morlist_interval',
                                              REFRESH_MORLIST_INTERVAL)
                },
                METRICS_METADATA: {
                    LAST: 0,
                    INTERVAL: init_config.get('refresh_metrics_metadata_interval',
                                              REFRESH_METRICS_METADATA_INTERVAL)
                }
            }

            self.event_config[i_key] = instance.get('event_config')

        # First layer of cache (get entities from the tree)
        self.morlist_raw = {}
        # Second layer, processed from the first one
        self.morlist = {}
        # Metrics metadata, basically perfCounterId -> {name, group, description}
        self.metrics_metadata = {}

        self.latest_event_query = {}

    def stop(self):
        self.stop_pool()

    def start_pool(self):
        self.log.info("Starting Thread Pool")
        self.pool_size = int(
            self.init_config.get('threads_count', DEFAULT_SIZE_POOL))

        self.pool = Pool(self.pool_size)
        self.pool_started = True
        self.jobs_status = {}

    def stop_pool(self):
        self.log.info("Stopping Thread Pool")
        if self.pool_started:
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            assert self.pool.get_nworkers() == 0
            self.pool_started = False

    def restart_pool(self):
        self.stop_pool()
        self.start_pool()

    def _clean(self):
        now = time.time()
        # TODO: use that
        for name in self.jobs_status.keys():
            start_time = self.jobs_status[name]
            if now - start_time > JOB_TIMEOUT:
                self.log.critical("Restarting Pool. One check is stuck.")
                self.restart_pool()
                break

    def _query_event(self, instance):
        i_key = self._instance_key(instance)
        last_time = self.latest_event_query.get(i_key)

        server_instance = self._get_server_instance(instance)
        event_manager = server_instance.content.eventManager

        # Be sure we don't duplicate any event, never query the "past"
        if not last_time:
            last_time = self.latest_event_query[i_key] = \
                event_manager.latestEvent.createdTime + timedelta(seconds=1)

        query_filter = vim.event.EventFilterSpec()
        time_filter = vim.event.EventFilterSpec.ByTime(
            beginTime=self.latest_event_query[i_key])
        query_filter.time = time_filter

        try:
            new_events = event_manager.QueryEvents(query_filter)
            self.log.debug("Got {0} events from vCenter event manager".format(
                len(new_events)))
            for event in new_events:
                normalized_event = VSphereEvent(event,
                                                self.event_config[i_key])
                # Can return None if the event is filtered out
                event_payload = normalized_event.get_datadog_payload()
                if event_payload is not None:
                    self.event(event_payload)
                last_time = event.createdTime + timedelta(seconds=1)
        except Exception as e:
            # Don't get stuck on a failure to fetch an event
            # Ignore them for next pass
            self.log.warning("Unable to fetch Events %s", e)
            last_time = event_manager.latestEvent.createdTime + timedelta(
                seconds=1)

        self.latest_event_query[i_key] = last_time

    def _instance_key(self, instance):
        i_key = instance.get('name')
        if i_key is None:
            raise Exception("Must define a unique 'name' per vCenter instance")
        return i_key

    def _should_cache(self, instance, entity):
        i_key = self._instance_key(instance)
        now = time.time()
        return now - self.cache_times[i_key][entity][LAST] > self.cache_times[
            i_key][entity][INTERVAL]

    def _get_server_instance(self, instance):
        i_key = self._instance_key(instance)

        service_check_tags = [
            'vcenter_server:{0}'.format(instance.get('name')),
            'vcenter_host:{0}'.format(instance.get('host')),
        ]

        # Check for ssl configs and generate an appropriate ssl context object
        ssl_verify = instance.get('ssl_verify', True)
        ssl_capath = instance.get('ssl_capath', None)
        if not ssl_verify:
            context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
            context.verify_mode = ssl.CERT_NONE
        elif ssl_capath:
            context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
            context.verify_mode = ssl.CERT_REQUIRED
            context.load_verify_locations(capath=ssl_capath)

        # If both configs are used, log a message explaining the default
        if not ssl_verify and ssl_capath:
            self.log.debug("Your configuration is incorrectly attempting to "
                           "specify both a CA path, and to disable SSL "
                           "verification. You cannot do both. Proceeding with "
                           "disabling ssl verification.")

        if i_key not in self.server_instances:
            try:
                # Object returned by SmartConnect is a ServerInstance
                #   https://www.vmware.com/support/developer/vc-sdk/visdk2xpubs/ReferenceGuide/vim.ServiceInstance.html
                server_instance = connect.SmartConnect(
                    host=instance.get('host'),
                    user=instance.get('username'),
                    pwd=instance.get('password'),
                    sslContext=context
                    if not ssl_verify or ssl_capath else None)
            except Exception as e:
                err_msg = "Connection to %s failed: %s" % (
                    instance.get('host'), e)
                self.service_check(self.SERVICE_CHECK_NAME,
                                   AgentCheck.CRITICAL,
                                   tags=service_check_tags,
                                   message=err_msg)
                raise Exception(err_msg)

            self.server_instances[i_key] = server_instance

        # Test if the connection is working
        try:
            self.server_instances[i_key].RetrieveContent()
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.OK,
                               tags=service_check_tags)
        except Exception as e:
            err_msg = "Connection to %s died unexpectedly: %s" % (
                instance.get('host'), e)
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.CRITICAL,
                               tags=service_check_tags,
                               message=err_msg)
            raise Exception(err_msg)

        return self.server_instances[i_key]

    def _compute_needed_metrics(self, instance, available_metrics):
        """ Compare the available metrics for one MOR we have computed and intersect them
        with the set of metrics we want to report
        """
        if instance.get('all_metrics', False):
            return available_metrics

        i_key = self._instance_key(instance)
        wanted_metrics = []
        # Get only the basic metrics
        for metric in available_metrics:
            # No cache yet, skip it for now
            if (i_key not in self.metrics_metadata
                    or metric.counterId not in self.metrics_metadata[i_key]):
                continue
            if self.metrics_metadata[i_key][
                    metric.counterId]['name'] in BASIC_METRICS:
                wanted_metrics.append(metric)

        return wanted_metrics

    def get_external_host_tags(self):
        """ Returns a list of tags for every host that is detected by the vSphere
        integration.
        List of pairs (hostname, list_of_tags)
        """
        self.log.info("Sending external_host_tags now")
        external_host_tags = []
        for instance in self.instances:
            i_key = self._instance_key(instance)
            mor_list = self.morlist[i_key].items()
            for mor_name, mor in mor_list:
                external_host_tags.append((mor['hostname'], {
                    SOURCE_TYPE: mor['tags']
                }))

        return external_host_tags

    @atomic_method
    def _cache_morlist_raw_atomic(self,
                                  i_key,
                                  obj_type,
                                  obj,
                                  tags,
                                  regexes=None,
                                  include_only_marked=False):
        """ Compute tags for a single node in the vCenter rootFolder
        and queue other such jobs for child nodes.
        Usual hierarchy:
        rootFolder
            - datacenter1
                - compute_resource1 == cluster
                    - host1
                    - host2
                    - host3
                - compute_resource2
                    - host5
                        - vm1
                        - vm2
        If it's a node we want to query metrics for, queue it in self.morlist_raw
        that will be processed by another job.
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        self.log.debug("job_atomic: Exploring MOR {0} (type={1})".format(
            obj, obj_type))
        ### </TEST-INSTRUMENTATION>
        tags_copy = deepcopy(tags)

        if obj_type == 'rootFolder':
            for datacenter in obj.childEntity:
                # Skip non-datacenter
                if not hasattr(datacenter, 'hostFolder'):
                    continue
                self.pool.apply_async(self._cache_morlist_raw_atomic,
                                      args=(i_key, 'datacenter', datacenter,
                                            tags_copy, regexes,
                                            include_only_marked))

        elif obj_type == 'datacenter':
            dc_tag = "vsphere_datacenter:%s" % obj.name
            tags_copy.append(dc_tag)
            for compute_resource in obj.hostFolder.childEntity:
                # Skip non-compute resource
                if not hasattr(compute_resource, 'host'):
                    continue
                self.pool.apply_async(self._cache_morlist_raw_atomic,
                                      args=(i_key, 'compute_resource',
                                            compute_resource, tags_copy,
                                            regexes, include_only_marked))

        elif obj_type == 'compute_resource':
            if obj.__class__ == vim.ClusterComputeResource:
                cluster_tag = "vsphere_cluster:%s" % obj.name
                tags_copy.append(cluster_tag)
            for host in obj.host:
                # Skip non-host
                if not hasattr(host, 'vm'):
                    continue
                self.pool.apply_async(self._cache_morlist_raw_atomic,
                                      args=(i_key, 'host', host, tags_copy,
                                            regexes, include_only_marked))

        elif obj_type == 'host':
            if regexes and regexes.get('host_include') is not None:
                match = re.search(regexes['host_include'], obj.name)
                if not match:
                    self.log.debug(
                        u"Filtered out VM {0} because of host_include_only_regex"
                        .format(obj.name))
                    return
            watched_mor = dict(mor_type='host',
                               mor=obj,
                               hostname=obj.name,
                               tags=tags_copy + ['vsphere_type:host'])
            self.morlist_raw[i_key].append(watched_mor)

            host_tag = "vsphere_host:%s" % obj.name
            tags_copy.append(host_tag)
            for vm in obj.vm:
                if vm.runtime.powerState != 'poweredOn':
                    continue
                self.pool.apply_async(self._cache_morlist_raw_atomic,
                                      args=(i_key, 'vm', vm, tags_copy,
                                            regexes, include_only_marked))

        elif obj_type == 'vm':
            if regexes and regexes.get('vm_include') is not None:
                match = re.search(regexes['vm_include'], obj.name)
                if not match:
                    self.log.debug(
                        u"Filtered out VM {0} because of vm_include_only_regex"
                        .format(obj.name))
                    return
            # Also, if include_only_marked is true, then check if there exists a
            # custom field with the value DatadogMonitored
            if include_only_marked:
                monitored = False
                for field in obj.customValue:
                    if field.value == VM_MONITORING_FLAG:
                        monitored = True
                        break  # we shall monitor
                if not monitored:
                    self.log.debug(
                        u"Filtered out VM {0} because of include_only_marked".
                        format(obj.name))
                    return

            watched_mor = dict(mor_type='vm',
                               mor=obj,
                               hostname=obj.name,
                               tags=tags_copy + ['vsphere_type:vm'])
            self.morlist_raw[i_key].append(watched_mor)

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.morlist_raw_atomic.time',
                       t.total())
        ### </TEST-INSTRUMENTATION>

    def _cache_morlist_raw(self, instance):
        """ Initiate the first layer to refresh self.morlist by queueing
        _cache_morlist_raw_atomic on the rootFolder in a recursive/async approach
        """

        i_key = self._instance_key(instance)
        self.log.debug("Caching the morlist for vcenter instance %s" % i_key)
        if i_key in self.morlist_raw and len(self.morlist_raw[i_key]) > 0:
            self.log.debug(
                "Skipping morlist collection now, RAW results "
                "processing not over (latest refresh was {0}s ago)".format(
                    time.time() - self.cache_times[i_key][MORLIST][LAST]))
            return
        self.morlist_raw[i_key] = []

        server_instance = self._get_server_instance(instance)
        root_folder = server_instance.content.rootFolder

        instance_tag = "vcenter_server:%s" % instance.get('name')
        regexes = {
            'host_include': instance.get('host_include_only_regex'),
            'vm_include': instance.get('vm_include_only_regex')
        }
        include_only_marked = _is_affirmative(
            instance.get('include_only_marked', False))
        self.pool.apply_async(self._cache_morlist_raw_atomic,
                              args=(i_key, 'rootFolder', root_folder,
                                    [instance_tag
                                     ], regexes, include_only_marked))
        self.cache_times[i_key][MORLIST][LAST] = time.time()

    @atomic_method
    def _cache_morlist_process_atomic(self, instance, mor):
        """ Process one item of the self.morlist_raw list by querying the available
        metrics for this MOR and then putting it in self.morlist
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        ### </TEST-INSTRUMENTATION>
        i_key = self._instance_key(instance)
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        self.log.debug("job_atomic: Querying available metrics"
                       " for MOR {0} (type={1})".format(
                           mor['mor'], mor['mor_type']))

        available_metrics = perfManager.QueryAvailablePerfMetric(
            mor['mor'], intervalId=REAL_TIME_INTERVAL)

        mor['metrics'] = self._compute_needed_metrics(instance,
                                                      available_metrics)
        mor_name = str(mor['mor'])

        if mor_name in self.morlist[i_key]:
            # Was already here last iteration
            self.morlist[i_key][mor_name]['metrics'] = mor['metrics']
        else:
            self.morlist[i_key][mor_name] = mor

        self.morlist[i_key][mor_name]['last_seen'] = time.time()

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.morlist_process_atomic.time',
                       t.total())
        ### </TEST-INSTRUMENTATION>

    def _cache_morlist_process(self, instance):
        """ Empties the self.morlist_raw by popping items and running asynchronously
        the _cache_morlist_process_atomic operation that will get the available
        metrics for this MOR and put it in self.morlist
        """
        i_key = self._instance_key(instance)
        if i_key not in self.morlist:
            self.morlist[i_key] = {}

        batch_size = self.init_config.get('batch_morlist_size',
                                          BATCH_MORLIST_SIZE)

        for i in xrange(batch_size):
            try:
                mor = self.morlist_raw[i_key].pop()
                self.pool.apply_async(self._cache_morlist_process_atomic,
                                      args=(instance, mor))
            except (IndexError, KeyError):
                self.log.debug("No more work to process in morlist_raw")
                return

    def _vacuum_morlist(self, instance):
        """ Check if self.morlist doesn't have some old MORs that are gone, ie
        we cannot get any metrics from them anyway (or =0)
        """
        i_key = self._instance_key(instance)
        morlist = self.morlist[i_key].items()

        for mor_name, mor in morlist:
            last_seen = mor['last_seen']
            if (time.time() - last_seen) > 2 * REFRESH_MORLIST_INTERVAL:
                del self.morlist[i_key][mor_name]

    def _cache_metrics_metadata(self, instance):
        """ Get from the server instance, all the performance counters metadata
        meaning name/group/description... attached with the corresponding ID
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        ### </TEST-INSTRUMENTATION>

        i_key = self._instance_key(instance)
        self.log.info(
            "Warming metrics metadata cache for instance {0}".format(i_key))
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        new_metadata = {}
        for counter in perfManager.perfCounter:
            d = dict(
                name="%s.%s" % (counter.groupInfo.key, counter.nameInfo.key),
                unit=counter.unitInfo.key,
                instance_tag='instance'  # FIXME: replace by what we want to tag!
            )
            new_metadata[counter.key] = d
        self.cache_times[i_key][METRICS_METADATA][LAST] = time.time()

        self.log.info(
            "Finished metadata collection for instance {0}".format(i_key))
        # Reset metadata
        self.metrics_metadata[i_key] = new_metadata

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.metric_metadata_collection.time',
                       t.total())
        ### </TEST-INSTRUMENTATION>

    def _transform_value(self, instance, counter_id, value):
        """ Given the counter_id, look up for the metrics metadata to check the vsphere
        type of the counter and apply pre-reporting transformation if needed.
        """
        i_key = self._instance_key(instance)
        if counter_id in self.metrics_metadata[i_key]:
            unit = self.metrics_metadata[i_key][counter_id]['unit']
            if unit == 'percent':
                return float(value) / 100

        # Defaults to return the value without transformation
        return value

    @atomic_method
    def _collect_metrics_atomic(self, instance, mor):
        """ Task that collects the metrics listed in the morlist for one MOR
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        ### </TEST-INSTRUMENTATION>

        i_key = self._instance_key(instance)
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager
        query = vim.PerformanceManager.QuerySpec(maxSample=1,
                                                 entity=mor['mor'],
                                                 metricId=mor['metrics'],
                                                 intervalId=20,
                                                 format='normal')
        results = perfManager.QueryPerf(querySpec=[query])
        if results:
            for result in results[0].value:
                if result.id.counterId not in self.metrics_metadata[i_key]:
                    self.log.debug(
                        "Skipping this metric value, because there is no metadata about it"
                    )
                    continue
                instance_name = result.id.instance or "none"
                value = self._transform_value(instance, result.id.counterId,
                                              result.value[0])

                # Metric types are absolute, delta, and rate
                if ALL_METRICS[self.metrics_metadata[i_key][
                        result.id.counterId]['name']]['s_type'] == 'rate':
                    record_metric = self.rate
                else:
                    record_metric = self.gauge
                record_metric(
                    "vsphere.%s" %
                    self.metrics_metadata[i_key][result.id.counterId]['name'],
                    value,
                    hostname=mor['hostname'],
                    tags=['instance:%s' % instance_name])

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.metric_colection.time',
                       t.total())
        ### </TEST-INSTRUMENTATION>

    def collect_metrics(self, instance):
        """ Calls asynchronously _collect_metrics_atomic on all MORs, as the
        job queue is processed the Aggregator will receive the metrics.
        """
        i_key = self._instance_key(instance)
        if i_key not in self.morlist:
            self.log.debug(
                "Not collecting metrics for this instance, nothing to do yet: {0}"
                .format(i_key))
            return

        mors = self.morlist[i_key].items()
        self.log.debug("Collecting metrics of %d mors" % len(mors))

        vm_count = 0

        for mor_name, mor in mors:
            if mor['mor_type'] == 'vm':
                vm_count += 1
            if 'metrics' not in mor:
                # self.log.debug("Skipping entity %s collection because we didn't cache its metrics yet" % mor['hostname'])
                continue

            self.pool.apply_async(self._collect_metrics_atomic,
                                  args=(instance, mor))

        self.gauge('vsphere.vm.count',
                   vm_count,
                   tags=["vcenter_server:%s" % instance.get('name')])

    def check(self, instance):
        if not self.pool_started:
            self.start_pool()
        ### <TEST-INSTRUMENTATION>
        self.gauge('datadog.agent.vsphere.queue_size',
                   self.pool._workq.qsize(),
                   tags=['instant:initial'])
        ### </TEST-INSTRUMENTATION>

        # First part: make sure our object repository is neat & clean
        if self._should_cache(instance, METRICS_METADATA):
            self._cache_metrics_metadata(instance)

        if self._should_cache(instance, MORLIST):
            self._cache_morlist_raw(instance)
        self._cache_morlist_process(instance)
        self._vacuum_morlist(instance)

        # Second part: do the job
        self.collect_metrics(instance)
        self._query_event(instance)

        # For our own sanity
        self._clean()

        thread_crashed = False
        try:
            while True:
                self.log.critical(self.exceptionq.get_nowait())
                thread_crashed = True
        except Empty:
            pass
        if thread_crashed:
            self.stop_pool()
            raise Exception("One thread in the pool crashed, check the logs")

        ### <TEST-INSTRUMENTATION>
        self.gauge('datadog.agent.vsphere.queue_size',
                   self.pool._workq.qsize(),
                   tags=['instant:final'])
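
Note how the end of check() surfaces worker crashes: the atomic jobs are expected to push their exceptions onto self.exceptionq, and the main thread drains that queue non-blockingly, logging everything and failing loudly if anything was found. A minimal sketch of the pattern (the queue import is hedged for Python 2, which these snippets target, and Python 3):

try:
    from Queue import Queue, Empty  # Python 2, as in the snippets above
except ImportError:
    from queue import Queue, Empty  # Python 3

exceptionq = Queue()

def worker():
    try:
        raise ValueError("boom")
    except Exception as e:
        exceptionq.put(e)  # never let a worker thread die silently

worker()  # in the real check this runs inside the pool

thread_crashed = False
try:
    while True:
        print(exceptionq.get_nowait())  # log every queued exception
        thread_crashed = True
except Empty:
    pass
if thread_crashed:
    raise Exception("One thread in the pool crashed, check the logs")
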
Example #6
class NetworkCheck(AgentCheck):
    SOURCE_TYPE_NAME = 'servicecheck'
    SERVICE_CHECK_PREFIX = 'network_check'

    STATUS_TO_SERVICE_CHECK = {
        Status.UP: AgentCheck.OK,
        Status.WARNING: AgentCheck.WARNING,
        Status.DOWN: AgentCheck.CRITICAL,
    }

    """
    Services checks inherits from this class.
    This class should never be directly instanciated.

    Work flow:
        The main agent loop will call the check function for each instance for
        each iteration of the loop.
        The check method will make an asynchronous call to the _process method in
        one of the thread initiated in the thread pool created in this class constructor.
        The _process method will call the _check method of the inherited class
        which will perform the actual check.

        The _check method must return a tuple which first element is either
            Status.UP or Status.DOWN.
            The second element is a short error message that will be displayed
            when the service turns down.

    """

    def __init__(self, name, init_config, agentConfig, instances):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        # A dictionary to keep track of service statuses
        self.statuses = {}
        self.notified = {}
        self.nb_failures = 0
        self.pool_started = False

        # Make sure every instance has a name that we use as a unique key
        # to keep track of statuses
        names = []
        for inst in instances:
            if 'name' not in inst:
                raise Exception("All instances should have a 'name' parameter,"
                                " error on instance: {0}".format(inst))
            if inst['name'] in names:
                raise Exception("Duplicate names for instances with name {0}"
                                .format(inst['name']))
            names.append(inst['name'])

    def stop(self):
        self.stop_pool()
        self.pool_started = False

    def start_pool(self):
        # The pool size should be the minimum between the number of instances
        # and the DEFAULT_SIZE_POOL. It can also be overridden by the 'threads_count'
        # parameter in the init_config of the check
        self.log.info("Starting Thread Pool")
        default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
        self.pool_size = int(self.init_config.get('threads_count', default_size))

        self.pool = Pool(self.pool_size)

        self.resultsq = Queue()
        self.jobs_status = {}
        self.pool_started = True

    def stop_pool(self):
        self.log.info("Stopping Thread Pool")
        if self.pool_started:
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            assert self.pool.get_nworkers() == 0

    def restart_pool(self):
        self.stop_pool()
        self.start_pool()

    def check(self, instance):
        if not self.pool_started:
            self.start_pool()
        # On Windows the agent runs on multiple threads, so we need an offset
        # of 5 in case the pool_size is 1
        if threading.activeCount() > 5 * self.pool_size + 5:
            raise Exception("Thread number (%s) is exploding. Skipping this check" % threading.activeCount())
        self._process_results()
        self._clean()
        name = instance.get('name', None)
        if name is None:
            self.log.error('Each service check must have a name')
            return

        if name not in self.jobs_status:
            # A given instance should be processed one at a time
            self.jobs_status[name] = time.time()
            self.pool.apply_async(self._process, args=(instance,))
        else:
            self.log.error("Instance: %s skipped because it's already running." % name)

    def _process(self, instance):
        try:
            statuses = self._check(instance)

            if isinstance(statuses, tuple):
                # Assume the check only returns one service check
                status, msg = statuses
                self.resultsq.put((status, msg, None, instance))

            elif isinstance(statuses, list):
                for status in statuses:
                    sc_name, status, msg = status
                    self.resultsq.put((status, msg, sc_name, instance))

        except Exception:
            result = (FAILURE, FAILURE, FAILURE, FAILURE)
            self.resultsq.put(result)

    def _process_results(self):
        for i in range(MAX_LOOP_ITERATIONS):
            try:
                # We want to fetch the result in a non blocking way
                status, msg, sc_name, instance = self.resultsq.get_nowait()
            except Empty:
                break

            if status == FAILURE:
                self.nb_failures += 1
                if self.nb_failures >= self.pool_size - 1:
                    self.nb_failures = 0
                    self.restart_pool()
                continue
            self.report_as_service_check(sc_name, status, instance, msg)

            # FIXME: 5.3, this has been deprecated before, get rid of events
            # Don't create any event to avoid duplicates with server side
            # service_checks
            skip_event = _is_affirmative(instance.get('skip_event', False))
            instance_name = instance['name']
            if not skip_event:
                self.warning("Using events for service checks is deprecated in favor of monitors and will be removed in future versions of the Datadog Agent.")
                event = None

                if instance_name not in self.statuses:
                    self.statuses[instance_name] = defaultdict(list)

                self.statuses[instance_name][sc_name].append(status)

                window = int(instance.get('window', 1))

                if window > 256:
                    self.log.warning("Maximum window size (256) exceeded, defaulting it to 256")
                    window = 256

                threshold = instance.get('threshold', 1)

                if len(self.statuses[instance_name][sc_name]) > window:
                    self.statuses[instance_name][sc_name].pop(0)

                nb_failures = self.statuses[instance_name][sc_name].count(Status.DOWN)

                if nb_failures >= threshold:
                    if self.notified.get((instance_name, sc_name), Status.UP) != Status.DOWN:
                        event = self._create_status_event(sc_name, status, msg, instance)
                        self.notified[(instance_name, sc_name)] = Status.DOWN
                else:
                    if self.notified.get((instance_name, sc_name), Status.UP) != Status.UP:
                        event = self._create_status_event(sc_name, status, msg, instance)
                        self.notified[(instance_name, sc_name)] = Status.UP

                if event is not None:
                    self.events.append(event)

            # The job is finished here, this instance can be reprocessed
            if instance_name in self.jobs_status:
                del self.jobs_status[instance_name]

    def _check(self, instance):
        """This function should be implemented by inherited classes"""
        raise NotImplementedError


    def _clean(self):
        now = time.time()
        for name in self.jobs_status.keys():
            start_time = self.jobs_status[name]
            if now - start_time > TIMEOUT:
                self.log.critical("Restarting Pool. One check is stuck: %s" % name)
                self.restart_pool()
                break
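
The deprecated event path above keeps, per (instance, service check), a sliding window of the last `window` statuses and only alerts once the number of Status.DOWN entries in that window reaches `threshold`. A stripped-down sketch of the windowing (string statuses stand in for the Status constants):

from collections import defaultdict

statuses = defaultdict(list)

def record(sc_name, status, window=3, threshold=2):
    history = statuses[sc_name]
    history.append(status)
    if len(history) > window:
        history.pop(0)  # keep only the last `window` results
    return history.count('DOWN') >= threshold  # True -> should notify DOWN

print(record('http_check', 'DOWN'))  # False: 1 failure < threshold
print(record('http_check', 'UP'))    # False
print(record('http_check', 'DOWN'))  # True: 2 of the last 3 are DOWN
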
Example #7
class VSphereCheck(AgentCheck):
    """ Get performance metrics from a vCenter server and upload them to Datadog
    References:
        http://pubs.vmware.com/vsphere-51/index.jsp#com.vmware.wssdk.apiref.doc/vim.PerformanceManager.html

    *_atomic jobs perform one single task asynchronously in the ThreadPool; we
    don't know exactly when they will finish, but we reap them if they're stuck.
    The other calls are performed synchronously.
    """

    SERVICE_CHECK_NAME = 'vcenter.can_connect'

    def __init__(self, name, init_config, agentConfig, instances):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
        self.time_started = time.time()
        self.pool_started = False
        self.exceptionq = Queue()

        # Connections open to vCenter instances
        self.server_instances = {}

        # Event configuration
        self.event_config = {}
        # Caching resources, timeouts
        self.cache_times = {}
        for instance in self.instances:
            i_key = self._instance_key(instance)
            self.cache_times[i_key] = {
                MORLIST: {
                    LAST: 0,
                    INTERVAL: init_config.get('refresh_morlist_interval',
                                    REFRESH_MORLIST_INTERVAL)
                },
                METRICS_METADATA: {
                    LAST: 0,
                    INTERVAL: init_config.get('refresh_metrics_metadata_interval',
                                    REFRESH_METRICS_METADATA_INTERVAL)
                }
            }

            self.event_config[i_key] = instance.get('event_config')

        # First layer of cache (get entities from the tree)
        self.morlist_raw = {}
        # Second layer, processed from the first one
        self.morlist = {}
        # Metrics metadata, basically perfCounterId -> {name, group, description}
        self.metrics_metadata = {}

        self.latest_event_query = {}

    def stop(self):
        self.stop_pool()

    def start_pool(self):
        self.log.info("Starting Thread Pool")
        self.pool_size = int(self.init_config.get('threads_count', DEFAULT_SIZE_POOL))

        self.pool = Pool(self.pool_size)
        self.pool_started = True
        self.jobs_status = {}

    def stop_pool(self):
        self.log.info("Stopping Thread Pool")
        if self.pool_started:
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            assert self.pool.get_nworkers() == 0
            self.pool_started = False

    def restart_pool(self):
        self.stop_pool()
        self.start_pool()

    def _clean(self):
        now = time.time()
        # TODO: use that
        for name in self.jobs_status.keys():
            start_time = self.jobs_status[name]
            if now - start_time > JOB_TIMEOUT:
                self.log.critical("Restarting Pool. One check is stuck.")
                self.restart_pool()
                break

    def _query_event(self, instance):
        i_key = self._instance_key(instance)
        last_time = self.latest_event_query.get(i_key)

        server_instance = self._get_server_instance(instance)
        event_manager = server_instance.content.eventManager

        # Be sure we don't duplicate any event, never query the "past"
        if not last_time:
            last_time = self.latest_event_query[i_key] = \
                event_manager.latestEvent.createdTime + timedelta(seconds=1)

        query_filter = vim.event.EventFilterSpec()
        time_filter = vim.event.EventFilterSpec.ByTime(beginTime=self.latest_event_query[i_key])
        query_filter.time = time_filter

        try:
            new_events = event_manager.QueryEvents(query_filter)
            self.log.debug("Got {0} events from vCenter event manager".format(len(new_events)))
            for event in new_events:
                normalized_event = VSphereEvent(event, self.event_config[i_key])
                # Can return None if the event is filtered out
                event_payload = normalized_event.get_datadog_payload()
                if event_payload is not None:
                    self.event(event_payload)
                last_time = event.createdTime + timedelta(seconds=1)
        except Exception as e:
            # Don't get stuck on a failure to fetch an event
            # Ignore them for next pass
            self.log.warning("Unable to fetch Events %s", e)
            last_time = event_manager.latestEvent.createdTime + timedelta(seconds=1)

        self.latest_event_query[i_key] = last_time

    def _instance_key(self, instance):
        i_key = instance.get('name')
        if i_key is None:
            raise Exception("Must define a unique 'name' per vCenter instance")
        return i_key

    def _should_cache(self, instance, entity):
        i_key = self._instance_key(instance)
        now = time.time()
        return now - self.cache_times[i_key][entity][LAST] > self.cache_times[i_key][entity][INTERVAL]

    def _get_server_instance(self, instance):
        i_key = self._instance_key(instance)

        service_check_tags = [
            'vcenter_server:{0}'.format(instance.get('name')),
            'vcenter_host:{0}'.format(instance.get('host')),
        ]

        # Check for ssl configs and generate an appropriate ssl context object
        ssl_verify = instance.get('ssl_verify', True)
        ssl_capath = instance.get('ssl_capath', None)
        if not ssl_verify:
            context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
            context.verify_mode = ssl.CERT_NONE
        elif ssl_capath:
            context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
            context.verify_mode = ssl.CERT_REQUIRED
            context.load_verify_locations(capath=ssl_capath)

        # If both configs are used, log a message explaining the default
        if not ssl_verify and ssl_capath:
            self.log.debug("Your configuration is incorrectly attempting to "
                           "specify both a CA path, and to disable SSL "
                           "verification. You cannot do both. Proceeding with "
                           "disabling ssl verification.")

        if i_key not in self.server_instances:
            try:
                server_instance = connect.SmartConnect(
                    host=instance.get('host'),
                    user=instance.get('username'),
                    pwd=instance.get('password'),
                    sslContext=context if not ssl_verify or ssl_capath else None
                )
            except Exception as e:
                err_msg = "Connection to %s failed: %s" % (instance.get('host'), e)
                self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                        tags=service_check_tags, message=err_msg)
                raise Exception(err_msg)

            self.server_instances[i_key] = server_instance

        # Test if the connection is working
        try:
            self.server_instances[i_key].RetrieveContent()
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK,
                    tags=service_check_tags)
        except Exception as e:
            err_msg = "Connection to %s died unexpectedly: %s" % (instance.get('host'), e)
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                    tags=service_check_tags, message=err_msg)
            raise Exception(err_msg)

        return self.server_instances[i_key]

    def _compute_needed_metrics(self, instance, available_metrics):
        """ Compare the available metrics for one MOR we have computed and intersect them
        with the set of metrics we want to report
        """
        if instance.get('all_metrics', False):
            return available_metrics

        i_key = self._instance_key(instance)
        wanted_metrics = []
        # Get only the basic metrics
        for metric in available_metrics:
            # No cache yet, skip it for now
            if (i_key not in self.metrics_metadata
                    or metric.counterId not in self.metrics_metadata[i_key]):
                continue
            if self.metrics_metadata[i_key][metric.counterId]['name'] in BASIC_METRICS:
                wanted_metrics.append(metric)

        return wanted_metrics


    def get_external_host_tags(self):
        """ Returns a list of tags for every host that is detected by the vSphere
        integration.
        List of pairs (hostname, list_of_tags)
        """
        self.log.info("Sending external_host_tags now")
        external_host_tags = []
        for instance in self.instances:
            i_key = self._instance_key(instance)
            mor_list = self.morlist[i_key].items()
            for mor_name, mor in mor_list:
                external_host_tags.append((mor['hostname'], {SOURCE_TYPE: mor['tags']}))

        return external_host_tags

    @atomic_method
    def _cache_morlist_raw_atomic(self, i_key, obj_type, obj, tags, regexes=None):
        """ Compute tags for a single node in the vCenter rootFolder
        and queue other such jobs for child nodes.
        Usual hierarchy:
        rootFolder
            - datacenter1
                - compute_resource1 == cluster
                    - host1
                    - host2
                    - host3
                - compute_resource2
                    - host5
                        - vm1
                        - vm2
        If it's a node we want to query metrics for, queue it in self.morlist_raw
        that will be processed by another job.
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        self.log.debug("job_atomic: Exploring MOR {0} (type={1})".format(obj, obj_type))
        ### </TEST-INSTRUMENTATION>
        tags_copy = deepcopy(tags)

        if obj_type == 'rootFolder':
            for datacenter in obj.childEntity:
                # Skip non-datacenter
                if not hasattr(datacenter, 'hostFolder'):
                    continue
                self.pool.apply_async(
                    self._cache_morlist_raw_atomic,
                    args=(i_key, 'datacenter', datacenter, tags_copy, regexes)
                )

        elif obj_type == 'datacenter':
            dc_tag = "vsphere_datacenter:%s" % obj.name
            tags_copy.append(dc_tag)
            for compute_resource in obj.hostFolder.childEntity:
                # Skip non-compute resource
                if not hasattr(compute_resource, 'host'):
                    continue
                self.pool.apply_async(
                    self._cache_morlist_raw_atomic,
                    args=(i_key, 'compute_resource', compute_resource, tags_copy, regexes)
                )

        elif obj_type == 'compute_resource':
            if obj.__class__ == vim.ClusterComputeResource:
                cluster_tag = "vsphere_cluster:%s" % obj.name
                tags_copy.append(cluster_tag)
            for host in obj.host:
                # Skip non-host
                if not hasattr(host, 'vm'):
                    continue
                self.pool.apply_async(
                    self._cache_morlist_raw_atomic,
                    args=(i_key, 'host', host, tags_copy, regexes)
                )

        elif obj_type == 'host':
            if regexes and regexes.get('host_include') is not None:
                match = re.search(regexes['host_include'], obj.name)
                if not match:
                    self.log.debug(u"Filtered out VM {0} because of host_include_only_regex".format(obj.name))
                    return
            watched_mor = dict(mor_type='host', mor=obj, hostname=obj.name, tags=tags_copy+['vsphere_type:host'])
            self.morlist_raw[i_key].append(watched_mor)

            host_tag = "vsphere_host:%s" % obj.name
            tags_copy.append(host_tag)
            for vm in obj.vm:
                if vm.runtime.powerState != 'poweredOn':
                    continue
                self.pool.apply_async(
                    self._cache_morlist_raw_atomic,
                    args=(i_key, 'vm', vm, tags_copy, regexes)
                )

        elif obj_type == 'vm':
            if regexes and regexes.get('vm_include') is not None:
                match = re.search(regexes['vm_include'], obj.name)
                if not match:
                    self.log.debug(u"Filtered out VM {0} because of vm_include_only_regex".format(obj.name))
                    return
            watched_mor = dict(mor_type='vm', mor=obj, hostname=obj.name, tags=tags_copy+['vsphere_type:vm'])
            self.morlist_raw[i_key].append(watched_mor)

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.morlist_raw_atomic.time', t.total())
        ### </TEST-INSTRUMENTATION>

    def _cache_morlist_raw(self, instance):
        """ Initiate the first layer to refresh self.morlist by queueing
        _cache_morlist_raw_atomic on the rootFolder in a recursive/async approach
        """

        i_key = self._instance_key(instance)
        self.log.debug("Caching the morlist for vcenter instance %s" % i_key)
        if i_key in self.morlist_raw and len(self.morlist_raw[i_key]) > 0:
            self.log.debug(
                "Skipping morlist collection now, RAW results "
                "processing not over (latest refresh was {0}s ago)".format(
                    time.time() - self.cache_times[i_key][MORLIST][LAST])
            )
            return
        self.morlist_raw[i_key] = []

        server_instance = self._get_server_instance(instance)
        root_folder = server_instance.content.rootFolder

        instance_tag = "vcenter_server:%s" % instance.get('name')
        regexes = {
            'host_include': instance.get('host_include_only_regex'),
            'vm_include': instance.get('vm_include_only_regex')
        }
        self.pool.apply_async(
            self._cache_morlist_raw_atomic,
            args=(i_key, 'rootFolder', root_folder, [instance_tag], regexes)
        )
        self.cache_times[i_key][MORLIST][LAST] = time.time()

    @atomic_method
    def _cache_morlist_process_atomic(self, instance, mor):
        """ Process one item of the self.morlist_raw list by querying the available
        metrics for this MOR and then putting it in self.morlist
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        ### </TEST-INSTRUMENTATION>
        i_key = self._instance_key(instance)
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        self.log.debug(
            "job_atomic: Querying available metrics"
            " for MOR {0} (type={1})".format(mor['mor'], mor['mor_type'])
        )

        available_metrics = perfManager.QueryAvailablePerfMetric(
            mor['mor'], intervalId=REAL_TIME_INTERVAL)

        mor['metrics'] = self._compute_needed_metrics(instance, available_metrics)
        mor_name = str(mor['mor'])

        if mor_name in self.morlist[i_key]:
            # Was already here last iteration
            self.morlist[i_key][mor_name]['metrics'] = mor['metrics']
        else:
            self.morlist[i_key][mor_name] = mor

        self.morlist[i_key][mor_name]['last_seen'] = time.time()

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.morlist_process_atomic.time', t.total())
        ### </TEST-INSTRUMENTATION>

    def _cache_morlist_process(self, instance):
        """ Empties the self.morlist_raw by popping items and running asynchronously
        the _cache_morlist_process_atomic operation that will get the available
        metrics for this MOR and put it in self.morlist
        """
        i_key = self._instance_key(instance)
        if i_key not in self.morlist:
            self.morlist[i_key] = {}

        batch_size = self.init_config.get('batch_morlist_size', BATCH_MORLIST_SIZE)

        for i in xrange(batch_size):
            try:
                mor = self.morlist_raw[i_key].pop()
                self.pool.apply_async(self._cache_morlist_process_atomic, args=(instance, mor))
            except (IndexError, KeyError):
                self.log.debug("No more work to process in morlist_raw")
                return

    def _vacuum_morlist(self, instance):
        """ Check if self.morlist doesn't have some old MORs that are gone, ie
        we cannot get any metrics from them anyway (or =0)
        """
        i_key = self._instance_key(instance)
        morlist = self.morlist[i_key].items()

        for mor_name, mor in morlist:
            last_seen = mor['last_seen']
            if (time.time() - last_seen) > 2 * REFRESH_MORLIST_INTERVAL:
                del self.morlist[i_key][mor_name]

    def _cache_metrics_metadata(self, instance):
        """ Get from the server instance, all the performance counters metadata
        meaning name/group/description... attached with the corresponding ID
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        ### </TEST-INSTRUMENTATION>

        i_key = self._instance_key(instance)
        self.log.info("Warming metrics metadata cache for instance {0}".format(i_key))
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        new_metadata = {}
        for counter in perfManager.perfCounter:
            d = dict(
                name="%s.%s" % (counter.groupInfo.key, counter.nameInfo.key),
                unit=counter.unitInfo.key,
                instance_tag='instance'  # FIXME: replace by what we want to tag!
            )
            new_metadata[counter.key] = d
        self.cache_times[i_key][METRICS_METADATA][LAST] = time.time()

        self.log.info("Finished metadata collection for instance {0}".format(i_key))
        # Reset metadata
        self.metrics_metadata[i_key] = new_metadata

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.metric_metadata_collection.time', t.total())
        ### </TEST-INSTRUMENTATION>

    def _transform_value(self, instance, counter_id, value):
        """ Given the counter_id, look up for the metrics metadata to check the vsphere
        type of the counter and apply pre-reporting transformation if needed.
        """
        i_key = self._instance_key(instance)
        if counter_id in self.metrics_metadata[i_key]:
            unit = self.metrics_metadata[i_key][counter_id]['unit']
            if unit == 'percent':
                return float(value) / 100

        # Defaults to return the value without transformation
        return value

    @atomic_method
    def _collect_metrics_atomic(self, instance, mor):
        """ Task that collects the metrics listed in the morlist for one MOR
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        ### </TEST-INSTRUMENTATION>

        i_key = self._instance_key(instance)
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager
        query = vim.PerformanceManager.QuerySpec(maxSample=1,
                                                 entity=mor['mor'],
                                                 metricId=mor['metrics'],
                                                 intervalId=20,
                                                 format='normal')
        results = perfManager.QueryPerf(querySpec=[query])
        if results:
            for result in results[0].value:
                if result.id.counterId not in self.metrics_metadata[i_key]:
                    self.log.debug("Skipping this metric value, because there is no metadata about it")
                    continue
                instance_name = result.id.instance or "none"
                value = self._transform_value(instance, result.id.counterId, result.value[0])

                # Metric types are absolute, delta, and rate
                if ALL_METRICS[self.metrics_metadata[i_key][result.id.counterId]['name']]['s_type'] == 'rate':
                    record_metric = self.rate
                else:
                    record_metric = self.gauge
                record_metric(
                    "vsphere.%s" % self.metrics_metadata[i_key][result.id.counterId]['name'],
                    value,
                    hostname=mor['hostname'],
                    tags=['instance:%s' % instance_name]
                )

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.metric_collection.time', t.total())
        ### </TEST-INSTRUMENTATION>

    def collect_metrics(self, instance):
        """ Calls asynchronously _collect_metrics_atomic on all MORs, as the
        job queue is processed the Aggregator will receive the metrics.
        """
        i_key = self._instance_key(instance)
        if i_key not in self.morlist:
            self.log.debug("Not collecting metrics for this instance, nothing to do yet: {0}".format(i_key))
            return

        mors = self.morlist[i_key].items()
        self.log.debug("Collecting metrics of %d mors" % len(mors))

        vm_count = 0

        for mor_name, mor in mors:
            if mor['mor_type'] == 'vm':
                vm_count += 1
            if 'metrics' not in mor:
                # self.log.debug("Skipping entity %s collection because we didn't cache its metrics yet" % mor['hostname'])
                continue

            self.pool.apply_async(self._collect_metrics_atomic, args=(instance, mor))

        self.gauge('vsphere.vm.count', vm_count, tags=["vcenter_server:%s" % instance.get('name')])

    def check(self, instance):
        if not self.pool_started:
            self.start_pool()
        ### <TEST-INSTRUMENTATION>
        self.gauge('datadog.agent.vsphere.queue_size', self.pool._workq.qsize(), tags=['instant:initial'])
        ### </TEST-INSTRUMENTATION>

        # First part: make sure our object repository is neat & clean
        if self._should_cache(instance, METRICS_METADATA):
            self._cache_metrics_metadata(instance)

        if self._should_cache(instance, MORLIST):
            self._cache_morlist_raw(instance)
        self._cache_morlist_process(instance)
        self._vacuum_morlist(instance)

        # Second part: do the job
        self.collect_metrics(instance)
        self._query_event(instance)

        # For our own sanity
        self._clean()

        thread_crashed = False
        try:
            while True:
                self.log.critical(self.exceptionq.get_nowait())
                thread_crashed = True
        except Empty:
            pass
        if thread_crashed:
            self.stop_pool()
            raise Exception("One thread in the pool crashed, check the logs")

        ### <TEST-INSTRUMENTATION>
        self.gauge('datadog.agent.vsphere.queue_size', self.pool._workq.qsize(), tags=['instant:final'])
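
The @atomic_method decorator used by the *_atomic jobs above is not shown in
this listing. A minimal sketch, assuming only that it must trap worker-thread
exceptions and push them onto self.exceptionq (which check() drains with
get_nowait above), could look like this; the real decorator may differ:

import traceback
from functools import wraps

def atomic_method(method):
    """ Trap any exception raised by the decorated job and push the formatted
    traceback onto the check's exceptionq, so the main thread can log it and
    restart the pool. (Hypothetical reconstruction, not the actual decorator.)
    """
    @wraps(method)
    def wrapper(self, *args, **kwargs):
        try:
            method(self, *args, **kwargs)
        except Exception:
            self.exceptionq.put("A worker thread crashed:\n" + traceback.format_exc())
    return wrapper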
Ejemplo n.º 10
0
class NetworkCheck(AgentCheck):
    SOURCE_TYPE_NAME = 'servicecheck'
    SERVICE_CHECK_PREFIX = 'network_check'

    STATUS_TO_SERVICE_CHECK = {
        Status.UP: AgentCheck.OK,
        Status.WARNING: AgentCheck.WARNING,
        Status.CRITICAL: AgentCheck.CRITICAL,
        Status.DOWN: AgentCheck.CRITICAL,
    }
    """
    Service checks inherit from this class.
    This class should never be instantiated directly.

    Work flow:
        The main agent loop calls the check function for each instance on
        each iteration of the loop.
        The check method makes an asynchronous call to the _process method in
        one of the threads of the thread pool created in this class's constructor.
        The _process method calls the _check method of the inheriting class,
        which performs the actual check.

        The _check method must return a tuple whose first element is either
            Status.UP or Status.DOWN.
            The second element is a short error message that will be displayed
            when the service goes down.

    """
    def __init__(self, name, init_config, agentConfig, instances):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        # A dictionary to keep track of service statuses
        self.statuses = {}
        self.notified = {}
        self.nb_failures = 0
        self.pool_started = False

        # Make sure every instance has a name that we use as a unique key
        # to keep track of statuses
        names = []
        for inst in instances:
            name = inst.get('name', None)
            if not name:
                raise Exception("All instances should have a 'name' parameter,"
                                " error on instance: {0}".format(inst))
            if name in names:
                raise Exception(
                    "Duplicate names for instances with name {0}".format(
                        inst['name']))
            names.append(name)

    def stop(self):
        self.stop_pool()
        self.pool_started = False

    def start_pool(self):
        # The pool size should be the minimum between the number of instances
        # and the DEFAULT_SIZE_POOL. It can also be overridden by the 'threads_count'
        # parameter in the init_config of the check
        self.log.info("Starting Thread Pool")
        default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
        self.pool_size = int(
            self.init_config.get('threads_count', default_size))

        self.pool = Pool(self.pool_size)

        self.resultsq = Queue()
        self.jobs_status = {}
        self.jobs_results = {}
        self.pool_started = True

    def stop_pool(self):
        self.log.info("Stopping Thread Pool")
        if self.pool_started:
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            assert self.pool.get_nworkers() == 0

    def restart_pool(self):
        self.stop_pool()
        self.start_pool()

    def check(self, instance):
        if not self.pool_started:
            self.start_pool()
        # On Windows the agent runs on multiple threads, so we need an offset
        # of 5 in case the pool_size is 1
        if threading.activeCount() > 5 * self.pool_size + 5:
            raise Exception(
                "Thread number (%s) is exploding. Skipping this check" %
                threading.activeCount())
        self._process_results()
        self._clean()
        name = instance.get('name', None)
        if name is None:
            self.log.error('Each service check must have a name')
            return

        if name not in self.jobs_status:
            # A given instance should be processed one at a time
            self.jobs_status[name] = time.time()
            self.jobs_results[name] = self.pool.apply_async(self._process,
                                                            args=(instance, ))
        else:
            self.log.error(
                "Instance: %s skipped because it's already running." % name)

    def _process(self, instance):
        try:
            statuses = self._check(instance)

            if isinstance(statuses, tuple):
                # Assume the check only returns one service check
                status, msg = statuses
                self.resultsq.put((status, msg, None, instance))

            elif isinstance(statuses, list):
                for status in statuses:
                    sc_name, status, msg = status
                    self.resultsq.put((status, msg, sc_name, instance))

        except Exception:
            result = (FAILURE, FAILURE, FAILURE, instance)
            self.resultsq.put(result)

    def _process_results(self):
        for i in xrange(MAX_LOOP_ITERATIONS):
            try:
                # We want to fetch the result in a non blocking way
                status, msg, sc_name, instance = self.resultsq.get_nowait()
            except Empty:
                break

            instance_name = instance['name']
            if status == FAILURE:
                self.nb_failures += 1
                if self.nb_failures >= self.pool_size - 1:
                    self.nb_failures = 0
                    self.restart_pool()

                # clean failed job
                self._clean_job(instance_name)
                continue

            self.report_as_service_check(sc_name, status, instance, msg)

            # FIXME: 5.3, this has been deprecated before, get rid of events
            # Don't create any event to avoid duplicates with server side
            # service_checks
            skip_event = _is_affirmative(instance.get('skip_event', False))
            if not skip_event:
                self.warning(
                    "Using events for service checks is deprecated in favor of monitors and will be removed in future versions of the Datadog Agent."
                )
                event = None

                if instance_name not in self.statuses:
                    self.statuses[instance_name] = defaultdict(list)

                self.statuses[instance_name][sc_name].append(status)

                window = int(instance.get('window', 1))

                if window > 256:
                    self.log.warning(
                        "Maximum window size (256) exceeded, defaulting it to 256"
                    )
                    window = 256

                threshold = instance.get('threshold', 1)

                if len(self.statuses[instance_name][sc_name]) > window:
                    self.statuses[instance_name][sc_name].pop(0)

                nb_failures = self.statuses[instance_name][sc_name].count(
                    Status.DOWN)

                if nb_failures >= threshold:
                    if self.notified.get(
                        (instance_name, sc_name), Status.UP) != Status.DOWN:
                        event = self._create_status_event(
                            sc_name, status, msg, instance)
                        self.notified[(instance_name, sc_name)] = Status.DOWN
                else:
                    if self.notified.get(
                        (instance_name, sc_name), Status.UP) != Status.UP:
                        event = self._create_status_event(
                            sc_name, status, msg, instance)
                        self.notified[(instance_name, sc_name)] = Status.UP

                if event is not None:
                    self.events.append(event)

            self._clean_job(instance_name)
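
    # Worked example of the window/threshold logic above (hypothetical values):
    # with window=3 and threshold=2, an alert event is created as soon as 2 of
    # the last 3 statuses are Status.DOWN, and a recovery event is created when
    # the DOWN count drops below the threshold again:
    #
    #   UP, DOWN, DOWN   -> alert event    (2 of last 3 are DOWN)
    #   DOWN, DOWN, UP   -> no new event   (still considered DOWN)
    #   DOWN, UP, UP     -> recovery event (DOWN count < threshold)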

    def _clean_job(self, instance_name):
        # The job is finished here, this instance can be re processed
        if instance_name in self.jobs_status:
            self.log.debug("Instance: %s cleaned from jobs status." %
                           instance_name)
            del self.jobs_status[instance_name]

        # if an exception happened, log it
        if instance_name in self.jobs_results:
            self.log.debug("Instance: %s cleaned from jobs results." %
                           instance_name)
            ret = self.jobs_results[instance_name].get()
            if isinstance(ret, Exception):
                self.log.exception(
                    "Exception in worker thread: {0}".format(ret))
            del self.jobs_results[instance_name]

    def _check(self, instance):
        """This function should be implemented by inherited classes"""
        raise NotImplementedError

    def _clean(self):
        now = time.time()
        for name, start_time in self.jobs_status.iteritems():
            if now - start_time > TIMEOUT:
                self.log.critical("Restarting Pool. One check is stuck: %s" %
                                  name)
                self.restart_pool()
                break
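
To make the workflow described in the class docstring concrete, here is a
minimal sketch of a subclass; the 'host' and 'port' instance parameters are
assumptions for illustration, not part of the base class contract:

import socket

class TCPCheck(NetworkCheck):
    """ Toy service check: reports UP if a TCP connection can be opened.
    (Illustrative sketch only.) """

    def _check(self, instance):
        host = instance.get('host')
        port = int(instance.get('port', 80))
        try:
            sock = socket.create_connection((host, port), timeout=5)
            sock.close()
            return Status.UP, "UP"
        except Exception as e:
            return Status.DOWN, str(e)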
Ejemplo n.º 11
0
class NetworkCheck(AgentCheck):
    SOURCE_TYPE_NAME = 'servicecheck'
    SERVICE_CHECK_PREFIX = 'network_check'

    STATUS_TO_SERVICE_CHECK = {
        Status.UP: AgentCheck.OK,
        Status.DOWN: AgentCheck.CRITICAL
    }
    """
    Service checks inherit from this class.
    This class should never be instantiated directly.

    Work flow:
        The main agent loop calls the check function for each instance on
        each iteration of the loop.
        The check method makes an asynchronous call to the _process method in
        one of the threads of the thread pool created in this class's constructor.
        The _process method calls the _check method of the inheriting class,
        which performs the actual check.

        The _check method must return a tuple whose first element is either
            Status.UP or Status.DOWN.
            The second element is a short error message that will be displayed
            when the service goes down.

    """
    def __init__(self, name, init_config, agentConfig, instances):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        # A dictionary to keep track of service statuses
        self.statuses = {}
        self.notified = {}
        self.nb_failures = 0
        self.pool_started = False

    def stop(self):
        self.stop_pool()
        self.pool_started = False

    def start_pool(self):
        # The pool size should be the minimum between the number of instances
        # and the DEFAULT_SIZE_POOL. It can also be overridden by the 'threads_count'
        # parameter in the init_config of the check
        self.log.info("Starting Thread Pool")
        default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
        self.pool_size = int(
            self.init_config.get('threads_count', default_size))

        self.pool = Pool(self.pool_size)

        self.resultsq = Queue()
        self.jobs_status = {}
        self.pool_started = True

    def stop_pool(self):
        self.log.info("Stopping Thread Pool")
        if self.pool_started:
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            assert self.pool.get_nworkers() == 0

    def restart_pool(self):
        self.stop_pool()
        self.start_pool()

    def check(self, instance):
        if not self.pool_started:
            self.start_pool()
        # On Windows the agent runs on multiple threads, so we need an offset
        # of 5 in case the pool_size is 1
        if threading.activeCount() > 5 * self.pool_size + 5:
            raise Exception(
                "Thread number (%s) is exploding. Skipping this check" %
                threading.activeCount())
        self._process_results()
        self._clean()
        name = instance.get('name', None)
        if name is None:
            self.log.error('Each service check must have a name')
            return

        if name not in self.jobs_status:
            # A given instance should be processed one at a time
            self.jobs_status[name] = time.time()
            self.pool.apply_async(self._process, args=(instance, ))
        else:
            self.log.error(
                "Instance: %s skipped because it's already running." % name)

    def _process(self, instance):
        name = instance.get('name', None)

        try:
            status, msg = self._check(instance)

            result = (status, msg, name, instance)
            # We put the results in the result queue
            self.resultsq.put(result)

        except Exception:
            result = (FAILURE, FAILURE, FAILURE, FAILURE)
            self.resultsq.put(result)
Ejemplo n.º 12
0
class NetworkCheck(AgentCheck):
    SOURCE_TYPE_NAME = 'servicecheck'
    SERVICE_CHECK_PREFIX = 'network_check'
    _global_current_pool_size = 0

    STATUS_TO_SERVICE_CHECK = {
        Status.UP : AgentCheck.OK,
        Status.WARNING : AgentCheck.WARNING,
        Status.CRITICAL : AgentCheck.CRITICAL,
        Status.DOWN : AgentCheck.CRITICAL,
    }

    """
    Service checks inherit from this class.
    This class should never be instantiated directly.

    Work flow:
        The main agent loop calls the check function for each instance on
        each iteration of the loop.
        The check method makes an asynchronous call to the _process method in
        one of the threads of the thread pool created in this class's constructor.
        The _process method calls the _check method of the inheriting class,
        which performs the actual check.

        The _check method must return a tuple whose first element is either
            Status.UP or Status.DOWN.
            The second element is a short error message that will be displayed
            when the service goes down.

    """

    def __init__(self, name, init_config, agentConfig, instances):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        # A dictionary to keep track of service statuses
        self.statuses = {}
        self.notified = {}
        self.nb_failures = 0
        self.pool_size = 0
        self.pool_started = False

        # Make sure every instance has a name that we use as a unique key
        # to keep track of statuses
        names = []
        for inst in instances:
            inst_name = inst.get('name', None)
            if not inst_name:
                raise Exception("All instances should have a 'name' parameter,"
                                " error on instance: {0}".format(inst))
            if inst_name in names:
                raise Exception("Duplicate names for instances with name {0}"
                                .format(inst_name))
            names.append(inst_name)

    def stop(self):
        self.stop_pool()
        self.pool_started = False

    def start_pool(self):
        # The pool size should be the minimum between the number of instances
        # and the DEFAULT_SIZE_POOL. It can also be overridden by the 'threads_count'
        # parameter in the init_config of the check
        self.log.info("Starting Thread Pool")
        default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
        self.pool_size = int(self.init_config.get('threads_count', default_size))

        # To keep track on the total number of threads we should have running
        NetworkCheck._global_current_pool_size += self.pool_size

        self.pool = Pool(self.pool_size)

        self.resultsq = Queue()
        self.jobs_status = {}
        self.jobs_results = {}
        self.pool_started = True

    def stop_pool(self):
        self.log.info("Stopping Thread Pool")

        # To keep track on the total number of threads we should have running
        NetworkCheck._global_current_pool_size -= self.pool_size

        if self.pool_started:
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            assert self.pool.get_nworkers() == 0

    def restart_pool(self):
        self.stop_pool()
        self.start_pool()

    def check(self, instance):
        if not self.pool_started:
            self.start_pool()
        if threading.activeCount() > 5 * NetworkCheck._global_current_pool_size + 6:
            # On Windows the agent runs on multiple threads because of WMI so we need an offset of 6
            raise Exception("Thread number (%s) is exploding. Skipping this check" % threading.activeCount())
        self._process_results()
        self._clean()
        name = instance.get('name', None)
        if name is None:
            self.log.error('Each service check must have a name')
            return

        if name not in self.jobs_status:
            # A given instance should be processed one at a time
            self.jobs_status[name] = time.time()
            self.jobs_results[name] = self.pool.apply_async(self._process, args=(instance,))
        else:
            self.log.error("Instance: %s skipped because it's already running." % name)

    def _process(self, instance):
        try:
            statuses = self._check(instance)

            if isinstance(statuses, tuple):
                # Assume the check only returns one service check
                status, msg = statuses
                self.resultsq.put((status, msg, None, instance))

            elif isinstance(statuses, list):
                for status in statuses:
                    sc_name, status, msg = status
                    self.resultsq.put((status, msg, sc_name, instance))

        except Exception:
            self.log.exception(
                u"Failed to process instance '%s'.", instance.get('name', u"")
            )
            result = (FAILURE, FAILURE, FAILURE, instance)
            self.resultsq.put(result)

    def _process_results(self):
        for i in xrange(MAX_LOOP_ITERATIONS):
            try:
                # We want to fetch the result in a non blocking way
                status, msg, sc_name, instance = self.resultsq.get_nowait()
            except Empty:
                break

            instance_name = instance['name']
            if status == FAILURE:
                self.nb_failures += 1
                if self.nb_failures >= self.pool_size - 1:
                    self.nb_failures = 0
                    self.restart_pool()

                # clean failed job
                self._clean_job(instance_name)
                continue

            self.report_as_service_check(sc_name, status, instance, msg)
            self._clean_job(instance_name)

    def _clean_job(self, instance_name):
        # The job is finished here, this instance can be re processed
        if instance_name in self.jobs_status:
            self.log.debug("Instance: %s cleaned from jobs status." % instance_name)
            del self.jobs_status[instance_name]

        # if an exception happened, log it
        if instance_name in self.jobs_results:
            self.log.debug("Instance: %s cleaned from jobs results." % instance_name)
            ret = self.jobs_results[instance_name].get()
            if isinstance(ret, Exception):
                self.log.exception("Exception in worker thread: {0}".format(ret))
            del self.jobs_results[instance_name]


    def _check(self, instance):
        """This function should be implemented by inherited classes"""
        raise NotImplementedError


    def _clean(self):
        now = time.time()
        for name, start_time in self.jobs_status.iteritems():
            if now - start_time > TIMEOUT:
                self.log.critical("Restarting Pool. One check is stuck: %s" % name)
                self.restart_pool()
                break
Ejemplo n.º 13
0
class ServicesCheck(AgentCheck):
    SOURCE_TYPE_NAME = 'servicecheck'
    SERVICE_CHECK_PREFIX = 'service_check'

    STATUS_TO_SERVICE_CHECK = {
            Status.UP  : AgentCheck.OK,
            Status.DOWN : AgentCheck.CRITICAL
        }

    """
    Service checks inherit from this class.
    This class should never be instantiated directly.

    Work flow:
        The main agent loop calls the check function for each instance on
        each iteration of the loop.
        The check method makes an asynchronous call to the _process method in
        one of the threads of the thread pool created in this class's constructor.
        The _process method calls the _check method of the inheriting class,
        which performs the actual check.

        The _check method must return a tuple whose first element is either
            Status.UP or Status.DOWN.
            The second element is a short error message that will be displayed
            when the service goes down.

    """

    def __init__(self, name, init_config, agentConfig, instances):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        # A dictionary to keep track of service statuses
        self.statuses = {}
        self.notified = {}
        self.nb_failures = 0
        self.pool_started = False

    def stop(self):
        self.stop_pool()
        self.pool_started = False

    def start_pool(self):
        # The pool size should be the minimum between the number of instances
        # and the DEFAULT_SIZE_POOL. It can also be overridden by the 'threads_count'
        # parameter in the init_config of the check
        self.log.info("Starting Thread Pool")
        default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
        self.pool_size = int(self.init_config.get('threads_count', default_size))

        self.pool = Pool(self.pool_size)

        self.resultsq = Queue()
        self.jobs_status = {}
        self.pool_started = True

    def stop_pool(self):
        self.log.info("Stopping Thread Pool")
        if self.pool_started:
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            assert self.pool.get_nworkers() == 0

    def restart_pool(self):
        self.stop_pool()
        self.start_pool()

    def check(self, instance):
        if not self.pool_started:
            self.start_pool()
        # On Windows the agent runs on multiple threads, so we need an offset
        # of 5 in case the pool_size is 1
        if threading.activeCount() > 5 * self.pool_size + 5:
            raise Exception("Thread number (%s) is exploding. Skipping this check" % threading.activeCount())
        self._process_results()
        self._clean()
        name = instance.get('name', None)
        if name is None:
            self.log.error('Each service check must have a name')
            return

        if name not in self.jobs_status:
            # A given instance should be processed one at a time
            self.jobs_status[name] = time.time()
            self.pool.apply_async(self._process, args=(instance,))
        else:
            self.log.error("Instance: %s skipped because it's already running." % name)


    def _process(self, instance):
        name = instance.get('name', None)

        try:
            status, msg = self._check(instance)

            result = (status, msg, name, instance)
            # We put the results in the result queue
            self.resultsq.put(result)

        except Exception:
            result = (FAILURE, FAILURE, FAILURE, FAILURE)
            self.resultsq.put(result)
Ejemplo n.º 14
0
class VSphereCheck(AgentCheck):
    """ Get performance metrics from a vCenter server and upload them to Datadog
    References:
        http://pubs.vmware.com/vsphere-51/index.jsp#com.vmware.wssdk.apiref.doc/vim.PerformanceManager.html

    *_atomic jobs perform one single task asynchronously in the ThreadPool, we
    don't know exactly when they will finish, but we reap them if they're stuck.
    The other calls are performed synchronously.
    """

    SERVICE_CHECK_NAME = 'vcenter.can_connect'

    def __init__(self, name, init_config, agentConfig, instances):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
        self.time_started = time.time()
        self.pool_started = False
        self.exceptionq = Queue()

        # Connections open to vCenter instances
        self.server_instances = {}

        # Event configuration
        self.event_config = {}
        # Caching resources, timeouts
        self.cache_times = {}
        for instance in self.instances:
            i_key = self._instance_key(instance)
            self.cache_times[i_key] = {
                MORLIST: {
                    LAST: 0,
                    INTERVAL: init_config.get('refresh_morlist_interval',
                                    REFRESH_MORLIST_INTERVAL)
                },
                METRICS_METADATA: {
                    LAST: 0,
                    INTERVAL: init_config.get('refresh_metrics_metadata_interval',
                                    REFRESH_METRICS_METADATA_INTERVAL)
                }
            }

            self.event_config[i_key] = instance.get('event_config')

        # managed entity raw view
        self.registry = {}
        # First layer of cache (get entities from the tree)
        self.morlist_raw = {}
        # Second layer, processed from the first one
        self.morlist = {}
        # Metrics metadata, basically perfCounterId -> {name, group, description}
        self.metrics_metadata = {}

        self.latest_event_query = {}

    def stop(self):
        self.stop_pool()

    def start_pool(self):
        self.log.info("Starting Thread Pool")
        self.pool_size = int(self.init_config.get('threads_count', DEFAULT_SIZE_POOL))

        self.pool = Pool(self.pool_size)
        self.pool_started = True
        self.jobs_status = {}

    def stop_pool(self):
        self.log.info("Stopping Thread Pool")
        if self.pool_started:
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            assert self.pool.get_nworkers() == 0
            self.pool_started = False

    def restart_pool(self):
        self.stop_pool()
        self.start_pool()

    def _clean(self):
        now = time.time()
        # TODO: use that
        for name in self.jobs_status.keys():
            start_time = self.jobs_status[name]
            if now - start_time > JOB_TIMEOUT:
                self.log.critical("Restarting Pool. One check is stuck.")
                self.restart_pool()
                break

    def _query_event(self, instance):
        i_key = self._instance_key(instance)
        last_time = self.latest_event_query.get(i_key)

        server_instance = self._get_server_instance(instance)
        event_manager = server_instance.content.eventManager

        # Be sure we don't duplicate any event, never query the "past"
        if not last_time:
            last_time = self.latest_event_query[i_key] = \
                event_manager.latestEvent.createdTime + timedelta(seconds=1)

        query_filter = vim.event.EventFilterSpec()
        time_filter = vim.event.EventFilterSpec.ByTime(beginTime=self.latest_event_query[i_key])
        query_filter.time = time_filter

        try:
            new_events = event_manager.QueryEvents(query_filter)
            self.log.debug("Got {0} events from vCenter event manager".format(len(new_events)))
            for event in new_events:
                normalized_event = VSphereEvent(event, self.event_config[i_key])
                # Can return None if the event is filtered out
                event_payload = normalized_event.get_datadog_payload()
                if event_payload is not None:
                    self.event(event_payload)
                last_time = event.createdTime + timedelta(seconds=1)
        except Exception as e:
            # Don't get stuck on a failure to fetch an event
            # Ignore them for next pass
            self.log.warning("Unable to fetch Events %s", e)
            last_time = event_manager.latestEvent.createdTime + timedelta(seconds=1)

        self.latest_event_query[i_key] = last_time

    def _instance_key(self, instance):
        i_key = instance.get('name')
        if i_key is None:
            raise Exception("Must define a unique 'name' per vCenter instance")
        return i_key

    def _should_cache(self, instance, entity):
        i_key = self._instance_key(instance)
        now = time.time()
        return now - self.cache_times[i_key][entity][LAST] > self.cache_times[i_key][entity][INTERVAL]

    def _get_server_instance(self, instance):
        i_key = self._instance_key(instance)

        service_check_tags = [
            'vcenter_server:{0}'.format(instance.get('name')),
            'vcenter_host:{0}'.format(instance.get('host')),
        ]

        # Check for ssl configs and generate an appropriate ssl context object
        ssl_verify = instance.get('ssl_verify', True)
        ssl_capath = instance.get('ssl_capath', None)
        if not ssl_verify:
            context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
            context.verify_mode = ssl.CERT_NONE
        elif ssl_capath:
            context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
            context.verify_mode = ssl.CERT_REQUIRED
            context.load_verify_locations(capath=ssl_capath)

        # If both configs are used, log a message explaining the default
        if not ssl_verify and ssl_capath:
            self.log.debug("Your configuration is incorrectly attempting to "
                           "specify both a CA path, and to disable SSL "
                           "verification. You cannot do both. Proceeding with "
                           "disabling ssl verification.")

        if i_key not in self.server_instances:
            try:
                # Object returned by SmartConnect is a ServerInstance
                #   https://www.vmware.com/support/developer/vc-sdk/visdk2xpubs/ReferenceGuide/vim.ServiceInstance.html
                server_instance = connect.SmartConnect(
                    host = instance.get('host'),
                    user = instance.get('username'),
                    pwd = instance.get('password'),
                    sslContext = context if not ssl_verify or ssl_capath else None
                )
            except Exception as e:
                err_msg = "Connection to %s failed: %s" % (instance.get('host'), e)
                self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                        tags=service_check_tags, message=err_msg)
                raise Exception(err_msg)

            self.server_instances[i_key] = server_instance

        # Test if the connection is working
        try:
            self.server_instances[i_key].RetrieveContent()
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK,
                    tags=service_check_tags)
        except Exception as e:
            err_msg = "Connection to %s died unexpectedly: %s" % (instance.get('host'), e)
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                    tags=service_check_tags, message=err_msg)
            raise Exception(err_msg)

        return self.server_instances[i_key]
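
    # Illustrative (hypothetical) instance configuration exercising the SSL
    # handling above; when both options are set, ssl_verify=False wins and
    # certificate verification is disabled, as the debug message explains:
    #
    #   instance = {'name': 'vcenter-prod',
    #               'host': 'vcenter.example.com',
    #               'username': 'admin',
    #               'password': '***',
    #               'ssl_verify': False,
    #               'ssl_capath': '/etc/ssl/certs'}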

    def _compute_needed_metrics(self, instance, available_metrics):
        """ Compare the available metrics for one MOR we have computed and intersect them
        with the set of metrics we want to report
        """
        if instance.get('all_metrics', False):
            return available_metrics

        i_key = self._instance_key(instance)
        wanted_metrics = []
        # Get only the basic metrics
        for metric in available_metrics:
            # No cache yet, skip it for now
            if (i_key not in self.metrics_metadata
                    or metric.counterId not in self.metrics_metadata[i_key]):
                continue
            if self.metrics_metadata[i_key][metric.counterId]['name'] in BASIC_METRICS:
                wanted_metrics.append(metric)

        return wanted_metrics

    def get_external_host_tags(self):
        """ Returns a list of tags for every host that is detected by the vSphere
        integration.
        List of pairs (hostname, list_of_tags)
        """
        self.log.debug(u"Sending external_host_tags now")
        external_host_tags = []
        for instance in self.instances:
            i_key = self._instance_key(instance)
            mor_by_mor_name = self.morlist.get(i_key)

            if not mor_by_mor_name:
                self.log.warning(
                    u"Unable to extract hosts' tags for `%s` vSphere instance."
                    u"Is the check failing on this instance?", instance
                )
                continue

            for mor in mor_by_mor_name.itervalues():
                if mor['hostname']:  # some MORs have a None hostname
                    external_host_tags.append((mor['hostname'], {SOURCE_TYPE: mor['tags']}))

        return external_host_tags
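
    # Hypothetical shape of the structure returned above, assuming SOURCE_TYPE
    # resolves to the integration name:
    #
    #   [('esx-host-1', {'vsphere': ['vsphere_type:host', 'vsphere_cluster:c1']}),
    #    ('vm-42',      {'vsphere': ['vsphere_type:vm', 'vsphere_host:esx-host-1']})]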


    def _discover_mor(self, instance, tags, regexes=None, include_only_marked=False):
        """
        Explore vCenter infrastructure to discover hosts, virtual machines
        and compute their associated tags.

        Start with the vCenter `rootFolder` and proceed recursively,
        queueing other such jobs for children nodes.

        Example topology:
            ```
            rootFolder
                - datacenter1
                    - compute_resource1 == cluster
                        - host1
                        - host2
                        - host3
                    - compute_resource2
                        - host5
                            - vm1
                            - vm2
            ```

        If it's a node we want to query metric for, queue it in `self.morlist_raw` that
        will be processed by another job.
        """
        def _get_parent_tags(mor):
            tags = []
            if mor.parent:
                tag = []
                if isinstance(mor.parent, vim.HostSystem):
                    tag.append(u'vsphere_host:{}'.format(mor.parent.name))
                elif isinstance(mor.parent, vim.Folder):
                    tag.append(u'vsphere_folder:{}'.format(mor.parent.name))
                elif isinstance(mor.parent, vim.ComputeResource):
                    if isinstance(mor.parent, vim.ClusterComputeResource):
                        tag.append(u'vsphere_cluster:{}'.format(mor.parent.name))
                    tag.append(u'vsphere_compute:{}'.format(mor.parent.name))
                elif isinstance(mor.parent, vim.Datacenter):
                    tag.append(u'vsphere_datacenter:{}'.format(mor.parent.name))

                tags = _get_parent_tags(mor.parent)
                if tag:
                    tags.extend(tag)

            return tags


        def _get_all_objs(content, vimtype, regexes=None, include_only_marked=False, tags=None):
            """
            Get all the vsphere objects associated with a given type
            """
            tags = tags or []  # avoid a mutable default argument
            obj_list = []
            container = content.viewManager.CreateContainerView(
                content.rootFolder,
                [RESOURCE_TYPE_MAP[vimtype]],
                True)

            for c in container.view:
                instance_tags = []
                if not self._is_excluded(c, regexes, include_only_marked):
                    hostname = c.name
                    if c.parent:
                        instance_tags += _get_parent_tags(c)

                    vsphere_type = None
                    if isinstance(c, vim.VirtualMachine):
                        vsphere_type = u'vsphere_type:vm'
                        if c.runtime.powerState == vim.VirtualMachinePowerState.poweredOff:
                            continue
                        host = c.runtime.host.name
                        instance_tags.append(u'vsphere_host:{}'.format(host))
                    elif isinstance(c, vim.HostSystem):
                        vsphere_type = u'vsphere_type:host'
                    elif isinstance(c, vim.Datastore):
                        vsphere_type = u'vsphere_type:datastore'
                        instance_tags.append(u'vsphere_datastore:{}'.format(c.name))
                        hostname = None
                    elif isinstance(c, vim.Datacenter):
                        vsphere_type = u'vsphere_type:datacenter'
                        hostname = None

                    if vsphere_type:
                        instance_tags.append(vsphere_type)
                    obj_list.append(dict(mor_type=vimtype, mor=c, hostname=hostname, tags=tags+instance_tags))

            return obj_list

        # @atomic_method
        def build_resource_registry(instance, tags, regexes=None, include_only_marked=False):
            i_key = self._instance_key(instance)
            server_instance = self._get_server_instance(instance)
            if i_key not in self.morlist_raw:
                self.morlist_raw[i_key] = {}

            for resource in sorted(RESOURCE_TYPE_MAP):
                self.morlist_raw[i_key][resource] = _get_all_objs(
                    server_instance.RetrieveContent(),
                    resource,
                    regexes,
                    include_only_marked,
                    tags
                )

        # collect...
        self.pool.apply_async(
            build_resource_registry,
            args=(instance, tags, regexes, include_only_marked)
        )

    @staticmethod
    def _is_excluded(obj, regexes, include_only_marked):
        """
        Return `True` if the given host or virtual machine is excluded by the user configuration,
        i.e. violates any of the following rules:
        * Do not match the corresponding `*_include_only` regular expressions
        * Is "non-labeled" while `include_only_marked` is enabled (virtual machine only)
        """
        # Host
        if isinstance(obj, vim.HostSystem):
            # Based on `host_include_only_regex`
            if regexes and regexes.get('host_include') is not None:
                match = re.search(regexes['host_include'], obj.name)
                if not match:
                    return True

        # VirtualMachine
        elif isinstance(obj, vim.VirtualMachine):
            # Based on `vm_include_only_regex`
            if regexes and regexes.get('vm_include') is not None:
                match = re.search(regexes['vm_include'], obj.name)
                if not match:
                    return True

            # Based on `include_only_marked`
            if include_only_marked:
                monitored = False
                for field in obj.customValue:
                    if field.value == VM_MONITORING_FLAG:
                        monitored = True
                        break  # we shall monitor
                if not monitored:
                    return True

        return False
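
    # Example (hypothetical) filtering setup matching the rules above: keep
    # only hosts whose name starts with "esx-", and only VMs explicitly
    # carrying the VM_MONITORING_FLAG custom field:
    #
    #   regexes = {'host_include': '^esx-', 'vm_include': None}
    #   VSphereCheck._is_excluded(mor, regexes, include_only_marked=True)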

    def _cache_morlist_raw(self, instance):
        """
        Initiate the first layer to refresh the list of MORs (`self.morlist`).

        Resolve the vCenter `rootFolder` and initiate hosts and virtual machines discovery.

        """

        i_key = self._instance_key(instance)
        self.log.debug("Caching the morlist for vcenter instance %s" % i_key)
        for resource_type in RESOURCE_TYPE_MAP:
            if i_key in self.morlist_raw and len(self.morlist_raw[i_key].get(resource_type, [])) > 0:
                self.log.debug(
                    "Skipping morlist collection now, RAW results "
                    "processing not over (latest refresh was {0}s ago)".format(
                        time.time() - self.cache_times[i_key][MORLIST][LAST])
                )
                return
        self.morlist_raw[i_key] = {}

        instance_tag = "vcenter_server:%s" % instance.get('name')
        regexes = {
            'host_include': instance.get('host_include_only_regex'),
            'vm_include': instance.get('vm_include_only_regex')
        }
        include_only_marked = _is_affirmative(instance.get('include_only_marked', False))

        # Discover hosts and virtual machines
        self._discover_mor(instance, [instance_tag], regexes, include_only_marked)

        self.cache_times[i_key][MORLIST][LAST] = time.time()

    @atomic_method
    def _cache_morlist_process_atomic(self, instance, mor):
        """ Process one item of the self.morlist_raw list by querying the available
        metrics for this MOR and then putting it in self.morlist
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        ### </TEST-INSTRUMENTATION>
        i_key = self._instance_key(instance)
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        self.log.debug(
            "job_atomic: Querying available metrics"
            " for MOR {0} (type={1})".format(mor['mor'], mor['mor_type'])
        )

        mor['interval'] = REAL_TIME_INTERVAL if mor['mor_type'] in REALTIME_RESOURCES else None

        available_metrics = perfManager.QueryAvailablePerfMetric(
            mor['mor'], intervalId=mor['interval'])

        mor['metrics'] = self._compute_needed_metrics(instance, available_metrics)

        mor_name = str(mor['mor'])
        if mor_name in self.morlist[i_key]:
            # Was already here last iteration
            self.morlist[i_key][mor_name]['metrics'] = mor['metrics']
        else:
            self.morlist[i_key][mor_name] = mor

        self.morlist[i_key][mor_name]['last_seen'] = time.time()

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.morlist_process_atomic.time', t.total())
        ### </TEST-INSTRUMENTATION>

    def _cache_morlist_process(self, instance):
        """ Empties the self.morlist_raw by popping items and running asynchronously
        the _cache_morlist_process_atomic operation that will get the available
        metrics for this MOR and put it in self.morlist
        """
        i_key = self._instance_key(instance)
        if i_key not in self.morlist:
            self.morlist[i_key] = {}

        batch_size = self.init_config.get('batch_morlist_size', BATCH_MORLIST_SIZE)

        processed = 0
        for resource_type in RESOURCE_TYPE_MAP:
            for i in xrange(batch_size):
                try:
                    mor = self.morlist_raw[i_key][resource_type].pop()
                    self.pool.apply_async(self._cache_morlist_process_atomic, args=(instance, mor))

                    processed += 1
                    if processed == batch_size:
                        break
                except (IndexError, KeyError):
                    self.log.debug("No more work to process in morlist_raw")
                    break

            if processed == batch_size:
                break
        return

    def _vacuum_morlist(self, instance):
        """ Check if self.morlist doesn't have some old MORs that are gone, ie
        we cannot get any metrics from them anyway (or =0)
        """
        i_key = self._instance_key(instance)
        morlist = self.morlist[i_key].items()

        for mor_name, mor in morlist:
            last_seen = mor['last_seen']
            if (time.time() - last_seen) > 2 * REFRESH_MORLIST_INTERVAL:
                del self.morlist[i_key][mor_name]

    def _cache_metrics_metadata(self, instance):
        """ Get from the server instance, all the performance counters metadata
        meaning name/group/description... attached with the corresponding ID
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        ### </TEST-INSTRUMENTATION>

        i_key = self._instance_key(instance)
        self.log.info("Warming metrics metadata cache for instance {0}".format(i_key))
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        new_metadata = {}
        for counter in perfManager.perfCounter:
            d = dict(
                name = "%s.%s" % (counter.groupInfo.key, counter.nameInfo.key),
                unit = counter.unitInfo.key,
                instance_tag = 'instance'  # FIXME: replace by what we want to tag!
            )
            new_metadata[counter.key] = d
        self.cache_times[i_key][METRICS_METADATA][LAST] = time.time()

        self.log.info("Finished metadata collection for instance {0}".format(i_key))
        # Reset metadata
        self.metrics_metadata[i_key] = new_metadata

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.metric_metadata_collection.time', t.total())
        ### </TEST-INSTRUMENTATION>

    def _transform_value(self, instance, counter_id, value):
        """ Given the counter_id, look up for the metrics metadata to check the vsphere
        type of the counter and apply pre-reporting transformation if needed.
        """
        i_key = self._instance_key(instance)
        if counter_id in self.metrics_metadata[i_key]:
            unit = self.metrics_metadata[i_key][counter_id]['unit']
            if unit == 'percent':
                return float(value) / 100

        # Defaults to return the value without transformation
        return value

    @atomic_method
    def _collect_metrics_atomic(self, instance, mor):
        """ Task that collects the metrics listed in the morlist for one MOR
        """
        ### <TEST-INSTRUMENTATION>
        t = Timer()
        ### </TEST-INSTRUMENTATION>

        i_key = self._instance_key(instance)
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        query = vim.PerformanceManager.QuerySpec(maxSample=1,
                                                 entity=mor['mor'],
                                                 metricId=mor['metrics'],
                                                 intervalId=mor['interval'],
                                                 format='normal')
        results = perfManager.QueryPerf(querySpec=[query])
        if results:
            for result in results[0].value:
                if result.id.counterId not in self.metrics_metadata[i_key]:
                    self.log.debug("Skipping this metric value, because there is no metadata about it")
                    continue
                instance_name = result.id.instance or "none"
                value = self._transform_value(instance, result.id.counterId, result.value[0])

                # Metric types are absolute, delta, and rate
                metric_name = self.metrics_metadata[i_key][result.id.counterId]['name']

                if metric_name not in ALL_METRICS:
                    self.log.debug(u"Skipping unknown `%s` metric.", metric_name)
                    continue

                tags = ['instance:%s' % instance_name]
                if not mor['hostname']: # no host tags available
                    tags.extend(mor['tags'])

                # vsphere "rates" should be submitted as gauges (rate is
                # precomputed).
                self.gauge(
                    "vsphere.%s" % metric_name,
                    value,
                    hostname=mor['hostname'],
                    tags=tags
                )

        ### <TEST-INSTRUMENTATION>
        self.histogram('datadog.agent.vsphere.metric_collection.time', t.total())
        ### </TEST-INSTRUMENTATION>

    def collect_metrics(self, instance):
        """ Calls asynchronously _collect_metrics_atomic on all MORs, as the
        job queue is processed the Aggregator will receive the metrics.
        """
        i_key = self._instance_key(instance)
        if i_key not in self.morlist:
            self.log.debug("Not collecting metrics for this instance, nothing to do yet: {0}".format(i_key))
            return

        mors = self.morlist[i_key].items()
        self.log.debug("Collecting metrics of %d mors" % len(mors))

        vm_count = 0

        for mor_name, mor in mors:
            if mor['mor_type'] == 'vm':
                vm_count += 1
            if 'metrics' not in mor or not mor['metrics']:
                # self.log.debug("Skipping entity %s collection because we didn't cache its metrics yet" % mor['hostname'])
                continue

            self.pool.apply_async(self._collect_metrics_atomic, args=(instance, mor))

        self.gauge('vsphere.vm.count', vm_count, tags=["vcenter_server:%s" % instance.get('name')])

    def check(self, instance):
        if not self.pool_started:
            self.start_pool()
        ### <TEST-INSTRUMENTATION>
        self.gauge('datadog.agent.vsphere.queue_size', self.pool._workq.qsize(), tags=['instant:initial'])
        ### </TEST-INSTRUMENTATION>

        # First part: make sure our object repository is neat & clean
        if self._should_cache(instance, METRICS_METADATA):
            self._cache_metrics_metadata(instance)

        if self._should_cache(instance, MORLIST):
            self._cache_morlist_raw(instance)
        self._cache_morlist_process(instance)
        self._vacuum_morlist(instance)

        # Second part: do the job
        self.collect_metrics(instance)
        self._query_event(instance)

        # For our own sanity
        self._clean()

        thread_crashed = False
        try:
            while True:
                self.log.critical(self.exceptionq.get_nowait())
                thread_crashed = True
        except Empty:
            pass
        if thread_crashed:
            self.stop_pool()
            raise Exception("One thread in the pool crashed, check the logs")

        ### <TEST-INSTRUMENTATION>
        self.gauge('datadog.agent.vsphere.queue_size', self.pool._workq.qsize(), tags=['instant:final'])
Ejemplo n.º 15
0
class VSphereCheck(AgentCheck):
    SERVICE_CHECK_NAME = 'vcenter.can_connect'

    def __init__(self, name, init_config, agentConfig, instances):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
        self.time_started = time.time()
        self.pool_started = False
        self.exceptionq = Queue()

        self.server_instances = {}

        self.event_config = {}
        self.cache_times = {}
        for instance in self.instances:
            i_key = self._instance_key(instance)
            self.cache_times[i_key] = {
                MORLIST: {
                    LAST: 0,
                    INTERVAL: init_config.get('refresh_morlist_interval',
                                              REFRESH_MORLIST_INTERVAL)
                },
                METRICS_METADATA: {
                    LAST: 0,
                    INTERVAL: init_config.get('refresh_metrics_metadata_interval',
                                              REFRESH_METRICS_METADATA_INTERVAL)
                }
            }

            self.event_config[i_key] = instance.get('event_config')

        self.morlist_raw = {}
        self.morlist = {}
        self.metrics_metadata = {}

        self.latest_event_query = {}

    def stop(self):
        self.stop_pool()

    def start_pool(self):
        self.log.info("Starting Thread Pool")
        self.pool_size = int(self.init_config.get('threads_count', DEFAULT_SIZE_POOL))

        self.pool = Pool(self.pool_size)
        self.pool_started = True
        self.jobs_status = {}

    def stop_pool(self):
        self.log.info("Stopping Thread Pool")
        if self.pool_started:
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            assert self.pool.get_nworkers() == 0
            self.pool_started = False

    def restart_pool(self):
        self.stop_pool()
        self.start_pool()

    def _clean(self):
        now = time.time()
        for name in self.jobs_status.keys():
            start_time = self.jobs_status[name]
            if now - start_time > JOB_TIMEOUT:
                self.log.critical("Restarting Pool. One check is stuck.")
                self.restart_pool()
                break

    def _query_event(self, instance):
        i_key = self._instance_key(instance)
        last_time = self.latest_event_query.get(i_key)

        server_instance = self._get_server_instance(instance)
        event_manager = server_instance.content.eventManager

        if not last_time:
            last_time = self.latest_event_query[i_key] = \
                event_manager.latestEvent.createdTime + timedelta(seconds=1)

        query_filter = vim.event.EventFilterSpec()
        time_filter = vim.event.EventFilterSpec.ByTime(beginTime=self.latest_event_query[i_key])
        query_filter.time = time_filter

        try:
            new_events = event_manager.QueryEvents(query_filter)
            self.log.debug("Got {0} events from vCenter event manager".format(len(new_events)))
            for event in new_events:
                normalized_event = VSphereEvent(event, self.event_config[i_key])
                event_payload = normalized_event.get_datamonitor_payload()
                if event_payload is not None:
                    self.event(event_payload)
                last_time = event.createdTime + timedelta(seconds=1)
        except Exception as e:
            self.log.warning("Unable to fetch Events %s", e)
            last_time = event_manager.latestEvent.createdTime + timedelta(seconds=1)

        self.latest_event_query[i_key] = last_time

    def _instance_key(self, instance):
        i_key = instance.get('name')
        if i_key is None:
            raise Exception("Must define a unique 'name' per vCenter instance")
        return i_key

    def _should_cache(self, instance, entity):
        i_key = self._instance_key(instance)
        now = time.time()
        return now - self.cache_times[i_key][entity][LAST] > self.cache_times[i_key][entity][INTERVAL]

    def _get_server_instance(self, instance):
        i_key = self._instance_key(instance)

        service_check_tags = [
            'vcenter_server:{0}'.format(instance.get('name')),
            'vcenter_host:{0}'.format(instance.get('host')),
        ]

        ssl_verify = instance.get('ssl_verify', True)
        ssl_capath = instance.get('ssl_capath', None)
        if not ssl_verify:
            context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
            context.verify_mode = ssl.CERT_NONE
        elif ssl_capath:
            context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
            context.verify_mode = ssl.CERT_REQUIRED
            context.load_verify_locations(capath=ssl_capath)

        if not ssl_verify and ssl_capath:
            self.log.debug("Your configuration is incorrectly attempting to "
                           "specify both a CA path and to disable SSL "
                           "verification. You cannot do both. Proceeding "
                           "with SSL verification disabled.")

        if i_key not in self.server_instances:
            try:
                server_instance = connect.SmartConnect(
                    host=instance.get('host'),
                    user=instance.get('username'),
                    pwd=instance.get('password'),
                    sslContext=context if not ssl_verify or ssl_capath else None
                )
            except Exception as e:
                err_msg = "Connection to %s failed: %s" % (instance.get('host'), e)
                self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                                   tags=service_check_tags, message=err_msg)
                raise Exception(err_msg)

            self.server_instances[i_key] = server_instance

        try:
            self.server_instances[i_key].RetrieveContent()
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK,
                               tags=service_check_tags)
        except Exception as e:
            err_msg = "Connection to %s died unexpectedly: %s" % (instance.get('host'), e)
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                               tags=service_check_tags, message=err_msg)
            raise Exception(err_msg)

        return self.server_instances[i_key]

    def _compute_needed_metrics(self, instance, available_metrics):
        if instance.get('all_metrics', False):
            return available_metrics

        i_key = self._instance_key(instance)
        wanted_metrics = []
        for metric in available_metrics:
            if (i_key not in self.metrics_metadata
                or metric.counterId not in self.metrics_metadata[i_key]):
                continue
            if self.metrics_metadata[i_key][metric.counterId]['name'] in BASIC_METRICS:
                wanted_metrics.append(metric)

        return wanted_metrics

    def get_external_host_tags(self):
        self.log.info("Sending external_host_tags now")
        external_host_tags = []
        for instance in self.instances:
            i_key = self._instance_key(instance)
            mor_list = self.morlist[i_key].items()
            for mor_name, mor in mor_list:
                external_host_tags.append((mor['hostname'], {SOURCE_TYPE: mor['tags']}))

        return external_host_tags

    @atomic_method
    def _cache_morlist_raw_atomic(self, i_key, obj_type, obj, tags, regexes=None):
        t = Timer()
        self.log.debug("job_atomic: Exploring MOR {0} (type={1})".format(obj, obj_type))
        tags_copy = deepcopy(tags)
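        # Each level of the inventory tree is explored in its own pool job, so
        # discovery of a large vCenter fans out across threads instead of
        # blocking a single one.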

        if obj_type == 'rootFolder':
            for datacenter in obj.childEntity:
                if not hasattr(datacenter, 'hostFolder'):
                    continue
                self.pool.apply_async(
                    self._cache_morlist_raw_atomic,
                    args=(i_key, 'datacenter', datacenter, tags_copy, regexes)
                )

        elif obj_type == 'datacenter':
            dc_tag = "vsphere_datacenter:%s" % obj.name
            tags_copy.append(dc_tag)
            for compute_resource in obj.hostFolder.childEntity:
                if not hasattr(compute_resource, 'host'):
                    continue
                self.pool.apply_async(
                    self._cache_morlist_raw_atomic,
                    args=(i_key, 'compute_resource', compute_resource, tags_copy, regexes)
                )

        elif obj_type == 'compute_resource':
            if obj.__class__ == vim.ClusterComputeResource:
                cluster_tag = "vsphere_cluster:%s" % obj.name
                tags_copy.append(cluster_tag)
            for host in obj.host:
                if not hasattr(host, 'vm'):
                    continue
                self.pool.apply_async(
                    self._cache_morlist_raw_atomic,
                    args=(i_key, 'host', host, tags_copy, regexes)
                )

        elif obj_type == 'host':
            if regexes and regexes.get('host_include') is not None:
                match = re.search(regexes['host_include'], obj.name)
                if not match:
                    self.log.debug(u"Filtered out VM {0} because of host_include_only_regex".format(obj.name))
                    return
            watched_mor = dict(mor_type='host', mor=obj, hostname=obj.name, tags=tags_copy + ['vsphere_type:host'])
            self.morlist_raw[i_key].append(watched_mor)

            host_tag = "vsphere_host:%s" % obj.name
            tags_copy.append(host_tag)
            for vm in obj.vm:
                if vm.runtime.powerState != 'poweredOn':
                    continue
                self.pool.apply_async(
                    self._cache_morlist_raw_atomic,
                    args=(i_key, 'vm', vm, tags_copy, regexes)
                )

        elif obj_type == 'vm':
            if regexes and regexes.get('vm_include') is not None:
                match = re.search(regexes['vm_include'], obj.name)
                if not match:
                    self.log.debug(u"Filtered out VM {0} because of vm_include_only_regex".format(obj.name))
                    return
            watched_mor = dict(mor_type='vm', mor=obj, hostname=obj.name, tags=tags_copy + ['vsphere_type:vm'])
            self.morlist_raw[i_key].append(watched_mor)

        self.histogram('datamonitor.agent.vsphere.morlist_raw_atomic.time', t.total())

    def _cache_morlist_raw(self, instance):
        i_key = self._instance_key(instance)
        self.log.debug("Caching the morlist for vcenter instance %s" % i_key)
        if i_key in self.morlist_raw and len(self.morlist_raw[i_key]) > 0:
            self.log.debug(
                "Skipping morlist collection now, RAW results "
                "processing not over (latest refresh was {0}s ago)".format(
                    time.time() - self.cache_times[i_key][MORLIST][LAST])
            )
            return
        self.morlist_raw[i_key] = []

        server_instance = self._get_server_instance(instance)
        root_folder = server_instance.content.rootFolder

        instance_tag = "vcenter_server:%s" % instance.get('name')
        regexes = {
            'host_include': instance.get('host_include_only_regex'),
            'vm_include': instance.get('vm_include_only_regex')
        }
        self.pool.apply_async(
            self._cache_morlist_raw_atomic,
            args=(i_key, 'rootFolder', root_folder, [instance_tag], regexes)
        )
        self.cache_times[i_key][MORLIST][LAST] = time.time()

    @atomic_method
    def _cache_morlist_process_atomic(self, instance, mor):
        t = Timer()
        i_key = self._instance_key(instance)
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        self.log.debug(
            "job_atomic: Querying available metrics"
            " for MOR {0} (type={1})".format(mor['mor'], mor['mor_type'])
        )

        available_metrics = perfManager.QueryAvailablePerfMetric(
            mor['mor'], intervalId=REAL_TIME_INTERVAL)

        mor['metrics'] = self._compute_needed_metrics(instance, available_metrics)
        mor_name = str(mor['mor'])

        if mor_name in self.morlist[i_key]:
            self.morlist[i_key][mor_name]['metrics'] = mor['metrics']
        else:
            self.morlist[i_key][mor_name] = mor

        self.morlist[i_key][mor_name]['last_seen'] = time.time()

        self.histogram('datamonitor.agent.vsphere.morlist_process_atomic.time', t.total())

    def _cache_morlist_process(self, instance):
        i_key = self._instance_key(instance)
        if i_key not in self.morlist:
            self.morlist[i_key] = {}

        batch_size = self.init_config.get('batch_morlist_size', BATCH_MORLIST_SIZE)

        for i in xrange(batch_size):
            try:
                mor = self.morlist_raw[i_key].pop()
                self.pool.apply_async(self._cache_morlist_process_atomic, args=(instance, mor))
            except (IndexError, KeyError):
                self.log.debug("No more work to process in morlist_raw")
                return

    def _vacuum_morlist(self, instance):
        i_key = self._instance_key(instance)
        morlist = self.morlist[i_key].items()

        for mor_name, mor in morlist:
            last_seen = mor['last_seen']
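            # Two full refresh cycles without being rediscovered means the MOR
            # was most likely deleted or powered off; stop tracking it.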
            if (time.time() - last_seen) > 2 * REFRESH_MORLIST_INTERVAL:
                del self.morlist[i_key][mor_name]

    def _cache_metrics_metadata(self, instance):
        t = Timer()
        i_key = self._instance_key(instance)
        self.log.info("Warming metrics metadata cache for instance {0}".format(i_key))
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        new_metadata = {}
        for counter in perfManager.perfCounter:
            d = dict(
                name="%s.%s" % (counter.groupInfo.key, counter.nameInfo.key),
                unit=counter.unitInfo.key,
                instance_tag='instance'
            )
            new_metadata[counter.key] = d
        self.cache_times[i_key][METRICS_METADATA][LAST] = time.time()

        self.log.info("Finished metadata collection for instance {0}".format(i_key))

        self.metrics_metadata[i_key] = new_metadata

        self.histogram('datamonitor.agent.vsphere.metric_metadata_collection.time', t.total())

    def _transform_value(self, instance, counter_id, value):
        i_key = self._instance_key(instance)
        if counter_id in self.metrics_metadata[i_key]:
            unit = self.metrics_metadata[i_key][counter_id]['unit']
            if unit == 'percent':
                return float(value) / 100

        return value

    @atomic_method
    def _collect_metrics_atomic(self, instance, mor):
        t = Timer()
        i_key = self._instance_key(instance)
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager
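        # maxSample=1 at the 20s real-time interval fetches only the single
        # most recent data point for each requested metric.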
        query = vim.PerformanceManager.QuerySpec(maxSample=1,
                                                 entity=mor['mor'],
                                                 metricId=mor['metrics'],
                                                 intervalId=20,
                                                 format='normal')
        results = perfManager.QueryPerf(querySpec=[query])

        if results:
            for result in results[0].value:
                if result.id.counterId not in self.metrics_metadata[i_key]:
                    self.log.debug("Skipping this metric value, because there is no metadata about it")
                    continue
                instance_name = result.id.instance or "none"
                value = self._transform_value(instance, result.id.counterId, result.value[0])

                if ALL_METRICS[self.metrics_metadata[i_key][result.id.counterId]['name']]['s_type'] == 'rate':
                    record_metric = self.rate
                else:
                    record_metric = self.gauge
                ip = "unknown"
                content = server_instance.RetrieveContent()
                for child in content.rootFolder.childEntity:
                    if hasattr(child, 'vmFolder'):
                        datacenter = child
                        vmFolder = datacenter.vmFolder
                        vmList = vmFolder.childEntity
                        for vm in vmList:
                            if isinstance(vm, vim.VirtualMachine):
                                ip = vm.summary.guest.ipAddress
                                self.log.info("Get VM ip {} by VMtools".format(ip))
                if ip != "unknown" and ip != "None":
                    record_metric(
                        "vsphere.%s" % self.metrics_metadata[i_key][result.id.counterId]['name'],
                        value,
                        hostname=mor['hostname'],
                        tags=['instance:%s' % instance_name, 'ip:%s' % ip, 'type:VM']
                    )
                else:
                    record_metric(
                        "vsphere.%s" % self.metrics_metadata[i_key][result.id.counterId]['name'],
                        value,
                        hostname=mor['hostname'],
                        tags=['instance:%s' % instance_name, "type:VM"]
                    )

        self.histogram('datamonitor.agent.vsphere.metric_collection.time', t.total())

    def collect_metrics(self, instance):
        i_key = self._instance_key(instance)
        if i_key not in self.morlist:
            self.log.debug("Not collecting metrics for this instance, nothing to do yet: {0}".format(i_key))
            return

        mors = self.morlist[i_key].items()
        self.log.debug("Collecting metrics of %d mors" % len(mors))

        vm_count = 0

        for mor_name, mor in mors:
            if mor['mor_type'] == 'vm':
                vm_count += 1
            if 'metrics' not in mor:
                continue

            self.pool.apply_async(self._collect_metrics_atomic, args=(instance, mor))

        self.gauge('vsphere.vm.count', vm_count, tags=["vcenter_server:%s" % instance.get('name')])

    def check(self, instance):
        if not self.pool_started:
            self.start_pool()
        self.gauge('datamonitor.agent.vsphere.queue_size', self.pool._workq.qsize(), tags=['instant:initial'])

        if self._should_cache(instance, METRICS_METADATA):
            self._cache_metrics_metadata(instance)

        if self._should_cache(instance, MORLIST):
            self._cache_morlist_raw(instance)
        self._cache_morlist_process(instance)
        self._vacuum_morlist(instance)

        self.collect_metrics(instance)
        self._query_event(instance)

        self._clean()

        thread_crashed = False
        try:
            while True:
                self.log.critical(self.exceptionq.get_nowait())
                thread_crashed = True
        except Empty:
            pass
        if thread_crashed:
            self.stop_pool()
            raise Exception("One thread in the pool crashed, check the logs")

        self.gauge('datamonitor.agent.vsphere.queue_size', self.pool._workq.qsize(), tags=['instant:final'])
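
For reference, a minimal sketch of the per-instance configuration consumed by
this check; these keys are the ones read in the snippet above, but every value
here is hypothetical:

# Hypothetical instance entry for VSphereCheck.check(); values are examples only.
instance = {
    'name': 'vcenter-prod',                # unique per instance (see _instance_key)
    'host': 'vcenter.example.com',         # passed to connect.SmartConnect
    'username': 'readonly@vsphere.local',
    'password': 'xxxxxxxx',
    'ssl_verify': True,                    # False disables certificate checks
    'ssl_capath': '/etc/ssl/certs',        # optional custom CA directory
    'all_metrics': False,                  # True bypasses the BASIC_METRICS filter
    'host_include_only_regex': 'esxi-.*',  # optional host filter
    'vm_include_only_regex': 'prod-.*',    # optional VM filter
}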
Ejemplo n.º 16
0
class ServicesCheck(AgentCheck):
    SOURCE_TYPE_NAME = 'servicecheck'

    """
    Services checks inherits from this class.
    This class should never be directly instanciated.

    Work flow:
        The main agent loop will call the check function for each instance for 
        each iteration of the loop.
        The check method will make an asynchronous call to the _process method in 
        one of the thread initiated in the thread pool created in this class constructor.
        The _process method will call the _check method of the inherited class
        which will perform the actual check.

        The _check method must return a tuple which first element is either
            Status.UP or Status.DOWN.
            The second element is a short error message that will be displayed 
            when the service turns down.

    """
    def __init__(self, name, init_config, agentConfig, instances):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        # A dictionary to keep track of service statuses
        self.statuses = {}
        self.start_pool()

    def start_pool(self):
        # The pool size should be the minimum between the number of instances
        # and the DEFAULT_SIZE_POOL. It can also be overridden by the 'threads_count'
        # parameter in the init_config of the check
        default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
        pool_size = int(self.init_config.get('threads_count', default_size))

        self.pool = Pool(pool_size)

        self.resultsq = Queue()
        self.jobs_status = {}

    def stop_pool(self):
        self.pool.terminate()

    def restart_pool(self):
        self.stop_pool()
        self.start_pool()

    def check(self, instance):
        self._process_results()
        self._clean()
        name = instance.get('name', None)
        if name is None:
            self.log.error('Each service check must have a name')
            return

        if name not in self.jobs_status: 
            # A given instance should be processed one at a time
            self.jobs_status[name] = time.time()
            self.pool.apply_async(self._process, args=(instance,))
        else:
            self.log.error("Instance: %s skipped because it's already running." % name)


    def _process(self, instance):
        name = instance.get('name', None)

        try:
            status, msg = self._check(instance)

            result = (status, msg, name, instance)
            # We put the results in the result queue
            self.resultsq.put(result)

        except Exception as e:
            self.log.exception(e)
            self.restart_pool()
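
To make the workflow concrete, here is a minimal sketch of an inheriting check;
only _check needs to be implemented, and the host/port parameters are
hypothetical:

import socket

class PortCheck(ServicesCheck):
    # Hypothetical example: TCP-connect check returning (Status, message).
    def _check(self, instance):
        host = instance.get('host', 'localhost')
        port = int(instance.get('port', 80))
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(5)
        try:
            sock.connect((host, port))
            return Status.UP, "OK"
        except socket.error as e:
            return Status.DOWN, "Connection to %s:%s failed: %s" % (host, port, e)
        finally:
            sock.close()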
Ejemplo n.º 17
0
class NetworkCheck(AgentCheck):
    SOURCE_TYPE_NAME = 'servicecheck'
    SERVICE_CHECK_PREFIX = 'network_check'

    STATUS_TO_SERVICE_CHECK = {
        Status.UP: AgentCheck.OK,
        Status.WARNING: AgentCheck.WARNING,
        Status.CRITICAL: AgentCheck.CRITICAL,
        Status.DOWN: AgentCheck.CRITICAL,
    }

    def __init__(self, name, init_config, agentConfig, instances):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        self.statuses = {}
        self.notified = {}
        self.nb_failures = 0
        self.pool_started = False

        names = []
        for inst in instances:
            name = inst.get('name', None)
            if not name:
                raise Exception("All instances should have a 'name' parameter,"
                                " error on instance: {0}".format(inst))
            if name in names:
                raise Exception(
                    "Duplicate names for instances with name {0}".format(
                        inst['name']))
            names.append(name)  # record the name so duplicates are actually detected

    def stop(self):
        self.stop_pool()
        self.pool_started = False

    def start_pool(self):
        self.log.info("Starting Thread Pool")
        default_size = min(self.instance_count(), DEFAULT_SIZE_POOL)
        self.pool_size = int(
            self.init_config.get('threads_count', default_size))

        self.pool = Pool(self.pool_size)

        self.resultsq = Queue()
        self.jobs_status = {}
        self.jobs_results = {}
        self.pool_started = True

    def stop_pool(self):
        self.log.info("Stopping Thread Pool")
        if self.pool_started:
            self.pool.terminate()
            self.pool.join()
            self.jobs_status.clear()
            assert self.pool.get_nworkers() == 0

    def restart_pool(self):
        self.stop_pool()
        self.start_pool()

    def check(self, instance):
        if not self.pool_started:
            self.start_pool()
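        # Safety valve: if worker threads leak across pool restarts, skip the
        # check rather than make the thread explosion worse.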
        if threading.activeCount() > 5 * self.pool_size + 5:
            raise Exception(
                "Thread number (%s) is exploding. Skipping this check" %
                threading.activeCount())
        self._process_results()
        self._clean()
        name = instance.get('name', None)
        if name is None:
            self.log.error('Each service check must have a name')
            return

        if name not in self.jobs_status:
            self.jobs_status[name] = time.time()
            self.jobs_results[name] = self.pool.apply_async(self._process,
                                                            args=(instance, ))
        else:
            self.log.error(
                "Instance: %s skipped because it's already running." % name)

    def _process(self, instance):
        try:
            statuses = self._check(instance)

            if isinstance(statuses, tuple):
                status, msg = statuses
                self.resultsq.put((status, msg, None, instance))

            elif isinstance(statuses, list):
                for status in statuses:
                    sc_name, status, msg = status
                    self.resultsq.put((status, msg, sc_name, instance))

        except Exception:
            result = (FAILURE, FAILURE, FAILURE, instance)
            self.resultsq.put(result)

    def _process_results(self):
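        # Drain the results queue, bounded by MAX_LOOP_ITERATIONS so a busy
        # queue cannot starve the rest of the agent loop.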
        for i in xrange(MAX_LOOP_ITERATIONS):
            try:
                status, msg, sc_name, instance = self.resultsq.get_nowait()
            except Empty:
                break

            instance_name = instance['name']
            if status == FAILURE:
                self.nb_failures += 1
                if self.nb_failures >= self.pool_size - 1:
                    self.nb_failures = 0
                    self.restart_pool()

                self._clean_job(instance_name)
                continue

            self.report_as_service_check(sc_name, status, instance, msg)

            skip_event = _is_affirmative(instance.get('skip_event', False))
            if not skip_event:
                self.warning(
                    "Using events for service checks is deprecated in favor of monitors and will be removed in future versions of the Monitor Agent."
                )
                event = None

                if instance_name not in self.statuses:
                    self.statuses[instance_name] = defaultdict(list)

                self.statuses[instance_name][sc_name].append(status)
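                # Sliding window of recent statuses: an event is emitted only
                # when DOWN results in the window reach the threshold, or when
                # the service recovers.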

                window = int(instance.get('window', 1))

                if window > 256:
                    self.log.warning(
                        "Maximum window size (256) exceeded, capping it at 256"
                    )
                    window = 256

                threshold = instance.get('threshold', 1)

                if len(self.statuses[instance_name][sc_name]) > window:
                    self.statuses[instance_name][sc_name].pop(0)

                nb_failures = self.statuses[instance_name][sc_name].count(
                    Status.DOWN)

                if nb_failures >= threshold:
                    if self.notified.get(
                        (instance_name, sc_name), Status.UP) != Status.DOWN:
                        event = self._create_status_event(
                            sc_name, status, msg, instance)
                        self.notified[(instance_name, sc_name)] = Status.DOWN
                else:
                    if self.notified.get(
                        (instance_name, sc_name), Status.UP) != Status.UP:
                        event = self._create_status_event(
                            sc_name, status, msg, instance)
                        self.notified[(instance_name, sc_name)] = Status.UP

                if event is not None:
                    self.events.append(event)

            self._clean_job(instance_name)

    def _clean_job(self, instance_name):
        if instance_name in self.jobs_status:
            self.log.debug("Instance: %s cleaned from jobs status." %
                           instance_name)
            del self.jobs_status[instance_name]

        if instance_name in self.jobs_results:
            self.log.debug("Instance: %s cleaned from jobs results." %
                           instance_name)
            ret = self.jobs_results[instance_name].get()
            if isinstance(ret, Exception):
                self.log.exception(
                    "Exception in worker thread: {0}".format(ret))
            del self.jobs_results[instance_name]

    def _check(self, instance):
        raise NotImplementedError

    def _clean(self):
        now = time.time()
        for name, start_time in self.jobs_status.iteritems():
            if now - start_time > TIMEOUT:
                self.log.critical("Restarting Pool. One check is stuck: %s" %
                                  name)
                self.restart_pool()
                break
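
Finally, a minimal sketch of an inheriting network check, showing the
list-of-(sc_name, status, msg) return shape that _process expects; the URL
handling is hypothetical:

import urllib2

class HTTPCheck(NetworkCheck):
    # Hypothetical example: reports one service check result per instance.
    def _check(self, instance):
        url = instance.get('url', 'http://localhost')
        try:
            urllib2.urlopen(url, timeout=5)
            return [('http.can_connect', Status.UP, 'OK')]
        except Exception as e:
            return [('http.can_connect', Status.DOWN, str(e))]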