Code example #1
    def __init__(self, name, init_config, instances):
        # type: (*Any, **Any) -> None
        super(GlusterfsCheck, self).__init__(name, init_config, instances)
        self._tags = self.instance.get('tags', [])

        # Check if customer set gstatus path
        if init_config.get('gstatus_path'):
            self.gstatus_cmd = init_config.get('gstatus_path')
        else:
            if os.path.exists(GSTATUS_PATH):
                self.gstatus_cmd = GSTATUS_PATH
            else:
                raise ConfigurationError(
                    'Glusterfs check requires `gstatus` to be installed or set the path to the installed version.'
                )
        self.log.debug("Using gstatus path `%s`", self.gstatus_cmd)
        self.use_sudo = is_affirmative(self.instance.get('use_sudo', True))
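The path resolution above reads as a small standalone helper. Below is a minimal sketch of that fallback order under stated assumptions: the GSTATUS_PATH value and the ValueError are placeholders (the check uses its own module-level constant and raises ConfigurationError).

import os

# Hypothetical default path for the sketch; the real constant lives in the check module.
GSTATUS_PATH = '/opt/datadog-agent/embedded/sbin/gstatus'

def resolve_gstatus_cmd(init_config):
    # Mirrors the fallback above: an explicit `gstatus_path` wins,
    # otherwise the bundled binary must already exist on disk.
    configured = init_config.get('gstatus_path')
    if configured:
        return configured
    if os.path.exists(GSTATUS_PATH):
        return GSTATUS_PATH
    raise ValueError('gstatus not found: install it or set `gstatus_path` in init_config')

print(resolve_gstatus_cmd({'gstatus_path': '/usr/local/bin/gstatus'}))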
Code example #2
    def check(self, instance):
        # Get the configuration for this specific instance
        scraper_config = self.get_scraper_config(instance)
        # Set up metric_transformers
        transformers = {}
        for metric_from, metric_to in TRANSFORM_VALUE_HISTOGRAMS.items():
            transformers[metric_from] = self._histogram_from_microseconds_to_seconds(metric_to)
        for metric_from, metric_to in TRANSFORM_VALUE_SUMMARIES.items():
            transformers[metric_from] = self._summary_from_microseconds_to_seconds(metric_to)

        self.process(scraper_config, metric_transformers=transformers)
        # Check the leader-election status
        if is_affirmative(instance.get('leader_election', True)):
            leader_config = self.LEADER_ELECTION_CONFIG
            leader_config["tags"] = instance.get("tags", [])
            leader_config["record_kind"] = instance.get('leader_election_kind', 'auto')
            self.check_election_status(leader_config)

        self._perform_service_check(instance)
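The transformer dictionary built above maps source metric names to callables that rename and rescale values. A minimal sketch of that idea, assuming a made-up mapping entry and a simple closure in place of the check's `_histogram_from_microseconds_to_seconds` helper:

# The mapping entry and the closure are illustrative assumptions, not the check's exact constants.
TRANSFORM_VALUE_HISTOGRAMS = {
    'apiserver_request_duration_microseconds': 'apiserver.request_duration.seconds',
}

def histogram_from_microseconds_to_seconds(new_name):
    def transform(value):
        return new_name, value / 1e6
    return transform

transformers = {src: histogram_from_microseconds_to_seconds(dst)
                for src, dst in TRANSFORM_VALUE_HISTOGRAMS.items()}

print(transformers['apiserver_request_duration_microseconds'](2000000))
# ('apiserver.request_duration.seconds', 2.0)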
Code example #3
    def __init__(self, name, init_config, instances):
        super(SQLServer, self).__init__(name, init_config, instances)

        self.connection = None
        self.failed_connections = {}
        self.instance_metrics = []
        self.instance_per_type_metrics = defaultdict(list)
        self.do_check = True

        self.autodiscovery = is_affirmative(
            self.instance.get('database_autodiscovery'))
        if self.autodiscovery and self.instance.get('database'):
            self.log.warning(
                'sqlserver `database_autodiscovery` and `database` options defined in same instance - '
                'autodiscovery will take precedence.')
        self.autodiscovery_include = self.instance.get('autodiscovery_include',
                                                       ['.*'])
        self.autodiscovery_exclude = self.instance.get('autodiscovery_exclude',
                                                       [])
        self._compile_patterns()
        self.autodiscovery_interval = self.instance.get(
            'autodiscovery_interval', DEFAULT_AUTODISCOVERY_INTERVAL)
        self.databases = set()
        self.ad_last_check = 0

        self.proc = self.instance.get('stored_procedure')
        self.proc_type_mapping = {
            'gauge': self.gauge,
            'rate': self.rate,
            'histogram': self.histogram
        }
        self.custom_metrics = init_config.get('custom_metrics', [])

        # use QueryManager to process custom queries
        self._query_manager = QueryManager(self,
                                           self.execute_query_raw,
                                           queries=[],
                                           tags=self.instance.get("tags", []))
        self.check_initializations.append(self._query_manager.compile_queries)
        self.check_initializations.append(self.initialize_connection)
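`autodiscovery_include` and `autodiscovery_exclude` are lists of regular expressions compiled by `_compile_patterns()`. A rough sketch of the filtering they imply (not the check's exact code):

import re

def filter_databases(all_dbs, include=('.*',), exclude=()):
    # Exclude patterns win over include patterns, mirroring the documented behavior.
    inc = [re.compile(p, re.IGNORECASE) for p in include]
    exc = [re.compile(p, re.IGNORECASE) for p in exclude]
    return {db for db in all_dbs
            if not any(p.match(db) for p in exc) and any(p.match(db) for p in inc)}

print(sorted(filter_databases(['master', 'tempdb', 'sales'], include=['.*'], exclude=['temp.*'])))
# ['master', 'sales']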
Code example #4
    def _process_mor_objects_queue(self, instance):
        """
        Pops `batch_morlist_size` items from the mor objects queue and runs the
        _process_mor_objects_queue_async method asynchronously to fill the Mor cache.
        """
        i_key = self._instance_key(instance)
        self.mor_cache.init_instance(i_key)

        if not self.mor_objects_queue.contains(i_key):
            self.log.debug("Objects queue is not initialized yet for instance %s, skipping processing", i_key)
            return

        for resource_type in RESOURCE_TYPE_METRICS:
            # Batch size can prevent querying large payloads at once if the environment is too large
            # If batch size is set to 0, process everything at once
            batch_size = self.batch_morlist_size or self.mor_objects_queue.size(i_key, resource_type)
            while self.mor_objects_queue.size(i_key, resource_type):
                mors = []
                for _ in range(batch_size):
                    mor = self.mor_objects_queue.pop(i_key, resource_type)
                    if mor is None:
                        self.log.debug("No more objects of type '%s' left in the queue", ensure_unicode(resource_type))
                        break

                    mor_name = str(mor['mor'])
                    mor['interval'] = REAL_TIME_INTERVAL if mor['mor_type'] in REALTIME_RESOURCES else None
                    # Always update the cache to account for Mors that might have changed parent
                    # in the meantime (e.g. a migrated VM).
                    self.mor_cache.set_mor(i_key, mor_name, mor)

                    # Only do this for non real-time resources i.e. datacenter, datastore and cluster
                    # For hosts and VMs, we can rely on a precomputed list of metrics
                    realtime_only = is_affirmative(instance.get("collect_realtime_only", True))
                    if mor["mor_type"] not in REALTIME_RESOURCES and not realtime_only:
                        mors.append(mor)

                # We will actually schedule jobs for non realtime resources only.
                if mors:
                    self.pool.apply_async(self._process_mor_objects_queue_async, args=(instance, mors))
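The batching logic above (a batch size of 0 means "everything at once") is easy to isolate. A minimal sketch with a plain deque standing in for `mor_objects_queue`:

from collections import deque

def drain_in_batches(queue, batch_size):
    # Same pattern as above: batch_size = 0 falls back to the full queue length.
    size = batch_size or len(queue)
    while queue:
        batch = []
        for _ in range(size):
            if not queue:
                break
            batch.append(queue.popleft())
        yield batch

for batch in drain_in_batches(deque(range(7)), 3):
    print(batch)  # [0, 1, 2] then [3, 4, 5] then [6]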
Code example #5
    def initialize_connection(self):
        self.connection = Connection(self.init_config, self.instance,
                                     self.handle_service_check)

        # Pre-process the list of metrics to collect
        try:
            # check to see if the database exists before we try any connections to it
            db_exists, context = self.connection.check_database()

            if db_exists:
                if self.instance.get('stored_procedure') is None:
                    with self.connection.open_managed_default_connection():
                        with self.connection.get_managed_cursor() as cursor:
                            self.autodiscover_databases(cursor)
                        self._make_metric_list_to_collect(self.custom_metrics)
            else:
                # How much do we care that the DB doesn't exist?
                ignore = is_affirmative(
                    self.instance.get("ignore_missing_database", False))
                if ignore is not None and ignore:
                    # not much : we expect it. leave checks disabled
                    self.do_check = False
                    self.log.warning(
                        "Database %s does not exist. Disabling checks for this instance.",
                        context)
                else:
                    # yes we do. Keep trying
                    msg = "Database {} does not exist. Please resolve invalid database and restart agent".format(
                        context)
                    raise ConfigurationError(msg)

        except SQLConnectionError as e:
            self.log.exception("Error connecting to database: %s", e)
        except ConfigurationError:
            raise
        except Exception as e:
            self.log.exception("Initialization exception %s", e)
Code example #6
    def _create_kubernetes_state_prometheus_instance(self, instance):
        """
        Set up the kubernetes_state instance so it can be used in OpenMetricsBaseCheck
        """
        ksm_instance = deepcopy(instance)
        endpoint = instance.get('kube_state_url')
        if endpoint is None:
            raise CheckException("Unable to find kube_state_url in config file.")

        extra_labels = ksm_instance.get('label_joins', {})
        hostname_override = is_affirmative(ksm_instance.get('hostname_override', True))
        join_kube_labels = is_affirmative(ksm_instance.get('join_kube_labels', False))
        join_standard_tags = is_affirmative(ksm_instance.get('join_standard_tags', False))

        ksm_instance.update(
            {
                'namespace': 'kubernetes_state',
                'metrics': [
                    {
                        'kube_daemonset_status_current_number_scheduled': 'daemonset.scheduled',
                        'kube_daemonset_status_desired_number_scheduled': 'daemonset.desired',
                        'kube_daemonset_status_number_misscheduled': 'daemonset.misscheduled',
                        'kube_daemonset_status_number_ready': 'daemonset.ready',
                        'kube_daemonset_updated_number_scheduled': 'daemonset.updated',
                        'kube_deployment_spec_paused': 'deployment.paused',
                        'kube_deployment_spec_replicas': 'deployment.replicas_desired',
                        'kube_deployment_spec_strategy_rollingupdate_max_unavailable': 'deployment.rollingupdate.max_unavailable',  # noqa: E501
                        'kube_deployment_status_replicas': 'deployment.replicas',
                        'kube_deployment_status_replicas_available': 'deployment.replicas_available',
                        'kube_deployment_status_replicas_unavailable': 'deployment.replicas_unavailable',
                        'kube_deployment_status_replicas_updated': 'deployment.replicas_updated',
                        'kube_endpoint_address_available': 'endpoint.address_available',
                        'kube_endpoint_address_not_ready': 'endpoint.address_not_ready',
                        'kube_endpoint_created': 'endpoint.created',
                        'kube_hpa_spec_min_replicas': 'hpa.min_replicas',
                        'kube_hpa_spec_max_replicas': 'hpa.max_replicas',
                        'kube_hpa_status_desired_replicas': 'hpa.desired_replicas',
                        'kube_hpa_status_current_replicas': 'hpa.current_replicas',
                        'kube_hpa_status_condition': 'hpa.condition',
                        'kube_node_info': 'node.count',
                        'kube_node_status_allocatable_cpu_cores': 'node.cpu_allocatable',
                        'kube_node_status_allocatable_memory_bytes': 'node.memory_allocatable',
                        'kube_node_status_allocatable_pods': 'node.pods_allocatable',
                        'kube_node_status_capacity_cpu_cores': 'node.cpu_capacity',
                        'kube_node_status_capacity_memory_bytes': 'node.memory_capacity',
                        'kube_node_status_capacity_pods': 'node.pods_capacity',
                        'kube_node_status_allocatable_nvidia_gpu_cards': 'node.gpu.cards_allocatable',
                        'kube_node_status_capacity_nvidia_gpu_cards': 'node.gpu.cards_capacity',
                        'kube_pod_container_status_terminated': 'container.terminated',
                        'kube_pod_container_status_waiting': 'container.waiting',
                        'kube_persistentvolumeclaim_status_phase': 'persistentvolumeclaim.status',
                        'kube_persistentvolumeclaim_resource_requests_storage_bytes': 'persistentvolumeclaim.request_storage',  # noqa: E501
                        'kube_pod_container_resource_limits_cpu_cores': 'container.cpu_limit',
                        'kube_pod_container_resource_limits_memory_bytes': 'container.memory_limit',
                        'kube_pod_container_resource_requests_cpu_cores': 'container.cpu_requested',
                        'kube_pod_container_resource_requests_memory_bytes': 'container.memory_requested',
                        'kube_pod_container_status_ready': 'container.ready',
                        'kube_pod_container_status_restarts': 'container.restarts',  # up to kube-state-metrics 1.1.x
                        'kube_pod_container_status_restarts_total': 'container.restarts',  # noqa: E501, from kube-state-metrics 1.2.0
                        'kube_pod_container_status_running': 'container.running',
                        'kube_pod_container_resource_requests_nvidia_gpu_devices': 'container.gpu.request',
                        'kube_pod_container_resource_limits_nvidia_gpu_devices': 'container.gpu.limit',
                        'kube_pod_status_ready': 'pod.ready',
                        'kube_pod_status_scheduled': 'pod.scheduled',
                        'kube_pod_status_unschedulable': 'pod.unschedulable',
                        'kube_poddisruptionbudget_status_current_healthy': 'pdb.pods_healthy',
                        'kube_poddisruptionbudget_status_desired_healthy': 'pdb.pods_desired',
                        'kube_poddisruptionbudget_status_pod_disruptions_allowed': 'pdb.disruptions_allowed',
                        'kube_poddisruptionbudget_status_expected_pods': 'pdb.pods_total',
                        'kube_replicaset_spec_replicas': 'replicaset.replicas_desired',
                        'kube_replicaset_status_fully_labeled_replicas': 'replicaset.fully_labeled_replicas',
                        'kube_replicaset_status_ready_replicas': 'replicaset.replicas_ready',
                        'kube_replicaset_status_replicas': 'replicaset.replicas',
                        'kube_replicationcontroller_spec_replicas': 'replicationcontroller.replicas_desired',
                        'kube_replicationcontroller_status_available_replicas': 'replicationcontroller.replicas_available',  # noqa: E501
                        'kube_replicationcontroller_status_fully_labeled_replicas': 'replicationcontroller.fully_labeled_replicas',  # noqa: E501
                        'kube_replicationcontroller_status_ready_replicas': 'replicationcontroller.replicas_ready',
                        'kube_replicationcontroller_status_replicas': 'replicationcontroller.replicas',
                        'kube_statefulset_replicas': 'statefulset.replicas_desired',
                        'kube_statefulset_status_replicas': 'statefulset.replicas',
                        'kube_statefulset_status_replicas_current': 'statefulset.replicas_current',
                        'kube_statefulset_status_replicas_ready': 'statefulset.replicas_ready',
                        'kube_statefulset_status_replicas_updated': 'statefulset.replicas_updated',
                        'kube_verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound': (
                            'vpa.lower_bound'
                        ),
                        'kube_verticalpodautoscaler_status_recommendation_containerrecommendations_target': (
                            'vpa.target'
                        ),
                        'kube_verticalpodautoscaler_status_recommendation_containerrecommendations_uncappedtarget': (
                            'vpa.uncapped_target'
                        ),
                        'kube_verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound': (
                            'vpa.upperbound'
                        ),
                        'kube_verticalpodautoscaler_spec_updatepolicy_updatemode': 'vpa.update_mode',
                    }
                ],
                'ignore_metrics': [
                    # _info, _labels and _created don't convey any metric
                    'kube_cronjob_info',
                    'kube_cronjob_created',
                    'kube_daemonset_created',
                    'kube_deployment_created',
                    'kube_deployment_labels',
                    'kube_job_created',
                    'kube_job_info',
                    'kube_limitrange_created',
                    'kube_namespace_created',
                    'kube_namespace_labels',
                    'kube_node_created',
                    'kube_node_labels',
                    'kube_pod_created',
                    'kube_pod_container_info',
                    'kube_pod_info',
                    'kube_pod_owner',
                    'kube_pod_start_time',
                    'kube_pod_labels',
                    'kube_poddisruptionbudget_created',
                    'kube_replicaset_created',
                    'kube_replicationcontroller_created',
                    'kube_resourcequota_created',
                    'kube_replicaset_owner',
                    'kube_service_created',
                    'kube_service_info',
                    'kube_service_labels',
                    'kube_service_spec_external_ip',
                    'kube_service_status_load_balancer_ingress',
                    'kube_statefulset_labels',
                    'kube_statefulset_created',
                    'kube_statefulset_status_current_revision',
                    'kube_statefulset_status_update_revision',
                    # Already provided by the kubelet integration
                    'kube_pod_container_status_last_terminated_reason',
                    # _generation metrics are more metadata than metrics, no real use case for now
                    'kube_daemonset_metadata_generation',
                    'kube_deployment_metadata_generation',
                    'kube_deployment_status_observed_generation',
                    'kube_replicaset_metadata_generation',
                    'kube_replicaset_status_observed_generation',
                    'kube_replicationcontroller_metadata_generation',
                    'kube_replicationcontroller_status_observed_generation',
                    'kube_statefulset_metadata_generation',
                    'kube_statefulset_status_observed_generation',
                    'kube_hpa_metadata_generation',
                    # kube_node_status_phase and kube_namespace_status_phase have no use case as a service check
                    'kube_namespace_status_phase',
                    'kube_node_status_phase',
                    # These CronJob and Job metrics need use cases to determine how to implement them
                    'kube_cronjob_status_active',
                    'kube_cronjob_status_last_schedule_time',
                    'kube_cronjob_spec_suspend',
                    'kube_cronjob_spec_starting_deadline_seconds',
                    'kube_job_spec_active_dealine_seconds',
                    'kube_job_spec_completions',
                    'kube_job_spec_parallelism',
                    'kube_job_status_active',
                    'kube_job_status_completion_time',  # We could compute the duration=completion-start as a gauge
                    'kube_job_status_start_time',
                    'kube_verticalpodautoscaler_labels',
                ],
                'label_joins': {
                    'kube_pod_info': {'labels_to_match': ['pod', 'namespace'], 'labels_to_get': ['node']},
                    'kube_pod_status_phase': {'labels_to_match': ['pod', 'namespace'], 'labels_to_get': ['phase']},
                    'kube_persistentvolume_info': {
                        'labels_to_match': ['persistentvolume'],  # Persistent Volumes are not namespaced
                        'labels_to_get': ['storageclass'],
                    },
                    'kube_persistentvolumeclaim_info': {
                        'labels_to_match': ['persistentvolumeclaim', 'namespace'],
                        'labels_to_get': ['storageclass'],
                    },
                },
                # Defaults that were set when kubernetes_state was based on PrometheusCheck
                'send_monotonic_counter': ksm_instance.get('send_monotonic_counter', False),
                'health_service_check': ksm_instance.get('health_service_check', False),
            }
        )

        experimental_metrics_mapping = {
            'kube_hpa_spec_target_metric': 'hpa.spec_target_metric',
            'kube_verticalpodautoscaler_spec_resourcepolicy_container_policies_minallowed': (
                'vpa.spec_container_minallowed'
            ),
            'kube_verticalpodautoscaler_spec_resourcepolicy_container_policies_maxallowed': (
                'vpa.spec_container_maxallowed'
            ),
        }
        experimental_metrics = is_affirmative(ksm_instance.get('experimental_metrics', False))
        if experimental_metrics:
            ksm_instance['metrics'].append(experimental_metrics_mapping)
        else:
            ksm_instance['ignore_metrics'].extend(experimental_metrics_mapping.keys())

        ksm_instance['prometheus_url'] = endpoint

        if join_kube_labels:
            ksm_instance['label_joins'].update(
                {
                    'kube_pod_labels': {'labels_to_match': ['pod', 'namespace'], 'labels_to_get': ['*']},
                    'kube_deployment_labels': {'labels_to_match': ['deployment', 'namespace'], 'labels_to_get': ['*']},
                    'kube_daemonset_labels': {'labels_to_match': ['daemonset', 'namespace'], 'labels_to_get': ['*']},
                }
            )

        labels_to_get = [
            "label_tags_datadoghq_com_env",
            "label_tags_datadoghq_com_service",
            "label_tags_datadoghq_com_version",
        ]

        if join_standard_tags:
            ksm_instance['label_joins'].update(
                {
                    "kube_pod_labels": {"labels_to_match": ["pod", "namespace"], "labels_to_get": labels_to_get},
                    "kube_deployment_labels": {
                        "labels_to_match": ["deployment", "namespace"],
                        "labels_to_get": labels_to_get,
                    },
                    "kube_replicaset_labels": {
                        "labels_to_match": ["replicaset", "namespace"],
                        "labels_to_get": labels_to_get,
                    },
                    "kube_daemonset_labels": {
                        "labels_to_match": ["daemonset", "namespace"],
                        "labels_to_get": labels_to_get,
                    },
                    "kube_statefulset_labels": {
                        "labels_to_match": ["statefulset", "namespace"],
                        "labels_to_get": labels_to_get,
                    },
                    "kube_job_labels": {"labels_to_match": ["job_name", "namespace"], "labels_to_get": labels_to_get},
                }
            )

            ksm_instance.setdefault("labels_mapper", {}).update(
                {
                    "label_tags_datadoghq_com_env": "env",
                    "label_tags_datadoghq_com_service": "service",
                    "label_tags_datadoghq_com_version": "version",
                }
            )

        ksm_instance['label_joins'].update(extra_labels)
        if hostname_override:
            ksm_instance['label_to_hostname'] = 'node'
            clustername = get_clustername()
            if clustername != "":
                ksm_instance['label_to_hostname_suffix'] = "-" + clustername

        if 'labels_mapper' in ksm_instance and not isinstance(ksm_instance['labels_mapper'], dict):
            self.log.warning("Option labels_mapper should be a dictionary for %s", endpoint)

        return ksm_instance
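The `label_joins` entries configured above tell the scraper to copy labels from a join-source metric onto samples that share the match labels. A toy model of that mechanism, with simplified sample dicts that are assumptions rather than the OpenMetrics scraper's internals:

def join_labels(samples, source, labels_to_match, labels_to_get):
    # Build an index of the join source keyed by the match labels, then enrich matching samples.
    index = {tuple(row[l] for l in labels_to_match): row for row in source}
    for sample in samples:
        row = index.get(tuple(sample[l] for l in labels_to_match))
        if row:
            sample.update({l: row[l] for l in labels_to_get if l in row})
    return samples

pod_info = [{'pod': 'web-1', 'namespace': 'default', 'node': 'node-a'}]
samples = [{'metric': 'kube_pod_status_ready', 'pod': 'web-1', 'namespace': 'default', 'value': 1}]
print(join_labels(samples, pod_info, ['pod', 'namespace'], ['node']))
# the sample gains a 'node': 'node-a' label, as kube_pod_info joins do above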
Code example #7
File: vsphere.py  Project: nikking/integrations-core
    def collect_metrics(self, instance):
        """
        Calls _collect_metrics_async asynchronously on all MORs; as the
        job queue is processed, the Aggregator receives the metrics.
        """
        i_key = self._instance_key(instance)
        if not self.mor_cache.contains(i_key):
            self.log.debug(
                "Not collecting metrics for instance '%s', nothing to do yet.",
                i_key)
            return

        server_instance = self._get_server_instance(instance)
        max_historical_metrics = DEFAULT_MAX_HIST_METRICS

        if self._should_collect_historical(instance):
            try:
                if 'max_query_metrics' in instance:
                    max_historical_metrics = int(instance['max_query_metrics'])
                    self.log.info("Collecting up to %d metrics",
                                  max_historical_metrics)
                else:
                    vcenter_settings = server_instance.content.setting.QueryOptions(
                        "config.vpxd.stats.maxQueryMetrics")
                    max_historical_metrics = int(vcenter_settings[0].value)
                if max_historical_metrics < 0:
                    max_historical_metrics = float('inf')
            except Exception:
                pass

        # TODO: Remove me once the fix for `max_query_metrics` is here by default
        mors_batch_method = (self.mor_cache.mors_batch if is_affirmative(
            instance.get('fix_max_query_metrics')) else
                             self.mor_cache.legacy_mors_batch)

        vm_count = 0
        custom_tags = instance.get('tags', [])
        tags = [
            "vcenter_server:{}".format(ensure_unicode(instance.get('name')))
        ] + custom_tags

        n_mors = self.mor_cache.instance_size(i_key)
        if not n_mors:
            if self._is_main_instance(instance):
                self.gauge('vsphere.vm.count', vm_count, tags=tags)
            self.log.debug(
                "No Mor objects to process for instance '%s', skip...", i_key)
            return

        self.log.debug("Collecting metrics for %s mors",
                       ensure_unicode(n_mors))

        # Request metrics for several objects at once. We can limit the number of objects with batch_size
        # If batch_size is 0, process everything at once
        batch_size = self.batch_morlist_size or n_mors
        for batch in mors_batch_method(i_key, batch_size,
                                       max_historical_metrics):
            query_specs = []
            for mor in itervalues(batch):
                if mor['mor_type'] == 'vm':
                    vm_count += 1
                if mor['mor_type'] not in REALTIME_RESOURCES and (
                        'metrics' not in mor or not mor['metrics']):
                    continue

                query_spec = vim.PerformanceManager.QuerySpec()
                query_spec.entity = mor["mor"]
                query_spec.intervalId = mor.get("interval")
                query_spec.maxSample = 1
                if mor['mor_type'] in REALTIME_RESOURCES:
                    query_spec.metricId = self.metadata_cache.get_metric_ids(
                        i_key)
                else:
                    query_spec.metricId = mor["metrics"]
                query_specs.append(query_spec)

            if query_specs:
                self.pool.apply_async(self._collect_metrics_async,
                                      args=(instance, query_specs))

        if self._is_main_instance(instance):
            self.gauge('vsphere.vm.count', vm_count, tags=tags)
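The `max_historical_metrics` fallback above (instance option first, then the vCenter `config.vpxd.stats.maxQueryMetrics` setting, with negative values treated as unlimited) can be summarized as a small helper. A sketch, with an assumed default value:

DEFAULT_MAX_HIST_METRICS = 64  # assumed default for the sketch

def resolve_max_historical_metrics(instance, query_vcenter_setting):
    # Same fallback order as above; on any failure the default is kept.
    max_metrics = DEFAULT_MAX_HIST_METRICS
    try:
        if 'max_query_metrics' in instance:
            max_metrics = int(instance['max_query_metrics'])
        else:
            max_metrics = int(query_vcenter_setting())
        if max_metrics < 0:
            max_metrics = float('inf')
    except Exception:
        pass
    return max_metrics

print(resolve_max_historical_metrics({'max_query_metrics': -1}, lambda: 256))  # inf (unlimited)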
Code example #8
File: vsphere.py  Project: nikking/integrations-core
    @staticmethod
    def _is_main_instance(instance):
        """The 'main' instance is the one reporting events, service_checks, external host tags and realtime metrics.
        Note: the main instance can also report `historical` metrics for legacy reasons.
        """
        return not is_affirmative(
            instance.get('collect_historical_only', False))
Code example #9
    def __init__(self, name, init_config, instances):
        # We do not support more than one instance of kube-state-metrics
        instance = instances[0]
        kubernetes_state_instance = self._create_kubernetes_state_prometheus_instance(
            instance)

        # First deprecation phase: we keep ksm labels by default
        # Next iteration: remove ksm labels by default
        # Last iteration: remove this option
        self.keep_ksm_labels = is_affirmative(
            kubernetes_state_instance.get('keep_ksm_labels', True))

        generic_instances = [kubernetes_state_instance]
        super(KubernetesState, self).__init__(name,
                                              init_config,
                                              instances=generic_instances)

        self.condition_to_status_positive = {
            'true': self.OK,
            'false': self.CRITICAL,
            'unknown': self.UNKNOWN
        }

        self.condition_to_status_negative = {
            'true': self.CRITICAL,
            'false': self.OK,
            'unknown': self.UNKNOWN
        }

        # Parameters for the count_objects_by_tags method
        self.object_count_params = {
            'kube_persistentvolume_status_phase': {
                'metric_name': 'persistentvolumes.by_phase',
                'allowed_labels': ['storageclass', 'phase'],
            },
            'kube_service_spec_type': {
                'metric_name': 'service.count',
                'allowed_labels': ['namespace', 'type']
            },
        }

        self.METRIC_TRANSFORMERS = {
            'kube_pod_status_phase': self.kube_pod_status_phase,
            'kube_pod_container_status_waiting_reason':
            self.kube_pod_container_status_waiting_reason,
            'kube_pod_container_status_terminated_reason':
            self.kube_pod_container_status_terminated_reason,
            'kube_cronjob_next_schedule_time':
            self.kube_cronjob_next_schedule_time,
            'kube_job_complete': self.kube_job_complete,
            'kube_job_failed': self.kube_job_failed,
            'kube_job_status_failed': self.kube_job_status_failed,
            'kube_job_status_succeeded': self.kube_job_status_succeeded,
            'kube_node_status_condition': self.kube_node_status_condition,
            'kube_node_status_ready': self.kube_node_status_ready,
            'kube_node_status_out_of_disk': self.kube_node_status_out_of_disk,
            'kube_node_status_memory_pressure':
            self.kube_node_status_memory_pressure,
            'kube_node_status_disk_pressure':
            self.kube_node_status_disk_pressure,
            'kube_node_status_network_unavailable':
            self.kube_node_status_network_unavailable,
            'kube_node_spec_unschedulable': self.kube_node_spec_unschedulable,
            'kube_resourcequota': self.kube_resourcequota,
            'kube_limitrange': self.kube_limitrange,
            'kube_persistentvolume_status_phase': self.count_objects_by_tags,
            'kube_service_spec_type': self.count_objects_by_tags,
        }

        # Handling cron jobs succeeded/failed counts
        self.failed_cron_job_counts = defaultdict(KubernetesState.CronJobCount)
        self.succeeded_cron_job_counts = defaultdict(
            KubernetesState.CronJobCount)

        # Logic for Jobs
        self.job_succeeded_count = defaultdict(int)
        self.job_failed_count = defaultdict(int)
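The two condition tables above encode that "positive" conditions (such as Ready) are healthy when true, while "negative" ones (such as MemoryPressure) are critical when true. A minimal sketch, with numeric stand-ins for the AgentCheck status constants:

# The numeric values are assumptions standing in for AgentCheck.OK/CRITICAL/UNKNOWN.
OK, CRITICAL, UNKNOWN = 0, 2, 3

condition_to_status_positive = {'true': OK, 'false': CRITICAL, 'unknown': UNKNOWN}
condition_to_status_negative = {'true': CRITICAL, 'false': OK, 'unknown': UNKNOWN}

print(condition_to_status_positive['true'], condition_to_status_negative['true'])  # 0 2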
Code example #10
    def _make_metric_list_to_collect(self, custom_metrics):
        """
        Store the list of metrics to collect by instance_key.
        Will also create and cache cursors to query the db.
        """

        metrics_to_collect = []
        tags = self.instance.get('tags', [])

        # Load instance-level (previously Performance) metrics
        # If several check instances are querying the same server host, it can be wise to turn these off
        # to avoid sending duplicate metrics
        if is_affirmative(self.instance.get('include_instance_metrics', True)):
            self._add_performance_counters(
                chain(INSTANCE_METRICS, INSTANCE_METRICS_TOTAL), metrics_to_collect, tags, db=None
            )

        # populated through autodiscovery
        if self.databases:
            for db in self.databases:
                self._add_performance_counters(INSTANCE_METRICS_TOTAL, metrics_to_collect, tags, db=db)

        # Load database statistics
        for name, table, column in DATABASE_METRICS:
            # include database as a filter option
            db_names = self.databases or [self.instance.get('database', self.connection.DEFAULT_DATABASE)]
            for db_name in db_names:
                cfg = {'name': name, 'table': table, 'column': column, 'instance_name': db_name, 'tags': tags}
                metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load AlwaysOn metrics
        if is_affirmative(self.instance.get('include_ao_metrics', False)):
            for name, table, column in AO_METRICS + AO_METRICS_PRIMARY + AO_METRICS_SECONDARY:
                db_name = 'master'
                cfg = {
                    'name': name,
                    'table': table,
                    'column': column,
                    'instance_name': db_name,
                    'tags': tags,
                    'ao_database': self.instance.get('ao_database', None),
                    'availability_group': self.instance.get('availability_group', None),
                    'only_emit_local': is_affirmative(self.instance.get('only_emit_local', False)),
                }
                metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load FCI metrics
        if is_affirmative(self.instance.get('include_fci_metrics', False)):
            for name, table, column in FCI_METRICS:
                cfg = {
                    'name': name,
                    'table': table,
                    'column': column,
                    'tags': tags,
                }
                metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load metrics from scheduler and task tables, if enabled
        if is_affirmative(self.instance.get('include_task_scheduler_metrics', False)):
            for name, table, column in TASK_SCHEDULER_METRICS:
                cfg = {'name': name, 'table': table, 'column': column, 'tags': tags}
                metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load DB Fragmentation metrics
        if is_affirmative(self.instance.get('include_db_fragmentation_metrics', False)):
            db_fragmentation_object_names = self.instance.get('db_fragmentation_object_names', [])
            db_names = self.databases or [self.instance.get('database', self.connection.DEFAULT_DATABASE)]

            if not db_fragmentation_object_names:
                self.log.debug(
                    "No fragmentation object names specified, will return fragmentation metrics for all "
                    "object_ids of current database(s): %s",
                    db_names,
                )

            for db_name in db_names:
                for name, table, column in DATABASE_FRAGMENTATION_METRICS:
                    cfg = {
                        'name': name,
                        'table': table,
                        'column': column,
                        'instance_name': db_name,
                        'tags': tags,
                        'db_fragmentation_object_names': db_fragmentation_object_names,
                    }
                    metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load any custom metrics from conf.d/sqlserver.yaml
        for cfg in custom_metrics:
            sql_type = None
            base_name = None

            custom_tags = tags + cfg.get('tags', [])
            cfg['tags'] = custom_tags

            db_table = cfg.get('table', DEFAULT_PERFORMANCE_TABLE)
            if db_table not in VALID_TABLES:
                self.log.error('%s has an invalid table name: %s', cfg['name'], db_table)
                continue

            if cfg.get('database', None) and cfg.get('database') != self.instance.get('database'):
                self.log.debug(
                    'Skipping custom metric %s for database %s, check instance configured for database %s',
                    cfg['name'],
                    cfg.get('database'),
                    self.instance.get('database'),
                )
                continue

            if db_table == DEFAULT_PERFORMANCE_TABLE:
                user_type = cfg.get('type')
                if user_type is not None and user_type not in VALID_METRIC_TYPES:
                    self.log.error('%s has an invalid metric type: %s', cfg['name'], user_type)
                sql_type = None
                try:
                    if user_type is None:
                        sql_type, base_name = self.get_sql_type(cfg['counter_name'])
                except Exception:
                    self.log.warning("Can't load the metric %s, ignoring", cfg['name'], exc_info=True)
                    continue

                metrics_to_collect.append(
                    self.typed_metric(
                        cfg_inst=cfg, table=db_table, base_name=base_name, user_type=user_type, sql_type=sql_type
                    )
                )

            else:
                for column in cfg['columns']:
                    metrics_to_collect.append(
                        self.typed_metric(
                            cfg_inst=cfg, table=db_table, base_name=base_name, sql_type=sql_type, column=column
                        )
                    )

        self.instance_metrics = metrics_to_collect
        self.log.debug("metrics to collect %s", metrics_to_collect)

        # create an organized grouping of metric names to their metric classes
        for m in metrics_to_collect:
            cls = m.__class__.__name__
            name = m.sql_name or m.column
            self.log.debug("Adding metric class %s named %s", cls, name)

            self.instance_per_type_metrics[cls].append(name)
            if m.base_name:
                self.instance_per_type_metrics[cls].append(m.base_name)
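The grouping loop at the end builds a class-name-to-metric-names index. A self-contained sketch of that step, using a hypothetical metric class in place of the check's real ones:

from collections import defaultdict

class SqlSimpleMetric:  # hypothetical stand-in for the check's metric classes
    def __init__(self, sql_name=None, column=None, base_name=None):
        self.sql_name, self.column, self.base_name = sql_name, column, base_name

metrics_to_collect = [
    SqlSimpleMetric(sql_name='Buffer cache hit ratio', base_name='Buffer cache hit ratio base'),
    SqlSimpleMetric(column='virtual_file_stats'),
]

instance_per_type_metrics = defaultdict(list)
for m in metrics_to_collect:
    # Same logic as above: prefer sql_name, fall back to column, and track base counters too.
    name = m.sql_name or m.column
    instance_per_type_metrics[m.__class__.__name__].append(name)
    if m.base_name:
        instance_per_type_metrics[m.__class__.__name__].append(m.base_name)

print(dict(instance_per_type_metrics))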
Code example #11
    def __init__(self, name, init_config, instances):
        super(SQLServer, self).__init__(name, init_config, instances)

        # Cache connections
        self.connections = {}
        self.failed_connections = {}
        self.instances_metrics = {}
        self.instances_per_type_metrics = defaultdict(dict)
        self.existing_databases = None
        self.do_check = {}
        self.proc_type_mapping = {
            'gauge': self.gauge,
            'rate': self.rate,
            'histogram': self.histogram
        }
        self.adoprovider = self.default_adoprovider

        self.connector = init_config.get('connector', 'adodbapi')
        if self.connector.lower() not in self.valid_connectors:
            self.log.error(
                "Invalid database connector %s, defaulting to adodbapi",
                self.connector)
            self.connector = 'adodbapi'

        self.adoprovider = init_config.get('adoprovider',
                                           self.default_adoprovider)
        if self.adoprovider.upper() not in self.valid_adoproviders:
            self.log.error(
                "Invalid ADODB provider string %s, defaulting to %s",
                self.adoprovider, self.default_adoprovider)
            self.adoprovider = self.default_adoprovider

        # Pre-process the list of metrics to collect
        self.custom_metrics = init_config.get('custom_metrics', [])
        for instance in instances:
            try:
                instance_key = self._conn_key(instance, self.DEFAULT_DB_KEY)
                self.do_check[instance_key] = True

                # check to see if the database exists before we try any connections to it
                with self.open_managed_db_connections(
                        instance, None, db_name=self.DEFAULT_DATABASE):
                    db_exists, context = self._check_db_exists(instance)

                if db_exists:
                    if instance.get('stored_procedure') is None:
                        with self.open_managed_db_connections(
                                instance, self.DEFAULT_DB_KEY):
                            self._make_metric_list_to_collect(
                                instance, self.custom_metrics)
                else:
                    # How much do we care that the DB doesn't exist?
                    ignore = is_affirmative(
                        instance.get("ignore_missing_database", False))
                    if ignore is not None and ignore:
                        # not much : we expect it. leave checks disabled
                        self.do_check[instance_key] = False
                        self.log.warning(
                            "Database %s does not exist. Disabling checks for this instance.",
                            context)
                    else:
                        # yes we do. Keep trying
                        self.log.error(
                            "Database %s does not exist. Fix issue and restart agent",
                            context)

            except SQLConnectionError:
                self.log.exception("Skipping SQL Server instance")
                continue
            except Exception as e:
                self.log.exception("Initialization exception %s", e)
                continue
Code example #12
    def __init__(self, name, init_config, instances):
        super(SQLServer, self).__init__(name, init_config, instances)

        self._resolved_hostname = None
        self._agent_hostname = None
        self.connection = None
        self.failed_connections = {}
        self.instance_metrics = []
        self.instance_per_type_metrics = defaultdict(set)
        self.do_check = True

        self.tags = self.instance.get("tags", [])
        self.reported_hostname = self.instance.get('reported_hostname')
        self.autodiscovery = is_affirmative(self.instance.get('database_autodiscovery'))
        self.autodiscovery_include = self.instance.get('autodiscovery_include', ['.*'])
        self.autodiscovery_exclude = self.instance.get('autodiscovery_exclude', [])
        self.autodiscovery_db_service_check = is_affirmative(self.instance.get('autodiscovery_db_service_check', True))
        self.min_collection_interval = self.instance.get('min_collection_interval', 15)
        self._compile_patterns()
        self.autodiscovery_interval = self.instance.get('autodiscovery_interval', DEFAULT_AUTODISCOVERY_INTERVAL)
        self.databases = set()
        self.ad_last_check = 0

        self.proc = self.instance.get('stored_procedure')
        self.proc_type_mapping = {'gauge': self.gauge, 'rate': self.rate, 'histogram': self.histogram}
        self.custom_metrics = init_config.get('custom_metrics', [])

        # DBM
        self.dbm_enabled = self.instance.get('dbm', False)
        self.statement_metrics_config = self.instance.get('query_metrics', {}) or {}
        self.statement_metrics = SqlserverStatementMetrics(self)
        self.activity_config = self.instance.get('query_activity', {}) or {}
        self.activity = SqlserverActivity(self)
        self.cloud_metadata = {}
        aws = self.instance.get('aws', {})
        gcp = self.instance.get('gcp', {})
        azure = self.instance.get('azure', {})
        if aws:
            self.cloud_metadata.update({'aws': aws})
        if gcp:
            self.cloud_metadata.update({'gcp': gcp})
        if azure:
            self.cloud_metadata.update({'azure': azure})
        obfuscator_options_config = self.instance.get('obfuscator_options', {}) or {}
        self.obfuscator_options = to_native_string(
            json.dumps(
                {
                    # Valid values for this can be found at
                    # https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/semantic_conventions/database.md#connection-level-attributes
                    'dbms': 'mssql',
                    'replace_digits': is_affirmative(
                        obfuscator_options_config.get(
                            'replace_digits',
                            obfuscator_options_config.get('quantize_sql_tables', False),
                        )
                    ),
                    'keep_sql_alias': is_affirmative(obfuscator_options_config.get('keep_sql_alias', True)),
                    'return_json_metadata': is_affirmative(obfuscator_options_config.get('collect_metadata', True)),
                    'table_names': is_affirmative(obfuscator_options_config.get('collect_tables', True)),
                    'collect_commands': is_affirmative(obfuscator_options_config.get('collect_commands', True)),
                    'collect_comments': is_affirmative(obfuscator_options_config.get('collect_comments', True)),
                }
            )
        )

        self.static_info_cache = TTLCache(
            maxsize=100,
            # cache these for a full day
            ttl=60 * 60 * 24,
        )

        # Query declarations
        check_queries = []
        if is_affirmative(self.instance.get('include_ao_metrics', False)):
            check_queries.extend(
                [
                    QUERY_AO_AVAILABILITY_GROUPS,
                    QUERY_AO_FAILOVER_CLUSTER,
                    QUERY_AO_FAILOVER_CLUSTER_MEMBER,
                ]
            )
        if is_affirmative(self.instance.get('include_fci_metrics', False)):
            check_queries.extend([QUERY_FAILOVER_CLUSTER_INSTANCE])
        self._check_queries = self._new_query_executor(check_queries)
        self.check_initializations.append(self._check_queries.compile_queries)

        self.server_state_queries = self._new_query_executor([QUERY_SERVER_STATIC_INFO])
        self.check_initializations.append(self.server_state_queries.compile_queries)

        # use QueryManager to process custom queries
        self._query_manager = QueryManager(
            self, self.execute_query_raw, tags=self.tags, hostname=self.resolved_hostname
        )

        self._dynamic_queries = None

        self.check_initializations.append(self.config_checks)
        self.check_initializations.append(self._query_manager.compile_queries)
        self.check_initializations.append(self.initialize_connection)
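The obfuscator options above are serialized to JSON once at init time. A sketch of that serialization with a simplified stand-in for `is_affirmative`; the defaults are copied from the snippet, the helper itself is an assumption:

import json

def is_affirmative(value):
    # simplified stand-in for datadog_checks.base.is_affirmative
    return str(value).strip().lower() in ('1', 'true', 'yes', 'on')

def build_obfuscator_options(cfg):
    # Mirrors the structure serialized above, keyed off the instance's obfuscator_options.
    return json.dumps({
        'dbms': 'mssql',
        'replace_digits': is_affirmative(cfg.get('replace_digits', cfg.get('quantize_sql_tables', False))),
        'keep_sql_alias': is_affirmative(cfg.get('keep_sql_alias', True)),
        'return_json_metadata': is_affirmative(cfg.get('collect_metadata', True)),
        'table_names': is_affirmative(cfg.get('collect_tables', True)),
        'collect_commands': is_affirmative(cfg.get('collect_commands', True)),
        'collect_comments': is_affirmative(cfg.get('collect_comments', True)),
    })

print(build_obfuscator_options({'replace_digits': 'yes'}))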
Code example #13
    def _make_metric_list_to_collect(self, custom_metrics):
        """
        Store the list of metrics to collect by instance_key.
        Will also create and cache cursors to query the db.
        """

        metrics_to_collect = []
        tags = self.instance.get('tags', [])

        # Load instance-level (previously Performance) metrics
        # If several check instances are querying the same server host, it can be wise to turn these off
        # to avoid sending duplicate metrics
        if is_affirmative(self.instance.get('include_instance_metrics', True)):
            for name, counter_name, instance_name in self.INSTANCE_METRICS:
                try:
                    sql_type, base_name = self.get_sql_type(counter_name)
                    cfg = {
                        'name': name,
                        'counter_name': counter_name,
                        'instance_name': instance_name,
                        'tags': tags,
                    }

                    metrics_to_collect.append(
                        self.typed_metric(cfg_inst=cfg,
                                          table=DEFAULT_PERFORMANCE_TABLE,
                                          base_name=base_name,
                                          sql_type=sql_type))
                except SQLConnectionError:
                    raise
                except Exception:
                    self.log.warning("Can't load the metric %s, ignoring",
                                     name,
                                     exc_info=True)
                    continue

        # Load database statistics
        for name, table, column in self.DATABASE_METRICS:
            # include database as a filter option
            db_name = self.instance.get('database',
                                        self.connection.DEFAULT_DATABASE)
            cfg = {
                'name': name,
                'table': table,
                'column': column,
                'instance_name': db_name,
                'tags': tags
            }
            metrics_to_collect.append(
                self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load metrics from scheduler and task tables, if enabled
        if is_affirmative(
                self.instance.get('include_task_scheduler_metrics', False)):
            for name, table, column in self.TASK_SCHEDULER_METRICS:
                cfg = {
                    'name': name,
                    'table': table,
                    'column': column,
                    'tags': tags
                }
                metrics_to_collect.append(
                    self.typed_metric(cfg_inst=cfg, table=table,
                                      column=column))

        # Load any custom metrics from conf.d/sqlserver.yaml
        for cfg in custom_metrics:
            sql_type = None
            base_name = None

            custom_tags = tags + cfg.get('tags', [])
            cfg['tags'] = custom_tags

            db_table = cfg.get('table', DEFAULT_PERFORMANCE_TABLE)
            if db_table not in VALID_TABLES:
                self.log.error('%s has an invalid table name: %s', cfg['name'],
                               db_table)
                continue

            if cfg.get('database', None) and cfg.get(
                    'database') != self.instance.get('database'):
                self.log.debug(
                    'Skipping custom metric %s for database %s, check instance configured for database %s',
                    cfg['name'],
                    cfg.get('database'),
                    self.instance.get('database'),
                )
                continue

            if db_table == DEFAULT_PERFORMANCE_TABLE:
                user_type = cfg.get('type')
                if user_type is not None and user_type not in VALID_METRIC_TYPES:
                    self.log.error('%s has an invalid metric type: %s',
                                   cfg['name'], user_type)
                sql_type = None
                try:
                    if user_type is None:
                        sql_type, base_name = self.get_sql_type(
                            cfg['counter_name'])
                except Exception:
                    self.log.warning("Can't load the metric %s, ignoring",
                                     cfg['name'],
                                     exc_info=True)
                    continue

                metrics_to_collect.append(
                    self.typed_metric(cfg_inst=cfg,
                                      table=db_table,
                                      base_name=base_name,
                                      user_type=user_type,
                                      sql_type=sql_type))

            else:
                for column in cfg['columns']:
                    metrics_to_collect.append(
                        self.typed_metric(cfg_inst=cfg,
                                          table=db_table,
                                          base_name=base_name,
                                          sql_type=sql_type,
                                          column=column))

        self.instance_metrics = metrics_to_collect
        self.log.debug("metrics to collect %s", metrics_to_collect)

        # create an organized grouping of metric names to their metric classes
        for m in metrics_to_collect:
            cls = m.__class__.__name__
            name = m.sql_name or m.column
            self.log.debug("Adding metric class %s named %s", cls, name)

            self.instance_per_type_metrics[cls].append(name)
            if m.base_name:
                self.instance_per_type_metrics[cls].append(m.base_name)