def __init__(self, name, init_config, instances):
    # type: (*Any, **Any) -> None
    super(GlusterfsCheck, self).__init__(name, init_config, instances)
    self._tags = self.instance.get('tags', [])

    # Check if the user set a custom gstatus path
    if init_config.get('gstatus_path'):
        self.gstatus_cmd = init_config.get('gstatus_path')
    else:
        if os.path.exists(GSTATUS_PATH):
            self.gstatus_cmd = GSTATUS_PATH
        else:
            raise ConfigurationError(
                'Glusterfs check requires `gstatus` to be installed, '
                'or `gstatus_path` to be set to the installed binary.'
            )
    self.log.debug("Using gstatus path `%s`", self.gstatus_cmd)
    self.use_sudo = is_affirmative(self.instance.get('use_sudo', True))
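
# Hedged sketch (not part of the check): how the resolved `gstatus_cmd` and the
# `use_sudo` flag would typically combine into a command line. The helper name
# `run_gstatus` and the way output is consumed are assumptions, not shipped code;
# `get_subprocess_output` is the standard subprocess helper from datadog_checks.base.
from datadog_checks.base.utils.subprocess_output import get_subprocess_output

def run_gstatus(gstatus_cmd, use_sudo, log):
    # gstatus usually needs root to talk to the gluster daemon, hence sudo
    cmd = ['sudo', gstatus_cmd] if use_sudo else [gstatus_cmd]
    stdout, _stderr, _retcode = get_subprocess_output(cmd, log)
    return stdout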
def check(self, instance):
    # Get the configuration for this specific instance
    scraper_config = self.get_scraper_config(instance)

    # Set up metric_transformers
    transformers = {}
    for metric_from, metric_to in TRANSFORM_VALUE_HISTOGRAMS.items():
        transformers[metric_from] = self._histogram_from_microseconds_to_seconds(metric_to)
    for metric_from, metric_to in TRANSFORM_VALUE_SUMMARIES.items():
        transformers[metric_from] = self._summary_from_microseconds_to_seconds(metric_to)

    self.process(scraper_config, metric_transformers=transformers)

    # Check the leader-election status
    if is_affirmative(instance.get('leader_election', True)):
        leader_config = self.LEADER_ELECTION_CONFIG
        leader_config["tags"] = instance.get("tags", [])
        leader_config["record_kind"] = instance.get('leader_election_kind', 'auto')
        self.check_election_status(leader_config)

    self._perform_service_check(instance)
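
# Illustrative sketch (not the shipped transformer, which lives in
# OpenMetricsBaseCheck): the TRANSFORM_VALUE_* tables list metrics whose
# histogram/summary values are expressed in microseconds; the transformers
# re-emit them in seconds, i.e. divide by 1e6.
def microseconds_to_seconds(value_us):
    # e.g. a bucket bound le="500000" (microseconds) becomes le="0.5" (seconds)
    return value_us / 1e6

assert microseconds_to_seconds(500000) == 0.5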
def __init__(self, name, init_config, instances):
    super(SQLServer, self).__init__(name, init_config, instances)

    self.connection = None
    self.failed_connections = {}
    self.instance_metrics = []
    self.instance_per_type_metrics = defaultdict(list)
    self.do_check = True

    self.autodiscovery = is_affirmative(self.instance.get('database_autodiscovery'))
    if self.autodiscovery and self.instance.get('database'):
        self.log.warning(
            'sqlserver `database_autodiscovery` and `database` options defined in same instance - '
            'autodiscovery will take precedence.'
        )
    self.autodiscovery_include = self.instance.get('autodiscovery_include', ['.*'])
    self.autodiscovery_exclude = self.instance.get('autodiscovery_exclude', [])
    self._compile_patterns()
    self.autodiscovery_interval = self.instance.get('autodiscovery_interval', DEFAULT_AUTODISCOVERY_INTERVAL)
    self.databases = set()
    self.ad_last_check = 0

    self.proc = self.instance.get('stored_procedure')
    self.proc_type_mapping = {'gauge': self.gauge, 'rate': self.rate, 'histogram': self.histogram}
    self.custom_metrics = init_config.get('custom_metrics', [])

    # use QueryManager to process custom queries
    self._query_manager = QueryManager(self, self.execute_query_raw, queries=[], tags=self.instance.get("tags", []))

    self.check_initializations.append(self._query_manager.compile_queries)
    self.check_initializations.append(self.initialize_connection)
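
# Hedged example of the structure QueryManager compiles: each query maps result
# columns positionally onto metrics. The SQL text and metric name below are
# illustrative, not defaults shipped with the check.
EXAMPLE_CUSTOM_QUERY = {
    'name': 'user_sessions',
    'query': 'SELECT COUNT(*) FROM sys.dm_exec_sessions WHERE is_user_process = 1',
    'columns': [{'name': 'sqlserver.sessions.user_count', 'type': 'gauge'}],
    'tags': ['source:custom'],
}
# Passing `queries=[EXAMPLE_CUSTOM_QUERY]` instead of `queries=[]` above would
# run it through `self.execute_query_raw` on every check run.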
def _process_mor_objects_queue(self, instance):
    """
    Pops `batch_morlist_size` items from the mor objects queue and asynchronously runs
    `_process_mor_objects_queue_async` to fill the Mor cache.
    """
    i_key = self._instance_key(instance)
    self.mor_cache.init_instance(i_key)

    if not self.mor_objects_queue.contains(i_key):
        self.log.debug("Objects queue is not initialized yet for instance %s, skipping processing", i_key)
        return

    for resource_type in RESOURCE_TYPE_METRICS:
        # Batch size can prevent querying large payloads at once if the environment is too large.
        # If batch size is set to 0, process everything at once.
        batch_size = self.batch_morlist_size or self.mor_objects_queue.size(i_key, resource_type)
        while self.mor_objects_queue.size(i_key, resource_type):
            mors = []
            for _ in range(batch_size):
                mor = self.mor_objects_queue.pop(i_key, resource_type)
                if mor is None:
                    self.log.debug("No more objects of type '%s' left in the queue", ensure_unicode(resource_type))
                    break

                mor_name = str(mor['mor'])
                mor['interval'] = REAL_TIME_INTERVAL if mor['mor_type'] in REALTIME_RESOURCES else None
                # Always update the cache to account for Mors that might have changed parent
                # in the meantime (e.g. a migrated VM).
                self.mor_cache.set_mor(i_key, mor_name, mor)

                # Only do this for non real-time resources, i.e. datacenter, datastore and cluster.
                # For hosts and VMs, we can rely on a precomputed list of metrics.
                realtime_only = is_affirmative(instance.get("collect_realtime_only", True))
                if mor["mor_type"] not in REALTIME_RESOURCES and not realtime_only:
                    mors.append(mor)

            # We only actually schedule jobs for non-realtime resources.
            if mors:
                self.pool.apply_async(self._process_mor_objects_queue_async, args=(instance, mors))
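
# Minimal sketch of the batching rule above: a `batch_morlist_size` of 0
# degenerates to a single batch holding the entire queue for that resource type.
def effective_batch_size(configured_size, queue_size):
    return configured_size or queue_size

assert effective_batch_size(50, 1000) == 50    # bounded batches
assert effective_batch_size(0, 1000) == 1000   # everything at once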
def initialize_connection(self):
    self.connection = Connection(self.init_config, self.instance, self.handle_service_check)

    # Pre-process the list of metrics to collect
    try:
        # check to see if the database exists before we try any connections to it
        db_exists, context = self.connection.check_database()
        if db_exists:
            if self.instance.get('stored_procedure') is None:
                with self.connection.open_managed_default_connection():
                    with self.connection.get_managed_cursor() as cursor:
                        self.autodiscover_databases(cursor)
                    self._make_metric_list_to_collect(self.custom_metrics)
        else:
            # How much do we care that the DB doesn't exist?
            ignore = is_affirmative(self.instance.get("ignore_missing_database", False))
            if ignore:
                # Not much: we expect it; leave checks disabled
                self.do_check = False
                self.log.warning("Database %s does not exist. Disabling checks for this instance.", context)
            else:
                # Yes we do. Keep trying
                msg = "Database {} does not exist. Please resolve invalid database and restart agent".format(context)
                raise ConfigurationError(msg)
    except SQLConnectionError as e:
        self.log.exception("Error connecting to database: %s", e)
    except ConfigurationError:
        raise
    except Exception as e:
        self.log.exception("Initialization exception %s", e)
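
# Sketch of the `ignore_missing_database` decision above, with a hypothetical
# instance config; is_affirmative() accepts booleans and truthy strings alike.
from datadog_checks.base import is_affirmative

instance_cfg = {'host': 'db1', 'database': 'reporting', 'ignore_missing_database': 'true'}
ignore_missing = is_affirmative(instance_cfg.get('ignore_missing_database', False))
# missing DB + ignore    -> do_check = False, warning logged, agent keeps running
# missing DB + no ignore -> ConfigurationError, surfaced at agent startup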
def _create_kubernetes_state_prometheus_instance(self, instance):
    """
    Set up the kubernetes_state instance so it can be used in OpenMetricsBaseCheck
    """
    ksm_instance = deepcopy(instance)
    endpoint = instance.get('kube_state_url')
    if endpoint is None:
        raise CheckException("Unable to find kube_state_url in config file.")

    extra_labels = ksm_instance.get('label_joins', {})
    hostname_override = is_affirmative(ksm_instance.get('hostname_override', True))
    join_kube_labels = is_affirmative(ksm_instance.get('join_kube_labels', False))
    join_standard_tags = is_affirmative(ksm_instance.get('join_standard_tags', False))

    ksm_instance.update(
        {
            'namespace': 'kubernetes_state',
            'metrics': [
                {
                    'kube_daemonset_status_current_number_scheduled': 'daemonset.scheduled',
                    'kube_daemonset_status_desired_number_scheduled': 'daemonset.desired',
                    'kube_daemonset_status_number_misscheduled': 'daemonset.misscheduled',
                    'kube_daemonset_status_number_ready': 'daemonset.ready',
                    'kube_daemonset_updated_number_scheduled': 'daemonset.updated',
                    'kube_deployment_spec_paused': 'deployment.paused',
                    'kube_deployment_spec_replicas': 'deployment.replicas_desired',
                    'kube_deployment_spec_strategy_rollingupdate_max_unavailable': 'deployment.rollingupdate.max_unavailable',  # noqa: E501
                    'kube_deployment_status_replicas': 'deployment.replicas',
                    'kube_deployment_status_replicas_available': 'deployment.replicas_available',
                    'kube_deployment_status_replicas_unavailable': 'deployment.replicas_unavailable',
                    'kube_deployment_status_replicas_updated': 'deployment.replicas_updated',
                    'kube_endpoint_address_available': 'endpoint.address_available',
                    'kube_endpoint_address_not_ready': 'endpoint.address_not_ready',
                    'kube_endpoint_created': 'endpoint.created',
                    'kube_hpa_spec_min_replicas': 'hpa.min_replicas',
                    'kube_hpa_spec_max_replicas': 'hpa.max_replicas',
                    'kube_hpa_status_desired_replicas': 'hpa.desired_replicas',
                    'kube_hpa_status_current_replicas': 'hpa.current_replicas',
                    'kube_hpa_status_condition': 'hpa.condition',
                    'kube_node_info': 'node.count',
                    'kube_node_status_allocatable_cpu_cores': 'node.cpu_allocatable',
                    'kube_node_status_allocatable_memory_bytes': 'node.memory_allocatable',
                    'kube_node_status_allocatable_pods': 'node.pods_allocatable',
                    'kube_node_status_capacity_cpu_cores': 'node.cpu_capacity',
                    'kube_node_status_capacity_memory_bytes': 'node.memory_capacity',
                    'kube_node_status_capacity_pods': 'node.pods_capacity',
                    'kube_node_status_allocatable_nvidia_gpu_cards': 'node.gpu.cards_allocatable',
                    'kube_node_status_capacity_nvidia_gpu_cards': 'node.gpu.cards_capacity',
                    'kube_pod_container_status_terminated': 'container.terminated',
                    'kube_pod_container_status_waiting': 'container.waiting',
                    'kube_persistentvolumeclaim_status_phase': 'persistentvolumeclaim.status',
                    'kube_persistentvolumeclaim_resource_requests_storage_bytes': 'persistentvolumeclaim.request_storage',  # noqa: E501
                    'kube_pod_container_resource_limits_cpu_cores': 'container.cpu_limit',
                    'kube_pod_container_resource_limits_memory_bytes': 'container.memory_limit',
                    'kube_pod_container_resource_requests_cpu_cores': 'container.cpu_requested',
                    'kube_pod_container_resource_requests_memory_bytes': 'container.memory_requested',
                    'kube_pod_container_status_ready': 'container.ready',
                    'kube_pod_container_status_restarts': 'container.restarts',  # up to kube-state-metrics 1.1.x
                    'kube_pod_container_status_restarts_total': 'container.restarts',  # noqa: E501, from kube-state-metrics 1.2.0
                    'kube_pod_container_status_running': 'container.running',
                    'kube_pod_container_resource_requests_nvidia_gpu_devices': 'container.gpu.request',
                    'kube_pod_container_resource_limits_nvidia_gpu_devices': 'container.gpu.limit',
                    'kube_pod_status_ready': 'pod.ready',
                    'kube_pod_status_scheduled': 'pod.scheduled',
                    'kube_pod_status_unschedulable': 'pod.unschedulable',
                    'kube_poddisruptionbudget_status_current_healthy': 'pdb.pods_healthy',
                    'kube_poddisruptionbudget_status_desired_healthy': 'pdb.pods_desired',
                    'kube_poddisruptionbudget_status_pod_disruptions_allowed': 'pdb.disruptions_allowed',
                    'kube_poddisruptionbudget_status_expected_pods': 'pdb.pods_total',
                    'kube_replicaset_spec_replicas': 'replicaset.replicas_desired',
                    'kube_replicaset_status_fully_labeled_replicas': 'replicaset.fully_labeled_replicas',
                    'kube_replicaset_status_ready_replicas': 'replicaset.replicas_ready',
                    'kube_replicaset_status_replicas': 'replicaset.replicas',
                    'kube_replicationcontroller_spec_replicas': 'replicationcontroller.replicas_desired',
                    'kube_replicationcontroller_status_available_replicas': 'replicationcontroller.replicas_available',  # noqa: E501
                    'kube_replicationcontroller_status_fully_labeled_replicas': 'replicationcontroller.fully_labeled_replicas',  # noqa: E501
                    'kube_replicationcontroller_status_ready_replicas': 'replicationcontroller.replicas_ready',
                    'kube_replicationcontroller_status_replicas': 'replicationcontroller.replicas',
                    'kube_statefulset_replicas': 'statefulset.replicas_desired',
                    'kube_statefulset_status_replicas': 'statefulset.replicas',
                    'kube_statefulset_status_replicas_current': 'statefulset.replicas_current',
                    'kube_statefulset_status_replicas_ready': 'statefulset.replicas_ready',
                    'kube_statefulset_status_replicas_updated': 'statefulset.replicas_updated',
                    'kube_verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound': (
                        'vpa.lower_bound'
                    ),
                    'kube_verticalpodautoscaler_status_recommendation_containerrecommendations_target': (
                        'vpa.target'
                    ),
                    'kube_verticalpodautoscaler_status_recommendation_containerrecommendations_uncappedtarget': (
                        'vpa.uncapped_target'
                    ),
                    'kube_verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound': (
                        'vpa.upperbound'
                    ),
                    'kube_verticalpodautoscaler_spec_updatepolicy_updatemode': 'vpa.update_mode',
                }
            ],
            'ignore_metrics': [
                # _info, _labels and _created don't convey any metric
                'kube_cronjob_info',
                'kube_cronjob_created',
                'kube_daemonset_created',
                'kube_deployment_created',
                'kube_deployment_labels',
                'kube_job_created',
                'kube_job_info',
                'kube_limitrange_created',
                'kube_namespace_created',
                'kube_namespace_labels',
                'kube_node_created',
                'kube_node_labels',
                'kube_pod_created',
                'kube_pod_container_info',
                'kube_pod_info',
                'kube_pod_owner',
                'kube_pod_start_time',
                'kube_pod_labels',
                'kube_poddisruptionbudget_created',
                'kube_replicaset_created',
                'kube_replicationcontroller_created',
                'kube_resourcequota_created',
                'kube_replicaset_owner',
                'kube_service_created',
                'kube_service_info',
                'kube_service_labels',
                'kube_service_spec_external_ip',
                'kube_service_status_load_balancer_ingress',
                'kube_statefulset_labels',
                'kube_statefulset_created',
                'kube_statefulset_status_current_revision',
                'kube_statefulset_status_update_revision',
                # Already provided by the kubelet integration
                'kube_pod_container_status_last_terminated_reason',
                # _generation metrics are more metadata than metrics, no real use case for now
                'kube_daemonset_metadata_generation',
                'kube_deployment_metadata_generation',
                'kube_deployment_status_observed_generation',
                'kube_replicaset_metadata_generation',
                'kube_replicaset_status_observed_generation',
                'kube_replicationcontroller_metadata_generation',
                'kube_replicationcontroller_status_observed_generation',
                'kube_statefulset_metadata_generation',
                'kube_statefulset_status_observed_generation',
                'kube_hpa_metadata_generation',
                # kube_node_status_phase and kube_namespace_status_phase have no use case as a service check
                'kube_namespace_status_phase',
                'kube_node_status_phase',
                # These CronJob and Job metrics need use cases to determine how to implement them
                'kube_cronjob_status_active',
                'kube_cronjob_status_last_schedule_time',
                'kube_cronjob_spec_suspend',
                'kube_cronjob_spec_starting_deadline_seconds',
                'kube_job_spec_active_dealine_seconds',
                'kube_job_spec_completions',
                'kube_job_spec_parallelism',
                'kube_job_status_active',
                'kube_job_status_completion_time',  # We could compute the duration=completion-start as a gauge
                'kube_job_status_start_time',
                'kube_verticalpodautoscaler_labels',
            ],
            'label_joins': {
                'kube_pod_info': {'labels_to_match': ['pod', 'namespace'], 'labels_to_get': ['node']},
                'kube_pod_status_phase': {'labels_to_match': ['pod', 'namespace'], 'labels_to_get': ['phase']},
                'kube_persistentvolume_info': {
                    'labels_to_match': ['persistentvolume'],  # Persistent Volumes are not namespaced
                    'labels_to_get': ['storageclass'],
                },
                'kube_persistentvolumeclaim_info': {
                    'labels_to_match': ['persistentvolumeclaim', 'namespace'],
                    'labels_to_get': ['storageclass'],
                },
            },
            # Defaults that were set when kubernetes_state was based on PrometheusCheck
            'send_monotonic_counter': ksm_instance.get('send_monotonic_counter', False),
            'health_service_check': ksm_instance.get('health_service_check', False),
        }
    )

    experimental_metrics_mapping = {
        'kube_hpa_spec_target_metric': 'hpa.spec_target_metric',
        'kube_verticalpodautoscaler_spec_resourcepolicy_container_policies_minallowed': (
            'vpa.spec_container_minallowed'
        ),
        'kube_verticalpodautoscaler_spec_resourcepolicy_container_policies_maxallowed': (
            'vpa.spec_container_maxallowed'
        ),
    }
    experimental_metrics = is_affirmative(ksm_instance.get('experimental_metrics', False))
    if experimental_metrics:
        ksm_instance['metrics'].append(experimental_metrics_mapping)
    else:
        ksm_instance['ignore_metrics'].extend(experimental_metrics_mapping.keys())

    ksm_instance['prometheus_url'] = endpoint

    if join_kube_labels:
        ksm_instance['label_joins'].update(
            {
                'kube_pod_labels': {'labels_to_match': ['pod', 'namespace'], 'labels_to_get': ['*']},
                'kube_deployment_labels': {'labels_to_match': ['deployment', 'namespace'], 'labels_to_get': ['*']},
                'kube_daemonset_labels': {'labels_to_match': ['daemonset', 'namespace'], 'labels_to_get': ['*']},
            }
        )

    labels_to_get = [
        "label_tags_datadoghq_com_env",
        "label_tags_datadoghq_com_service",
        "label_tags_datadoghq_com_version",
    ]
    if join_standard_tags:
        ksm_instance['label_joins'].update(
            {
                "kube_pod_labels": {"labels_to_match": ["pod", "namespace"], "labels_to_get": labels_to_get},
                "kube_deployment_labels": {
                    "labels_to_match": ["deployment", "namespace"],
                    "labels_to_get": labels_to_get,
                },
                "kube_replicaset_labels": {
                    "labels_to_match": ["replicaset", "namespace"],
                    "labels_to_get": labels_to_get,
                },
                "kube_daemonset_labels": {
                    "labels_to_match": ["daemonset", "namespace"],
                    "labels_to_get": labels_to_get,
                },
                "kube_statefulset_labels": {
                    "labels_to_match": ["statefulset", "namespace"],
                    "labels_to_get": labels_to_get,
                },
                "kube_job_labels": {"labels_to_match": ["job_name", "namespace"], "labels_to_get": labels_to_get},
            }
        )
        ksm_instance.setdefault("labels_mapper", {}).update(
            {
                "label_tags_datadoghq_com_env": "env",
                "label_tags_datadoghq_com_service": "service",
                "label_tags_datadoghq_com_version": "version",
            }
        )

    ksm_instance['label_joins'].update(extra_labels)

    if hostname_override:
        ksm_instance['label_to_hostname'] = 'node'
        clustername = get_clustername()
        if clustername != "":
            ksm_instance['label_to_hostname_suffix'] = "-" + clustername

    if 'labels_mapper' in ksm_instance and not isinstance(ksm_instance['labels_mapper'], dict):
        self.log.warning("Option labels_mapper should be a dictionary for %s", endpoint)

    return ksm_instance
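
# Rough illustration of the label-join mechanics configured above (the real
# implementation lives in OpenMetricsBaseCheck): samples whose labels match on
# `labels_to_match` get the `labels_to_get` labels copied onto them. The data
# below is a hypothetical slice of kube_pod_info.
pod_info_labels = {('web-1', 'default'): {'node': 'node-a'}}

def join_node_label(sample_labels):
    merged = dict(sample_labels)
    merged.update(pod_info_labels.get((sample_labels['pod'], sample_labels['namespace']), {}))
    return merged

assert join_node_label({'pod': 'web-1', 'namespace': 'default'})['node'] == 'node-a'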
def collect_metrics(self, instance):
    """
    Asynchronously calls _collect_metrics_async on all MORs; the Aggregator
    receives the metrics as the job queue is processed.
    """
    i_key = self._instance_key(instance)
    if not self.mor_cache.contains(i_key):
        self.log.debug("Not collecting metrics for instance '%s', nothing to do yet.", i_key)
        return

    server_instance = self._get_server_instance(instance)
    max_historical_metrics = DEFAULT_MAX_HIST_METRICS

    if self._should_collect_historical(instance):
        try:
            if 'max_query_metrics' in instance:
                max_historical_metrics = int(instance['max_query_metrics'])
                self.log.info("Collecting up to %d metrics", max_historical_metrics)
            else:
                vcenter_settings = server_instance.content.setting.QueryOptions("config.vpxd.stats.maxQueryMetrics")
                max_historical_metrics = int(vcenter_settings[0].value)
            if max_historical_metrics < 0:
                max_historical_metrics = float('inf')
        except Exception:
            pass

    # TODO: Remove me once the fix for `max_query_metrics` is here by default
    mors_batch_method = (
        self.mor_cache.mors_batch
        if is_affirmative(instance.get('fix_max_query_metrics'))
        else self.mor_cache.legacy_mors_batch
    )

    vm_count = 0
    custom_tags = instance.get('tags', [])
    tags = ["vcenter_server:{}".format(ensure_unicode(instance.get('name')))] + custom_tags

    n_mors = self.mor_cache.instance_size(i_key)
    if not n_mors:
        if self._is_main_instance(instance):
            self.gauge('vsphere.vm.count', vm_count, tags=tags)
        self.log.debug("No Mor objects to process for instance '%s', skip...", i_key)
        return

    self.log.debug("Collecting metrics for %s mors", ensure_unicode(n_mors))

    # Request metrics for several objects at once. We can limit the number of objects with batch_size.
    # If batch_size is 0, process everything at once.
    batch_size = self.batch_morlist_size or n_mors
    for batch in mors_batch_method(i_key, batch_size, max_historical_metrics):
        query_specs = []
        for mor in itervalues(batch):
            if mor['mor_type'] == 'vm':
                vm_count += 1
            if mor['mor_type'] not in REALTIME_RESOURCES and ('metrics' not in mor or not mor['metrics']):
                continue

            query_spec = vim.PerformanceManager.QuerySpec()
            query_spec.entity = mor["mor"]
            query_spec.intervalId = mor.get("interval")
            query_spec.maxSample = 1
            if mor['mor_type'] in REALTIME_RESOURCES:
                query_spec.metricId = self.metadata_cache.get_metric_ids(i_key)
            else:
                query_spec.metricId = mor["metrics"]
            query_specs.append(query_spec)

        if query_specs:
            self.pool.apply_async(self._collect_metrics_async, args=(instance, query_specs))

    if self._is_main_instance(instance):
        self.gauge('vsphere.vm.count', vm_count, tags=tags)
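
# The vCenter option `config.vpxd.stats.maxQueryMetrics` uses -1 to mean
# "unlimited"; the try-block above mirrors that by mapping any negative value
# to infinity. A standalone sketch of the same rule:
def normalize_max_query_metrics(raw_value):
    value = int(raw_value)
    return float('inf') if value < 0 else value

assert normalize_max_query_metrics('-1') == float('inf')
assert normalize_max_query_metrics('256') == 256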
@staticmethod
def _is_main_instance(instance):
    """The 'main' instance is the one reporting events, service_checks, external host tags
    and realtime metrics.
    Note: the main instance can also report `historical` metrics for legacy reasons.
    """
    return not is_affirmative(instance.get('collect_historical_only', False))
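
# Usage sketch: splitting realtime and historical collection across two
# instances of the same vCenter (hypothetical config dicts). Only the first is
# "main", so events and vsphere.vm.count are reported exactly once.
main_instance = {'name': 'vc1', 'collect_historical_only': False}
historical_instance = {'name': 'vc1-historical', 'collect_historical_only': True}
# _is_main_instance(main_instance)       -> True
# _is_main_instance(historical_instance) -> False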
def __init__(self, name, init_config, instances):
    # We do not support more than one instance of kube-state-metrics
    instance = instances[0]
    kubernetes_state_instance = self._create_kubernetes_state_prometheus_instance(instance)

    # First deprecation phase: we keep ksm labels by default
    # Next iteration: remove ksm labels by default
    # Last iteration: remove this option
    self.keep_ksm_labels = is_affirmative(kubernetes_state_instance.get('keep_ksm_labels', True))

    generic_instances = [kubernetes_state_instance]
    super(KubernetesState, self).__init__(name, init_config, instances=generic_instances)

    self.condition_to_status_positive = {'true': self.OK, 'false': self.CRITICAL, 'unknown': self.UNKNOWN}
    self.condition_to_status_negative = {'true': self.CRITICAL, 'false': self.OK, 'unknown': self.UNKNOWN}

    # Parameters for the count_objects_by_tags method
    self.object_count_params = {
        'kube_persistentvolume_status_phase': {
            'metric_name': 'persistentvolumes.by_phase',
            'allowed_labels': ['storageclass', 'phase'],
        },
        'kube_service_spec_type': {'metric_name': 'service.count', 'allowed_labels': ['namespace', 'type']},
    }

    self.METRIC_TRANSFORMERS = {
        'kube_pod_status_phase': self.kube_pod_status_phase,
        'kube_pod_container_status_waiting_reason': self.kube_pod_container_status_waiting_reason,
        'kube_pod_container_status_terminated_reason': self.kube_pod_container_status_terminated_reason,
        'kube_cronjob_next_schedule_time': self.kube_cronjob_next_schedule_time,
        'kube_job_complete': self.kube_job_complete,
        'kube_job_failed': self.kube_job_failed,
        'kube_job_status_failed': self.kube_job_status_failed,
        'kube_job_status_succeeded': self.kube_job_status_succeeded,
        'kube_node_status_condition': self.kube_node_status_condition,
        'kube_node_status_ready': self.kube_node_status_ready,
        'kube_node_status_out_of_disk': self.kube_node_status_out_of_disk,
        'kube_node_status_memory_pressure': self.kube_node_status_memory_pressure,
        'kube_node_status_disk_pressure': self.kube_node_status_disk_pressure,
        'kube_node_status_network_unavailable': self.kube_node_status_network_unavailable,
        'kube_node_spec_unschedulable': self.kube_node_spec_unschedulable,
        'kube_resourcequota': self.kube_resourcequota,
        'kube_limitrange': self.kube_limitrange,
        'kube_persistentvolume_status_phase': self.count_objects_by_tags,
        'kube_service_spec_type': self.count_objects_by_tags,
    }

    # Handling cron jobs succeeded/failed counts
    self.failed_cron_job_counts = defaultdict(KubernetesState.CronJobCount)
    self.succeeded_cron_job_counts = defaultdict(KubernetesState.CronJobCount)

    # Logic for Jobs
    self.job_succeeded_count = defaultdict(int)
    self.job_failed_count = defaultdict(int)
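
# Sketch of how the condition tables above are used: a condition whose healthy
# value is "true" (e.g. Ready) goes through the positive mapping, while one
# whose healthy value is "false" (e.g. MemoryPressure) goes through the
# negative one. String statuses stand in for the self.OK/CRITICAL constants.
condition_to_status_positive = {'true': 'OK', 'false': 'CRITICAL', 'unknown': 'UNKNOWN'}
condition_to_status_negative = {'true': 'CRITICAL', 'false': 'OK', 'unknown': 'UNKNOWN'}
assert condition_to_status_positive['false'] == 'CRITICAL'  # Ready=false is bad
assert condition_to_status_negative['false'] == 'OK'        # MemoryPressure=false is good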
def _make_metric_list_to_collect(self, custom_metrics):
    """
    Store the list of metrics to collect by instance_key.
    Will also create and cache cursors to query the db.
    """
    metrics_to_collect = []
    tags = self.instance.get('tags', [])

    # Load instance-level (previously Performance) metrics.
    # If several check instances are querying the same server host, it can be wise to turn these off
    # to avoid sending duplicate metrics.
    if is_affirmative(self.instance.get('include_instance_metrics', True)):
        self._add_performance_counters(
            chain(INSTANCE_METRICS, INSTANCE_METRICS_TOTAL), metrics_to_collect, tags, db=None
        )

    # populated through autodiscovery
    if self.databases:
        for db in self.databases:
            self._add_performance_counters(INSTANCE_METRICS_TOTAL, metrics_to_collect, tags, db=db)

    # Load database statistics
    for name, table, column in DATABASE_METRICS:
        # include database as a filter option
        db_names = self.databases or [self.instance.get('database', self.connection.DEFAULT_DATABASE)]
        for db_name in db_names:
            cfg = {'name': name, 'table': table, 'column': column, 'instance_name': db_name, 'tags': tags}
            metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

    # Load AlwaysOn metrics
    if is_affirmative(self.instance.get('include_ao_metrics', False)):
        for name, table, column in AO_METRICS + AO_METRICS_PRIMARY + AO_METRICS_SECONDARY:
            db_name = 'master'
            cfg = {
                'name': name,
                'table': table,
                'column': column,
                'instance_name': db_name,
                'tags': tags,
                'ao_database': self.instance.get('ao_database', None),
                'availability_group': self.instance.get('availability_group', None),
                'only_emit_local': is_affirmative(self.instance.get('only_emit_local', False)),
            }
            metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

    # Load FCI metrics
    if is_affirmative(self.instance.get('include_fci_metrics', False)):
        for name, table, column in FCI_METRICS:
            cfg = {'name': name, 'table': table, 'column': column, 'tags': tags}
            metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

    # Load metrics from scheduler and task tables, if enabled
    if is_affirmative(self.instance.get('include_task_scheduler_metrics', False)):
        for name, table, column in TASK_SCHEDULER_METRICS:
            cfg = {'name': name, 'table': table, 'column': column, 'tags': tags}
            metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

    # Load DB Fragmentation metrics
    if is_affirmative(self.instance.get('include_db_fragmentation_metrics', False)):
        db_fragmentation_object_names = self.instance.get('db_fragmentation_object_names', [])
        db_names = self.databases or [self.instance.get('database', self.connection.DEFAULT_DATABASE)]
        if not db_fragmentation_object_names:
            self.log.debug(
                "No fragmentation object names specified, will return fragmentation metrics for all "
                "object_ids of current database(s): %s",
                db_names,
            )
        for db_name in db_names:
            for name, table, column in DATABASE_FRAGMENTATION_METRICS:
                cfg = {
                    'name': name,
                    'table': table,
                    'column': column,
                    'instance_name': db_name,
                    'tags': tags,
                    'db_fragmentation_object_names': db_fragmentation_object_names,
                }
                metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

    # Load any custom metrics from conf.d/sqlserver.yaml
    for cfg in custom_metrics:
        sql_type = None
        base_name = None

        custom_tags = tags + cfg.get('tags', [])
        cfg['tags'] = custom_tags

        db_table = cfg.get('table', DEFAULT_PERFORMANCE_TABLE)
        if db_table not in VALID_TABLES:
            self.log.error('%s has an invalid table name: %s', cfg['name'], db_table)
            continue

        if cfg.get('database', None) and cfg.get('database') != self.instance.get('database'):
            self.log.debug(
                'Skipping custom metric %s for database %s, check instance configured for database %s',
                cfg['name'],
                cfg.get('database'),
                self.instance.get('database'),
            )
            continue

        if db_table == DEFAULT_PERFORMANCE_TABLE:
            user_type = cfg.get('type')
            if user_type is not None and user_type not in VALID_METRIC_TYPES:
                self.log.error('%s has an invalid metric type: %s', cfg['name'], user_type)
                sql_type = None
            try:
                if user_type is None:
                    sql_type, base_name = self.get_sql_type(cfg['counter_name'])
            except Exception:
                self.log.warning("Can't load the metric %s, ignoring", cfg['name'], exc_info=True)
                continue

            metrics_to_collect.append(
                self.typed_metric(
                    cfg_inst=cfg, table=db_table, base_name=base_name, user_type=user_type, sql_type=sql_type
                )
            )
        else:
            for column in cfg['columns']:
                metrics_to_collect.append(
                    self.typed_metric(
                        cfg_inst=cfg, table=db_table, base_name=base_name, sql_type=sql_type, column=column
                    )
                )

    self.instance_metrics = metrics_to_collect
    self.log.debug("metrics to collect %s", metrics_to_collect)

    # create an organized grouping of metric names to their metric classes
    for m in metrics_to_collect:
        cls = m.__class__.__name__
        name = m.sql_name or m.column
        self.log.debug("Adding metric class %s named %s", cls, name)
        self.instance_per_type_metrics[cls].append(name)
        if m.base_name:
            self.instance_per_type_metrics[cls].append(m.base_name)
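
# Hedged example of a conf.d/sqlserver.yaml custom metric, expressed as the
# dict this loop consumes (the names and tag are illustrative):
example_custom_metric = {
    'name': 'sqlserver.custom.user_connections',
    'counter_name': 'User Connections',  # resolved via get_sql_type()
    'type': 'gauge',                     # must be one of VALID_METRIC_TYPES
    'tags': ['team:db'],
    # 'table' is omitted, so it defaults to DEFAULT_PERFORMANCE_TABLE;
    # non-default tables instead require a 'columns' list.
}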
def __init__(self, name, init_config, instances):
    super(SQLServer, self).__init__(name, init_config, instances)

    # Cache connections
    self.connections = {}
    self.failed_connections = {}
    self.instances_metrics = {}
    self.instances_per_type_metrics = defaultdict(dict)
    self.existing_databases = None
    self.do_check = {}
    self.proc_type_mapping = {'gauge': self.gauge, 'rate': self.rate, 'histogram': self.histogram}
    self.adoprovider = self.default_adoprovider

    self.connector = init_config.get('connector', 'adodbapi')
    if self.connector.lower() not in self.valid_connectors:
        self.log.error("Invalid database connector %s, defaulting to adodbapi", self.connector)
        self.connector = 'adodbapi'

    self.adoprovider = init_config.get('adoprovider', self.default_adoprovider)
    if self.adoprovider.upper() not in self.valid_adoproviders:
        self.log.error(
            "Invalid ADODB provider string %s, defaulting to %s", self.adoprovider, self.default_adoprovider
        )
        self.adoprovider = self.default_adoprovider

    # Pre-process the list of metrics to collect
    self.custom_metrics = init_config.get('custom_metrics', [])
    for instance in instances:
        try:
            instance_key = self._conn_key(instance, self.DEFAULT_DB_KEY)
            self.do_check[instance_key] = True

            # check to see if the database exists before we try any connections to it
            with self.open_managed_db_connections(instance, None, db_name=self.DEFAULT_DATABASE):
                db_exists, context = self._check_db_exists(instance)

            if db_exists:
                if instance.get('stored_procedure') is None:
                    with self.open_managed_db_connections(instance, self.DEFAULT_DB_KEY):
                        self._make_metric_list_to_collect(instance, self.custom_metrics)
            else:
                # How much do we care that the DB doesn't exist?
                ignore = is_affirmative(instance.get("ignore_missing_database", False))
                if ignore:
                    # Not much: we expect it; leave checks disabled
                    self.do_check[instance_key] = False
                    self.log.warning("Database %s does not exist. Disabling checks for this instance.", context)
                else:
                    # Yes we do. Keep trying
                    self.log.error("Database %s does not exist. Fix issue and restart agent", context)
        except SQLConnectionError:
            self.log.exception("Skipping SQL Server instance")
            continue
        except Exception as e:
            self.log.exception("Initialization exception %s", e)
            continue
def __init__(self, name, init_config, instances):
    super(SQLServer, self).__init__(name, init_config, instances)

    self._resolved_hostname = None
    self._agent_hostname = None
    self.connection = None
    self.failed_connections = {}
    self.instance_metrics = []
    self.instance_per_type_metrics = defaultdict(set)
    self.do_check = True

    self.tags = self.instance.get("tags", [])
    self.reported_hostname = self.instance.get('reported_hostname')
    self.autodiscovery = is_affirmative(self.instance.get('database_autodiscovery'))
    self.autodiscovery_include = self.instance.get('autodiscovery_include', ['.*'])
    self.autodiscovery_exclude = self.instance.get('autodiscovery_exclude', [])
    self.autodiscovery_db_service_check = is_affirmative(self.instance.get('autodiscovery_db_service_check', True))
    self.min_collection_interval = self.instance.get('min_collection_interval', 15)
    self._compile_patterns()
    self.autodiscovery_interval = self.instance.get('autodiscovery_interval', DEFAULT_AUTODISCOVERY_INTERVAL)
    self.databases = set()
    self.ad_last_check = 0

    self.proc = self.instance.get('stored_procedure')
    self.proc_type_mapping = {'gauge': self.gauge, 'rate': self.rate, 'histogram': self.histogram}
    self.custom_metrics = init_config.get('custom_metrics', [])

    # DBM
    self.dbm_enabled = self.instance.get('dbm', False)
    self.statement_metrics_config = self.instance.get('query_metrics', {}) or {}
    self.statement_metrics = SqlserverStatementMetrics(self)
    self.activity_config = self.instance.get('query_activity', {}) or {}
    self.activity = SqlserverActivity(self)
    self.cloud_metadata = {}
    aws = self.instance.get('aws', {})
    gcp = self.instance.get('gcp', {})
    azure = self.instance.get('azure', {})
    if aws:
        self.cloud_metadata.update({'aws': aws})
    if gcp:
        self.cloud_metadata.update({'gcp': gcp})
    if azure:
        self.cloud_metadata.update({'azure': azure})
    obfuscator_options_config = self.instance.get('obfuscator_options', {}) or {}
    self.obfuscator_options = to_native_string(
        json.dumps(
            {
                # Valid values for this can be found at
                # https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/semantic_conventions/database.md#connection-level-attributes
                'dbms': 'mssql',
                'replace_digits': is_affirmative(
                    obfuscator_options_config.get(
                        'replace_digits',
                        obfuscator_options_config.get('quantize_sql_tables', False),
                    )
                ),
                'keep_sql_alias': is_affirmative(obfuscator_options_config.get('keep_sql_alias', True)),
                'return_json_metadata': is_affirmative(obfuscator_options_config.get('collect_metadata', True)),
                'table_names': is_affirmative(obfuscator_options_config.get('collect_tables', True)),
                'collect_commands': is_affirmative(obfuscator_options_config.get('collect_commands', True)),
                'collect_comments': is_affirmative(obfuscator_options_config.get('collect_comments', True)),
            }
        )
    )

    self.static_info_cache = TTLCache(
        maxsize=100,
        # cache these for a full day
        ttl=60 * 60 * 24,
    )

    # Query declarations
    check_queries = []
    if is_affirmative(self.instance.get('include_ao_metrics', False)):
        check_queries.extend(
            [
                QUERY_AO_AVAILABILITY_GROUPS,
                QUERY_AO_FAILOVER_CLUSTER,
                QUERY_AO_FAILOVER_CLUSTER_MEMBER,
            ]
        )
    if is_affirmative(self.instance.get('include_fci_metrics', False)):
        check_queries.extend([QUERY_FAILOVER_CLUSTER_INSTANCE])
    self._check_queries = self._new_query_executor(check_queries)
    self.check_initializations.append(self._check_queries.compile_queries)

    self.server_state_queries = self._new_query_executor([QUERY_SERVER_STATIC_INFO])
    self.check_initializations.append(self.server_state_queries.compile_queries)

    # use QueryManager to process custom queries
    self._query_manager = QueryManager(
        self, self.execute_query_raw, tags=self.tags, hostname=self.resolved_hostname
    )
    self._dynamic_queries = None

    self.check_initializations.append(self.config_checks)
    self.check_initializations.append(self._query_manager.compile_queries)
    self.check_initializations.append(self.initialize_connection)
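
# For reference, a sketch of what the serialized obfuscator options resolve to
# when `obfuscator_options` is left empty, derived from the defaults above:
import json

obfuscator_defaults = {
    'dbms': 'mssql',
    'replace_digits': False,
    'keep_sql_alias': True,
    'return_json_metadata': True,
    'table_names': True,
    'collect_commands': True,
    'collect_comments': True,
}
print(json.dumps(obfuscator_defaults))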
def _make_metric_list_to_collect(self, custom_metrics):
    """
    Store the list of metrics to collect by instance_key.
    Will also create and cache cursors to query the db.
    """
    metrics_to_collect = []
    tags = self.instance.get('tags', [])

    # Load instance-level (previously Performance) metrics.
    # If several check instances are querying the same server host, it can be wise to turn these off
    # to avoid sending duplicate metrics.
    if is_affirmative(self.instance.get('include_instance_metrics', True)):
        for name, counter_name, instance_name in self.INSTANCE_METRICS:
            try:
                sql_type, base_name = self.get_sql_type(counter_name)
                cfg = {
                    'name': name,
                    'counter_name': counter_name,
                    'instance_name': instance_name,
                    'tags': tags,
                }
                metrics_to_collect.append(
                    self.typed_metric(
                        cfg_inst=cfg, table=DEFAULT_PERFORMANCE_TABLE, base_name=base_name, sql_type=sql_type
                    )
                )
            except SQLConnectionError:
                raise
            except Exception:
                self.log.warning("Can't load the metric %s, ignoring", name, exc_info=True)
                continue

    # Load database statistics
    for name, table, column in self.DATABASE_METRICS:
        # include database as a filter option
        db_name = self.instance.get('database', self.connection.DEFAULT_DATABASE)
        cfg = {'name': name, 'table': table, 'column': column, 'instance_name': db_name, 'tags': tags}
        metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

    # Load metrics from scheduler and task tables, if enabled
    if is_affirmative(self.instance.get('include_task_scheduler_metrics', False)):
        for name, table, column in self.TASK_SCHEDULER_METRICS:
            cfg = {'name': name, 'table': table, 'column': column, 'tags': tags}
            metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

    # Load any custom metrics from conf.d/sqlserver.yaml
    for cfg in custom_metrics:
        sql_type = None
        base_name = None

        custom_tags = tags + cfg.get('tags', [])
        cfg['tags'] = custom_tags

        db_table = cfg.get('table', DEFAULT_PERFORMANCE_TABLE)
        if db_table not in VALID_TABLES:
            self.log.error('%s has an invalid table name: %s', cfg['name'], db_table)
            continue

        if cfg.get('database', None) and cfg.get('database') != self.instance.get('database'):
            self.log.debug(
                'Skipping custom metric %s for database %s, check instance configured for database %s',
                cfg['name'],
                cfg.get('database'),
                self.instance.get('database'),
            )
            continue

        if db_table == DEFAULT_PERFORMANCE_TABLE:
            user_type = cfg.get('type')
            if user_type is not None and user_type not in VALID_METRIC_TYPES:
                self.log.error('%s has an invalid metric type: %s', cfg['name'], user_type)
                sql_type = None
            try:
                if user_type is None:
                    sql_type, base_name = self.get_sql_type(cfg['counter_name'])
            except Exception:
                self.log.warning("Can't load the metric %s, ignoring", cfg['name'], exc_info=True)
                continue

            metrics_to_collect.append(
                self.typed_metric(
                    cfg_inst=cfg, table=db_table, base_name=base_name, user_type=user_type, sql_type=sql_type
                )
            )
        else:
            for column in cfg['columns']:
                metrics_to_collect.append(
                    self.typed_metric(
                        cfg_inst=cfg, table=db_table, base_name=base_name, sql_type=sql_type, column=column
                    )
                )

    self.instance_metrics = metrics_to_collect
    self.log.debug("metrics to collect %s", metrics_to_collect)

    # create an organized grouping of metric names to their metric classes
    for m in metrics_to_collect:
        cls = m.__class__.__name__
        name = m.sql_name or m.column
        self.log.debug("Adding metric class %s named %s", cls, name)
        self.instance_per_type_metrics[cls].append(name)
        if m.base_name:
            self.instance_per_type_metrics[cls].append(m.base_name)
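
# Note on get_sql_type() above (a sketch, not the shipped helper): it looks the
# counter up in sys.dm_os_performance_counters and returns a (cntr_type,
# base_name) pair. For raw-fraction counters, base_name names the matching
# "... base" row used as the denominator; for plain gauges it is None. The
# values below are illustrative only:
sql_type, base_name = 65792, None  # e.g. a plain large rawcount gauge
ratio_type, ratio_base = 537003264, 'buffer cache hit ratio base'  # a fraction counter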